# Walk-through of scraping HTML with Beautiful Soup and loading the results
# into pandas.
#
# NOTE: this script targets Python 2 (it relies on ``urllib.urlopen``).
# Every ``print`` below uses the single-argument parenthesised form, which
# produces the same output as the Python 2 print statement.
#
# Bare expressions such as ``h1_tag.string`` are kept from the original as
# REPL-style inspection points; they produce no output when run as a script.

# pip install beautifulsoup4
import contextlib
import urllib

import pandas as pd
from bs4 import BeautifulSoup

# Name the parser explicitly: without it Beautiful Soup picks whichever
# parser happens to be installed, so results can differ between machines.
HTML_PARSER = "html.parser"

KITTENS_URL = "http://static.decontextualize.com/kittens.html"


def _fetch_html(url):
    """Return the raw body of *url*, closing the connection afterwards."""
    with contextlib.closing(urllib.urlopen(url)) as response:
        return response.read()


# ---------------------------------------------------------------------------
# 1. Fetch the page and look at the raw string
# ---------------------------------------------------------------------------
html_str = _fetch_html(KITTENS_URL)
print(type(html_str))
print(html_str[:10])
print(html_str.find("Fluffy"))

# ---------------------------------------------------------------------------
# 2. Parse it and pull out individual tags
# ---------------------------------------------------------------------------
document = BeautifulSoup(html_str, HTML_PARSER)
print(type(document))

h1_tag = document.find("h1")
print(type(h1_tag))
h1_tag.string

img_tag = document.find('img')
img_tag['src']
print(img_tag.string)
print(img_tag)

# find_all returns every match rather than just the first one
h2_tags = document.find_all("h2")
print(h2_tags[0].string)
print(h2_tags[1].string)
[tag.string for tag in h2_tags]

img_tags = document.find_all("img")
[tag['src'] for tag in img_tags]

# Narrow a search by attribute value
spans = document.find_all("span", attrs={"class": "lastcheckup"})
imgs = document.find_all(
    "img", attrs={'src': 'http://placekitten.com/100/100'})
print(imgs)

# ---------------------------------------------------------------------------
# 3. Search inside each "kitten" div
# ---------------------------------------------------------------------------
kitten_tags = document.find_all("div", attrs={'class': 'kitten'})

for kitten_tag in kitten_tags:
    h2_tag = kitten_tag.find('h2')
    print(h2_tag.string)

for kitten_tag in kitten_tags:
    h2_tag = kitten_tag.find('h2')
    kitten_name = h2_tag.string
    a_tags = kitten_tag.find_all('a')
    a_tag_strings = [tag.string for tag in a_tags]
    a_joined = ', '.join(a_tag_strings)
    print(kitten_name + ": " + a_joined)

# Aside: str.join glues a list of strings together with a separator
foo = ["a", "b", "c"]
"!".join(foo)

# Map each kitten's name to the list of link texts found in its div
kitten_shows = {}
for kitten_tag in kitten_tags:
    h2_tag = kitten_tag.find('h2')
    kitten_name = h2_tag.string
    a_tags = kitten_tag.find_all('a')
    a_tag_strings = [tag.string for tag in a_tags]
    kitten_shows[kitten_name] = a_tag_strings
kitten_shows

# ---------------------------------------------------------------------------
# 4. Build a list of dictionaries, one per kitten
#    (fetches and parses the page again, as the original script does)
# ---------------------------------------------------------------------------
html_str = _fetch_html(KITTENS_URL)
document = BeautifulSoup(html_str, HTML_PARSER)

attrs_we_are_looking_for = {'class': 'kitten'}
kitten_divs = document.find_all("div", attrs=attrs_we_are_looking_for)

kittens_list = []
for kitten_div in kitten_divs:
    kitten_dict = {}

    # add name!
    h2_tag = kitten_div.find("h2")
    # kitten_div.find_all("h2")[0] would fetch the same tag
    kitten_dict["name"] = h2_tag.string

    # add last checkup!
    span_tag = kitten_div.find("span")
    kitten_dict["lastcheckup"] = span_tag.string

    # add image url
    img_tag = kitten_div.find("img")
    kitten_dict["img"] = img_tag['src']

    kittens_list.append(kitten_dict)
kittens_list

# Example of the resulting structure, taken from the literal in the original
# script (actual values depend on the live page):
# [
#     {'name': 'Fluffy',
#      'img': 'http://placekitten.com/100/100',
#      'lastcheckup': '2014-01-17'},
#     {'name': 'Monsieur Whiskeurs',
#      'img': 'http://placekitten.com/120/100',
#      'lastcheckup': '2013-11-02'}
# ]

# ---------------------------------------------------------------------------
# 5. Faculty listing -> list of dicts -> DataFrame
# ---------------------------------------------------------------------------
html_str = _fetch_html(
    "http://www.journalism.columbia.edu/page/10/10"
    "?category_ids%5B%5D=2&category_ids%5B%5D=3&category_ids%5B%5D=37")
document = BeautifulSoup(html_str, HTML_PARSER)

experts = document.find('ul', attrs={'class': 'experts-list'})
faculty_list = experts.find_all('li')

all_faculty = []
for faculty_li in faculty_list:
    faculty_dict = {}

    # get title; list items without a description paragraph are skipped
    title_tag = faculty_li.find('p', attrs={'class': 'description'})
    if title_tag is None:
        continue
    faculty_dict['title'] = title_tag.string

    # get faculty member name
    h4_tag = faculty_li.find('h4')
    a_tag = h4_tag.find('a')
    faculty_dict['name'] = a_tag.string

    all_faculty.append(faculty_dict)
faculty_list

faculty_frame = pd.DataFrame(all_faculty)
faculty_frame[faculty_frame["title"] == "Adjunct Faculty"]

# ---------------------------------------------------------------------------
# 6. The same list-of-dicts pattern on plain data
# ---------------------------------------------------------------------------
cats = ["Garfield", "Heathcliff", "Grumpy Cat", "Socks"]
cats_output = []
for item in cats:
    # create an empty dictionary
    cat_dict = {}
    # do some stuff, add stuff to the dictionary
    cat_dict['name'] = item
    cat_dict['name_length'] = len(item)
    # append that dictionary to our output list
    cats_output.append(cat_dict)
cats_output

# what we want:
# [
#     {"name": "Garfield", "name_length": 8},
#     {"name": "Heathcliff", "name_length": 10},
#     ...
# ]

cats_df = pd.DataFrame(cats_output)
cats_df

# ---------------------------------------------------------------------------
# 7. Restaurant table: collect the review count from each row
# ---------------------------------------------------------------------------
html_str = _fetch_html(
    "http://www.menupages.com/restaurants/all-areas/"
    "morningside-heights/all-cuisines/")
document = BeautifulSoup(html_str, HTML_PARSER)

table_tag = document.find("table")
restaurant_list = []
for tr_tag in table_tag.find_all("tr"):
    restaurant_dict = {}
    reviews_tag = tr_tag.find("td", attrs={'class': 'reviews'})
    # rows without a "reviews" cell are skipped
    if reviews_tag is None:
        continue
    restaurant_dict['reviews'] = int(reviews_tag.string)
    restaurant_list.append(restaurant_dict)
restaurant_list

# Illustrative target shape from the original script. The loop above only
# fills in 'reviews'; the other keys are not collected yet.
# [
#     {"name": "Ajanta", "price": 2, "rating": 3.0, "reviews": 43},
#     {"name": "Amigos", "price": 3, "rating": 3.0, "reviews": 5},
#     ...
# ]