import urllib
# Download the sample kittens page; html_str holds the raw response body.
# NOTE: urllib.urlopen is the Python 2 spelling of this call.
html_str = urllib.urlopen(
    "http://static.decontextualize.com/kittens.html"
).read()

# Quick look at the raw markup before parsing it: its type, its first ten
# characters, and the offset of the first "Fluffy" (find() gives -1 when
# the substring is absent).
print(type(html_str))
print(html_str[:10])
print(html_str.find("Fluffy"))
# pip install beautifulsoup4
from bs4 import BeautifulSoup
# Parse the downloaded markup.  The parser is named explicitly so the result
# does not depend on which optional parsers happen to be installed, and so
# bs4 does not have to guess one.
document = BeautifulSoup(html_str, "html.parser")
print(type(document))

# find() gives back a single tag: the first match in the document.
h1_tag = document.find("h1")
print(type(h1_tag))
# Bare expressions such as the next line only echo a value in an interactive
# session; they print nothing when this file runs as a script.
h1_tag.string

# Attributes of a tag are read with dictionary-style indexing.
img_tag = document.find("img")
img_tag["src"]
print(img_tag.string)
print(img_tag)

# find_all() gives back every match as a list-like result.
h2_tags = document.find_all("h2")
print(h2_tags[0].string)
print(h2_tags[1].string)
[tag.string for tag in h2_tags]

img_tags = document.find_all("img")
[tag["src"] for tag in img_tags]

# attrs= narrows the search to tags carrying the given attribute values.
spans = document.find_all("span", attrs={"class": "lastcheckup"})
imgs = document.find_all(
    "img", attrs={"src": "http://placekitten.com/100/100"}
)
print(imgs)
# Select every <div class="kitten"> and search inside each one separately,
# so the matches stay grouped per div.
kitten_tags = document.find_all("div", attrs={"class": "kitten"})

# First pass: print the <h2> text of each div.
for kitten_tag in kitten_tags:
    h2_tag = kitten_tag.find("h2")
    print(h2_tag.string)

# Second pass: print the <h2> text followed by the comma-separated text of
# every <a> inside the same div.
for kitten_tag in kitten_tags:
    h2_tag = kitten_tag.find("h2")
    kitten_name = h2_tag.string
    a_tags = kitten_tag.find_all("a")
    a_tag_strings = [tag.string for tag in a_tags]
    a_joined = ", ".join(a_tag_strings)
    print(kitten_name + ": " + a_joined)
# str.join demo: the string it is called on becomes the separator placed
# between the items of the list.
foo = list("abc")
"!".join(foo)
# Build a dictionary keyed by each div's <h2> text, whose value is the list
# of <a> texts found inside that same div.
kitten_shows = {}
for kitten_tag in kitten_tags:
    kitten_name = kitten_tag.find("h2").string
    a_tag_strings = [tag.string for tag in kitten_tag.find_all("a")]
    kitten_shows[kitten_name] = a_tag_strings
# Echoes the dictionary in an interactive session only.
kitten_shows
import urllib
from bs4 import BeautifulSoup
# Self-contained version of the kitten scrape: fetch, parse, then turn each
# <div class="kitten"> into one dictionary.
html_str = urllib.urlopen(
    "http://static.decontextualize.com/kittens.html"
).read()
# Name the parser explicitly so the parse does not depend on which optional
# parsers are installed.
document = BeautifulSoup(html_str, "html.parser")

attrs_we_are_looking_for = {"class": "kitten"}
kitten_divs = document.find_all("div", attrs=attrs_we_are_looking_for)

kittens_list = []
for kitten_div in kitten_divs:
    # find("h2") is used for the name; find_all("h2")[0] would select the
    # same tag.
    h2_tag = kitten_div.find("h2")
    span_tag = kitten_div.find("span")
    img_tag = kitten_div.find("img")
    kitten_dict = {
        "name": h2_tag.string,
        "lastcheckup": span_tag.string,
        "img": img_tag["src"],
    }
    kittens_list.append(kitten_dict)
# Echoes the list in an interactive session only.
kittens_list

# Shape of the result, as recorded alongside the original code:
# [
#   {'name': 'Fluffy',
#    'img': 'http://placekitten.com/100/100',
#    'lastcheckup': '2014-01-17'},
#   {'name': 'Monsieur Whiskeurs',
#    'img': 'http://placekitten.com/120/100',
#    'lastcheckup': '2013-11-02'}
# ]
import urllib
# Scrape the faculty listing into a list of {"title", "name"} dictionaries.
html_str = urllib.urlopen(
    "http://www.journalism.columbia.edu/page/10/10"
    "?category_ids%5B%5D=2&category_ids%5B%5D=3&category_ids%5B%5D=37"
).read()
# Name the parser explicitly so the parse does not depend on which optional
# parsers are installed.
document = BeautifulSoup(html_str, "html.parser")

# The entries are the <li> items of <ul class="experts-list">.
experts = document.find("ul", attrs={"class": "experts-list"})
faculty_list = experts.find_all("li")

all_faculty = []
for faculty_li in faculty_list:
    title_tag = faculty_li.find("p", attrs={"class": "description"})
    if title_tag is None:
        # No <p class="description"> in this item: leave it out entirely.
        continue
    # The name is the text of the link inside the item's <h4>.
    h4_tag = faculty_li.find("h4")
    a_tag = h4_tag.find("a")
    faculty_dict = {"title": title_tag.string, "name": a_tag.string}
    all_faculty.append(faculty_dict)
# NOTE(review): this echoes the raw <li> tags; the parsed records are in
# all_faculty -- confirm which one was meant to be shown here.
faculty_list
import pandas as pd
# Load the scraped dictionaries into a DataFrame, then select the rows whose
# "title" column is exactly "Adjunct Faculty" using a boolean mask.
faculty_frame = pd.DataFrame(data=all_faculty)
is_adjunct = faculty_frame["title"] == "Adjunct Faculty"
faculty_frame[is_adjunct]
# Minimal illustration of the "list of dictionaries" pattern used by the
# scraping loops: one dictionary per input item, collected into a list.
cats = ["Garfield", "Heathcliff", "Grumpy Cat", "Socks"]
cats_output = []
for item in cats:
    # Build the dictionary for this item...
    cat_dict = {
        "name": item,
        "name_length": len(item),
    }
    # ...and append it to the output list.
    cats_output.append(cat_dict)
# Echoes the list in an interactive session only.
cats_output
#
# What we want:
# [
#   {"name": "Garfield", "name_length": 8},
#   {"name": "Heathcliff", "name_length": 10},
#   ...
# ]
import pandas as pd
# Turn the list of dictionaries into a DataFrame: the dictionary keys become
# the columns and each dictionary becomes one row.
cats_df = pd.DataFrame(data=cats_output)
# Echoes the frame in an interactive session only.
cats_df
import urllib
from bs4 import BeautifulSoup
# Scrape the restaurant listing: one dictionary per table row that has a
# review count.
html_str = urllib.urlopen(
    "http://www.menupages.com/restaurants/all-areas/morningside-heights/all-cuisines/"
).read()
# Name the parser explicitly so the parse does not depend on which optional
# parsers are installed.
document = BeautifulSoup(html_str, "html.parser")
table_tag = document.find("table")

restaurant_list = []
for tr_tag in table_tag.find_all("tr"):
    reviews_tag = tr_tag.find("td", attrs={"class": "reviews"})
    if reviews_tag is None:
        # Rows without a <td class="reviews"> cell are skipped.
        continue
    restaurant_dict = {"reviews": int(reviews_tag.string)}
    restaurant_list.append(restaurant_dict)
# Echoes the list in an interactive session only.
restaurant_list
# Target shape of the result (the loop above only collects "reviews" so far):
# [
#   {"name": "Ajanta", "price": 2, "rating": 3.0, "reviews": 43},
#   {"name": "Amigos", "price": 3, "rating": 3.0, "reviews": 5},
#   ...
# ]