import contextlib
import urllib

# Download the sample "kittens" page.  contextlib.closing() makes sure the
# HTTP response is closed as soon as its body has been read, instead of
# leaving the connection open until the object is garbage collected.
with contextlib.closing(
    urllib.urlopen("http://static.decontextualize.com/kittens.html")
) as response:
    html_str = response.read()

# Inspect the raw download: its type ...
print(type(html_str))

# ... its first ten characters ...
print(html_str[:10])

# ... and the index of the first occurrence of "Fluffy"
# (find() yields -1 when the substring is absent).
print(html_str.find("Fluffy"))

# pip install beautifulsoup4
from bs4 import BeautifulSoup

# Parse the downloaded markup.  The parser is named explicitly: when it is
# omitted Beautiful Soup falls back to whichever parser is installed, so the
# resulting tree could differ from one machine to another.
document = BeautifulSoup(html_str, "html.parser")
print(type(document))

# find() returns the first tag with the given name.
h1_tag = document.find("h1")
print(type(h1_tag))

# NOTE: the bare expressions in this section are notebook-style inspections.
# They are still evaluated, but show nothing when the file runs as a script.
h1_tag.string

# Tag attributes are read with dictionary-style indexing.
img_tag = document.find("img")
img_tag['src']
print(img_tag.string)

print(img_tag)

# find_all() returns every matching tag rather than just the first one.
h2_tags = document.find_all("h2")
print(h2_tags[0].string)
print(h2_tags[1].string)
[tag.string for tag in h2_tags]

img_tags = document.find_all("img")
[tag['src'] for tag in img_tags]

# attrs= restricts the search to tags carrying the given attribute values.
spans = document.find_all("span", attrs={"class": "lastcheckup"})

imgs = document.find_all("img", attrs={'src': 'http://placekitten.com/100/100'})
print(imgs)

# Collect every <div class="kitten"> element of the parsed page.
kitten_tags = document.find_all("div", attrs={'class': 'kitten'})

# First pass: print the text of the <h2> inside each kitten <div>.
for kitten_tag in kitten_tags:
    print(kitten_tag.find('h2').string)

# Second pass: print the <h2> text followed by the comma-separated text of
# every <a> tag found inside the same <div>.
for kitten_tag in kitten_tags:
    kitten_name = kitten_tag.find('h2').string
    link_texts = [link.string for link in kitten_tag.find_all('a')]
    print(kitten_name + ": " + ', '.join(link_texts))


# Small aside showing str.join(): the separator goes between the list items.
foo = ["a", "b", "c"]
"!".join(foo)

# Map each kitten's <h2> text to the list of its <a> tag texts.
kitten_shows = {}
for kitten_tag in kitten_tags:
    kitten_shows[kitten_tag.find('h2').string] = [
        link.string for link in kitten_tag.find_all('a')
    ]
kitten_shows


import urllib
from bs4 import BeautifulSoup

# Fetch the sample page again; closing() releases the HTTP response as soon
# as the body has been read.
with contextlib.closing(
    urllib.urlopen("http://static.decontextualize.com/kittens.html")
) as response:
    html_str = response.read()

# Name the parser explicitly so the parse does not depend on which optional
# parser libraries happen to be installed.
document = BeautifulSoup(html_str, "html.parser")

attrs_we_are_looking_for = {'class': 'kitten'}
kitten_divs = document.find_all("div", attrs=attrs_we_are_looking_for)

# Build one dictionary per kitten <div>.
kittens_list = []
for kitten_div in kitten_divs:
    kitten_dict = {
        # find("h2") gives the same tag as find_all("h2")[0] when one exists.
        "name": kitten_div.find("h2").string,
        # First <span> inside the <div>.
        "lastcheckup": kitten_div.find("span").string,
        # Image URL taken from the <img> tag's src attribute.
        "img": kitten_div.find("img")['src'],
    }
    kittens_list.append(kitten_dict)

kittens_list


# Sample of the structure built above (presumably the expected contents of
# kittens_list for this page -- confirm against a live run).  Evaluating the
# literal has no effect when the file runs as a script.
[
  {'name': 'Fluffy',
   'img': 'http://placekitten.com/100/100',
   'lastcheckup': '2014-01-17'},
  {'name': 'Monsieur Whiskeurs',
   'img': 'http://placekitten.com/120/100',
   'lastcheckup': '2013-11-02'}
]

import urllib
# Download the faculty listing; closing() releases the HTTP response as soon
# as the body has been read.
with contextlib.closing(
    urllib.urlopen("http://www.journalism.columbia.edu/page/10/10?category_ids%5B%5D=2&category_ids%5B%5D=3&category_ids%5B%5D=37")
) as response:
    html_str = response.read()


# Name the parser explicitly so the parse does not depend on which optional
# parser libraries happen to be installed.
document = BeautifulSoup(html_str, "html.parser")

# find() returns None when nothing matches, so a missing <ul> is treated as
# "no entries" instead of failing with an AttributeError.
experts = document.find('ul', attrs={'class': 'experts-list'})
faculty_list = experts.find_all('li') if experts is not None else []

# Build one {'title': ..., 'name': ...} dictionary per usable <li>.
all_faculty = []
for faculty_li in faculty_list:
    # Entries without a <p class="description"> are skipped.
    title_tag = faculty_li.find('p', attrs={'class': 'description'})
    if title_tag is None:
        continue

    # The name is the text of the <a> inside the entry's <h4>; entries that
    # lack either tag are skipped the same way as entries without a title.
    h4_tag = faculty_li.find('h4')
    a_tag = h4_tag.find('a') if h4_tag is not None else None
    if a_tag is None:
        continue

    all_faculty.append({'title': title_tag.string, 'name': a_tag.string})

# Notebook-style inspection of the result (no effect when run as a script).
all_faculty

import pandas as pd
# Columns are named explicitly so the frame has a "title" column even when
# nothing was scraped; otherwise the filter below would raise KeyError.
faculty_frame = pd.DataFrame(all_faculty, columns=['name', 'title'])
faculty_frame[faculty_frame["title"] == "Adjunct Faculty"]

cats = ["Garfield", "Heathcliff", "Grumpy Cat", "Socks"]

# Build one record per cat: the name itself plus the length of that name.
cats_output = [
    {'name': cat_name, 'name_length': len(cat_name)}
    for cat_name in cats
]

cats_output

# Shape of the list built above:
# [
#   {"name": "Garfield", "name_length": 8},
#   {"name": "Heathcliff", "name_length": 10},
#   ...
# ]

import pandas as pd
# A list of dictionaries becomes a table: one row per record, one column
# per dictionary key.
cats_df = pd.DataFrame(cats_output)
cats_df

import urllib
from bs4 import BeautifulSoup

# Download the restaurant listing; closing() releases the HTTP response as
# soon as the body has been read.
with contextlib.closing(
    urllib.urlopen("http://www.menupages.com/restaurants/all-areas/morningside-heights/all-cuisines/")
) as response:
    html_str = response.read()

# Name the parser explicitly so the parse does not depend on which optional
# parser libraries happen to be installed.
document = BeautifulSoup(html_str, "html.parser")

# find() returns None when the page has no <table>; treat that as "no rows"
# instead of failing with an AttributeError.
table_tag = document.find("table")
tr_tags = table_tag.find_all("tr") if table_tag is not None else []

# Only the review count is collected here; the name, price and rating shown
# in the sample at the end are not extracted by this code.
restaurant_list = []
for tr_tag in tr_tags:
    # Rows without a <td class="reviews"> cell are skipped.
    reviews_tag = tr_tag.find("td", attrs={'class': 'reviews'})
    if reviews_tag is None:
        continue

    # get_text() always yields a string (.string is None for a cell with
    # nested markup); rows whose cell is not a plain integer are skipped.
    try:
        review_count = int(reviews_tag.get_text(strip=True))
    except ValueError:
        continue

    restaurant_list.append({'reviews': review_count})

restaurant_list

# Sample of the intended structure.  The original literal was missing a
# closing brace and used a bare "...", which made the whole file fail to
# compile; it is now a valid (no-op) expression.
[
 {"name": "Ajanta", "price": 2, "rating": 3.0, "reviews": 43},
 {"name": "Amigos", "price": 3, "rating": 3.0, "reviews": 5},
 # ...
]