# Read the table of Chinese provincial capitals ("省会") from Wikipedia.
# pd.read_html returns a list of DataFrames, one per matching <table>.
import pandas as pd
url = "http://zh.wikipedia.org/wiki/%E7%9C%81%E4%BC%9A"
dfs = pd.read_html(url, attrs={'class': 'wikitable'})  # keep only tables with class="wikitable"
dfs[0].head()  # preview the first matched table
# Log in to zhihu.com using a persistent session so subsequent requests
# reuse the authentication cookies stored on the session object.
import requests
s = requests.session()
data = {'email':'***','password':'***'} # login form parameters (credentials redacted)
url = "http://www.zhihu.com/login"
s.post(url,data)  # POST the login form; the session keeps the returned cookies
r = s.get('http://www.zhihu.com/')  # fetch the home page as the logged-in user

# Scrape the Douban Top-250 movie list: fetch the first page and parse it.
import requests
from bs4 import BeautifulSoup
import re
url = "http://movie.douban.com/top250"
r = requests.get(url)
# NOTE(review): no parser is named here, so bs4 guesses one
# (GuessedAtParserWarning); results can differ across machines.
soup_packtpage = BeautifulSoup(r.text)

# Define the scraping helpers based on the page's HTML structure
def namefunc(movie):
    names = [x.findChild('span',attrs={'class':'title'}).string for x in movie]
    return names
def scorefunc(movie):
    scores = [float(str(x.findChild('em').string)) for x in movie]
    return scores
def numfunc(movie):
    num = [x.findChild('span',attrs=None).string for x in movie]
    num = [int(str(re.sub('\D', '', x))) for x in num]
    return num
url = "http://movie.douban.com/top250"  # base URL (re-assigned per page in the loop below)

def getinfo(url):
    """Scrape one Top-250 listing page.

    Parameters
    ----------
    url : str
        Address of one paginated Top-250 page.

    Returns
    -------
    dict
        Keys 'names', 'scores', 'num' mapping to parallel lists produced
        by namefunc / scorefunc / numfunc.
    """
    # timeout so a stalled connection cannot hang the scrape forever
    r = requests.get(url, timeout=30)
    # name the parser explicitly: avoids bs4's GuessedAtParserWarning and
    # makes parsing deterministic across machines
    soup_packtpage = BeautifulSoup(r.text, 'html.parser')
    movie = soup_packtpage.findAll('div', attrs={'class': 'info'})
    res = {
        'names': namefunc(movie),
        'scores': scorefunc(movie),
        'num': numfunc(movie),
    }
    return res

# Build the URL of each results page
# The Top 250 is paginated 25 movies per page; one URL per page offset.
index = range(0, 250, 25)
urls = ['http://movie.douban.com/top250?start=' + str(offset) + '&filter=&type='
        for offset in index]
urls

# Scrape every page and merge the per-page results into one dict of lists.
res = {'names': [], 'scores': [], 'num': []}
for url in urls:
    new = getinfo(url)
    res['names'].extend(new['names'])
    res['scores'].extend(new['scores'])
    res['num'].extend(new['num'])

# Assemble the scraped columns into a DataFrame for analysis.
import pandas as pd
df = pd.DataFrame(res)
df.head()  # preview the first rows

# Load the rpy2 IPython extension so R can be run via the %%R cell magic.
%load_ext rpy2.ipython

%%R -i df -w 500 -h 300 
# (R) -i df pushes the pandas DataFrame into R as a data.frame;
# plot rating count vs. score with a smoothed trend line.
library(ggplot2)
p = ggplot(df,aes(x = num, y = scores)) + geom_point(size=4,alpha=0.5) + stat_smooth()
print(p)