import pandas as pd url = "http://zh.wikipedia.org/wiki/%E7%9C%81%E4%BC%9A" dfs = pd.read_html(url, attrs={'class': 'wikitable'}) dfs[0].head() import requests s = requests.session() data = {'email':'***','password':'***'} # 参数 url = "http://www.zhihu.com/login" s.post(url,data) r = s.get('http://www.zhihu.com/') import requests from bs4 import BeautifulSoup import re url = "http://movie.douban.com/top250" r = requests.get(url) soup_packtpage = BeautifulSoup(r.text) # 根据网页特征定义抓取函数 def namefunc(movie): names = [x.findChild('span',attrs={'class':'title'}).string for x in movie] return names def scorefunc(movie): scores = [float(str(x.findChild('em').string)) for x in movie] return scores def numfunc(movie): num = [x.findChild('span',attrs=None).string for x in movie] num = [int(str(re.sub('\D', '', x))) for x in num] return num url = "http://movie.douban.com/top250" def getinfo(url): r = requests.get(url) soup_packtpage = BeautifulSoup(r.text) movie = soup_packtpage.findAll('div',attrs={'class':'info'}) names = namefunc(movie) scores = scorefunc(movie) num = numfunc(movie) res = {'names': names, 'scores': scores, 'num': num} return res # 得到不同网址 urls = [] index = range(0,250,25) for x in index: urls.append('http://movie.douban.com/top250?start='+str(x)+'&filter=&type=') urls # 对每个网址进行抓取 res = {'names': [], 'scores': [], 'num': []} for url in urls: new = getinfo(url) res['names'].extend(new['names']) res['scores'].extend(new['scores']) res['num'].extend(new['num']) import pandas as pd df = pd.DataFrame(res) df.head() %load_ext rpy2.ipython %%R -i df -w 500 -h 300 library(ggplot2) p = ggplot(df,aes(x = num, y = scores)) + geom_point(size=4,alpha=0.5) + stat_smooth() print(p)