import requests from BeautifulSoup import BeautifulSoup import HTMLParser res = requests.get("http://bryannotes.blogspot.tw/") soup = BeautifulSoup(res.text.encode("utf-8")) bid_table = soup.findAll('h3',{'class':'post-title entry-title'}) print bid_table[1].findAll('a',{'href':True}) bid_file = open("blog_links.txt",'w') for link in bid_table: links = str([tag['href'] for tag in link.findAll('a',{'href':True})])[3:-2] bid_file.write(links+"\n") print links bid_file.close() bid_list = open('blog_links.txt','r') h = HTMLParser.HTMLParser() blog = {} for line in bid_list.readlines(): pagelink = line.strip() request_get = requests.get(pagelink) soup_post = BeautifulSoup(request_get.text.encode("utf-8")) body = h.unescape(soup_post.find("div",{'class':'post-body entry-content'}).text) title = h.unescape(soup_post.find("h3",{'class':'post-title entry-title'}).text) blog[title] = body for key in blog: print key, print len(blog[key]) # coding=UTF-8 f = open("C:\\blog_text.txt","w") for key in blog: f.write(key.encode('utf-8')+",") f.write(blog[key].encode('utf-8')+"\n") f.close()