# -*- coding: utf-8 -*-
"""List the newest topics of the pt.wikipedia "Esplanada/geral" page.

The page history is scraped for edits whose summary contains
"(novo tópico:", and the matching entries are rendered as an HTML list
for display in an IPython notebook.

NOTE(review): the source this was rebuilt from had lost its line breaks
and the HTML markup inside its string literals. The ``<h3>``/``<ul>``/
``<li>``/``<br>`` tags and the per-topic loop in ``html_new_topics`` are
a reconstruction -- confirm against the original notebook.
"""
import re
import urllib2

import BeautifulSoup as bs
from IPython.display import HTML

BASE_URL = 'http://pt.wikipedia.org'

# Sent with every request (the original script set this on each fetch).
headers = {'User-Agent': 'Mozilla/5.0'}

# Same pattern as before, with the backslash escaped explicitly instead of
# relying on '\(' being an unrecognised escape in a non-raw literal.
NEW_TOPIC_RE = re.compile(u'\\(novo tópico:*')


def _fetch_soup(page_url):
    """Download *page_url* and return it parsed as a BeautifulSoup tree."""
    req = urllib2.Request(page_url, None, headers)
    response = urllib2.urlopen(req)
    try:
        html = response.read()
    finally:
        # The original never closed the response object.
        response.close()
    return bs.BeautifulSoup(html)


def _make_absolute(link):
    """Prefix a site-relative ``href`` on *link* with ``BASE_URL``, in place.

    Does nothing for a missing link, a link without ``href``, or an href
    that is not site-relative, so applying it twice is harmless.
    """
    if link is None:
        return
    href = link.get('href')
    if href is None:
        return
    href = str(href)
    if href.startswith('/') and not href.startswith('//'):
        link['href'] = BASE_URL + href


def _collect_topics(markers):
    """Build one ``{'title', 'author', 'date'}`` dict of links per marker.

    *markers* are the text nodes matched by ``NEW_TOPIC_RE``. Each dict
    value is the corresponding ``<a>`` tag with its href made absolute.
    """
    collected = []
    for marker in markers:
        comment = marker.findParent()
        entry = comment.findParent()
        topic = {
            'title': comment.findAll('a')[1],
            'author': entry.find('span', attrs={'class': 'history-user'}).a,
            'date': entry.find('a', attrs={'class': 'mw-changeslist-date'}),
        }
        # Done once here rather than on every render: the old renderer
        # re-prefixed the hrefs on each call, corrupting them the second time.
        for link in topic.values():
            _make_absolute(link)
        collected.append(topic)
    return collected


def _first_post_tags(content_div):
    """Yield the tags of *content_div* that precede its first ``<dl>``.

    ``<table>`` tags are skipped and non-tag nodes (plain text) are ignored.
    """
    if content_div is None:
        return
    for node in content_div:
        if not isinstance(node, bs.Tag):
            continue
        if node.name == 'dl':
            break
        if node.name != 'table':
            yield node


def _first_post_html(topic_url):
    """Fetch *topic_url* and return the markup of its opening post."""
    page = _fetch_soup(topic_url)
    content_div = page.find('div', attrs={'id': 'mw-content-text'})
    return ''.join(str(tag) for tag in _first_post_tags(content_div))


def html_new_topics(topics, content=False):
    """Render *topics* as an HTML list.

    Args:
        topics: list of dicts with 'title', 'author' and 'date' link tags,
            as built by ``_collect_topics``.
        content: when true, each item also embeds the opening post of the
            topic, which costs one extra HTTP request per topic.

    Returns:
        The HTML markup as a string.
    """
    parts = ['<h3>{} novos tópicos</h3><ul>'.format(len(topics))]
    for topic in topics:
        parts.append('<li>')
        parts.append('{} - {} - {}'.format(
            topic['title'], topic['author'], topic['date']))
        if content:
            # NOTE(review): 'topic_content' was undefined in the source;
            # this assumes it was the opening post of the topic -- confirm.
            parts.append('<br>' + _first_post_html(topic['title']['href']))
        parts.append('</li>')
    parts.append('</ul>')
    return ''.join(parts)


url = 'http://pt.wikipedia.org/w/index.php?title=Wikip%C3%A9dia:Esplanada/geral&action=history'
soup = _fetch_soup(url)

# NOTE(review): with BeautifulSoup 3 a ``text=`` filter returns the matching
# text nodes, hence the two findParent() hops in _collect_topics -- confirm.
topics = soup.findAll('li', text=NEW_TOPIC_RE)
topics_l = _collect_topics(topics)

HTML(html_new_topics(topics_l))

# Inspect the most recent topic, if there is one.
if topics_l:
    url = topics_l[0]['title']['href']
    soup = _fetch_soup(url)
    cont_div = soup.find('div', attrs={'id': 'mw-content-text'})
    if cont_div is not None:
        # Moved below the assignment: the source evaluated this before
        # cont_div existed.
        len(cont_div.findAll(text=True))
        for tag in _first_post_tags(cont_div):
            print(tag)

HTML(html_new_topics(topics_l, content=True))