import urllib, urllib2, html5lib, re
from lxml import etree
# Search expressions handed to parse() as the keyword part of the query.
# Non-ASCII letters are spelled as \u escapes so this source stays pure ASCII:
# Python 2 refuses non-ASCII source bytes unless a PEP 263 coding line is
# present, and this file has none.  The runtime values are unchanged.
positive = u"ojczyzna OR ojczyzny OR patriota OR (polska AND krew) OR chwala OR chwa\u0142a".encode('utf-8')
negative = "depresja"

# Splits on every single non-word character (Unicode-aware), so adjacent
# separators produce empty strings; isWord() rejects those.
splitWords = re.compile(r'\W', re.U).split

# Matches a token made only of ASCII letters and Polish diacritic letters,
# at least two characters long.
isWord = re.compile(
    u'^[A-Za-z'
    u'\u0105\u0119\u015b\u0107\u0144\u017a\u017c\u00f3\u0142'  # lower-case Polish diacritics
    u'\u0104\u0118\u015a\u0106\u0143\u0179\u017b\u00d3\u0141'  # upper-case Polish diacritics
    u']{2,}$',
    re.U
).match
def parse(kw):
    """Return the words that recur in popular deviantART poetry matching *kw*.

    Queries the deviantART RSS backend for popular deviations in
    literature/poetry matching the search expression, tokenises the
    ``media:text`` body of every returned item and counts the tokens
    accepted by ``isWord`` (lower-cased) across all items.

    Args:
        kw: search expression appended to the fixed
            ``boost:popular in:literature/poetry`` prefix.

    Returns:
        A set of the lower-cased words that occurred more than once in the
        fetched texts.  Empty when the feed has no ``channel`` or no usable
        items.

    Errors raised by ``urllib2.urlopen`` or by the XML/HTML parsers are not
    caught here and propagate to the caller.
    """
    # Function-scope imports keep this block self-contained.
    from collections import Counter
    from contextlib import closing

    query = "boost:popular in:literature/poetry %s" % kw
    params = urllib.urlencode({'q': query, 'type': 'deviation'})
    url = "http://backend.deviantart.com/rss.xml?" + params

    # urllib2 responses are not context managers on Python 2, hence closing().
    # etree.parse() consumes the whole stream, so the connection can be
    # released before the items are processed.
    with closing(urllib2.urlopen(url)) as response:
        feed = etree.parse(response)

    channel = feed.find('channel')
    entries = channel.findall('item') if channel is not None else []

    counts = Counter()
    for entry in entries:
        text_el = entry.find('{http://search.yahoo.com/mrss/}text')
        # An empty <media:text/> element has .text == None; the substring
        # tests below would raise TypeError on it, so skip such items.
        if text_el is None or not text_el.text:
            continue
        body = text_el.text
        # Substring test, presumably a crude way to drop English-language
        # texts -- note it also matches inside longer words.
        if 'the' in body or 'you' in body:
            continue
        for textnode in html5lib.parse(body, treebuilder='etree').itertext():
            counts.update(
                word.lower() for word in splitWords(textnode) if isWord(word)
            )

    # Keep only words seen at least twice.
    return {word for word, seen in counts.items() if seen > 1}
# Words recurring in the results for `positive` that do not also recur in the
# results for `negative`.
parse(positive).difference(parse(negative))
# The literal below looks like the value the expression above produced in one
# recorded run (live search results will differ); it is kept verbatim.
{u'chwa\u0142a', u'nadziei', u'nowe', u'ojczyzna', u'polska', u'wraca'}