In [1]:

import urllib, urllib2, html5lib, re
from lxml import etree

In [2]:

# Search expressions handed to parse(). The "positive" query is the OR of the
# alternatives below, encoded to UTF-8 bytes because one of them is non-ASCII
# ("chwala" is also listed, as the ASCII spelling of "chwała").
positive = u" OR ".join([
    u"ojczyzna",
    u"ojczyzny",
    u"patriota",
    u"(polska AND krew)",
    u"chwala",
    u"chwała",
]).encode('utf-8')
# Words found for this query are subtracted from the "positive" word set.
negative = "depresja"

In [3]:

# Tokenizer: split on every single non-word character. re.U makes \W
# Unicode-aware, so Polish letters are kept inside tokens. Runs of adjacent
# separators produce empty strings, which isWord() rejects.
splitWords = re.compile(r'\W', re.U).split
# Token filter: accept only tokens made of two or more ASCII or Polish letters
# (no digits, no underscore). A plain u'' literal is used because the pattern
# contains no backslashes, so the raw prefix added nothing, and the `ur`
# prefix is a syntax error on Python 3.
isWord = re.compile(u'^[A-Za-ząęśćńźżółĄĘŚĆŃŹŻÓŁ]{2,}$', re.U).match

In [8]:

def parse(kw):
    """Collect the words that recur in deviantArt poetry matching a query.

    Fetches the deviantArt RSS backend for ``boost:popular
    in:literature/poetry <kw>``, tokenizes the ``media:text`` body of every
    returned item and counts the lower-cased tokens accepted by ``isWord``.

    Parameters
    ----------
    kw : str
        Search expression appended to the fixed query prefix.

    Returns
    -------
    set
        Lower-cased words counted more than once across all kept items.

    Raises
    ------
    Whatever ``urllib2.urlopen`` or ``etree.parse`` raise on network or XML
    errors; an ``AttributeError`` if the feed has no ``channel`` element.
    """
    query = "boost:popular in:literature/poetry %s" % kw
    params = urllib.urlencode({'q': query, 'type': 'deviation'})
    url = "http://backend.deviantart.com/rss.xml?" + params

    # Close the HTTP response explicitly instead of leaving it to the GC.
    response = urllib2.urlopen(url)
    try:
        feed = etree.parse(response)
    finally:
        response.close()

    counts = {}
    for item in feed.find('channel').findall('item'):
        text_element = item.find('{http://search.yahoo.com/mrss/}text')
        if text_element is None:
            continue
        text = text_element.text
        # An element without content has .text == None; there is nothing to
        # tokenize, and the substring checks below would raise TypeError.
        if not text:
            continue
        # Plain substring checks (so "other" also matches 'the');
        # presumably a crude filter for English-language texts.
        if 'the' in text or 'you' in text:
            continue
        # The item body is HTML: parse it and walk only its text nodes.
        for textnode in html5lib.parse(text, treebuilder='etree').itertext():
            for word in splitWords(textnode):
                if not isWord(word):
                    continue
                word = word.lower()
                counts[word] = counts.get(word, 0) + 1

    # Keep only words seen at least twice.
    return set(word for word, count in counts.iteritems() if count > 1)

In [9]:

# Words that recur in the "positive" results but not in the "negative" ones.
parse(positive).difference(parse(negative))

Out[9]:

{u'chwa\u0142a', u'nadziei', u'nowe', u'ojczyzna', u'polska', u'wraca'}

In [ ]: