import requests
from collections import defaultdict
class Classifier(object):
    """Suggest tags for a document from the tags of its most similar documents.

    A MoreLikeThis (MLT) query is sent to the configured Solr handler for a
    given document id; the tags found on the returned similar documents are
    tallied and the tally is reported in one of three formats (see
    ``classifyDoc``).
    """

    # Result formats understood by classifyDoc.
    _METHODS = ("best", "sorted", "details")

    def __init__(self, mltUrl="http://localhost:8983/solr/mlt",
                 idField="Id",
                 mltFields="Title Body",
                 tagField="Tags",
                 filterQuery="Tags:*",
                 numNearest=10):
        """Store the query settings and open a reusable HTTP session.

        mltUrl      -- URL of the handler the MLT query is sent to
        idField     -- field used to look a document up by id (``idField:docId``)
        mltFields   -- fields sent as ``mlt.fl``; either a space-separated
                       string or an iterable of names (joined with spaces)
        tagField    -- field holding a document's space-separated tags
        filterQuery -- value sent as the ``fq`` parameter
        numNearest  -- value sent as ``rows`` (how many similar documents
                       are requested)
        """
        self.mltUrl = mltUrl
        self.idField = idField
        # isinstance (rather than an exact type comparison) also accepts
        # str subclasses as an already-joined field list.
        self.mltFields = mltFields if isinstance(mltFields, str) else " ".join(mltFields)
        self.tagField = tagField
        self.filterQuery = filterQuery
        self.numNearest = numNearest
        self.sess = requests.Session()

    def classifyDoc(self, docId,
                    method="best"  # or "sorted" or "details"
                    ):
        """Classify the document whose id is ``docId``.

        method selects the shape of the result:
          "best"    -- the single most frequent tag
          "sorted"  -- list of all tags, most frequent first
          "details" -- mapping of tag -> number of similar documents
                       carrying it

        Raises ValueError for an unknown ``method``, IOError when the HTTP
        status is not 200, and RuntimeError when no document matches the id,
        when no similar documents are found, or when "best" is requested but
        no tags were collected.
        """
        # Reject a bad method up front instead of silently returning None
        # after a wasted round trip.
        if method not in self._METHODS:
            raise ValueError("unknown method: " + repr(method))

        #send the MLT query to Solr
        params = {"q": self.idField + ":" + docId,
                  "mlt.fl": self.mltFields,
                  "fl": self.tagField,
                  "fq": self.filterQuery,
                  "rows": self.numNearest,
                  "wt": "json"
                  }
        # The session lives on the instance; a bare ``sess`` is not defined here.
        resp = self.sess.get(url=self.mltUrl, params=params)

        #Perform error checking
        if resp.status_code != 200:
            raise IOError("HTTP Status " + str(resp.status_code))
        data = resp.json()
        if int(data["match"]["numFound"]) == 0:
            raise RuntimeError("no document with that id")
        if int(data["response"]["numFound"]) == 0:
            raise RuntimeError("no interesting terms in document")

        #If no errors, then collect and count tags for each similar document
        tagDict = defaultdict(int)
        for doc in data["response"]["docs"]:
            for tag in doc[self.tagField].split(' '):
                tagDict[tag] += 1

        #Return the best tag, all of the tags sorted best
        #to worst, or the list of tags and their count
        if method == "best":
            # max() on an empty mapping raises an opaque ValueError; report
            # it in the same way as the other "nothing to classify" cases.
            if not tagDict:
                raise RuntimeError("no tags found on similar documents")
            return max(tagDict, key=tagDict.get)
        if method == "sorted":
            return sorted(tagDict, key=tagDict.get, reverse=True)
        return tagDict
# Shared classifier instance used by the example and the tester.
# Every setting is spelled out for the reader's benefit.
c = Classifier(
    mltUrl="http://localhost:8983/solr/mlt",
    idField="Id",
    mltFields="Title Body",
    tagField="Tags",
    filterQuery="Tags:*",
    numNearest=10,
)
And as an example, use it to find the plausible tags for a document.
# Show every candidate tag for document 8723, most frequent first.
print(c.classifyDoc("8723", method="sorted"))
[u'star-trek', u'holodeck', u'star-trek-tng', u'technology', u'magical-transportation', u'harry-potter', u'weapon', u'time-travel', u'diagon-alley']
First create a function that tests the classifier.
def classifierTester(q="*:*"):
    """Print how often the classifier's best tag is one of a document's own tags.

    Every document that has Tags and matches the Solr query ``q`` is fetched
    from the select handler and classified with the module-level classifier
    ``c``. A document counts as a hit when the returned tag appears in its
    space-separated ``Tags`` value. Documents for which ``classifyDoc``
    raises IOError or RuntimeError count as misses.

    Prints "<hits> out of <total> correct. That's <percent>%" and returns
    None. Raises IOError when the select request does not return HTTP 200.
    """
    #retrieve all documents that have Tags and match the q argument
    # A one-off request needs no long-lived Session (the original never
    # closed the one it created).
    resp = requests.get("http://localhost:8983/solr/select",
                        params={"q": q,
                                "fq": "Tags:*",
                                "fl": "Id Tags",
                                "rows": "9999999",
                                "wt": "json"})
    # Same status check the classifier applies to its own request.
    if resp.status_code != 200:
        raise IOError("HTTP Status " + str(resp.status_code))
    docs = resp.json()["response"]["docs"]

    #classify each document and count the number of matches
    count = len(docs)
    hitCount = 0
    for doc in docs:
        try:
            best = c.classifyDoc(doc["Id"])
        except (IOError, RuntimeError):
            # Only the failures classifyDoc raises on purpose are treated as
            # misses; anything else is a programming error and must surface.
            continue
        if best in doc["Tags"].split(' '):
            hitCount += 1

    # Avoid ZeroDivisionError when the query matches no documents.
    percent = 100 * float(hitCount) / count if count else 0.0
    print("{0} out of {1} correct. That's {2}%".format(hitCount, count, percent))
Perform various tests:
# Baseline run: every question that has tags (the match-all query).
classifierTester(q="*:*")
4041 out of 5805 correct. That's 69.6124031008%
# Tags present in 10 or more questions.
# The query is built from adjacent string literals so that no literal is
# broken across physical lines (the original one was, which is a syntax error).
# NOTE(review): a few tags look fused together, e.g.
# "stargate-sgstar-trek-voyager", "star-trek-dstolkien", "babylon-klingon"
# and "alien-c-3po" -- possibly characters lost in transcription. They are
# reproduced unchanged here; confirm against the index.
classifierTester(q=(
    "Tags:(harry-potter story-identification star-trek star-wars comics "
    "movie marvel-comics lord-of-the-rings dc-comics doctor-who futurama "
    "star-trek-tng tv x-men time-travel stargate books magic avengers "
    "the-matrix aliens novel a-song-of-ice-and-fire george-r-r-martin "
    "video-games short-stories batman technology game-of-thrones "
    "suggested-order superman stargate-sgstar-trek-voyager voldemort "
    "battlestar-galactica dune robots fantasy-genre star-trek-dstolkien "
    "plot canon alien-franchise fringe wolverine borg vampire "
    "rings-of-power thor horcrux weapon star-trek-tos green-lantern "
    "firefly the-walking-dead spider-man cartoon jedi isaac-asimov "
    "zombie magical-creatures spaceship languages terminator science "
    "powers the-new-star-trek-enterprise the-hunger-games buffy hogwarts "
    "dark-knight-rises ftl-drive space tv-series the-legend-of-korra "
    "history-of young-adult supernatural spells continuity "
    "phantom-menace robert-a-heinlein twilight avatar-the-last-airbender "
    "darth-vader prometheus avengers-vs-x-men religion "
    "science-fiction-genre the-hobbit hard-sci-fi larry-niven history "
    "magical-theory magical-items clones wheel-of-time middle-earth anime "
    "character-identification sith iron-man horror super-hero "
    "stargate-atlantis known-space enders-game my-little-pony "
    "transformers orson-scott-card physics computers races "
    "my-little-pony-fim elves warp stargate-universe captain-america "
    "frank-herbert john-carter the-force back-to-the-future "
    "babylon-klingon music the-incredible-hulk luke-skywalker the-hulk "
    "animals warfare neal-stephenson gandalf extended-universe "
    "h-p-lovecraft star-trek-q indiana-jones biology inception "
    "philip-k-dick tron-legacy inheritance-cycle terra-nova "
    "hitchhikers-guide space-exploration farscape character-development "
    "werewolf the-clone-wars alternate-history sauron star-trek-data "
    "highlander parallel-universe dcau cthulhu-mythos terminology "
    "blade-runner vulcan ghost tron christopher-paolini "
    "wolverine-and-the-xmen obi-wan-kenobi mass-effect neil-gaiman "
    "good-against-evil jk-rowling authors urban-fantasy terry-pratchett "
    "names paradox influences yoda han-solo warhammer40k economics "
    "alien-c-3po mistborn ringworld discworld online-resources "
    "apocalypse timeline star-trek-real-world society snow-crash "
    "the-flash quidditch brandon-sanderson eureka arthur-c-clarke)"
))
3966 out of 5297 correct. That's 74.8725693789%
# Restrict the run to tags that occur in 50 or more questions.
# NOTE(review): "stargate-sgstar-trek-voyager" looks like two fused tags;
# reproduced unchanged -- confirm against the index.
classifierTester(
    "Tags:(harry-potter story-identification star-trek star-wars comics "
    "movie marvel-comics lord-of-the-rings dc-comics doctor-who futurama "
    "star-trek-tng tv x-men time-travel stargate books magic avengers "
    "the-matrix aliens novel a-song-of-ice-and-fire george-r-r-martin "
    "video-games short-stories batman technology game-of-thrones "
    "suggested-order superman stargate-sgstar-trek-voyager voldemort "
    "battlestar-galactica dune robots fantasy-genre)"
)
3556 out of 4457 correct. That's 79.784608481%
# Restrict the run to tags that occur in 100 or more questions.
classifierTester(
    "Tags:(harry-potter story-identification star-trek star-wars comics "
    "movie marvel-comics lord-of-the-rings dc-comics doctor-who futurama "
    "star-trek-tng tv x-men time-travel stargate books magic avengers "
    "the-matrix aliens)"
)
3323 out of 4042 correct. That's 82.2117763483%
# Restrict the run to tags that occur in 500 or more questions.
classifierTester(
    "Tags:(harry-potter story-identification star-trek star-wars)"
)
2136 out of 2344 correct. That's 91.1262798635%