import requests
from collections import defaultdict


class Classifier(object):
    """Tag classifier that votes over the documents Solr reports as most similar.

    For a given document id, a MoreLikeThis request is sent to ``mltUrl``; the
    tags of the returned similar documents are counted and the counts are used
    to pick (or rank) tags for the document.
    """

    def __init__(self, mltUrl="http://localhost:8983/solr/mlt", idField="Id",
                 mltFields="Title Body", tagField="Tags", filterQuery="Tags:*",
                 numNearest=10):
        """Store the query configuration and open a reusable HTTP session.

        Args:
            mltUrl: URL of the MoreLikeThis request handler.
            idField: name of the field holding the document id.
            mltFields: fields used for similarity, either as one
                space-separated string or as an iterable of field names.
            tagField: name of the field holding the space-separated tags.
            filterQuery: filter query applied to the similar documents.
            numNearest: number of similar documents requested (``rows``).
        """
        self.mltUrl = mltUrl
        self.idField = idField
        # Normalise to the space-separated form sent as "mlt.fl".
        self.mltFields = (mltFields if isinstance(mltFields, str)
                          else " ".join(mltFields))
        self.tagField = tagField
        self.filterQuery = filterQuery
        self.numNearest = numNearest
        self.sess = requests.Session()

    def classifyDoc(self, docId, method="best"):
        """Classify one document from the tags of its most similar documents.

        Args:
            docId: id of the document to classify (concatenated into the query
                string, so it must be a string).
            method: ``"best"`` returns the single most frequent tag,
                ``"sorted"`` returns all tags ordered from most to least
                frequent, ``"details"`` returns the tag -> count mapping.

        Returns:
            A tag, a list of tags, or a ``defaultdict(int)`` of counts,
            depending on ``method``; ``None`` for any other ``method`` value.

        Raises:
            IOError: the HTTP status of the response is not 200.
            RuntimeError: no document matched the id, or no similar documents
                were returned for it.
        """
        # Send the MLT query to Solr.
        params = {
            "q": self.idField + ":" + docId,
            "mlt.fl": self.mltFields,
            "fl": self.tagField,
            "fq": self.filterQuery,
            "rows": self.numNearest,
            "wt": "json",
        }
        # The session lives on the instance; a bare ``sess`` is undefined here.
        resp = self.sess.get(url=self.mltUrl, params=params)

        # Perform error checking.
        if resp.status_code != 200:
            raise IOError("HTTP Status " + str(resp.status_code))
        result = resp.json()
        if int(result["match"]["numFound"]) == 0:
            raise RuntimeError("no document with that id")
        if int(result["response"]["numFound"]) == 0:
            raise RuntimeError("no interesting terms in document")

        # No errors: collect and count the tags of each similar document.
        tagDict = defaultdict(int)
        for similarDoc in result["response"]["docs"]:
            for tag in similarDoc[self.tagField].split(' '):
                tagDict[tag] += 1

        # Return the best tag, all of the tags sorted best to worst, or the
        # tags together with their counts.
        if method == "best":
            return max(tagDict, key=tagDict.get)
        elif method == "sorted":
            return sorted(tagDict, key=tagDict.get, reverse=True)
        elif method == "details":
            return tagDict
        # Unrecognised method: kept as a None result for backward compatibility.
        return None


c = Classifier(mltUrl="http://localhost:8983/solr/mlt", idField="Id",
               mltFields="Title Body", tagField="Tags", filterQuery="Tags:*",
               numNearest=10)
print(c.classifyDoc("8723", method="sorted"))


def classifierTester(q="*:*"):
    """Measure how often the classifier's best tag is one of a document's tags.

    Fetches every tagged document matching ``q`` from the Solr select handler,
    classifies each one with the module-level classifier ``c`` and prints the
    number and percentage of documents whose predicted tag appears among the
    document's own tags.

    Args:
        q: Solr query selecting the documents to evaluate.
    """
    sess = requests.Session()
    # Retrieve all documents that have Tags and match the q argument.
    resp = sess.get(url="http://localhost:8983/solr/select",
                    params={"q": q, "fq": "Tags:*", "fl": "Id Tags",
                            "rows": "9999999", "wt": "json"})
    docs = resp.json()["response"]["docs"]

    # Classify each document and count the number of matches.
    count = 0
    hitCount = 0
    for doc in docs:
        count += 1
        try:
            if c.classifyDoc(doc["Id"]) in doc["Tags"].split(' '):
                hitCount += 1
        except (IOError, RuntimeError, KeyError, ValueError):
            # A document that cannot be classified (request failure, no match,
            # malformed response) counts as a miss. Programming errors are no
            # longer swallowed here.
            pass

    # Guard against an empty result set, which would otherwise divide by zero.
    percent = 100 * float(hitCount) / count if count else 0.0
    print("{0} out of {1} correct. That's {2}%".format(hitCount, count, percent))


# NOTE(review): a few tags in the queries below look fused together (e.g.
# "stargate-sgstar-trek-voyager", "star-trek-dstolkien", "babylon-klingon");
# they are reproduced as found -- confirm against the index.

# All questions that have tags
classifierTester()

# Tags present in 10 or more questions
classifierTester(q=(
    "Tags:(harry-potter story-identification star-trek star-wars comics movie "
    "marvel-comics lord-of-the-rings dc-comics doctor-who futurama star-trek-tng "
    "tv x-men time-travel stargate books magic avengers the-matrix aliens novel "
    "a-song-of-ice-and-fire george-r-r-martin video-games short-stories batman "
    "technology game-of-thrones suggested-order superman "
    "stargate-sgstar-trek-voyager voldemort battlestar-galactica dune robots "
    "fantasy-genre star-trek-dstolkien plot canon alien-franchise fringe "
    "wolverine borg vampire rings-of-power thor horcrux weapon star-trek-tos "
    "green-lantern firefly the-walking-dead spider-man cartoon jedi isaac-asimov "
    "zombie magical-creatures spaceship languages terminator science powers "
    "the-new-star-trek-enterprise the-hunger-games buffy hogwarts "
    "dark-knight-rises ftl-drive space tv-series the-legend-of-korra history-of "
    "young-adult supernatural spells continuity phantom-menace "
    "robert-a-heinlein twilight avatar-the-last-airbender darth-vader prometheus "
    "avengers-vs-x-men religion science-fiction-genre the-hobbit hard-sci-fi "
    "larry-niven history magical-theory magical-items clones wheel-of-time "
    "middle-earth anime character-identification sith iron-man horror super-hero "
    "stargate-atlantis known-space enders-game my-little-pony transformers "
    "orson-scott-card physics computers races my-little-pony-fim elves warp "
    "stargate-universe captain-america frank-herbert john-carter the-force "
    "back-to-the-future babylon-klingon music the-incredible-hulk luke-skywalker "
    "the-hulk animals warfare neal-stephenson gandalf extended-universe "
    "h-p-lovecraft star-trek-q indiana-jones biology inception philip-k-dick "
    "tron-legacy inheritance-cycle terra-nova hitchhikers-guide "
    "space-exploration farscape character-development werewolf the-clone-wars "
    "alternate-history sauron star-trek-data highlander parallel-universe dcau "
    "cthulhu-mythos terminology blade-runner vulcan ghost tron "
    "christopher-paolini wolverine-and-the-xmen obi-wan-kenobi mass-effect "
    "neil-gaiman good-against-evil jk-rowling authors urban-fantasy "
    "terry-pratchett names paradox influences yoda han-solo warhammer40k "
    "economics alien-c-3po mistborn ringworld discworld online-resources "
    "apocalypse timeline star-trek-real-world society snow-crash the-flash "
    "quidditch brandon-sanderson eureka arthur-c-clarke)"
))

# Tags present in 50 or more questions
classifierTester(q=(
    "Tags:(harry-potter story-identification star-trek star-wars comics movie "
    "marvel-comics lord-of-the-rings dc-comics doctor-who futurama star-trek-tng "
    "tv x-men time-travel stargate books magic avengers the-matrix aliens novel "
    "a-song-of-ice-and-fire george-r-r-martin video-games short-stories batman "
    "technology game-of-thrones suggested-order superman "
    "stargate-sgstar-trek-voyager voldemort battlestar-galactica dune robots "
    "fantasy-genre)"
))

# Tags present in 100 or more questions
classifierTester(q=(
    "Tags:(harry-potter story-identification star-trek star-wars comics movie "
    "marvel-comics lord-of-the-rings dc-comics doctor-who futurama star-trek-tng "
    "tv x-men time-travel stargate books magic avengers the-matrix aliens)"
))

# Tags present in 500 or more questions
classifierTester(q="Tags:(harry-potter story-identification star-trek star-wars)")