"""Sample random English Wikipedia articles and group their text by class.

For each sampled main-namespace page, the categories of its talk page are
scanned for a name of the form "<Word>-Class". The page text is appended to
the list stored under that word, and the resulting mapping is written to
``test_class_data.json``.
"""
import json
import re
from collections import defaultdict
from itertools import islice

import pywikibot
from pywikibot import pagegenerators

enwp = pywikibot.Site('en', 'wikipedia')

# Group 1 is the word immediately preceding "-Class" in a category title.
CLASS_PATTERN = re.compile(r'(\w+)\-Class')

BATCH_COUNT = 2001        # number of randompages() requests issued
BATCH_SIZE = 25           # pages requested per randompages() call
PAGES_PER_BATCH = 24      # pages actually consumed from each request
OUTPUT_PATH = 'test_class_data.json'


def page_class(page):
    """Return the class word found in the talk-page categories of *page*.

    The categories of ``page.toggleTalkPage()`` are scanned in the order the
    library yields them; the first title containing "<Word>-Class" decides
    the result.

    Args:
        page: a pywikibot page object providing ``toggleTalkPage()``.

    Returns:
        The text captured before "-Class" (for example "Start"), or ``None``
        when no category title matches.
    """
    talk = page.toggleTalkPage()
    for cat in talk.categories():
        # Drop the namespace prefix before matching.
        # NOTE(review): raises IndexError if a title lacks "Category:" --
        # presumably never the case on enwp; confirm before reusing elsewhere.
        cat_tit = cat.title().split('Category:')[1]
        match = CLASS_PATTERN.search(cat_tit)
        if match:
            return match.group(1)
    return None


# Maps class word -> list of page texts collected for that class.
classed_pages = defaultdict(list)

# currently there is a bug in pywikibot that only allows 25 random pages at a time
for _ in range(BATCH_COUNT):
    # NOTE(review): `step` is kept from the original call; newer pywikibot
    # releases may not accept it -- confirm against the installed version.
    random_pages = enwp.randompages(
        namespaces=[0], step=BATCH_SIZE, total=BATCH_SIZE)
    # Only the first 24 pages of every request are processed, exactly as the
    # original counter-and-break did.
    for page in islice(random_pages, PAGES_PER_BATCH):
        wikiclass = page_class(page)
        if wikiclass:
            # Announce each class the first time it is encountered.
            if wikiclass not in classed_pages:
                print(wikiclass)
            classed_pages[wikiclass].append(page.get())

print("done")

# The context manager guarantees the file is flushed and closed.
with open(OUTPUT_PATH, 'w') as out_file:
    json.dump(classed_pages, out_file)

# Total number of page texts collected across all classes.
print(sum(len(texts) for texts in classed_pages.values()))