import codecs
import json
import os, sys
from collections import defaultdict
from multiprocessing import Pool as ThreadPool
from IPython.display import display, HTML
import pandas as pd
sys.path.append(os.path.abspath('../../WKP-python-toolkit'))
import wekeypedia
# Module-level mapping handed to `count_stems` on every call made by
# `revision_stems`; presumably it collects the inflected forms seen for each
# stem -- TODO confirm against the wekeypedia toolkit.
inflections = defaultdict(dict)
# String of punctuation characters. No code visible in this file reads this
# module-level value.
ignore_list = "{}()[]<>./,;\"':!?&#=*&%"
def from_file(name):
    """Load and return the JSON document stored in the file `name`.

    The file is decoded as "utf-8-sig", so a leading UTF-8 byte-order mark,
    if present, is skipped before the content is parsed.
    """
    with codecs.open(name, "r", encoding="utf-8-sig") as f:
        return json.load(f)
def list_revisions(page):
    """Return the names of the directory entries found under "data/<page>"."""
    page_dir = "data/%s" % page
    entries = os.listdir(page_dir)
    return entries
def revision_stems(revision_filename):
    """Count the stems added and deleted by one stored revision.

    Loads the revision JSON from `revision_filename`, splits the diff found
    under rev["diff"]["*"] with `extract_plusminus`, and runs `count_stems`
    on each side, passing the module-level `inflections` mapping.

    Returns a dict with the keys "added" and "deleted".
    """
    page = wekeypedia.WikipediaPage()
    revision = from_file(revision_filename)
    raw_diff = revision["diff"]["*"]

    # Some revisions (first seen with Ethics#462124891) carry a value equal
    # to False instead of diff text; report them as having no changes.
    # The equality test (rather than `is False`) is kept on purpose.
    if raw_diff == False:  # noqa: E712
        return {"added": {}, "deleted": {}}

    plusminus = page.extract_plusminus(raw_diff)
    return {
        "added": page.count_stems(plusminus["added"], inflections),
        "deleted": page.count_stems(plusminus["deleted"], inflections),
    }
def source_stems(s):
    """Sum the added/deleted stem counts over every stored revision of a page.

    `s` is the page name; its revisions are the entries that
    `list_revisions` finds under "data/<s>". A header line and a
    carriage-return progress indicator are written to standard output.

    Returns a dict with the keys "added" and "deleted", each mapping a stem
    to its count summed over all revisions.
    """
    revisions = list_revisions(s)
    total = len(revisions)
    # int-valued defaultdicts: a stem seen for the first time starts at 0.
    result = {"added": defaultdict(int), "deleted": defaultdict(int)}

    # Single-argument parenthesised form parses under Python 2 and Python 3.
    print("%s: %s revisions" % (s, total))

    for i, r in enumerate(revisions, 1):
        # "\r" rewinds to the start of the line so each update overwrites
        # the previous one.
        sys.stdout.write("\rrevisions: %s (%s/%s)" % (r, i, total))
        sys.stdout.flush()

        stems = revision_stems("data/%s/%s" % (s, r))
        for side in ("added", "deleted"):
            counts = result[side]
            for stem, count in stems[side].items():
                counts[stem] += count

    sys.stdout.write("\r ")
    sys.stdout.flush()
    return result
def to_df(a):
    """Build a DataFrame of per-stem counts.

    `a` is a dict with the keys "added" and "deleted", each mapping a stem to
    a count. The result is indexed by stem and has the columns "added" and
    "deleted"; because the two sides are combined with an outer join, a stem
    present on only one side gets a missing value in the other column.
    """
    frames = []
    for side in ("added", "deleted"):
        counts = a[side]
        # Naming the column in the constructor keeps this valid for an empty
        # mapping, where assigning `.columns` on a column-less frame raises.
        frames.append(
            pd.DataFrame(
                {side: list(counts.values())},
                index=list(counts.keys()),
            )
        )

    df_add, df_del = frames
    return df_add.join(df_del, how="outer")
def clean_and_compute(df):
    """Drop noise terms, add net-change columns, and rank the stems.

    `df` must have the columns "added" and "deleted" and be indexed by stem.
    Index labels found in the built-in stop list (function words and wiki
    markup fragments) are dropped. Two columns are added:
    "added - deleted" and "abs(added - deleted)". Rows are sorted by
    "abs(added - deleted)" and then "added", both descending.

    Returns a new DataFrame; the frame passed in is not modified.

    NOTE(review): a stem missing from either input column yields a missing
    value in both computed columns -- confirm that this is intended.
    """
    stop_terms = [
        "a", "of", "and", "to", "the", "is", "for", "or", "in", "that", "it",
        "|", "ref", "http", "''", "``", "s", "an", "-", "=", "*", "==", "===",
        "====", "name=", "nbsp", "style=", "5px", "font-siz", "|-", "--",
        "wikiquot", "/ref", "'s",
    ]
    # Only drop labels that are present; `drop` raises on unknown labels.
    df = df.drop([w for w in stop_terms if w in df.index])

    net = df["added"] - df["deleted"]
    df["added - deleted"] = net
    df["abs(added - deleted)"] = net.abs()

    sort_keys = ["abs(added - deleted)", "added"]
    descending = [False, False]
    # `DataFrame.sort` no longer exists in current pandas; keep it only as a
    # fallback for releases that predate `sort_values`.
    if hasattr(df, "sort_values"):
        return df.sort_values(sort_keys, ascending=descending)
    return df.sort(sort_keys, ascending=descending)
# Aggregate the stem counts over the stored "Love" revisions and tabulate them.
love = to_df(source_stems("Love"))
Love: 6324 revisions
# Drop the ignored terms, add the difference columns, and sort the table.
love = clean_and_compute(love)
# Display the first 20 rows of the sorted table.
love.head(20)
stem | added | deleted | added - deleted | abs(added - deleted) |
---|---|---|---|---|
love | 41315 | 40990 | 325 | 325 |
be | 5886 | 5831 | 55 | 55 |
god | 4074 | 4024 | 50 | 50 |
with | 4237 | 4193 | 44 | 44 |
by | 4147 | 4104 | 43 | 43 |
one | 3169 | 3135 | 34 | 34 |
cite | 556 | 522 | 34 | 34 |
from | 2039 | 2007 | 32 | 32 |
thi | 3509 | 3478 | 31 | 31 |
which | 3357 | 3326 | 31 | 31 |
not | 3376 | 3346 | 30 | 30 |
on | 3083 | 3055 | 28 | 28 |
are | 3558 | 3531 | 27 | 27 |
other | 2959 | 2934 | 25 | 25 |
use | 2679 | 2654 | 25 | 25 |
also | 1620 | 1595 | 25 | 25 |
human | 2025 | 2001 | 24 | 24 |
word | 2679 | 2656 | 23 | 23 |
romant | 2364 | 2341 | 23 | 23 |
term | 1731 | 1708 | 23 | 23 |
# Aggregate the stem counts over the stored "Wisdom" revisions and tabulate them.
wisdom = to_df(source_stems("Wisdom"))
Wisdom: 1634 revisions
# Drop the ignored terms, add the difference columns, and sort the table.
wisdom = clean_and_compute(wisdom)
# Display the first 20 rows of the sorted table.
wisdom.head(20)
stem | added | deleted | added - deleted | abs(added - deleted) |
---|---|---|---|---|
wisdom | 3140 | 3025 | 115 | 115 |
with | 1093 | 1064 | 29 | 29 |
be | 699 | 675 | 24 | 24 |
he | 506 | 484 | 22 | 22 |
cite | 138 | 116 | 22 | 22 |
one | 645 | 624 | 21 | 21 |
wise | 619 | 599 | 20 | 20 |
from | 508 | 488 | 20 | 20 |
by | 494 | 475 | 19 | 19 |
which | 252 | 234 | 18 | 18 |
not | 416 | 399 | 17 | 17 |
are | 421 | 405 | 16 | 16 |
person | 308 | 292 | 16 | 16 |
virtu | 394 | 379 | 15 | 15 |
knowledg | 808 | 794 | 14 | 14 |
who | 304 | 290 | 14 | 14 |
god | 270 | 256 | 14 | 14 |
proverb | 97 | 83 | 14 | 14 |
thi | 502 | 489 | 13 | 13 |
other | 375 | 362 | 13 | 13 |
# Aggregate the stem counts over the stored "Morality" revisions and tabulate them.
morality = to_df(source_stems("Morality"))
Morality: 2776 revisions
# Drop the ignored terms, add the difference columns, and sort the table.
morality = clean_and_compute(morality)
# Display the first 20 rows of the sorted table.
morality.head(20)
stem | added | deleted | added - deleted | abs(added - deleted) |
---|---|---|---|---|
moral | 7595 | 7421 | 174 | 174 |
cite | 728 | 665 | 63 | 63 |
journal | 620 | 565 | 55 | 55 |
on | 1577 | 1530 | 47 | 47 |
be | 1934 | 1898 | 36 | 36 |
with | 1347 | 1313 | 34 | 34 |
religion | 956 | 923 | 33 | 33 |
are | 1661 | 1629 | 32 | 32 |
by | 1097 | 1066 | 31 | 31 |
ethic | 1356 | 1326 | 30 | 30 |
from | 1106 | 1078 | 28 | 28 |
thi | 1005 | 978 | 27 | 27 |
cultur | 923 | 897 | 26 | 26 |
other | 780 | 756 | 24 | 24 |
year | 240 | 216 | 24 | 24 |
have | 824 | 801 | 23 | 23 |
behavior | 776 | 753 | 23 | 23 |
studi | 602 | 579 | 23 | 23 |
page | 197 | 174 | 23 | 23 |
theori | 571 | 549 | 22 | 22 |
# Aggregate the stem counts over the stored "Ethics" revisions and tabulate them.
ethics = to_df(source_stems("Ethics"))
Ethics: 3739 revisions
# Drop the ignored terms, add the difference columns, and sort the table.
ethics = clean_and_compute(ethics)
# Display the first 20 rows of the sorted table.
ethics.head(20)
stem | added | deleted | added - deleted | abs(added - deleted) |
---|---|---|---|---|
ethic | 10236 | 10012 | 224 | 224 |
moral | 2874 | 2800 | 74 | 74 |
be | 3130 | 3066 | 64 | 64 |
are | 2791 | 2744 | 47 | 47 |
on | 1924 | 1877 | 47 | 47 |
right | 2024 | 1982 | 42 | 42 |
philosophi | 1356 | 1318 | 38 | 38 |
not | 1585 | 1548 | 37 | 37 |
with | 1681 | 1645 | 36 | 36 |
by | 1795 | 1761 | 34 | 34 |
cite | 320 | 289 | 31 | 31 |
wa | 1461 | 1431 | 30 | 30 |
one | 1430 | 1401 | 29 | 29 |
good | 1406 | 1377 | 29 | 29 |
virtu | 782 | 753 | 29 | 29 |
theori | 1752 | 1725 | 27 | 27 |
thi | 1514 | 1487 | 27 | 27 |
action | 831 | 804 | 27 | 27 |
person | 1333 | 1307 | 26 | 26 |
from | 1111 | 1087 | 24 | 24 |
# Write each page's term table to a UTF-8 encoded CSV file under data/.
love.to_csv("data/love.terms.csv", encoding="utf-8")
wisdom.to_csv("data/wisdom.terms.csv", encoding="utf-8")
ethics.to_csv("data/ethics.terms.csv", encoding="utf-8")
morality.to_csv("data/morality.terms.csv", encoding="utf-8")
# Rows of the "Love" table whose "added - deleted" value is negative
# (deleted more often than added); display the first 20 in the current order.
love[ love["added - deleted"] < 0 ].head(20)
stem | added | deleted | added - deleted | abs(added - deleted) |
---|---|---|---|---|
nature, | 1 | 7 | -6 | 6 |
band | 175 | 180 | -5 | 5 |
love== | 160 | 165 | -5 | 5 |
br | 225 | 229 | -4 | 4 |
13 | 157 | 161 | -4 | 4 |
februari | 44 | 48 | -4 | 4 |
|love | 30 | 34 | -4 | 4 |
fact|dat | 4 | 8 | -4 | 4 |
food, | 2 | 6 | -4 | 4 |
organizations, | 2 | 6 | -4 | 4 |
g | 102 | 105 | -3 | 3 |
iniqu | 9 | 12 | -3 | 3 |
/b | 7 | 10 | -3 | 3 |
86 | 7 | 10 | -3 | 3 |
feelings, | 5 | 8 | -3 | 3 |
id | 404 | 406 | -2 | 2 |
me | 227 | 229 | -2 | 2 |
|ero | 223 | 225 | -2 | 2 |
etc | 190 | 192 | -2 | 2 |
patient | 173 | 175 | -2 | 2 |