%matplotlib inline
import io
import re
import bz2
import urllib.request
import unittest.mock
import unicodedata as ud
import collections
import contextlib
import xmltodict
import tqdm
@contextlib.contextmanager
def stream_tqdm(f, **kwargs):
with tqdm.tqdm(**kwargs) as pbar:
fread = f.read
def read(size):
pbar.update(size)
return fread(size)
with unittest.mock.patch.object(f, 'read', read):
yield
def read_ltwiki(url, *args, **kwargs):
with urllib.request.urlopen(url) as f:
with stream_tqdm(f, unit='B', unit_scale=True, total=int(f.headers['Content-Length'])):
stream = bz2.open(f, 'rb')
xmltodict.parse(stream, *args, **kwargs)
def strip_accents(s, letters='ąčęėįšųūž'):
s = ud.normalize('NFC', s)
s = [c if c in letters else ud.normalize('NFD', c) for c in s]
s = [c for c in ''.join(s) if not ud.combining(c)]
return ud.normalize('NFC', ''.join(s))
def extract_words(path, page):
tag, attrs = path[-1]
if tag == 'page' and page['revision']['format'] == 'text/x-wiki':
text = page['revision']['text'].get('#text', '')
text = strip_accents(text.lower())
wfreq.update(filter(None, tokens_re.split(text)))
return True
alphabet = 'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'
tokens_re = re.compile('[^%s]+' % alphabet, flags=re.IGNORECASE)
wfreq = collections.Counter()
print('reading ltwiki...')
url = 'https://dumps.wikimedia.org/ltwiki/{date}/ltwiki-{date}-pages-articles.xml.bz2'.format(date='20161201')
read_ltwiki(url, item_depth=2, item_callback=extract_words)
print('writing words.jsonl...')
with open('words.jsonl', 'w') as f:
for word, freq in tqdm.tqdm(wfreq.most_common(), total=len(wfreq)):
f.write(json.dumps([word, freq]) + '\n')
reading ltwiki...
147MB [10:54, 238KB/s]
writing words.jsonl...
0%| | 0/1404085 [00:00<?, ?it/s]
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-4-9ba7b72654b7> in <module>() 41 with open('words.jsonl', 'w') as f: 42 for word, freq in tqdm.tqdm(wfreq.most_common(), total=len(wfreq)): ---> 43 f.write(json.dumps([word, freq]) + '\n') NameError: name 'json' is not defined
import os
import re
import collections
import tqdm
import gramtool
import json
import subprocess
def nlines(filename):
return int(subprocess.run(['wc', '-l', filename], stdout=subprocess.PIPE).stdout.split()[0])
alphabet = 'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'
tokens_re = re.compile('[^%s]+' % alphabet, flags=re.IGNORECASE)
print('loading gramtool...')
gt = gramtool.GramTool()
wfreq = collections.Counter()
if os.path.exists('words.jsonl'):
print('reading words.jsonl...')
with open('words.jsonl') as f:
for line in tqdm.tqdm(f, total=nlines('words.jsonl')):
word, freq = json.loads(line)
wfreq[word] = freq
else:
print('reading texts.csv...')
with open('texts.csv') as f:
for line in tqdm.tqdm(f, total=nlines('texts.csv')):
wfreq.update(map(str.lower, filter(None, tokens_re.split(line))))
print('writing words.jsonl...')
with open('words.jsonl', 'w') as f:
for word, freq in tqdm.tqdm(wfreq.most_common(), total=len(wfreq)):
f.write(json.dumps([word, freq]) + '\n')
stats = {}
print('detecting part of speach...')
for token, freq in tqdm.tqdm(wfreq.most_common(), total=len(wfreq)):
poses = set()
for rule in gt.grammar.iter_rules(token):
for word in rule:
poses.add(word.form.spec[0])
stats[token] = ''.join(poses)
print('writing stats.csv...')
with open('stats.csv', 'w') as f:
print('word,freq,n_forms,forms', file=f)
for word, freq in wfreq.most_common():
print('%s,%d,%d,%s' % (word, freq, len(stats[word]), stats[word]), file=f)
loading gramtool...
10%|█ | 11504/109751 [00:00<00:00, 115039.56it/s]
reading words.jsonl...
100%|██████████| 109751/109751 [00:01<00:00, 93626.93it/s] 0%| | 100/109751 [00:00<01:49, 999.00it/s]
detecting part of speach...
100%|██████████| 109751/109751 [02:15<00:00, 807.20it/s]
writing stats.csv...
import pandas as pd
import matplotlib as mpl
mpl.rc('font', family='Ubuntu', size=16)
mpl.rc('figure', figsize=(16, 10))
data = pd.read_csv('stats.csv')
data.head()
word | freq | n_forms | forms | |
---|---|---|---|---|
0 | ir | 889793 | 1 | c |
1 | ar | 300743 | 1 | c |
2 | straipsnio | 296790 | 1 | n |
3 | lietuvos | 228649 | 1 | n |
4 | straipsnis | 205871 | 1 | n |
Meaning of forms: https://github.com/sirex/gramtool/blob/master/gramtool/data/grammar.yaml#L5
# Parts of speech
pos:
n: noun # Daiktavardis
a: adjective # Būdvardis
v: verb # Veiksmažodis
V: infinitive # Bendratis
e: adverb # Prieveismis
i: interjection # Jaustukas
p: preposition # Prielinksnis
c: conjunction # Jungtukas
P: particle # Dalelytė
data.forms.value_counts().plot.barh(grid=True, figsize=(10, 14))
<matplotlib.axes._subplots.AxesSubplot at 0x7f91f79b4d68>
data.n_forms.value_counts().plot.bar(grid=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7f91fcb3ce10>
data[data.n_forms==0].head(40)
word | freq | n_forms | forms | |
---|---|---|---|---|
11 | nr | 122149 | 0 | NaN |
29 | m | 72730 | 0 | NaN |
35 | d | 66065 | 0 | NaN |
59 | p | 40491 | 0 | NaN |
94 | l | 27240 | 0 | NaN |
102 | lt | 24765 | 0 | NaN |
113 | inter | 23479 | 0 | NaN |
115 | lrs | 23308 | 0 | NaN |
116 | id | 23223 | 0 | NaN |
117 | doc | 23212 | 0 | NaN |
118 | http | 23204 | 0 | NaN |
119 | sho | 23195 | 0 | NaN |
120 | dokpaieska | 23193 | 0 | NaN |
121 | pls | 23193 | 0 | NaN |
143 | žin | 19760 | 0 | NaN |
161 | i | 18230 | 0 | NaN |
188 | a | 16463 | 0 | NaN |
246 | ip | 12713 | 0 | NaN |
357 | eb | 8801 | 0 | NaN |
392 | pvm | 8067 | 0 | NaN |
401 | t | 7958 | 0 | NaN |
472 | s | 6763 | 0 | NaN |
523 | iip | 6164 | 0 | NaN |
567 | es | 5753 | 0 | NaN |
578 | r | 5641 | 0 | NaN |
617 | nereg | 5249 | 0 | NaN |
651 | ii | 4993 | 0 | NaN |
659 | skiria | 4954 | 0 | NaN |
697 | naudotis | 4704 | 0 | NaN |
718 | vyriausioji | 4587 | 0 | NaN |
731 | ol | 4533 | 0 | NaN |
758 | b | 4308 | 0 | NaN |
764 | vyriausiosios | 4249 | 0 | NaN |
775 | verstis | 4166 | 0 | NaN |
779 | perkančioji | 4117 | 0 | NaN |
799 | esant | 4012 | 0 | NaN |
800 | vadovaudamasis | 4010 | 0 | NaN |
802 | mo | 4001 | 0 | NaN |
804 | atsižvelgdamas | 3993 | 0 | NaN |
809 | kol | 3955 | 0 | NaN |