In [1]:

%matplotlib inline

In [1]:

import io
import re
import bz2
import urllib.request
import unittest.mock
import unicodedata as ud
import collections
import contextlib
import xmltodict
import tqdm

In [4]:

@contextlib.contextmanager
def stream_tqdm(f, **kwargs):
    with tqdm.tqdm(**kwargs) as pbar:
        fread = f.read
        def read(size):
            pbar.update(size)
            return fread(size)
        with unittest.mock.patch.object(f, 'read', read):
            yield

def read_ltwiki(url, *args, **kwargs):
    with urllib.request.urlopen(url) as f:
        with stream_tqdm(f, unit='B', unit_scale=True, total=int(f.headers['Content-Length'])):
            stream = bz2.open(f, 'rb')
            xmltodict.parse(stream, *args, **kwargs)

def strip_accents(s, letters='ąčęėįšųūž'):
    s = ud.normalize('NFC', s)
    s = [c if c in letters else ud.normalize('NFD', c) for c in s]
    s = [c for c in ''.join(s) if not ud.combining(c)]
    return ud.normalize('NFC', ''.join(s))

def extract_words(path, page):
    tag, attrs = path[-1]
    if tag == 'page' and page['revision']['format'] == 'text/x-wiki':
        text = page['revision']['text'].get('#text', '')
        text = strip_accents(text.lower())
        wfreq.update(filter(None, tokens_re.split(text)))
    return True

alphabet = 'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'
tokens_re = re.compile('[^%s]+' % alphabet, flags=re.IGNORECASE)

wfreq = collections.Counter()

print('reading ltwiki...')
url = 'https://dumps.wikimedia.org/ltwiki/{date}/ltwiki-{date}-pages-articles.xml.bz2'.format(date='20161201')
read_ltwiki(url, item_depth=2, item_callback=extract_words)

print('writing words.jsonl...')
with open('words.jsonl', 'w') as f:
    for word, freq in tqdm.tqdm(wfreq.most_common(), total=len(wfreq)):
        f.write(json.dumps([word, freq]) + '\n')

reading ltwiki...

147MB [10:54, 238KB/s]

writing words.jsonl...

  0%|          | 0/1404085 [00:00<?, ?it/s]

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-9ba7b72654b7> in <module>()
     41 with open('words.jsonl', 'w') as f:
     42     for word, freq in tqdm.tqdm(wfreq.most_common(), total=len(wfreq)):
---> 43         f.write(json.dumps([word, freq]) + '\n')

NameError: name 'json' is not defined

In [2]:

import os
import re
import collections
import tqdm
import gramtool
import json
import subprocess


def nlines(filename):
    return int(subprocess.run(['wc', '-l', filename], stdout=subprocess.PIPE).stdout.split()[0])


alphabet = 'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'
tokens_re = re.compile('[^%s]+' % alphabet, flags=re.IGNORECASE)

print('loading gramtool...')
gt = gramtool.GramTool()

wfreq = collections.Counter()

if os.path.exists('words.jsonl'):
    print('reading words.jsonl...')
    with open('words.jsonl') as f:
        for line in tqdm.tqdm(f, total=nlines('words.jsonl')):
            word, freq = json.loads(line)
            wfreq[word] = freq
else:
    print('reading texts.csv...')
    with open('texts.csv') as f:
        for line in tqdm.tqdm(f, total=nlines('texts.csv')):
            wfreq.update(map(str.lower, filter(None, tokens_re.split(line))))

    print('writing words.jsonl...')
    with open('words.jsonl', 'w') as f:
        for word, freq in tqdm.tqdm(wfreq.most_common(), total=len(wfreq)):
            f.write(json.dumps([word, freq]) + '\n')

stats = {}

print('detecting part of speach...')
for token, freq in tqdm.tqdm(wfreq.most_common(), total=len(wfreq)):
    poses = set()
    for rule in gt.grammar.iter_rules(token):
        for word in rule:
            poses.add(word.form.spec[0])
    stats[token] = ''.join(poses)

print('writing stats.csv...')
with open('stats.csv', 'w') as f:
    print('word,freq,n_forms,forms', file=f)
    for word, freq in wfreq.most_common():
        print('%s,%d,%d,%s' % (word, freq, len(stats[word]), stats[word]), file=f)

loading gramtool...

 10%|█         | 11504/109751 [00:00<00:00, 115039.56it/s]

reading words.jsonl...

100%|██████████| 109751/109751 [00:01<00:00, 93626.93it/s]
  0%|          | 100/109751 [00:00<01:49, 999.00it/s]

detecting part of speach...

100%|██████████| 109751/109751 [02:15<00:00, 807.20it/s]

writing stats.csv...

In [3]:

import pandas as pd
import matplotlib as mpl

In [4]:

mpl.rc('font', family='Ubuntu', size=16)
mpl.rc('figure', figsize=(16, 10))

In [5]:

data = pd.read_csv('stats.csv')

In [6]:

data.head()

Out[6]:

	word	freq	n_forms	forms
0	ir	889793	1	c
1	ar	300743	1	c
2	straipsnio	296790	1	n
3	lietuvos	228649	1	n
4	straipsnis	205871	1	n

Meaning of forms: https://github.com/sirex/gramtool/blob/master/gramtool/data/grammar.yaml#L5

# Parts of speech
pos:
  n: noun          # Daiktavardis
  a: adjective     # Būdvardis
  v: verb          # Veiksmažodis
  V: infinitive    # Bendratis
  e: adverb        # Prieveismis
  i: interjection  # Jaustukas
  p: preposition   # Prielinksnis
  c: conjunction   # Jungtukas
  P: particle      # Dalelytė

In [7]:

data.forms.value_counts().plot.barh(grid=True, figsize=(10, 14))

Out[7]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f91f79b4d68>

In [8]:

data.n_forms.value_counts().plot.bar(grid=True)

Out[8]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f91fcb3ce10>

In [9]:

data[data.n_forms==0].head(40)

Out[9]:

	word	freq	forms
11	nr	122149	NaN
29	m	72730	NaN
35	d	66065	NaN
59	p	40491	NaN
94	l	27240	NaN
102	lt	24765	NaN
113	inter	23479	NaN
115	lrs	23308	NaN
116	id	23223	NaN
117	doc	23212	NaN
118	http	23204	NaN
119	sho	23195	NaN
120	dokpaieska	23193	NaN
121	pls	23193	NaN
143	žin	19760	NaN
161	i	18230	NaN
188	a	16463	NaN
246	ip	12713	NaN
357	eb	8801	NaN
392	pvm	8067	NaN
401	t	7958	NaN
472	s	6763	NaN
523	iip	6164	NaN
567	es	5753	NaN
578	r	5641	NaN
617	nereg	5249	NaN
651	ii	4993	NaN
659	skiria	4954	NaN
697	naudotis	4704	NaN
718	vyriausioji	4587	NaN
731	ol	4533	NaN
758	b	4308	NaN
764	vyriausiosios	4249	NaN
775	verstis	4166	NaN
779	perkančioji	4117	NaN
799	esant	4012	NaN
800	vadovaudamasis	4010	NaN
802	mo	4001	NaN
804	atsižvelgdamas	3993	NaN
809	kol	3955	NaN

In [ ]: