#!/usr/bin/env python
# coding: utf-8

# Getting tree fragments from TSG derivations
# -------------------------------------------
# 
# Below we extract a simple Tree-Substitution Grammar (TSG) and parse sentences with it,
# and show which tree fragments were used in the derivations and how to extract them.

# In[1]:


import io
from discodop import parser, runexp, tree


# In[2]:


# Switch to a temporary directory; the example treebank is created here
# and the extracted grammar is stored here as well.
workdir = '/tmp'
get_ipython().run_line_magic('cd', workdir)
# The toy treebank: seven bracketed trees, one per line.
treebank = u"""(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP (DT the) (JJ hungry) (NN dog))))
(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP (DT the) (NN dog))))
(S (NP (DT The) (NN mouse)) (VP (VBP saw) (NP (DT the) (NN cat))))
(S (NP (DT The) (NN mouse)) (VP (VBP saw) (NP (DT the) (JJ yellow) (NN cat))))
(S (NP (DT The) (JJ little) (NN mouse)) (VP (VBP saw) (NP (DT the) (NN cat))))
(S (NP (DT The) (NN cat)) (VP (VBP ate) (NP (DT the) (NN dog))))
(S (NP (DT The) (NN mouse)) (VP (VBP ate) (NP (DT the) (NN cat))))
"""
# Store it in the current directory as UTF-8 text.
with io.open('treebankExample.mrg', 'w', encoding='utf8') as treebankfile:
    treebankfile.write(treebank)


# In[3]:


# Specification of the grammar we will extract, in parameter file syntax:
# a single 'dop' stage, with the example treebank used both as training
# and as test corpus.
grammar_spec = u"""stages=[
    dict(name='dop', mode='pcfg', dop='doubledop',
        m=1000, estimator='rfe', objective = 'mpp')
],
corpusfmt='bracket',
traincorpus=dict(
    path='treebankExample.mrg', encoding='utf8',
    numsents=7, maxwords=100),
testcorpus=dict(
    path='treebankExample.mrg', encoding='utf8',
    numsents=7, maxwords=100, skiptrain=False),
postagging=dict(
    method='unknownword', model='4',
    unknownthreshold=1, openclassthreshold=50,
    simplelexsmooth=True),
binarization=dict(
    method='default', factor='right',
    h=1, v=1),
numproc=1, punct=None, functions=None, morphology=None, transformations=None, relationalrealizational=False, removeempty=False, ensureroot=False,
"""
# Write the parameter file to the current directory as UTF-8 text.
with io.open('mygrammar.prm', 'w', encoding='utf8') as prmfile:
    prmfile.write(grammar_spec)


# In[4]:


# Run the discodop command line tool on the parameter file to extract
# the grammar; the result ends up in several files under /tmp/mygrammar/
extract_command = 'discodop grammar param mygrammar.prm mygrammar'
get_ipython().system(extract_command)


# In[5]:


# mygrammar/dop.fragments.gz lists the fragments that this grammar is
# composed of; print the first lines of the decompressed file.
peek_command = ' zcat mygrammar/dop.fragments.gz | head'
get_ipython().system(peek_command)


# In[6]:


# Read the parameters stored alongside the extracted grammar, load the
# grammars for its stages, and build a Parser object from them.
directory = 'mygrammar'
top = 'S'  # the root label in the treebank
params = parser.readparam('%s/params.prm' % directory)
# A root label given in the parameters takes precedence over the default.
rootlabel = getattr(params, 'top', top)
parser.readgrammars(
    directory,
    params.stages,
    params.postagging,
    top=rootlabel,
)
myparser = parser.Parser(params)


# In[7]:


# The same sentence is parsed under two different probability models:
#   1. the relative frequency estimate (RFE);
#   2. the shortest derivation criterion, where ties are broken by
#      relative frequencies (the most probable shortest derivation, MPSD).
# On this small treebank both give the same result; with a larger
# treebank the two disambiguation methods can select a different best parse.
sent = 'The hungry dog ate the dog'
# First model: relative frequency estimate, set on the last stage.
laststage = myparser.stages[-1]
laststage.estimator = 'rfe'
tokens = sent.split()
result = list(myparser.parse(tokens))
result


# In[8]:


# The fragments used in the Most Probable Derivation,
# taken from the first entry of the parse results.
firstresult = result[0]
firstresult.fragments


# In[9]:


# Draw each fragment of the first parse result as a tree.
for fragment in result[0].fragments:
    print(tree.DrawTree(fragment))


# In[10]:


# Switch the last stage to the Most Probable Shortest Derivation,
# parse the same sentence again, and draw the fragments that were used.
laststage = myparser.stages[-1]
laststage.estimator = 'shortest'
result = list(myparser.parse(sent.split()))
for fragment in result[0].fragments:
    print(tree.DrawTree(fragment))