Notebook

Getting tree fragments from TSG derivations¶

Below we extract a simple Tree-Substitution Grammar (TSG) from a small treebank, parse a sentence with it, and show how to obtain the tree fragments that were used in the resulting derivations.

In [1]:

import io
from discodop import parser, runexp, tree

In [2]:

# Go to a temporary directory where we will create a simple treebank
# and store the extracted grammar.
# The treebank has 7 sentences in bracket notation, one tree per line.
%cd /tmp
with io.open('treebankExample.mrg', 'w', encoding='utf8') as out:
    out.write(u"""(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP (DT the) (JJ hungry) (NN dog))))
(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP (DT the) (NN dog))))
(S (NP (DT The) (NN mouse)) (VP (VBP saw) (NP (DT the) (NN cat))))
(S (NP (DT The) (NN mouse)) (VP (VBP saw) (NP (DT the) (JJ yellow) (NN cat))))
(S (NP (DT The) (JJ little) (NN mouse)) (VP (VBP saw) (NP (DT the) (NN cat))))
(S (NP (DT The) (NN cat)) (VP (VBP ate) (NP (DT the) (NN dog))))
(S (NP (DT The) (NN mouse)) (VP (VBP ate) (NP (DT the) (NN cat))))
""")

/tmp

In [3]:

# Write a parameter file containing the specification for the grammar we will extract:
# a single Double-DOP stage (named 'dop') parsed in PCFG mode, with the same
# 7-sentence treebank used as both training and test corpus.
with io.open('mygrammar.prm', 'w', encoding='utf8') as out:
    out.write(u"""stages=[
    dict(name='dop', mode='pcfg', dop='doubledop',
        m=1000, estimator='rfe', objective = 'mpp')
],
corpusfmt='bracket',
traincorpus=dict(
    path='treebankExample.mrg', encoding='utf8',
    numsents=7, maxwords=100),
testcorpus=dict(
    path='treebankExample.mrg', encoding='utf8',
    numsents=7, maxwords=100, skiptrain=False),
postagging=dict(
    method='unknownword', model='4',
    unknownthreshold=1, openclassthreshold=50,
    simplelexsmooth=True),
binarization=dict(
    method='default', factor='right',
    h=1, v=1),
numproc=1, punct=None, functions=None, morphology=None, transformations=None, relationalrealizational=False, removeempty=False, ensureroot=False,
""")

In [4]:

# Extract the grammar using the command line interface;
# the grammar will end up in several files under /tmp/mygrammar/
# Arguments: the parameter file (mygrammar.prm), then the output directory (mygrammar).
!discodop grammar param mygrammar.prm mygrammar

7 training sentences after length restriction <= 100
known words: 10, signature types seen: 0
open class tags: 

closed class tags: DT:1 JJ:3 NN:3 VBP:2
treebank fan-out before binarization: 1 #6
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4))))
The mouse ate the cat
binarization: default right h=1 v=1 ; cpu time elapsed: 0.001019s
binarized treebank fan-out: 1 #6
extracting recurring fragments
finished 0--7
getting exact counts for 25 fragments
exact indices chunk 1 of 1
merged 9 cover fragments up to depth 1 with max 999 frontier non-terminals.
found 34 fragments
DOP model based on 7 sentences, 69 nodes, 45 nonterminals
labels: 45 of which preterminals: 11
clauses: 74  lexical clauses: 21 non-lexical clauses: 53
max fan-out: 1 in 7/7 01	VP VBP NP mean: 1
max variables: 2 in 7/7 01	VP VBP NP
max parsing complexity: 3 in 1/1 01	S}<13> S}<8> VBP mean 2.43243
All left hand sides sum to 1 +/- epsilon=1e-16
equal number of nodes, but not equivalent:
coarse labels without mapping: { DT, DT@The, DT@the, JJ, NN, NN@cat, NN@dog, NN@mouse, NP, NP|<JJ>, ... }
wrote grammar to mygrammar/dop.{rules,lex,backtransform}.gz

In [5]:

# The fragments that this grammar is composed of are listed in mygrammar/dop.fragments.gz
# Each line holds one fragment, a tab, and the fragment's frequency;
# only the first 10 lines are shown.
! zcat mygrammar/dop.fragments.gz | head

(NP (DT 0=) (NN 1=))	11
(NP (DT 0=) (NP|<JJ> (JJ 1=) (NN 2=)))	3
(NP (DT 0=) (NN 1=cat))	6
(NP (DT 0=) (NP|<JJ> 1=))	3
(NP|<JJ> (JJ 0=) (NN 1=))	3
(S (NP (DT 0=The) (NN 1=mouse)) (VP (VBP 2=saw) (NP 3=)))	2
(S (NP (DT 0=The) (NN 1=)) (VP (VBP 2=saw) (NP (DT 3=the) (NP|<JJ> (JJ 4=) (NN 5=)))))	2
(S (NP 0=) (VP (VBP 1=) (NP (DT 2=the) (NN 3=))))	5
(S (NP 0=) (VP (VBP 1=saw) (NP (DT 2=the) (NN 3=))))	3
(S (NP (DT 0=The) (NN 1=)) (VP (VBP 2=) (NP (DT 3=the) (NN 4=))))	4

In [6]:

# Read the stored parameters, load the grammar files for every stage,
# and wrap everything in a Parser object.
top = 'S'  # root label of the treebank; fallback when the parameters define no 'top'
directory = 'mygrammar'
params = parser.readparam('%s/params.prm' % directory)
root_label = getattr(params, 'top', top)
parser.readgrammars(
    directory,
    params.stages,
    params.postagging,
    top=root_label)
myparser = parser.Parser(params)

In [7]:

# We parse one sentence under two different disambiguation criteria:
#   1. the relative frequency estimate (RFE), selected in this cell;
#   2. the shortest derivation criterion, with ties broken by relative
#      frequencies (the most probable shortest derivation, MPSD).
# On this small treebank both give the same result, but with a larger
# treebank the two methods can select a different best parse.
sent = 'The hungry dog ate the dog'
tokens = sent.split()
myparser.stages[-1].estimator = 'rfe'
result = list(myparser.parse(tokens))
result

Out[7]:

[DictObj(msg='DOP:\titems 20, edges 22, blocked 0\n\tdisambiguation: 4 derivations, 1 parsetrees, 0.000451s\n\tp=6.407e-05 0.00s cpu time elapsed\n',
 	name='dop',
 	prob=6.406868578758892e-05,
 	parsetrees=[('(S (NP (DT 0) (NP|<JJ> (JJ 1) (NN 2))) (VP (VBP 3) (NP (DT 4) (NN 5))))', 6.406868578758892e-05, ['(S (NP 0=) (VP (VBP 1=) (NP (DT 2=the) (NN 3=))))', '(NP (DT 0=) (NP|<JJ> (JJ 1=) (NN 2=)))', '(DT 0=The)', '(JJ 0=hungry)', '(NN 0=dog)', '(VBP 0=ate)', '(NN 0=dog)'])],
 	noparse=False,
 	golditems=0,
 	totalgolditems=0,
 	elapsedtime=0.0015919999999999268,
 	parsetree=ParentedTree('S', [ParentedTree('NP', [ParentedTree('DT', [0]), ParentedTree('JJ', [1]), ParentedTree('NN', [2])]), ParentedTree('VP', [ParentedTree('VBP', [3]), ParentedTree('NP', [ParentedTree('DT', [4]), ParentedTree('NN', [5])])])]),
 	numitems=20,
 	fragments=['(S (NP 0=) (VP (VBP 1=) (NP (DT 2=the) (NN 3=))))', '(NP (DT 0=) (NP|<JJ> (JJ 1=) (NN 2=)))', '(DT 0=The)', '(JJ 0=hungry)', '(NN 0=dog)', '(VBP 0=ate)', '(NN 0=dog)'])]

In [8]:

# These are the fragments used in the Most Probable Derivation;
# result[0] is the output of the first (and here only) parsing stage, 'dop'.
result[0].fragments

Out[8]:

['(S (NP 0=) (VP (VBP 1=) (NP (DT 2=the) (NN 3=))))',
 '(NP (DT 0=) (NP|<JJ> (JJ 1=) (NN 2=)))',
 '(DT 0=The)',
 '(JJ 0=hungry)',
 '(NN 0=dog)',
 '(VBP 0=ate)',
 '(NN 0=dog)']

In [9]:

# Draw every fragment of the derivation as an ASCII-art tree.
for fragment in result[0].fragments:
    print(tree.DrawTree(fragment))

         S             
 ┌───────┴───┐          
 │           VP        
 │   ┌───────┴───┐      
 │   │           NP    
 │   │       ┌───┴───┐  
 NP VBP      DT      NN
 │   │       │       │  
... ...     the     ...

     NP            
 ┌───┴─────┐        
 │      NP|<JJ>    
 │   ┌─────┴─────┐  
 DT  JJ          NN
 │   │           │  
... ...         ...

 DT
 │  
The

  JJ  
  │    
hungry

 NN
 │  
dog

VBP
 │  
ate

 NN
 │  
dog

In [10]:

# Now we switch to the Most Probable Shortest Derivation, re-parse the
# same sentence, and draw the fragments of the resulting derivation.
# NOTE(review): discodop's parameter documentation appears to list 'shortest'
# as a value for `objective` rather than `estimator` -- confirm that this
# assignment actually changes the disambiguation method.
myparser.stages[-1].estimator = 'shortest'
result = list(myparser.parse(sent.split()))
for fragment in result[0].fragments:
    print(tree.DrawTree(fragment))

         S             
 ┌───────┴───┐          
 │           VP        
 │   ┌───────┴───┐      
 │   │           NP    
 │   │       ┌───┴───┐  
 NP VBP      DT      NN
 │   │       │       │  
... ...     the     ...

     NP            
 ┌───┴─────┐        
 │      NP|<JJ>    
 │   ┌─────┴─────┐  
 DT  JJ          NN
 │   │           │  
... ...         ...

 DT
 │  
The

  JJ  
  │    
hungry

 NN
 │  
dog

VBP
 │  
ate

 NN
 │  
dog