Below we extract a simple Tree-Substitution Grammar (TSG) from a small treebank, parse sentences with it, and show how to retrieve the tree fragments that were used in the resulting derivations.
import io
from discodop import parser, runexp, tree
# Go to a temporary directory where we will create a simple treebank
# and store the extracted grammar
%cd /tmp
# Write a toy treebank of seven bracketed trees, one tree per line, to
# treebankExample.mrg in the current working directory.
# The body of the ``with`` statement must be indented; the continuation
# lines belong to the triple-quoted string and therefore stay at column 0
# so that the file content is unchanged.
with io.open('treebankExample.mrg', 'w', encoding='utf8') as out:
    out.write(u"""(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP (DT the) (JJ hungry) (NN dog))))
(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP (DT the) (NN dog))))
(S (NP (DT The) (NN mouse)) (VP (VBP saw) (NP (DT the) (NN cat))))
(S (NP (DT The) (NN mouse)) (VP (VBP saw) (NP (DT the) (JJ yellow) (NN cat))))
(S (NP (DT The) (JJ little) (NN mouse)) (VP (VBP saw) (NP (DT the) (NN cat))))
(S (NP (DT The) (NN cat)) (VP (VBP ate) (NP (DT the) (NN dog))))
(S (NP (DT The) (NN mouse)) (VP (VBP ate) (NP (DT the) (NN cat))))
""")
/tmp
# Write a parameter file containing the specification for the grammar we
# will extract. It declares a single stage named 'dop' (mode 'pcfg',
# dop='doubledop', estimator 'rfe', objective 'mpp') and uses
# treebankExample.mrg, in 'bracket' format, as both training and test corpus.
# The body of the ``with`` statement must be indented; the continuation
# lines belong to the triple-quoted string and are written out verbatim.
with io.open('mygrammar.prm', 'w', encoding='utf8') as out:
    out.write(u"""stages=[
dict(name='dop', mode='pcfg', dop='doubledop',
m=1000, estimator='rfe', objective = 'mpp')
],
corpusfmt='bracket',
traincorpus=dict(
path='treebankExample.mrg', encoding='utf8',
numsents=7, maxwords=100),
testcorpus=dict(
path='treebankExample.mrg', encoding='utf8',
numsents=7, maxwords=100, skiptrain=False),
postagging=dict(
method='unknownword', model='4',
unknownthreshold=1, openclassthreshold=50,
simplelexsmooth=True),
binarization=dict(
method='default', factor='right',
h=1, v=1),
numproc=1, punct=None, functions=None, morphology=None, transformations=None, relationalrealizational=False, removeempty=False, ensureroot=False,
""")
# Extract the grammar using the command line interface;
# the grammar will end up in several files under /tmp/mygrammar/
!discodop grammar param mygrammar.prm mygrammar
7 training sentences after length restriction <= 100 known words: 10, signature types seen: 0 open class tags: closed class tags: DT:1 JJ:3 NN:3 VBP:2 treebank fan-out before binarization: 1 #6 (S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4)))) The mouse ate the cat binarization: default right h=1 v=1 ; cpu time elapsed: 0.001019s binarized treebank fan-out: 1 #6 extracting recurring fragments finished 0--7 getting exact counts for 25 fragments exact indices chunk 1 of 1 merged 9 cover fragments up to depth 1 with max 999 frontier non-terminals. found 34 fragments DOP model based on 7 sentences, 69 nodes, 45 nonterminals labels: 45 of which preterminals: 11 clauses: 74 lexical clauses: 21 non-lexical clauses: 53 max fan-out: 1 in 7/7 01 VP VBP NP mean: 1 max variables: 2 in 7/7 01 VP VBP NP max parsing complexity: 3 in 1/1 01 S}<13> S}<8> VBP mean 2.43243 All left hand sides sum to 1 +/- epsilon=1e-16 equal number of nodes, but not equivalent: coarse labels without mapping: { DT, DT@The, DT@the, JJ, NN, NN@cat, NN@dog, NN@mouse, NP, NP|<JJ>, ... } wrote grammar to mygrammar/dop.{rules,lex,backtransform}.gz
# The fragments that this grammar is composed of are listed in mygrammar/dop.fragments.gz
! zcat mygrammar/dop.fragments.gz | head
(NP (DT 0=) (NN 1=)) 11 (NP (DT 0=) (NP|<JJ> (JJ 1=) (NN 2=))) 3 (NP (DT 0=) (NN 1=cat)) 6 (NP (DT 0=) (NP|<JJ> 1=)) 3 (NP|<JJ> (JJ 0=) (NN 1=)) 3 (S (NP (DT 0=The) (NN 1=mouse)) (VP (VBP 2=saw) (NP 3=))) 2 (S (NP (DT 0=The) (NN 1=)) (VP (VBP 2=saw) (NP (DT 3=the) (NP|<JJ> (JJ 4=) (NN 5=))))) 2 (S (NP 0=) (VP (VBP 1=) (NP (DT 2=the) (NN 3=)))) 5 (S (NP 0=) (VP (VBP 1=saw) (NP (DT 2=the) (NN 3=)))) 3 (S (NP (DT 0=The) (NN 1=)) (VP (VBP 2=) (NP (DT 3=the) (NN 4=)))) 4
# Load the extracted grammar and build a Parser object around it:
# read params.prm from the grammar directory, let readgrammars() load the
# grammar for the configured stages, then hand the parameters to Parser.
top = 'S'  # the root label in the treebank
directory = 'mygrammar'
params = parser.readparam('/'.join((directory, 'params.prm')))
# A 'top' attribute on the parameters, if present, wins over the default.
rootlabel = getattr(params, 'top', top)
parser.readgrammars(
    directory,
    params.stages,
    params.postagging,
    top=rootlabel,
)
myparser = parser.Parser(params)
# We parse one sentence under two different probability models:
#   1. the relative frequency estimate (RFE), used in this cell;
#   2. the shortest derivation criterion, with ties broken by relative
#      frequencies (the most probable shortest derivation, MPSD), used
#      further below.
# On this small treebank both give the same result; with a larger
# treebank the two disambiguation methods can select a different best parse.
sent = 'The hungry dog ate the dog'
laststage = myparser.stages[-1]
laststage.estimator = 'rfe'
tokens = sent.split()  # the parser is given a list of words
result = [stageresult for stageresult in myparser.parse(tokens)]
result  # notebook display of the complete result list
[DictObj(msg='DOP:\titems 20, edges 22, blocked 0\n\tdisambiguation: 4 derivations, 1 parsetrees, 0.000451s\n\tp=6.407e-05 0.00s cpu time elapsed\n', name='dop', prob=6.406868578758892e-05, parsetrees=[('(S (NP (DT 0) (NP|<JJ> (JJ 1) (NN 2))) (VP (VBP 3) (NP (DT 4) (NN 5))))', 6.406868578758892e-05, ['(S (NP 0=) (VP (VBP 1=) (NP (DT 2=the) (NN 3=))))', '(NP (DT 0=) (NP|<JJ> (JJ 1=) (NN 2=)))', '(DT 0=The)', '(JJ 0=hungry)', '(NN 0=dog)', '(VBP 0=ate)', '(NN 0=dog)'])], noparse=False, golditems=0, totalgolditems=0, elapsedtime=0.0015919999999999268, parsetree=ParentedTree('S', [ParentedTree('NP', [ParentedTree('DT', [0]), ParentedTree('JJ', [1]), ParentedTree('NN', [2])]), ParentedTree('VP', [ParentedTree('VBP', [3]), ParentedTree('NP', [ParentedTree('DT', [4]), ParentedTree('NN', [5])])])]), numitems=20, fragments=['(S (NP 0=) (VP (VBP 1=) (NP (DT 2=the) (NN 3=))))', '(NP (DT 0=) (NP|<JJ> (JJ 1=) (NN 2=)))', '(DT 0=The)', '(JJ 0=hungry)', '(NN 0=dog)', '(VBP 0=ate)', '(NN 0=dog)'])]
# These are the fragments used in the Most Probable Derivation.
# result holds a single entry here (the one for the 'dop' stage); its
# ``fragments`` attribute is a list of bracketed tree strings, one per
# fragment, in which terminals are written as ``index=word``.
# NOTE(review): an ``index=`` with no word appears to mark an open
# substitution site (frontier non-terminal) -- confirm against the docs.
result[0].fragments
['(S (NP 0=) (VP (VBP 1=) (NP (DT 2=the) (NN 3=))))', '(NP (DT 0=) (NP|<JJ> (JJ 1=) (NN 2=)))', '(DT 0=The)', '(JJ 0=hungry)', '(NN 0=dog)', '(VBP 0=ate)', '(NN 0=dog)']
# Render every fragment of the derivation as a text drawing of the tree.
for fragment in result[0].fragments:
    print(tree.DrawTree(fragment))
S ┌───────┴───┐ │ VP │ ┌───────┴───┐ │ │ NP │ │ ┌───┴───┐ NP VBP DT NN │ │ │ │ ... ... the ... NP ┌───┴─────┐ │ NP|<JJ> │ ┌─────┴─────┐ DT JJ NN │ │ │ ... ... ... DT │ The JJ │ hungry NN │ dog VBP │ ate NN │ dog
# Now we switch to the Most Probable Shortest Derivation, parse the same
# sentence again, and draw the fragments of the new derivation.
# NOTE(review): 'shortest' appears to be documented as a value of the
# stage's ``objective`` option rather than of ``estimator`` -- confirm that
# this assignment really changes the disambiguation method.
laststage = myparser.stages[-1]
laststage.estimator = 'shortest'
result = list(myparser.parse(sent.split()))
for fragment in result[0].fragments:
    print(tree.DrawTree(fragment))
S ┌───────┴───┐ │ VP │ ┌───────┴───┐ │ │ NP │ │ ┌───┴───┐ NP VBP DT NN │ │ │ │ ... ... the ... NP ┌───┴─────┐ │ NP|<JJ> │ ┌─────┴─────┐ DT JJ NN │ │ │ ... ... ... DT │ The JJ │ hungry NN │ dog VBP │ ate NN │ dog