The data was exported from an Excel file, in Open Office, with "Save As"->"Text CSV", as "UTF-8", "{Tabulator}" as field seperator and double quotes (") as text seperator.
%load_ext autoreload
%autoreload 2
import helpers.diana
skip_lines = [
"Satire_and_humor_01_Two travellers",
"Satire_and_humor_02_The king's dispute",
"Satire_and_humor_03_King and slave",
"Satire_and_humor_04_Herdsman and Shamkhal",
"Satire_and_humor_05_Bratak and the rich",
"Satire_and_humor_06_Shephard and khan",
"Satire_and_humor_07_Bolo and the old woman",
"Satire_and_humor_08_King and fool",
"Satire_and_humor_09_Shax Abbas and the widower's son",
"Satire_and_humor_10_Shakh Abbas and the mother",
"Satire_and_humor_11_Shakh Abbas and the man",
"Satire_and_humor_12_Master and boat",
"Satire_and_humor_13_Craft of the smith",
"Satire_and_humor_14_Mulla and suslik",
"Satire_and_humor_15_Mullah and the poor",
"Satire_and_humor_16_Mention me in the prayer",
"Satire_and_humor_17_Big alkham",
"Satire_and_humor_18_Chief and burried",
"Satire_and_humor_19_Khinkal in paradise",
"Satire_and_humor_20_The horse's pace",
"Satire_and_humor_21_Cat",
"Satire_and_humor_22_Man_bought_kuvshin",
"Satire_and_humor_23_Crying for the son",
"Satire_and_humor_24_The escaping hare",
"Satire_and_humor_25_broken",
"Satire_and_humor_26_reversed legs",
"Satire_and_humor_27_friendship with the snow",
"Satire_and_humor_28_Man and bird",
"nice reflexive, but not clear whether the second one is in a relative clause or in the main clause",
"looks like reflexive in a postpositional phrase"
]
tier_numbers = {
"clause_id": 5,
"clause_type": 6,
"grammatical_relation": 7,
"pos_agreement": 9,
"last_line": 11
}
ag = helpers.diana.from_excel("data/AvarAnnotation.csv", skip_lines=skip_lines, tier_numbers=tier_numbers)
Error: duplicate clause ID: #3 Error: duplicate clause ID: #627 Error: duplicate clause ID: #628
import collections
verbs = [ 'COP', 'SAY', 'v.tr', 'v.intr', 'v.aff' ]
verb_map = { v: "V" for v in verbs}
others = [ 'A', 'S', 'P', 'EXP', 'STIM', 'zero-A', 'zero-S', 'zero-P', 'zero-EXP', 'zero-STIM' ]
search_terms = verbs + others
word_orders = collections.defaultdict(int)
word_orders_ids = collections.defaultdict(list)
for wo in helpers.diana.word_orders(ag, search_terms):
word_orders[tuple(wo.word_order)] += 1
word_orders_ids[tuple(wo.word_order)].append(wo.clause_id)
for word_order, count in word_orders.items():
print("{0} => {1}".format(word_order, count))
if count < 5:
print(" {1}".format(word_order, word_orders_ids[word_order]))
('S', 'v.intr') => 172 ('SAY', 'A') => 50 ('EXP', 'zero-STIM', 'v.aff') => 4 ['clause_id..n#78', 'clause_id..n#611', 'clause_id..n#64', 'clause_id..n#57'] ('zero-EXP', 'STIM', 'v.aff') => 10 ('zero-EXP', 'zero-STIM', 'v.aff') => 2 ['clause_id..n#706', 'clause_id..n#295'] ('zero-S', 'v.intr') => 125 ('A', 'SAY') => 61 ('EXP', 'STIM', 'v.aff') => 13 ('v.aff', 'EXP', 'STIM') => 1 ['clause_id..n#270'] ('STIM', 'v.aff', 'EXP') => 3 ['clause_id..n#97', 'clause_id..n#98', 'clause_id..n#417'] ('v.intr', 'S') => 34 ('zero-S', 'COP') => 20 ('v.tr', 'P', 'A') => 1 ['clause_id..n#443'] ('SAY', 'zero-A') => 2 ['clause_id..n#244', 'clause_id..n#688'] ('zero-A', 'v.tr', 'zero-P') => 1 ['clause_id..n#120'] ('zero-A', 'P', 'v.tr') => 134 ('STIM', 'EXP', 'v.aff') => 4 ['clause_id..n#359', 'clause_id..n#99', 'clause_id..n#52', 'clause_id..n#997'] ('zero-P', 'v.tr', 'A') => 3 ['clause_id..n#880', 'clause_id..n#236', 'clause_id..n#274'] ('A', 'P', 'v.tr') => 85 ('P', 'v.tr', 'A') => 16 ('P', 'v.tr', 'zero-A') => 1 ['clause_id..n#6'] ('S', 'COP') => 101 ('v.tr', 'A', 'P') => 10 ('A', 'zero-P', 'v.tr') => 17 ('P', 'zero-A', 'v.tr') => 1 ['clause_id..n#185'] ('zero-A', 'SAY') => 27 ('A', 'v.tr', 'P') => 12 ('zero-A', 'zero-P', 'v.tr') => 41 ('COP', 'S') => 15 ('zero-A', 'v.tr', 'P') => 14 ('P', 'A', 'v.tr') => 13 ('EXP', 'v.aff', 'STIM') => 6
word_orders_main = []
word_orders_main_count = collections.defaultdict(int)
word_orders_sub = []
word_orders_sub_count = collections.defaultdict(int)
main_clause_types = [ "m", "m.rs" ]
sub_clause_types = [ "sub", "sub.rs" ]
clause_types = main_clause_types + sub_clause_types
search_terms = verbs + ['A', 'S', 'P', 'EXP', 'STIM']
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
if "V" in wo.word_order and wo.clause_type in clause_types and len(wo.word_order) > 1:
if wo.clause_type in sub_clause_types:
word_orders_sub.append(wo.word_order)
word_orders_sub_count[tuple(wo.word_order)] += 1
else:
word_orders_main.append(wo.word_order)
word_orders_main_count[tuple(wo.word_order)] += 1
Hypothesis H0: It does not depend on the clause type (sub vs. main) whether the clause unit is verb final.
main_v_fin = 0; main_v_nonfin = 0; sub_v_fin = 0; sub_v_nonfin = 0;
for wo, c in word_orders_main_count.items():
if wo[-1] == "V":
main_v_fin += c
else:
main_v_nonfin += c
for wo, c in word_orders_sub_count.items():
if wo[-1] == "V":
sub_v_fin += c
else:
sub_v_nonfin += c
cont_table = [ [main_v_fin, main_v_nonfin], [sub_v_fin, sub_v_nonfin] ]
cont_table
[[454, 149], [160, 15]]
import scipy.stats
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
1.0194909237201304e-06
We reject the null hypothesis, as the chances of getting a distribution as the observed one are p < 0.05. The clause type affects the verb "finalness". In this case subordinate clauses have a sifgnificant higher count of verb final word orders.
Here are are the basic counts:
print("Counts for main clauses:")
for wo, c in word_orders_main_count.items():
print("{0} => {1}".format(wo, c))
print("\nCounts for sub clauses:")
for wo, c in word_orders_sub_count.items():
print("{0} => {1}".format(wo, c))
Counts for main clauses: ('V', 'A') => 51 ('V', 'P') => 12 ('STIM', 'EXP', 'V') => 3 ('V', 'P', 'A') => 1 ('STIM', 'V') => 7 ('P', 'V', 'A') => 14 ('S', 'V') => 215 ('EXP', 'V') => 4 ('A', 'V') => 70 ('A', 'V', 'P') => 10 ('EXP', 'STIM', 'V') => 11 ('V', 'S') => 42 ('V', 'A', 'P') => 10 ('P', 'A', 'V') => 9 ('EXP', 'V', 'STIM') => 5 ('A', 'P', 'V') => 73 ('P', 'V') => 62 ('V', 'EXP', 'STIM') => 1 ('STIM', 'V', 'EXP') => 3 Counts for sub clauses: ('V', 'A') => 1 ('V', 'P') => 2 ('STIM', 'EXP', 'V') => 1 ('P', 'V', 'A') => 2 ('STIM', 'V') => 3 ('A', 'P', 'V') => 11 ('S', 'V') => 57 ('A', 'V') => 8 ('EXP', 'STIM', 'V') => 2 ('V', 'S') => 7 ('EXP', 'V', 'STIM') => 1 ('A', 'V', 'P') => 2 ('P', 'V') => 74 ('P', 'A', 'V') => 4
particles = [ 'G', 'BEN', 'TIME', 'LOC', 'ADD' ]
pos_counts = [ [0, 0] for _ in particles ]
search_terms = verbs + particles
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
for i, p in enumerate(particles):
if "V" in wo.word_order and p in wo.word_order:
if wo.word_order.index("V") < wo.word_order.index(p):
pos_counts[i][0] += 1
else:
pos_counts[i][1] += 1
for i, p in enumerate(particles):
print(p)
print(" Count after verb: {0}".format(pos_counts[i][0]))
print(" Count before verb: {0}".format(pos_counts[i][1]))
G Count after verb: 25 Count before verb: 155 BEN Count after verb: 9 Count before verb: 39 TIME Count after verb: 5 Count before verb: 64 LOC Count after verb: 12 Count before verb: 55 ADD Count after verb: 22 Count before verb: 22
particles = [ 'BEN', 'G', 'ADD' ]
pos_counts = [ [0, 0] for _ in particles ]
search_terms = verbs + particles
before = 0
after = 0
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
for i, p in enumerate(particles):
if wo.clause_type in main_clause_types and "V" in wo.word_order and p in wo.word_order:
if wo.word_order.index("V") < wo.word_order.index(p):
pos_counts[i][0] += 1
else:
pos_counts[i][1] += 1
for i, p in enumerate(particles):
print(p)
print(" Count after verb: {0}".format(pos_counts[i][0]))
print(" Count before verb: {0}".format(pos_counts[i][1]))
BEN Count after verb: 8 Count before verb: 31 G Count after verb: 23 Count before verb: 90 ADD Count after verb: 20 Count before verb: 20
I am using a binomial test here: http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.binom_test.html#scipy.stats.binom_test
part_sum = [ 0, 0 ]
for i, p in enumerate(particles):
part_sum[0] += pos_counts[i][0]
part_sum[1] += pos_counts[i][1]
print("Test for '{0}'".format(p))
print(scipy.stats.binom_test(pos_counts[i]))
print("Test for 'BEN+G+ADD'")
print(scipy.stats.binom_test(part_sum))
Test for 'BEN' 0.000294076875434 Test for 'G' 1.48849064758e-10 Test for 'ADD' 1.0 Test for 'BEN+G+ADD' 5.94769311956e-11
Except for "ADD" it is very unlikely that those counts are random. So the difference for "BEN" and "G" and "BEN+G+ADD" is significant.
particles = [ 'BEN', 'G', 'ADD', 'A', 'P', 'LOC' ]
pos_counts = [ [0, 0] for _ in particles ]
search_terms = verbs + particles
before = 0
after = 0
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
for i, p in enumerate(particles):
if "V" in wo.word_order and p in wo.word_order:
if wo.word_order.index("V") < wo.word_order.index(p):
pos_counts[i][0] += 1
else:
pos_counts[i][1] += 1
Hypothesis H0: It does not depend on the grammatical relation type if a participant appears before or after the verb.
For the test we use the Fisher exact test, as this test also works for small numbers (http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/scipy.stats.fisher_exact.html).
BEN_G_ADD = [ 0, 0 ]
BEN_G_ADD[0] = pos_counts[0][0] + pos_counts[1][0] + pos_counts[2][0]
BEN_G_ADD[1] = pos_counts[0][1] + pos_counts[1][1] + pos_counts[2][1]
cont_table = [ BEN_G_ADD, pos_counts[3] ]
cont_table
[[56, 216], [80, 188]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.013473954825883712
We reject the null hpythesis H0 because p < 0.05. The grammatical relation type does have influence on whether a participant appears before or after the verb. In this case, A appears significantly more often before the verb then BEN + G + ADD.
cont_table = [ BEN_G_ADD, pos_counts[4] ]
cont_table
[[56, 216], [37, 250]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.016904389278624213
Again, we reject the null hypothesis because p < 0.05. P occurs more often before the verb then BEN + G + ADD.
cont_table = [ BEN_G_ADD, pos_counts[5] ]
cont_table
[[56, 216], [12, 55]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.73423084920817272
We cannot reject the null hypothesis because p > 0.05. LOC does not occurs more often before the verb then BEN + G + ADD.
A_P = [ 0, 0 ]
A_P[0] = pos_counts[3][0] + pos_counts[4][0]
A_P[1] = pos_counts[3][1] + pos_counts[4][1]
cont_table = [ BEN_G_ADD, A_P ]
cont_table
[[56, 216], [117, 438]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.92756787940652896
We cannot reject H0. There is no difference between BEN + G + ADD vs. A + P.
A_P_LOC = [ 0, 0 ]
A_P_LOC[0] = pos_counts[3][0] + pos_counts[4][0] + pos_counts[5][0]
A_P_LOC[1] = pos_counts[3][1] + pos_counts[4][1] + pos_counts[5][1]
cont_table = [ BEN_G_ADD, A_P_LOC ]
cont_table
[[56, 216], [129, 493]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
1.0
We cannot reject H0. There is no difference between BEN + G + ADD vs. A + P + LOC
particles = [ 'BEN', 'G', 'ADD', 'A', 'P', 'LOC', 'S' ]
pos_counts = [ [0, 0] for _ in particles ]
search_terms = verbs + particles
before = 0
after = 0
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
for i, p in enumerate(particles):
if wo.clause_type in main_clause_types and "V" in wo.word_order and p in wo.word_order:
if wo.word_order.index("V") < wo.word_order.index(p):
pos_counts[i][0] += 1
else:
pos_counts[i][1] += 1
cont_table = [ pos_counts[0], pos_counts[4] ]
cont_table
[[8, 31], [33, 158]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.64794235656672949
We cannot reject H0. There is no significant difference between BEN and P if they appear before or after the verb.
cont_table = [ pos_counts[2], pos_counts[4] ]
cont_table
[[20, 20], [33, 158]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
3.8451757980430857e-05
We can reject H0. P occurs more often before the verb then ADD.
BEN_ADD = [ 0, 0 ]
BEN_ADD[0] = pos_counts[0][0] + pos_counts[2][0]
BEN_ADD[1] = pos_counts[0][1] + pos_counts[2][1]
cont_table = [ BEN_ADD, pos_counts[4] ]
cont_table
[[28, 51], [33, 158]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.0021392471158436389
We reject reject H0 as p < 0.05. P appears significantly more often before the verb then BEN + ADD.
cont_table = [ pos_counts[3], pos_counts[4] ]
cont_table
[[76, 162], [33, 158]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.00053348950732782618
We reject H0 as p < 0.05. P apppears significantly more often before the verb than P.
cont_table = [ pos_counts[3], pos_counts[5] ]
cont_table
[[76, 162], [9, 36]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.15502514539950046
We cannot reject H0. There is no significant difference between A and S if they appear before or after the verb.
Hypothesis 1 (H0): It does not depend on the type of the verb (SAY vs. others) if the A is expressed ouvertly.
For the test we use the Fisher exact test, as this test also works for small numbers (http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/scipy.stats.fisher_exact.html).
other_verbs = [ 'COP', 'v.tr', 'v.intr', 'v.aff' ]
search_terms = other_verbs + [ 'SAY', 'A', 'zero-A' ]
verb_map = { v: "V" for v in other_verbs }
SAY_counts = [ 0, 0 ]
others_counts = [ 0, 0 ]
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
if 'SAY' in wo.word_order:
if 'A' in wo.word_order:
SAY_counts[0] += 1
elif 'zero-A' in wo.word_order:
SAY_counts[1] += 1
if 'V' in wo.word_order:
if 'A' in wo.word_order:
others_counts[0] += 1
elif 'zero-A' in wo.word_order:
others_counts[1] += 1
cont_table = [ SAY_counts, others_counts ]
cont_table
[[111, 29], [157, 192]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
2.0383277596699758e-12
We reject the null hypothesis because p < 0.05. A is more often ouvert in SAY sentences than in any other sentence with other verb types.
H0: It does not depend on the verb type (A vs. others) if A is before or after the verb.
search_terms = other_verbs + [ 'SAY', 'A' ]
SAY_counts = [ 0, 0 ]
others_counts = [ 0, 0 ]
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
if 'SAY' in wo.word_order and 'A' in wo.word_order:
if wo.word_order.index("SAY") < wo.word_order.index("A"):
SAY_counts[0] += 1
else:
SAY_counts[1] += 1
if "V" in wo.word_order and "A" in wo.word_order:
if wo.word_order.index("V") < wo.word_order.index("A"):
others_counts[0] += 1
else:
others_counts[1] += 1
cont_table = [ SAY_counts, others_counts ]
cont_table
[[50, 61], [30, 127]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
6.633759085937411e-06
We reject the null hypothesis as p < 0.05. A does significantly more often preceed the verb in non-SAY sentences.
print("A after SAY: {0}".format(SAY_counts[0]))
print("A before SAY: {0}".format(SAY_counts[1]))
A after SAY: 50 A before SAY: 61
verbs = [ 'v.tr', 'v.intr', 'v.aff' ]
agreements = [0, 0]; noagreements = [0, 0];
for wo in helpers.diana.word_orders(ag, verbs, with_agreement = True):
for agr in wo.agreement:
if wo.clause_type in sub_clause_types:
if agr == "noagr":
noagreements[0] += 1
else:
agreements[0] += 1
else:
if agr == "noagr":
noagreements[1] += 1
else:
agreements[1] += 1
print("I found {0} verbs with and {1} verbs without agreement.".format(agreements[0]+agreements[1], noagreements[0]+noagreements[1]))
print("In main clauses: I found {0} verbs with and {1} verbs without agreement.".format(agreements[1], noagreements[1]))
print("In sub clauses: I found {0} verbs with and {1} verbs without agreement.".format(agreements[0], noagreements[0]))
no agreement annotation in clause unit clause_id..n#737 for grammatical relation 'v.intr' no agreement annotation in clause unit clause_id..n#718 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#679 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#131 for grammatical relation 'v.intr' no agreement annotation in clause unit clause_id..n#645 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#465 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#666 for grammatical relation 'v.intr' no agreement annotation in clause unit clause_id..n#370 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#762 for grammatical relation 'v.intr' no agreement annotation in clause unit clause_id..n#377 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#268 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#274 for grammatical relation 'v.tr' I found 397 verbs with and 314 verbs without agreement. In main clauses: I found 280 verbs with and 191 verbs without agreement. In sub clauses: I found 117 verbs with and 123 verbs without agreement.
Hypothesis H0: It does not depend on the verbal agreement if S (or P or STIM) arguments are expressed overtly.
search_terms = verbs + [ 'S', 'P', 'STIM', 'zero-S', 'zero-P', 'zero-STIM' ]
cont_table_S = [ [0, 0], [0, 0] ]
cont_table_P = [ [0, 0], [0, 0] ]
cont_table_STIM = [ [0, 0], [0, 0] ]
cont_table_S_P_STIM = [ [0, 0], [0, 0] ]
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement = True):
agreeing = 0
if len(wo.word_order) != len(wo.agreement):
continue
for i, w in enumerate(wo.word_order):
if w in verbs:
if wo.agreement[i] == "noagr":
agreeing = 1
if "zero-S" in wo.word_order:
cont_table_S[agreeing][0] += 1
cont_table_S_P_STIM[agreeing][0] += 1
elif "S" in wo.word_order:
cont_table_S[agreeing][1] += 1
cont_table_S_P_STIM[agreeing][1] += 1
if "zero-P" in wo.word_order:
cont_table_P[agreeing][0] += 1
cont_table_S_P_STIM[agreeing][0] += 1
elif "P" in wo.word_order:
cont_table_P[agreeing][1] += 1
cont_table_S_P_STIM[agreeing][1] += 1
if "zero-STIM" in wo.word_order:
cont_table_STIM[agreeing][0] += 1
cont_table_S_P_STIM[agreeing][0] += 1
elif "STIM" in wo.word_order:
cont_table_STIM[agreeing][1] += 1
cont_table_S_P_STIM[agreeing][1] += 1
no agreement annotation in clause unit clause_id..n#47 for grammatical relation 'S' no agreement annotation in clause unit clause_id..n#737 for grammatical relation 'v.intr' no agreement annotation in clause unit clause_id..n#718 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#679 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#131 for grammatical relation 'S' no agreement annotation in clause unit clause_id..n#131 for grammatical relation 'v.intr' no agreement annotation in clause unit clause_id..n#645 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#275 for grammatical relation 'P' no agreement annotation in clause unit clause_id..n#465 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#666 for grammatical relation 'S' no agreement annotation in clause unit clause_id..n#666 for grammatical relation 'v.intr' no agreement annotation in clause unit clause_id..n#370 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#762 for grammatical relation 'v.intr' no agreement annotation in clause unit clause_id..n#762 for grammatical relation 'S' no agreement annotation in clause unit clause_id..n#377 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#268 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#274 for grammatical relation 'v.tr'
cont_table_S
[[81, 216], [64, 101]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table_S)
pvalue
0.012097485903857692
We reject the null hypothesis, because p < 0.05. S is signifantly more often overt when there is agreement.
cont_table_P
[[38, 164], [22, 116]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table_P)
pvalue
0.56320828600188433
We cannot reject H0 as p > 0.05. There is no influence of agreement on overt P (or vice versa).
cont_table_STIM
[[5, 27], [1, 10]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table_STIM)
pvalue
1.0
Here it is not possible to calculate, as the number of non-agreements is too low. Practically all verbs that have a STIM argument show agreement, whether the argument is expressed or not.
cont_table_S_P_STIM
[[124, 407], [87, 227]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table_S_P_STIM)
pvalue
0.16283038815723336
We cannot reject the hypothesis as p > 0.05. It does not depend on the verbal agreement whether S + P + STIM is expressed overtly.
verbs = [ 'v.tr', 'v.intr', 'v.aff' ]
agreement_sum = collections.defaultdict(int)
for wo in helpers.diana.word_orders(ag, verbs, with_agreement = True):
for agr in wo.agreement:
agreement_sum[agr] += 1
for agr, count in agreement_sum.items():
print("{} => {}".format(agr, count))
no agreement annotation in clause unit clause_id..n#737 for grammatical relation 'v.intr' no agreement annotation in clause unit clause_id..n#718 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#679 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#131 for grammatical relation 'v.intr' no agreement annotation in clause unit clause_id..n#645 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#465 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#666 for grammatical relation 'v.intr' no agreement annotation in clause unit clause_id..n#370 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#762 for grammatical relation 'v.intr' no agreement annotation in clause unit clause_id..n#377 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#268 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#274 for grammatical relation 'v.tr' b+b-3 => 8 v.tr => 1 w+w-1 => 10 y-1 => 1 l-pl => 1 r-pl => 37 l+r-pl => 1 y+y-2 => 2 y-2 => 4 w-1 => 107 b-3 => 223 r+l-pl => 1 noagrt => 1 noagr => 314
We check here first, if the agreement makes sense, i.e. whether the class marker on P, S or STIM is the same as on the verb. All other cases are printed with clause ID, to check manually.
verbs = [ 'v.tr', 'v.intr', 'v.aff' ]
search_terms = verbs + [ 'S', 'P', 'STIM', 'zero-S', 'zero-P', 'zero-STIM' ]
agreements = collections.defaultdict(int)
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement = True):
v_class = None; n_class = None; v_marker = None; n_type = None;
agreement = False
zero = False
if len(wo.word_order) != len(wo.agreement):
print("length on blue and yellow line different in ID {}".format(wo.clause_id))
continue
for i, w in enumerate(wo.word_order):
if w in verbs:
if wo.agreement[i] != "noagr":
agreement = True
if "-" in wo.agreement[i]:
v_marker, v_class = wo.agreement[i].split("-")
else:
print("no dash in v agr in ID {}".format(wo.clause_id))
else:
if "-" in wo.agreement[i]:
n_split = wo.agreement[i].split("-")
if len(n_split) > 2:
print("more than one dash in n agr in ID {}".format(wo.clause_id))
n_class = n_split[1]
#n_type = n_split[1]
if "." in n_class:
n_class, _ = n_class.split(".")
else:
print("no dash in n agr in ID {}".format(wo.clause_id))
if w.startswith("zero-"):
zero = True
if v_class != n_class and agreement:
print("n class does not equal v class in ID {} (n_class: {} vs. v_class: {})".format(wo.clause_id, n_class, v_class))
elif v_class is not None and n_class is not None and not zero:
agreements["{}-{}".format(v_marker, v_class)] += 1
no dash in n agr in ID clause_id..n#2 no agreement annotation in clause unit clause_id..n#47 for grammatical relation 'S' length on blue and yellow line different in ID clause_id..n#47 n class does not equal v class in ID clause_id..n#978 (n_class: 2 vs. v_class: 1) no agreement annotation in clause unit clause_id..n#737 for grammatical relation 'v.intr' length on blue and yellow line different in ID clause_id..n#737 no agreement annotation in clause unit clause_id..n#718 for grammatical relation 'v.tr' length on blue and yellow line different in ID clause_id..n#718 no agreement annotation in clause unit clause_id..n#679 for grammatical relation 'v.tr' length on blue and yellow line different in ID clause_id..n#679 no dash in v agr in ID clause_id..n#373 n class does not equal v class in ID clause_id..n#373 (n_class: 3 vs. v_class: None) no agreement annotation in clause unit clause_id..n#131 for grammatical relation 'S' no agreement annotation in clause unit clause_id..n#131 for grammatical relation 'v.intr' length on blue and yellow line different in ID clause_id..n#131 no agreement annotation in clause unit clause_id..n#645 for grammatical relation 'v.tr' length on blue and yellow line different in ID clause_id..n#645 no agreement annotation in clause unit clause_id..n#275 for grammatical relation 'P' length on blue and yellow line different in ID clause_id..n#275 no agreement annotation in clause unit clause_id..n#465 for grammatical relation 'v.tr' length on blue and yellow line different in ID clause_id..n#465 no agreement annotation in clause unit clause_id..n#666 for grammatical relation 'S' no agreement annotation in clause unit clause_id..n#666 for grammatical relation 'v.intr' length on blue and yellow line different in ID clause_id..n#666 no agreement annotation in clause unit clause_id..n#370 for grammatical relation 'v.tr' length on blue and yellow line different in ID clause_id..n#370 no agreement annotation in clause unit clause_id..n#762 for grammatical relation 'v.intr' no agreement annotation in clause unit clause_id..n#762 for grammatical relation 'S' length on blue and yellow line different in ID clause_id..n#762 no agreement annotation in clause unit clause_id..n#377 for grammatical relation 'v.tr' length on blue and yellow line different in ID clause_id..n#377 no agreement annotation in clause unit clause_id..n#268 for grammatical relation 'v.tr' length on blue and yellow line different in ID clause_id..n#268 n class does not equal v class in ID clause_id..n# (n_class: pl vs. v_class: 3) no dash in v agr in ID clause_id..n#716 n class does not equal v class in ID clause_id..n#716 (n_class: 3 vs. v_class: None) no agreement annotation in clause unit clause_id..n#274 for grammatical relation 'v.tr' length on blue and yellow line different in ID clause_id..n#274
Here are the counts for all overt arguments where class markers were equal:
for agr, count in agreements.items():
print("{} => {}".format(agr, count))
l+r-pl => 1 w+w-1 => 6 y-2 => 4 w-1 => 54 b-3 => 184 b+b-3 => 6 l-pl => 1 r-pl => 30 y+y-2 => 2
search_terms = [ "COP", "S" ]
possible = 0
overt = 0
for wo in helpers.diana.word_orders(ag, search_terms):
if "COP" in wo.word_order:
possible += 1
if "S" in wo.word_order:
overt += 1
print("{} / {} = {}".format(overt, possible, float(overt)/possible))
116 / 136 = 0.8529411764705882
search_terms = [ "v.intr", "S" ]
possible = 0
overt = 0
for wo in helpers.diana.word_orders(ag, search_terms):
if "v.intr" in wo.word_order:
possible += 1
if "S" in wo.word_order:
overt += 1
print("{} / {} = {}".format(overt, possible, float(overt)/possible))
206 / 331 = 0.622356495468278
search_terms = [ "v.tr", "A", "P" ]
possible = 0
overt = 0
for wo in helpers.diana.word_orders(ag, search_terms):
if "v.tr" in wo.word_order:
possible += 2
if "A" in wo.word_order:
overt += 1
if "P" in wo.word_order:
overt += 1
print("{} / {} = {}".format(overt, possible, float(overt)/possible))
444 / 698 = 0.6361031518624641
search_terms = [ "v.aff", "EXP", "STIM" ]
possible = 0
overt = 0
for wo in helpers.diana.word_orders(ag, search_terms):
if "v.aff" in wo.word_order:
possible += 2
if "EXP" in wo.word_order:
overt += 1
if "STIM" in wo.word_order:
overt += 1
print("{} / {} = {}".format(overt, possible, float(overt)/possible))
68 / 86 = 0.7906976744186046
search_terms = [ "v.intr", "S" ]
agr_possible = 0
agr_overt = 0
noagr_possible = 0
noagr_overt = 0
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement = True):
if len(wo.word_order) != len(wo.agreement):
continue
if "v.intr" in wo.word_order:
v_index = wo.word_order.index("v.intr")
if wo.agreement[v_index] == "noagr":
noagr_possible += 1
if "S" in wo.word_order:
noagr_overt += 1
else:
agr_possible += 1
if "S" in wo.word_order:
agr_overt += 1
print("with agreement: {} / {} = {}".format(agr_overt, agr_possible, float(agr_overt)/agr_possible))
print("without agreement: {} / {} = {}".format(noagr_overt, noagr_possible, float(noagr_overt)/noagr_possible))
no agreement annotation in clause unit clause_id..n#47 for grammatical relation 'S' no agreement annotation in clause unit clause_id..n#737 for grammatical relation 'v.intr' no agreement annotation in clause unit clause_id..n#131 for grammatical relation 'S' no agreement annotation in clause unit clause_id..n#131 for grammatical relation 'v.intr' no agreement annotation in clause unit clause_id..n#666 for grammatical relation 'S' no agreement annotation in clause unit clause_id..n#666 for grammatical relation 'v.intr' no agreement annotation in clause unit clause_id..n#762 for grammatical relation 'v.intr' no agreement annotation in clause unit clause_id..n#762 for grammatical relation 'S' with agreement: 100 / 161 = 0.6211180124223602 without agreement: 101 / 165 = 0.6121212121212121
Hypthesis H0: It does not depend on agreement whether S in v.intr sentences is expressed overtly?
cont_table = [ [agr_overt, agr_possible-agr_overt], [noagr_overt, noagr_possible-noagr_overt] ]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)
0.909500375789
We cannot reject H0 as p > 0.05. It does not depent on agreement wether the argument is expressed overtly.
search_terms = [ "v.tr", "A", "P" ]
agr_possible = 0
agr_overt = 0
noagr_possible = 0
noagr_overt = 0
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement = True):
if len(wo.word_order) != len(wo.agreement):
continue
if "v.tr" in wo.word_order:
v_index = wo.word_order.index("v.tr")
if wo.agreement[v_index] == "noagr":
noagr_possible += 2
if "A" in wo.word_order:
noagr_overt += 1
if "P" in wo.word_order:
noagr_overt += 1
else:
agr_possible += 2
if "A" in wo.word_order:
agr_overt += 1
if "P" in wo.word_order:
agr_overt += 1
print("with agreement: {} / {} = {}".format(agr_overt, agr_possible, float(agr_overt)/agr_possible))
print("without agreement: {} / {} = {}".format(noagr_overt, noagr_possible, float(noagr_overt)/noagr_possible))
no agreement annotation in clause unit clause_id..n#718 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#679 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#169 for grammatical relation 'A' no agreement annotation in clause unit clause_id..n#177 for grammatical relation 'A' no agreement annotation in clause unit clause_id..n#501 for grammatical relation 'A' no agreement annotation in clause unit clause_id..n#645 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#275 for grammatical relation 'P' no agreement annotation in clause unit clause_id..n#465 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#370 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#377 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#268 for grammatical relation 'v.tr' no agreement annotation in clause unit clause_id..n#274 for grammatical relation 'v.tr' with agreement: 257 / 404 = 0.6361386138613861 without agreement: 172 / 270 = 0.6370370370370371
Hypthesis H0: It does not depend on agreement whether S in v.intr sentences is expressed overtly?
cont_table = [ [agr_overt, agr_possible-agr_overt], [noagr_overt, noagr_possible-noagr_overt] ]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)
1.0
We cannot reject H0 as p > 0.05. It does not depent on agreement wether the argument is expressed overtly.
search_terms = [ "v.aff", "EXP", "STIM" ]
agr_possible = 0
agr_overt = 0
noagr_possible = 0
noagr_overt = 0
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement = True):
if len(wo.word_order) != len(wo.agreement):
continue
if "v.aff" in wo.word_order:
v_index = wo.word_order.index("v.aff")
if wo.agreement[v_index] == "noagr":
noagr_possible += 2
if "EXP" in wo.word_order:
noagr_overt += 1
if "STIM" in wo.word_order:
noagr_overt += 1
else:
agr_possible += 2
if "EXP" in wo.word_order:
agr_overt += 1
if "STIM" in wo.word_order:
agr_overt += 1
print("with agreement: {} / {} = {}".format(agr_overt, agr_possible, float(agr_overt)/agr_possible))
print("without agreement: {} / {} = {}".format(noagr_overt, noagr_possible, float(noagr_overt)/noagr_possible))
with agreement: 49 / 64 = 0.765625 without agreement: 19 / 22 = 0.8636363636363636
Hypthesis H0: It does not depend on agreement whether S in v.intr sentences is expressed overtly?
cont_table = [ [agr_overt, agr_possible-agr_overt], [noagr_overt, noagr_possible-noagr_overt] ]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)
0.543844035181
We cannot reject H0 as p > 0.05. It does not depent on agreement wether the argument is expressed overtly.
v_tree = { "v.aff": collections.defaultdict(int), "v.tr": collections.defaultdict(int) }
for wo in word_orders:
v = None
if "v.tr" in wo:
v = "v.tr"
if "v.aff" in wo:
v = "v.aff"
if v is not None:
wo2 = tuple([e for e in sorted(wo) if e != "v.aff" and e != "v.tr"])
v_tree[v][wo2] += word_orders[wo]
for v in ["v.aff", "v.tr"]:
print(v)
for e in v_tree[v]:
print("{0} => {1}".format(e, v_tree[v][e]))
v.aff ('EXP', 'zero-STIM') => 4 ('EXP', 'STIM') => 27 ('zero-EXP', 'zero-STIM') => 2 ('STIM', 'zero-EXP') => 10 v.tr ('P', 'zero-A') => 150 ('zero-A', 'zero-P') => 42 ('A', 'P') => 137 ('A', 'zero-P') => 20
Hypothesis 1 (H0): It does not depend on the type of the verb ("v.tr" vs. "v.aff") if the A/EXP is expressed ouvertly.
For the test we use the Fisher exact test, as this test also works for small numbers (http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/scipy.stats.fisher_exact.html).
import scipy.stats
cont_table = [
[ v_tree["v.aff"][('EXP', 'zero-STIM')] + v_tree["v.aff"][('EXP', 'STIM')],
v_tree["v.aff"][('zero-EXP', 'zero-STIM')] + v_tree["v.aff"][('STIM', 'zero-EXP')] ],
[ v_tree["v.tr"][('A', 'P')] + v_tree["v.tr"][('A', 'zero-P')],
v_tree["v.tr"][('P', 'zero-A')] + v_tree["v.tr"][('zero-A', 'zero-P')] ]
]
cont_table
[[31, 12], [157, 192]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.0010331791250171565
We reject the null hypothesis, as the chances of getting a distribution as the observed one are p < 0.05. The verb type affects the ouvertness of the A/EXP argument. In this case the "v.aff" sentences have sigificantly more ouvert arguments EXP then the "v.tr" sentences have ouvert A.
Hypothesis 2 (H0): It does not depend on the type of the verb ("v.tr" vs. "v.aff") if the P/STIM is expressed ouvertly.
import scipy.stats
cont_table = [
[ v_tree["v.aff"][('STIM', 'zero-EXP')] + v_tree["v.aff"][('EXP', 'STIM')],
v_tree["v.aff"][('zero-EXP', 'zero-STIM')] + v_tree["v.aff"][('EXP', 'zero-STIM')] ],
[ v_tree["v.tr"][('A', 'P')] + v_tree["v.tr"][('P', 'zero-A')],
v_tree["v.tr"][('A', 'zero-P')] + v_tree["v.tr"][('zero-A', 'zero-P')] ]
]
cont_table
[[37, 6], [287, 62]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
0.67113505746109936
In this case we cannot reject the null hypothesis, as p > 0.05. There is no statistical evidence that the verb type affects the ouvertness of P/STIM.
other_verbs = [ 'COP', 'v.tr', 'v.intr', 'v.aff' ]
verb_map = { v: "V" for v in other_verbs }
A_values = []
P_values = []
S_values = []
for wo in helpers.diana.word_orders(ag, annotation_map = verb_map):
word_order = [w for w in wo.word_order if not w.startswith("zero-")]
if "V" in word_order:
v_index = word_order.index("V")
if "A" in word_order:
A_values.append(word_order.index("A") - v_index)
if "P" in word_order:
P_values.append(word_order.index("P") - v_index)
if "S" in word_order:
S_values.append(word_order.index("S") - v_index)
%matplotlib inline
import matplotlib.pyplot as plt
fig, axs = plt.subplots(1, 3, figsize=(14,4))
axs[0].hist(S_values, range(min(S_values), max(S_values)+2))
axs[0].set_title("Positions of S")
axs[1].hist(A_values, range(min(A_values), max(A_values)+2))
axs[1].set_title("Positions of A")
axs[2].hist(P_values, range(min(P_values), max(P_values)+2))
ret = axs[2].set_title("Positions of P")
plt.figure(figsize=(10,6))
plt.boxplot([S_values, A_values, P_values])
plt.title("Positions of S, A and P")
ret = plt.xticks([1, 2, 3], ["S", "A", "P"])
A_values = [[], []]
P_values = [[], []]
S_values = [[], []]
clause_types = ["m", "m.rs", "sub", "sub.rs"]
for wo in helpers.diana.word_orders(ag, annotation_map = verb_map):
word_order = [w for w in wo.word_order if not w.startswith("zero-")]
if "V" in word_order and wo.clause_type in clause_types:
ind = 0
if wo.clause_type == "sub" or wo.clause_type == "sub.rs":
ind = 1
v_index = word_order.index("V")
if "A" in word_order:
A_values[ind].append(word_order.index("A") - v_index)
if "P" in word_order:
P_values[ind].append(word_order.index("P") - v_index)
if "S" in word_order:
S_values[ind].append(word_order.index("S") - v_index)
fig, axs = plt.subplots(2, 3, figsize=(14,10))
for ind in [0, 1]:
type_text = "main"
if ind == 1:
type_text = "sub"
axs[ind][0].hist(S_values[ind], range(min(S_values[ind]), max(S_values[ind])+2))
axs[ind][0].set_title("Positions of S in {0} clauses".format(type_text))
axs[ind][1].hist(A_values[ind], range(min(A_values[ind]), max(A_values[ind])+2))
axs[ind][1].set_title("Positions of A {0} clauses".format(type_text))
axs[ind][2].hist(P_values[ind], range(min(P_values[ind]), max(P_values[ind])+2))
ret = axs[ind][2].set_title("Positions of P {0} clauses".format(type_text))
fig, axs = plt.subplots(1, 2, figsize=(14,6))
for ind in [0, 1]:
type_text = "main"
if ind == 1:
type_text = "sub"
axs[ind].boxplot([S_values[ind], A_values[ind], P_values[ind]])
axs[ind].set_title("Positions of S, A and P in {0} clauses".format(type_text))
ret = plt.xticks([1, 2, 3], ["S", "A", "P"])