# Word-order survey of the Hinuq corpus (notebook-style analysis script).
# IPython notebook magics, kept as comments so the file is valid Python:
# %load_ext autoreload
# %autoreload 2

import collections

import scipy.stats

import helpers.diana

# Tier name -> tier number in the annotation file
# (exact tier semantics are defined by helpers.diana — confirm there).
tier_numbers = {
    "clause_id": 2,
    "clause_type": 3,
    "grammatical_relation": 4,
    "pos_agreement": 5,
    "last_line": 7,
}
ag = helpers.diana.from_excel("data/Hinuq3.csv", tier_numbers=tier_numbers)

# Verb annotations plus overt and zero argument annotations to search for.
verbs = ['COP', 'SAY', 'v.tr', 'v.intr', 'v.aff']
verb_map = {v: "V" for v in verbs}  # collapses every verb label into "V"
others = ['A', 'S', 'P', 'EXP', 'STIM',
          'zero-A', 'zero-S', 'zero-P', 'zero-EXP', 'zero-STIM']
search_terms = verbs + others

# Census of attested word orders; remember clause IDs for rare orders.
word_orders = collections.defaultdict(int)
word_orders_ids = collections.defaultdict(list)
for wo in helpers.diana.word_orders(ag, search_terms):
    word_orders[tuple(wo.word_order)] += 1
    word_orders_ids[tuple(wo.word_order)].append(wo.clause_id)

for word_order, count in word_orders.items():
    print("{0} => {1}".format(word_order, count))
    if count < 5:
        # Rare orders: also list the clause IDs so they can be inspected.
        print(" {1}".format(word_order, word_orders_ids[word_order]))

# Verb-final vs. non-verb-final orders, split by main vs. subordinate clause.
word_orders_main = []
word_orders_main_count = collections.defaultdict(int)
word_orders_sub = []
word_orders_sub_count = collections.defaultdict(int)

main_clause_types = ["m", "m.rs"]
sub_clause_types = ["sub", "sub.rs"]
clause_types = main_clause_types + sub_clause_types

search_terms = verbs + ['A', 'S', 'P', 'EXP', 'STIM']
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    # Only clauses that contain a verb plus at least one other element.
    if "V" in wo.word_order and wo.clause_type in clause_types and len(wo.word_order) > 1:
        if wo.clause_type in sub_clause_types:
            word_orders_sub.append(wo.word_order)
            word_orders_sub_count[tuple(wo.word_order)] += 1
        else:
            word_orders_main.append(wo.word_order)
            word_orders_main_count[tuple(wo.word_order)] += 1

main_v_fin = 0
main_v_nonfin = 0
sub_v_fin = 0
sub_v_nonfin = 0
for wo, c in word_orders_main_count.items():
    if wo[-1] == "V":
        main_v_fin += c
    else:
        main_v_nonfin += c
for wo, c in word_orders_sub_count.items():
    if wo[-1] == "V":
        sub_v_fin += c
    else:
        sub_v_nonfin += c

# 2x2 table: verb-final vs. not, in main vs. subordinate clauses.
cont_table = [[main_v_fin, main_v_nonfin], [sub_v_fin, sub_v_nonfin]]
print(cont_table)
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)

print("Counts for main clauses:")
for wo, c in word_orders_main_count.items():
    print("{0} => {1}".format(wo, c))
print("\nCounts for sub clauses:")
for wo, c in word_orders_sub_count.items():
    print("{0} => {1}".format(wo, c))

# Position of particles relative to the verb. pos_counts[i] is
# [count after verb, count before verb] for particles[i].
particles = ['G', 'BEN', 'TIME', 'LOC', 'ADD']
pos_counts = [[0, 0] for _ in particles]
search_terms = verbs + particles
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    for i, p in enumerate(particles):
        if "V" in wo.word_order and p in wo.word_order:
            if wo.word_order.index("V") < wo.word_order.index(p):
                pos_counts[i][0] += 1
            else:
                pos_counts[i][1] += 1
for i, p in enumerate(particles):
    print(p)
    print(" Count after verb: {0}".format(pos_counts[i][0]))
    print(" Count before verb: {0}".format(pos_counts[i][1]))

# Same count, restricted to main clauses and to BEN/G/ADD.
particles = ['BEN', 'G', 'ADD']
pos_counts = [[0, 0] for _ in particles]
search_terms = verbs + particles
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    for i, p in enumerate(particles):
        if wo.clause_type in main_clause_types and "V" in wo.word_order and p in wo.word_order:
            if wo.word_order.index("V") < wo.word_order.index(p):
                pos_counts[i][0] += 1
            else:
                pos_counts[i][1] += 1
for i, p in enumerate(particles):
    print(p)
    print(" Count after verb: {0}".format(pos_counts[i][0]))
    print(" Count before verb: {0}".format(pos_counts[i][1]))

# Per-particle binomial tests plus a pooled test over BEN+G+ADD.
part_sum = [0, 0]
for i, p in enumerate(particles):
    part_sum[0] += pos_counts[i][0]
    part_sum[1] += pos_counts[i][1]
    print("Test for '{0}'".format(p))
    # NOTE(review): scipy.stats.binom_test was removed in SciPy 1.12;
    # on current SciPy use scipy.stats.binomtest(k, n).pvalue instead.
    print(scipy.stats.binom_test(pos_counts[i]))
print("Test for 'BEN+G+ADD'")
print(scipy.stats.binom_test(part_sum))

# All-clause counts for a larger particle set, then Fisher tests of
# BEN+G+ADD against A, P, LOC, A+P, and A+P+LOC.
particles = ['BEN', 'G', 'ADD', 'A', 'P', 'LOC']
pos_counts = [[0, 0] for _ in particles]
search_terms = verbs + particles
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    for i, p in enumerate(particles):
        if "V" in wo.word_order and p in wo.word_order:
            if wo.word_order.index("V") < wo.word_order.index(p):
                pos_counts[i][0] += 1
            else:
                pos_counts[i][1] += 1

BEN_G_ADD = [pos_counts[0][0] + pos_counts[1][0] + pos_counts[2][0],
             pos_counts[0][1] + pos_counts[1][1] + pos_counts[2][1]]

for idx in (3, 4, 5):  # BEN+G+ADD vs. A, then P, then LOC
    cont_table = [BEN_G_ADD, pos_counts[idx]]
    print(cont_table)
    oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
    print(pvalue)

A_P = [pos_counts[3][0] + pos_counts[4][0],
       pos_counts[3][1] + pos_counts[4][1]]
cont_table = [BEN_G_ADD, A_P]
print(cont_table)
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)

A_P_LOC = [pos_counts[3][0] + pos_counts[4][0] + pos_counts[5][0],
           pos_counts[3][1] + pos_counts[4][1] + pos_counts[5][1]]
cont_table = [BEN_G_ADD, A_P_LOC]
print(cont_table)
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)

# Main-clause counts including S, then more pairwise Fisher tests.
particles = ['BEN', 'G', 'ADD', 'A', 'P', 'LOC', 'S']
pos_counts = [[0, 0] for _ in particles]
search_terms = verbs + particles
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    for i, p in enumerate(particles):
        if wo.clause_type in main_clause_types and "V" in wo.word_order and p in wo.word_order:
            if wo.word_order.index("V") < wo.word_order.index(p):
                pos_counts[i][0] += 1
            else:
                pos_counts[i][1] += 1

for row in (pos_counts[0], pos_counts[2]):  # BEN vs. P, then ADD vs. P
    cont_table = [row, pos_counts[4]]
    print(cont_table)
    oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
    print(pvalue)

BEN_ADD = [pos_counts[0][0] + pos_counts[2][0],
           pos_counts[0][1] + pos_counts[2][1]]
cont_table = [BEN_ADD, pos_counts[4]]  # BEN+ADD vs. P
print(cont_table)
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)

cont_table = [pos_counts[3], pos_counts[4]]  # A vs. P
print(cont_table)
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)

cont_table = [pos_counts[3], pos_counts[5]]  # A vs. LOC
print(cont_table)
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)

# Overt vs. zero A with 'SAY' compared against all other verbs.
other_verbs = ['COP', 'v.tr', 'v.intr', 'v.aff']
search_terms = other_verbs + ['SAY', 'A', 'zero-A']
verb_map = {v: "V" for v in other_verbs}  # 'SAY' deliberately stays distinct
SAY_counts = [0, 0]      # [clauses with overt A, clauses with zero A]
others_counts = [0, 0]
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    if 'SAY' in wo.word_order:
        if 'A' in wo.word_order:
            SAY_counts[0] += 1
        elif 'zero-A' in wo.word_order:
            SAY_counts[1] += 1
    if 'V' in wo.word_order:
        if 'A' in wo.word_order:
            others_counts[0] += 1
        elif 'zero-A' in wo.word_order:
            others_counts[1] += 1
cont_table = [SAY_counts, others_counts]
print(cont_table)
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)

# Position of A relative to SAY vs. relative to the other verbs.
search_terms = other_verbs + ['SAY', 'A']
SAY_counts = [0, 0]      # [A after verb, A before verb]
others_counts = [0, 0]
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    if 'SAY' in wo.word_order and 'A' in wo.word_order:
        if wo.word_order.index("SAY") < wo.word_order.index("A"):
            SAY_counts[0] += 1
        else:
            SAY_counts[1] += 1
    if "V" in wo.word_order and "A" in wo.word_order:
        if wo.word_order.index("V") < wo.word_order.index("A"):
            others_counts[0] += 1
        else:
            others_counts[1] += 1
cont_table = [SAY_counts, others_counts]
print(cont_table)
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)
print("A after SAY: {0}".format(SAY_counts[0]))
print("A before SAY: {0}".format(SAY_counts[1]))

# Agreement vs. no agreement, split by clause type.
# Index 0 = subordinate clauses, index 1 = all other clause types.
verbs = ['v.tr', 'v.intr', 'v.aff']
agreements = [0, 0]
noagreements = [0, 0]
for wo in helpers.diana.word_orders(ag, verbs, with_agreement=True):
    for agr in wo.agreement:
        if wo.clause_type in sub_clause_types:
            if agr == "noagr":
                noagreements[0] += 1
            else:
                agreements[0] += 1
        else:
            if agr == "noagr":
                noagreements[1] += 1
            else:
                agreements[1] += 1
print("I found {0} verbs with and {1} verbs without agreement.".format(
    agreements[0] + agreements[1], noagreements[0] + noagreements[1]))
print("In main clauses: I found {0} verbs with and {1} verbs without agreement.".format(
    agreements[1], noagreements[1]))
print("In sub clauses: I found {0} verbs with and {1} verbs without agreement.".format(
    agreements[0], noagreements[0]))

# Do zero arguments co-occur with agreeing verbs?  Row index 0 = verb
# agrees, 1 = verb does not agree; column 0 = zero arg, 1 = overt arg.
search_terms = verbs + ['S', 'P', 'STIM', 'zero-S', 'zero-P', 'zero-STIM']
cont_table_S = [[0, 0], [0, 0]]
cont_table_P = [[0, 0], [0, 0]]
cont_table_STIM = [[0, 0], [0, 0]]
cont_table_S_P_STIM = [[0, 0], [0, 0]]
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement=True):
    agreeing = 0
    if len(wo.word_order) != len(wo.agreement):
        continue  # skip clauses whose annotation tiers are misaligned
    for i, w in enumerate(wo.word_order):
        if w in verbs:
            if wo.agreement[i] == "noagr":
                agreeing = 1
    if "zero-S" in wo.word_order:
        cont_table_S[agreeing][0] += 1
        cont_table_S_P_STIM[agreeing][0] += 1
    elif "S" in wo.word_order:
        cont_table_S[agreeing][1] += 1
        cont_table_S_P_STIM[agreeing][1] += 1
    if "zero-P" in wo.word_order:
        cont_table_P[agreeing][0] += 1
        cont_table_S_P_STIM[agreeing][0] += 1
    elif "P" in wo.word_order:
        cont_table_P[agreeing][1] += 1
        cont_table_S_P_STIM[agreeing][1] += 1
    if "zero-STIM" in wo.word_order:
        cont_table_STIM[agreeing][0] += 1
        cont_table_S_P_STIM[agreeing][0] += 1
    elif "STIM" in wo.word_order:
        cont_table_STIM[agreeing][1] += 1
        cont_table_S_P_STIM[agreeing][1] += 1

for table in (cont_table_S, cont_table_P, cont_table_STIM, cont_table_S_P_STIM):
    print(table)
    oddsratio, pvalue = scipy.stats.fisher_exact(table)
    print(pvalue)

# Overall frequency of each agreement marker value.
verbs = ['v.tr', 'v.intr', 'v.aff']
agreement_sum = collections.defaultdict(int)
for wo in helpers.diana.word_orders(ag, verbs, with_agreement=True):
    for agr in wo.agreement:
        agreement_sum[agr] += 1
for agr, count in agreement_sum.items():
    print("{} => {}".format(agr, count))

verbs = ['v.tr', 'v.intr', 'v.aff']
# Match each verb's class-agreement marker against the class of its overt
# noun, counting marker-class combinations for clauses where they agree.
search_terms = verbs + ['S', 'P', 'STIM', 'zero-S', 'zero-P', 'zero-STIM']
agreements = collections.defaultdict(int)
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement=True):
    v_class = None
    n_class = None
    v_marker = None
    agreement = False
    zero = False
    if len(wo.word_order) != len(wo.agreement):
        print("length on blue and yellow line different in ID {}".format(wo.clause_id))
        continue
    for i, w in enumerate(wo.word_order):
        if w in verbs:
            # Verb tier: expected shape "marker-class" (e.g. "o-1") or "noagr".
            if wo.agreement[i] != "noagr":
                agreement = True
                if "-" in wo.agreement[i]:
                    v_marker, v_class = wo.agreement[i].split("-")
                else:
                    print("no dash in v agr in ID {}".format(wo.clause_id))
        else:
            # Noun tier: expected shape "prefix-class" with an optional
            # ".subtype" suffix on the class part.
            if "-" in wo.agreement[i]:
                n_split = wo.agreement[i].split("-")
                if len(n_split) > 2:
                    print("more than one dash in n agr in ID {}".format(wo.clause_id))
                # Keep only the class, dropping any ".subtype" suffix.
                # (Using [0] instead of tuple unpacking so a tag with two
                # dots does not crash the loop.)
                n_class = n_split[1]
                if "." in n_class:
                    n_class = n_class.split(".")[0]
            else:
                print("no dash in n agr in ID {}".format(wo.clause_id))
            if w.startswith("zero-"):
                zero = True
    if v_class != n_class and agreement:
        print("n class does not equal v class in ID {} (n_class: {} vs. v_class: {})".format(
            wo.clause_id, n_class, v_class))
    elif v_class is not None and n_class is not None and not zero:
        agreements["{}-{}".format(v_marker, v_class)] += 1
for agr, count in agreements.items():
    print("{} => {}".format(agr, count))

# Fisher tests: overt-noun agreement for marker "o-1" against other marker
# groups.  Each row is [overt-agreeing count, remaining occurrences], with
# totals taken from agreement_sum (computed earlier).
o1_agree = agreements["o-1"]
o1_rest = agreement_sum["o-1"] - o1_agree
for group in (["b-3", "b-hpl"],
              ["r-5", "r-hpl", "r-nhpl"],
              ["y-2", "y-4"]):
    group_agree = sum(agreements[m] for m in group)
    group_total = sum(agreement_sum[m] for m in group)
    cont_table = [[o1_agree, o1_rest],
                  [group_agree, group_total - group_agree]]
    print(cont_table)
    oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
    print(pvalue)

# Rate of overt S with copula clauses.
search_terms = ["COP", "S"]
possible = 0
overt = 0
for wo in helpers.diana.word_orders(ag, search_terms):
    if "COP" in wo.word_order:
        possible += 1
        if "S" in wo.word_order:
            overt += 1
print("{} / {} = {}".format(overt, possible, float(overt) / possible))

# Rate of overt S with intransitive verbs.
search_terms = ["v.intr", "S"]
possible = 0
overt = 0
for wo in helpers.diana.word_orders(ag, search_terms):
    if "v.intr" in wo.word_order:
        possible += 1
        if "S" in wo.word_order:
            overt += 1
print("{} / {} = {}".format(overt, possible, float(overt) / possible))

# Rate of overt A and P with transitive verbs; each transitive clause
# licenses two argument slots, hence possible += 2.
search_terms = ["v.tr", "A", "P"]
possible = 0
overt = 0
for wo in helpers.diana.word_orders(ag, search_terms):
    if "v.tr" in wo.word_order:
        possible += 2
        if "A" in wo.word_order:
            overt += 1
        if "P" in wo.word_order:
            overt += 1
print("{} / {} = {}".format(overt, possible, float(overt) / possible))

search_terms = ["v.aff", "EXP", "STIM"]
# Rate of overt EXP and STIM with affective verbs (two slots per clause).
# Uses the search_terms = ["v.aff", "EXP", "STIM"] set up just above.
possible = 0
overt = 0
for wo in helpers.diana.word_orders(ag, search_terms):
    if "v.aff" in wo.word_order:
        possible += 2
        if "EXP" in wo.word_order:
            overt += 1
        if "STIM" in wo.word_order:
            overt += 1
print("{} / {} = {}".format(overt, possible, float(overt) / possible))

# Overt-S rate split by whether the intransitive verb shows agreement.
search_terms = ["v.intr", "S"]
agr_possible = 0
agr_overt = 0
noagr_possible = 0
noagr_overt = 0
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement=True):
    if len(wo.word_order) != len(wo.agreement):
        continue  # skip misaligned annotation tiers
    if "v.intr" in wo.word_order:
        v_index = wo.word_order.index("v.intr")
        if wo.agreement[v_index] == "noagr":
            noagr_possible += 1
            if "S" in wo.word_order:
                noagr_overt += 1
        else:
            agr_possible += 1
            if "S" in wo.word_order:
                agr_overt += 1
print("with agreement: {} / {} = {}".format(
    agr_overt, agr_possible, float(agr_overt) / agr_possible))
print("without agreement: {} / {} = {}".format(
    noagr_overt, noagr_possible, float(noagr_overt) / noagr_possible))
cont_table = [[agr_overt, agr_possible - agr_overt],
              [noagr_overt, noagr_possible - noagr_overt]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)

# Same split for transitive verbs (A and P slots, two per clause).
search_terms = ["v.tr", "A", "P"]
agr_possible = 0
agr_overt = 0
noagr_possible = 0
noagr_overt = 0
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement=True):
    if len(wo.word_order) != len(wo.agreement):
        continue
    if "v.tr" in wo.word_order:
        v_index = wo.word_order.index("v.tr")
        if wo.agreement[v_index] == "noagr":
            noagr_possible += 2
            if "A" in wo.word_order:
                noagr_overt += 1
            if "P" in wo.word_order:
                noagr_overt += 1
        else:
            agr_possible += 2
            if "A" in wo.word_order:
                agr_overt += 1
            if "P" in wo.word_order:
                agr_overt += 1
print("with agreement: {} / {} = {}".format(
    agr_overt, agr_possible, float(agr_overt) / agr_possible))
print("without agreement: {} / {} = {}".format(
    noagr_overt, noagr_possible, float(noagr_overt) / noagr_possible))
cont_table = [[agr_overt, agr_possible - agr_overt],
              [noagr_overt, noagr_possible - noagr_overt]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)

# Same split for affective verbs (EXP and STIM slots).
search_terms = ["v.aff", "EXP", "STIM"]
agr_possible = 0
agr_overt = 0
noagr_possible = 0
noagr_overt = 0
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement=True):
    if len(wo.word_order) != len(wo.agreement):
        continue
    if "v.aff" in wo.word_order:
        v_index = wo.word_order.index("v.aff")
        if wo.agreement[v_index] == "noagr":
            noagr_possible += 2
            if "EXP" in wo.word_order:
                noagr_overt += 1
            if "STIM" in wo.word_order:
                noagr_overt += 1
        else:
            agr_possible += 2
            if "EXP" in wo.word_order:
                agr_overt += 1
            if "STIM" in wo.word_order:
                agr_overt += 1
print("with agreement: {} / {} = {}".format(
    agr_overt, agr_possible, float(agr_overt) / agr_possible))
print("without agreement: {} / {} = {}".format(
    noagr_overt, noagr_possible, float(noagr_overt) / noagr_possible))
cont_table = [[agr_overt, agr_possible - agr_overt],
              [noagr_overt, noagr_possible - noagr_overt]]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)

# Group the word orders from the first census by verb type, keying on the
# sorted argument pattern with the verb label removed.
v_tree = {"v.aff": collections.defaultdict(int),
          "v.tr": collections.defaultdict(int)}
for wo in word_orders:
    v = None
    if "v.tr" in wo:
        v = "v.tr"
    if "v.aff" in wo:
        v = "v.aff"
    if v is not None:
        wo2 = tuple(e for e in sorted(wo) if e != "v.aff" and e != "v.tr")
        v_tree[v][wo2] += word_orders[wo]
for v in ["v.aff", "v.tr"]:
    print(v)
    for e in v_tree[v]:
        print("{0} => {1}".format(e, v_tree[v][e]))

# Overt vs. zero EXP (for v.aff) against overt vs. zero A (for v.tr).
cont_table = [
    [v_tree["v.aff"][('EXP', 'zero-STIM')] + v_tree["v.aff"][('EXP', 'STIM')],
     v_tree["v.aff"][('zero-EXP', 'zero-STIM')] + v_tree["v.aff"][('STIM', 'zero-EXP')]],
    [v_tree["v.tr"][('A', 'P')] + v_tree["v.tr"][('A', 'zero-P')],
     v_tree["v.tr"][('P', 'zero-A')] + v_tree["v.tr"][('zero-A', 'zero-P')]],
]
print(cont_table)
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)

# Overt vs. zero STIM (for v.aff) against overt vs. zero P (for v.tr).
cont_table = [
    [v_tree["v.aff"][('STIM', 'zero-EXP')] + v_tree["v.aff"][('EXP', 'STIM')],
     v_tree["v.aff"][('zero-EXP', 'zero-STIM')] + v_tree["v.aff"][('EXP', 'zero-STIM')]],
    [v_tree["v.tr"][('A', 'P')] + v_tree["v.tr"][('P', 'zero-A')],
     v_tree["v.tr"][('A', 'zero-P')] + v_tree["v.tr"][('zero-A', 'zero-P')]],
]
print(cont_table)
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)

# Positions of S, A and P relative to the verb (zero arguments dropped;
# negative = before the verb, positive = after).
other_verbs = ['COP', 'v.tr', 'v.intr', 'v.aff']
verb_map = {v: "V" for v in other_verbs}
A_values = []
P_values = []
S_values = []
for wo in helpers.diana.word_orders(ag, annotation_map=verb_map):
    word_order = [w for w in wo.word_order if not w.startswith("zero-")]
    if "V" in word_order:
        v_index = word_order.index("V")
        if "A" in word_order:
            A_values.append(word_order.index("A") - v_index)
        if "P" in word_order:
            P_values.append(word_order.index("P") - v_index)
        if "S" in word_order:
            S_values.append(word_order.index("S") - v_index)

# %matplotlib inline  (IPython magic; no effect in a plain script)
import matplotlib.pyplot as plt

fig, axs = plt.subplots(1, 3, figsize=(14, 4))
axs[0].hist(S_values, range(min(S_values), max(S_values) + 2))
axs[0].set_title("Positions of S")
axs[1].hist(A_values, range(min(A_values), max(A_values) + 2))
axs[1].set_title("Positions of A")
axs[2].hist(P_values, range(min(P_values), max(P_values) + 2))
ret = axs[2].set_title("Positions of P")

plt.figure(figsize=(10, 6))
plt.boxplot([S_values, A_values, P_values])
plt.title("Positions of S, A and P")
ret = plt.xticks([1, 2, 3], ["S", "A", "P"])

# Same position data, split into main (index 0) vs. subordinate (index 1).
A_values = [[], []]
P_values = [[], []]
S_values = [[], []]
clause_types = ["m", "m.rs", "sub", "sub.rs"]
for wo in helpers.diana.word_orders(ag, annotation_map=verb_map):
    word_order = [w for w in wo.word_order if not w.startswith("zero-")]
    if "V" in word_order and wo.clause_type in clause_types:
        ind = 0
        if wo.clause_type == "sub" or wo.clause_type == "sub.rs":
            ind = 1
        v_index = word_order.index("V")
        if "A" in word_order:
            A_values[ind].append(word_order.index("A") - v_index)
        if "P" in word_order:
            P_values[ind].append(word_order.index("P") - v_index)
        if "S" in word_order:
            S_values[ind].append(word_order.index("S") - v_index)

fig, axs = plt.subplots(2, 3, figsize=(14, 10))
for ind in [0, 1]:
    type_text = "main"
    if ind == 1:
        type_text = "sub"
    axs[ind][0].hist(S_values[ind], range(min(S_values[ind]), max(S_values[ind]) + 2))
    axs[ind][0].set_title("Positions of S in {0} clauses".format(type_text))
    axs[ind][1].hist(A_values[ind], range(min(A_values[ind]), max(A_values[ind]) + 2))
    axs[ind][1].set_title("Positions of A in {0} clauses".format(type_text))
    axs[ind][2].hist(P_values[ind], range(min(P_values[ind]), max(P_values[ind]) + 2))
    ret = axs[ind][2].set_title("Positions of P in {0} clauses".format(type_text))

fig, axs = plt.subplots(1, 2, figsize=(14, 6))
for ind in [0, 1]:
    type_text = "main"
    if ind == 1:
        type_text = "sub"
    axs[ind].boxplot([S_values[ind], A_values[ind], P_values[ind]])
    axs[ind].set_title("Positions of S, A and P in {0} clauses".format(type_text))
    # Inside the loop so BOTH subplots get labelled ticks (plt.xticks only
    # acts on the currently active axes).
    ret = plt.xticks([1, 2, 3], ["S", "A", "P"])