Notebook

Imports¶

In [1]:

import csv
import codecs

import poioapi.io.graf
import poioapi.annotationgraph
import poioapi.data

Parser class¶

In [2]:

class ExcelParser(poioapi.io.graf.BaseParser):
    """Parser that feeds a pipe-delimited CSV file to the GrAF converter.

    The file is consumed in consecutive blocks of ``ROWS_PER_BLOCK`` rows.
    Only three rows of every block are used: the clause ids, the clause types
    and the grammatical relations. A non-empty cell in the clause id row
    opens a new clause; the grammatical relation cells from that column up to
    (but not including) the next clause start form the word order of the
    clause.

    Tier hierarchy exposed to the converter::

        clause_id
            grammatical_relation
            clause_type

    """

    # Layout of one block of rows (0-based offsets within the block).
    ROWS_PER_BLOCK = 8
    CLAUSE_ID_ROW = 2
    CLAUSE_TYPE_ROW = 3
    GRAMMATICAL_RELATION_ROW = 4

    def __init__(self, filepath):
        """Read `filepath` and collect clauses, clause types and word orders.

        :param filepath: path of the UTF-8 encoded, ``|``-delimited CSV file.

        """
        # clause id -> list of grammatical relation strings, in column order
        self.word_orders = dict()
        # clause ids in the order in which they were read
        self.clauses = list()
        # clause id -> clause type string
        self.clause_types = dict()
        # last id handed out by _next_id(); the first id handed out is 0
        self.last_id = -1

        with codecs.open(filepath, "r", "utf-8") as csvfile:
            block = []
            for row in csv.reader(csvfile, delimiter='|'):
                block.append(row)
                if len(block) == self.ROWS_PER_BLOCK:
                    self._parse_block(
                        block[self.CLAUSE_ID_ROW],
                        block[self.CLAUSE_TYPE_ROW],
                        block[self.GRAMMATICAL_RELATION_ROW])
                    block = []
            # A trailing block with fewer than ROWS_PER_BLOCK rows is ignored.

    @staticmethod
    def _cell(row, index):
        """Return the stripped cell `index` of `row`, or "" if `row` is too short."""
        if index < len(row):
            return row[index].strip()
        return ""

    def _parse_block(self, clause_ids, clause_types, grammatical_relations):
        """Register the clauses found in one block of rows.

        :param clause_ids: row whose non-empty cells mark the start of a clause.
        :param clause_types: row holding the clause type in the start column
            of each clause.
        :param grammatical_relations: row holding one grammatical relation
            per column.

        """
        # Word order list of the clause that is currently open. It stays None
        # until the first clause of the block starts, so columns in front of
        # the first clause are skipped instead of being stored under a
        # meaningless `None` key.
        word_order = None
        for j, clause_id in enumerate(clause_ids):
            if clause_id != "":
                c_id = self._next_id()
                word_order = []
                self.clauses.append(c_id)
                self.clause_types[c_id] = self._cell(clause_types, j)
                # The list is registered right away and filled while the
                # following columns are visited.
                self.word_orders[c_id] = word_order

            if word_order is None:
                continue
            # Empty cells are kept so that list positions mirror the columns.
            word_order.append(self._cell(grammatical_relations, j))

    def _next_id(self):
        """Return the next unused integer id."""
        self.last_id += 1
        return self.last_id

    def get_root_tiers(self):
        """Return the single root tier, ``clause_id``."""
        return [poioapi.io.graf.Tier("clause_id")]

    def get_child_tiers_for_tier(self, tier):
        """Return the child tiers of `tier`, or None if it has no children."""
        if tier.name == "clause_id":
            return [poioapi.io.graf.Tier("grammatical_relation"),
                    poioapi.io.graf.Tier("clause_type")]

        return None

    def get_annotations_for_tier(self, tier, annotation_parent=None):
        """Return the annotations of `tier`.

        :param tier: the tier to return annotations for.
        :param annotation_parent: for the two child tiers, the ``clause_id``
            annotation whose children are requested; its ``id`` is looked up
            in ``clause_types`` / ``word_orders``.
        :returns: a list of annotations; empty for an unknown tier.

        """
        if tier.name == "clause_id":
            # The annotation id must be the clause id itself, because the
            # child tiers use it as key into clause_types and word_orders.
            return [poioapi.io.graf.Annotation(c_id, c_id) for c_id in self.clauses]

        elif tier.name == "clause_type":
            return [poioapi.io.graf.Annotation(self._next_id(), self.clause_types[annotation_parent.id])]

        elif tier.name == "grammatical_relation":
            return [poioapi.io.graf.Annotation(self._next_id(), v) for v in self.word_orders[annotation_parent.id]]

        return []

    def tier_has_regions(self, tier):
        """Return False: no tier of this parser carries regions."""
        return False

    def region_for_annotation(self, annotation):
        """Return None, as no regions are provided."""
        pass

    def get_primary_data(self):
        """Return None, as no primary data is provided."""
        pass

In [3]:

def from_excel(filepath):
    """Build an annotation graph from the CSV file at `filepath`.

    The file is parsed with `ExcelParser` and converted with the GrAF
    converter; the resulting tier hierarchies and GrAF object are attached to
    a new annotation graph.

    :param filepath: path handed to `ExcelParser`.
    :returns: the populated `poioapi.annotationgraph.AnnotationGraph`.

    """
    annotation_graph = poioapi.annotationgraph.AnnotationGraph()

    graf_converter = poioapi.io.graf.GrAFConverter(ExcelParser(filepath))
    graf_converter.parse()

    annotation_graph.tier_hierarchies = graf_converter.tier_hierarchies
    # The data structure type is derived from the first tier hierarchy.
    first_hierarchy = annotation_graph.tier_hierarchies[0]
    annotation_graph.structure_type_handler = poioapi.data.DataStructureType(first_hierarchy)
    annotation_graph.graf = graf_converter.graf
    return annotation_graph

Loading the data¶

In [4]:

# Annotation graph used by all analysis cells below.
ag = from_excel(filepath="data/Hinuq2.csv")

Counting word orders¶

In [5]:

import collections

# Annotation values that are collapsed into the single label 'V'.
verbs = [ 'COP', 'cop', 'SAY', 'say', 'v.tr', 'v.intr', 'v.aff' ]
# Annotation values that are kept unchanged.
others = [ 'A', 'S', 'P', 'EXP', 'STIM' ]
search_terms = verbs + others

clause_unit_nodes = ag.nodes_for_tier("clause_id")

# word order pattern (tuple of labels) -> number of clauses with that pattern
word_orders = collections.defaultdict(int)

for clause_node in clause_unit_nodes:
    pattern = []
    for child_node in clause_node.iter_children():
        annotations = ag.annotations_for_tier("grammatical_relation", child_node)
        if len(annotations) == 0:
            continue
        value = ag.annotation_value_for_annotation(annotations[0])
        # Anything outside of verbs/others is left out of the pattern.
        if value in verbs:
            pattern.append('V')
        elif value in others:
            pattern.append(value)
    word_orders[tuple(pattern)] += 1

for pattern, count in word_orders.items():
    print(str(pattern) + " => " + str(count))

('P', 'V', 'A') => 18
('V',) => 4
('S',) => 4
('STIM', 'V', 'EXP') => 5
('P', 'A', 'V') => 8
('V', 'P', 'A') => 3
('EXP', 'STIM', 'V') => 23
('STIM', 'EXP', 'V') => 1
('A', 'P', 'V') => 112
() => 6
('V', 'S') => 60
('S', 'V') => 228
('P', 'A') => 1
('A', 'V', 'P') => 25
('A', 'V') => 22
('V', 'A', 'P') => 4
('EXP', 'V', 'STIM') => 7
('V', 'A') => 17
('V', 'EXP', 'STIM') => 2

Positions of S, A and P¶

In [6]:

# Offsets of the first A, P and S relative to the first V of each clause.
A_values = []
P_values = []
S_values = []
role_targets = (("A", A_values), ("P", P_values), ("S", S_values))

for clause_node in clause_unit_nodes:
    labels = []
    for gramm_node in ag.nodes_for_tier("grammatical_relation", clause_node):
        annotations = ag.annotations_for_tier("grammatical_relation", gramm_node)
        if len(annotations) == 0:
            continue
        label = ag.annotation_value_for_annotation(annotations[0])
        labels.append("V" if label in verbs else label)

    # Clauses without a verb have no reference point and are skipped.
    if "V" not in labels:
        continue
    verb_position = labels.index("V")
    for role, offsets in role_targets:
        if role in labels:
            offsets.append(labels.index(role) - verb_position)

In [8]:

%matplotlib inline
import matplotlib.pyplot as plt

fig, axs = plt.subplots(1, 3, figsize=(14,4))
axs[0].hist(S_values, range(min(S_values), max(S_values)+2))
axs[0].set_title("Positions of S")
axs[1].hist(A_values, range(min(A_values), max(A_values)+2))
axs[1].set_title("Positions of A")
axs[2].hist(P_values, range(min(P_values), max(P_values)+2))
ret = axs[2].set_title("Positions of P")

Box plots of positions¶

In [10]:

# One box per grammatical relation, in the order of the tick labels.
box_labels = ["S", "A", "P"]
box_data = [S_values, A_values, P_values]

plt.figure(figsize=(10,6))
plt.boxplot(box_data)
plt.title("Positions of S, A and P")
# Boxes are drawn at x = 1, 2, 3; assigning the result hides it from the output.
ret = plt.xticks([1, 2, 3], box_labels)

Plots by clause type¶

In [11]:

# Index 0 collects "m"/"m.rs" clauses, index 1 collects "sub"/"sub.rs" clauses.
A_values = [[], []]
P_values = [[], []]
S_values = [[], []]
role_targets = (("A", A_values), ("P", P_values), ("S", S_values))
clause_types = ["m", "m.rs", "sub", "sub.rs"]

for clause_node in clause_unit_nodes:
    clause_type = None
    type_nodes = ag.nodes_for_tier("clause_type", clause_node)
    if len(type_nodes) > 0:
        type_annotations = ag.annotations_for_tier("clause_type", type_nodes[0])
        clause_type = ag.annotation_value_for_annotation(type_annotations[0])

    labels = []
    for gramm_node in ag.nodes_for_tier("grammatical_relation", clause_node):
        annotations = ag.annotations_for_tier("grammatical_relation", gramm_node)
        if len(annotations) == 0:
            continue
        label = ag.annotation_value_for_annotation(annotations[0])
        labels.append("V" if label in verbs else label)

    # Only clauses with a verb and one of the listed clause types are counted.
    if "V" not in labels or clause_type not in clause_types:
        continue

    ind = 1 if clause_type in ("sub", "sub.rs") else 0
    verb_position = labels.index("V")
    for role, offsets in role_targets:
        if role in labels:
            offsets[ind].append(labels.index(role) - verb_position)

In [12]:

fig, axs = plt.subplots(2, 3, figsize=(14,10))
# Row 0 shows the values stored at index 0 ("main"), row 1 those at index 1 ("sub").
for ind, type_text in enumerate(["main", "sub"]):
    panels = [("S", S_values[ind]), ("A", A_values[ind]), ("P", P_values[ind])]
    for col, (label, positions) in enumerate(panels):
        ax = axs[ind][col]
        # min()/max() fail on an empty list, so an empty group keeps an empty
        # panel instead of aborting the whole figure.
        if len(positions) > 0:
            # Bin edges at every integer from the smallest to the largest value + 1.
            ax.hist(positions, range(min(positions), max(positions)+2))
        # All panels share one title pattern ("... of X in <type> clauses").
        ret = ax.set_title("Positions of {0} in {1} clauses".format(label, type_text))

Box plots by clause type¶

In [14]:

fig, axs = plt.subplots(1, 2, figsize=(14,6))
for ind in [0, 1]:
    type_text = "main"
    if ind == 1:
        type_text = "sub"

    axs[ind].boxplot([S_values[ind], A_values[ind], P_values[ind]])
    axs[ind].set_title("Positions of S, A and P in {0} clauses".format(type_text))
    # plt.xticks() only changes the current axes, which is the same panel on
    # both passes of this loop; the labels are therefore set on the axes that
    # was just drawn. Boxes are placed at x = 1, 2, 3.
    axs[ind].set_xticks([1, 2, 3])
    ret = axs[ind].set_xticklabels(["S", "A", "P"])

In [ ]: