import csv
import codecs
import poioapi.io.graf
import poioapi.annotationgraph
import poioapi.data
class ExcelParser(poioapi.io.graf.BaseParser):
    """Parser for a pipe-delimited CSV export, usable with ``GrAFConverter``.

    The file is consumed in groups of eight rows. Within each group, the row
    at index 2 holds the clause ids, the row at index 3 the clause types and
    the row at index 4 the grammatical relations (indices are zero-based).
    A non-empty cell in the clause id row starts a new clause; every column
    from there up to the next non-empty clause id cell contributes one
    grammatical relation to that clause.

    After construction:

    * ``clauses`` is the list of generated clause ids, in file order,
    * ``clause_types`` maps a clause id to its stripped clause type string,
    * ``word_orders`` maps a clause id to its list of stripped grammatical
      relation strings (empty cells are kept as empty strings).
    """

    def __init__(self, filepath):
        """Read the file at ``filepath`` (UTF-8) and collect all clauses."""
        self.word_orders = dict()
        self.clauses = list()
        self.clause_types = dict()
        self.last_id = -1

        with codecs.open(filepath, "r", "utf-8") as csvfile:
            reader = csv.reader(csvfile, delimiter='|')
            rows_seen = 0
            for row in reader:
                if rows_seen == 2:
                    clause_ids = row
                elif rows_seen == 3:
                    clause_types = row
                elif rows_seen == 4:
                    grammatical_relations = row
                rows_seen += 1
                # A group is complete once eight rows have been read.
                if rows_seen > 7:
                    self._add_clauses_from_group(
                        clause_ids, clause_types, grammatical_relations)
                    rows_seen = 0

    def _add_clauses_from_group(self, clause_ids, clause_types,
                                grammatical_relations):
        """Register the clauses described by one group of rows."""
        word_order = []
        c_id = None
        for j, clause_id in enumerate(clause_ids):
            if clause_id != "":
                # A new clause starts here: close the previous one. Columns
                # seen before the first clause of the group belong to no
                # clause and are dropped instead of being stored under None.
                if c_id is not None and len(word_order) > 0:
                    self.word_orders[c_id] = word_order
                word_order = []
                c_id = self._next_id()
                self.clauses.append(c_id)
                self.clause_types[c_id] = clause_types[j].strip()
            word_order.append(grammatical_relations[j].strip())
        # Close the last clause of the group.
        if c_id is not None and len(word_order) > 0:
            self.word_orders[c_id] = word_order

    def _next_id(self):
        """Return the next unused integer id (starting at 0)."""
        self.last_id += 1
        return self.last_id

    def get_root_tiers(self):
        """Return the single root tier, ``clause_id``."""
        return [poioapi.io.graf.Tier("clause_id")]

    def get_child_tiers_for_tier(self, tier):
        """Return the child tiers of ``tier``.

        Only ``clause_id`` has children (``grammatical_relation`` and
        ``clause_type``); for any other tier ``None`` is returned.
        """
        if tier.name == "clause_id":
            return [poioapi.io.graf.Tier("grammatical_relation"),
                    poioapi.io.graf.Tier("clause_type")]
        return None

    def get_annotations_for_tier(self, tier, annotation_parent=None):
        """Return the annotations of ``tier``.

        For ``clause_id`` one annotation per clause is returned. For the two
        child tiers ``annotation_parent`` must be a clause annotation; its
        ``id`` is used to look up the clause type or the grammatical
        relations. Any other tier yields an empty list.
        """
        if tier.name == "clause_id":
            # The annotation id must be the clause id, because the child
            # tiers look their data up by ``annotation_parent.id``.
            return [poioapi.io.graf.Annotation(c_id, c_id)
                    for c_id in self.clauses]
        elif tier.name == "clause_type":
            return [poioapi.io.graf.Annotation(
                self._next_id(), self.clause_types[annotation_parent.id])]
        elif tier.name == "grammatical_relation":
            return [poioapi.io.graf.Annotation(self._next_id(), v)
                    for v in self.word_orders[annotation_parent.id]]
        return []

    def tier_has_regions(self, tier):
        """Return ``False``: no tier of this parser carries regions."""
        return False

    def region_for_annotation(self, annotation):
        """Return ``None``; this parser provides no regions."""
        pass

    def get_primary_data(self):
        """Return ``None``; this parser provides no primary data."""
        pass
def from_excel(filepath):
    """Parse the file at ``filepath`` and return it as an annotation graph.

    The file is read with ``ExcelParser`` and converted with
    ``GrAFConverter``. The returned ``AnnotationGraph`` carries the
    converter's tier hierarchies and GrAF graph, and a structure type
    handler built from the first tier hierarchy.
    """
    graph = poioapi.annotationgraph.AnnotationGraph()

    converter = poioapi.io.graf.GrAFConverter(ExcelParser(filepath))
    converter.parse()

    graph.tier_hierarchies = converter.tier_hierarchies
    first_hierarchy = graph.tier_hierarchies[0]
    graph.structure_type_handler = poioapi.data.DataStructureType(
        first_hierarchy)
    graph.graf = converter.graf
    return graph
# Load the corpus export into an annotation graph; all analyses below
# query this graph.
ag = from_excel("data/Hinuq2.csv")
import collections
# Grammatical-relation labels that are collapsed into the single symbol 'V'.
verbs = ['COP', 'cop', 'SAY', 'say', 'v.tr', 'v.intr', 'v.aff']
# Argument labels that are kept as they are.
others = ['A', 'S', 'P', 'EXP', 'STIM']
search_terms = verbs + others

clause_unit_nodes = ag.nodes_for_tier("clause_id")

# Count how often each word-order pattern (a tuple of labels) occurs.
word_orders = collections.defaultdict(int)
for clause_node in clause_unit_nodes:
    pattern = []
    for child_node in clause_node.iter_children():
        annotations = ag.annotations_for_tier(
            "grammatical_relation", child_node)
        if len(annotations) == 0:
            continue
        label = ag.annotation_value_for_annotation(annotations[0])
        # Labels outside the search terms do not take part in the pattern.
        if label not in search_terms:
            continue
        pattern.append('V' if label in verbs else label)
    word_orders[tuple(pattern)] += 1

for pattern, count in word_orders.items():
    print(str(pattern) + " => " + str(count))
# Output:
# ('P', 'V', 'A') => 18
# ('V',) => 4
# ('S',) => 4
# ('STIM', 'V', 'EXP') => 5
# ('P', 'A', 'V') => 8
# ('V', 'P', 'A') => 3
# ('EXP', 'STIM', 'V') => 23
# ('STIM', 'EXP', 'V') => 1
# ('A', 'P', 'V') => 112
# () => 6
# ('V', 'S') => 60
# ('S', 'V') => 228
# ('P', 'A') => 1
# ('A', 'V', 'P') => 25
# ('A', 'V') => 22
# ('V', 'A', 'P') => 4
# ('EXP', 'V', 'STIM') => 7
# ('V', 'A') => 17
# ('V', 'EXP', 'STIM') => 2
# Offsets of A, P and S relative to the first verb of their clause
# (negative: before the verb, positive: after it).
A_values = []
P_values = []
S_values = []

for clause_node in clause_unit_nodes:
    relations = []
    for relation_node in ag.nodes_for_tier(
            "grammatical_relation", clause_node):
        annotations = ag.annotations_for_tier(
            "grammatical_relation", relation_node)
        if len(annotations) == 0:
            continue
        label = ag.annotation_value_for_annotation(annotations[0])
        # All verb labels are collapsed into "V".
        relations.append("V" if label in verbs else label)

    # Clauses without a verb have no reference point and are skipped.
    if "V" not in relations:
        continue
    verb_position = relations.index("V")
    for role, offsets in (("A", A_values), ("P", P_values), ("S", S_values)):
        if role in relations:
            offsets.append(relations.index(role) - verb_position)
%matplotlib inline
import matplotlib.pyplot as plt
# One histogram per role, with one bin per integer offset.
fig, axs = plt.subplots(1, 3, figsize=(14,4))
histogram_panels = [
    ("Positions of S", S_values),
    ("Positions of A", A_values),
    ("Positions of P", P_values),
]
for axis, (title, offsets) in zip(axs, histogram_panels):
    bin_edges = range(min(offsets), max(offsets)+2)
    axis.hist(offsets, bin_edges)
    # Binding the result keeps the notebook from echoing it.
    ret = axis.set_title(title)

# Box plot comparing the three offset distributions side by side.
role_names = ["S", "A", "P"]
plt.figure(figsize=(10,6))
plt.boxplot([S_values, A_values, P_values])
plt.title("Positions of S, A and P")
ret = plt.xticks([1, 2, 3], role_names)
# Offsets relative to the first verb, split by clause kind:
# index 0 collects main clauses, index 1 collects subordinate clauses.
A_values = [[], []]
P_values = [[], []]
S_values = [[], []]
clause_types = ["m", "m.rs", "sub", "sub.rs"]

for clause_node in clause_unit_nodes:
    # Clause type of this clause, or None if it has no clause_type node.
    clause_type = None
    type_nodes = ag.nodes_for_tier("clause_type", clause_node)
    if len(type_nodes) > 0:
        type_annotations = ag.annotations_for_tier(
            "clause_type", type_nodes[0])
        clause_type = ag.annotation_value_for_annotation(type_annotations[0])

    relations = []
    for relation_node in ag.nodes_for_tier(
            "grammatical_relation", clause_node):
        annotations = ag.annotations_for_tier(
            "grammatical_relation", relation_node)
        if len(annotations) == 0:
            continue
        label = ag.annotation_value_for_annotation(annotations[0])
        # All verb labels are collapsed into "V".
        relations.append("V" if label in verbs else label)

    # Only clauses with a verb and one of the listed types are counted.
    if "V" not in relations or clause_type not in clause_types:
        continue
    ind = 1 if clause_type in ("sub", "sub.rs") else 0
    verb_position = relations.index("V")
    for role, buckets in (("A", A_values), ("P", P_values), ("S", S_values)):
        if role in relations:
            buckets[ind].append(relations.index(role) - verb_position)
# Histograms of the offsets relative to the verb: one row of panels per
# clause kind (row 0: main, row 1: sub), one column per role (S, A, P).
fig, axs = plt.subplots(2, 3, figsize=(14,10))
for ind in [0, 1]:
    type_text = "main"
    if ind == 1:
        type_text = "sub"
    panels = [("S", S_values[ind]), ("A", A_values[ind]), ("P", P_values[ind])]
    for column, (role, offsets) in enumerate(panels):
        # One bin per integer offset between the smallest and largest value.
        bin_edges = range(min(offsets), max(offsets)+2)
        axs[ind][column].hist(offsets, bin_edges)
        # All three titles share one wording; the A and P titles used to
        # lack the word "in". Binding the result keeps the notebook from
        # echoing it.
        ret = axs[ind][column].set_title(
            "Positions of {0} in {1} clauses".format(role, type_text))
# Box plots of the offsets relative to the verb, one panel per clause kind
# (left: main, right: sub), each comparing S, A and P.
fig, axs = plt.subplots(1, 2, figsize=(14,6))
for ind in [0, 1]:
    type_text = "main"
    if ind == 1:
        type_text = "sub"
    axs[ind].boxplot([S_values[ind], A_values[ind], P_values[ind]])
    axs[ind].set_title("Positions of S, A and P in {0} clauses".format(type_text))
    # Label the ticks on this panel's own axes. plt.xticks() only changes
    # the current axes, which left the first panel without role labels.
    axs[ind].set_xticks([1, 2, 3])
    # Binding the result keeps the notebook from echoing it.
    ret = axs[ind].set_xticklabels(["S", "A", "P"])