This document is organized in 3 sections:
import xml.etree.cElementTree as ET
tree = ET.ElementTree(file='doc1.xml')
tree.getroot()
<Element 'doc' at 0x05349EC0>
root = tree.getroot()
root.tag, root.attrib
('doc', {})
for child_of_root in root:
print child_of_root.tag, child_of_root.attrib
branch {'hash': '1cdf045c', 'name': 'testing'} branch {'hash': 'f200013e', 'name': 'release01'} branch {'name': 'invalid'}
root[0].tag, root[0].text
('branch', '\n text,source\n ')
for elem in tree.iter():
print elem.tag, elem.attrib
doc {} branch {'hash': '1cdf045c', 'name': 'testing'} branch {'hash': 'f200013e', 'name': 'release01'} sub-branch {'name': 'subrelease01'} branch {'name': 'invalid'}
for elem in tree.iter(tag='branch'):
print elem.tag, elem.attrib
branch {'hash': '1cdf045c', 'name': 'testing'} branch {'hash': 'f200013e', 'name': 'release01'} branch {'name': 'invalid'}
for elem in tree.iterfind('branch/sub-branch'):
print elem.tag, elem.attrib
sub-branch {'name': 'subrelease01'}
for elem in tree.iterfind('branch'):
print elem.tag, elem.attrib
branch {'hash': '1cdf045c', 'name': 'testing'} branch {'hash': 'f200013e', 'name': 'release01'} branch {'name': 'invalid'}
for elem in tree.iterfind('branch[@name="release01"]'):
print elem.tag, elem.attrib
branch {'hash': 'f200013e', 'name': 'release01'}
import xml.etree.cElementTree as ET
tree = ET.ElementTree(file='kanjidic2_example.xml')
First of all, what does the tree look like in this example file?
elems = [elem for elem in tree.iter()][:10]
elems
[<Element 'character' at 0x050E6848>, <Element 'literal' at 0x050E68A8>, <Element 'codepoint' at 0x050E68D8>, <Element 'cp_value' at 0x050E6908>, <Element 'cp_value' at 0x050E6920>, <Element 'radical' at 0x050E6938>, <Element 'rad_value' at 0x050E6968>, <Element 'rad_value' at 0x050E6980>, <Element 'misc' at 0x050E6998>, <Element 'grade' at 0x050E69C8>]
Getting the root: the 'character'.
root = tree.getroot()
root
<Element 'character' at 0x050E6848>
Getting the literal.
literal = root[0]
literal
<Element 'literal' at 0x050E68A8>
kanji = literal.text
kanji
u'\u672c'
print kanji
本
Getting the meanings.
meanings = [elem for elem in tree.iter('meaning')]
[meaning.text for meaning in meanings]
['book', 'present', 'main', 'true', 'real', 'counter for long cylindrical things', 'livre', u'pr\xe9sent', 'essentiel', 'origine', 'principal', u'r\xe9alit\xe9', u'v\xe9rit\xe9', u"compteur d'objets allong\xe9s", 'libro', 'origen', 'base', 'contador de cosas alargadas', 'livro', 'presente', 'real', 'verdadeiro', 'principal', 'sufixo p/ contagem De coisas longas']
But here we only want the English meanings, i.e. the `meaning` elements that carry no `m_lang` attribute.
meanings[10].attrib
{'m_lang': 'fr'}
english_meanings = filter(lambda elem: elem.attrib == {}, meanings)
[meaning.text for meaning in english_meanings]
['book', 'present', 'main', 'true', 'real', 'counter for long cylindrical things']
Finally, we can get the kana readings.
readings = [elem for elem in tree.iter('reading')]
print [reading.text for reading in readings]
['ben3', 'bon', u'\ubcf8', u'\u30db\u30f3', u'\u3082\u3068']
Filtering for kana: keep only the readings whose `r_type` is `ja_on` or `ja_kun`.
readings[0].attrib['r_type']
'pinyin'
'r_type' in readings[0].attrib
True
kanas = filter(lambda reading: reading.attrib['r_type'] in ['ja_on', 'ja_kun'], readings)
kanas
[<Element 'reading' at 0x050E6EF0>, <Element 'reading' at 0x050E6F08>]
for kana in kanas:
print kana.text
ホン もと
import xml.etree.cElementTree as ET
tree = ET.ElementTree(file='kanjidic2.xml')
tree
<ElementTree at 0x49cce50>
root = tree.getroot()
root
<Element 'kanjidic2' at 0x049D8AB8>
root.findall('character/literal')[:10]
[<Element 'literal' at 0x049D8440>, <Element 'literal' at 0x04A2D488>, <Element 'literal' at 0x04A2D7E8>, <Element 'literal' at 0x04A2DB48>, <Element 'literal' at 0x049FC260>, <Element 'literal' at 0x049FCD28>, <Element 'literal' at 0x04A276E0>, <Element 'literal' at 0x04A27AE8>, <Element 'literal' at 0x04A27E48>, <Element 'literal' at 0x04A31428>]
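I understand now: `findall` matches a path relative to the element it is called on, so the exact branching (here `character/literal`) has to be spelled out, whereas `iter` works with a bare tag name because it filters a depth-first traversal of the whole subtree.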
Searching for the entry of a specific kanji.
search_kanji = u'本'
literals = root.findall('character/literal')
literals[:10]
[<Element 'literal' at 0x049D8440>, <Element 'literal' at 0x04A2D488>, <Element 'literal' at 0x04A2D7E8>, <Element 'literal' at 0x04A2DB48>, <Element 'literal' at 0x049FC260>, <Element 'literal' at 0x049FCD28>, <Element 'literal' at 0x04A276E0>, <Element 'literal' at 0x04A27AE8>, <Element 'literal' at 0x04A27E48>, <Element 'literal' at 0x04A31428>]
len(literals)
13108
tree.find('character/literal')
<Element 'literal' at 0x0A8ABDB8>
[literal.text for literal in literals].index(u'話')
2948
print literals[2948].text
話
Getting the parent node.
characters = root.findall('character')
characters[:10]
[<Element 'character' at 0x0A8ABA10>, <Element 'character' at 0x0A8AB3E0>, <Element 'character' at 0x0A8A6F98>, <Element 'character' at 0x0A8A6C50>, <Element 'character' at 0x0A8A6350>, <Element 'character' at 0x0A8A18F0>, <Element 'character' at 0x0A89DE90>, <Element 'character' at 0x0A89DA70>, <Element 'character' at 0x0A89D758>, <Element 'character' at 0x0A89D170>]
print characters[2948][0].text
話
def find_element_by_kanji(tree, kanji):
    """Return the ``character`` element whose ``literal`` text equals *kanji*.

    Args:
        tree: a parsed ElementTree whose root has ``character`` children, each
            holding its kanji in a ``literal`` child.
        kanji: the literal (unicode string) to look for.

    Returns:
        The first ``character`` element that has a matching ``literal`` child.

    Raises:
        ValueError: if no ``character`` has *kanji* as its literal.
    """
    # Match each character against its own literal(s) instead of indexing two
    # separately built lists: those lists only line up when every character
    # has exactly one literal, and building them walks the document twice.
    for character in tree.getroot().findall('character'):
        for literal in character.findall('literal'):
            if literal.text == kanji:
                return character
    # Same exception type the previous list.index() lookup raised.
    raise ValueError('%r is not in the tree' % (kanji,))
kuruma = find_element_by_kanji(tree, u'車')
kuruma
<Element 'character' at 0x0738E500>
print kuruma[0].text
車
def extract_data(element):
    """Collect the kanji, its Japanese readings and its untagged meanings.

    Args:
        element: a ``character`` element with a ``literal`` child and
            ``reading``/``meaning`` nodes under ``reading_meaning/rmgroup``.

    Returns:
        A ``(kanji, kana, meanings)`` tuple: the literal text, the list of
        reading texts whose ``r_type`` is ``ja_on`` or ``ja_kun``, and the
        list of meaning texts that have no attributes at all.
    """
    literal_text = element.find('literal').text

    reading_nodes = element.findall('reading_meaning/rmgroup/reading')
    kana_texts = [
        node.text
        for node in reading_nodes
        if node.attrib['r_type'] in ('ja_on', 'ja_kun')
    ]

    meaning_nodes = element.findall('reading_meaning/rmgroup/meaning')
    # Only attribute-free meaning nodes are kept.
    plain_meanings = [node.text for node in meaning_nodes if not node.attrib]

    return (literal_text, kana_texts, plain_meanings)
def disp_data(data):
    """Print an extracted kanji record, one value per line.

    Args:
        data: an indexable record with the kanji at index 0, an iterable of
            readings at index 1 and an iterable of meanings at index 2
            (the shape returned by ``extract_data``).
    """
    # A single-argument, parenthesised print gives the same output as the
    # print statement on Python 2 and is also valid syntax on Python 3.
    print(data[0])
    for reading in data[1]:
        print(reading)
    for meaning in data[2]:
        print(meaning)
data = extract_data(kuruma)
disp_data(data)
車 シャ くるま car
disp_data(extract_data(find_element_by_kanji(tree, u'話')))
話 ワ はな.す はなし tale talk
disp_data(extract_data(find_element_by_kanji(tree, u'尖')))
尖 セン とが.る さき するど.い be pointed sharp taper displeased angry edgy
tree = ET.ElementTree(file='JMdict_example.xml')
tree
<ElementTree at 0x36692dd0>
root = tree.getroot()
root
<Element 'entry' at 0x366944E8>
Looking at the first few lines.
elems = [elem for elem in tree.iter()][:10]
elems
[<Element 'entry' at 0x366944E8>, <Element 'ent_seq' at 0x36694728>, <Element 'k_ele' at 0x36694500>, <Element 'keb' at 0x366946E0>, <Element 'ke_pri' at 0x36694650>, <Element 'ke_pri' at 0x36694548>, <Element 'ke_pri' at 0x36694608>, <Element 'r_ele' at 0x36694518>, <Element 'reb' at 0x36694578>, <Element 're_pri' at 0x366947A0>]
expression = root.find('k_ele/keb').text
print expression
右翼
reading = root.find('r_ele/reb').text
print reading
うよく
senses = root.findall('sense/gloss')
senses
[<Element 'gloss' at 0x366AF2D8>, <Element 'gloss' at 0x366AF320>, <Element 'gloss' at 0x366AF098>, <Element 'gloss' at 0x366AF368>, <Element 'gloss' at 0x366AF380>, <Element 'gloss' at 0x366AF3B0>, <Element 'gloss' at 0x366AF410>, <Element 'gloss' at 0x366AF428>, <Element 'gloss' at 0x366AF440>, <Element 'gloss' at 0x366AF470>, <Element 'gloss' at 0x366AF4A0>, <Element 'gloss' at 0x366AF4D0>]
senses = filter(lambda sense: sense.attrib == {}, senses)
senses
[<Element 'gloss' at 0x366AF2D8>, <Element 'gloss' at 0x366AF410>, <Element 'gloss' at 0x366AF428>, <Element 'gloss' at 0x366AF440>]
for sense in senses:
print sense.text
right-wing right field (e.g. in sport) right flank right wing
tree = ET.ElementTree(file='JMdict.xml')
tree
<ElementTree at 0x4a159b0>
root = tree.getroot()
root
<Element 'JMdict' at 0x04995D28>
word_entries = tree.getroot().findall('entry/k_ele/keb')
words = [entry.text for entry in word_entries]
len(words)
165048
for word in words[:50]:
print word
〃 仝 々 漢数字ゼロ ○ 〇 ABC順 CDプレーヤー CDプレイヤー N響 Oバック RS232ケーブル Tシャツ Tバック あうんの呼吸 阿吽の呼吸 明白 明白 偸閑 白地 明かん 悪どい 論う 馬酔木 彼処 彼所 あっと言う間に あっという間に あっとゆう間に 彼の あの人 彼の人 あの方 彼の方 溢れる 阿呆陀羅 甘子 天魚 雨子 𩺊 彼 いい加減にしなさい いい年をして 否々 否否 如何わしい いかなる場合でも 如何にも 幾つも 行けない
words[49][0] in words[34]
False
Search for the expressions that contain a specific kanji:
filtered_words = filter(lambda expression: u'寺' in expression, words)
for word in filtered_words:
print word
駆け込み寺 駆込み寺 古社寺 山寺 寺 寺院 禅寺 僧寺 大寺院 中禅寺湖 尼寺 仏寺 末寺 古寺 寺社 社寺 国分寺 寺参り 寺子屋 寺小屋 回教寺院 縁切り寺 氏寺 檀那寺 勅願寺 寺男 寺銭 菩提寺 寺格 八百八寺 寺内 入寺 敵は本能寺にあり 敵は本能寺に在り 寺号 寺域 官寺 大覚寺統 脇寺 寺中 寺社奉行 寺預け 寺入り 南都七大寺 七大寺 本願寺派 仏光寺派 誠照寺派 少林寺拳法 寺 お寺 御寺 お寺様 お寺さま 御寺様 紅妙蓮寺 寺請 寺請け 寺請制度 寺檀制度 三井寺歩行虫 三井寺芥虫 寺子 少林寺流 寺領 本能寺の変 寺務 監寺 都寺 副寺 寺務所 私寺 お寺さん 御寺さん 道明寺粉 廃寺 当寺 寺内町 宮寺 神宮寺 諸寺