from lxml import etree
import os
import pandas as pd
# The data was unzipped in this directory:
dirname = '/home/fnielsen/data/virkdk/1000_digitale_aarsrapporter'
# A quick-and-dirty attempt at reading the data.
# Not necessarily pretty; it includes some strange workarounds.
# Build one dict per annual report, mapping local tag name -> value.
rows = []
for filename in os.listdir(dirname):
    if not filename.endswith('.xml'):
        continue
    try:
        # Binary mode leaves decoding to the XML parser, and the context
        # manager closes the handle even when parsing fails.
        with open(os.path.join(dirname, filename), 'rb') as xml_file:
            tree = etree.parse(xml_file)
    except (etree.LxmlError, OSError):
        # Best effort: reports that cannot be read or parsed are skipped
        # (some files are reported to be malformed, apparently because of
        # spaces). Only parser and I/O errors are ignored, so that
        # KeyboardInterrupt and genuine programming errors still surface.
        continue
    d = {}
    # NOTE(review): '/' presumably selects the direct children of the root
    # element -- confirm against the lxml version in use.
    for element in tree.findall('/'):
        # Skip comment nodes before inspecting the tag.
        if isinstance(element, etree._Comment):
            continue
        # Keep only elements whose tag mentions an xbrl.dcca.dk namespace.
        if '{http://xbrl.dcca.dk/' not in element.tag:
            continue
        # Drop everything up to and including the first '}' (the namespace
        # part), leaving the local tag name.
        tag = element.tag[element.tag.index('}') + 1:]
        # NOTE(review): the context is not used below, but this lookup
        # raises KeyError for an element without a contextRef attribute.
        context = element.attrib['contextRef']
        if element.text is None:
            value = None
        else:
            try:
                # Numeric content becomes a float; anything else stays text.
                value = float(element.text)
            except (TypeError, ValueError):
                value = element.text
        # Contexts are not distinguished: when a tag occurs more than once
        # in a report, the last occurrence wins.
        d[tag] = value
    rows.append(d)
# Assemble the collected per-report dicts into a single table.
df = pd.DataFrame(data=rows)
# The twelve most frequent names of general-meeting chairmen.
df['NameAndSurnameOfChairmanOfGeneralMeeting'].value_counts().head(12)
# Output:
# Nils B. Bonde             7
# Michael Vinther           7
# Jens Tange Møllmann       5
# Kim Larsen                5
# Henrik Klougart           4
# Uffe Martin Jensen        4
# Henrik Rasmussen          4
# Merete Lundbye Møller     4
# Johannes Nielsen          4
# Poul-Erik Vind            4
# Søren Bruun Rasmussen     3
# Thomas Folmann            3
# dtype: int64
# The twelve most frequent audit firm names.
df['NameOfAuditFirm'].value_counts().head(12)
# Output:
# Deloitte Statsautoriseret Revisionspartnerselskab                  75
# PricewaterhouseCoopers Statsautoriseret Revisionspartnerselskab    63
# Beierholm                                                          52
# BDO Statsautoriseret revisionsaktieselskab                         52
# KPMG                                                               26
# AP | Statsautoriserede Revisorer P/S,                              17
# Redmark, Statsautoriseret Revisionspartnerselskab                  15
# KPMG Statsautoriseret Revisionspartnerselskab                      13
# GLB REVISION, Statsautoriserede Revisorer A/S                      12
# CHRISTENSEN KJÆRULFF, statsautoriseret revisionsaktieselskab       12
# Partner Revision, statsautoriseret revisionsaktieselskab           11
#                                                                     8
# dtype: int64