"""Read a directory of XML reports into a pandas DataFrame.

Every ``.xml`` file in ``dirname`` becomes one row.  The columns are the
local names of the root's direct child elements whose tag contains the
``{http://xbrl.dcca.dk/`` namespace prefix.  Finally the twelve most frequent
values of two of those columns are printed.
"""

import os

from lxml import etree
import pandas as pd

# The data was unzipped in this directory:
dirname = '/home/fnielsen/data/virkdk/1000_digitale_aarsrapporter'

# Clark-notation ('{namespace}name') prefix of the tags that are kept.
_DCCA_TAG_PREFIX = '{http://xbrl.dcca.dk/'


def _parse_value(text):
    """Return ``text`` as a float when it parses as one, else unchanged.

    ``None`` (an element without text) is passed through as ``None``.
    """
    if text is None:
        return None
    try:
        return float(text)
    except ValueError:
        # Not numeric: keep the raw string.
        return text


def _read_report(path):
    """Return a ``{local tag name: value}`` dict for one XML file.

    Only direct children of the root element are inspected.  If the same
    local name occurs more than once, the last occurrence wins.

    Raises:
        OSError: if the file cannot be opened or read.
        etree.LxmlError: if lxml cannot parse the file.
    """
    # lxml is handed an open file object rather than the path, as before,
    # but in binary mode (so lxml handles the encoding declaration itself)
    # and inside ``with`` so the handle is always closed.
    with open(path, 'rb') as xml_file:
        tree = etree.parse(xml_file)

    record = {}
    # ``tag=etree.Element`` yields real elements only, skipping comments and
    # processing instructions, whose ``tag`` is not a string.
    for element in tree.getroot().iterchildren(tag=etree.Element):
        if _DCCA_TAG_PREFIX not in element.tag:
            continue
        # Drop the '{namespace}' part and keep the local name.
        local_name = element.tag.partition('}')[2]
        record[local_name] = _parse_value(element.text)
    return record


def _read_reports(directory):
    """Return one record dict per parsable ``.xml`` file in ``directory``.

    Files that cannot be read or parsed are skipped (best effort).  File
    names are processed in sorted order so the result is reproducible.
    """
    records = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.xml'):
            continue
        try:
            records.append(_read_report(os.path.join(directory, filename)))
        except (etree.LxmlError, OSError):
            # Some of the files are not well-formed XML; skip those.
            continue
    return records


# Attempt on a dirty reading of the data.
rows = _read_reports(dirname)
df = pd.DataFrame(rows)

# Show the twelve most common values of each column.  ``head`` is always
# positional, and printing makes the result visible when run as a script.
for column in ('NameAndSurnameOfChairmanOfGeneralMeeting', 'NameOfAuditFirm'):
    print(df[column].value_counts().head(12))