# Grab the data as parsed XML elements in a list
import datetime  # needed by the timestamp parsing below; was never imported

import pandas as pd
from lxml import etree

race = 'data/F1 Race.txt'

# One XML element per line of the feed file; the file is closed once read.
with open(race, 'r') as feed:
    pl = [etree.fromstring(el) for el in feed]

# Column mappings for the timing screen
raceMap = {
    '1': 'classpos',
    '2': 'racingNumber',
    '3': 'name',
    '4': 'gap',
    '5': 'interval',
    '6': 'laptime',
    '7': 'sector1',
    '9': 'sector2',
    '11': 'sector3',
    '12': 'pitlap',
    '13': 'pitcount',
}
# Reverse lookup: column name -> column id
raceUnMap = {raceMap[k]: k for k in raceMap}
raceUnMap


def _timestamp_to_time(stamp):
    """Convert an 'HH:MM:SS.mmm' feed timestamp into a ``datetime.time``.

    The fourth field is multiplied by 1000 to give microseconds, i.e. it is
    treated as milliseconds.
    """
    tt = stamp.replace('.', ':').split(':')
    return datetime.time(int(tt[0]), int(tt[1]), int(tt[2]), int(tt[3]) * 1000)


# Make use of row semantics (row is classification position)
# Generate a classification position/time datastructure for each driverNumber
def parse_race_pos_all(r, pos, c):
    """Append the value of column ``c`` from element ``r`` to ``pos``.

    r   -- data element (parsed XML element with one child carrying the cell)
    pos -- dataframe accumulated so far
    c   -- column id (a key of ``raceMap``)

    Only elements with identifier '101' whose child has no 'sessionstate'
    attribute and whose 'column' equals ``c`` contribute a row. The new row is
    indexed by the element timestamp and holds the value, its colour and the
    row ("pos") it was shown in. Returns the (possibly extended) dataframe.
    """
    if r.attrib['identifier'] == '101' and 'sessionstate' not in r[0].attrib:
        cn = raceMap[c]
        cell = r[0].attrib
        if cell['column'] == c:
            ttx = _timestamp_to_time(r.attrib['timestamp'])
            pos = pd.concat([
                pos,
                pd.DataFrame(
                    {
                        cn: cell['value'],
                        cn + "_colour": cell['colour'],
                        "pos": cell['row'],
                    },
                    index=[ttx],
                ),
            ])
    return pos


# This is a bit quirkier - driverNumber over time for a particular classification number
def parse_race_pos_num(r, pos, c, p):
    """Append the value of column ``c`` in row ``p`` from element ``r`` to ``pos``.

    r   -- data element
    pos -- dataframe accumulated so far
    c   -- column id (a key of ``raceMap``)
    p   -- row (classification position) we're interested in, as a string

    Same filtering as ``parse_race_pos_all`` plus a match on the row; the
    new dataframe row holds only the value and its colour. Returns the
    (possibly extended) dataframe.
    """
    if r.attrib['identifier'] == '101' and 'sessionstate' not in r[0].attrib:
        cn = raceMap[c]
        cell = r[0].attrib
        if cell['column'] == c and cell['row'] == p:
            ttx = _timestamp_to_time(r.attrib['timestamp'])
            pos = pd.concat([
                pos,
                pd.DataFrame(
                    {cn: cell['value'], cn + "_colour": cell['colour']},
                    index=[ttx],
                ),
            ])
    return pos


# Racing number shown in each row over time
racing_number_column = raceUnMap['racingNumber']
pnum = pd.DataFrame()
for l in pl:
    pnum = parse_race_pos_all(l, pnum, racing_number_column)
pnum['time'] = pnum.index
pnum['pos'] = pnum['pos'].astype(int)
# Drop updates that blank the racing number cell
pnum = pnum[pnum.racingNumber != '']
pnum[:3]

from ggplot import *
import datetime  # timeify() builds datetime.time values; was never imported

# Row (classification position) of each racing number over time, rows 1-22
ggplot(pnum[pnum.pos < 23], aes(x='time', y='pos', colour='racingNumber')) + geom_line() + ylim(0, 23)

# Column ids that are kept from the timing feed
TIMING_COLUMNS = ('1', '2', '3', '4', '5', '6', '7', '9', '11', '12', '13')


def _flatten_element(el):
    """Return one dict holding the attributes of ``el`` and of its first child.

    Child attributes overwrite element attributes of the same name.
    """
    merged = dict(el.attrib)
    merged.update(el[0].attrib)
    return merged


# Build one document per timing cell update, ready to load into MongoDB
plj = []
with open(race, 'r') as feed:
    for el in feed:
        d = _flatten_element(etree.fromstring(el))
        if d['identifier'] == '101' and 'row' in d and d['column'] in TIMING_COLUMNS:
            cn = raceMap[d['column']]
            # id is "<row>_<timestamp with separators replaced by underscores>"
            did = '_'.join([d['row'], d['timestamp'].replace(':', '_').replace('.', '_')])
            plj.append({
                cn: d['value'],
                cn + "_colour": d['colour'],
                'id': did,
                'row': d['row'],
                'strtime': d['timestamp'],
            })
plj[:3]

import pymongo
from pymongo import MongoClient
import json

# Connect to the MongoDB server
conn = MongoClient('localhost', 27017)
# Create a new database
db = conn.tataracedb
db.drop_collection('test1')
# Create a new collection
collection = db.test1
collection

# Upsert keyed on the row/timestamp id, so updates to different columns that
# share a row and timestamp are merged into one document.
# NOTE(review): Collection.update and Cursor.count (used below) are legacy
# PyMongo calls - confirm the installed PyMongo version still provides them.
for i in plj:
    collection.update({'_id': i['id']}, {"$set": i}, upsert=True)
collection.find_one()

results = collection.find({'$and': [{'racingNumber': '3'}, {'gap': {'$exists': True}}]})
results.count()
df = pd.DataFrame(list(results))
df[:3]


def timeify(x):
    """Convert a timestamp string into a ``datetime.time``.

    Accepts ':' , '.' or '_' as field separators (e.g. '14:34:21.752'); the
    fourth field is multiplied by 1000 to give microseconds.
    """
    tt = x.replace(':', '_').replace('.', '_').split('_')
    return datetime.time(int(tt[0]), int(tt[1]), int(tt[2]), int(tt[3]) * 1000)


df['time'] = df['strtime'].apply(timeify)
ggplot(df, aes(x='time', y='gap')) + geom_line()

results = collection.find({'$and': [
    # { 'racingNumber': { '$exists': True } },
    {'gap': {'$exists': True}},
    {'$or': [{'row': '2'}, {'row': '3'}, {'row': '4'}, {'row': '5'}]},
]})
df = pd.DataFrame(list(results))
df['time'] = df['strtime'].apply(timeify)
# NOTE(review): convert_objects is a legacy pandas call - confirm the installed
# pandas still provides it (pd.to_numeric(..., errors='coerce') is the usual
# replacement).
df.gap = df.gap.convert_objects(convert_numeric=True)
# df.dropna(subset=['gap','racingNumber'],inplace=True)
ggplot(df, aes(x='time', y='gap', colour='row')) + geom_line()
df.gap.unique()

# results=collection.find({'$and': [{'row': '3'}, {'racingNumber': '3'}] },{'row':1,'strtime':1} )
results = collection.find({'row': '3'}, {'row': 1, 'strtime': 1})
results.count()
df = pd.DataFrame(list(results))
df[:3]


def itimeify(x):
    """Convert a timestamp string into seconds since midnight, as a float.

    Accepts ':' , '.' or '_' as field separators; the fourth field is taken
    as milliseconds.
    """
    tt = x.replace(':', '_').replace('.', '_').split('_')
    millis = int(tt[0]) * 3600000 + int(tt[1]) * 60000 + int(tt[2]) * 1000 + int(tt[3])
    # Divide by a float so sub-second resolution survives on any Python version
    return millis / 1000.0


# The projection above returns only _id, row and strtime, so both derived
# columns are computed from 'strtime' (there is no 'time' column yet).
df['itime'] = df['strtime'].apply(itimeify)
df['time'] = df['strtime'].apply(timeify)
df[:3]
ggplot(df[11:-5], aes(x='itime', y=1)) + geom_point()

# Is there anything we can do based around events on the same row that are within a short time of each other?
# If there is a position change, details will refer to what in the row? Just the new driver?
# Let's try to group close in time items...
# Based on http://stackoverflow.com/a/10017017/454773
d = list(df['itime'][11:-5])
diff = [y - x for x, y in zip(*[iter(d)] * 2)]
# avg = sum(diff) / len(diff)
# Start a new group whenever an event is 0.5s or more after the first event
# of the current group.
m = [[d[0]]]
for x in d[1:]:
    # if x - m[-1][0] < avg:
    if x - m[-1][0] < 0.5:  # this is the threshold
        m[-1].append(x)
    else:
        m.append([x])
print(m)

from numpy import nan as NA

race = 'data/F1 Race.txt'
from lxml import etree


# not used
class Driver:
    """Current, previous and historical sector times for one driver."""

    def __init__(self, name, driverNum):
        self.name = name
        self.driverNum = driverNum
        # Per-instance state: as class attributes these lists were shared by
        # every Driver, so one driver's update showed up on all of them.
        self.sectors = [NA, NA, NA]
        self.prev_sectors = [NA, NA, NA]
        self.history_sectors = [[], [], []]

    def update_sector(self, sectorNum, sectorTime):
        """Record ``sectorTime`` for the 1-based sector ``sectorNum``."""
        idx = sectorNum - 1
        self.prev_sectors[idx] = self.sectors[idx]
        self.sectors[idx] = sectorTime
        self.history_sectors[idx].append(sectorTime)


ham = Driver("hamilton", "44")
ham.name
ham.update_sector(1, 33.12)
print(ham.sectors, ham.prev_sectors, ham.history_sectors)


# not used
class Classification:
    """Placeholder holding a ``rank`` dict."""

    def __init__(self):
        self.rank = {}


ranks = Classification()
ranks


class Pits:
    """Assembles pit stop records from column 2, 6 and 13 updates.

    ``_items`` holds the partial record per driver number, ``history`` the
    committed records and ``_logger`` the raw updates that contributed.
    """

    def __init__(self):
        self._items = {}
        self.history = []
        self._logger = []

    def updateRacePits(self, raceObj, data):
        """Fold one cell update into the pit record of the driver in that row.

        raceObj -- race state; provides getDriverNumFromPos, lap and drivers
        data    -- cell update dict (row, column, value, colour, strtime)
        """
        # Need 2, 6 and 13 then commit
        # This doesn't work if there is a position change while someone is IN PIT?
        # SO: car has to go OUT before it can come back in?
        # Also need to catch if car IN PIT then goes to RETIRED
        driverNum = raceObj.getDriverNumFromPos(data)
        if data['column'] in ['2', '6', '13']:
            if driverNum not in self._items:
                self._items[driverNum] = {'out': True}
            item = self._items[driverNum]
            if data['column'] == '2' and data['colour'] == 'RED':
                self._logger.append(data)
                # Note: driverNum should == int(data['value'])
                item['driverNum'] = str(driverNum) + '_' + data['value']
                item['strtime'] = data['strtime']
            elif data['column'] == '6':
                if data['value'] == 'IN PIT':
                    self._logger.append(data)
                    item['status'] = data['value']
                elif data['value'] == 'OUT':
                    # Car is back out: discard any partial record and re-arm
                    self._items[driverNum] = {'out': True}
            elif data['column'] == '13' and data['value'] != '':
                self._logger.append(data)
                item['lap'] = raceObj.lap
                item['count'] = int(data['value'])
            # NOTE(review): nesting reconstructed from a flattened source -
            # confirm this check belongs inside the column guard.
            # A record is complete once it has all six keys
            # (out, driverNum, strtime, status, lap, count).
            item = self._items[driverNum]
            if len(item) == 6 and item['out']:
                self.history.append(item.copy())
                raceObj.drivers[driverNum].appendPit(item.copy())
                # Block further commits until the car is seen going OUT again
                self._items[driverNum] = {'out': False}


class Purples:
    """Tracks the cells flagged PURPLE for lap times; sector tracking is a stub."""

    def __init__(self):
        self.lap = {}
        self.sector1 = {}
        self.sector2 = {}
        self.sector3 = {}
        self._lap = {'driverNum': '', "name": '', 'laptime': '', 'classRank': '', 'lap': '', 'strtime': ''}
        self._sector1 = {'driverNum': '', "name": '', 'laptime': '', 'classRank': '', 'lap': '', 'strtime': ''}
        self._sector2 = {'driverNum': '', "name": '', 'laptime': '', 'classRank': '', 'lap': '', 'strtime': ''}
        self._sector3 = {'driverNum': '', "name": '', 'laptime': '', 'classRank': '', 'lap': '', 'strtime': ''}
        self.history = []

    def updateLapPurples(self, raceObj, data):
        """Fold a column 2, 3 or 6 update into the pending purple lap record.

        raceObj -- race state; ``raceObj.lap - 1`` is stored as the lap
        data    -- cell update dict (row, column, value, strtime)
        """
        if data['column'] == '2':
            self._lap['driverNum'] = data['value']
            self._lap['strtime'] = data['strtime']
            self._lap['classRank'] = int(data['row'])
        elif data['column'] == '3':
            self._lap['name'] = data['value']
        elif data['column'] == '6':
            self._lap['laptime'] = data['value']
            self._lap['lap'] = raceObj.lap - 1
            # NOTE(review): nesting reconstructed from a flattened source -
            # confirm the commit happens only on the laptime update.
            self.lap = self._lap.copy()
            self.history.append(self.lap)

    def updateSectorPurples(self, raceObj, sector, data):
        """Not implemented."""
        pass


class DriverHistory:
    """Per-driver pit stops and completed-lap records."""

    def __init__(self, driverNum=''):
        self.driverNum = driverNum
        self.pits = []
        self._laptime = ''
        self._sector1time = ''
        self._sector2time = ''
        self._sector3time = ''
        self._gap = ''
        self._interval = ''
        self._timeofday = ''
        self.lapdata = []

    def setname(self, name):
        self.name = name

    def appendPit(self, data):
        self.pits.append(data)

    def _reset_pending(self):
        """Clear the pending lap fields."""
        self._laptime = ''
        self._sector1time = ''
        self._sector2time = ''
        self._sector3time = ''
        self._gap = ''
        self._interval = ''

    def trackLaptime(self, data, raceObj=None):
        """Fold one column 4/5/6/7/9/11 update into the pending lap record.

        data    -- cell update dict (row, column, value, strtime)
        raceObj -- race state providing ``lap``. Optional for backward
                   compatibility: when omitted, the module-level ``raceObj``
                   is used, as the original code did implicitly.

        Once laptime, all three sectors, gap and interval are non-empty and
        the race lap count is ahead of the laps recorded so far, a record is
        appended to ``lapdata`` and the pending fields are cleared.
        """
        if raceObj is None:
            raceObj = globals()['raceObj']
        # Row 1 is given a zero gap/interval before the column is examined
        if data['row'] == '1':
            self._gap = '0'
            self._interval = '0'
        column = data['column']
        if column == '6':
            self._laptime = data['value']
            self._timeofday = data['strtime']
        elif column == '7':
            self._sector1time = data['value']
        elif column == '9':
            self._sector2time = data['value']
        elif column == '11':
            self._sector3time = data['value']
        elif column == '4':
            self._gap = data['value']
        elif column == '5':
            self._interval = data['value']
        pending = (self._laptime, self._sector1time, self._sector2time,
                   self._sector3time, self._gap, self._interval)
        if all(field != '' for field in pending):
            if raceObj.lap > (len(self.lapdata) + 1):
                self.lapdata.append({
                    'timeofday': self._timeofday,
                    'pos': int(data['row']),
                    'lap': raceObj.lap - 1,
                    'laptime': self._laptime,
                    's1': self._sector1time,
                    's2': self._sector2time,
                    's3': self._sector3time,
                    'gap': self._gap,
                    'interval': self._interval,
                })
                # NOTE(review): nesting reconstructed from a flattened source
                # - confirm the reset should only follow a recorded lap.
                self._reset_pending()


class Race:
    """Race state rebuilt by replaying timing cell updates."""

    def __init__(self):
        self.lap = -1
        self.pos = {}  # Contains the driver number for the current classification position
        self.lapPurples = Purples()
        self.pits = Pits()
        self.drivers = {}
        self.driverNames = {}
        self._namesSet = False

    def setDriverNumForPos(self, data):
        self.pos[data['row']] = data['value']

    def getDriverNumFromPos(self, data):
        return self.pos[data['row']]

    def setupDriverDetails(self, data):
        """Register drivers (column 2) and their names (column 3)."""
        if data['column'] == '2' and data['value'].strip() != '' and data['value'] not in self.drivers:
            self.setDriverNumForPos(data)
            self.drivers[data['value']] = DriverHistory(data['value'])
        elif data['column'] == '3' and data['value'].strip() != '':
            driverNum = self.getDriverNumFromPos(data)
            self.drivers[driverNum].setname(data['value'])
            self.driverNames[data['value']] = driverNum

    def trackDriverNumForPos(self, data):
        if data['column'] == '2':
            self.setDriverNumForPos(data)

    def trackLapCount(self, data):
        """Take the lap number from a non-empty row 1, column 5 value."""
        if data['row'] == '1' and data['column'] == '5' and data['value'] != '':
            self.lap = int(data['value'])

    def trackPurples(self, raceObj, data):
        if data['colour'] == 'PURPLE' and data['column'] in ['2', '3', '6']:
            self.lapPurples.updateLapPurples(raceObj, data)

    def trackPits(self, raceObj, data):
        if data['column'] in ['2', '6', '13']:
            self.pits.updateRacePits(raceObj, data)

    def trackLaptimes(self, raceObj, data):
        if data['column'] in ['4', '5', '6', '7', '9', '11']:
            driverNum = self.getDriverNumFromPos(data)
            # Pass the race explicitly rather than relying on a global name
            self.drivers[driverNum].trackLaptime(data, raceObj)


class Weather:
    """Weather readings grouped by timestamp string."""

    # Row id -> name of the reading stored for that row
    _FIELDS = {
        '1': 'trackTemp',
        '2': 'airTemp',
        '3': 'rainfall',
        '4': 'windSpeed',
        '5': 'humidity',
        '6': 'airpressure',
        '7': 'windDir',
    }

    def __init__(self):
        self.stamps = {}

    def setWeather(self, data):
        """Store ``data['value']`` under its timestamp; unknown rows only create the entry."""
        reading = self.stamps.setdefault(data['strtime'], {})
        field = self._FIELDS.get(data['row'])
        if field is not None:
            reading[field] = data['value']


# Split the feed into timing updates / session state changes (plf) and the
# identifier '103', column '1' records that are fed to Weather (wlf).
plf = []
wlf = []
with open(race, 'r') as feed:
    for el in feed:
        d = _flatten_element(etree.fromstring(el))
        if d['identifier'] == '101' and 'row' in d and d['column'] in TIMING_COLUMNS:
            plf.append({
                "value": d['value'],
                "colour": d['colour'],
                'row': d['row'],
                'column': d['column'],
                'strtime': d['timestamp'],
            })
        elif d['identifier'] == '101' and 'sessionstate' in d:
            plf.append(d)
        elif d['identifier'] == '103' and d['column'] == '1':
            wlf.append({'strtime': d['timestamp'], 'row': d['row'], "value": d['value']})
plf[:3]


def checkSessionState(data):
    """Return ``data['sessionstate']`` if present, otherwise None."""
    return data.get('sessionstate')


raceObj = Race()
startedRunning = False
finishedRunning = False
# Replay the feed: driver details are collected until the session reports
# "started"; tracking then runs until it reports "inactive".
for x in plf:
    sessionstate = checkSessionState(x)
    if sessionstate is not None:
        print(sessionstate)
        if sessionstate == "started":
            startedRunning = True
        elif startedRunning and sessionstate == "inactive":
            finishedRunning = True
    if not startedRunning and sessionstate is None:
        raceObj.setupDriverDetails(x)
    elif startedRunning and not finishedRunning and sessionstate is None:
        raceObj.trackLapCount(x)
        raceObj.trackDriverNumForPos(x)
        raceObj.trackPurples(raceObj, x)
        raceObj.trackPits(raceObj, x)
        raceObj.trackLaptimes(raceObj, x)

weather = Weather()
for w in wlf:
    weather.setWeather(w)
weather.stamps['14:34:21.752']

raceObj.drivers['1'].lapdata[-3:]
raceObj.drivers['1'].lapdata[:5]
raceObj.lapPurples.history[:3]
raceObj.drivers['1'].pits
raceObj.drivers['1'].name

# In this example, row 17 starts to update high columns before the lower ones
raceObj.pits._logger[:3]
raceObj.pits.history[:3]
raceObj.pos

import csv
from time import *

# Convert the tab-separated input into pipe-delimited
# "epoch|row[3]|A|path" lines. The reader definition had been commented out
# while the loop still used it; it is restored here and both files are closed
# on exit.
# NOTE(review): written for Python 3 (text mode with newline='', next()) -
# confirm the interpreter this notebook targets.
with open('openurlgource.csv', newline='') as src, \
        open('openurlgource.txt', 'w', newline='') as dst:
    reader = csv.reader(src, delimiter='\t')
    writer = csv.writer(dst, delimiter='|')
    headerline = next(reader)
    for row in reader:
        if row[4].strip() != '':
            t = int(mktime(strptime(row[0] + " " + row[1], "%Y-%m-%d %H:%M:%S")))
            writer.writerow([t, row[3], 'A', row[4].rstrip(':').replace(':', '/')])