#!/usr/bin/env python
# coding: utf-8
"""Build per-year ``csaa.in.<year>.csv`` files from Retrosheet event rows.

For every event row returned by the query in ``getData``, the pitch sequence
is replayed and one CSV line is written per pitch coded ``B`` or ``C``
(the pitches that were not swung at), tagged with the ball-strike count at
which the pitch was thrown.

The print calls below deliberately pass a single pre-formatted string so the
script produces the same output under Python 2 and Python 3.
"""

# In[7]:

import os

# expanduser('~') resolves to $HOME when it is set, and still works when it
# is not (where os.environ['HOME'] would raise KeyError).
os.chdir(os.path.join(os.path.expanduser('~'), 'mlb'))

import retrosheet_sql_tools
# the place to install retrosheet_sql_tools from is,
# https://github.com/wellsoliver/py-retrosheet
# scripts/retrosheet_sql_tools.py


# In[8]:

rs = retrosheet_sql_tools.retrosheet_sql()
# retrosheet_sql_tools has lots of imports that aren't relevant here;
# it should be possible to comment out json, ephem, pytz, tzwhere rather
# than installing them


# In[9]:

# Pitch-sequence characters, grouped by how they affect the count.
# NOTE(review): the groupings follow Retrosheet's pitch codes as used by the
# original author - confirm against the Retrosheet event-file documentation.
_IGNORED_CODES = frozenset('+*.123>')    # skipped entirely, never counted
_BALL_CODES = frozenset('BIVP')          # add one ball
_CAPPED_STRIKE_CODES = frozenset('FR')   # add one strike, never past two
_STRIKE_CODES = frozenset('CKLMOQST')    # add one strike, uncapped


def bsCodeToCs(bs_code):
    """Replay a pitch sequence and collect the counts of taken pitches.

    Args:
        bs_code: iterable of single-character pitch codes (the
            ``PITCH_SEQ_TX`` string of an event row).

    Returns:
        A dict ``{0: [...], 1: [...]}``. Key ``1`` lists the
        ``[balls, strikes]`` count in effect before each ``'C'`` pitch and
        key ``0`` lists the same for each ``'B'`` pitch, in sequence order.

    Example:
        ``bsCodeToCs('BCFX')`` returns ``{0: [[0, 0]], 1: [[1, 0]]}``.
    """
    ans = {0: [], 1: []}
    balls = 0
    strikes = 0
    for ch in bs_code:
        if ch in _IGNORED_CODES:
            continue

        # csaa only cares about pitches which were not swung at, which means
        # the pitch is either a B or a C; record the count *before* it.
        if ch == 'C':
            ans[1].append([balls, strikes])
        elif ch == 'B':
            ans[0].append([balls, strikes])

        # ...but every counted pitch still has to advance the count.
        if ch in _BALL_CODES:
            balls += 1
        elif ch in _CAPPED_STRIKE_CODES:
            strikes = min(strikes + 1, 2)
        elif ch in _STRIKE_CODES:
            strikes += 1
    return ans


# In[10]:

def getData(m, minyr=1999, maxyr=1999, ilim=999999999):
    """Fetch regular-season event rows joined with their game's umpire.

    Args:
        m: object exposing ``sqlQueryToArray(query)``.
        minyr: first ``year_id`` to include (inclusive).
        maxyr: last ``year_id`` to include (inclusive).
        ilim: ``limit`` applied to the inner events sub-select.

    Returns:
        Whatever ``m.sqlQueryToArray`` returns for the query. ``arrayToCsv``
        reads ``.dtype.fields`` from it, so presumably a NumPy structured
        array - verify against retrosheet_sql_tools.
    """
    # %d coerces every parameter to an integer, so no quoting is required.
    q = (
        "select a.*, base4_ump_id as umpire from ("
        "select game_id, pit_id as pitcher, pit_hand_cd as throws, "
        "bat_id as batter, bat_hand_cd as stands, "
        "bat_home_id as home_batting, PITCH_SEQ_TX, '00' as cnt, "
        "pos2_fld_id as catcher "
        "from retrosheet_backup.events "
        "where year_id>=%d and year_id<=%d and playoff_flag=0 limit %d) a "
        "inner join retrosheet_backup.games b on a.game_id=b.game_id "
    ) % (minyr, maxyr, ilim)
    print(q + ';')
    data = m.sqlQueryToArray(q)
    return data


# In[11]:

def arrayToCsv(data, ofile=None, n2print=30000):
    """Write one CSV line per taken pitch found in ``data``.

    The header is ``cs`` followed by the field names of ``data``. Each data
    line starts with ``cs`` (1 for a ``'C'`` pitch, 0 for a ``'B'`` pitch)
    followed by the row's fields, with the ``cnt`` field overwritten by the
    two-digit ball-strike count at which that pitch was thrown.

    Args:
        data: record array with ``dtype.fields``, containing at least the
            ``PITCH_SEQ_TX`` and ``cnt`` fields. Its ``cnt`` values are
            modified in place while writing.
        ofile: path of the CSV file to create (required).
        n2print: print a progress line for rows whose index is a multiple of
            this value; ``0`` disables progress output.

    Raises:
        ValueError: if ``ofile`` is None.
    """
    if ofile is None:
        raise ValueError('arrayToCsv requires an output path (ofile)')

    # list(...) keeps the same field order the original .keys() call gave on
    # Python 2 and also works on Python 3, where .keys() cannot be sliced.
    ks = list(data.dtype.fields)

    # The context manager closes the file even if a row fails mid-write.
    with open(ofile, 'w') as ofp:
        ofp.write(','.join(['cs'] + ks) + '\n')
        for idata, d in enumerate(data):
            ts = bsCodeToCs(d['PITCH_SEQ_TX'])
            for ics in sorted(ts):
                for t in ts[ics]:
                    d['cnt'] = ''.join([str(x) for x in t])
                    if n2print and idata % n2print == 0:
                        progress = (idata, len(data), ics, d, d['cnt'])
                        print(' '.join([str(v) for v in progress]))
                    fields = [str(ics)] + [str(d[k]) for k in ks]
                    ofp.write(','.join(fields) + '\n')


# In[12]:

yrs = range(2004, 2004+1)
for yr in yrs:
    print('doing yr... %s' % yr)
    data = getData(rs, minyr=yr, maxyr=yr)
    ofile = 'csaa.in.%d.csv' % yr
    arrayToCsv(data, ofile=ofile)


# In[ ]: