Data sources:
http://www.hscic.gov.uk/gpprescribingdata - all the presentation prescribing data from here
http://www.hscic.gov.uk/media/10686/Download-glossary-of-terms-for-GP-prescribing---presentation-level/pdf/GP_Prescribing_Presentation_Level_Glossary_of_Terms.pdf
http://www.connectingforhealth.nhs.uk/systemsandservices/data/ods/ccginterim/interimpcmem_v5.zip - gp to ccg mapping here
http://www.erpho.org.uk/viewResource.aspx?id=22125 - old ccg to new ccg code mapping from here
http://www.ons.gov.uk/ons/publications/re-reference-tables.html?edition=tcm%3A77-325526 - ccg population data from here (2012)
https://geoportal.statistics.gov.uk/Docs/Boundaries/Clinical_commissioning_groups_(Eng)_Apr_2013_Boundaries_(Full_Extent).zip - maps


Notes:
Prescribing data is GP level and there are two types of CCG code (old and new) so this requires 
1. obtaining CCG population data 
2. obtaining mapping of old to new CCG codes 
3. obtaining GP practice to CCG mapping

NB: There does not appear to be an authorative list of GP practices. There are other data quality issues that should be written up another time.
import os, datetime
import pandas as pd
import pickle
import folium
from pandas import Series, DataFrame, Panel
from datetime import datetime, date, time


#cuts gp prescribing data for rows containing patterns of interest as dataframes

def pattern_timeseries(pattern):
    
    pathtodata = '/media/mydisk/prescribing_data/bnf/'#set to the directory with bnf CSVs in
    pathtopatterns = '/media/mydisk/prescribing_data/patterns/' 
    pathtopickles = '/media/mydisk/prescribing_data/pickles/'
    
    os.chdir(pathtodata)
    
    files = !ls T2*
    
    global pattern_df
    
    clean_filenames = []

    for f in files.l:
        clean_filenames.append(f[:7]) #clean up filenames so that grep will work
        
    for i, item in enumerate(clean_filenames):
        date = clean_filenames[i].replace('T', '')
        date = pd.to_datetime(date, format="%Y%m") #make pandas know the date is a date
        name = "bnf_%s_%.10s.csv" % (pattern, date) #create a name for new csv of grep for pattern using date from file name
        print "writing %s" % name
        !fgrep $pattern {clean_filenames[i]}* > $pathtopatterns/$name #grep for pattern in csv files and write to file 
    
    pattern_files = !ls $pathtopatterns*$pattern*
    
    cols = ['SHA', 'PCT', 'PRACTICE', 'BNF_CODE', 'BNF_NAME', 'ITEMS', 'NIC', 'ACT_COST', 'QUANTITY', 'DateTime', 'Index']
    
    practices_est = 10000 #estimated number of practices
    
    df_list = [pd.read_csv(file, names=cols) for file in pattern_files] 
            
    pattern_df = pd.concat(df_list)
    
    pattern_df['DateTime'] = pattern_df['DateTime'].astype('|S6') 
    pattern_df['DateTime'] = pd.to_datetime(pattern_df['DateTime'], format="%Y%m")
    
    os.chdir(pathtopickles)

    pattern_df.to_pickle('%s.pkl' % pattern)
                    
    return(pattern_df)


#define our opiates
Opiates = ['Codeine', 'Dihydrocodeine', 'Tramadol', 'Tapentadol', 'Buprenorphine', 'Fentanyl', 'Methadone', 'Morph', 'Oxycodone']

#commented because have run to Feb 2014
#for opiate in Opiates:
#   pattern_timeseries(opiate)

#load prev outputs from above
pathtopickles = '/media/mydisk/prescribing_data/pickles/'
os.chdir(pathtopickles)

#load dataframes
Codeine = pd.read_pickle('Codeine.pkl')
Dihydrocodeine = pd.read_pickle('Dihydrocodeine.pkl')
Tramadol = pd.read_pickle('Tramadol.pkl')
Tapentadol = pd.read_pickle('Tapentadol.pkl')
Buprenorphine = pd.read_pickle('Buprenorphine.pkl')
Fentanyl = pd.read_pickle('Fentanyl.pkl')
Methadone = pd.read_pickle('Methadone.pkl')
Morph = pd.read_pickle('Morph.pkl')
Oxycodone = pd.read_pickle('Oxycodone.pkl')

#make a list of dataframes
df_list = [Codeine, Dihydrocodeine, Tramadol, Tapentadol, Buprenorphine, Fentanyl, Methadone, Morph, Oxycodone]
opiate_lookup = {0:'Codeine', 1:'Dihydrocodeine', 2:'Tramadol', 3:'Tapentadol', 4:'Buprenorphine', 5:'Fentanyl', 6:'Methadone', 7:'Morphine', 8:'Oxycodone'}

#prescribing data don't contain a quantity that's a quantity
#work out quantity in mg from prep information and quantity.. 

pathtocsv = '/media/mydisk/prescribing_data/xls/csv'
os.chdir(pathtocsv)

Codeine['BNF_NAME'] = Codeine['BNF_NAME'].map(str.strip) #get rid of white space
codeine_preps = Codeine['BNF_NAME'].unique().tolist() #get a list of preparation names
codeine_doses = [3, 3, 0.6, 60, 5, 15, 30, 60, 3.2, 12.8, 8, 1.35, 12.8, 2, 8, 6, 1, 0.6, 10, 5, 30, 15, 30, 12] #codeine in mg for each prep
codeine_dose_lookup = dict(zip(codeine_preps, codeine_doses)) #make dict of prep names and codeine doses
Codeine['QUANTITY_IN_MG'] = Codeine['BNF_NAME'].map(lambda x: codeine_dose_lookup[x]) * Codeine['QUANTITY']

#Dihydrocodeine
#work out quantity in mg from prep information and quantity
df = pd.read_csv('Dihydrocodeine.csv') #lookup prepared by colleague
Dihydrocodeine['BNF_NAME'] = Dihydrocodeine['BNF_NAME'].map(str.strip) #get rid of white space
dihydrocodeine_preps = Dihydrocodeine['BNF_NAME'].unique().tolist() #get a list of preparation names
dihydrocodeine_doses = df['mg of drug per unit of preparation'].str.replace('mg', '').astype(float).tolist() #remove the mg my collaborator added
dihydrocodeine_dose_lookup = dict(zip(dihydrocodeine_preps, dihydrocodeine_doses)) #make dict of prep names and codeine doses
Dihydrocodeine['QUANTITY_IN_MG'] = Dihydrocodeine['BNF_NAME'].map(lambda x: dihydrocodeine_dose_lookup[x]) * Dihydrocodeine['QUANTITY']

#Tramadol
#work out quantity in mg from prep information and quantity
df = pd.read_csv('Tramadol.csv') #lookup prepared by colleague
Tramadol['BNF_NAME'] = Tramadol['BNF_NAME'].map(str.strip) #get rid of white space
tramadol_preps = Tramadol['BNF_NAME'].unique().tolist() #get a list of preparation names
tramadol_doses = df['mg of drug per unit of preparation'].tolist() #remove the mg my collaborator added
tramadol_dose_lookup = dict(zip(tramadol_preps, tramadol_doses)) #make dict of prep names and codeine doses
Tramadol['QUANTITY_IN_MG'] = Tramadol['BNF_NAME'].map(lambda x: tramadol_dose_lookup[x]) * Tramadol['QUANTITY']

#Tapentadol
#work out quantity in mg from prep information and quantity
df = pd.read_csv('Tapentadol.csv') #lookup prepared by colleague
Tapentadol['BNF_NAME'] = Tapentadol['BNF_NAME'].map(str.strip) #get rid of white space
tapentadol_preps = Tapentadol['BNF_NAME'].unique().tolist() #get a list of preparation names
tapentadol_doses = df['mg of drug per unit of preparation'].tolist() #remove the mg my collaborator added
tapentadol_dose_lookup = dict(zip(tapentadol_preps, tapentadol_doses)) #make dict of prep names and codeine doses
Tapentadol['QUANTITY_IN_MG'] = Tapentadol['BNF_NAME'].map(lambda x: tapentadol_dose_lookup[x]) * Tapentadol['QUANTITY']

#Buprenorphine
#work out quantity in mg from prep information and quantity
df = pd.read_csv('Buprenorphine.csv') #lookup prepared by colleague
Buprenorphine['BNF_NAME'] = Buprenorphine['BNF_NAME'].map(str.strip) #get rid of white space
buprenorphine_preps = Buprenorphine['BNF_NAME'].unique().tolist() #get a list of preparation names
buprenorphine_doses = df['mg of drug per unit of preparation'].str.replace('mg', '').astype(float).tolist() #remove the mg my collaborator added
buprenorphine_dose_lookup = dict(zip(buprenorphine_preps, buprenorphine_doses)) #make dict of prep names and codeine doses
Buprenorphine['QUANTITY_IN_MG'] = Buprenorphine['BNF_NAME'].map(lambda x: buprenorphine_dose_lookup[x]) * Buprenorphine['QUANTITY']

#Fentanyl
#work out quantity in mg from prep information and quantity
df = pd.read_csv('Fentanyl.csv') #lookup prepared by colleague
Fentanyl['BNF_NAME'] = Fentanyl['BNF_NAME'].map(str.strip) #get rid of white space
fentanyl_preps = Fentanyl['BNF_NAME'].unique().tolist() #get a list of preparation names
fentanyl_doses = df['mg of drug per unit of preparation'].tolist() #remove the mg my collaborator added
fentanyl_dose_lookup = dict(zip(fentanyl_preps, fentanyl_doses)) #make dict of prep names and codeine doses
Fentanyl['QUANTITY_IN_MG'] = Fentanyl['BNF_NAME'].map(lambda x: fentanyl_dose_lookup[x]) * Fentanyl['QUANTITY']

#Methadone
#work out quantity in mg from prep information and quantity
df = pd.read_csv('Methadone.csv') #lookup prepared by colleague
Methadone['BNF_NAME'] = Methadone['BNF_NAME'].map(str.strip) #get rid of white space
methadone_preps = Methadone['BNF_NAME'].unique().tolist() #get a list of preparation names
methadone_doses = df['mg of drug per unit of preparation'].str.replace('tbc', '0')#remove the mg my collaborator added
methadone_doses = methadone_doses.astype(float).tolist()
methadone_dose_lookup = dict(zip(methadone_preps, methadone_doses)) #make dict of prep names and codeine doses
Methadone['QUANTITY_IN_MG'] = Methadone['BNF_NAME'].map(lambda x: methadone_dose_lookup[x]) * Methadone['QUANTITY']

#Morphine
#work out quantity in mg from prep information and quantity
df = pd.read_csv('Morphine.csv') #lookup prepared by colleague
Morph['BNF_NAME'] = Morph['BNF_NAME'].map(str.strip) #get rid of white space
morphine_preps = Morph['BNF_NAME'].unique().tolist() #get a list of preparation names
morphine_doses = df['mg of drug per unit of preparation'].str.replace('tbc', '0')
morphine_doses = morphine_doses.astype(float).tolist() #remove the mg my collaborator added
morphine_dose_lookup = dict(zip(morphine_preps, morphine_doses)) #make dict of prep names and codeine doses
Morph['QUANTITY_IN_MG'] = Morph['BNF_NAME'].map(lambda x: morphine_dose_lookup[x]) * Morph['QUANTITY']

#Oxycodone
#work out quantity in mg from prep information and quantity
df = pd.read_csv('Oxycodone.csv') #lookup prepared by colleague
Oxycodone['BNF_NAME'] = Oxycodone['BNF_NAME'].map(str.strip) #get rid of white space
oxycodone_preps = Oxycodone['BNF_NAME'].unique().tolist() #get a list of preparation names
oxycodone_doses = df['mg of drug per unit of preparation'].tolist()
oxycodone_dose_lookup = dict(zip(oxycodone_preps, oxycodone_doses)) #make dict of prep names and codeine doses
Oxycodone['QUANTITY_IN_MG'] = Oxycodone['BNF_NAME'].map(lambda x: oxycodone_dose_lookup[x]) * Oxycodone['QUANTITY']

#ccg mid-year population estimates from ONS for 2012
#from http://www.ons.gov.uk/ons/publications/re-reference-tables.html?edition=tcm%3A77-325526
#has CCG code in new format

pathtogpdata = '/home/sam/Documents/OpenDataAbstract/'
os.chdir(pathtogpdata)

df = pd.read_csv('SAPE7DT1-Mid-2012-ccg-syoa-file.csv') 
df.icol(3).dropna().nunique() #column 3 has our 211 ccgs
df1 = df[~df.icol(3).isnull()]  #throw away rows that don't relate to our ccgs
df2 = df1.icol([0,3,4]) #just use all ages and throw away the rest
df2.columns = ['CCG13CD', 'CCG_Name', 'Population'] #name the columns sensibly
#csv of old CCG codes matched onto new ones
#from http://www.erpho.org.uk/viewResource.aspx?id=22125
df3 = pd.read_csv('ccgcodemap.csv')
#merge our dataframes to make a frame with CCG names and old and new codes and population sizes
df4 = pd.merge(df2, df3, on='CCG13CD')

#gp to ccg mapping 1
#wget http://www.connectingforhealth.nhs.uk/systemsandservices/data/ods/ccginterim/interimpcmem_v5.zip
#has CCG code in old format and contains too many CCGs (>211)
df5 = pd.read_csv('interimpcmem_v5.csv') 
df5.rename(columns = {'PRACTICECODE': 'PRACTICE'}, inplace=True)
df5 = df5[['PRACTICE', 'CCGCODE']] #throw away columns we don't care about

pathtopickles = '/media/mydisk/prescribing_data/pickles/'
os.chdir(pathtopickles)
df4.to_pickle('CCGPopData.pkl')
df5.to_pickle('GPCCGMap.pkl')

CCGPopData = pd.read_pickle('CCGPopData.pkl')
GPCCGMap = pd.read_pickle('GPCCGMap.pkl')


CCGPopData['Population'] = CCGPopData['Population'].str.replace(',','')
CCGPopData['Population'] = CCGPopData['Population'].astype(float)

for i, item in enumerate(df_list):
    df_list[i] = df_list[i][['DateTime', 'PRACTICE', 'QUANTITY_IN_MG']]
    df_list[i] = pd.merge(df_list[i], GPCCGMap, on='PRACTICE')
    df_list[i] = df_list[i].groupby(['CCGCODE', 'DateTime']).QUANTITY_IN_MG.sum() #group by ccg and date time, sum the mg of codeine
    df_list[i] = DataFrame(df_list[i]).reset_index()
    df_list[i] = pd.merge(df_list[i], CCGPopData, on='CCGCODE')
    df_list[i]['QUANTITY_IN_MG_PER_PERSON'] = df_list[i]['QUANTITY_IN_MG'] / df_list[i]['Population'] 
    df_list[i]['Drug'] = opiate_lookup[i]
    df_list[i] = df_list[i][['DateTime', 'CCG_Name', 'CCG13CD', 'Drug', 'QUANTITY_IN_MG', 'Population', 'QUANTITY_IN_MG_PER_PERSON']]
    

df = pd.concat(df_list) 

df.index = df.DateTime
df['CCG_Name'] = df['CCG_Name'].str.strip()

print df.hist()
print df.describe()

df.groupby('DateTime')['QUANTITY_IN_MG_PER_PERSON'].sum().plot(ylim=(0,22000))

for drug in df['2013'].Drug.unique():
    print drug
    df['2013'][df['2013']['Drug'] == drug]['QUANTITY_IN_MG_PER_PERSON'].hist(bins=100)
    plt.show()