import urllib.request
# Download the zip archive from fda.gov into the current working directory.
urllib.request.urlretrieve(
    url="http://www.fda.gov/downloads/Drugs/InformationOnDrugs/UCM054599.zip",
    filename="UCM054599.zip",
)

import pandas as pd

from zipfile import ZipFile
# Open the downloaded archive, list its members, and parse Product.txt into a
# DataFrame (pd.read_table defaults to tab-separated input).
with ZipFile("UCM054599.zip", 'r') as drugdata:
    print([entry.filename for entry in drugdata.infolist()])
    # ZipFile.open() accepts only mode 'r' or 'w'.  The legacy 'U'
    # (universal newlines) mode was removed in Python 3.6 and raises
    # ValueError there, so the default read mode is used instead.
    with drugdata.open('Product.txt') as productfile:
        drugdataframe = pd.read_table(productfile)

from itertools import chain
import re

# Step 1: normalise the raw product names.  Every key below is a regular
# expression (regex=True) and every match is replaced by the mapped value;
# most rules simply delete a qualifier word, a percentage or a number.
# Raw strings are used so that regex escapes such as \d, \( and \s are not
# interpreted as (invalid) Python string escapes.
# dropna() guards against missing names, which would otherwise make
# re.split() below fail with a TypeError.
allnames = drugdataframe.drugname.dropna().replace(
    {
        r' SODIUM': '',
        r' PRESERVATIVE FREE': '',
        r' HYDROCHLORIDE': '',
        r' HCL': '',
        r' FOR ORAL SOLUTION': '',
        r' INJECTION': '',
        r' LOCK FLUSH': '',
        r' PHOSPHATE': '',
        r' SULFATE': '',
        r' SINUS': '',
        r' CHLORIDE': '',
        r' DILUTE': '',
        r' COLD': '',
        r' IV$': '',
        r' E$': '',
        r'^SODIUM ': '',
        r'(^| )CALCIUM( |$)': '',
        r'(^| )IRON( |$)': '',
        r' TARTRATE': '',
        r' TEBUTATE': '',
        r' HALF-STRENGTH': '',
        r'( R)? PEN$': '',
        r' PLAIN': '',
        r'\d*[ -]HOUR': '',
        r'TECHNETIUM.*': 'TECHNETIUM',
        r'.*LUNGAGGREGATE.*': '',
        r' \(PRESERVATIVE FREE\)': '',
        r' \(NEEDS NO REFRIGERATION\)': '',
        r' IN PLASTIC CONTAINER': '',
        r'( G.)?U. IRRIGANT': '',
        r'[- ](C|SR|RX|PM|D|RF|CR|XL|ODT|CD|ES|HCT|HP|XR)(\s|$)': '',
        r'[- ,][0-9.]+[%]': '',
        r'[- ,#/][A-Z]?[0-9.]+': '',
        r' DISPERDOSE': '',
        r'%': '',
    },
    regex=True,
).unique()

# Step 2: break every cleaned name into single tokens.  Because the pattern
# is a capturing group, re.split() also returns the separators themselves;
# they are one character long and are discarded by the length filter below.
token_separator = re.compile(r"""( |-|/|;|\)|\(|\.|,|'|:|")""")
allnames = set(chain.from_iterable(token_separator.split(name) for name in allnames))

# Step 3: drop tokens that should not be treated as names.
# NOTE(review): entries containing a space, '-', '/', '.' or parentheses can
# never equal a token produced by the split above; they look like leftovers
# from an earlier phrase-level split and are kept unchanged.
allnames = allnames.difference({
    'MAXALT-MLT',
    'LUVOX CR',
    'TRAVASOL% IN DEXTROSE%',
    'CARBONATE',
    '',
    'PN',
    'PBZ',
    'AVC',
    'M.V.I. (WITHOUT VITAMIN K)',
    'EPI E Z PEN JR',
    'Lamivudine/Zidovudine mg mg Tablets Co-packaged with Nevirapine mg Tablets',
    ' &',
    'AMINOSYN-RF',
    'POLY',
    '8-MOP',
})

# Step 4: keep tokens longer than four characters that contain no digit,
# upper-cased and ordered from shortest to longest.
allnames = [
    token.upper()
    for token in sorted(allnames, key=len)
    if len(token) > 4 and not re.search(r'\d', token)
]
print(len(allnames))
print(allnames)

from collections import Counter

# Count how often each character occurs across all cleaned names, then turn
# the counts into percentages of the total character count.
charcounts = Counter("".join(allnames))
totchar = sum(charcounts.values())
# The comprehension body never runs when there are no characters, so an
# empty name list yields an empty dict instead of a division by zero.
drugfreq = {char: count / (totchar / 100) for char, count in charcounts.items()}
drugfreq  # bare expression: displays the table when run as a notebook cell

# Letter frequencies in English, in percent (the values sum to 99.99), taken
# from http://www.math.cornell.edu/~mec/2003-2004/cryptography/subs/frequencies.html
# Letters are listed from most to least frequent.
engfreq = dict(
    E=12.02, T=9.10, A=8.12, O=7.68, I=7.31, N=6.95,
    S=6.28, R=6.02, H=5.92, D=4.32, L=3.98, U=2.88,
    C=2.71, M=2.61, F=2.30, Y=2.11, W=2.09, G=2.03,
    P=1.82, B=1.49, V=1.11, K=0.69, X=0.17, Q=0.11,
    J=0.10, Z=0.07,
)

import math

# For every character seen in the names, take the natural log of the ratio
# between its frequency in the names and its frequency in English.  A
# character missing from engfreq raises KeyError.
relfreq = {
    char: math.log(namepct / engfreq[char])
    for char, namepct in drugfreq.items()
}
relfreq  # bare expression: displays the table when run as a notebook cell

# IPython/Jupyter line magic that renders matplotlib figures inline.  It is
# not plain-Python syntax, so this file has to be run under IPython/Jupyter.
%matplotlib inline

# Bar chart with one bar per character showing its log frequency ratio.
pd.Series(relfreq).plot(kind='bar')

# Print the log-ratio table as indented JSON with sorted keys, ready to be
# put into Angular.
import json

relfreq_json = json.dumps(relfreq, indent=4, sort_keys=True)
print(relfreq_json)


def totalscore(name):
    """Return the summed ``relfreq`` value of every character in *name*.

    The name is upper-cased before lookup and the total is rounded to two
    decimal places.  A character that has no entry in the module-level
    ``relfreq`` table raises ``KeyError``.
    """
    total = sum(relfreq[letter] for letter in name.upper())
    return round(total, 2)

# Score every cleaned name and show the ten highest-scoring ones.
scores = {name: totalscore(name) for name in allnames}
# Series.order() has been removed from pandas; sort_values() is its
# replacement and takes the same ascending flag.
pd.Series(scores).sort_values(ascending=False).head(10)

# Min-max normalise the log ratios: the smallest value maps to 0 and the
# largest to 1.
maxrel, minrel = max(relfreq.values()), min(relfreq.values())
print(maxrel, minrel)
relseries = pd.Series(relfreq)
scaledrel = (relseries - minrel) / (maxrel - minrel)

# Colour ramp as CSS hex strings: starts at pure red (#ff0000), darkens to
# almost black in the middle (#010000 / #000100) and brightens to pure green
# (#00ff00).  Indexed by a scaled letter value further down.
cmap = ["#ff0000", "#fd0000", "#fb0000", "#f90000", "#f70000", "#f50000", "#f30000", "#f10000", "#ef0000", "#ec0000", "#ea0000", "#e80000", "#e60000", "#e40000", "#e20000", "#e00000", "#de0000", "#dc0000", "#da0000", "#d80000", "#d60000", "#d40000", "#d20000", "#d00000", "#ce0000", "#cb0000", "#c90000", "#c70000", "#c50000", "#c30000", "#c10000", "#bf0000", "#bd0000", "#bb0000", "#b90000", "#b70000", "#b50000", "#b30000", "#b10000", "#af0000", "#ad0000", "#ab0000", "#a80000", "#a60000", "#a40000", "#a20000", "#a00000", "#9e0000", "#9c0000", "#9a0000", "#980000", "#960000", "#940000", "#920000", "#900000", "#8e0000", "#8c0000", "#8a0000", "#870000", "#850000", "#830000", "#810000", "#7f0000", "#7d0000", "#7b0000", "#790000", "#770000", "#750000", "#730000", "#710000", "#700000", "#6e0000", "#6c0000", "#6a0000", "#680000", "#660000", "#640000", "#620000", "#600000", "#5e0000", "#5c0000", "#5a0000", "#580000", "#560000", "#540000", "#520000", "#500000", "#4f0000", "#4d0000", "#4b0000", "#490000", "#470000", "#450000", "#430000", "#410000", "#3f0000", "#3d0000", "#3b0000", "#390000", "#370000", "#350000", "#330000", "#310000", "#300000", "#2e0000", "#2c0000", "#2a0000", "#280000", "#260000", "#240000", "#220000", "#200000", "#1e0000", "#1c0000", "#1a0000", "#180000", "#160000", "#140000", "#120000", "#100000", "#0f0000", "#0d0000", "#0b0000", "#090000", "#070000", "#050000", "#030000", "#010000", "#000100", "#000300", "#000500", "#000700", "#000900", "#000b00", "#000d00", "#000f00", "#001100", "#001300", "#001500", "#001700", "#001900", "#001b00", "#001c00", "#001e00", "#002000", "#002200", "#002400", "#002600", "#002800", "#002a00", "#002c00", "#002e00", "#003000", "#003200", "#003400", "#003600", "#003800", "#003a00", "#003c00", "#003e00", "#004000", "#004200", "#004400", "#004600", "#004800", "#004a00", "#004c00", "#004e00", "#005000", "#005100", "#005300", "#005500", "#005700", "#005900", "#005b00", "#005d00", "#005f00", "#006100", "#006300", "#006500", "#006700", 
"#006900", "#006b00", "#006d00", "#006f00", "#007100", "#007300", "#007500", "#007700", "#007900", "#007b00", "#007d00", "#007f00", "#008100", "#008300", "#008500", "#008700", "#008900", "#008b00", "#008d00", "#008f00", "#009100", "#009300", "#009500", "#009700", "#009900", "#009b00", "#009d00", "#009f00", "#00a200", "#00a400", "#00a600", "#00a800", "#00aa00", "#00ac00", "#00ae00", "#00b000", "#00b200", "#00b400", "#00b600", "#00b800", "#00ba00", "#00bc00", "#00be00", "#00c000", "#00c200", "#00c400", "#00c600", "#00c800", "#00cb00", "#00cd00", "#00cf00", "#00d100", "#00d300", "#00d500", "#00d700", "#00d900", "#00db00", "#00dd00", "#00df00", "#00e100", "#00e300", "#00e500", "#00e700", "#00e900", "#00eb00", "#00ed00", "#00ef00", "#00f100", "#00f300", "#00f600", "#00f800", "#00fa00", "#00fc00", "#00fe00", "#00ff00"]
# Map each normalised letter value onto an index into cmap: scale by the last
# valid index and round down.
lastcolor = len(cmap) - 1
scaleddict = (scaledrel * lastcolor).map(math.floor).to_dict()

# Emit one CSS rule per character, in sorted key order, that sets the text
# colour selected for that character.
for letter, colorindex in sorted(scaleddict.items()):
    print(".char%s {" % (letter,))
    print('\tcolor: %s;\n}' % (cmap[colorindex],))

def test(num):
    """Pass *num* through a logistic curve centred on 0.5 with steepness 25.

    Returns 0.5 for an input of 0.5 and pushes values on either side of it
    towards 0 or 1.
    """
    exponent = -25 * (num - 0.50)
    return 1 / (1 + math.exp(exponent))


# Apply the curve to the normalised letter values, plot histograms of the
# values before and after, and rebuild the cmap index table from the result.
curvedrel = scaledrel.map(test)
ax = pd.DataFrame({'orig': scaledrel, 'scaled': curvedrel}).hist()
ax[0][0].figure.suptitle('Histograms showing the distribution of letter values before and after scaling', fontsize=12)
scaleddict = (curvedrel * (len(cmap) - 1)).map(math.floor).to_dict()

# Emit one CSS rule per character, in sorted key order, using the colour
# index currently stored for it in scaleddict.
for letter, colorindex in sorted(scaleddict.items()):
    print(".char%s {" % (letter,))
    print('\tcolor: %s;\n}' % (cmap[colorindex],))