# Compare the letter frequencies of FDA drug names with those of ordinary
# English text, score each name by how "drug-like" its letters are, and emit
# CSS rules that colour every letter on a red -> black -> green gradient.
#
# This is a notebook-style script: it runs top to bottom, downloads its input
# on every run, and leaves its intermediate results as module-level names.

import json
import math
import re
import urllib.request
from collections import Counter
from itertools import chain
from zipfile import ZipFile

import pandas as pd

# --- Fetch and load the FDA product table ----------------------------------

urllib.request.urlretrieve(
    "http://www.fda.gov/downloads/Drugs/InformationOnDrugs/UCM054599.zip",
    "UCM054599.zip",
)

with ZipFile("UCM054599.zip", 'r') as drugdata:
    print([entry.filename for entry in drugdata.infolist()])
    # ZipFile.open() only accepts mode 'r' or 'w'; the former 'U' mode raises
    # ValueError on current Python versions, so the default mode is used.
    with drugdata.open('Product.txt') as productfile:
        drugdataframe = pd.read_table(productfile)

# --- Reduce product names to bare words ------------------------------------

# Regex -> replacement pairs handed to Series.replace(regex=True). They strip
# salts, dosage forms, strengths and packaging notes from the product names.
# Insertion order is kept exactly as written. Raw strings keep the regex
# escapes (\d, \(, \s) from being read as string escapes.
name_cleanup = {
    r' SODIUM': '',
    r' PRESERVATIVE FREE': '',
    r' HYDROCHLORIDE': '',
    r' HCL': '',
    r' FOR ORAL SOLUTION': '',
    r' INJECTION': '',
    r' LOCK FLUSH': '',
    r' PHOSPHATE': '',
    r' SULFATE': '',
    r' SINUS': '',
    r' CHLORIDE': '',
    r' DILUTE': '',
    r' COLD': '',
    r' IV$': '',
    r' E$': '',
    r'^SODIUM ': '',
    r'(^| )CALCIUM( |$)': '',
    r'(^| )IRON( |$)': '',
    r' TARTRATE': '',
    r' TEBUTATE': '',
    r' HALF-STRENGTH': '',
    r'( R)? PEN$': '',
    r' PLAIN': '',
    r'\d*[ -]HOUR': '',
    r'TECHNETIUM.*': 'TECHNETIUM',
    r'.*LUNGAGGREGATE.*': '',
    r' \(PRESERVATIVE FREE\)': '',
    r' \(NEEDS NO REFRIGERATION\)': '',
    r' IN PLASTIC CONTAINER': '',
    r'( G.)?U. IRRIGANT': '',
    r'[- ](C|SR|RX|PM|D|RF|CR|XL|ODT|CD|ES|HCT|HP|XR)(\s|$)': '',
    r'[- ,][0-9.]+[%]': '',
    r'[- ,#/][A-Z]?[0-9.]+': '',
    r' DISPERDOSE': '',
    r'%': '',
}

allnames = drugdataframe.drugname.replace(name_cleanup, regex=True).unique()

# Alternative phrase-level split (disabled):
# allnames = set(chain.from_iterable(re.split("( AND| W/|,|;| &| IN)( |$)", name) for name in allnames))

# Split every cleaned name on spaces and punctuation. The capturing group
# means the separators themselves also land in the result; they are removed
# below by the length filter.
token_split = re.compile(r"""( |-|/|;|\)|\(|\.|,|'|:|")""")
allnames = set(chain.from_iterable(token_split.split(name) for name in allnames))

# Hand-picked tokens to drop.
allnames = allnames.difference({
    'MAXALT-MLT',
    'LUVOX CR',
    'TRAVASOL% IN DEXTROSE%',
    'CARBONATE',
    '',
    'PN',
    'PBZ',
    'AVC',
    'M.V.I. (WITHOUT VITAMIN K)',
    'EPI E Z PEN JR',
    'Lamivudine/Zidovudine mg mg Tablets Co-packaged with Nevirapine mg Tablets',
    ' &',
    'AMINOSYN-RF',
    'POLY',
    '8-MOP',
})

# Keep tokens longer than four characters that contain no digit, shortest
# first, upper-cased.
contains_digit = re.compile(r'.*\d.*')
allnames = [
    token.upper()
    for token in sorted(allnames, key=len)
    if len(token) > 4 and not contains_digit.match(token)
]
print(len(allnames))
print(allnames)

# --- Letter frequencies ----------------------------------------------------

# Count every character over all retained names, then express each count as
# a percentage of the total number of characters.
charcounts = Counter(chain.from_iterable(allnames))
totchar = sum(charcounts.values())
drugfreq = {char: count / (totchar / 100) for char, count in charcounts.items()}
drugfreq  # notebook cell output

#Frequency in English, taken from http://www.math.cornell.edu/~mec/2003-2004/cryptography/subs/frequencies.html
engfreq = {
    'E': 12.02,
    'T': 9.10,
    'A': 8.12,
    'O': 7.68,
    'I': 7.31,
    'N': 6.95,
    'S': 6.28,
    'R': 6.02,
    'H': 5.92,
    'D': 4.32,
    'L': 3.98,
    'U': 2.88,
    'C': 2.71,
    'M': 2.61,
    'F': 2.30,
    'Y': 2.11,
    'W': 2.09,
    'G': 2.03,
    'P': 1.82,
    'B': 1.49,
    'V': 1.11,
    'K': 0.69,
    'X': 0.17,
    'Q': 0.11,
    'J': 0.10,
    'Z': 0.07,
}

# Log-ratio of drug-name frequency to English frequency per letter: positive
# means the letter is over-represented in drug names.
# NOTE: engfreq only has the keys A-Z, so any other character that survives
# the cleanup above raises KeyError here.
relfreq = {char: math.log(freq / engfreq[char]) for char, freq in drugfreq.items()}
relfreq  # notebook cell output

# Equivalent of the notebook magic `%matplotlib inline`, written so the file
# is also valid when run as a plain Python script (where get_ipython is not
# defined).
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

pd.Series(relfreq).plot(kind='bar')

#Let's get the data as json to put into Angular
print(json.dumps(relfreq, sort_keys=True, indent=4,))


# --- Scoring ---------------------------------------------------------------

def totalscore(name):
    """Return the summed ``relfreq`` value of every letter in *name*.

    *name* is upper-cased before lookup and the sum is rounded to two
    decimal places. Raises ``KeyError`` if *name* contains a character that
    has no entry in ``relfreq``.
    """
    return round(sum(relfreq[letter] for letter in name.upper()), 2)


scores = {name: totalscore(name) for name in allnames}
# Series.order() no longer exists in pandas; sort_values() is its replacement.
pd.Series(scores).sort_values(ascending=False).head(10)  # notebook cell output

# --- Map letter values onto a colour gradient ------------------------------

maxrel = max(relfreq.values())
minrel = min(relfreq.values())
print(maxrel, minrel)

# Min-max scale the log-ratios into [0, 1].
scaledrel = (pd.Series(relfreq) - minrel)/(maxrel - minrel)

# Colour ramp: pure red fading to near-black, then near-black rising to pure
# green. Index 0 is the lowest scaled value, the last index the highest.
cmap = [
    "#ff0000", "#fd0000", "#fb0000", "#f90000", "#f70000", "#f50000", "#f30000", "#f10000",
    "#ef0000", "#ec0000", "#ea0000", "#e80000", "#e60000", "#e40000", "#e20000", "#e00000",
    "#de0000", "#dc0000", "#da0000", "#d80000", "#d60000", "#d40000", "#d20000", "#d00000",
    "#ce0000", "#cb0000", "#c90000", "#c70000", "#c50000", "#c30000", "#c10000", "#bf0000",
    "#bd0000", "#bb0000", "#b90000", "#b70000", "#b50000", "#b30000", "#b10000", "#af0000",
    "#ad0000", "#ab0000", "#a80000", "#a60000", "#a40000", "#a20000", "#a00000", "#9e0000",
    "#9c0000", "#9a0000", "#980000", "#960000", "#940000", "#920000", "#900000", "#8e0000",
    "#8c0000", "#8a0000", "#870000", "#850000", "#830000", "#810000", "#7f0000", "#7d0000",
    "#7b0000", "#790000", "#770000", "#750000", "#730000", "#710000", "#700000", "#6e0000",
    "#6c0000", "#6a0000", "#680000", "#660000", "#640000", "#620000", "#600000", "#5e0000",
    "#5c0000", "#5a0000", "#580000", "#560000", "#540000", "#520000", "#500000", "#4f0000",
    "#4d0000", "#4b0000", "#490000", "#470000", "#450000", "#430000", "#410000", "#3f0000",
    "#3d0000", "#3b0000", "#390000", "#370000", "#350000", "#330000", "#310000", "#300000",
    "#2e0000", "#2c0000", "#2a0000", "#280000", "#260000", "#240000", "#220000", "#200000",
    "#1e0000", "#1c0000", "#1a0000", "#180000", "#160000", "#140000", "#120000", "#100000",
    "#0f0000", "#0d0000", "#0b0000", "#090000", "#070000", "#050000", "#030000", "#010000",
    "#000100", "#000300", "#000500", "#000700", "#000900", "#000b00", "#000d00", "#000f00",
    "#001100", "#001300", "#001500", "#001700", "#001900", "#001b00", "#001c00", "#001e00",
    "#002000", "#002200", "#002400", "#002600", "#002800", "#002a00", "#002c00", "#002e00",
    "#003000", "#003200", "#003400", "#003600", "#003800", "#003a00", "#003c00", "#003e00",
    "#004000", "#004200", "#004400", "#004600", "#004800", "#004a00", "#004c00", "#004e00",
    "#005000", "#005100", "#005300", "#005500", "#005700", "#005900", "#005b00", "#005d00",
    "#005f00", "#006100", "#006300", "#006500", "#006700", "#006900", "#006b00", "#006d00",
    "#006f00", "#007100", "#007300", "#007500", "#007700", "#007900", "#007b00", "#007d00",
    "#007f00", "#008100", "#008300", "#008500", "#008700", "#008900", "#008b00", "#008d00",
    "#008f00", "#009100", "#009300", "#009500", "#009700", "#009900", "#009b00", "#009d00",
    "#009f00", "#00a200", "#00a400", "#00a600", "#00a800", "#00aa00", "#00ac00", "#00ae00",
    "#00b000", "#00b200", "#00b400", "#00b600", "#00b800", "#00ba00", "#00bc00", "#00be00",
    "#00c000", "#00c200", "#00c400", "#00c600", "#00c800", "#00cb00", "#00cd00", "#00cf00",
    "#00d100", "#00d300", "#00d500", "#00d700", "#00d900", "#00db00", "#00dd00", "#00df00",
    "#00e100", "#00e300", "#00e500", "#00e700", "#00e900", "#00eb00", "#00ed00", "#00ef00",
    "#00f100", "#00f300", "#00f600", "#00f800", "#00fa00", "#00fc00", "#00fe00", "#00ff00",
]


def _print_letter_css(colorindex):
    """Print one ``.char<X>`` CSS rule per key of *colorindex*, in sorted order.

    *colorindex* maps a character to an index into ``cmap``; the colour at
    that index becomes the rule's ``color`` value.
    """
    for char in sorted(colorindex):
        print(".char%s {" % (char,))
        print('\tcolor: %s;\n}' % (cmap[colorindex[char]],))


# First pass: linear mapping of the scaled values onto cmap indices.
scaleddict = (scaledrel * (len(cmap) - 1)).map(math.floor).to_dict()
_print_letter_css(scaleddict)


def test(num):
    """Return the logistic function of *num*, centred on 0.5 with slope 25.

    Values below 0.5 are pushed towards 0 and values above 0.5 towards 1.
    """
    return 1/(1+math.exp(-25*((num-0.50))))


ax = pd.DataFrame({'orig': scaledrel, 'scaled': scaledrel.map(test)}).hist()
ax[0][0].figure.suptitle('Histograms showing the distribution of letter values before and after scaling', fontsize=12)

# Second pass: same mapping after the logistic stretch. This overwrites
# scaleddict, as in the original notebook.
scaleddict = (scaledrel.map(test) * (len(cmap) - 1)).map(math.floor).to_dict()
_print_letter_css(scaleddict)