import pandas as pd import numpy as np import MySQLdb as mdb AWSAccessKeyId, AWSSecretKey = ( line.strip().split('=')[1] for line in open('/Users/brian/rootkey.csv','r') ) sshKeyName, instancePass, mysqlPass, myIP = ( line.strip().split('=')[1] for line in open('/Users/brian/passwords.csv','r') ) def sqlCall(cmd): con = mdb.connect( host = 'ec2-54-91-115-227.compute-1.amazonaws.com', passwd = mysqlPass, user = 'weatherdbuser', db='weather' ); cur = con.cursor() cur.execute(cmd) field_names = [i[0] for i in cur.description] output = [] for i in range(cur.rowcount): output.append(cur.fetchone()) con.close() df = pd.DataFrame(output) try: df.columns = field_names except: pass return df cmd = "SELECT * FROM country LIMIT 5" cmd += ";" sqlCall(cmd) cmd = "SELECT * FROM station LIMIT 5" cmd += ";" sqlCall(cmd) cmd = "SELECT * FROM observation LIMIT 5" cmd += ";" sqlCall(cmd) cmd = "SELECT USAF, FIPS_ID, obs.numYears " cmd += "FROM station " cmd += "JOIN (SELECT STN, COUNT(DISTINCT year) as numYears " cmd += "FROM observation " cmd += "GROUP BY STN) as obs " cmd += "ON station.USAF = obs.STN " cmd += ";" r = sqlCall(cmd) r.head() idx = r.groupby(['FIPS_ID'])['numYears'].transform(max) == r['numYears'] longestPerCountry = r[idx].sort('FIPS_ID') longestPerCountry.tail() fn = lambda obj: obj.loc[np.random.choice(obj.index, 1, True),:] selectedSites = longestPerCountry.groupby('FIPS_ID', as_index=False).apply(fn) selectedSites.tail() selectedSitesList = selectedSites['USAF'].tolist() selectedSitesList[:10] slist = str(selectedSitesList)[1:-1] slist[:100] cmd = "SELECT s.USAF, s.station_name, c.country_name, " cmd += "(s.lat / 1000) as lat, (s.lon / 1000) as lon, (s.elev / 10) as elev " cmd += "FROM station s " cmd += "JOIN country c ON s.FIPS_ID = c.FIPS_ID " cmd += "WHERE s.USAF IN (" + slist + ");" sites = sqlCall(cmd) sites[['lat','lon','elev']] = sites[['lat','lon','elev']].astype(float) sites.head() siteTemps = [] for site in selectedSitesList: cmd = "SELECT STN, year, month, AVG(temp), MAX(maxtemp), MIN(mintemp) " cmd += "FROM observation " cmd += "WHERE STN = '" + site + "' " cmd += "GROUP BY year, month " cmd += ";" siteTemps.append(sqlCall(cmd)) siteTemps[0].head(10) sites.plot(x='lon',y='lat',kind='scatter') grapher = siteTemps[0] grapher[['year']] = grapher[['year']].astype(int) grapher[['month']] = grapher[['month']].astype(int) grapher[['t']] = grapher[['AVG(temp)']].astype(float) / 10 grapher.groupby('year') grapher.plot(x='month',y='t',kind='scatter') sites.to_csv('sites.csv',sep='\t',index=False) for i in range(len(selectedSitesList)): siteTemps[i].to_csv('sitesTemps' + str(i) + '.csv',sep='\t',index=False)