print "Before Pandas", len(dir())
import pandas
from pandas import *
print "After Pandas", len(dir())
from numpy import *
print "After NumPy", len(dir())
Before Pandas 21 After Pandas 165 After NumPy 706
Reading Data
import urlparse
import httplib
import pandas
csv_data = "http://priede.bf.lu.lv/ftp/grozs/Datorlietas/Geog5028/TIS_PRG/HomePlanet/ASTEROID.CSV"
fwf_data = "http://ssd.jpl.nasa.gov/dat/ELEMENTS.NUMBR"
tax_data = "http://sbn.psi.edu/ferret/reformatTable.action?productId=TAXONOMY10_TAB&dataSetId=EAR-A-5-DDR-TAXONOMY-V6.0"
def save_file(location):
"""Read data at url"""
url = urlparse.urlparse(location)
connection = httplib.HTTPConnection(url.netloc)
connection.connect()
connection.request('GET', url.path)
response = connection.getresponse()
data = response.read()
filename = '/tmp/' + url.path.split('/')[-1]
print filename
with open(filename, 'rw+') as tmp:
tmp.write(data)
return filename
# Read NASA asteroid data
#filename = save_file(fwf_data)
filename = "data/ELEMENTS.NUMBR.txt"
with open(filename, 'r') as f:
widths = map(lambda line: len(line) + 1, f.read().splitlines()[1].split(' '))
asteroids_nasa = pandas.read_fwf(filename, widths=widths, skiprows=[1])
widths = [11, 18, 11, 13, 13, 14, 14, 14, 14, 13, 13, 12, 12, 10, 10, 15, 15, 16, 15, 22]
asteroids_taxonomy = pandas.read_fwf(tax_data, widths=widths, skiprows=[1])
print asteroids_nasa.xs(0)
print '\n'
print asteroids_taxonomy.xs(0)
Num 1 AST_NAME Ceres Epoch 56600 Semimajor Axis 2.766807 Eccentricity 0.07579726 i 10.59398 w 72.29215 Node 80.32764 M 10.55758 H 3.34 G 0.12 Ref\n JPL 32\n Name: 0, dtype: object AST_NUMBER 1 AST_NAME Ceres PROV_ID - THOLEN_CLASS G THOLEN_PARAM 7G BARUCCI_CLASS G0 BARUCCI_PARAM 7I TEDESCO_CLASS G? TEDESCO_PARAM 2I HOWELL_CLASS CvB HOWELL_PARAM 65 SMASS_CLASS - SMASS_PARAM - BUS_CLASS C BUS_PARAM s S3OS2_CLASS_TH C S3OS2_CLASS_BB C BUS_DEMEO_CLASS C DEMEO_REF_CODE a COMMENT \n - \n Name: 0, dtype: object
Descriptions
asteroids_nasa.ix[:, 0:2].head(10)
Num | Name | |
---|---|---|
0 | 1 | Ceres |
1 | 2 | Pallas |
2 | 3 | Juno |
3 | 4 | Vesta |
4 | 5 | Astraea |
5 | 6 | Hebe |
6 | 7 | Iris |
7 | 8 | Flora |
8 | 9 | Metis |
9 | 10 | Hygiea |
asteroids_nasa.describe()
<class 'pandas.core.frame.DataFrame'> Index: 8 entries, count to max Data columns (total 10 columns): Num 8 non-null values Epoch 8 non-null values a 8 non-null values e 8 non-null values i 8 non-null values w 8 non-null values Node 8 non-null values M 8 non-null values H 8 non-null values G 8 non-null values dtypes: float64(10)
asteroids_nasa.ix[:, 3].describe()
count 369956.000000 mean 2.714804 std 2.343523 min 0.617616 25% 2.381873 50% 2.625409 75% 2.949065 max 780.919318 dtype: float64
asteroids_nasa.ix[:, 'a'].describe()
count 369956.000000 mean 2.714804 std 2.343523 min 0.617616 25% 2.381873 50% 2.625409 75% 2.949065 max 780.919318 dtype: float64
asteroids_nasa.ix[:, 3:7].corr()
a | e | i | w | |
---|---|---|---|---|
a | 1.000000 | 0.028508 | 0.057180 | 0.001323 |
e | 0.028508 | 1.000000 | 0.102916 | 0.014825 |
i | 0.057180 | 0.102916 | 1.000000 | -0.000701 |
w | 0.001323 | 0.014825 | -0.000701 | 1.000000 |
asteroids_nasa.rename(columns={'Name': 'AST_NAME', 'a': 'Semimajor Axis', 'e': 'Eccentricity'})
<class 'pandas.core.frame.DataFrame'> Int64Index: 369956 entries, 0 to 369955 Data columns (total 12 columns): Num 369956 non-null values AST_NAME 369956 non-null values Epoch 369956 non-null values Semimajor Axis 369956 non-null values Eccentricity 369956 non-null values i 369956 non-null values w 369956 non-null values Node 369956 non-null values M 369956 non-null values H 369956 non-null values G 369956 non-null values Ref 369956 non-null values dtypes: float64(8), int64(2), object(2)
asteroids_nasa
<class 'pandas.core.frame.DataFrame'> Int64Index: 369956 entries, 0 to 369955 Data columns (total 12 columns): Num 369956 non-null values Name 369956 non-null values Epoch 369956 non-null values a 369956 non-null values e 369956 non-null values i 369956 non-null values w 369956 non-null values Node 369956 non-null values M 369956 non-null values H 369956 non-null values G 369956 non-null values Ref 369956 non-null values dtypes: float64(8), int64(2), object(2)
asteroids_nasa.rename(columns={'Name': 'AST_NAME', 'a': 'Semimajor Axis', 'e': 'Eccentricity'}, inplace=True)
asteroids_nasa
<class 'pandas.core.frame.DataFrame'> Int64Index: 369956 entries, 0 to 369955 Data columns (total 12 columns): Num 369956 non-null values AST_NAME 369956 non-null values Epoch 369956 non-null values Semimajor Axis 369956 non-null values Eccentricity 369956 non-null values i 369956 non-null values w 369956 non-null values Node 369956 non-null values M 369956 non-null values H 369956 non-null values G 369956 non-null values Ref 369956 non-null values dtypes: float64(8), int64(2), object(2)
asteroids_taxonomy
<class 'pandas.core.frame.DataFrame'> Int64Index: 2615 entries, 0 to 2614 Data columns (total 20 columns): AST_NUMBER 2615 non-null values AST_NAME 2615 non-null values PROV_ID 2615 non-null values THOLEN_CLASS 2615 non-null values THOLEN_PARAM 2615 non-null values BARUCCI_CLASS 2615 non-null values BARUCCI_PARAM 2615 non-null values TEDESCO_CLASS 2615 non-null values TEDESCO_PARAM 2615 non-null values HOWELL_CLASS 2615 non-null values HOWELL_PARAM 2615 non-null values SMASS_CLASS 2615 non-null values SMASS_PARAM 2615 non-null values BUS_CLASS 2615 non-null values BUS_PARAM 2615 non-null values S3OS2_CLASS_TH 2615 non-null values S3OS2_CLASS_BB 2615 non-null values BUS_DEMEO_CLASS 2615 non-null values DEMEO_REF_CODE 2615 non-null values COMMENT 2615 non-null values dtypes: int64(1), object(19)
merged = asteroids_nasa.merge(asteroids_taxonomy, on='AST_NAME')
print merged
<class 'pandas.core.frame.DataFrame'> Int64Index: 2415 entries, 0 to 2414 Data columns (total 31 columns): Num 2415 non-null values AST_NAME 2415 non-null values Epoch 2415 non-null values Semimajor Axis 2415 non-null values Eccentricity 2415 non-null values i 2415 non-null values w 2415 non-null values Node 2415 non-null values M 2415 non-null values H 2415 non-null values G 2415 non-null values Ref 2415 non-null values AST_NUMBER 2415 non-null values PROV_ID 2415 non-null values THOLEN_CLASS 2415 non-null values THOLEN_PARAM 2415 non-null values BARUCCI_CLASS 2415 non-null values BARUCCI_PARAM 2415 non-null values TEDESCO_CLASS 2415 non-null values TEDESCO_PARAM 2415 non-null values HOWELL_CLASS 2415 non-null values HOWELL_PARAM 2415 non-null values SMASS_CLASS 2415 non-null values SMASS_PARAM 2415 non-null values BUS_CLASS 2415 non-null values BUS_PARAM 2415 non-null values S3OS2_CLASS_TH 2415 non-null values S3OS2_CLASS_BB 2415 non-null values BUS_DEMEO_CLASS 2415 non-null values DEMEO_REF_CODE 2415 non-null values COMMENT 2415 non-null values dtypes: float64(8), int64(3), object(20)
merged.pop('Ref\n')
0 JPL 32\n 1 JPL 26\n 2 JPL 102\n 3 JPL 33\n 4 JPL 83\n 5 JPL 82\n 6 JPL 105\n 7 JPL 90\n 8 JPL 90\n 9 JPL 85\n 10 JPL 70\n 11 JPL 87\n 12 JPL 60\n 13 JPL 57\n 14 JPL 70\n ... 2400 JPL 21\n 2401 JPL 2\n 2402 JPL 3\n 2403 JPL 50\n 2404 JPL 120\n 2405 JPL 1\n 2406 JPL 31\n 2407 JPL 1\n 2408 JPL 3\n 2409 JPL 1\n 2410 JPL 3\n 2411 JPL 1\n 2412 JPL 1\n 2413 JPL 1\n 2414 JPL 127\n Name: Ref , Length: 2415, dtype: object
merged.pop('COMMENT \n')
0 - \n 1 - \n 2 - \n 3 - \n 4 - \n 5 - \n 6 - \n 7 - \n 8 - \n 9 - \n 10 - \n 11 - \n 12 - \n 13 - \n 14 - \n ... 2400 - \n 2401 - \n 2402 - \n 2403 - \n 2404 - \n 2405 - \n 2406 - \n 2407 - \n 2408 - \n 2409 - \n 2410 - \n 2411 - \n 2412 - \n 2413 - \n 2414 - \n Name: COMMENT , Length: 2415, dtype: object
merged.columns
Index([Num, AST_NAME, Epoch, Semimajor Axis, Eccentricity, i, w, Node, M, H, G, AST_NUMBER, PROV_ID, THOLEN_CLASS, THOLEN_PARAM, BARUCCI_CLASS, BARUCCI_PARAM, TEDESCO_CLASS, TEDESCO_PARAM, HOWELL_CLASS, HOWELL_PARAM, SMASS_CLASS, SMASS_PARAM, BUS_CLASS, BUS_PARAM, S3OS2_CLASS_TH, S3OS2_CLASS_BB, BUS_DEMEO_CLASS, DEMEO_REF_CODE], dtype=object)
ref_codes = merged.pop('DEMEO_REF_CODE')
merged
<class 'pandas.core.frame.DataFrame'> Int64Index: 2415 entries, 0 to 2414 Data columns (total 28 columns): Num 2415 non-null values AST_NAME 2415 non-null values Epoch 2415 non-null values Semimajor Axis 2415 non-null values Eccentricity 2415 non-null values i 2415 non-null values w 2415 non-null values Node 2415 non-null values M 2415 non-null values H 2415 non-null values G 2415 non-null values AST_NUMBER 2415 non-null values PROV_ID 2415 non-null values THOLEN_CLASS 2415 non-null values THOLEN_PARAM 2415 non-null values BARUCCI_CLASS 2415 non-null values BARUCCI_PARAM 2415 non-null values TEDESCO_CLASS 2415 non-null values TEDESCO_PARAM 2415 non-null values HOWELL_CLASS 2415 non-null values HOWELL_PARAM 2415 non-null values SMASS_CLASS 2415 non-null values SMASS_PARAM 2415 non-null values BUS_CLASS 2415 non-null values BUS_PARAM 2415 non-null values S3OS2_CLASS_TH 2415 non-null values S3OS2_CLASS_BB 2415 non-null values BUS_DEMEO_CLASS 2415 non-null values dtypes: float64(8), int64(3), object(17)
merged.insert(11, 'DEMEO_REF_CODE', ref_codes)
merged
<class 'pandas.core.frame.DataFrame'> Int64Index: 2415 entries, 0 to 2414 Data columns (total 29 columns): Num 2415 non-null values AST_NAME 2415 non-null values Epoch 2415 non-null values Semimajor Axis 2415 non-null values Eccentricity 2415 non-null values i 2415 non-null values w 2415 non-null values Node 2415 non-null values M 2415 non-null values H 2415 non-null values G 2415 non-null values DEMEO_REF_CODE 2415 non-null values AST_NUMBER 2415 non-null values PROV_ID 2415 non-null values THOLEN_CLASS 2415 non-null values THOLEN_PARAM 2415 non-null values BARUCCI_CLASS 2415 non-null values BARUCCI_PARAM 2415 non-null values TEDESCO_CLASS 2415 non-null values TEDESCO_PARAM 2415 non-null values HOWELL_CLASS 2415 non-null values HOWELL_PARAM 2415 non-null values SMASS_CLASS 2415 non-null values SMASS_PARAM 2415 non-null values BUS_CLASS 2415 non-null values BUS_PARAM 2415 non-null values S3OS2_CLASS_TH 2415 non-null values S3OS2_CLASS_BB 2415 non-null values BUS_DEMEO_CLASS 2415 non-null values dtypes: float64(8), int64(3), object(18)
from pandas.stats.api import ols
model = ols(y=asteroids_nasa.xs('Semimajor Axis', axis=1), x=asteroids_nasa.xs('i', axis=1))
print model
-------------------------Summary of Regression Analysis------------------------- Formula: Y ~ <x> + <intercept> Number of Observations: 369956 Number of Degrees of Freedom: 2 R-squared: 0.0033 Adj R-squared: 0.0033 Rmse: 2.3397 F-stat (1, 369954): 1213.5610, p-value: 0.0000 Degrees of Freedom: model 1, resid 369954 -----------------------Summary of Estimated Coefficients------------------------ Variable Coef Std Err t-stat p-value CI 2.5% CI 97.5% -------------------------------------------------------------------------------- x 0.0233 0.0007 34.84 0.0000 0.0220 0.0246 intercept 2.5314 0.0065 388.25 0.0000 2.5186 2.5442 ---------------------------------End of Summary---------------------------------
import matplotlib.pyplot as plt
--------------------------------------------------------------------------- ImportError Traceback (most recent call last) <ipython-input-6-eff513f636fd> in <module>() ----> 1 import matplotlib.pyplot as plt ImportError: No module named matplotlib.pyplot