import pandas as pd
import numpy as np
from __future__ import division
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.mpl_style = 'default'
header_row = ['mpg', 'cyl', 'dis', 'hp', 'wt', 'acc', 'my','org','car_name']
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
car_data = pd.read_csv(url, header=0, na_values = '?', delimiter = '\s+')
car_data.columns = header_row
car_data.head()
mpg | cyl | dis | hp | wt | acc | my | org | car_name | |
---|---|---|---|---|---|---|---|---|---|
0 | 15 | 8 | 350 | 165 | 3693 | 11.5 | 70 | 1 | buick skylark 320 |
1 | 18 | 8 | 318 | 150 | 3436 | 11.0 | 70 | 1 | plymouth satellite |
2 | 16 | 8 | 304 | 150 | 3433 | 12.0 | 70 | 1 | amc rebel sst |
3 | 17 | 8 | 302 | 140 | 3449 | 10.5 | 70 | 1 | ford torino |
4 | 15 | 8 | 429 | 198 | 4341 | 10.0 | 70 | 1 | ford galaxie 500 |
subset_70 = car_data[car_data['my'] == 70]
subset_82 = car_data[car_data['my'] == 82]
print subset_70.describe()
print subset_82.describe()
mpg cyl dis hp wt acc my org count 28.000000 28.000000 28.000000 28.000000 28.000000 28.000000 28 28.000000 mean 17.678571 6.714286 280.500000 148.464286 3368.107143 12.982143 70 1.321429 std 5.436867 1.739671 126.605395 54.609375 868.138679 3.387018 0 0.611832 min 9.000000 4.000000 97.000000 46.000000 1835.000000 8.000000 70 1.000000 25% 14.000000 5.500000 178.750000 95.000000 2632.750000 10.000000 70 1.000000 50% 15.500000 8.000000 305.500000 150.000000 3442.500000 13.000000 70 1.000000 75% 22.500000 8.000000 384.750000 198.500000 4319.250000 15.125000 70 1.250000 max 27.000000 8.000000 455.000000 225.000000 4732.000000 20.500000 70 3.000000 mpg cyl dis hp wt acc my org count 31.000000 31.000000 31.000000 30.000000 31.000000 31.000000 31 31.000000 mean 31.709677 4.193548 128.870968 81.466667 2453.548387 16.638710 82 1.645161 std 5.392548 0.601074 39.352037 13.296962 354.276713 2.484844 0 0.914636 min 22.000000 4.000000 91.000000 52.000000 1965.000000 11.600000 82 1.000000 25% 27.000000 4.000000 105.000000 70.000000 2127.500000 14.850000 82 1.000000 50% 32.000000 4.000000 119.000000 84.000000 2525.000000 16.400000 82 1.000000 75% 36.000000 4.000000 142.000000 88.000000 2727.500000 18.000000 82 3.000000 max 44.000000 6.000000 262.000000 112.000000 3035.000000 24.600000 82 3.000000
print car_data[car_data.hp.isnull()]
mpg cyl dis hp wt acc my org car_name 31 25.0 4 98 NaN 2046 19.0 71 1 ford pinto 125 21.0 6 200 NaN 2875 17.0 74 1 ford maverick 329 40.9 4 85 NaN 1835 17.3 80 2 renault lecar deluxe 335 23.6 4 140 NaN 2905 14.3 80 1 ford mustang cobra 353 34.5 4 100 NaN 2320 15.8 81 2 renault 18i 373 23.0 4 151 NaN 3035 20.5 82 1 amc concord dl
grouped = car_data.groupby('my')
grouped.hp.mean()
my 70 148.464286 71 107.037037 72 120.178571 73 130.475000 74 94.230769 75 101.066667 76 101.117647 77 105.071429 78 99.694444 79 101.206897 80 77.481481 81 81.035714 82 81.466667 Name: hp, dtype: float64