In [1]:
# Load the dataset

import pandas as pd

# Path of the Cars93 CSV file, relative to the notebook's working directory.
filename = '../data/cars93.csv'
In [2]:
# Read the CSV into a DataFrame, letting pandas infer the column dtypes.
df = pd.read_csv(filename)
In [3]:
# (number of rows, number of columns) of the loaded frame.
df.shape
Out[3]:
(93, 27)
In [4]:
# Column labels. Several contain a dot (e.g. 'MPG.city'), so those columns
# can only be selected with bracket syntax, not attribute access.
df.columns
Out[4]:
Index(['Manufacturer', 'Model', 'Type', 'Min.Price', 'Price', 'Max.Price',
       'MPG.city', 'MPG.highway', 'AirBags', 'DriveTrain', 'Cylinders',
       'EngineSize', 'Horsepower', 'RPM', 'Rev.per.mile', 'Man.trans.avail',
       'Fuel.tank.capacity', 'Passengers', 'Length', 'Wheelbase', 'Width',
       'Turn.circle', 'Rear.seat.room', 'Luggage.room', 'Weight', 'Origin',
       'Make'],
      dtype='object')
In [27]:
# Preview the first 8 rows.
df.head(8)
Out[27]:
Manufacturer Model Type Min.Price Price Max.Price MPG.city MPG.highway AirBags DriveTrain ... Passengers Length Wheelbase Width Turn.circle Rear.seat.room Luggage.room Weight Origin Make
0 Acura Integra Small 12.9 15.9 18.8 25 31 None Front ... 5 177 102 68 37 26.5 11.0 2705 non-USA Acura Integra
1 Acura Legend Midsize 29.2 33.9 38.7 18 25 Driver & Passenger Front ... 5 195 115 71 38 30.0 15.0 3560 non-USA Acura Legend
2 Audi 90 Compact 25.9 29.1 32.3 20 26 Driver only Front ... 5 180 102 67 37 28.0 14.0 3375 non-USA Audi 90
3 Audi 100 Midsize 30.8 37.7 44.6 19 26 Driver & Passenger Front ... 6 193 106 70 37 31.0 17.0 3405 non-USA Audi 100
4 BMW 535i Midsize 23.7 30.0 36.2 22 30 Driver only Rear ... 4 186 109 69 39 27.0 13.0 3640 non-USA BMW 535i
5 Buick Century Midsize 14.2 15.7 17.3 22 31 Driver only Front ... 6 189 105 69 41 28.0 16.0 2880 USA Buick Century
6 Buick LeSabre Large 19.9 20.8 21.7 19 28 Driver only Front ... 6 200 111 74 42 30.5 17.0 3470 USA Buick LeSabre
7 Buick Roadmaster Large 22.6 23.7 24.9 16 25 Driver only Rear ... 6 216 116 78 45 30.5 21.0 4105 USA Buick Roadmaster

8 rows × 27 columns

In [ ]:
# Peek at the first few values only instead of printing the whole column.
df['MPG.city'].head()
In [5]:
# Per-column dtypes as inferred by read_csv; `object` marks the text columns.
df.dtypes
Out[5]:
Manufacturer           object
Model                  object
Type                   object
Min.Price             float64
Price                 float64
Max.Price             float64
MPG.city                int64
MPG.highway             int64
AirBags                object
DriveTrain             object
Cylinders              object
EngineSize            float64
Horsepower              int64
RPM                     int64
Rev.per.mile            int64
Man.trans.avail        object
Fuel.tank.capacity    float64
Passengers              int64
Length                  int64
Wheelbase               int64
Width                   int64
Turn.circle             int64
Rear.seat.room        float64
Luggage.room          float64
Weight                  int64
Origin                 object
Make                   object
dtype: object
In [6]:
import matplotlib.pyplot as plt
%matplotlib inline

df.Horsepower.hist(bins = 10)
Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x10de4de10>
In [41]:
# Bracket-style column selection: works for any column label.
df['Manufacturer'].head()
Out[41]:
0    Acura
1    Acura
2     Audi
3     Audi
4      BMW
Name: Manufacturer, dtype: object
In [42]:
# Attribute-style selection of the same column. This shortcut only works when
# the label is a valid Python identifier (so not for labels like 'MPG.city').
df.Manufacturer.head()
Out[42]:
0    Acura
1    Acura
2     Audi
3     Audi
4      BMW
Name: Manufacturer, dtype: object
In [44]:
# Summary statistics (count, mean, std, min, quartiles, max) for MPG.city.
df['MPG.city'].describe()
Out[44]:
count    93.000000
mean     22.365591
std       5.619812
min      15.000000
25%      18.000000
50%      21.000000
75%      25.000000
max      46.000000
Name: MPG.city, dtype: float64
In [45]:
# Pairwise correlations between the numeric columns. The text columns are
# dropped explicitly: recent pandas versions no longer discard them silently
# in corr() and raise on non-numeric data instead.
df.select_dtypes(include='number').corr()
Out[45]:
Min.Price Price Max.Price MPG.city MPG.highway EngineSize Horsepower RPM Rev.per.mile Fuel.tank.capacity Passengers Length Wheelbase Width Turn.circle Rear.seat.room Luggage.room Weight
Min.Price 1.000000 0.970601 0.906756 -0.622875 -0.579966 0.645488 0.802444 -0.042598 -0.470395 0.635369 0.061236 0.553859 0.516758 0.492878 0.428603 0.376642 0.413485 0.666554
Price 0.970601 1.000000 0.981580 -0.594562 -0.560680 0.597425 0.788218 -0.004955 -0.426395 0.619480 0.057860 0.503628 0.500864 0.456028 0.392590 0.311499 0.366569 0.647179
Max.Price 0.906756 0.981580 1.000000 -0.547811 -0.522561 0.535012 0.744445 0.025015 -0.374024 0.581294 0.053216 0.442933 0.467501 0.408414 0.347785 0.247260 0.315315 0.605142
MPG.city -0.622875 -0.594562 -0.547811 1.000000 0.943936 -0.710003 -0.672636 0.363045 0.695857 -0.813144 -0.416856 -0.666239 -0.667108 -0.720534 -0.666389 -0.384347 -0.494894 -0.843139
MPG.highway -0.579966 -0.560680 -0.522561 0.943936 1.000000 -0.626795 -0.619044 0.313469 0.587497 -0.786039 -0.466386 -0.542897 -0.615384 -0.640359 -0.593683 -0.366684 -0.371629 -0.810658
EngineSize 0.645488 0.597425 0.535012 -0.710003 -0.626795 1.000000 0.732120 -0.547898 -0.824009 0.759306 0.372721 0.780283 0.732484 0.867110 0.778464 0.502750 0.680827 0.845075
Horsepower 0.802444 0.788218 0.744445 -0.672636 -0.619044 0.732120 1.000000 0.036688 -0.600314 0.711790 0.009264 0.550865 0.486854 0.644413 0.561216 0.256732 0.359217 0.738798
RPM -0.042598 -0.004955 0.025015 0.363045 0.313469 -0.547898 0.036688 1.000000 0.494764 -0.333345 -0.467138 -0.441249 -0.467812 -0.539721 -0.505651 -0.342175 -0.524845 -0.427931
Rev.per.mile -0.470395 -0.426395 -0.374024 0.695857 0.587497 -0.824009 -0.600314 0.494764 1.000000 -0.609710 -0.334976 -0.690233 -0.636824 -0.780460 -0.733160 -0.377010 -0.592792 -0.735264
Fuel.tank.capacity 0.635369 0.619480 0.581294 -0.813144 -0.786039 0.759306 0.711790 -0.333345 -0.609710 1.000000 0.472095 0.690461 0.757674 0.798719 0.671343 0.509689 0.613437 0.894018
Passengers 0.061236 0.057860 0.053216 -0.416856 -0.466386 0.372721 0.009264 -0.467138 -0.334976 0.472095 1.000000 0.485294 0.694054 0.489979 0.449025 0.694134 0.653317 0.553273
Length 0.553859 0.503628 0.442933 -0.666239 -0.542897 0.780283 0.550865 -0.441249 -0.690233 0.690461 0.485294 1.000000 0.823650 0.822148 0.738955 0.549958 0.712962 0.806274
Wheelbase 0.516758 0.500864 0.467501 -0.667108 -0.615384 0.732484 0.486854 -0.467812 -0.636824 0.757674 0.694054 0.823650 1.000000 0.807213 0.723324 0.667259 0.734127 0.871895
Width 0.492878 0.456028 0.408414 -0.720534 -0.640359 0.867110 0.644413 -0.539721 -0.780460 0.798719 0.489979 0.822148 0.807213 1.000000 0.817854 0.465618 0.673490 0.874961
Turn.circle 0.428603 0.392590 0.347785 -0.666389 -0.593683 0.778464 0.561216 -0.505651 -0.733160 0.671343 0.449025 0.738955 0.723324 0.817854 1.000000 0.466328 0.585018 0.778043
Rear.seat.room 0.376642 0.311499 0.247260 -0.384347 -0.366684 0.502750 0.256732 -0.342175 -0.377010 0.509689 0.694134 0.549958 0.667259 0.465618 0.466328 1.000000 0.651968 0.526250
Luggage.room 0.413485 0.366569 0.315315 -0.494894 -0.371629 0.680827 0.359217 -0.524845 -0.592792 0.613437 0.653317 0.712962 0.734127 0.673490 0.585018 0.651968 1.000000 0.637226
Weight 0.666554 0.647179 0.605142 -0.843139 -0.810658 0.845075 0.738798 -0.427931 -0.735264 0.894018 0.553273 0.806274 0.871895 0.874961 0.778043 0.526250 0.637226 1.000000
In [49]:
import seaborn as sns

# Correlate the numeric columns only; corr() on a frame that still holds text
# columns raises in recent pandas versions.
corr = df.select_dtypes(include='number').corr()

fig, ax = plt.subplots(1, 1, figsize=(9, 9))

# Draw on the axes created above explicitly rather than relying on pyplot's
# implicit "current axes".
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            ax=ax)
Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0x10f2e47b8>
In [15]:
# Summary statistics for every numeric column. A `count` below the number of
# rows signals missing values (here Rear.seat.room and Luggage.room).
df.describe()
Out[15]:
Min.Price Price Max.Price MPG.city MPG.highway EngineSize Horsepower RPM Rev.per.mile Fuel.tank.capacity Passengers Length Wheelbase Width Turn.circle Rear.seat.room Luggage.room Weight
count 93.000000 93.000000 93.000000 93.000000 93.000000 93.000000 93.000000 93.000000 93.000000 93.000000 93.000000 93.000000 93.000000 93.000000 93.000000 91.000000 82.000000 93.000000
mean 17.125806 19.509677 21.898925 22.365591 29.086022 2.667742 143.827957 5280.645161 2332.204301 16.664516 5.086022 183.204301 103.946237 69.376344 38.956989 27.829670 13.890244 3072.903226
std 8.746029 9.659430 11.030457 5.619812 5.331726 1.037363 52.374410 596.731690 496.506525 3.279370 1.038979 14.602382 6.819674 3.778986 3.223265 2.989072 2.997967 589.896510
min 6.700000 7.400000 7.900000 15.000000 20.000000 1.000000 55.000000 3800.000000 1320.000000 9.200000 2.000000 141.000000 90.000000 60.000000 32.000000 19.000000 6.000000 1695.000000
25% 10.800000 12.200000 14.700000 18.000000 26.000000 1.800000 103.000000 4800.000000 1985.000000 14.500000 4.000000 174.000000 98.000000 67.000000 37.000000 26.000000 12.000000 2620.000000
50% 14.700000 17.700000 19.600000 21.000000 28.000000 2.400000 140.000000 5200.000000 2340.000000 16.400000 5.000000 183.000000 103.000000 69.000000 39.000000 27.500000 14.000000 3040.000000
75% 20.300000 23.300000 25.300000 25.000000 31.000000 3.300000 170.000000 5750.000000 2565.000000 18.800000 6.000000 192.000000 110.000000 72.000000 41.000000 30.000000 15.000000 3525.000000
max 45.400000 61.900000 80.000000 46.000000 50.000000 5.700000 300.000000 6500.000000 3755.000000 27.000000 8.000000 219.000000 119.000000 78.000000 45.000000 36.000000 22.000000 4105.000000
In [16]:
# NOTE(review): same call as the earlier dtypes cell (In [5]) and the recorded
# output is identical; this cell looks redundant.
df.dtypes
Out[16]:
Manufacturer           object
Model                  object
Type                   object
Min.Price             float64
Price                 float64
Max.Price             float64
MPG.city                int64
MPG.highway             int64
AirBags                object
DriveTrain             object
Cylinders              object
EngineSize            float64
Horsepower              int64
RPM                     int64
Rev.per.mile            int64
Man.trans.avail        object
Fuel.tank.capacity    float64
Passengers              int64
Length                  int64
Wheelbase               int64
Width                   int64
Turn.circle             int64
Rear.seat.room        float64
Luggage.room          float64
Weight                  int64
Origin                 object
Make                   object
dtype: object
In [52]:
# Frequency of each value in the Type column, most frequent first.
df.Type.value_counts()
Out[52]:
Midsize    22
Small      21
Compact    16
Sporty     14
Large      11
Van         9
Name: Type, dtype: int64
In [50]:
# Frequency of each value in the Origin column. value_counts() skips missing
# values by default, so counts that add up to the full row count mean the
# column has no missing entries.
df.Origin.value_counts()
Out[50]:
USA        48
non-USA    45
Name: Origin, dtype: int64
In [7]:
# Boolean mask selecting rows whose Origin is 'non-USA' and whose Type is
# 'Midsize'. Each comparison is parenthesised because `&` binds tighter
# than `==`.
condition = (df['Origin'] == 'non-USA') & (df['Type'] == 'Midsize')

# Shape of the filtered frame: (matching rows, columns).
df[condition].shape
Out[7]:
(10, 27)
In [8]:
# Select the rows where the chosen column is missing and report how many
# there are.
col = 'Origin'
condition = df[col].isnull()

# NOTE(review): the recorded output (4, 27) does not agree with the Origin
# value counts shown earlier in the notebook, which add up to all 93 rows
# (i.e. no missing values). This cell was probably last executed with a
# different `col`; re-run to confirm.
df[condition].shape
Out[8]:
(4, 27)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [20]:
import numpy as np

# Add log(1 + Weight) as a new column. np.log1p is applied to the whole
# column in one vectorized call instead of invoking a Python lambda per row.
df['log_weight'] = np.log1p(df['Weight'])
In [21]:
# Histogram of the log-transformed weights.
# NOTE(review): 100 bins exceed the 93 rows of the frame, so the histogram is
# necessarily sparse; a smaller bin count may read better.
df['log_weight'].hist(bins = 100)
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x10e0c16a0>
In [23]:
# Histogram of the raw weights, for comparison with the log-transformed one.
# NOTE(review): 100 bins exceed the 93 rows of the frame, so the histogram is
# necessarily sparse; a smaller bin count may read better.
df['Weight'].hist(bins = 100)
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x10e203630>
In [ ]: