import warnings
warnings.filterwarnings("ignore")
import pandas as pd
# Load the Cars93 dataset (path is relative to the working directory).
df = pd.read_csv('../data/cars93.csv')
df.head(2)

# Per-column summary: number of distinct values (NaN counts as one value,
# as returned by Series.unique()) and number of missing entries.
# Columns with at least one missing entry are collected for removal.
cols_with_nan = []
for col in df.columns:
    n_unique = len(df[col].unique())
    # Count missing entries once instead of building a filtered frame twice.
    n_nan = int(df[col].isnull().sum())
    print(f"{col} {n_unique} values {n_nan} NaN ")
    if n_nan > 0:
        cols_with_nan.append(col)
cols_with_nan

# Remove every column that contains NaN values.
df = df.drop(columns=cols_with_nan)
df.shape
# Label encoding: replace every object-dtype (string) column with integer
# codes so that all remaining columns are numeric.
# NOTE(review): the integer codes impose an arbitrary order on the
# categories; confirm this is acceptable for the linear model fitted later.
from sklearn.preprocessing import LabelEncoder

# The list of object columns is computed once, before any column is replaced.
for col in df.select_dtypes(include="object").columns:
    # A fresh encoder per column: each column gets its own code mapping.
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
df.head()
from sklearn.preprocessing import StandardScaler

# Target: highway mileage, read from the frame before any scaling happens.
y = df['MPG.highway'].values

# Features: every column except the two mileage measurements.
X = df.drop(['MPG.city', 'MPG.highway'], axis=1)
X.head()
X.shape
X.describe()

# Learn the scaling parameters on X, then replace X with its scaled version
# (the result of transform is an array, no longer a DataFrame).
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X
# Model: linear regression trained with stochastic gradient descent.
# All imports this stage needs come first, so no name is used before it is
# defined (mean_squared_error was previously called before being imported,
# and plt was never imported in this file).
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

mdl = SGDRegressor(alpha=0.01)
mdl.fit(X, y)

# Predictions are made on the same X used for fitting, so the error below
# is a training (in-sample) error, not a held-out estimate.
yhat = mdl.predict(X)

# Arguments follow the documented (y_true, y_pred) order; the value is the
# same as with the arguments swapped because squared error is symmetric.
mse = mean_squared_error(y, yhat)
print(mse)

# Mean of the target, for comparing against the error magnitude.
np.mean(y)

# Residuals by sample index, then predicted vs. actual values.
plt.plot(yhat - y)
plt.plot(yhat, y, '.')