We're going to start by importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.datasets import load_iris
import pydot
from IPython.display import Image
1 - Exploring the dataset
house = 'C:/Users/HP/Anaconda3/Lib/site-packages/notebook/train.csv'
df_train = pd.read_csv(house, sep = ',')
df_train.describe()
df_train.head(5)
2 - Analysis of the target variable: 'SalePrice'
df_train['SalePrice'].describe()
# Histogram of the target variable to check for skewness
sns.distplot(df_train['SalePrice'])
We notice a positive skew; it is preferable to apply log(1+x) to this variable to bring its distribution closer to normal.
df_train["SalePrice"] = np.log1p(df_train["SalePrice"])
# The new distribution
sns.distplot(df_train['SalePrice'] )
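To quantify the improvement, we can compute the skewness coefficient of the transformed target (a quick check using scipy.stats.skew; the exact value depends on the data):
from scipy.stats import skew
# Skewness of the log-transformed target: values close to 0 indicate a roughly symmetric distribution
print("Skewness of SalePrice after log1p: {:.3f}".format(skew(df_train['SalePrice'])))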
3 - Studying the correlations between the variables
k = 10
# Correlations computed on the numeric columns only (non-numeric columns are excluded explicitly)
corrmat = df_train.select_dtypes(exclude=['object']).corr()
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(df_train[cols].values.T)
plt.rcParams['figure.figsize'] = (15, 5)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
We notice that the variables most correlated with the target are 'OverallQual' and 'GrLivArea'. It is also worth looking at multicollinearity between the predictors: for example, 'GarageArea' and 'GarageCars' are very strongly correlated, which is expected given the variable descriptions provided with the data, so we can keep only one of them. The same applies to 'TotalBsmtSF' and '1stFlrSF', and to 'TotRmsAbvGrd' and 'GrLivArea'.
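To see these multicollinearities at a glance, the strongly correlated predictor pairs can be listed directly from the correlation matrix (a small sketch; the 0.8 cutoff is an arbitrary illustrative choice):
# List pairs of variables whose absolute correlation exceeds 0.8 (threshold chosen for illustration)
corr_pairs = corrmat.abs().unstack().sort_values(ascending=False)
corr_pairs = corr_pairs[(corr_pairs > 0.8) & (corr_pairs < 1.0)]
print(corr_pairs.drop_duplicates())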
4 - Missing values
valid = 'C:/Users/HP/Anaconda3/Lib/site-packages/notebook/test.csv'
df_test = pd.read_csv(valid, sep = ',')
# Deal with multicollinearity:
df_train = df_train.drop(columns=['1stFlrSF', 'GarageCars', 'TotRmsAbvGrd'])
df_test = df_test.drop(columns=['1stFlrSF', 'GarageCars', 'TotRmsAbvGrd'])
df_test.columns
print(df_train.isnull().sum())
print(df_test.isnull().sum())
# Helper: RMSE between the logs of predicted and true prices (expects prices on the original, untransformed scale).
# The evaluation calls come later, once a model has been fit and predictions exist.
def log_rmse(yhat, ytrue):
    return np.sqrt(mean_squared_error(np.log(yhat), np.log(ytrue)))
We assume that if the percentage of missing values in a column is above 50 percent, we drop that column.
total = df_train.isnull().sum().sort_values(ascending=False)
percent = ((df_train.isnull().sum()/df_train.isnull().count())*100).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
df_train = df_train.drop(columns=missing_data[missing_data['Percent'] > 50].index)
df_train.isnull().sum().sort_values(ascending=False)
df_test = df_test.drop(columns=missing_data[missing_data['Percent'] > 50].index)
df_test.isnull().sum().sort_values(ascending=False)
Fill in the missing numerical values with the mean
# Fill missing values for the numerical variables with the column mean:
numerical_features = df_train.select_dtypes(exclude=['object']).columns
numerical_features
N = ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '2ndFlrSF', 'LowQualFinSF',
'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'GarageYrBlt',
'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
for col in N:
    df_train[col] = df_train[col].fillna(df_train[col].mean())
    df_test[col] = df_test[col].fillna(df_test[col].mean())
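As a sketch of an equivalent alternative (not what the rest of the notebook relies on), sklearn's SimpleImputer can learn the column means on the training set and reuse them on the test set, so the test data never influences the imputation values:
# Sketch: mean imputation with statistics learned on the training set only
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='mean')
df_train[N] = imp.fit_transform(df_train[N])
df_test[N] = imp.transform(df_test[N])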
Replace missing values for the categorical variables with 'None'
categorical_features = df_train.select_dtypes(include=['object']).columns
categorical_features
C = ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']
for col in C:
    df_train[col] = df_train[col].fillna('None')
    df_test[col] = df_test[col].fillna('None')
#Label encoding
# Join the two datasets before encoding
df_join = pd.concat([df_train,df_test])
for col in C:
    le = LabelEncoder()
    le.fit(df_join[col])
    df_train[col] = le.transform(df_train[col])
    df_test[col] = le.transform(df_test[col])
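Fitting each encoder on the concatenation of train and test guards against categories that only appear in the test file. A quick check (for illustration only) of which encoded categories occur in the test set but never in the training set:
# Encoded categories present in the test set but absent from the training set
for col in C:
    only_in_test = set(df_test[col].unique()) - set(df_train[col].unique())
    if only_in_test:
        print(col, "has categories unseen in training:", only_in_test)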
5 - Testing the different models
Y = df_train['SalePrice']
X = df_train.drop(columns=['SalePrice'])
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2)
clf = RandomForestRegressor(n_estimators=1000 )
param_grid = {"max_depth": [3, 6, 9, 12],
              "min_samples_split": [2, 3, 5]}
gs = GridSearchCV(clf, param_grid=param_grid, cv=3, verbose = 2)
gs.fit(X_train, Y_train)  # fit the grid search on the training split only, so X_test remains a genuine hold-out
yhat_train = gs.best_estimator_.predict(X_train)
yhat_test = gs.best_estimator_.predict(X_test)
Train_score = np.sqrt(mean_squared_error(yhat_train, Y_train))
Test_score = np.sqrt(mean_squared_error(yhat_test, Y_test))
print("test {:.4f} train {:.4f} ".format(Test_score, Train_score))
X_valid = df_test[X.columns]  # keep the same column order as the training matrix
yhat_valid = gs.best_estimator_.predict(X_valid)
yhat_valid = np.expm1(yhat_valid)  # invert the log1p transform back to the price scale
results = pd.DataFrame(columns = ['Id', 'SalePrice'])
results['Id'] = X_valid.index + 1461
results['SalePrice'] =yhat_valid
results.to_csv("submission_RFNew.csv", index = False)
After building the Random Forest model, I will try an SVR (Support Vector Regression) model.
clf1 = SVR()
clf1.fit(X_train,Y_train)
yhat_train1 = clf1.predict(X_train)
yhat_test1 = clf1.predict(X_test)
#Scoring
Train_score1 = np.sqrt(mean_squared_error(yhat_train1, Y_train))
Test_score1 = np.sqrt(mean_squared_error(yhat_test1, Y_test))
print("test {:.4f} train {:.4f} ".format(Test_score1, Train_score1))
We notice that the model overfits.
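SVR is sensitive to the scale of the features, so part of this behaviour may simply come from fitting it on unscaled data. A hedged sketch of a scaled variant (not the model that was submitted):
# Sketch: SVR inside a pipeline that scales the features first
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
svr_scaled = make_pipeline(RobustScaler(), SVR())
svr_scaled.fit(X_train, Y_train)
print("Scaled SVR test RMSE: {:.4f}".format(np.sqrt(mean_squared_error(svr_scaled.predict(X_test), Y_test))))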
# Validation on the test dataset
X_valid = df_test
yhat_valid1 = clf1.predict(X_valid)
yhat_valid1 = np.expm1(yhat_valid1)
#Submit :
results = pd.DataFrame(columns = ['Id', 'SalePrice'])
results['Id'] = X_valid.index + 1461
results['SalePrice'] =yhat_valid1
results.to_csv("submission_LR5.csv", index = False)
#Lasso model
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet, Lasso
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
lasso.fit(X_train,Y_train)
yhat_train2 = lasso.predict(X_train)
yhat_test2 = lasso.predict(X_test)
Train_score2 = np.sqrt(mean_squared_error(yhat_train2, Y_train))
Test_score2 = np.sqrt(mean_squared_error(yhat_test2, Y_test))
print("test {:.4f} train {:.4f} ".format(Test_score2, Train_score2))
The Kaggle score for this model is 0.12666, which corresponds to rank 1503.
# Validation on the test dataset
X_valid = df_test
yhat_valid2 = lasso.predict(X_valid)
yhat_valid2 = np.expm1(yhat_valid2)
results = pd.DataFrame(columns = ['Id', 'SalePrice'])
results['Id'] = X_valid.index + 1461
results['SalePrice'] =yhat_valid2
results.to_csv("submission_Lasso.csv", index = False)
#Elastic net :
# Shuffle the dataframes (note: X and Y were already built above, so this reshuffle does not change them,
# and train_test_split shuffles before splitting anyway)
df_train = df_train.sample(frac=1)
df_test = df_test.sample(frac=1)
# Re-split the dataset
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2)
# Build the model:
from sklearn import linear_model
EN = linear_model.ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10], l1_ratio=[.01, .1, .5, .9, .99], max_iter=5000)
# Fit the model:
EN.fit(X_train, Y_train)
yhat_train3 = EN.predict(X_train)
yhat_test3 = EN.predict(X_test)
Train_score3 = np.sqrt(mean_squared_error(yhat_train3, Y_train))
Test_score3 = np.sqrt(mean_squared_error(yhat_test3, Y_test))
print("test {:.4f} train {:.4f} ".format(Test_score3, Train_score3))
# Validation on the test dataset
X_valid = df_test
yhat_valid3 = EN.predict(X_valid)
yhat_valid3 = np.expm1(yhat_valid3)
results = pd.DataFrame(columns = ['Id', 'SalePrice'])
results['Id'] = X_valid.index + 1461
results['SalePrice'] =yhat_valid3
results.to_csv("submission_ENET.csv", index = False)
# Average the Lasso and ElasticNet predictions on the original price scale
yhat_validAvg = (np.expm1(lasso.predict(X_valid)) + np.expm1(EN.predict(X_valid))) / 2
results = pd.DataFrame(columns = ['Id', 'SalePrice'])
results['Id'] = X_valid.index + 1461
results['SalePrice'] =yhat_validAvg
results.to_csv("submission_Avg2.csv", index = False)
This averaging did not reduce the error, so I opt for the Lasso model, since it gives the lowest prediction error.
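To make this final choice explicit, the hold-out RMSEs computed above can be placed side by side (a small summary sketch reusing the scores already computed; the exact values depend on the random split):
# Hold-out RMSE of each model on the log-transformed target (lower is better)
summary = pd.DataFrame({'model': ['Random Forest', 'SVR', 'Lasso', 'ElasticNet'],
                        'test_RMSE': [Test_score, Test_score1, Test_score2, Test_score3]})
print(summary.sort_values('test_RMSE'))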