Dans ce notebook , on a testé trois modélisations après le preprocessing et feature engineering .La premiere méthode était de faire une réduction du nombre de variables à l'aide de xgboost regressor , dans la deuxieme méthode on a utilisé une réduction de dimensions en essayant de trouver les axes principales à l'aide de l'analyse des composantes principales.

In [42]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

Exploration et analyse des données

In [43]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Il y a quelques maisons avec la variable GrvLivArea supérieure à 4000 qu'on peut considérer comme variables abbérrantes. 
# On les élimine donc de notre dataset de training.

train_df.drop(train_df[train_df["GrLivArea"] > 4000].index, inplace=True)
test_df.loc[666, "GarageQual"] = "TA"
test_df.loc[666, "GarageCond"] = "TA"
test_df.loc[666, "GarageFinish"] = "Unf"
test_df.loc[666, "GarageYrBlt"] = "1980"

# Dans la dataset de test, la ligne 1116 a uniquement GarageType et aucune information supplémentaire. On suppose
# qu'elle n'a pas de garage.
test_df.loc[1116, "GarageType"] = np.nan

# Remplacer les valeurs manquantes de LotFrontage values par la médianne.
# LotFrontage par quartier.
lot_frontage_by_neighborhood = train_df["LotFrontage"].groupby(train_df["Neighborhood"])

# Convertir les variables catégorielles en variables ordinales.

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

def factorize(df, factor_df, column, fill_na=None):
    factor_df[column] = df[column]
    if fill_na is not None:
        factor_df[column].fillna(fill_na, inplace=True)
    le.fit(factor_df[column].unique())
    factor_df[column] = le.transform(factor_df[column])
    return factor_df

# Combiner toutes les variables numériques en une seule dataset. 
def munge(df):
    df_global = pd.DataFrame(index = df.index)
   
    df_global["LotFrontage"] = df["LotFrontage"]   
    for key, group in lot_frontage_by_neighborhood:
        idx = (df["Neighborhood"] == key) & (df["LotFrontage"].isnull())
        df_global.loc[idx, "LotFrontage"] = group.median()    

    df_global["LotArea"] = df["LotArea"]

    df_global["MasVnrArea"] = df["MasVnrArea"]
    df_global["MasVnrArea"].fillna(0, inplace=True)
   
    df_global["BsmtFinSF1"] = df["BsmtFinSF1"]
    df_global["BsmtFinSF1"].fillna(0, inplace=True)

    df_global["BsmtFinSF2"] = df["BsmtFinSF2"]
    df_global["BsmtFinSF2"].fillna(0, inplace=True)

    df_global["BsmtUnfSF"] = df["BsmtUnfSF"]
    df_global["BsmtUnfSF"].fillna(0, inplace=True)

    df_global["TotalBsmtSF"] = df["TotalBsmtSF"]
    df_global["TotalBsmtSF"].fillna(0, inplace=True)

    df_global["1stFlrSF"] = df["1stFlrSF"]
    df_global["2ndFlrSF"] = df["2ndFlrSF"]
    df_global["GrLivArea"] = df["GrLivArea"]
    
    df_global["GarageArea"] = df["GarageArea"]
    df_global["GarageArea"].fillna(0, inplace=True)

    df_global["WoodDeckSF"] = df["WoodDeckSF"]
    df_global["OpenPorchSF"] = df["OpenPorchSF"]
    df_global["EnclosedPorch"] = df["EnclosedPorch"]
    df_global["3SsnPorch"] = df["3SsnPorch"]
    df_global["ScreenPorch"] = df["ScreenPorch"]
    
    df_global["BsmtFullBath"] = df["BsmtFullBath"]
    df_global["BsmtFullBath"].fillna(0, inplace=True)

    df_global["BsmtHalfBath"] = df["BsmtHalfBath"]
    df_global["BsmtHalfBath"].fillna(0, inplace=True)

    df_global["FullBath"] = df["FullBath"] 
    df_global["HalfBath"] = df["HalfBath"] 
    df_global["BedroomAbvGr"] = df["BedroomAbvGr"] 
    df_global["KitchenAbvGr"] = df["KitchenAbvGr"] 
    df_global["TotRmsAbvGrd"] = df["TotRmsAbvGrd"] 
    df_global["Fireplaces"] = df["Fireplaces"] 

    df_global["GarageCars"] = df["GarageCars"]
    df_global["GarageCars"].fillna(0, inplace=True)

    df_global["CentralAir"] = (df["CentralAir"] == "Y") * 1.0
   
    df_global["OverallQual"] = df["OverallQual"]
    df_global["OverallCond"] = df["OverallCond"]

    # Les variables qui définissent la qualité peuvent étre gardées sous forme de test mais on peut les convertir 
    # en variables numériques, plus grand le nombre, la meilleure la qualité, et vis versa.

    qual_dict = {None: 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
    df_global["ExterQual"] = df["ExterQual"].map(qual_dict).astype(int)
    df_global["ExterCond"] = df["ExterCond"].map(qual_dict).astype(int)
    df_global["BsmtQual"] = df["BsmtQual"].map(qual_dict).astype(int)
    df_global["BsmtCond"] = df["BsmtCond"].map(qual_dict).astype(int)
    df_global["HeatingQC"] = df["HeatingQC"].map(qual_dict).astype(int)
    df_global["KitchenQual"] = df["KitchenQual"].map(qual_dict).astype(int)
    df_global["FireplaceQu"] = df["FireplaceQu"].map(qual_dict).astype(int)
    df_global["GarageQual"] = df["GarageQual"].map(qual_dict).astype(int)
    df_global["GarageCond"] = df["GarageCond"].map(qual_dict).astype(int)

    df_global["BsmtExposure"] = df["BsmtExposure"].map(
        {None: 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4}).astype(int)

    bsmt_fin_dict = {None: 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}
    df_global["BsmtFinType1"] = df["BsmtFinType1"].map(bsmt_fin_dict).astype(int)
    df_global["BsmtFinType2"] = df["BsmtFinType2"].map(bsmt_fin_dict).astype(int)

    df_global["Functional"] = df["Functional"].map(
        {None: 0, "Sal": 1, "Sev": 2, "Maj2": 3, "Maj1": 4, 
         "Mod": 5, "Min2": 6, "Min1": 7, "Typ": 8}).astype(int)

    df_global["GarageFinish"] = df["GarageFinish"].map(
        {None: 0, "Unf": 1, "RFn": 2, "Fin": 3}).astype(int)

    df_global["Fence"] = df["Fence"].map(
        {None: 0, "MnWw": 1, "GdWo": 2, "MnPrv": 3, "GdPrv": 4}).astype(int)

    df_global["YearBuilt"] = df["YearBuilt"]
    df_global["YearRemodAdd"] = df["YearRemodAdd"]

    df_global["GarageYrBlt"] = df["GarageYrBlt"]
    df_global["GarageYrBlt"].fillna(0.0, inplace=True)

    df_global["MoSold"] = df["MoSold"]
    df_global["YrSold"] = df["YrSold"]
    
    df_global["LowQualFinSF"] = df["LowQualFinSF"]
    df_global["MiscVal"] = df["MiscVal"]

    df_global["PoolQC"] = df["PoolQC"].map(qual_dict).astype(int)

    df_global["PoolArea"] = df["PoolArea"]
    df_global["PoolArea"].fillna(0, inplace=True)
    
    # Ajouter les variables catégoriques en variables numériques
    df_global = factorize(df, df_global, "MSSubClass")
    df_global = factorize(df, df_global, "MSZoning", "RL")
    df_global = factorize(df, df_global, "LotConfig")
    df_global = factorize(df, df_global, "Neighborhood")
    df_global = factorize(df, df_global, "Condition1")
    df_global = factorize(df, df_global, "BldgType")
    df_global = factorize(df, df_global, "HouseStyle")
    df_global = factorize(df, df_global, "RoofStyle")
    df_global = factorize(df, df_global, "Exterior1st", "Other")
    df_global = factorize(df, df_global, "Exterior2nd", "Other")
    df_global = factorize(df, df_global, "MasVnrType", "None")
    df_global = factorize(df, df_global, "Foundation")
    df_global = factorize(df, df_global, "SaleType", "Oth")
    df_global = factorize(df, df_global, "SaleCondition")

    # Les valeurs IR2 et IR3 n'apparaissent pas assez fréquemment, on choisit donc de faire la distinction uniquement
    # entre regular et irregular.
    df_global["IsRegularLotShape"] = (df["LotShape"] == "Reg") * 1

    # La majorité des propriétés sont level; on traite les autres possibilités comme
    # étant "not level".
    df_global["IsLandLevel"] = (df["LandContour"] == "Lvl") * 1

    # La majorité des LandSLopes sont "gentle"; on traite les autres variables en tant que "not gentle".
    df_global["IsLandSlopeGentle"] = (df["LandSlope"] == "Gtl") * 1

    # La valeur prépondérante est standard circuit breakers, on traite les autres variables en tant que "not SBrkr".
    df_global["IsElectricalSBrkr"] = (df["Electrical"] == "SBrkr") * 1

    # La valeur la plus fréquente de GarageType est "Detchd", on traite les autres valeurs en tant que "not Detchd".
    df_global["IsGarageDetached"] = (df["GarageType"] == "Detchd") * 1

    # La valeur la plus fréquente est "PavedDrive". On traite dirt/gravel et partial pavement
    # en tant que "not paved".
    df_global["IsPavedDrive"] = (df["PavedDrive"] == "Y") * 1

    # La seule caractéristique intéressante est la présence de "Shed".
    df_global["HasShed"] = (df["MiscFeature"] == "Shed") * 1.  

    # Si YearRemodAdd != YearBuilt, donc une rénovation a eu lieu.
    df_global["Remodeled"] = (df_global["YearRemodAdd"] != df_global["YearBuilt"]) * 1
    
    # Rénovation de la maison l'année où elle a été vendue?
    df_global["RecentRemodel"] = (df_global["YearRemodAdd"] == df_global["YrSold"]) * 1
    
    # Maison vendue l'année où elle a été batie?
    df_global["VeryNewHouse"] = (df_global["YearBuilt"] == df_global["YrSold"]) * 1

    df_global["Has2ndFloor"] = (df_global["2ndFlrSF"] == 0) * 1
    df_global["HasMasVnr"] = (df_global["MasVnrArea"] == 0) * 1
    df_global["HasWoodDeck"] = (df_global["WoodDeckSF"] == 0) * 1
    df_global["HasOpenPorch"] = (df_global["OpenPorchSF"] == 0) * 1
    df_global["HasEnclosedPorch"] = (df_global["EnclosedPorch"] == 0) * 1
    df_global["Has3SsnPorch"] = (df_global["3SsnPorch"] == 0) * 1
    df_global["HasScreenPorch"] = (df_global["ScreenPorch"] == 0) * 1

   
    # Les mois avec le plus grand nombre de ventes peut étre intéressant pour notre modèle.
    df_global["HighSeason"] = df["MoSold"].replace( 
        {1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 1, 7: 1, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0})

    df_global["NewerDwelling"] = df["MSSubClass"].replace(
        {20: 1, 30: 0, 40: 0, 45: 0,50: 0, 60: 1, 70: 0, 75: 0, 80: 0, 85: 0,
         90: 0, 120: 1, 150: 0, 160: 0, 180: 0, 190: 0})   
    
    df_global.loc[df.Neighborhood == 'NridgHt', "Neighborhood_Good"] = 1
    df_global.loc[df.Neighborhood == 'Crawfor', "Neighborhood_Good"] = 1
    df_global.loc[df.Neighborhood == 'StoneBr', "Neighborhood_Good"] = 1
    df_global.loc[df.Neighborhood == 'Somerst', "Neighborhood_Good"] = 1
    df_global.loc[df.Neighborhood == 'NoRidge', "Neighborhood_Good"] = 1
    df_global["Neighborhood_Good"].fillna(0, inplace=True)

    df_global["SaleCondition_PriceDown"] = df.SaleCondition.replace(
        {'Abnorml': 1, 'Alloca': 1, 'AdjLand': 1, 'Family': 1, 'Normal': 0, 'Partial': 0})

    # Maison completée avant vente ou pas?
    df_global["BoughtOffPlan"] = df.SaleCondition.replace(
        {"Abnorml" : 0, "Alloca" : 0, "AdjLand" : 0, "Family" : 0, "Normal" : 0, "Partial" : 1})
    
    df_global["BadHeating"] = df.HeatingQC.replace(
        {'Ex': 0, 'Gd': 0, 'TA': 0, 'Fa': 1, 'Po': 1})

    area_cols = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 
                 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'LowQualFinSF', 'PoolArea' ]
    df_global["TotalArea"] = df_global[area_cols].sum(axis=1)

    df_global["TotalArea1st2nd"] = df_global["1stFlrSF"] + df_global["2ndFlrSF"]

    df_global["Age"] = 2010 - df_global["YearBuilt"]
    df_global["TimeSinceSold"] = 2010 - df_global["YrSold"]

    df_global["SeasonSold"] = df_global["MoSold"].map({12:0, 1:0, 2:0, 3:1, 4:1, 5:1, 
                                                  6:2, 7:2, 8:2, 9:3, 10:3, 11:3}).astype(int)
    
    df_global["YearsSinceRemodel"] = df_global["YrSold"] - df_global["YearRemodAdd"]
    
    # Simplification des caractéristiques de qualité mauvais/normal/bon en valeurs numériques.
    df_global["SimplOverallQual"] = df_global.OverallQual.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2, 6 : 2, 7 : 3, 8 : 3, 9 : 3, 10 : 3})
    df_global["SimplOverallCond"] = df_global.OverallCond.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2, 6 : 2, 7 : 3, 8 : 3, 9 : 3, 10 : 3})
    df_global["SimplPoolQC"] = df_global.PoolQC.replace(
        {1 : 1, 2 : 1, 3 : 2, 4 : 2})
    df_global["SimplGarageCond"] = df_global.GarageCond.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
    df_global["SimplGarageQual"] = df_global.GarageQual.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
    df_global["SimplFireplaceQu"] = df_global.FireplaceQu.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
    df_global["SimplFireplaceQu"] = df_global.FireplaceQu.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
    df_global["SimplFunctional"] = df_global.Functional.replace(
        {1 : 1, 2 : 1, 3 : 2, 4 : 2, 5 : 3, 6 : 3, 7 : 3, 8 : 4})
    df_global["SimplKitchenQual"] = df_global.KitchenQual.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
    df_global["SimplHeatingQC"] = df_global.HeatingQC.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
    df_global["SimplBsmtFinType1"] = df_global.BsmtFinType1.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2, 6 : 2})
    df_global["SimplBsmtFinType2"] = df_global.BsmtFinType2.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2, 6 : 2})
    df_global["SimplBsmtCond"] = df_global.BsmtCond.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
    df_global["SimplBsmtQual"] = df_global.BsmtQual.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
    df_global["SimplExterCond"] = df_global.ExterCond.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
    df_global["SimplExterQual"] = df_global.ExterQual.replace(
        {1 : 1, 2 : 1, 3 : 1, 4 : 2, 5 : 2})
            
    # Classifier par neighborhood un peu arbitrairement. Les valeurs ont été calculées en utilisant la fonction:
    # train_df["SalePrice"].groupby(train_df["Neighborhood"]).median().sort_values()
    carte_voisin = {
        "MeadowV" : 0,  #  88000
        "IDOTRR" : 1,   # 103000
        "BrDale" : 1,   # 106000
        "OldTown" : 1,  # 119000
        "Edwards" : 1,  # 119500
        "BrkSide" : 1,  # 124300
        "Sawyer" : 1,   # 135000
        "Blueste" : 1,  # 137500
        "SWISU" : 2,    # 139500
        "NAmes" : 2,    # 140000
        "NPkVill" : 2,  # 146000
        "Mitchel" : 2,  # 153500
        "SawyerW" : 2,  # 179900
        "Gilbert" : 2,  # 181000
        "NWAmes" : 2,   # 182900
        "Blmngtn" : 2,  # 191000
        "CollgCr" : 2,  # 197200
        "ClearCr" : 3,  # 200250
        "Crawfor" : 3,  # 200624
        "Veenker" : 3,  # 218000
        "Somerst" : 3,  # 225500
        "Timber" : 3,   # 228475
        "StoneBr" : 4,  # 278000
        "NoRidge" : 4,  # 290000
        "NridgHt" : 4,  # 315000
    }

    df_global["NeighborhoodBin"] = df["Neighborhood"].map(carte_voisin)
    return df_global

train_df_munged = munge(train_df)
test_df_munged = munge(test_df)

print(train_df_munged.shape)
print(test_df_munged.shape)

# Copier NeighborhoodBin en une dataset temporaire parce qu'on veut utiliser la version unscaled après
# pour lui appliquer one-hot encode. 
neighborhood_bin_train = pd.DataFrame(index = train_df.index)
neighborhood_bin_train["NeighborhoodBin"] = train_df_munged["NeighborhoodBin"]
neighborhood_bin_test = pd.DataFrame(index = test_df.index)
neighborhood_bin_test["NeighborhoodBin"] = test_df_munged["NeighborhoodBin"]

################################################################################

numeric_features = train_df_munged.dtypes[train_df_munged.dtypes != "object"].index

# Transformer les variables numériques qui sont skewed en leur appliquant la fonction log(variable + 1).
# Ça va rendre les variables plus normales.
from scipy.stats import skew

skewed = train_df_munged[numeric_features].apply(lambda x: skew(x.dropna().astype(float)))
skewed = skewed[skewed > 0.75]
skewed = skewed.index

train_df_munged[skewed] = np.log1p(train_df_munged[skewed])
test_df_munged[skewed] = np.log1p(test_df_munged[skewed])

# Processing: scaling de notre dataset   
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train_df_munged[numeric_features])

scaled = scaler.transform(train_df_munged[numeric_features])
for i, col in enumerate(numeric_features):
    train_df_munged[col] = scaled[:, i]

scaled = scaler.transform(test_df_munged[numeric_features])
for i, col in enumerate(numeric_features):
    test_df_munged[col] = scaled[:, i]

################################################################################

# Convertir les caractéristiques catégorielles en utilisant one-hot encoding.
def onehot(df_onehot_encoder, df, column_name, fill_na, drop_name):
    df_onehot_encoder[column_name] = df[column_name]
    if fill_na is not None:
        df_onehot_encoder[column_name].fillna(fill_na, inplace=True)

    dummies = pd.get_dummies(df_onehot_encoder[column_name], prefix="_" + column_name)
    
    

    df_onehot_encoder = df_onehot_encoder.join(dummies)
    df_onehot_encoder = df_onehot_encoder.drop([column_name], axis=1)
    return df_onehot_encoder

def munge_onehot(df):
    df_onehot_encoder = pd.DataFrame(index = df.index)

    df_onehot_encoder = onehot(df_onehot_encoder, df, "MSSubClass", None, "40")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "MSZoning", "RL", "RH")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "LotConfig", None, "FR3")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "Neighborhood", None, "OldTown")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "Condition1", None, "RRNe")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "BldgType", None, "2fmCon")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "HouseStyle", None, "1.5Unf")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "RoofStyle", None, "Shed")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "Exterior1st", "VinylSd", "CBlock")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "Exterior2nd", "VinylSd", "CBlock")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "Foundation", None, "Wood")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "SaleType", "WD", "Oth")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "SaleCondition", "Normal", "AdjLand")

   
    temp_df = df[["MasVnrType", "MasVnrArea"]].copy()
    idx = (df["MasVnrArea"] != 0) & ((df["MasVnrType"] == "None") | (df["MasVnrType"].isnull()))
    temp_df.loc[idx, "MasVnrType"] = "BrkFace"
    df_onehot_encoder = onehot(df_onehot_encoder, temp_df, "MasVnrType", "None", "BrkCmn")

    # Rajouter les variables booléennes de calc_df comme dummy variables. 
    df_onehot_encoder = onehot(df_onehot_encoder, df, "LotShape", None, "IR3")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "LandContour", None, "Low")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "LandSlope", None, "Sev")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "Electrical", "SBrkr", "FuseP")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "GarageType", "None", "CarPort")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "PavedDrive", None, "P")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "MiscFeature", "None", "Othr")

    # Variables que l'on peut ignorer mais que l'on choisit de prendre compte dans notre modèle pour voir
    # si elles influencent d'une manière ou d'une autre.
    # On choisit d'ignorer la variable Utilities parce que toutes ses valeurs sont égales à AllPub, à l'excepté
    # d'une valeur "NoSeWa" dans la dataset de training et 2 NA dans la dataset de test.
    df_onehot_encoder = onehot(df_onehot_encoder, df, "Street", None, "Grvl")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "Alley", "None", "Grvl")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "Condition2", None, "PosA")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "RoofMatl", None, "WdShake")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "Heating", None, "Wall")

    # Transformer ces variables en variables numériques.
    df_onehot_encoder = onehot(df_onehot_encoder, df, "ExterQual", "None", "Ex")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "ExterCond", "None", "Ex")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "BsmtQual", "None", "Ex")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "BsmtCond", "None", "Ex")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "HeatingQC", "None", "Ex")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "KitchenQual", "TA", "Ex")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "FireplaceQu", "None", "Ex")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "GarageQual", "None", "Ex")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "GarageCond", "None", "Ex")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "PoolQC", "None", "Ex")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "BsmtExposure", "None", "Gd")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "BsmtFinType1", "None", "GLQ")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "BsmtFinType2", "None", "GLQ")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "Functional", "Typ", "Typ")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "GarageFinish", "None", "Fin")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "Fence", "None", "MnPrv")
    df_onehot_encoder = onehot(df_onehot_encoder, df, "MoSold", None, None)
    
    # Diviser les années entre 1871 et 2010 en tranches de 20 ans.
    year_map = pd.concat(pd.Series("YearBin" + str(i+1), index=range(1871+i*20,1891+i*20)) for i in range(0, 7))

    yearbin_df = pd.DataFrame(index = df.index)
    yearbin_df["GarageYrBltBin"] = df.GarageYrBlt.map(year_map)
    yearbin_df["GarageYrBltBin"].fillna("NoGarage", inplace=True)

    yearbin_df["YearBuiltBin"] = df.YearBuilt.map(year_map)
    yearbin_df["YearRemodAddBin"] = df.YearRemodAdd.map(year_map)
    
    df_onehot_encoder = onehot(df_onehot_encoder, yearbin_df, "GarageYrBltBin", None, None)
    df_onehot_encoder = onehot(df_onehot_encoder, yearbin_df, "YearBuiltBin", None, None)
    df_onehot_encoder = onehot(df_onehot_encoder, yearbin_df, "YearRemodAddBin", None, None)

    return df_onehot_encoder

# Ajout des one-hot encoded caractéristiques catégorielles.
df_onehot_encoder = munge_onehot(train_df)
df_onehot_encoder = onehot(df_onehot_encoder, neighborhood_bin_train, "NeighborhoodBin", None, None)
train_df_munged = train_df_munged.join(df_onehot_encoder)

# Ces colonnes onehot sont absentes de la dataset de test, on les élimine donc de la dataset de training par peur 
# d'avoir un modèle overfit à cause d'elles.
colonne_a_eliminer = [
                "_Exterior1st_ImStucc", "_Exterior1st_Stone",
                "_Exterior2nd_Other","_HouseStyle_2.5Fin", 
            
                "_RoofMatl_Membran", "_RoofMatl_Metal", "_RoofMatl_Roll",
                "_Condition2_RRAe", "_Condition2_RRAn", "_Condition2_RRNn",
                "_Heating_Floor", "_Heating_OthW",

                "_Electrical_Mix", 
                "_MiscFeature_TenC",
                "_GarageQual_Ex", "_PoolQC_Fa"
            ]
train_df_munged.drop(colonne_a_eliminer, axis=1, inplace=True)

df_onehot_encoder = munge_onehot(test_df)
df_onehot_encoder = onehot(df_onehot_encoder, neighborhood_bin_test, "NeighborhoodBin", None, None)
test_df_munged = test_df_munged.join(df_onehot_encoder)

# Cette colonne est absente dans la dataset de training. On a uniquement un exemple avec celle ci dans la dataset 
# de test, on choisit donc de l'éliminer.
test_df_munged.drop(["_MSSubClass_150"], axis=1, inplace=True)

# On élimine ces colonnes. Elles sont soit inutiles, soit elles mènent à l'overfitting de notre modèle.
colonne_a_eliminer = [
    "_Condition2_PosN",    # Deux uniquement sont différentes de zéro.
    "_MSZoning_C (all)",
    "_MSSubClass_160",
]
train_df_munged.drop(colonne_a_eliminer, axis=1, inplace=True)
test_df_munged.drop(colonne_a_eliminer, axis=1, inplace=True)

################################################################################

# On prend le log parce que l'erreur est calculée entre le log de "SalePrice" et le log du prix qu'on a estimé.
# On doit donc introduire la fonction exp() pour obtenir la vraie valeur de "SalePrice".

label_df = pd.DataFrame(index = train_df_munged.index, columns=["SalePrice"])
label_df["SalePrice"] = np.log(train_df["SalePrice"])

print("Training set size:", train_df_munged.shape)
print("Test set size:", test_df_munged.shape)
(1456, 111)
(1459, 111)
('Training set size:', (1456, 403))
('Test set size:', (1459, 403))

Première méthode : sélectionner automatiquement les meilleures caractéristiques

In [40]:
train_rania=train_df_munged
train_rania['SalePrice']=label_df

Définir une fonction pour génerer le score des variables de notre dataset , l'algorithme utilisé est xgboost pour le classement des meilleures caracteristiques

In [5]:
from sklearn.ensemble import  RandomForestRegressor
import xgboost as xgb

def feature_importance(df, target_name):
    # split variable dependante et indep
    X = np.array(df.drop([target_name], 1))
    Y = np.array(df[target_name])
    names = list(df.columns)
    
    rf = xgb.XGBRegressor(colsample_bytree=0.4,
                 gamma=0.045,                 
                 learning_rate=0.07,
                 max_depth=20,
                 min_child_weight=1.5,
                 n_estimators=300,                                                                    
                 reg_alpha=0.65,
                 reg_lambda=0.45,
                 subsample=0.95)
    rf.fit(X, Y)
    print "Features sorted by their score:"
    a = sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), names),
               reverse=True)

    print a
    
    
    return a


feature_importance = feature_importance(train_rania,'SalePrice')
/home/boumelha/.local/lib/python2.7/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
  from numpy.core.umath_tests import inner1d
Features sorted by their score:
[(0.0699, 'TotalArea'), (0.0629, 'TotalArea1st2nd'), (0.0509, 'OverallQual'), (0.0419, 'GrLivArea'), (0.0379, 'LotArea'), (0.0359, 'OverallCond'), (0.0349, 'TotalBsmtSF'), (0.0279, 'YearBuilt'), (0.0279, 'BsmtFinSF1'), (0.0259, 'SimplOverallQual'), (0.025, 'GarageYrBlt'), (0.025, '1stFlrSF'), (0.021, 'GarageArea'), (0.02, 'Neighborhood'), (0.017, 'OpenPorchSF'), (0.016, 'Functional'), (0.015, 'YearsSinceRemodel'), (0.015, 'WoodDeckSF'), (0.015, 'SaleCondition'), (0.015, '2ndFlrSF'), (0.014, 'SimplOverallCond'), (0.014, 'BsmtFinType1'), (0.013, 'ScreenPorch'), (0.012, 'MSZoning'), (0.011, 'Age'), (0.01, 'YearRemodAdd'), (0.01, 'Exterior1st'), (0.009, '_SaleType_New'), (0.009, '_Exterior1st_BrkFace'), (0.009, 'NeighborhoodBin'), (0.009, 'LotFrontage'), (0.009, 'BsmtExposure'), (0.008, '_Condition1_Norm'), (0.008, 'YrSold'), (0.008, 'BsmtUnfSF'), (0.008, 'BsmtFullBath'), (0.008, 'BsmtFinSF2'), (0.007, 'Condition1'), (0.006, '_SaleCondition_Normal'), (0.006, '_LotConfig_CulDSac'), (0.006, '_BsmtExposure_Gd'), (0.006, 'HeatingQC'), (0.006, 'FullBath'), (0.005, '_Neighborhood_NAmes'), (0.005, 'SimplFunctional'), (0.005, 'LotConfig'), (0.005, 'KitchenAbvGr'), (0.005, 'Fence'), (0.005, 'CentralAir'), (0.005, 'BsmtCond'), (0.004, '_SaleCondition_Family'), (0.004, '_Neighborhood_OldTown'), (0.004, '_GarageYrBltBin_YearBin7'), (0.004, '_ExterQual_Gd'), (0.004, 'MoSold'), (0.004, 'HighSeason'), (0.004, 'GarageCars'), (0.004, 'FireplaceQu'), (0.003, '_SaleCondition_Abnorml'), (0.003, '_NeighborhoodBin_2'), (0.003, '_MSSubClass_30'), (0.003, '_Functional_Typ'), (0.003, '_Exterior1st_MetalSd'), (0.003, '_ExterCond_Fa'), (0.003, '_Condition1_Feedr'), (0.003, 'TotRmsAbvGrd'), (0.003, 'SeasonSold'), (0.003, 'IsPavedDrive'), (0.003, 'HalfBath'), (0.003, 'GarageFinish'), (0.003, 'Foundation'), (0.003, 'EnclosedPorch'), (0.002, '_YearBuiltBin_YearBin4'), (0.002, '_Neighborhood_BrkSide'), (0.002, '_MoSold_6'), (0.002, '_MoSold_5'), (0.002, '_KitchenQual_Gd'), (0.002, '_Heating_Grav'), (0.002, '_Foundation_BrkTil'), (0.002, '_Exterior2nd_MetalSd'), (0.002, '_Exterior1st_Wd Sdng'), (0.002, '_ExterQual_Fa'), (0.002, '_ExterQual_Ex'), (0.002, '_BsmtQual_Ex'), (0.002, 'TimeSinceSold'), (0.002, 'SimplBsmtFinType1'), (0.002, 'SaleCondition_PriceDown'), (0.002, 'Neighborhood_Good'), (0.002, 'MasVnrArea'), (0.002, 'HouseStyle'), (0.002, 'GarageQual'), (0.002, 'Fireplaces'), (0.002, 'Exterior2nd'), (0.001, '_YearRemodAddBin_YearBin6'), (0.001, '_YearRemodAddBin_YearBin5'), (0.001, '_YearRemodAddBin_YearBin4'), (0.001, '_YearBuiltBin_YearBin2'), (0.001, '_PavedDrive_N'), (0.001, '_Neighborhood_IDOTRR'), (0.001, '_MasVnrType_BrkFace'), (0.001, '_MSZoning_RM'), (0.001, '_MSSubClass_90'), (0.001, '_LotConfig_Inside'), (0.001, '_LandSlope_Gtl'), (0.001, '_KitchenQual_Ex'), (0.001, '_HouseStyle_2.5Unf'), (0.001, '_HeatingQC_Gd'), (0.001, '_HeatingQC_Ex'), (0.001, '_GarageYrBltBin_YearBin4'), (0.001, '_GarageType_CarPort'), (0.001, '_GarageCond_TA'), (0.001, '_Functional_Mod'), (0.001, '_Exterior1st_Plywood'), (0.001, '_ExterQual_TA'), (0.001, '_ExterCond_TA'), (0.001, '_Condition1_RRAe'), (0.001, '_Condition1_Artery'), (0.001, '_BsmtFinType1_Unf'), (0.001, '_BsmtFinType1_LwQ'), (0.001, '_BsmtCond_Fa'), (0.001, '_BldgType_Duplex'), (0.001, '_Alley_Pave'), (0.001, '_Alley_Grvl'), (0.001, 'SimplKitchenQual'), (0.001, 'SimplHeatingQC'), (0.001, 'SimplExterQual'), (0.001, 'SaleType'), (0.001, 'MasVnrType'), (0.001, 'KitchenQual'), (0.001, 'IsRegularLotShape'), (0.001, 'HasWoodDeck'), (0.001, 'HasEnclosedPorch'), (0.001, 'GarageCond'), (0.001, 'ExterQual'), (0.001, 'ExterCond'), (0.001, 'BsmtQual'), (0.001, 'BedroomAbvGr'), (0.0, '_YearRemodAddBin_YearBin7'), (0.0, '_YearBuiltBin_YearBin7'), (0.0, '_YearBuiltBin_YearBin6'), (0.0, '_YearBuiltBin_YearBin5'), (0.0, '_YearBuiltBin_YearBin3'), (0.0, '_YearBuiltBin_YearBin1'), (0.0, '_Street_Pave'), (0.0, '_Street_Grvl'), (0.0, '_SaleType_WD'), (0.0, '_SaleType_Oth'), (0.0, '_SaleType_ConLw'), (0.0, '_SaleType_ConLI'), (0.0, '_SaleType_ConLD'), (0.0, '_SaleType_Con'), (0.0, '_SaleType_CWD'), (0.0, '_SaleType_COD'), (0.0, '_SaleCondition_Partial'), (0.0, '_SaleCondition_Alloca'), (0.0, '_SaleCondition_AdjLand'), (0.0, '_RoofStyle_Shed'), (0.0, '_RoofStyle_Mansard'), (0.0, '_RoofStyle_Hip'), (0.0, '_RoofStyle_Gambrel'), (0.0, '_RoofStyle_Gable'), (0.0, '_RoofStyle_Flat'), (0.0, '_RoofMatl_WdShngl'), (0.0, '_RoofMatl_WdShake'), (0.0, '_RoofMatl_Tar&Grv'), (0.0, '_RoofMatl_CompShg'), (0.0, '_PoolQC_None'), (0.0, '_PoolQC_Gd'), (0.0, '_PoolQC_Ex'), (0.0, '_PavedDrive_Y'), (0.0, '_PavedDrive_P'), (0.0, '_Neighborhood_Veenker'), (0.0, '_Neighborhood_Timber'), (0.0, '_Neighborhood_StoneBr'), (0.0, '_Neighborhood_Somerst'), (0.0, '_Neighborhood_SawyerW'), (0.0, '_Neighborhood_Sawyer'), (0.0, '_Neighborhood_SWISU'), (0.0, '_Neighborhood_NridgHt'), (0.0, '_Neighborhood_NoRidge'), (0.0, '_Neighborhood_NWAmes'), (0.0, '_Neighborhood_NPkVill'), (0.0, '_Neighborhood_Mitchel'), (0.0, '_Neighborhood_MeadowV'), (0.0, '_Neighborhood_Gilbert'), (0.0, '_Neighborhood_Edwards'), (0.0, '_Neighborhood_Crawfor'), (0.0, '_Neighborhood_CollgCr'), (0.0, '_Neighborhood_ClearCr'), (0.0, '_Neighborhood_BrDale'), (0.0, '_Neighborhood_Blueste'), (0.0, '_Neighborhood_Blmngtn'), (0.0, '_NeighborhoodBin_4'), (0.0, '_NeighborhoodBin_3'), (0.0, '_NeighborhoodBin_1'), (0.0, '_NeighborhoodBin_0'), (0.0, '_MoSold_9'), (0.0, '_MoSold_8'), (0.0, '_MoSold_7'), (0.0, '_MoSold_4'), (0.0, '_MoSold_3'), (0.0, '_MoSold_2'), (0.0, '_MoSold_12'), (0.0, '_MoSold_11'), (0.0, '_MoSold_10'), (0.0, '_MoSold_1'), (0.0, '_MiscFeature_Shed'), (0.0, '_MiscFeature_Othr'), (0.0, '_MiscFeature_None'), (0.0, '_MiscFeature_Gar2'), (0.0, '_MasVnrType_Stone'), (0.0, '_MasVnrType_None'), (0.0, '_MasVnrType_BrkCmn'), (0.0, '_MSZoning_RL'), (0.0, '_MSZoning_RH'), (0.0, '_MSZoning_FV'), (0.0, '_MSSubClass_85'), (0.0, '_MSSubClass_80'), (0.0, '_MSSubClass_75'), (0.0, '_MSSubClass_70'), (0.0, '_MSSubClass_60'), (0.0, '_MSSubClass_50'), (0.0, '_MSSubClass_45'), (0.0, '_MSSubClass_40'), (0.0, '_MSSubClass_20'), (0.0, '_MSSubClass_190'), (0.0, '_MSSubClass_180'), (0.0, '_MSSubClass_120'), (0.0, '_LotShape_Reg'), (0.0, '_LotShape_IR3'), (0.0, '_LotShape_IR2'), (0.0, '_LotShape_IR1'), (0.0, '_LotConfig_FR3'), (0.0, '_LotConfig_FR2'), (0.0, '_LotConfig_Corner'), (0.0, '_LandSlope_Sev'), (0.0, '_LandSlope_Mod'), (0.0, '_LandContour_Lvl'), (0.0, '_LandContour_Low'), (0.0, '_LandContour_HLS'), (0.0, '_LandContour_Bnk'), (0.0, '_KitchenQual_TA'), (0.0, '_KitchenQual_Fa'), (0.0, '_HouseStyle_SLvl'), (0.0, '_HouseStyle_SFoyer'), (0.0, '_HouseStyle_2Story'), (0.0, '_HouseStyle_1Story'), (0.0, '_HouseStyle_1.5Unf'), (0.0, '_HouseStyle_1.5Fin'), (0.0, '_Heating_Wall'), (0.0, '_Heating_GasW'), (0.0, '_Heating_GasA'), (0.0, '_HeatingQC_TA'), (0.0, '_HeatingQC_Po'), (0.0, '_HeatingQC_Fa'), (0.0, '_GarageYrBltBin_YearBin6'), (0.0, '_GarageYrBltBin_YearBin5'), (0.0, '_GarageYrBltBin_YearBin3'), (0.0, '_GarageYrBltBin_YearBin2'), (0.0, '_GarageYrBltBin_NoGarage'), (0.0, '_GarageType_None'), (0.0, '_GarageType_Detchd'), (0.0, '_GarageType_BuiltIn'), (0.0, '_GarageType_Basment'), (0.0, '_GarageType_Attchd'), (0.0, '_GarageType_2Types'), (0.0, '_GarageQual_TA'), (0.0, '_GarageQual_Po'), (0.0, '_GarageQual_None'), (0.0, '_GarageQual_Gd'), (0.0, '_GarageQual_Fa'), (0.0, '_GarageFinish_Unf'), (0.0, '_GarageFinish_RFn'), (0.0, '_GarageFinish_None'), (0.0, '_GarageFinish_Fin'), (0.0, '_GarageCond_Po'), (0.0, '_GarageCond_None'), (0.0, '_GarageCond_Gd'), (0.0, '_GarageCond_Fa'), (0.0, '_GarageCond_Ex'), (0.0, '_Functional_Sev'), (0.0, '_Functional_Min2'), (0.0, '_Functional_Min1'), (0.0, '_Functional_Maj2'), (0.0, '_Functional_Maj1'), (0.0, '_Foundation_Wood'), (0.0, '_Foundation_Stone'), (0.0, '_Foundation_Slab'), (0.0, '_Foundation_PConc'), (0.0, '_Foundation_CBlock'), (0.0, '_FireplaceQu_TA'), (0.0, '_FireplaceQu_Po'), (0.0, '_FireplaceQu_None'), (0.0, '_FireplaceQu_Gd'), (0.0, '_FireplaceQu_Fa'), (0.0, '_FireplaceQu_Ex'), (0.0, '_Fence_None'), (0.0, '_Fence_MnWw'), (0.0, '_Fence_MnPrv'), (0.0, '_Fence_GdWo'), (0.0, '_Fence_GdPrv'), (0.0, '_Exterior2nd_Wd Shng'), (0.0, '_Exterior2nd_Wd Sdng'), (0.0, '_Exterior2nd_VinylSd'), (0.0, '_Exterior2nd_Stucco'), (0.0, '_Exterior2nd_Stone'), (0.0, '_Exterior2nd_Plywood'), (0.0, '_Exterior2nd_ImStucc'), (0.0, '_Exterior2nd_HdBoard'), (0.0, '_Exterior2nd_CmentBd'), (0.0, '_Exterior2nd_CBlock'), (0.0, '_Exterior2nd_BrkFace'), (0.0, '_Exterior2nd_Brk Cmn'), (0.0, '_Exterior2nd_AsphShn'), (0.0, '_Exterior2nd_AsbShng'), (0.0, '_Exterior1st_WdShing'), (0.0, '_Exterior1st_VinylSd'), (0.0, '_Exterior1st_Stucco'), (0.0, '_Exterior1st_HdBoard'), (0.0, '_Exterior1st_CemntBd'), (0.0, '_Exterior1st_CBlock'), (0.0, '_Exterior1st_BrkComm'), (0.0, '_Exterior1st_AsphShn'), (0.0, '_Exterior1st_AsbShng'), (0.0, '_ExterCond_Po'), (0.0, '_ExterCond_Gd'), (0.0, '_ExterCond_Ex'), (0.0, '_Electrical_SBrkr'), (0.0, '_Electrical_FuseP'), (0.0, '_Electrical_FuseF'), (0.0, '_Electrical_FuseA'), (0.0, '_Condition2_PosA'), (0.0, '_Condition2_Norm'), (0.0, '_Condition2_Feedr'), (0.0, '_Condition2_Artery'), (0.0, '_Condition1_RRNn'), (0.0, '_Condition1_RRNe'), (0.0, '_Condition1_RRAn'), (0.0, '_Condition1_PosN'), (0.0, '_Condition1_PosA'), (0.0, '_BsmtQual_TA'), (0.0, '_BsmtQual_None'), (0.0, '_BsmtQual_Gd'), (0.0, '_BsmtQual_Fa'), (0.0, '_BsmtFinType2_Unf'), (0.0, '_BsmtFinType2_Rec'), (0.0, '_BsmtFinType2_None'), (0.0, '_BsmtFinType2_LwQ'), (0.0, '_BsmtFinType2_GLQ'), (0.0, '_BsmtFinType2_BLQ'), (0.0, '_BsmtFinType2_ALQ'), (0.0, '_BsmtFinType1_Rec'), (0.0, '_BsmtFinType1_None'), (0.0, '_BsmtFinType1_GLQ'), (0.0, '_BsmtFinType1_BLQ'), (0.0, '_BsmtFinType1_ALQ'), (0.0, '_BsmtExposure_None'), (0.0, '_BsmtExposure_No'), (0.0, '_BsmtExposure_Mn'), (0.0, '_BsmtExposure_Av'), (0.0, '_BsmtCond_TA'), (0.0, '_BsmtCond_Po'), (0.0, '_BsmtCond_None'), (0.0, '_BsmtCond_Gd'), (0.0, '_BldgType_TwnhsE'), (0.0, '_BldgType_Twnhs'), (0.0, '_BldgType_2fmCon'), (0.0, '_BldgType_1Fam'), (0.0, '_Alley_None'), (0.0, 'VeryNewHouse'), (0.0, 'SimplPoolQC'), (0.0, 'SimplGarageQual'), (0.0, 'SimplGarageCond'), (0.0, 'SimplFireplaceQu'), (0.0, 'SimplExterCond'), (0.0, 'SimplBsmtQual'), (0.0, 'SimplBsmtFinType2'), (0.0, 'SimplBsmtCond'), (0.0, 'RoofStyle'), (0.0, 'Remodeled'), (0.0, 'RecentRemodel'), (0.0, 'PoolQC'), (0.0, 'PoolArea'), (0.0, 'NewerDwelling'), (0.0, 'MiscVal'), (0.0, 'MSSubClass'), (0.0, 'LowQualFinSF'), (0.0, 'IsLandSlopeGentle'), (0.0, 'IsLandLevel'), (0.0, 'IsGarageDetached'), (0.0, 'IsElectricalSBrkr'), (0.0, 'HasShed'), (0.0, 'HasScreenPorch'), (0.0, 'HasOpenPorch'), (0.0, 'HasMasVnr'), (0.0, 'Has3SsnPorch'), (0.0, 'Has2ndFloor'), (0.0, 'BsmtHalfBath'), (0.0, 'BsmtFinType2'), (0.0, 'BoughtOffPlan'), (0.0, 'BldgType'), (0.0, 'BadHeating'), (0.0, '3SsnPorch')]
In [14]:
S = []
l=[]
for i in range(0, len(feature_importance)):
    S.append(feature_importance[i][0])
    l.append(feature_importance[i][1])
    
print S , l
[0.0699, 0.0629, 0.0509, 0.0419, 0.0379, 0.0359, 0.0349, 0.0279, 0.0279, 0.0259, 0.025, 0.025, 0.021, 0.02, 0.017, 0.016, 0.015, 0.015, 0.015, 0.015, 0.014, 0.014, 0.013, 0.012, 0.011, 0.01, 0.01, 0.009, 0.009, 0.009, 0.009, 0.009, 0.008, 0.008, 0.008, 0.008, 0.008, 0.007, 0.006, 0.006, 0.006, 0.006, 0.006, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.004, 0.003, 0.003, 0.003, 0.003, 0.003, 0.003, 0.003, 0.003, 0.003, 0.003, 0.003, 0.003, 0.003, 0.003, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] ['TotalArea', 'TotalArea1st2nd', 'OverallQual', 'GrLivArea', 'LotArea', 'OverallCond', 'TotalBsmtSF', 'YearBuilt', 'BsmtFinSF1', 'SimplOverallQual', 'GarageYrBlt', '1stFlrSF', 'GarageArea', 'Neighborhood', 'OpenPorchSF', 'Functional', 'YearsSinceRemodel', 'WoodDeckSF', 'SaleCondition', '2ndFlrSF', 'SimplOverallCond', 'BsmtFinType1', 'ScreenPorch', 'MSZoning', 'Age', 'YearRemodAdd', 'Exterior1st', '_SaleType_New', '_Exterior1st_BrkFace', 'NeighborhoodBin', 'LotFrontage', 'BsmtExposure', '_Condition1_Norm', 'YrSold', 'BsmtUnfSF', 'BsmtFullBath', 'BsmtFinSF2', 'Condition1', '_SaleCondition_Normal', '_LotConfig_CulDSac', '_BsmtExposure_Gd', 'HeatingQC', 'FullBath', '_Neighborhood_NAmes', 'SimplFunctional', 'LotConfig', 'KitchenAbvGr', 'Fence', 'CentralAir', 'BsmtCond', '_SaleCondition_Family', '_Neighborhood_OldTown', '_GarageYrBltBin_YearBin7', '_ExterQual_Gd', 'MoSold', 'HighSeason', 'GarageCars', 'FireplaceQu', '_SaleCondition_Abnorml', '_NeighborhoodBin_2', '_MSSubClass_30', '_Functional_Typ', '_Exterior1st_MetalSd', '_ExterCond_Fa', '_Condition1_Feedr', 'TotRmsAbvGrd', 'SeasonSold', 'IsPavedDrive', 'HalfBath', 'GarageFinish', 'Foundation', 'EnclosedPorch', '_YearBuiltBin_YearBin4', '_Neighborhood_BrkSide', '_MoSold_6', '_MoSold_5', '_KitchenQual_Gd', '_Heating_Grav', '_Foundation_BrkTil', '_Exterior2nd_MetalSd', '_Exterior1st_Wd Sdng', '_ExterQual_Fa', '_ExterQual_Ex', '_BsmtQual_Ex', 'TimeSinceSold', 'SimplBsmtFinType1', 'SaleCondition_PriceDown', 'Neighborhood_Good', 'MasVnrArea', 'HouseStyle', 'GarageQual', 'Fireplaces', 'Exterior2nd', '_YearRemodAddBin_YearBin6', '_YearRemodAddBin_YearBin5', '_YearRemodAddBin_YearBin4', '_YearBuiltBin_YearBin2', '_PavedDrive_N', '_Neighborhood_IDOTRR', '_MasVnrType_BrkFace', '_MSZoning_RM', '_MSSubClass_90', '_LotConfig_Inside', '_LandSlope_Gtl', '_KitchenQual_Ex', '_HouseStyle_2.5Unf', '_HeatingQC_Gd', '_HeatingQC_Ex', '_GarageYrBltBin_YearBin4', '_GarageType_CarPort', '_GarageCond_TA', '_Functional_Mod', '_Exterior1st_Plywood', '_ExterQual_TA', '_ExterCond_TA', '_Condition1_RRAe', '_Condition1_Artery', '_BsmtFinType1_Unf', '_BsmtFinType1_LwQ', '_BsmtCond_Fa', '_BldgType_Duplex', '_Alley_Pave', '_Alley_Grvl', 'SimplKitchenQual', 'SimplHeatingQC', 'SimplExterQual', 'SaleType', 'MasVnrType', 'KitchenQual', 'IsRegularLotShape', 'HasWoodDeck', 'HasEnclosedPorch', 'GarageCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BedroomAbvGr', '_YearRemodAddBin_YearBin7', '_YearBuiltBin_YearBin7', '_YearBuiltBin_YearBin6', '_YearBuiltBin_YearBin5', '_YearBuiltBin_YearBin3', '_YearBuiltBin_YearBin1', '_Street_Pave', '_Street_Grvl', '_SaleType_WD', '_SaleType_Oth', '_SaleType_ConLw', '_SaleType_ConLI', '_SaleType_ConLD', '_SaleType_Con', '_SaleType_CWD', '_SaleType_COD', '_SaleCondition_Partial', '_SaleCondition_Alloca', '_SaleCondition_AdjLand', '_RoofStyle_Shed', '_RoofStyle_Mansard', '_RoofStyle_Hip', '_RoofStyle_Gambrel', '_RoofStyle_Gable', '_RoofStyle_Flat', '_RoofMatl_WdShngl', '_RoofMatl_WdShake', '_RoofMatl_Tar&Grv', '_RoofMatl_CompShg', '_PoolQC_None', '_PoolQC_Gd', '_PoolQC_Ex', '_PavedDrive_Y', '_PavedDrive_P', '_Neighborhood_Veenker', '_Neighborhood_Timber', '_Neighborhood_StoneBr', '_Neighborhood_Somerst', '_Neighborhood_SawyerW', '_Neighborhood_Sawyer', '_Neighborhood_SWISU', '_Neighborhood_NridgHt', '_Neighborhood_NoRidge', '_Neighborhood_NWAmes', '_Neighborhood_NPkVill', '_Neighborhood_Mitchel', '_Neighborhood_MeadowV', '_Neighborhood_Gilbert', '_Neighborhood_Edwards', '_Neighborhood_Crawfor', '_Neighborhood_CollgCr', '_Neighborhood_ClearCr', '_Neighborhood_BrDale', '_Neighborhood_Blueste', '_Neighborhood_Blmngtn', '_NeighborhoodBin_4', '_NeighborhoodBin_3', '_NeighborhoodBin_1', '_NeighborhoodBin_0', '_MoSold_9', '_MoSold_8', '_MoSold_7', '_MoSold_4', '_MoSold_3', '_MoSold_2', '_MoSold_12', '_MoSold_11', '_MoSold_10', '_MoSold_1', '_MiscFeature_Shed', '_MiscFeature_Othr', '_MiscFeature_None', '_MiscFeature_Gar2', '_MasVnrType_Stone', '_MasVnrType_None', '_MasVnrType_BrkCmn', '_MSZoning_RL', '_MSZoning_RH', '_MSZoning_FV', '_MSSubClass_85', '_MSSubClass_80', '_MSSubClass_75', '_MSSubClass_70', '_MSSubClass_60', '_MSSubClass_50', '_MSSubClass_45', '_MSSubClass_40', '_MSSubClass_20', '_MSSubClass_190', '_MSSubClass_180', '_MSSubClass_120', '_LotShape_Reg', '_LotShape_IR3', '_LotShape_IR2', '_LotShape_IR1', '_LotConfig_FR3', '_LotConfig_FR2', '_LotConfig_Corner', '_LandSlope_Sev', '_LandSlope_Mod', '_LandContour_Lvl', '_LandContour_Low', '_LandContour_HLS', '_LandContour_Bnk', '_KitchenQual_TA', '_KitchenQual_Fa', '_HouseStyle_SLvl', '_HouseStyle_SFoyer', '_HouseStyle_2Story', '_HouseStyle_1Story', '_HouseStyle_1.5Unf', '_HouseStyle_1.5Fin', '_Heating_Wall', '_Heating_GasW', '_Heating_GasA', '_HeatingQC_TA', '_HeatingQC_Po', '_HeatingQC_Fa', '_GarageYrBltBin_YearBin6', '_GarageYrBltBin_YearBin5', '_GarageYrBltBin_YearBin3', '_GarageYrBltBin_YearBin2', '_GarageYrBltBin_NoGarage', '_GarageType_None', '_GarageType_Detchd', '_GarageType_BuiltIn', '_GarageType_Basment', '_GarageType_Attchd', '_GarageType_2Types', '_GarageQual_TA', '_GarageQual_Po', '_GarageQual_None', '_GarageQual_Gd', '_GarageQual_Fa', '_GarageFinish_Unf', '_GarageFinish_RFn', '_GarageFinish_None', '_GarageFinish_Fin', '_GarageCond_Po', '_GarageCond_None', '_GarageCond_Gd', '_GarageCond_Fa', '_GarageCond_Ex', '_Functional_Sev', '_Functional_Min2', '_Functional_Min1', '_Functional_Maj2', '_Functional_Maj1', '_Foundation_Wood', '_Foundation_Stone', '_Foundation_Slab', '_Foundation_PConc', '_Foundation_CBlock', '_FireplaceQu_TA', '_FireplaceQu_Po', '_FireplaceQu_None', '_FireplaceQu_Gd', '_FireplaceQu_Fa', '_FireplaceQu_Ex', '_Fence_None', '_Fence_MnWw', '_Fence_MnPrv', '_Fence_GdWo', '_Fence_GdPrv', '_Exterior2nd_Wd Shng', '_Exterior2nd_Wd Sdng', '_Exterior2nd_VinylSd', '_Exterior2nd_Stucco', '_Exterior2nd_Stone', '_Exterior2nd_Plywood', '_Exterior2nd_ImStucc', '_Exterior2nd_HdBoard', '_Exterior2nd_CmentBd', '_Exterior2nd_CBlock', '_Exterior2nd_BrkFace', '_Exterior2nd_Brk Cmn', '_Exterior2nd_AsphShn', '_Exterior2nd_AsbShng', '_Exterior1st_WdShing', '_Exterior1st_VinylSd', '_Exterior1st_Stucco', '_Exterior1st_HdBoard', '_Exterior1st_CemntBd', '_Exterior1st_CBlock', '_Exterior1st_BrkComm', '_Exterior1st_AsphShn', '_Exterior1st_AsbShng', '_ExterCond_Po', '_ExterCond_Gd', '_ExterCond_Ex', '_Electrical_SBrkr', '_Electrical_FuseP', '_Electrical_FuseF', '_Electrical_FuseA', '_Condition2_PosA', '_Condition2_Norm', '_Condition2_Feedr', '_Condition2_Artery', '_Condition1_RRNn', '_Condition1_RRNe', '_Condition1_RRAn', '_Condition1_PosN', '_Condition1_PosA', '_BsmtQual_TA', '_BsmtQual_None', '_BsmtQual_Gd', '_BsmtQual_Fa', '_BsmtFinType2_Unf', '_BsmtFinType2_Rec', '_BsmtFinType2_None', '_BsmtFinType2_LwQ', '_BsmtFinType2_GLQ', '_BsmtFinType2_BLQ', '_BsmtFinType2_ALQ', '_BsmtFinType1_Rec', '_BsmtFinType1_None', '_BsmtFinType1_GLQ', '_BsmtFinType1_BLQ', '_BsmtFinType1_ALQ', '_BsmtExposure_None', '_BsmtExposure_No', '_BsmtExposure_Mn', '_BsmtExposure_Av', '_BsmtCond_TA', '_BsmtCond_Po', '_BsmtCond_None', '_BsmtCond_Gd', '_BldgType_TwnhsE', '_BldgType_Twnhs', '_BldgType_2fmCon', '_BldgType_1Fam', '_Alley_None', 'VeryNewHouse', 'SimplPoolQC', 'SimplGarageQual', 'SimplGarageCond', 'SimplFireplaceQu', 'SimplExterCond', 'SimplBsmtQual', 'SimplBsmtFinType2', 'SimplBsmtCond', 'RoofStyle', 'Remodeled', 'RecentRemodel', 'PoolQC', 'PoolArea', 'NewerDwelling', 'MiscVal', 'MSSubClass', 'LowQualFinSF', 'IsLandSlopeGentle', 'IsLandLevel', 'IsGarageDetached', 'IsElectricalSBrkr', 'HasShed', 'HasScreenPorch', 'HasOpenPorch', 'HasMasVnr', 'Has3SsnPorch', 'Has2ndFloor', 'BsmtHalfBath', 'BsmtFinType2', 'BoughtOffPlan', 'BldgType', 'BadHeating', '3SsnPorch']
In [19]:
import matplotlib.pyplot as plt

% matplotlib inline
slices_hours = S
activities = l
colors = ['r', 'g']

plt.pie(slices_hours, labels=activities, colors=colors, startangle=90, autopct='%.1f%%')
plt.title('feature importance')
plt.show()

Notre fonction revoit une liste des carateristiques et leurs score associé

Définir une fonction pour ne garder dans la dataset que les variables qui expliquent à 95% notre variable cible SalePrice.

In [252]:
def get_important_features(feature_importance):
    s,i=0,0    
    l=[]
    while s <= 0.9:
            s+=feature_importance[i][0]
            l.append(feature_importance[i][1])
            i+=1
            
    return l
get_important_features(feature_importance)
Out[252]:
['TotalArea',
 'TotalArea1st2nd',
 'OverallQual',
 'GrLivArea',
 'LotArea',
 'OverallCond',
 'TotalBsmtSF',
 'YearBuilt',
 'BsmtFinSF1',
 'SimplOverallQual',
 'GarageYrBlt',
 '1stFlrSF',
 'GarageArea',
 'Neighborhood',
 'OpenPorchSF',
 'Functional',
 'YearsSinceRemodel',
 'WoodDeckSF',
 'SaleCondition',
 '2ndFlrSF',
 'SimplOverallCond',
 'BsmtFinType1',
 'ScreenPorch',
 'MSZoning',
 'Age',
 'YearRemodAdd',
 'Exterior1st',
 '_SaleType_New',
 '_Exterior1st_BrkFace',
 'NeighborhoodBin',
 'LotFrontage',
 'BsmtExposure',
 '_Condition1_Norm',
 'YrSold',
 'BsmtUnfSF',
 'BsmtFullBath',
 'BsmtFinSF2',
 'Condition1',
 '_SaleCondition_Normal',
 '_LotConfig_CulDSac',
 '_BsmtExposure_Gd',
 'HeatingQC',
 'FullBath',
 '_Neighborhood_NAmes',
 'SimplFunctional',
 'LotConfig',
 'KitchenAbvGr',
 'Fence',
 'CentralAir',
 'BsmtCond',
 '_SaleCondition_Family',
 '_Neighborhood_OldTown',
 '_GarageYrBltBin_YearBin7',
 '_ExterQual_Gd',
 'MoSold',
 'HighSeason',
 'GarageCars',
 'FireplaceQu',
 '_SaleCondition_Abnorml',
 '_NeighborhoodBin_2',
 '_MSSubClass_30',
 '_Functional_Typ',
 '_Exterior1st_MetalSd',
 '_ExterCond_Fa',
 '_Condition1_Feedr',
 'TotRmsAbvGrd',
 'SeasonSold']

Il s'avère que la méthode de feature selection ne donne pas le meilleur RMSE

In [253]:
train_df_munged=train_df_munged[get_important_features(feature_importance)]
test_df_munged=test_df_munged[get_important_features(feature_importance)]
In [254]:
len(test_df_munged)
Out[254]:
1459
In [255]:
len(train_df_munged)
Out[255]:
1456

On a decidé de travailler avec le modèle extreme gradient boosting , et afin de pouvoir réduire l'overfitting, il est necessaire d'optimiser nos hyperparamètres, n_estimators et le learning rate.

In [44]:
import xgboost as xgb

regr = xgb.XGBRegressor(
                 colsample_bytree=0.2,
                 gamma=0.0,
                 learning_rate=0.01,
                 max_depth=4,
                 min_child_weight=1.5,
                 n_estimators=7200,                                                                  
                 reg_alpha=0.9,
                 reg_lambda=0.6,
                 subsample=0.2,
                 seed=42,
                 silent=1)

regr.fit(train_df_munged, label_df)
y_pred_x= regr.predict(train_df_munged)
y_test = label_df
print("XGBoost score on training set: ", rmse(y_test, y_pred_x))

y_pred_xgb = regr.predict(test_df_munged)
('XGBoost score on training set: ', 0.0775006584943539)

Afin de réduire l'overfitting il est important d'utiliser une régularisation l1.

On a d'abord trouvé notre alpha pour tuner notre modèle lasso avec la cross validation en utillisant regr.alpha_ avec le LassoCV , on a trouvé le meilleure alpha = 0.00045979040927269187

In [257]:
from sklearn.linear_model import Lasso , ElasticNet , RidgeCV ,LassoCV



alpha = 0.00045979040927269187
#regr = ElasticNet(alpha=best_alpha, max_iter=5000)
regr = Lasso(alpha=alpha, max_iter=50000)
#regr = LassoCV(cv=5 , random_state = 0)
# maintenant à l'aide de la fonction regr.alpha_  on trouve notre meilleure alpha
# print regr.alpha_ 
regr.fit(train_df_munged, label_df)
y_pred_l = regr.predict(train_df_munged)
y_test = label_df
print("Lasso score on training set: ", rmse(y_test, y_pred_l))

y_pred_lasso = regr.predict(test_df_munged)

y_pred = (y_pred_xgb + 2*y_pred_lasso) / 3

y_pred_reg=(y_pred_l+y_pred_x)/2

print("XGBoost and lasso score on training set: ", rmse(y_test, y_pred_reg))
('Lasso score on training set: ', 0.10719957787361478)
('XGBoost and lasso score on training set: ', 0.09166168987455008)
In [247]:
y_pred
Out[247]:
array([11.69363719, 11.99178533, 12.1155978 , ..., 12.04485023,
       11.71367972, 12.27378401])
In [248]:
y_pred = np.exp(y_pred)

pred_df = pd.DataFrame(y_pred, index=test_df["Id"], columns=["SalePrice"])
pred_df.to_csv('submission_final.csv', header=True, index_label='Id')

Cette méthode ci-dessus nous a donné le meilleure classement kaggle (classée 730 )

Une autre approche PCA : Réduction des dimensions avec analyse des composantes principales

In [227]:
from sklearn.decomposition import PCA

pca = PCA(n_components=81)

principalComponents_train = pca.fit_transform(train_df_munged.values)
principalComponents_test = pca.fit_transform(test_df_munged.values)

print principalComponents_train.shape , principalComponents_test.shape

print pca.explained_variance_ratio_
(1456, 81) (1459, 81)
[0.17306682 0.05350351 0.04551891 0.04077671 0.03522671 0.03188554
 0.02637321 0.02559108 0.02147338 0.01973191 0.01901887 0.01823768
 0.01787471 0.01661857 0.01546179 0.01502202 0.01441686 0.01393313
 0.01332003 0.01323807 0.01288978 0.01203568 0.01181044 0.01140744
 0.01118718 0.01035713 0.01001575 0.0098776  0.0092907  0.00878535
 0.008439   0.00842775 0.00813179 0.00794484 0.00776447 0.00769484
 0.00697672 0.00689752 0.00650489 0.00635159 0.00591561 0.00582511
 0.00570146 0.00563057 0.005454   0.00536632 0.00498093 0.00480348
 0.00457275 0.00444541 0.00419026 0.00408842 0.00399742 0.00390671
 0.00384323 0.00363738 0.00354214 0.00348336 0.00345371 0.00330227
 0.00316736 0.00310801 0.00286455 0.00263689 0.00260906 0.00254589
 0.00251879 0.00241836 0.00226767 0.00223015 0.00217991 0.00205414
 0.00188234 0.00187696 0.00182646 0.00176497 0.00171506 0.00162185
 0.00146983 0.00144504 0.00140049]
In [228]:
principalDf_train = pd.DataFrame(data = principalComponents_train
             , columns = range(0,list(principalComponents_train.shape)[1]))

principalDf_test = pd.DataFrame(data = principalComponents_test
             , columns = range(0,list(principalComponents_test.shape)[1]))

principalDf_test
Out[228]:
0 1 2 3 4 5 6 7 8 9 ... 71 72 73 74 75 76 77 78 79 80
0 -3.571408 -4.217402 -0.096884 0.675288 1.154890 -1.069236 -1.558719 0.013168 1.020418 0.989119 ... -0.100967 -0.041574 -0.358336 -0.055175 0.017122 0.207910 -1.360838 0.256376 0.386902 -1.276110
1 -0.546354 -3.302050 1.253671 1.575666 -0.099595 0.249438 -3.340523 -0.210844 -0.295450 -2.227293 ... 0.567625 0.068927 -0.248843 -0.436556 -0.593624 -0.197527 0.324773 1.004237 0.701508 0.540738
2 2.066261 0.577370 -0.162887 -1.464558 0.665334 1.597267 -2.619641 0.337510 1.969698 -2.007115 ... 0.299870 -0.228285 0.455916 -0.414109 0.200768 0.584384 -0.590418 -0.135959 0.208522 -0.259273
3 3.284782 0.947140 0.266234 -1.325136 0.789180 1.760537 -2.463203 1.095370 0.293953 -1.888923 ... -0.462022 -0.036001 -0.151523 -0.825204 -1.161041 0.199083 -0.067001 0.082895 -0.282798 0.074648
4 3.438048 0.519988 -2.818554 1.726679 1.476928 0.509775 3.065841 1.324658 0.863452 0.465918 ... 0.347889 -0.243664 -0.272938 0.949566 -0.488547 -0.122812 0.183812 0.024756 0.048587 0.609656
5 1.281433 1.812980 0.221018 -1.837143 1.428882 1.687435 -2.379585 -0.967266 1.941217 -1.342395 ... 0.466007 -0.672567 0.398092 0.603282 0.190866 0.249073 -0.523221 -0.055809 0.346517 0.568942
6 0.902267 -2.399136 -2.115308 -0.693276 0.048562 -0.987465 -4.394200 3.219477 0.387779 -0.711773 ... 0.196744 -0.056228 0.068110 0.738193 -0.668812 -0.840492 -0.123387 0.356062 -0.154348 0.514361
7 1.375868 2.242442 -0.115986 -1.692380 1.764808 1.192684 -2.148762 -1.095129 1.783972 -1.414356 ... -0.175176 0.138291 0.477080 -0.015531 -0.372221 0.753140 -0.456903 0.147711 0.193631 0.051897
8 1.696973 -2.271006 -1.362571 1.247566 0.241922 0.081514 -0.467880 0.647150 1.175437 -2.704801 ... 0.116171 -0.420031 0.863038 -0.582007 -0.364829 -0.052150 -0.487183 -0.444047 0.464664 0.206419
9 -2.952468 -5.020262 -1.740046 1.246880 1.571659 0.155912 -1.810170 0.646016 1.707347 -1.224332 ... 0.389472 0.160206 -0.719237 -0.064693 -0.045957 -0.291744 0.240711 0.373422 0.385224 -0.281284
10 4.321691 -0.757778 -4.516764 1.126278 -0.039461 1.037470 1.420922 1.034325 1.486415 -1.205210 ... 0.087347 0.851269 -0.525856 -1.185415 0.126912 -0.196575 -0.178727 -0.101474 -0.215496 0.971609
11 -4.825066 0.964478 -5.167970 -1.194439 2.665664 4.098009 2.603604 -1.046112 -0.549774 -0.758297 ... 0.053454 -0.001891 0.053035 -0.036076 -0.423548 0.320388 0.013080 -0.241740 -0.128468 -0.159114
12 -4.961568 1.224637 -4.692107 -1.531597 2.595943 4.671430 1.970635 -0.809619 -1.046293 -0.811617 ... 0.288328 0.029797 -0.075908 0.219617 -0.445619 0.161271 -0.052330 -0.137974 -0.012336 -0.029840
13 -1.611527 0.952107 -3.159147 -1.547756 1.272057 4.504647 1.631162 0.446884 -0.555603 -0.889731 ... 0.612924 -0.183100 -0.694455 -0.385986 -0.239363 -0.555254 -0.178417 0.056418 -0.229563 0.033947
14 -2.549337 0.381003 -5.369277 0.812385 2.814279 0.495262 1.650796 -0.555343 0.408633 -0.631893 ... -0.489197 0.508454 -0.651948 0.267522 -0.232565 -0.722788 -0.434572 0.364437 -0.076787 -0.660350
15 8.570437 3.826942 2.491645 -1.096795 -0.222025 -1.678365 0.614884 -1.456294 0.092958 -0.688970 ... -1.348299 0.014839 0.032925 0.508302 0.706810 -0.135827 -0.326246 -0.436140 -0.295824 0.143156
16 5.930586 1.788895 1.084693 1.329254 1.584867 -4.723935 0.157357 -2.093460 -0.522564 -0.366415 ... 0.082704 0.024983 -0.848827 -0.124226 0.638947 0.494105 0.381577 -0.088354 0.062283 0.479529
17 7.613512 -0.361367 1.218215 1.161027 -0.311483 -1.060729 0.089269 0.515380 -1.058064 -2.805984 ... -0.272254 0.556575 0.302631 0.022230 0.335118 -0.402222 -0.486461 -0.513862 -0.317549 -0.436709
18 6.924107 -0.392955 1.659801 1.340487 -0.712389 -0.569148 0.710250 0.058276 -1.022114 -2.700836 ... -0.248217 0.591191 0.491797 0.689917 0.545323 -0.163602 0.365955 -0.360923 -0.535388 -0.224825
19 10.098874 -1.609619 3.714482 1.318342 -3.199622 0.178666 1.910173 1.847858 -1.087345 -2.975739 ... -0.354729 -0.064532 0.447853 0.585064 0.081582 0.166207 -0.499759 -0.057924 -0.062749 -0.227839
20 7.889400 0.902421 1.585077 -0.532956 -0.930984 1.431634 -0.613687 0.810848 -0.169543 -2.690040 ... -0.022755 -0.531255 -0.074676 -0.469575 -0.461410 0.022704 0.170403 0.187914 -0.056539 0.015032
21 5.818077 1.133278 -2.286189 1.414841 0.643329 0.371102 2.104496 1.750981 0.368637 -2.876713 ... 0.039146 0.133072 0.001735 0.026567 -0.295598 0.556007 -0.158974 -0.360575 -0.194643 0.364726
22 3.112798 0.741424 0.029749 0.890471 1.768979 -2.913522 -2.928529 -0.735753 2.456838 -2.772603 ... -0.190516 -0.463093 -0.108674 -0.107259 0.035557 -0.307213 -0.185158 0.147368 0.134885 0.077623
23 3.713059 1.212736 -4.815165 1.555761 2.010524 0.547381 0.695601 1.259797 0.848258 -2.753625 ... -0.332383 -0.387417 0.243933 -0.131516 -0.466681 -0.506532 -0.825775 -0.174574 -0.136241 -0.056661
24 3.688910 2.099528 -1.280768 -1.656856 0.811731 1.328394 -1.377826 1.623649 1.459243 -1.792369 ... 0.351383 0.455246 0.396243 -0.506368 0.088617 -0.130702 0.398401 0.069513 -0.239858 -0.283752
25 4.125361 3.406827 -0.377473 -1.987186 1.238904 0.268964 -1.875982 0.060185 2.210729 -1.820533 ... -0.106620 0.215100 -0.017058 -0.037351 -0.331751 -0.029806 0.098704 0.463819 -0.056671 -0.213203
26 7.210906 1.667088 3.119866 -1.239791 -0.181204 1.350606 -0.747845 -0.240806 0.127347 -0.195709 ... 0.728889 0.039739 -0.811232 -0.201548 0.676578 0.116242 -0.474554 0.439685 -0.450886 -0.053746
27 5.130630 0.706130 0.994256 0.371323 1.014005 -2.088822 0.133915 0.763981 -0.237437 -3.013127 ... 0.279394 0.280029 -0.189491 0.414837 -0.354823 0.487798 0.834179 0.496764 -0.165724 -0.585869
28 4.303962 2.070120 -1.040111 1.949073 2.104987 -5.776430 0.706366 -1.563332 0.367665 -0.162681 ... 0.355476 -0.141818 -0.593568 -0.082930 0.241844 0.782037 0.324128 -0.234107 0.291872 -0.000866
29 5.256609 -1.085022 0.479797 1.803506 0.199565 -1.415035 0.747411 2.263505 0.470070 -0.551530 ... 0.211573 0.201462 -0.038096 -0.192130 -0.256893 -0.003946 -0.109513 0.042094 -0.422997 -0.097007
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1429 -7.674789 -0.746384 -1.468366 -1.576767 -0.045576 -2.982445 2.285495 -0.553800 0.908401 -1.094669 ... 0.611812 -0.657143 0.723753 -0.744518 0.347178 0.890671 -0.030571 0.782864 0.386519 0.273053
1430 -2.608058 0.866749 0.466857 -1.554698 0.674057 0.690918 -0.451563 -2.355508 -1.021626 0.063815 ... 0.369646 -0.425738 0.585698 -0.395025 1.129940 0.583244 0.326188 0.175409 0.412119 -0.065991
1431 -12.289174 7.167719 5.485230 9.264841 1.911924 -2.026046 -0.689577 5.242108 0.749470 1.980645 ... -0.221882 -0.886569 0.060597 0.837214 0.249085 0.153147 -0.383838 0.261305 0.455736 0.368445
1432 -7.801244 6.397413 1.016201 6.697518 -4.339519 3.270763 -1.042576 -2.679864 0.249047 2.000653 ... -0.204261 1.930395 1.007982 1.009132 0.618974 -0.177792 -0.351494 -0.160814 -0.553627 0.273899
1433 -9.889536 5.647711 -0.281087 5.820971 -3.652283 -0.461225 -0.692514 0.418991 1.454491 1.000592 ... -0.645730 -0.547031 -0.002244 -0.579413 0.265659 0.662280 -0.261823 -0.578632 0.012758 0.219745
1434 7.549972 -1.236786 -1.037420 2.991329 -1.087462 0.956189 6.890567 3.629163 -1.111917 4.000372 ... 0.598234 0.396919 -0.138507 -0.282681 -0.329524 -0.592915 0.263645 -0.559308 -0.266961 0.189954
1435 6.396506 -1.824362 -1.534612 2.287754 -1.230390 2.509871 5.958356 3.902786 -0.495649 3.177548 ... 0.665661 0.109963 0.181271 -0.027954 -1.064386 -0.436022 0.173297 0.491972 -0.092010 -0.457244
1436 1.476664 -3.783427 1.364550 1.448988 -0.468312 1.128394 -0.657175 -1.109097 0.752250 0.475294 ... -0.157134 0.285330 -0.470829 -1.105181 -0.007233 -0.063085 0.141580 0.134747 -0.118152 0.111923
1437 -2.139556 0.191493 0.918138 0.867075 1.203452 2.084285 0.258720 -3.391694 -2.192148 1.107309 ... -0.612551 0.068193 -0.070276 -0.565874 -0.140358 0.045493 -0.001370 0.395411 -0.594856 0.435343
1438 3.716260 1.058527 -0.356542 0.969923 1.202870 -2.943756 -1.422487 -1.345931 0.889342 -0.560572 ... -0.104736 0.206487 0.002469 0.543745 0.383230 0.262475 0.135427 -0.036920 -0.413956 0.135580
1439 -1.289162 -2.772679 0.057323 1.178432 0.709859 1.916798 -1.048704 -0.982104 -1.367802 -0.602290 ... -0.964134 -0.829301 0.750209 -0.607912 -0.543229 -0.331406 0.162574 0.200570 -0.080603 0.104266
1440 0.349879 -5.326363 4.486485 2.101952 -2.148430 0.994728 1.446286 -0.442774 0.610898 1.563486 ... 0.286384 0.627941 -0.612390 -0.320304 -0.141101 -0.168753 -0.324569 0.402197 0.055756 -0.409257
1441 4.137891 -1.654350 -2.590744 2.157208 -0.237197 -2.119241 -1.320260 -0.554341 1.729836 1.667017 ... 0.035540 0.576771 0.188060 -1.154449 -0.438129 -1.175809 0.349179 -0.456184 0.078855 -0.260344
1442 8.425175 -0.764963 1.195953 1.271985 -2.225826 -2.919576 1.569535 -1.651117 -1.596416 1.128666 ... -0.082944 0.377969 -0.410938 -0.580763 0.175259 -0.073001 -1.016737 0.008920 0.059395 -0.118930
1443 8.729685 -0.462463 1.702625 2.242895 -1.170914 -2.481269 1.704481 -0.250210 -2.607230 1.366801 ... -0.305191 0.574087 -0.282286 -0.305941 -0.001069 0.238567 -0.692579 -0.416856 -0.014040 0.131480
1444 -8.665487 2.268520 9.896547 4.561828 6.719017 -0.553388 0.075978 1.486462 2.032290 -0.743899 ... -0.596417 0.765139 0.285764 0.483399 -0.136124 -0.802959 -0.784755 0.135701 -1.102130 -0.333962
1445 2.669675 -1.256503 -1.749743 0.946115 0.594407 2.949997 0.438873 1.301415 -3.288260 1.546020 ... 0.032186 0.823149 0.339228 -0.906223 0.447835 0.446572 -0.136851 0.884434 0.148564 0.797389
1446 -2.995747 1.844065 -3.884022 -1.589029 2.053499 4.234789 1.373731 -0.438812 0.497479 1.142762 ... -0.386708 -0.237669 -0.066362 0.088634 0.543522 -0.431833 0.759654 -0.804857 0.984945 -0.296802
1447 -1.677637 -1.789262 -1.854649 0.516005 1.232536 -2.340343 -1.699702 0.069796 -0.341768 1.390096 ... -0.159812 0.372550 -0.313882 0.265438 0.293793 -0.012620 0.821569 0.159319 0.124801 -0.161088
1448 -1.675328 -1.081692 1.450830 0.848455 1.257406 2.248908 -0.316071 -3.060447 -1.555588 -0.556073 ... 0.446194 -0.190006 0.041251 -0.536362 0.271315 -0.472482 0.174358 0.262349 -0.196320 -0.218851
1449 -7.173595 2.612959 -8.614075 6.626529 -2.774078 2.797559 0.931067 1.888091 -0.312477 1.400954 ... 1.052721 0.131534 -0.113343 0.388010 0.359438 -0.612039 0.313565 -0.062499 -0.008249 -0.654785
1450 -5.638211 1.974901 -5.211407 -0.599462 2.647860 4.076896 1.952184 -1.321766 -0.151974 1.461485 ... 0.296364 0.220726 -0.175633 -0.457835 -0.301047 -0.381445 -0.106629 0.140527 0.222009 -0.439559
1451 -0.947384 -4.414006 0.998230 1.193897 -1.592012 1.234450 -1.274806 -1.187297 1.174828 0.808084 ... -0.896559 0.228073 -0.522623 -0.439318 0.113742 0.627491 0.150909 -0.762510 -0.582550 0.309468
1452 -6.035101 1.417855 -4.462168 -0.525139 2.457928 4.268328 2.212083 -2.029857 -0.566150 2.182272 ... 0.956790 0.477352 -0.323705 0.100710 0.111878 -0.578999 -0.142691 0.339481 0.132928 -0.765991
1453 -7.697480 5.820093 -5.741742 3.829420 -2.375291 3.657603 -0.865277 -0.324034 -0.236441 3.020406 ... 1.013825 0.036880 -0.180955 0.044260 0.189504 -0.329542 0.234441 0.280480 -0.083097 -0.512482
1454 -7.663994 5.944361 -6.053954 2.975641 -2.614390 2.661112 -1.160193 0.672375 -0.994849 3.067827 ... 1.200423 0.015299 0.351253 0.014181 0.249775 0.253436 0.076158 0.252643 0.035329 -0.498446
1455 -5.558653 1.804582 -4.295045 -0.576512 2.374983 4.353805 1.854174 -1.943181 -0.094901 1.529385 ... 1.189402 0.223900 -0.094355 -0.137994 0.395592 -0.539554 -0.220125 0.472696 0.104663 -0.649613
1456 -1.076227 -2.952491 2.064628 -0.169414 -0.649924 -1.313815 -1.655318 0.827224 -1.189235 0.247565 ... -0.600603 -0.024148 -0.821089 -0.099006 -0.180737 -0.282985 0.392991 0.021293 0.086039 -0.106162
1457 -3.549393 2.269073 -3.460952 6.476886 -4.553017 1.464801 -5.036149 1.060166 1.278924 1.599953 ... -0.319833 0.188214 0.021533 -0.346234 0.312305 0.409725 -0.717844 -0.019903 -0.398943 0.582320
1458 3.512563 0.894778 1.063317 -1.155203 0.521624 2.729664 -0.742897 -0.847261 -0.174825 1.533367 ... 0.022766 0.446954 -0.709820 0.567854 -1.018927 -0.213307 -0.592056 0.371122 -0.283505 0.640815

1459 rows × 81 columns

In [229]:
import xgboost as xgb

regr = xgb.XGBRegressor(
                 colsample_bytree=0.2,
                 gamma=0.0,
                 learning_rate=0.01,
                 max_depth=4,
                 min_child_weight=1.5,
                 n_estimators=7200,                                                                  
                 reg_alpha=0.9,
                 reg_lambda=0.6,
                 subsample=0.2,
                 seed=42,
                 silent=1)

regr.fit(principalDf_train, label_df)


y_pred_x= regr.predict(principalDf_train)
y_test = label_df
print("XGBoost score on training set: ", rmse(y_test, y_pred_x))


y_pred_xgb = regr.predict(principalDf_test)
('XGBoost score on training set: ', 0.0619922950867125)
In [224]:
from sklearn.linear_model import Lasso , ElasticNet , RidgeCV ,LassoCV

best_alpha = 0.00099
alpha = 0.0016142533257845059
#regr = ElasticNet(alpha=best_alpha, max_iter=5000)
regr = Lasso(alpha=alpha, max_iter=50000)
#regr = LassoCV(cv=5 , random_state = 0)
regr.fit(principalComponents_train, label_df)

y_pred_l = regr.predict(principalComponents_train)
y_test = label_df
print("Lasso score on training set: ", rmse(y_test, y_pred_l))

y_pred_lasso = regr.predict(principalComponents_test)
('Lasso score on training set: ', 0.10780966876123488)
In [231]:
y_pred = (y_pred_xgb + y_pred_lasso) / 2

y_pred_reg=(y_pred_l+y_pred_x)/2

print("XGBoost and lasso score on training set: ", rmse(y_test, y_pred_reg))

y_pred = np.exp(y_pred)

pred_df = pd.DataFrame(y_pred, index=test_df["Id"], columns=["SalePrice"])
pred_df.to_csv('submission_final_pca.csv', header=True, index_label='Id')
('XGBoost and lasso score on training set: ', 0.08235373752883779)

Il parait que la réduction des dimensions avec ACP ne donne pas un score intéressant au niveau des données test.

In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: