import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler,Normalizer,LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error
from scipy import stats
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import AdaBoostRegressor,RandomForestRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb
import optuna
import optuna.integration.lightgbm as lgb2
import optuna.integration.xgboost as xgb2
from keras.models import Sequential
from keras.layers import Dense,Dropout,Input,AlphaDropout
from keras.regularizers import Regularizer,l2
from collections import OrderedDict
from itertools import product
import re
#import hvplot.pandas
%matplotlib inline
import plotly.offline as pyo
pyo.init_notebook_mode()
We are provided with daily historical sales data. The task is to forecast the total amount of products sold in every shop for the test set.
salesDF=pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv')
itemsDF=pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')
itemsCategoriesDF=pd.read_csv('../input/competitive-data-science-predict-future-sales/item_categories.csv')
shopsDF=pd.read_csv('../input/competitive-data-science-predict-future-sales/shops.csv')
testDF=pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')
salesDF.head()
date | date_block_num | shop_id | item_id | item_price | item_cnt_day | |
---|---|---|---|---|---|---|
0 | 02.01.2013 | 0 | 59 | 22154 | 999.00 | 1.0 |
1 | 03.01.2013 | 0 | 25 | 2552 | 899.00 | 1.0 |
2 | 05.01.2013 | 0 | 25 | 2552 | 899.00 | -1.0 |
3 | 06.01.2013 | 0 | 25 | 2554 | 1709.05 | 1.0 |
4 | 15.01.2013 | 0 | 25 | 2555 | 1099.00 | 1.0 |
# Cleaning shops data
# Якутск Орджоникидзе, 56
salesDF.loc[salesDF.shop_id == 0, 'shop_id'] = 57
testDF.loc[testDF.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
salesDF.loc[salesDF.shop_id == 1, 'shop_id'] = 58
testDF.loc[testDF.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
salesDF.loc[salesDF.shop_id == 10, 'shop_id'] = 11
testDF.loc[testDF.shop_id == 10, 'shop_id'] = 11
Here on analysing the shop names, we can find that the shop names encompass the city name and the category.
For eg: shop id 2 - Адыгея ТЦ "Мега" - Adygea mega shopping mall(English translation)
Here Adygea is the name of the city in Russia
shopsDF['city']=shopsDF['shop_name'].apply(lambda x: x.split(" ")[0])
shopsDF['category']=shopsDF['shop_name'].apply(lambda x: x.split(" ")[1])
##Each SHOP_ID' in this list belongs to separate category and we use 'other' to encompass them.
for i in [0,6,9,20,21,22,46,55,57,10,11]:
shopsDF.loc[shopsDF['shop_id']==i,'category']='other'
shopsDF.loc[shopsDF['city']=='!Якутск','city']='Якутск'
# le=LabelEncoder()
# shopsDF['city']=le.fit_transform(shopsDF['city'])
# shopsDF['category']=le.fit_transform(shopsDF['category'])
When we explore item category names we have:
For eg : item_category_id 0 - PC Гарнитуры/Наушники - PC Headset/Headphones
Hence we can separate this feature as 2 columns as main category and sub category
##Splits item_category name into main category and sub category
categories=[]
sub_categories=[]
for i in itemsCategoriesDF['item_category_name'].unique():
if len(list(i.split(" - ")))==1:
categories.append(i.split(" - ")[0])
sub_categories.append('other')
else:
categories.append(i.split(" - ")[0])
sub_categories.append(i.split(" - ")[1])
itemsCategoriesDF['main_category']=categories
itemsCategoriesDF['sub_category']=sub_categories
# le=LabelEncoder()
# itemsCategoriesDF['main_category']=le.fit_transform(itemsCategoriesDF['main_category'])
# itemsCategoriesDF['sub_category']=le.fit_transform(itemsCategoriesDF['sub_category'])
Let the items categories dataframe after splitting features be merged with the items dataframe
itemsNewDF=pd.merge(itemsDF,itemsCategoriesDF[['item_category_id','main_category','sub_category']],on='item_category_id',how='inner')
salesDF=pd.merge(salesDF,itemsNewDF[['item_id','item_category_id','main_category','sub_category']],on='item_id',how='inner')
salesDF=pd.merge(salesDF,shopsDF[['shop_id','city','category']],on='shop_id',how='inner')
Lets remove the outliers
fig=plt.figure(figsize=(10,5))
gs=fig.add_gridspec(1,2)
ax1=fig.add_subplot(gs[0,0])
img=sns.boxplot(x=salesDF['item_cnt_day'],ax=ax1)
ax2=fig.add_subplot(gs[0,1])
img=sns.boxplot(x=salesDF['item_price'],ax=ax2)
pd.cut(salesDF['item_price'],10).value_counts()
(-308.981, 30797.1] 2935632 (30797.1, 61595.2] 216 (277181.9, 307980.0] 1 (61595.2, 92393.3] 0 (92393.3, 123191.4] 0 (123191.4, 153989.5] 0 (153989.5, 184787.6] 0 (184787.6, 215585.7] 0 (215585.7, 246383.8] 0 (246383.8, 277181.9] 0 Name: item_price, dtype: int64
salesDF.drop(salesDF[(salesDF['item_cnt_day']>200)&(salesDF['item_cnt_day']<0)].index,axis=0,inplace=True)
salesDF.drop(salesDF[salesDF['item_price']>30797].index,axis=0,inplace=True)
##Splits date into day,month,year format
salesDF['DateModified']=pd.to_datetime(salesDF['date'],format='%d.%m.%Y')
salesDF['year']=salesDF['DateModified'].dt.year
salesDF['month']=salesDF['DateModified'].dt.month
salesDF['date']=salesDF['DateModified'].dt.day
shopNamesDict={shopsDF['shop_id'][i]:shopsDF['shop_name'][i] for i in range(len(shopsDF))}
shopWiseSalesDF=salesDF.groupby('shop_id').agg({'item_cnt_day':'sum','item_price':'sum'})
shopWiseSalesDF=shopWiseSalesDF.sort_values(by='item_price',ascending=False)
shopWiseSalesDF.reset_index(inplace=True)
shopWiseSalesDF['item_price']=shopWiseSalesDF['item_price'].astype(float)/1000000
shopWiseSalesDF=shopWiseSalesDF.replace({'shop_id':shopNamesDict})
fig=plt.figure(figsize=(15,5))
img=sns.barplot(data=shopWiseSalesDF[:10],y='shop_id',x='item_price',palette='rainbow')
img.set(xticklabels=[])
img.tick_params(bottom=False,left=False)
img.set_ylabel('Shop Names',fontweight='bold',fontfamily='sans-serif',fontsize=15)
img.set_xlabel('Total price of products sold across all years',fontweight='bold',fontfamily='sans-serif',fontsize=15)
img.text(5,-1,'Top 10 shops with highest sales(in terms of price)',fontfamily='sans-serif',fontweight='bold',fontsize=20)
for i in range(10):
img.text(1,i,str(round(shopWiseSalesDF['item_price'][i],0))+' Million',fontfamily='sans-serif',fontsize=12,fontweight='medium')
sns.despine(left=True,bottom=True,right=True,top=True)
itemWiseSalesDF=salesDF.groupby(['item_id'],as_index=False).agg({'item_price':'sum'}).sort_values(by='item_price',ascending=False).reset_index()
itemWiseSalesDF.drop('index',axis=1,inplace=True)
#itemWiseSalesDF['item_price']=itemWiseSalesDF['item_price'].astype('float')/1000000
itemWiseSalesDF=pd.merge(itemWiseSalesDF[:10],itemsDF[['item_id','item_name']],on='item_id',how='inner')
labels=['Sony PS4 500GB','Sony PS4 Kit 500GB','GTA V PS3','GTA V XBOX360','PS Store Top-Up Wallet']
fig=px.bar(itemWiseSalesDF[:5],y='item_name',x='item_price',orientation='h')
fig.update_layout(plot_bgcolor='#fff',
title='Top 5 Highest selling products',
yaxis=dict(showline=True,linecolor='black',tickvals=[0,1,2,3,4],ticktext=labels))
fig.show()
## Total items sold in every month across the 3 years
salesYearDF=salesDF.groupby(['year','month']).agg({'item_cnt_day':'sum'})
salesYearDF.reset_index(inplace=True)
months = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug',
9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
salesYearDF=salesYearDF.replace({'month':months})
plt.figure(figsize=(15,5))
sns.set_context("notebook")
sns.set_style('white')
img=sns.lineplot(data=salesYearDF,x='month',y='item_cnt_day',hue='year')
img.set_xlabel('Month',fontsize=15,fontweight='bold',color='black',fontfamily='sans-serif')
img.set_ylabel('Items sold per month',fontsize=15,fontweight='bold',color='black',fontfamily='sans-serif')
sns.despine(right=True,top=True)
In both 2013 and 2014, there has been significant increase in sales during the months of November and December. This may be attributed due to Christmas season
fig=px.bar(salesDF.groupby('city',as_index=False).agg({'item_cnt_day':'sum'}).sort_values('item_cnt_day',ascending=False),
x='city',
y='item_cnt_day',
title='City wise sales')
fig.update_layout(plot_bgcolor='#fff')
fig.show()
The below function groups the dataframe based on city and identifies the top 5 categories in each city and represented in terms of percentage
cityWiseSalesDF=salesDF.groupby(['city','main_category']).agg({'item_cnt_day':'sum'}).sort_values('item_cnt_day',ascending=False).reset_index()
cityWiseSalesDF=cityWiseSalesDF.groupby('city').head(5)
cityWiseSalesDF['percentage']=cityWiseSalesDF.groupby('city')['item_cnt_day'].transform(lambda x:round(x*100/x.sum(),2))
fig=px.bar(cityWiseSalesDF,x='city',y='percentage',color='main_category',title='Top 5 categories sold in each city')
fig.update_layout(margin=dict(l=0,r=0,b=0),
showlegend=False)
fig.show()
Even though Moscow is the city with highest sales some categories like cinema bluray cd's and dvd's are comparatively sold more in Yakutsk city
Delivery of goods(Доставка товара) as a category is found only in the city of Интернет-магазин and accounts for 30% of sales.
PC games and movies accountfor the majority of sale in almost all cities
salesPartitionDict={2013:{'Items Sold Before November':0,'Items Sold After November':0},2014:{'Items Sold Before November':0,'Items Sold After November':0}}
for year in [2013,2014]:
salesPartitionDict[year]['Items Sold After November']=salesYearDF[(salesYearDF['month'].isin(['Nov','Dec'])) & (salesYearDF['year']==year)]['item_cnt_day'].mean()
salesPartitionDict[year]['Items Sold Before November']=salesYearDF[(-salesYearDF['month'].isin(['Nov','Dec'])) & (salesYearDF['year']==year)]['item_cnt_day'].mean()
salesPartitionDF=pd.DataFrame(salesPartitionDict).transpose()
salesPartitionDF.reset_index(inplace=True)
salesPartitionDF.rename(columns={'index':'year'},inplace=True)
data=[go.Bar(x=salesPartitionDF['year'],y=salesPartitionDF[i],name=i)for i in salesPartitionDF.columns[1:]]
layout=go.Layout(title='Items sold for every year before and after November')
fig=go.Figure(data=data,layout=layout)
fig.update_layout(
xaxis=dict(
showline=True,
showgrid=False,
linecolor='rgb(204, 204, 204)',
linewidth=3,
tickmode='linear',
dtick=1
),
yaxis=dict(
showgrid=False,
zeroline=False,
showline=True,
showticklabels=True,
linecolor='rgb(204, 204, 204)',
linewidth=3,
),
plot_bgcolor='white'
)
fig.show()
dateBlockNumMonth={0:'Jan 2013',1:'Feb 2013',2:'Mar 2013',3:'Apr 2013',4:'May 2013',5:'Jun 2013',6:'Jul 2013',7:'Aug 2013',8:'Sep 2013',9:'Oct 2013',10:'Nov 2013',11:'Dec 2013',
12:'Jan 2014',13:'Feb 2014',14:'Mar 2014',15:'Apr 2014',16:'May 2014',17:'Jun 2014',18:'Jul 2014',19:'Aug 2014',20:'Sep 2014',21:'Oct 2014',22:'Nov 2014',23:'Dec 2014',
24:'Jan 2015',25:'Feb 2015',26:'Mar 2015',27:'Apr 2015',28:'May 2015',29:'Jun 2015',30:'Jul 2015',31:'Aug 2015',32:'Sep 2015',33:'Oct 2015'}
a=salesDF.groupby(['date_block_num','item_id'],as_index=False).agg({'item_cnt_day':'sum'})
monthlyHighestDict=OrderedDict()
n=0
for i in range(34):
b=a[a['date_block_num']==i]
b=b.sort_values(by='item_cnt_day',ascending=False)[:5]
b.reset_index(inplace=True)
for j in range(5):
#item_name=itemsDF[itemsDF['item_id']==b.loc[j]['item_id']]['item_name'].values[0]
monthlyHighestDict[n]={'month':dateBlockNumMonth[i],'item_name':str(b.loc[j]['item_id']),'count':b.loc[j]['item_cnt_day']}
n+=1
fig=plt.figure(figsize=(20,20))
gs=fig.add_gridspec(5,7)
monthlyHighestDF=pd.DataFrame(monthlyHighestDict).transpose()
n=0
for i in range(5):
for j in range(7):
if n==34:
break
df=monthlyHighestDF[monthlyHighestDF['month']==dateBlockNumMonth[n]]
ax=fig.add_subplot(gs[i,j])
img=sns.barplot(data=df,y='item_name',x='count',palette='spring',orient='h',ax=ax)
ax.set_title(dateBlockNumMonth[n])
img.set(xlabel=None,ylabel=None,xticks=[])
for s in ['top','right','left']:
ax.spines[s].set_visible(False)
ax.tick_params(left=False)
n=n+1
plt.tight_layout()
The mostly sold products for all months is Branded package T-shirt 1C White,Diablo III(Game), Battlefield IV, Grand Theft Auto V (GTA V), FIFA 14, Sims 4. Most of them are games for PC,XBOX and PS3.
For eg: Battlefield 4 has been one of the top sold products in Nov 2013,Dec 2013, Jan 2014 and Feb 2014. The release date of Battlefield 4 is 29th October 2013.
Similarly GTA V was released on 17th September 2013 and it was top sold products in Sep,Oct,Dec 2013 and Jan 2014. We would be wondering what happened in Nov 2013 and it was ocupied by Battlefield 4. Nov 2013 also saw higher sales of Assassin's creed 4 and Call of Duty Ghosts. But its sales faded in subsequent months.
plt.figure(figsize=(15,6))
categoriesSalesDF=salesDF.groupby(['item_category_id'],as_index=False).agg({'item_cnt_day':'sum'})
categoriesSalesDF.reset_index(inplace=True)
categoriesSalesDF=categoriesSalesDF.sort_values(by='item_cnt_day',ascending=False)
categoriesSalesDF=pd.merge(categoriesSalesDF,itemsCategoriesDF[['item_category_name','item_category_id']],on='item_category_id',how='inner')
fig=px.bar(categoriesSalesDF[:5],x='item_category_name',y='item_cnt_day',title='Most sold categories')
fig.update_layout(plot_bgcolor='#fff',xaxis=dict(showline=True,linecolor='rgb(204, 204, 204)',linewidth=3),
yaxis=dict(showticklabels=False))
fig.show()
<Figure size 1080x432 with 0 Axes>
Games CD's, Movies CD's, Music CD's are the most sold categories
shopItemSalesDF=salesDF.groupby(['shop_id','item_id'],as_index=False).agg({'item_cnt_day':'sum'})
shopItemSalesDF.rename(columns={'item_cnt_day':'item_cnt'},inplace=True)
shopItemSalesDF['item_cnt']=shopItemSalesDF['item_cnt'].astype(np.int16)
shopItemSalesDF.sort_values(by='item_cnt',ascending=False,inplace=True)
shopTopItem=OrderedDict()
n=0
for shop in shopItemSalesDF['shop_id'].unique():
shop_id=shop
item_id=shopItemSalesDF[shopItemSalesDF['shop_id']==shop][:1]['item_id'].values[0]
item_cnt=shopItemSalesDF[shopItemSalesDF['shop_id']==shop][:1]['item_cnt'].values[0]
shopTopItem[n]={'shop_id':shop,'item_id':item_id,'item_cnt':item_cnt}
n+=1
shopTopItemDF=pd.DataFrame(shopTopItem).transpose()
li=[]
for i in range(len(shopTopItemDF)):
if shopTopItemDF.loc[i]['item_id']==20949:
li.append('item_20949')
else:
li.append('others')
shopTopItemDF['category']=li
fig=px.bar(shopTopItemDF,x='shop_id',y='item_cnt',color='category', # if values in column category = 'item_20949' and 'others'
color_discrete_map={
'item_20949': 'yellow',
'others': 'violet'
})
fig.update_layout(plot_bgcolor='#fff',xaxis={'showline':True,'linewidth':1,'linecolor':'black'},yaxis={'showticklabels':False})
fig.show()
For almost all shops T-Shirts is the most sold category
yearDayDF=salesDF.groupby(['year','month'],as_index=False).agg({'item_cnt_day':'sum'})
fig=px.bar(yearDayDF,x='month',y='item_cnt_day',facet_row='year',title='Total sales for every month across years')
fig.show()
2015 has lower sales number compared with 2013 and 2014
yearMonthDayDF=salesDF.groupby(['year','month','date','shop_id'],as_index=False).agg({'item_cnt_day':'sum','DateModified':'min'})
shopStats=OrderedDict()
n=0
for i in yearMonthDayDF['shop_id'].unique():
mad=yearMonthDayDF[yearMonthDayDF['shop_id']==i]['item_cnt_day'].mad()
mean=yearMonthDayDF[yearMonthDayDF['shop_id']==i]['item_cnt_day'].mean()
max=yearMonthDayDF[yearMonthDayDF['shop_id']==i]['item_cnt_day'].max()
shopStats[n]={'shop_id':i,'mean':mean,'max':max,'mad':mad}
n+=1
shopStatsDF=pd.DataFrame(shopStats).transpose()
shopStatsDF['shop_id']=shopStatsDF['shop_id'].astype('str')
data=[go.Bar(x=shopStatsDF['shop_id'],y=shopStatsDF[i],name=i)for i in ['mean','mad']]
layout=go.Layout(title='Mean sales and Deviation in sales for every shop in the list',
margin=dict(l=0,r=0,b=0,t=30),
width=1500)
fig=go.Figure(data,layout)
fig.show()
Shops with higher sales are predominantly found in Moscow. Atrium Mall in Moscow(Shop id=25) has a good mean sales and lesser mean absolute deviation compared with others.
We have certain columns which should be encoded like cities,shops and categories. If we apply one hot encoding then there will be numeral columns. If we apply lable encoding the information may not be correctly represented.
So let's group the cities and categories based on their sales. For eg Moscow city sells the most number of products. Other cities lag behind. Let's create two categories as moscow and other_cities. Now it will be easy to encode.
dicCity=pd.cut(salesDF.groupby('city').agg({'item_cnt_day':'sum'})['item_cnt_day'],2,labels=['other_cities','moscow'])
shopsDF['city'].replace(dicCity,inplace=True)
salesDF['city'].replace(dicCity,inplace=True)
dicMainCategory=pd.cut(salesDF.groupby('main_category').agg({'item_cnt_day':'sum'})['item_cnt_day'],3,labels=['low','medium','high'])
itemsNewDF['main_category'].replace(dicMainCategory,inplace=True)
salesDF['main_category'].replace(dicMainCategory,inplace=True)
dicSubCategory=pd.cut(salesDF.groupby('sub_category').agg({'item_cnt_day':'sum'})['item_cnt_day'],2,labels=['medium','high'])
itemsNewDF['sub_category'].replace(dicSubCategory,inplace=True)
salesDF['sub_category'].replace(dicSubCategory,inplace=True)
dicCategory=pd.cut(salesDF.groupby('category').agg({'item_cnt_day':'sum'})['item_cnt_day'],3,labels=['low','medium','high'])
shopsDF['category'].replace(dicCategory,inplace=True)
salesDF['category'].replace(dicCategory,inplace=True)
for col in salesDF.select_dtypes('object').columns:
le=LabelEncoder()
salesDF[col]=le.fit_transform(salesDF[col])
if col in itemsNewDF.columns:
itemsNewDF[col]=le.fit_transform(itemsNewDF[col])
elif col in shopsDF.columns:
shopsDF[col]=le.fit_transform(shopsDF[col])
salesDF.head()
date | date_block_num | shop_id | item_id | item_price | item_cnt_day | item_category_id | main_category | sub_category | city | category | DateModified | year | month | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 0 | 59 | 22154 | 999.0 | 1.0 | 37 | 0 | 1 | 1 | 0 | 2013-01-02 | 2013 | 1 |
1 | 16 | 4 | 59 | 2573 | 249.0 | 1.0 | 55 | 2 | 0 | 1 | 0 | 2013-05-16 | 2013 | 5 |
2 | 26 | 0 | 59 | 2574 | 399.0 | 1.0 | 55 | 2 | 0 | 1 | 0 | 2013-01-26 | 2013 | 1 |
3 | 9 | 0 | 59 | 2574 | 399.0 | 1.0 | 55 | 2 | 0 | 1 | 0 | 2013-01-09 | 2013 | 1 |
4 | 24 | 1 | 59 | 2574 | 399.0 | 1.0 | 55 | 2 | 0 | 1 | 0 | 2013-02-24 | 2013 | 2 |
Now we want to make sales forecast for a particular item to be sold in a particular shop for the month of 2015 November. So we are creating a dataframe with all possible combinations of month,shop_id and item_id.
cols=['date_block_num', 'shop_id', 'item_id'],
matrix=[]
for i in range(34):
dup=salesDF[salesDF['date_block_num']==i]
matrix.append(np.array(list(product([i],dup['shop_id'].unique(),dup['item_id'].unique())), dtype = np.int16))
matrix=pd.DataFrame(np.vstack(matrix))
matrix.rename(columns={0:'date_block_num',1:'shop_id',2:'item_id'},inplace=True)
matrix["date_block_num"] = matrix["date_block_num"].astype(np.int8)
matrix["shop_id"] = matrix["shop_id"].astype(np.int8)
matrix["item_id"] = matrix["item_id"].astype(np.int16)
matrix.sort_values(by=['date_block_num', 'shop_id', 'item_id'],inplace=True)
matrix.reset_index(inplace=True)
matrix.drop('index',axis=1,inplace=True)
After creating the dataframe with month,shop_id and item_id we will merge this dataframe with the monthly sales of a item sold in a particular shop calculated from the sales dataframe
group=salesDF.groupby(["date_block_num", "shop_id", "item_id"],as_index=False).agg({'item_cnt_day':'sum'})
group.rename(columns={'item_cnt_day':'item_cnt_month'},inplace=True)
matrix=pd.merge(matrix,group,on=['date_block_num', 'shop_id', 'item_id'],how='left')
matrix['item_cnt_month']=matrix['item_cnt_month'].fillna(0).astype(np.float16)
matrix['item_cnt_month']=matrix['item_cnt_month'].clip(0,20)
group=salesDF.groupby(['date_block_num','shop_id','item_id'],as_index=False).agg({'item_price':'sum'})
group.rename(columns={'item_price':'item_price_month'},inplace=True)
matrix=pd.merge(matrix,group,on=['date_block_num','shop_id','item_id'],how='left')
matrix['item_price_month']=matrix['item_price_month'].fillna(0).astype(np.float16)
matrix.head()
date_block_num | shop_id | item_id | item_cnt_month | item_price_month | |
---|---|---|---|---|---|
0 | 0 | 2 | 19 | 0.0 | 0.0 |
1 | 0 | 2 | 27 | 1.0 | 2500.0 |
2 | 0 | 2 | 28 | 0.0 | 0.0 |
3 | 0 | 2 | 29 | 0.0 | 0.0 |
4 | 0 | 2 | 32 | 0.0 | 0.0 |
Lets concat the test dataframe to our matrix dataframe. November 2015 is indicated by date block num 34
testDF["date_block_num"] = 34
testDF["date_block_num"] = testDF["date_block_num"].astype(np.int8)
testDF["shop_id"] = testDF.shop_id.astype(np.int8)
testDF["item_id"] = testDF.item_id.astype(np.int16)
matrix=pd.concat([matrix,testDF.drop('ID',axis=1)],ignore_index=True,sort=False,keys=['date_block_num', 'shop_id', 'item_id'])
matrix.fillna(0,inplace=True)
matrix=pd.merge(matrix,itemsNewDF[['item_id','item_category_id','main_category','sub_category']],on='item_id',how='left')
matrix=pd.merge(matrix,shopsDF[['shop_id','city','category']],on='shop_id',how='left')
matrix.head()
date_block_num | shop_id | item_id | item_cnt_month | item_price_month | item_category_id | main_category | sub_category | city | category | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2 | 19 | 0.0 | 0.0 | 40 | 0 | 0 | 1 | 0 |
1 | 0 | 2 | 27 | 1.0 | 2500.0 | 19 | 0 | 1 | 1 | 0 |
2 | 0 | 2 | 28 | 0.0 | 0.0 | 30 | 0 | 0 | 1 | 0 |
3 | 0 | 2 | 29 | 0.0 | 0.0 | 23 | 0 | 1 | 1 | 0 |
4 | 0 | 2 | 32 | 0.0 | 0.0 | 40 | 0 | 0 | 1 | 0 |
import gc
del [salesDF,itemsDF,itemsCategoriesDF,itemsNewDF,shopsDF]
gc.collect()
Now we will create the lag columns
item_cnt_month_lag_1 indicates the no of items sold in that particular shop in the previous month and similary
item_cnt_month_lag_12 indicates the no of items sold in that particular shop 12 months before
df=matrix
for col in ['item_cnt_month']:
tmp = df[["date_block_num", "shop_id","item_id",col ]]
for i in [1,2,3,12]:
shifted = tmp.copy()
shifted.columns = ["date_block_num", "shop_id", "item_id", col + "_lag_"+str(i)]
shifted.date_block_num = shifted.date_block_num + i
df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')
matrix=df
group=matrix.groupby(['date_block_num'],as_index=False).agg({'item_cnt_month':'mean'}).rename(columns={'item_cnt_month':'avg_item_cnt_month'})
matrix = pd.merge(matrix, group, on = ["date_block_num"], how = "left")
matrix['avg_item_cnt_month']=matrix['avg_item_cnt_month'].astype(np.float16)
df=matrix[['date_block_num','shop_id','item_id','avg_item_cnt_month']]
shifted=df.copy()
shifted.columns=['date_block_num','shop_id','item_id','prev_month_avg_item_cnt']
shifted['date_block_num']=shifted['date_block_num']+1
matrix=pd.merge(matrix,shifted,on=['date_block_num','shop_id','item_id'],how='left')
matrix.drop('avg_item_cnt_month',axis=1,inplace=True)
group=matrix.groupby(['date_block_num','shop_id'],as_index=False).agg({'item_cnt_month':'mean'}).rename(columns={'item_cnt_month':'shop_avg_cnt_month'})
group['shop_avg_cnt_month']=group['shop_avg_cnt_month'].astype(np.float16)
matrix=pd.merge(matrix,group,on=['date_block_num','shop_id'],how='left')
df=matrix
tmp=df[['date_block_num', 'shop_id','item_id','shop_avg_cnt_month']]
for i in [1,2,3,12]:
shifted=tmp.copy()
shifted.columns=['date_block_num','shop_id','item_id','shop_avg_cnt_month_'+'_lag_'+str(i)]
shifted['date_block_num']=shifted['date_block_num']+i
df=pd.merge(df,shifted,on=['date_block_num','shop_id','item_id'],how='left')
matrix=df
matrix.drop('shop_avg_cnt_month',axis=1,inplace=True)
group=matrix.groupby(['date_block_num','item_id'],as_index=False).agg({'item_cnt_month':'mean'}).rename(columns={'item_cnt_month':'item_avg_cnt_month'})
group['item_avg_cnt_month']=group['item_avg_cnt_month'].astype(np.float16)
matrix=pd.merge(matrix,group,on=['date_block_num','item_id'],how='left')
df=matrix
tmp=df[['date_block_num', 'shop_id','item_id','item_avg_cnt_month']]
for i in [1,2,3,12]:
shifted=tmp.copy()
shifted.columns=['date_block_num','shop_id','item_id','item_avg_cnt_month_'+'_lag_'+str(i)]
shifted['date_block_num']=shifted['date_block_num']+i
df=pd.merge(df,shifted,on=['date_block_num','shop_id','item_id'],how='left')
matrix=df
matrix.drop('item_avg_cnt_month',axis=1,inplace=True)
import gc
del [group,tmp,shifted,df,dup]
gc.collect()
2678
group=matrix.groupby(['date_block_num','main_category'],as_index=False).agg({'item_cnt_month':'mean'}).rename(columns={'item_cnt_month':'item_category_avg_cnt_month'})
group['item_category_avg_cnt_month']=group['item_category_avg_cnt_month'].astype(np.float16)
matrix=pd.merge(matrix,group,on=['date_block_num','main_category'],how='left')
df=matrix
for col in ['item_category_avg_cnt_month']:
tmp=df[['date_block_num','shop_id','item_id',col]]
for i in [1,2,12]:
shifted=tmp.copy()
shifted.columns=['date_block_num','shop_id','item_id',col+'_lag_'+str(i)]
shifted['date_block_num']=shifted['date_block_num']+i
df=pd.merge(df,shifted,on=['date_block_num','shop_id','item_id'],how='left')
matrix=df
matrix.drop('item_category_avg_cnt_month',axis=1,inplace=True)
group=matrix.groupby(['date_block_num','item_id','main_category','sub_category'],as_index=False).agg({'item_cnt_month':'mean'}).rename(columns={'item_cnt_month':'item_categories_avg_cnt_month'})
group['item_categories_avg_cnt_month']=group['item_categories_avg_cnt_month'].astype(np.float16)
matrix=pd.merge(matrix,group,on=['date_block_num','item_id','main_category','sub_category'],how='left')
df=matrix
for col in ['item_categories_avg_cnt_month']:
tmp=df[['date_block_num','shop_id','item_id',col]]
for i in [1,2,3,12]:
shifted=tmp.copy()
shifted.columns=['date_block_num','shop_id','item_id',col+'_lag_'+str(i)]
shifted['date_block_num']=shifted['date_block_num']+i
df=pd.merge(df,shifted,on=['date_block_num','shop_id','item_id'],how='left')
matrix=df
matrix.drop('item_categories_avg_cnt_month',axis=1,inplace=True)
del [group,tmp,shifted,df]
gc.collect()
46
Since mostly the lag count for the first three months would be null, lets remove the information pertaining to those months from the dataframe.
matrix=matrix[matrix['date_block_num']>3]
##Filling nan's with zero
for col in matrix.columns:
if ('lag' in col) & (matrix[col].isnull().any()):
matrix[col].fillna(0,inplace=True)
matrix['prev_month_avg_item_cnt'].fillna(0,inplace=True)
After building the model and finding feature importance this 'item_price_month' was the most important feature. But this was causing overfitting problems and hence this column has been removed.
matrix.drop('item_price_month',axis=1,inplace=True)
x_train = matrix[matrix['date_block_num'] < 33].drop(['item_cnt_month'], axis=1)
y_train = matrix[matrix['date_block_num'] < 33]['item_cnt_month']
x_cv = matrix[(matrix['date_block_num'] == 33)].drop(['item_cnt_month'], axis=1)
y_cv = matrix[(matrix['date_block_num'] == 33)]['item_cnt_month']
x_test = matrix[matrix['date_block_num'] == 34].drop(['item_cnt_month'], axis=1)
Since we have a large dataframe lets randomly select 30,000 samples from each month for building our baseline model.
df2=matrix
df2=df2.drop(df2.index[0:],axis=0) ## We are duplicating the matrix and removing the rows and appending samples for each month
for i in x_train['date_block_num'].unique():
sampleDF=matrix[matrix['date_block_num']==i].sample(n=30000,replace=False)
df2=df2.append(sampleDF,ignore_index=True)
df2=df2.sample(frac=1)
df2.reset_index(inplace=True)
df2.drop('index',axis=1,inplace=True)
x_train_baseline=df2.drop('item_cnt_month',axis=1)
y_train_baseline=df2['item_cnt_month']
Initially we fit a simple model to our resampled data and make predictions.
Then we calculate the feature importance and consider the top 20 features for making predictions in our tuned model.
lgbmBaseline=lgb.LGBMRegressor(objective='regression',
boosting_type='gbdt',
n_estimators=500)
lgbmBaseline.fit(x_train_baseline,y_train_baseline)
lgbmBaselinePred=lgbmBaseline.predict(x_cv)
print(mean_squared_error(lgbmBaselinePred,y_cv))
#predLgbmTestBaseline=lgbmBaseline.predict(x_test)
0.8651736983635989
featureImportanceLgbmDF=pd.DataFrame(sorted(zip(lgbmBaseline.feature_importances_,x_train_baseline.columns)),columns=['values','features'])
featuresLgbm=list(featureImportanceLgbmDF[featureImportanceLgbmDF['values']>featureImportanceLgbmDF['values'].median()]['features'].values) ##selecting features above mean lgbm feature score
sns.barplot(y='features',x='values',data=featureImportanceLgbmDF[-20:])
#featuresLgbm=featureImportanceLgbmDF['features'][-15:].values
<AxesSubplot:xlabel='values', ylabel='features'>
x_train_lgbm=x_train[featuresLgbm]
x_cv_lgbm=x_cv[featuresLgbm]
x_test_lgbm=x_test[featuresLgbm]
#x_train_lgbm,x_test_lgbm,y_train_lgbm,y_test_lgbm=train_test_split(x_train,y_train,test_size=0.3)
def optimize(trial):
params={
'objective':'regression',
'num_leaves':trial.suggest_int('num_leaves',2,256),
'feature_fraction':trial.suggest_uniform('feature_fraction',0.4,1.0),
'boosting_type':'gbdt',
'reg_alpha':trial.suggest_uniform('alpha',1,10),
'reg_lambda':trial.suggest_uniform('lambda',1,10),
'learning_rate':trial.suggest_uniform('learning_rate',0.001,1),
'subsample':trial.suggest_uniform('subsample',0.5,1),
'n_estimators':trial.suggest_int('n_estimators',200,1000),
'min_data_in_leaf':trial.suggest_int('min_data_in_leaf',20,200)
}
model=lgb.LGBMRegressor(**params)
model.fit(x_train_lgbm,y_train,eval_set=[(x_cv_lgbm,y_cv)],early_stopping_rounds=30)
pred=model.predict(x_cv_lgbm)
rmse=mean_squared_error(pred,y_cv)
return rmse
study=optuna.create_study(direction='minimize')
study.optimize(optimize,n_trials=10)
print('Best Trail: ',study.best_trial.params)
[I 2022-06-02 08:52:54,108] A new study created in memory with name: no-name-fe0fc650-9b10-4d3a-b895-b77d227d68d5
[LightGBM] [Warning] feature_fraction is set=0.9102203178665861, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.9102203178665861 [LightGBM] [Warning] min_data_in_leaf is set=142, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=142 [1] valid_0's l2: 0.972403 Training until validation scores don't improve for 30 rounds [2] valid_0's l2: 0.970208 [3] valid_0's l2: 0.971736 [4] valid_0's l2: 0.965238 [5] valid_0's l2: 0.963101 [6] valid_0's l2: 0.971144 [7] valid_0's l2: 0.969602 [8] valid_0's l2: 0.970277 [9] valid_0's l2: 0.968177 [10] valid_0's l2: 0.973934 [11] valid_0's l2: 0.972653 [12] valid_0's l2: 0.994125 [13] valid_0's l2: 1.00178 [14] valid_0's l2: 1.00204 [15] valid_0's l2: 0.999461 [16] valid_0's l2: 0.996569 [17] valid_0's l2: 0.996582 [18] valid_0's l2: 0.99582 [19] valid_0's l2: 1.00349 [20] valid_0's l2: 1.00228 [21] valid_0's l2: 1.00329 [22] valid_0's l2: 1.02857 [23] valid_0's l2: 1.02858 [24] valid_0's l2: 1.03122 [25] valid_0's l2: 1.02977 [26] valid_0's l2: 1.02616 [27] valid_0's l2: 1.02711 [28] valid_0's l2: 1.02597 [29] valid_0's l2: 1.0799 [30] valid_0's l2: 1.07955 [31] valid_0's l2: 1.0807 [32] valid_0's l2: 1.08078 [33] valid_0's l2: 1.08005 [34] valid_0's l2: 1.07989
[I 2022-06-02 08:53:11,230] Trial 0 finished with value: 0.9631008540896839 and parameters: {'num_leaves': 224, 'feature_fraction': 0.9102203178665861, 'alpha': 3.996880924517897, 'lambda': 5.846674031958002, 'learning_rate': 0.866560353016595, 'subsample': 0.679832750946725, 'n_estimators': 562, 'min_data_in_leaf': 142}. Best is trial 0 with value: 0.9631008540896839.
[35] valid_0's l2: 1.07982 Early stopping, best iteration is: [5] valid_0's l2: 0.963101 [LightGBM] [Warning] feature_fraction is set=0.7454275976957052, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.7454275976957052 [LightGBM] [Warning] min_data_in_leaf is set=190, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=190 [1] valid_0's l2: 1.20191 Training until validation scores don't improve for 30 rounds [2] valid_0's l2: 1.13364 [3] valid_0's l2: 1.07946 [4] valid_0's l2: 1.0453 [5] valid_0's l2: 1.01273 [6] valid_0's l2: 0.996017 [7] valid_0's l2: 0.974551 [8] valid_0's l2: 0.954498 [9] valid_0's l2: 0.938911 [10] valid_0's l2: 0.925632 [11] valid_0's l2: 0.916351 [12] valid_0's l2: 0.908513 [13] valid_0's l2: 0.902315 [14] valid_0's l2: 0.897798 [15] valid_0's l2: 0.894903 [16] valid_0's l2: 0.891604 [17] valid_0's l2: 0.88681 [18] valid_0's l2: 0.887867 [19] valid_0's l2: 0.885622 [20] valid_0's l2: 0.883912 [21] valid_0's l2: 0.883232 [22] valid_0's l2: 0.886163 [23] valid_0's l2: 0.884985 [24] valid_0's l2: 0.882824 [25] valid_0's l2: 0.882499 [26] valid_0's l2: 0.881551 [27] valid_0's l2: 0.880901 [28] valid_0's l2: 0.880238 [29] valid_0's l2: 0.879559 [30] valid_0's l2: 0.878465 [31] valid_0's l2: 0.877818 [32] valid_0's l2: 0.877153 [33] valid_0's l2: 0.877071 [34] valid_0's l2: 0.878122 [35] valid_0's l2: 0.877775 [36] valid_0's l2: 0.877279 [37] valid_0's l2: 0.877827 [38] valid_0's l2: 0.877896 [39] valid_0's l2: 0.877795 [40] valid_0's l2: 0.87779 [41] valid_0's l2: 0.877617 [42] valid_0's l2: 0.877178 [43] valid_0's l2: 0.876903 [44] valid_0's l2: 0.877643 [45] valid_0's l2: 0.877991 [46] valid_0's l2: 0.877873 [47] valid_0's l2: 0.877622 [48] valid_0's l2: 0.876938 [49] valid_0's l2: 0.877701 [50] valid_0's l2: 0.877584 [51] valid_0's l2: 0.877389 [52] valid_0's l2: 0.876904 [53] valid_0's l2: 0.876699 [54] valid_0's l2: 0.876598 [55] valid_0's l2: 0.876105 [56] valid_0's l2: 0.875717 [57] valid_0's l2: 0.875908 [58] valid_0's l2: 0.875898 [59] valid_0's l2: 0.875681 [60] valid_0's l2: 0.876015 [61] valid_0's l2: 0.875548 [62] valid_0's l2: 0.875346 [63] valid_0's l2: 0.87506 [64] valid_0's l2: 0.880141 [65] valid_0's l2: 0.879919 [66] valid_0's l2: 0.881039 [67] valid_0's l2: 0.88091 [68] valid_0's l2: 0.881529 [69] valid_0's l2: 0.881162 [70] valid_0's l2: 0.88078 [71] valid_0's l2: 0.880762 [72] valid_0's l2: 0.881328 [73] valid_0's l2: 0.878195 [74] valid_0's l2: 0.880276 [75] valid_0's l2: 0.880079 [76] valid_0's l2: 0.87866 [77] valid_0's l2: 0.878194 [78] valid_0's l2: 0.877595 [79] valid_0's l2: 0.879907 [80] valid_0's l2: 0.880093 [81] valid_0's l2: 0.879935 [82] valid_0's l2: 0.879396 [83] valid_0's l2: 0.87951 [84] valid_0's l2: 0.879322 [85] valid_0's l2: 0.879314 [86] valid_0's l2: 0.879631 [87] valid_0's l2: 0.87958 [88] valid_0's l2: 0.879713 [89] valid_0's l2: 0.879575 [90] valid_0's l2: 0.879606 [91] valid_0's l2: 0.879552 [92] valid_0's l2: 0.883556 [93] valid_0's l2: 0.883389 Early stopping, best iteration is: [63] valid_0's l2: 0.87506
[I 2022-06-02 08:53:51,215] Trial 1 finished with value: 0.8750602362861951 and parameters: {'num_leaves': 130, 'feature_fraction': 0.7454275976957052, 'alpha': 6.359645121324666, 'lambda': 9.156457861179456, 'learning_rate': 0.10951433640280277, 'subsample': 0.7040815219507084, 'n_estimators': 717, 'min_data_in_leaf': 190}. Best is trial 1 with value: 0.8750602362861951.
[LightGBM] [Warning] feature_fraction is set=0.5292216805806753, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.5292216805806753 [LightGBM] [Warning] min_data_in_leaf is set=33, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=33 [1] valid_0's l2: 0.994705 Training until validation scores don't improve for 30 rounds [2] valid_0's l2: 1.03327 [3] valid_0's l2: 1.04375 [4] valid_0's l2: 1.02898 [5] valid_0's l2: 1.02363 [6] valid_0's l2: 1.02119 [7] valid_0's l2: 1.01709 [8] valid_0's l2: 1.0173 [9] valid_0's l2: 1.0124 [10] valid_0's l2: 1.03684 [11] valid_0's l2: 1.03823 [12] valid_0's l2: 1.04329 [13] valid_0's l2: 1.04429 [14] valid_0's l2: 1.04665 [15] valid_0's l2: 1.05028 [16] valid_0's l2: 1.04802 [17] valid_0's l2: 1.04818 [18] valid_0's l2: 1.04513 [19] valid_0's l2: 1.04551 [20] valid_0's l2: 1.047 [21] valid_0's l2: 1.04714 [22] valid_0's l2: 1.04755 [23] valid_0's l2: 1.04735 [24] valid_0's l2: 1.03443 [25] valid_0's l2: 1.0344 [26] valid_0's l2: 1.03599 [27] valid_0's l2: 1.03583 [28] valid_0's l2: 1.03595 [29] valid_0's l2: 1.02366 [30] valid_0's l2: 1.02204
[I 2022-06-02 08:54:09,751] Trial 2 finished with value: 0.9947045280640638 and parameters: {'num_leaves': 249, 'feature_fraction': 0.5292216805806753, 'alpha': 1.9336963912339238, 'lambda': 5.25865683651941, 'learning_rate': 0.9215607548187419, 'subsample': 0.9408304893605361, 'n_estimators': 723, 'min_data_in_leaf': 33}. Best is trial 1 with value: 0.8750602362861951.
[31] valid_0's l2: 1.02144 Early stopping, best iteration is: [1] valid_0's l2: 0.994705 [LightGBM] [Warning] feature_fraction is set=0.5968579579775428, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.5968579579775428 [LightGBM] [Warning] min_data_in_leaf is set=98, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=98 [1] valid_0's l2: 0.977172 Training until validation scores don't improve for 30 rounds [2] valid_0's l2: 0.968616 [3] valid_0's l2: 0.968485 [4] valid_0's l2: 0.962483 [5] valid_0's l2: 0.960139 [6] valid_0's l2: 0.962693 [7] valid_0's l2: 0.96082 [8] valid_0's l2: 0.974497 [9] valid_0's l2: 0.973253 [10] valid_0's l2: 0.972131 [11] valid_0's l2: 0.968583 [12] valid_0's l2: 0.974582 [13] valid_0's l2: 0.973634 [14] valid_0's l2: 0.971765 [15] valid_0's l2: 0.975925 [16] valid_0's l2: 0.978866 [17] valid_0's l2: 0.984165 [18] valid_0's l2: 0.986456 [19] valid_0's l2: 0.972289 [20] valid_0's l2: 0.964962 [21] valid_0's l2: 0.964349 [22] valid_0's l2: 0.969214 [23] valid_0's l2: 0.968586 [24] valid_0's l2: 0.96902 [25] valid_0's l2: 0.967084 [26] valid_0's l2: 0.966492 [27] valid_0's l2: 0.968832 [28] valid_0's l2: 0.953124 [29] valid_0's l2: 0.953323 [30] valid_0's l2: 0.95341 [31] valid_0's l2: 0.947037 [32] valid_0's l2: 0.946965 [33] valid_0's l2: 0.9487 [34] valid_0's l2: 0.955344 [35] valid_0's l2: 0.953638 [36] valid_0's l2: 0.954892 [37] valid_0's l2: 0.953625 [38] valid_0's l2: 0.956807 [39] valid_0's l2: 0.955462 [40] valid_0's l2: 0.956294 [41] valid_0's l2: 0.956595 [42] valid_0's l2: 0.957393 [43] valid_0's l2: 0.956647 [44] valid_0's l2: 0.956619 [45] valid_0's l2: 0.955511 [46] valid_0's l2: 0.95917 [47] valid_0's l2: 0.959216 [48] valid_0's l2: 0.959095 [49] valid_0's l2: 0.959876 [50] valid_0's l2: 0.960512 [51] valid_0's l2: 0.960421 [52] valid_0's l2: 0.959837 [53] valid_0's l2: 0.959778 [54] valid_0's l2: 0.959788 [55] valid_0's l2: 0.95774 [56] valid_0's l2: 0.95787 [57] valid_0's l2: 0.957937 [58] valid_0's l2: 0.956977 [59] valid_0's l2: 0.957054 [60] valid_0's l2: 0.957617 [61] valid_0's l2: 0.95911
[I 2022-06-02 08:54:33,228] Trial 3 finished with value: 0.9469650557340977 and parameters: {'num_leaves': 120, 'feature_fraction': 0.5968579579775428, 'alpha': 3.713861555731498, 'lambda': 5.493641685961967, 'learning_rate': 0.7758498892437286, 'subsample': 0.7498461572632822, 'n_estimators': 209, 'min_data_in_leaf': 98}. Best is trial 1 with value: 0.8750602362861951.
[62] valid_0's l2: 0.959375 Early stopping, best iteration is: [32] valid_0's l2: 0.946965 [LightGBM] [Warning] feature_fraction is set=0.9540490660433252, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.9540490660433252 [LightGBM] [Warning] min_data_in_leaf is set=107, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=107 [1] valid_0's l2: 0.956844 Training until validation scores don't improve for 30 rounds [2] valid_0's l2: 0.908076 [3] valid_0's l2: 0.905727 [4] valid_0's l2: 0.899157 [5] valid_0's l2: 0.891824 [6] valid_0's l2: 0.891818 [7] valid_0's l2: 0.918196 [8] valid_0's l2: 0.917572 [9] valid_0's l2: 0.915554 [10] valid_0's l2: 0.916916 [11] valid_0's l2: 0.912245 [12] valid_0's l2: 0.912917 [13] valid_0's l2: 0.911858 [14] valid_0's l2: 0.917131 [15] valid_0's l2: 1.02226 [16] valid_0's l2: 1.02176 [17] valid_0's l2: 1.01579 [18] valid_0's l2: 1.01453 [19] valid_0's l2: 1.01504 [20] valid_0's l2: 1.01473 [21] valid_0's l2: 1.01396 [22] valid_0's l2: 1.01312 [23] valid_0's l2: 1.00959 [24] valid_0's l2: 1.01473 [25] valid_0's l2: 1.02351 [26] valid_0's l2: 1.02351 [27] valid_0's l2: 1.02221 [28] valid_0's l2: 1.02213 [29] valid_0's l2: 1.0179 [30] valid_0's l2: 1.01748 [31] valid_0's l2: 1.01722 [32] valid_0's l2: 1.01752 [33] valid_0's l2: 1.01791 [34] valid_0's l2: 1.01805 [35] valid_0's l2: 1.01752
[I 2022-06-02 08:54:48,589] Trial 4 finished with value: 0.8918175226515949 and parameters: {'num_leaves': 108, 'feature_fraction': 0.9540490660433252, 'alpha': 3.5610786666805136, 'lambda': 5.8145704357205386, 'learning_rate': 0.6010592409499158, 'subsample': 0.5405191216830405, 'n_estimators': 427, 'min_data_in_leaf': 107}. Best is trial 1 with value: 0.8750602362861951.
[36] valid_0's l2: 1.01818 Early stopping, best iteration is: [6] valid_0's l2: 0.891818 [LightGBM] [Warning] feature_fraction is set=0.5428808129882927, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.5428808129882927 [LightGBM] [Warning] min_data_in_leaf is set=96, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=96 [1] valid_0's l2: 0.997663 Training until validation scores don't improve for 30 rounds [2] valid_0's l2: 0.951455 [3] valid_0's l2: 0.950096 [4] valid_0's l2: 0.936599 [5] valid_0's l2: 0.933746 [6] valid_0's l2: 0.926375 [7] valid_0's l2: 0.927686 [8] valid_0's l2: 0.929771 [9] valid_0's l2: 0.933441 [10] valid_0's l2: 0.935014 [11] valid_0's l2: 0.929679 [12] valid_0's l2: 0.930278 [13] valid_0's l2: 0.930305 [14] valid_0's l2: 0.955169 [15] valid_0's l2: 0.956218 [16] valid_0's l2: 0.956158 [17] valid_0's l2: 0.960795 [18] valid_0's l2: 0.9591 [19] valid_0's l2: 0.958362 [20] valid_0's l2: 0.955822 [21] valid_0's l2: 0.954689 [22] valid_0's l2: 0.959224 [23] valid_0's l2: 0.955752 [24] valid_0's l2: 0.949572 [25] valid_0's l2: 0.948977 [26] valid_0's l2: 0.947841 [27] valid_0's l2: 0.950124 [28] valid_0's l2: 0.952002 [29] valid_0's l2: 0.952266 [30] valid_0's l2: 0.953804 [31] valid_0's l2: 0.952892 [32] valid_0's l2: 0.955718 [33] valid_0's l2: 0.958202 [34] valid_0's l2: 0.95917 [35] valid_0's l2: 0.958653
[I 2022-06-02 08:55:05,130] Trial 5 finished with value: 0.9263750057958984 and parameters: {'num_leaves': 113, 'feature_fraction': 0.5428808129882927, 'alpha': 8.890006558560103, 'lambda': 5.880046000468369, 'learning_rate': 0.6146925872258462, 'subsample': 0.5878835922740436, 'n_estimators': 365, 'min_data_in_leaf': 96}. Best is trial 1 with value: 0.8750602362861951.
[36] valid_0's l2: 0.960647 Early stopping, best iteration is: [6] valid_0's l2: 0.926375 [LightGBM] [Warning] feature_fraction is set=0.5507256392112124, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.5507256392112124 [LightGBM] [Warning] min_data_in_leaf is set=36, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=36 [1] valid_0's l2: 1.11862 Training until validation scores don't improve for 30 rounds [2] valid_0's l2: 1.03633 [3] valid_0's l2: 0.98926 [4] valid_0's l2: 0.948704 [5] valid_0's l2: 0.913819 [6] valid_0's l2: 0.907927 [7] valid_0's l2: 0.897297 [8] valid_0's l2: 0.90089 [9] valid_0's l2: 0.896666 [10] valid_0's l2: 0.896928 [11] valid_0's l2: 0.894733 [12] valid_0's l2: 0.894425 [13] valid_0's l2: 0.895255 [14] valid_0's l2: 0.896618 [15] valid_0's l2: 0.900647 [16] valid_0's l2: 0.900429 [17] valid_0's l2: 0.906069 [18] valid_0's l2: 0.913742 [19] valid_0's l2: 0.913784 [20] valid_0's l2: 0.911953 [21] valid_0's l2: 0.910463 [22] valid_0's l2: 0.910561 [23] valid_0's l2: 0.907841 [24] valid_0's l2: 0.899767 [25] valid_0's l2: 0.898715 [26] valid_0's l2: 0.898331 [27] valid_0's l2: 0.898069 [28] valid_0's l2: 0.88843 [29] valid_0's l2: 0.886885 [30] valid_0's l2: 0.889828 [31] valid_0's l2: 0.889422 [32] valid_0's l2: 0.890094 [33] valid_0's l2: 0.889215 [34] valid_0's l2: 0.88932 [35] valid_0's l2: 0.889541 [36] valid_0's l2: 0.889347 [37] valid_0's l2: 0.892003 [38] valid_0's l2: 0.892299 [39] valid_0's l2: 0.892395 [40] valid_0's l2: 0.892559 [41] valid_0's l2: 0.892581 [42] valid_0's l2: 0.892435 [43] valid_0's l2: 0.892227 [44] valid_0's l2: 0.891296 [45] valid_0's l2: 0.891623 [46] valid_0's l2: 0.892049 [47] valid_0's l2: 0.8848 [48] valid_0's l2: 0.884241 [49] valid_0's l2: 0.885107 [50] valid_0's l2: 0.885575 [51] valid_0's l2: 0.885916 [52] valid_0's l2: 0.884838 [53] valid_0's l2: 0.883696 [54] valid_0's l2: 0.883686 [55] valid_0's l2: 0.8842 [56] valid_0's l2: 0.884869 [57] valid_0's l2: 0.89721 [58] valid_0's l2: 0.897202 [59] valid_0's l2: 0.897345 [60] valid_0's l2: 0.898385 [61] valid_0's l2: 0.89819 [62] valid_0's l2: 0.896712 [63] valid_0's l2: 0.897214 [64] valid_0's l2: 0.897251 [65] valid_0's l2: 0.897605 [66] valid_0's l2: 0.897492 [67] valid_0's l2: 0.896538 [68] valid_0's l2: 0.895725 [69] valid_0's l2: 0.895803 [70] valid_0's l2: 0.895801 [71] valid_0's l2: 0.895773 [72] valid_0's l2: 0.89558 [73] valid_0's l2: 0.899212 [74] valid_0's l2: 0.899088 [75] valid_0's l2: 0.898686 [76] valid_0's l2: 0.899097 [77] valid_0's l2: 0.899118 [78] valid_0's l2: 0.899217 [79] valid_0's l2: 0.899793 [80] valid_0's l2: 0.899737 [81] valid_0's l2: 0.899725 [82] valid_0's l2: 0.89669 [83] valid_0's l2: 0.896477 [84] valid_0's l2: 0.89364 Early stopping, best iteration is: [54] valid_0's l2: 0.883686
[I 2022-06-02 08:55:37,869] Trial 6 finished with value: 0.8836858063014148 and parameters: {'num_leaves': 163, 'feature_fraction': 0.5507256392112124, 'alpha': 8.644303544761698, 'lambda': 6.960281615141806, 'learning_rate': 0.2708795447705174, 'subsample': 0.7183056898540603, 'n_estimators': 582, 'min_data_in_leaf': 36}. Best is trial 1 with value: 0.8750602362861951.
[LightGBM] [Warning] feature_fraction is set=0.7448570099238562, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.7448570099238562 [LightGBM] [Warning] min_data_in_leaf is set=45, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=45 [1] valid_0's l2: 1.2433 Training until validation scores don't improve for 30 rounds [2] valid_0's l2: 1.20132 [3] valid_0's l2: 1.16491 [4] valid_0's l2: 1.13849 [5] valid_0's l2: 1.1118 [6] valid_0's l2: 1.09744 [7] valid_0's l2: 1.07575 [8] valid_0's l2: 1.05217 [9] valid_0's l2: 1.03338 [10] valid_0's l2: 1.01529 [11] valid_0's l2: 0.998577 [12] valid_0's l2: 0.985407 [13] valid_0's l2: 0.975082 [14] valid_0's l2: 0.965678 [15] valid_0's l2: 0.954899 [16] valid_0's l2: 0.947618 [17] valid_0's l2: 0.940121 [18] valid_0's l2: 0.931668 [19] valid_0's l2: 0.925681 [20] valid_0's l2: 0.919718 [21] valid_0's l2: 0.916203 [22] valid_0's l2: 0.912432 [23] valid_0's l2: 0.910084 [24] valid_0's l2: 0.907362 [25] valid_0's l2: 0.904067 [26] valid_0's l2: 0.900833 [27] valid_0's l2: 0.898751 [28] valid_0's l2: 0.897557 [29] valid_0's l2: 0.896674 [30] valid_0's l2: 0.895816 [31] valid_0's l2: 0.894482 [32] valid_0's l2: 0.893516 [33] valid_0's l2: 0.894734 [34] valid_0's l2: 0.896185 [35] valid_0's l2: 0.894735 [36] valid_0's l2: 0.893623 [37] valid_0's l2: 0.892855 [38] valid_0's l2: 0.892146 [39] valid_0's l2: 0.891281 [40] valid_0's l2: 0.890494 [41] valid_0's l2: 0.891792 [42] valid_0's l2: 0.892643 [43] valid_0's l2: 0.891323 [44] valid_0's l2: 0.89074 [45] valid_0's l2: 0.889883 [46] valid_0's l2: 0.889401 [47] valid_0's l2: 0.889085 [48] valid_0's l2: 0.888296 [49] valid_0's l2: 0.887193 [50] valid_0's l2: 0.886631 [51] valid_0's l2: 0.886172 [52] valid_0's l2: 0.885887 [53] valid_0's l2: 0.885599 [54] valid_0's l2: 0.884993 [55] valid_0's l2: 0.88517 [56] valid_0's l2: 0.884857 [57] valid_0's l2: 0.884633 [58] valid_0's l2: 0.884359 [59] valid_0's l2: 0.885482 [60] valid_0's l2: 0.886019 [61] valid_0's l2: 0.885781 [62] valid_0's l2: 0.885649 [63] valid_0's l2: 0.888665 [64] valid_0's l2: 0.889175 [65] valid_0's l2: 0.888784 [66] valid_0's l2: 0.888209 [67] valid_0's l2: 0.888202 [68] valid_0's l2: 0.887754 [69] valid_0's l2: 0.889866 [70] valid_0's l2: 0.89027 [71] valid_0's l2: 0.890769 [72] valid_0's l2: 0.890608 [73] valid_0's l2: 0.890579 [74] valid_0's l2: 0.890808 [75] valid_0's l2: 0.890755 [76] valid_0's l2: 0.891072 [77] valid_0's l2: 0.891056 [78] valid_0's l2: 0.891317 [79] valid_0's l2: 0.89115 [80] valid_0's l2: 0.891353 [81] valid_0's l2: 0.891168 [82] valid_0's l2: 0.890899 [83] valid_0's l2: 0.890678 [84] valid_0's l2: 0.89096 [85] valid_0's l2: 0.890853 [86] valid_0's l2: 0.890848 [87] valid_0's l2: 0.890624 [88] valid_0's l2: 0.890272 Early stopping, best iteration is: [58] valid_0's l2: 0.884359
[I 2022-06-02 08:56:15,538] Trial 7 finished with value: 0.8843588366874461 and parameters: {'num_leaves': 108, 'feature_fraction': 0.7448570099238562, 'alpha': 1.601471298791404, 'lambda': 8.737681215894316, 'learning_rate': 0.05664995661757699, 'subsample': 0.7694103785509377, 'n_estimators': 710, 'min_data_in_leaf': 45}. Best is trial 1 with value: 0.8750602362861951.
[LightGBM] [Warning] feature_fraction is set=0.4686694022827363, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.4686694022827363 [LightGBM] [Warning] min_data_in_leaf is set=68, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=68 [1] valid_0's l2: 1.043 Training until validation scores don't improve for 30 rounds [2] valid_0's l2: 0.962792 [3] valid_0's l2: 0.952292 [4] valid_0's l2: 0.926211 [5] valid_0's l2: 0.90971 [6] valid_0's l2: 0.908147 [7] valid_0's l2: 0.903789 [8] valid_0's l2: 0.936951 [9] valid_0's l2: 0.928376 [10] valid_0's l2: 0.930711 [11] valid_0's l2: 0.929748 [12] valid_0's l2: 0.933541 [13] valid_0's l2: 0.935972 [14] valid_0's l2: 0.937134 [15] valid_0's l2: 0.941869 [16] valid_0's l2: 0.944118 [17] valid_0's l2: 0.943211 [18] valid_0's l2: 0.944274 [19] valid_0's l2: 0.971482 [20] valid_0's l2: 0.971616 [21] valid_0's l2: 0.971617 [22] valid_0's l2: 0.975361 [23] valid_0's l2: 0.976784 [24] valid_0's l2: 0.976603 [25] valid_0's l2: 0.9759 [26] valid_0's l2: 0.975514 [27] valid_0's l2: 0.98026 [28] valid_0's l2: 0.97556 [29] valid_0's l2: 0.977003 [30] valid_0's l2: 0.977489 [31] valid_0's l2: 0.977896 [32] valid_0's l2: 0.978525 [33] valid_0's l2: 0.97719 [34] valid_0's l2: 0.975269 [35] valid_0's l2: 0.975342 [36] valid_0's l2: 0.972563
[I 2022-06-02 08:56:33,561] Trial 8 finished with value: 0.9037889391854311 and parameters: {'num_leaves': 171, 'feature_fraction': 0.4686694022827363, 'alpha': 4.195235883477049, 'lambda': 9.83212388030402, 'learning_rate': 0.44667951908786035, 'subsample': 0.8803492246911766, 'n_estimators': 248, 'min_data_in_leaf': 68}. Best is trial 1 with value: 0.8750602362861951.
[37] valid_0's l2: 0.970698 Early stopping, best iteration is: [7] valid_0's l2: 0.903789 [LightGBM] [Warning] feature_fraction is set=0.8090043739141928, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8090043739141928 [LightGBM] [Warning] min_data_in_leaf is set=194, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=194 [1] valid_0's l2: 1.21717 Training until validation scores don't improve for 30 rounds [2] valid_0's l2: 1.15741 [3] valid_0's l2: 1.10954 [4] valid_0's l2: 1.07509 [5] valid_0's l2: 1.04315 [6] valid_0's l2: 1.02376 [7] valid_0's l2: 1.0028 [8] valid_0's l2: 0.980399 [9] valid_0's l2: 0.963368 [10] valid_0's l2: 0.947227 [11] valid_0's l2: 0.93414 [12] valid_0's l2: 0.925416 [13] valid_0's l2: 0.917877 [14] valid_0's l2: 0.91103 [15] valid_0's l2: 0.905008 [16] valid_0's l2: 0.900753 [17] valid_0's l2: 0.895205 [18] valid_0's l2: 0.891027 [19] valid_0's l2: 0.88722 [20] valid_0's l2: 0.884073 [21] valid_0's l2: 0.882542 [22] valid_0's l2: 0.881883 [23] valid_0's l2: 0.88096 [24] valid_0's l2: 0.882051 [25] valid_0's l2: 0.881686 [26] valid_0's l2: 0.879948 [27] valid_0's l2: 0.878743 [28] valid_0's l2: 0.878873 [29] valid_0's l2: 0.877722 [30] valid_0's l2: 0.877031 [31] valid_0's l2: 0.873893 [32] valid_0's l2: 0.8731 [33] valid_0's l2: 0.874335 [34] valid_0's l2: 0.875042 [35] valid_0's l2: 0.874569 [36] valid_0's l2: 0.873649 [37] valid_0's l2: 0.87346 [38] valid_0's l2: 0.873781 [39] valid_0's l2: 0.873296 [40] valid_0's l2: 0.872856 [41] valid_0's l2: 0.872614 [42] valid_0's l2: 0.872431 [43] valid_0's l2: 0.872191 [44] valid_0's l2: 0.870676 [45] valid_0's l2: 0.86997 [46] valid_0's l2: 0.869711 [47] valid_0's l2: 0.869313 [48] valid_0's l2: 0.869326 [49] valid_0's l2: 0.868823 [50] valid_0's l2: 0.868629 [51] valid_0's l2: 0.869576 [52] valid_0's l2: 0.869271 [53] valid_0's l2: 0.86949 [54] valid_0's l2: 0.869616 [55] valid_0's l2: 0.869614 [56] valid_0's l2: 0.869421 [57] valid_0's l2: 0.87098 [58] valid_0's l2: 0.874341 [59] valid_0's l2: 0.874316 [60] valid_0's l2: 0.874328 [61] valid_0's l2: 0.87537 [62] valid_0's l2: 0.875101 [63] valid_0's l2: 0.875502 [64] valid_0's l2: 0.876237 [65] valid_0's l2: 0.875942 [66] valid_0's l2: 0.877103 [67] valid_0's l2: 0.877183 [68] valid_0's l2: 0.877146 [69] valid_0's l2: 0.877895 [70] valid_0's l2: 0.877841 [71] valid_0's l2: 0.878101 [72] valid_0's l2: 0.877957 [73] valid_0's l2: 0.877726 [74] valid_0's l2: 0.877647 [75] valid_0's l2: 0.877268 [76] valid_0's l2: 0.877483 [77] valid_0's l2: 0.877425 [78] valid_0's l2: 0.876827 [79] valid_0's l2: 0.876873 [80] valid_0's l2: 0.876733 Early stopping, best iteration is: [50] valid_0's l2: 0.868629
[I 2022-06-02 08:57:10,286] Trial 9 finished with value: 0.8686287665692258 and parameters: {'num_leaves': 167, 'feature_fraction': 0.8090043739141928, 'alpha': 7.530721686244508, 'lambda': 4.184295547505293, 'learning_rate': 0.08630422292011555, 'subsample': 0.9145396974096048, 'n_estimators': 447, 'min_data_in_leaf': 194}. Best is trial 9 with value: 0.8686287665692258.
Best Trail: {'num_leaves': 167, 'feature_fraction': 0.8090043739141928, 'alpha': 7.530721686244508, 'lambda': 4.184295547505293, 'learning_rate': 0.08630422292011555, 'subsample': 0.9145396974096048, 'n_estimators': 447, 'min_data_in_leaf': 194}
lgbrEstimator=lgb.LGBMRegressor(**study.best_trial.params)
lgbrEstimator.fit(x_train_lgbm,y_train)
predLgbmCV=lgbrEstimator.predict(x_cv_lgbm)
print(mean_squared_error(predLgbmCV,y_cv))
predLgbTest=lgbrEstimator.predict(x_test_lgbm)
predLgbTest=predLgbTest.clip(0,20)
[LightGBM] [Warning] lambda_l2 is set with lambda=4.184295547505293, reg_lambda=0.0 will be ignored. Current value: lambda_l2=4.184295547505293 [LightGBM] [Warning] feature_fraction is set=0.8090043739141928, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8090043739141928 [LightGBM] [Warning] min_data_in_leaf is set=194, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=194 0.8568278991960587
SHAP values (SHapley Additive exPlanations) is used to explain the predictions made by your model.
SHAP shows the contribution or the importance of each feature on the prediction of the model, it does not evaluate the quality of the prediction itself.
import shap
explainer(data) ---> Returns data,base_values,shap_values
data - The original data
base_values - The expected value of the target or the mean target value from the data
shap_values - SHAP values for each sample
explainer.shap_values ---> Returns only the shap values
shap_values=shap.TreeExplainer(lgbrEstimator).shap_values(x_cv_lgbm)
shap.summary_plot(shap_values,x_cv_lgbm,plot_type='bar')
From the above plot it evident that item_avg_cnt_month_lag,item_cnt_month_lag,shop_av_cnt_month_lag features have better predicting power compared with other variables. This prediction is also meaningful because previous month's sales is a good indicator for sales in forecoming months.
Another point to be noted is sales one month before(i.e last month) is more detrimental compared with other variables
The below plot is similar to a summary bar plot but this is more interpretable.
shap.summary_plot(shap_values,x_cv_lgbm,feature_names=x_cv_lgbm.columns)
The first 3 features indicates that higher items sold in the previous month means more items would be sold in the coming months.
The plot indicates that lower the item id better would be the average sales. We can confirm this conclusion from pdp plots.
from pdpbox import info_plots
fig=info_plots.target_plot(df=matrix,feature='item_id',feature_name='item_id',target='item_cnt_month')
fig[0]
Lower item id's have better average item count per month confirming our inference from shap plot.
info_plots.actual_plot(lgbrEstimator,x_cv_lgbm,feature='item_id',feature_name='item_id');
A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
The pattern of predictions is maintained i.e compared with other item id's 2708-4838 item id's have more items sold. But the predicted count varies significantly from actual count.
salesA=salesDF[(salesDF['item_id']>2708)&(salesDF['item_id']<7917)]['item_cnt_day'].sum()
salesB=salesDF[(salesDF['item_id']<2708)|(salesDF['item_id']>7917)]['item_cnt_day'].sum()
salesValues=[salesA,salesB]
salesNames=['item ids 2708-7917','other item ids']
fig=px.pie(names=salesNames,values=salesValues,title='Item ids 2708-7917 account for 33% of sales')
fig.update_layout(margin=dict(l=0,r=0,b=0),showlegend=False)
fig.update_traces(textinfo='label+percent')
fig.show()
ax1=info_plots.target_plot(df=df2[(df2['date_block_num']>=10) & (df2['date_block_num']<19)],feature='date_block_num',feature_name='month',target='item_cnt_month')
ax2=info_plots.target_plot(df=df2[(df2['date_block_num']>=19) & (df2['date_block_num']<28)],feature='date_block_num',feature_name='month',target='item_cnt_month')
ax2=info_plots.target_plot(df=df2[df2['date_block_num']>=28],feature='date_block_num',feature_name='month',target='item_cnt_month')
xgbBaseline=xgb.XGBRegressor(objective='reg:squaredlogerror',
eval_metric='rmse',
booster='gbtree',
n_estimators=200)
xgbBaseline.fit(x_train_baseline,y_train_baseline)
predXgbBaseline=xgbBaseline.predict(x_cv)
print(mean_squared_error(predXgbBaseline,y_cv))
0.89054406
featureImportanceXgbDF=pd.DataFrame(sorted(zip(xgbBaseline.feature_importances_,x_train_baseline.columns)),columns=['values','features'])
featuresXgb=featureImportanceXgbDF[featureImportanceXgbDF['values']>=featureImportanceXgbDF['values'].median()]['features']
sns.barplot(x='values',y='features',data=featureImportanceXgbDF[-15:])
<AxesSubplot:xlabel='values', ylabel='features'>
x_train_xgb=x_train[featuresXgb]
x_cv_xgb=x_cv[featuresXgb]
x_test_xgb=x_test[featuresXgb]
def optimize(trial):
params={
'objective':'reg:squaredlogerror',
'eval_metric':'rmse',
'booster':'gbtree',
'alpha':trial.suggest_loguniform('alpha',1,20),
'lambda':trial.suggest_loguniform('lambda',1,20),
'max_depth':trial.suggest_int('max_depth',1,10),
'learning_rate':trial.suggest_uniform('learning_rate',0.001,1),
'subsample':trial.suggest_uniform('subsample',0.5,1),
'n_estimators':trial.suggest_int('n_estimators',10,300)
}
model=xgb.XGBRegressor(**params)
model.fit(x_train_xgb,y_train,eval_set=[(x_cv_xgb,y_cv)],early_stopping_rounds=30)
pred=model.predict(x_cv_xgb)
rmse=mean_squared_error(pred,y_cv)
return rmse
study=optuna.create_study(direction='minimize')
study.optimize(optimize,n_trials=10)
print(study.best_trial.params)
xgbBestParams=study.best_trial.params
xgbBestParams.update({'objective':'reg:squaredlogerror',
'eval_metric':'rmse',
'booster':'gbtree'})
#xgbReg=xgb.XGBRegressor(**study.best_trial.params)
xgbReg=xgb.XGBRegressor(**xgbBestParams)
xgbReg.fit(x_train_xgb,y_train)
predXgbCV=xgbReg.predict(x_cv_xgb)
print(mean_squared_error(predXgbCV,y_cv))
predXgbTest=xgbReg.predict(x_test_xgb)
predXgbTest=predXgbTest.clip(0,20)
0.9194968
catboostBaseline=CatBoostRegressor(eval_metric='RMSE',
loss_function='RMSE',
iterations=500)
catboostBaseline.fit(x_train_baseline,y_train_baseline.astype('float'),early_stopping_rounds=50,verbose=50)
predCatBoostBaseline=xgbBaseline.predict(x_cv)
print(mean_squared_error(predCatBoostBaseline,y_cv))
Learning rate set to 0.209585 0: learn: 1.1236477 total: 118ms remaining: 59.1s 50: learn: 0.8915859 total: 3.97s remaining: 35s 100: learn: 0.8722246 total: 7.66s remaining: 30.3s 150: learn: 0.8596855 total: 11.7s remaining: 27.1s 200: learn: 0.8500771 total: 15.4s remaining: 22.9s 250: learn: 0.8416014 total: 19.1s remaining: 19s 300: learn: 0.8335103 total: 23s remaining: 15.2s 350: learn: 0.8260545 total: 26.8s remaining: 11.4s 400: learn: 0.8202651 total: 30.6s remaining: 7.56s 450: learn: 0.8137624 total: 34.4s remaining: 3.73s 499: learn: 0.8088961 total: 38.1s remaining: 0us 0.89054406
featureImportanceCatBoostDF=pd.DataFrame(sorted(zip(catboostBaseline.feature_importances_,x_train_baseline.columns)),columns=['values','features'])
featuresCatBoost=featureImportanceCatBoostDF[featureImportanceCatBoostDF['values']>=featureImportanceCatBoostDF['values'].median()]['features']
sns.barplot(x='values',y='features',data=featureImportanceCatBoostDF[-15:])
<AxesSubplot:xlabel='values', ylabel='features'>
x_train_cat=x_train[featuresCatBoost]
x_cv_cat=x_cv[featuresCatBoost]
x_test_cat=x_test[featuresCatBoost]
def optimize(trial):
params={
'depth': trial.suggest_int('depth', 4, 8),
'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',10,100),
'eval_metric':'RMSE',
'loss_function':'RMSE',
'l2_leaf_reg': trial.suggest_int('l2_leaf_reg',5,20),
'colsample_bylevel':trial.suggest_uniform('colsample_bylevel',0.5,1),
'learning_rate':trial.suggest_uniform('learning_rate',0.0001,1),
'subsample':trial.suggest_uniform('subsample',0.5,1),
'iterations':trial.suggest_int('iterations',100,1000)
}
model=CatBoostRegressor(**params)
model.fit(x_train_cat,y_train.astype('float'),eval_set=[(x_cv_cat,y_cv.astype('float'))],early_stopping_rounds=30,verbose=100)
pred=model.predict(x_cv_cat)
rmse=mean_squared_error(pred,y_cv.astype('float'))
return rmse
study=optuna.create_study(direction='minimize')
study.optimize(optimize,n_trials=10)
print(study.best_trial.params)
[I 2022-06-02 11:46:04,402] A new study created in memory with name: no-name-f2c29064-982e-4ffd-920f-e321ecfbf787
0: learn: 1.1029261 test: 1.0570314 best: 1.0570314 (0) total: 891ms remaining: 13m 49s 100: learn: 0.8687744 test: 0.9290252 best: 0.9277017 (73) total: 1m 28s remaining: 12m 4s Stopped by overfitting detector (30 iterations wait) bestTest = 0.927701692 bestIteration = 73 Shrink model to first 74 iterations.
[I 2022-06-02 11:48:08,787] Trial 0 finished with value: 0.8606304292551291 and parameters: {'depth': 8, 'min_data_in_leaf': 19, 'l2_leaf_reg': 13, 'colsample_bylevel': 0.5378079736103758, 'learning_rate': 0.2862666355245815, 'subsample': 0.866195462513664, 'iterations': 932}. Best is trial 0 with value: 0.8606304292551291.
0: learn: 1.0178522 test: 1.0058259 best: 1.0058259 (0) total: 570ms remaining: 4m 2s Stopped by overfitting detector (30 iterations wait) bestTest = 0.9477510699 bestIteration = 44 Shrink model to first 45 iterations.
[I 2022-06-02 11:49:19,893] Trial 1 finished with value: 0.8982320904308819 and parameters: {'depth': 4, 'min_data_in_leaf': 18, 'l2_leaf_reg': 16, 'colsample_bylevel': 0.7141512855989429, 'learning_rate': 0.7080253023373682, 'subsample': 0.5382960589664838, 'iterations': 427}. Best is trial 0 with value: 0.8606304292551291.
0: learn: 1.0050272 test: 1.0039275 best: 1.0039275 (0) total: 577ms remaining: 1m 22s 100: learn: 0.8930705 test: 0.9633663 best: 0.9424132 (70) total: 51.7s remaining: 22s Stopped by overfitting detector (30 iterations wait) bestTest = 0.9424132232 bestIteration = 70 Shrink model to first 71 iterations.
[I 2022-06-02 11:50:45,978] Trial 2 finished with value: 0.8881426832370868 and parameters: {'depth': 4, 'min_data_in_leaf': 19, 'l2_leaf_reg': 18, 'colsample_bylevel': 0.8027415898193161, 'learning_rate': 0.837994053759473, 'subsample': 0.5726931086646337, 'iterations': 144}. Best is trial 0 with value: 0.8606304292551291.
0: learn: 1.0863506 test: 1.0466450 best: 1.0466450 (0) total: 803ms remaining: 3m 59s Stopped by overfitting detector (30 iterations wait) bestTest = 0.9378474207 bestIteration = 52 Shrink model to first 53 iterations.
[I 2022-06-02 11:52:25,028] Trial 3 finished with value: 0.8795577846037965 and parameters: {'depth': 7, 'min_data_in_leaf': 67, 'l2_leaf_reg': 7, 'colsample_bylevel': 0.5246033321273523, 'learning_rate': 0.34355209068575415, 'subsample': 0.8021534800608656, 'iterations': 299}. Best is trial 0 with value: 0.8606304292551291.
0: learn: 1.1275177 test: 1.0747990 best: 1.0747990 (0) total: 804ms remaining: 8m 53s 100: learn: 0.8921188 test: 0.9386004 best: 0.9357526 (93) total: 1m 15s remaining: 7m 3s Stopped by overfitting detector (30 iterations wait) bestTest = 0.9357526101 bestIteration = 93 Shrink model to first 94 iterations.
[I 2022-06-02 11:54:33,083] Trial 4 finished with value: 0.8756329472555682 and parameters: {'depth': 6, 'min_data_in_leaf': 74, 'l2_leaf_reg': 15, 'colsample_bylevel': 0.7912345907866245, 'learning_rate': 0.22689618198240935, 'subsample': 0.924005802770854, 'iterations': 665}. Best is trial 0 with value: 0.8606304292551291.
0: learn: 1.0108997 test: 1.0008224 best: 1.0008224 (0) total: 716ms remaining: 3m 15s Stopped by overfitting detector (30 iterations wait) bestTest = 0.9448269553 bestIteration = 39 Shrink model to first 40 iterations.
[I 2022-06-02 11:55:48,194] Trial 5 finished with value: 0.8926979755477428 and parameters: {'depth': 5, 'min_data_in_leaf': 100, 'l2_leaf_reg': 5, 'colsample_bylevel': 0.7446539822463221, 'learning_rate': 0.6947668463100837, 'subsample': 0.6161920101568504, 'iterations': 274}. Best is trial 0 with value: 0.8606304292551291.
0: learn: 1.0269441 test: 1.0111692 best: 1.0111692 (0) total: 674ms remaining: 1m 22s Stopped by overfitting detector (30 iterations wait) bestTest = 0.939723861 bestIteration = 47 Shrink model to first 48 iterations.
[I 2022-06-02 11:57:10,716] Trial 6 finished with value: 0.8830809348565399 and parameters: {'depth': 5, 'min_data_in_leaf': 26, 'l2_leaf_reg': 10, 'colsample_bylevel': 0.878917270514938, 'learning_rate': 0.6039769722579861, 'subsample': 0.7713409817301802, 'iterations': 123}. Best is trial 0 with value: 0.8606304292551291.
0: learn: 0.9828832 test: 0.9857288 best: 0.9857288 (0) total: 1.12s remaining: 8m 14s Stopped by overfitting detector (30 iterations wait) bestTest = 0.934415575 bestIteration = 26 Shrink model to first 27 iterations.
[I 2022-06-02 11:58:33,509] Trial 7 finished with value: 0.8731324667712639 and parameters: {'depth': 8, 'min_data_in_leaf': 98, 'l2_leaf_reg': 9, 'colsample_bylevel': 0.8299261304989676, 'learning_rate': 0.7911262009004196, 'subsample': 0.8309398836396251, 'iterations': 444}. Best is trial 0 with value: 0.8606304292551291.
0: learn: 1.0596660 test: 1.0300854 best: 1.0300854 (0) total: 734ms remaining: 2m 45s Stopped by overfitting detector (30 iterations wait) bestTest = 0.9429523361 bestIteration = 33 Shrink model to first 34 iterations.
[I 2022-06-02 11:59:51,542] Trial 8 finished with value: 0.8891591081610171 and parameters: {'depth': 6, 'min_data_in_leaf': 51, 'l2_leaf_reg': 7, 'colsample_bylevel': 0.8307576661697187, 'learning_rate': 0.44239201381919957, 'subsample': 0.7697952150088694, 'iterations': 227}. Best is trial 0 with value: 0.8606304292551291.
0: learn: 1.1567741 test: 1.0958238 best: 1.0958238 (0) total: 763ms remaining: 10m 54s 100: learn: 0.8920460 test: 0.9351082 best: 0.9344029 (90) total: 1m 16s remaining: 9m 30s Stopped by overfitting detector (30 iterations wait) bestTest = 0.9338687148 bestIteration = 115 Shrink model to first 116 iterations.
[I 2022-06-02 12:02:15,520] Trial 9 finished with value: 0.8721107764498629 and parameters: {'depth': 7, 'min_data_in_leaf': 51, 'l2_leaf_reg': 7, 'colsample_bylevel': 0.9735331717447835, 'learning_rate': 0.14349366648996406, 'subsample': 0.8482159777091005, 'iterations': 858}. Best is trial 0 with value: 0.8606304292551291.
{'depth': 8, 'min_data_in_leaf': 19, 'l2_leaf_reg': 13, 'colsample_bylevel': 0.5378079736103758, 'learning_rate': 0.2862666355245815, 'subsample': 0.866195462513664, 'iterations': 932}
catBestParams=study.best_trial.params
catBestParams.update({
'eval_metric':'RMSE',
'loss_function':'RMSE'})
catBoostReg=CatBoostRegressor(**catBestParams)
catBoostReg.fit(x_train_cat,y_train.astype('float'),eval_set=[(x_cv_cat,y_cv.astype('float'))],early_stopping_rounds=50,verbose=50)
predCatCV=catBoostReg.predict(x_cv_cat)
print('MEAN SQUARED ERROR OF VALIDATION SET FOR CATBOOST REGRESSOR ',mean_squared_error(predCatCV,y_cv))
predCatTest=catBoostReg.predict(x_test)
predCatTest=predCatTest.clip(0,20)
0: learn: 1.1029261 test: 1.0570314 best: 1.0570314 (0) total: 1.23s remaining: 19m 6s 50: learn: 0.8880053 test: 0.9343844 best: 0.9329967 (49) total: 44.7s remaining: 12m 52s 100: learn: 0.8687744 test: 0.9290252 best: 0.9277017 (73) total: 1m 28s remaining: 12m 8s Stopped by overfitting detector (50 iterations wait) bestTest = 0.927701692 bestIteration = 73 Shrink model to first 74 iterations. MEAN SQUARED ERROR OF VALIDATION SET FOR CATBOOST REGRESSOR 0.8606304292551291
#predLgb=predLgb.clip(0,20)
predFinal1=[]
for i in predLgbTest:
predFinal1.append(np.math.floor(i))
# predXgbr=predXgbr.clip(0,20)
predFinal2=[]
for i in predXgbTest:
predFinal2.append(np.math.floor(i))
predFinal3=[]
for i in predCatTest:
predFinal3.append(np.math.floor(i))
predictions=[]
for i in range(len(predFinal1)):
val=0.3*predFinal1[i]+0.3*predFinal2[i]+0.4*predFinal3[i]
predictions.append(int(val))
submissionDF=pd.DataFrame(testDF['ID'],columns=['ID'])
submissionDF['item_cnt_month']=predictions
submissionDF.to_csv('result.csv',index=False)
The RMSE for the testing set is 0.92