Notebook

In [1]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler,Normalizer,LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error
from scipy import stats
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import AdaBoostRegressor,RandomForestRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb
import optuna
import optuna.integration.lightgbm as lgb2
import optuna.integration.xgboost as xgb2
from keras.models import Sequential
from keras.layers import Dense,Dropout,Input,AlphaDropout
from keras.regularizers import Regularizer,l2 
from collections import OrderedDict
from itertools import product
import re
#import hvplot.pandas
%matplotlib inline
import plotly.offline as pyo
pyo.init_notebook_mode()

Objective¶

We are provided with daily historical sales data. The task is to forecast the total amount of products sold in every shop for the test set.

In [2]:

salesDF=pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv')
itemsDF=pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')
itemsCategoriesDF=pd.read_csv('../input/competitive-data-science-predict-future-sales/item_categories.csv')
shopsDF=pd.read_csv('../input/competitive-data-science-predict-future-sales/shops.csv')
testDF=pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')

In [3]:

salesDF.head()

Out[3]:

	date	shop_id	item_id	item_price	item_cnt_day
0	02.01.2013	59	22154	999.00	1.0
1	03.01.2013	25	2552	899.00	1.0
2	05.01.2013	25	2552	899.00	-1.0
3	06.01.2013	25	2554	1709.05	1.0
4	15.01.2013	25	2555	1099.00	1.0

In [4]:

# Cleaning shops data
# Якутск Орджоникидзе, 56
salesDF.loc[salesDF.shop_id == 0, 'shop_id'] = 57
testDF.loc[testDF.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
salesDF.loc[salesDF.shop_id == 1, 'shop_id'] = 58
testDF.loc[testDF.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
salesDF.loc[salesDF.shop_id == 10, 'shop_id'] = 11
testDF.loc[testDF.shop_id == 10, 'shop_id'] = 11

Here on analysing the shop names, we can find that the shop names encompass the city name and the category.

For eg: shop id 2 - Адыгея ТЦ "Мега" - Adygea mega shopping mall(English translation)

Here Adygea is the name of the city in Russia

In [5]:

shopsDF['city']=shopsDF['shop_name'].apply(lambda x: x.split(" ")[0])
shopsDF['category']=shopsDF['shop_name'].apply(lambda x: x.split(" ")[1])

##Each SHOP_ID' in this list belongs to separate category and we use 'other' to encompass them.
for i in [0,6,9,20,21,22,46,55,57,10,11]:
    shopsDF.loc[shopsDF['shop_id']==i,'category']='other'
    
    
shopsDF.loc[shopsDF['city']=='!Якутск','city']='Якутск'

# le=LabelEncoder()
# shopsDF['city']=le.fit_transform(shopsDF['city'])
# shopsDF['category']=le.fit_transform(shopsDF['category'])

When we explore item category names we have:

For eg : item_category_id 0 - PC Гарнитуры/Наушники - PC Headset/Headphones

Hence we can separate this feature as 2 columns as main category and sub category

In [6]:

##Splits item_category name into main category and sub category
categories=[]
sub_categories=[]

for i in itemsCategoriesDF['item_category_name'].unique():
    if len(list(i.split(" - ")))==1:
        categories.append(i.split(" - ")[0])
        sub_categories.append('other')
    else:
        categories.append(i.split(" - ")[0])
        sub_categories.append(i.split(" - ")[1])
        
itemsCategoriesDF['main_category']=categories
itemsCategoriesDF['sub_category']=sub_categories

# le=LabelEncoder()
# itemsCategoriesDF['main_category']=le.fit_transform(itemsCategoriesDF['main_category'])
# itemsCategoriesDF['sub_category']=le.fit_transform(itemsCategoriesDF['sub_category'])

Let the items categories dataframe after splitting features be merged with the items dataframe

In [7]:

itemsNewDF=pd.merge(itemsDF,itemsCategoriesDF[['item_category_id','main_category','sub_category']],on='item_category_id',how='inner')

In [8]:

salesDF=pd.merge(salesDF,itemsNewDF[['item_id','item_category_id','main_category','sub_category']],on='item_id',how='inner')
salesDF=pd.merge(salesDF,shopsDF[['shop_id','city','category']],on='shop_id',how='inner')

Lets remove the outliers

In [9]:

fig=plt.figure(figsize=(10,5))
gs=fig.add_gridspec(1,2)
ax1=fig.add_subplot(gs[0,0])
img=sns.boxplot(x=salesDF['item_cnt_day'],ax=ax1)
ax2=fig.add_subplot(gs[0,1])
img=sns.boxplot(x=salesDF['item_price'],ax=ax2)

In [10]:

pd.cut(salesDF['item_price'],10).value_counts()

Out[10]:

(-308.981, 30797.1]     2935632
(30797.1, 61595.2]          216
(277181.9, 307980.0]          1
(61595.2, 92393.3]            0
(92393.3, 123191.4]           0
(123191.4, 153989.5]          0
(153989.5, 184787.6]          0
(184787.6, 215585.7]          0
(215585.7, 246383.8]          0
(246383.8, 277181.9]          0
Name: item_price, dtype: int64

In [11]:

salesDF.drop(salesDF[(salesDF['item_cnt_day']>200)&(salesDF['item_cnt_day']<0)].index,axis=0,inplace=True)
salesDF.drop(salesDF[salesDF['item_price']>30797].index,axis=0,inplace=True)

In [12]:

##Splits date into day,month,year format
salesDF['DateModified']=pd.to_datetime(salesDF['date'],format='%d.%m.%Y')
salesDF['year']=salesDF['DateModified'].dt.year
salesDF['month']=salesDF['DateModified'].dt.month
salesDF['date']=salesDF['DateModified'].dt.day

Extensive EDA¶

In [13]:

shopNamesDict={shopsDF['shop_id'][i]:shopsDF['shop_name'][i] for i in range(len(shopsDF))}
shopWiseSalesDF=salesDF.groupby('shop_id').agg({'item_cnt_day':'sum','item_price':'sum'})
shopWiseSalesDF=shopWiseSalesDF.sort_values(by='item_price',ascending=False)
shopWiseSalesDF.reset_index(inplace=True)
shopWiseSalesDF['item_price']=shopWiseSalesDF['item_price'].astype(float)/1000000   
shopWiseSalesDF=shopWiseSalesDF.replace({'shop_id':shopNamesDict})
fig=plt.figure(figsize=(15,5))
img=sns.barplot(data=shopWiseSalesDF[:10],y='shop_id',x='item_price',palette='rainbow')
img.set(xticklabels=[])
img.tick_params(bottom=False,left=False)
img.set_ylabel('Shop Names',fontweight='bold',fontfamily='sans-serif',fontsize=15)
img.set_xlabel('Total price of products sold across all years',fontweight='bold',fontfamily='sans-serif',fontsize=15)
img.text(5,-1,'Top 10 shops with highest sales(in terms of price)',fontfamily='sans-serif',fontweight='bold',fontsize=20)
for i in range(10):
    img.text(1,i,str(round(shopWiseSalesDF['item_price'][i],0))+' Million',fontfamily='sans-serif',fontsize=12,fontweight='medium')
sns.despine(left=True,bottom=True,right=True,top=True)

In [14]:

itemWiseSalesDF=salesDF.groupby(['item_id'],as_index=False).agg({'item_price':'sum'}).sort_values(by='item_price',ascending=False).reset_index()
itemWiseSalesDF.drop('index',axis=1,inplace=True)
#itemWiseSalesDF['item_price']=itemWiseSalesDF['item_price'].astype('float')/1000000
itemWiseSalesDF=pd.merge(itemWiseSalesDF[:10],itemsDF[['item_id','item_name']],on='item_id',how='inner')

labels=['Sony PS4 500GB','Sony PS4 Kit 500GB','GTA V PS3','GTA V XBOX360','PS Store Top-Up Wallet']

fig=px.bar(itemWiseSalesDF[:5],y='item_name',x='item_price',orientation='h')
fig.update_layout(plot_bgcolor='#fff',
                  title='Top 5 Highest selling products',
                  yaxis=dict(showline=True,linecolor='black',tickvals=[0,1,2,3,4],ticktext=labels))
fig.show()

In [15]:

## Total items sold in every month across the 3 years
salesYearDF=salesDF.groupby(['year','month']).agg({'item_cnt_day':'sum'})
salesYearDF.reset_index(inplace=True)
months = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug',
          9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
salesYearDF=salesYearDF.replace({'month':months})
plt.figure(figsize=(15,5))
sns.set_context("notebook")
sns.set_style('white')
img=sns.lineplot(data=salesYearDF,x='month',y='item_cnt_day',hue='year')
img.set_xlabel('Month',fontsize=15,fontweight='bold',color='black',fontfamily='sans-serif')
img.set_ylabel('Items sold per month',fontsize=15,fontweight='bold',color='black',fontfamily='sans-serif')
sns.despine(right=True,top=True)

In both 2013 and 2014, there has been significant increase in sales during the months of November and December. This may be attributed due to Christmas season

In [17]:

fig=px.bar(salesDF.groupby('city',as_index=False).agg({'item_cnt_day':'sum'}).sort_values('item_cnt_day',ascending=False),
           x='city',
           y='item_cnt_day',
          title='City wise sales')
fig.update_layout(plot_bgcolor='#fff')
fig.show()

The below function groups the dataframe based on city and identifies the top 5 categories in each city and represented in terms of percentage

In [18]:

cityWiseSalesDF=salesDF.groupby(['city','main_category']).agg({'item_cnt_day':'sum'}).sort_values('item_cnt_day',ascending=False).reset_index()
cityWiseSalesDF=cityWiseSalesDF.groupby('city').head(5)
cityWiseSalesDF['percentage']=cityWiseSalesDF.groupby('city')['item_cnt_day'].transform(lambda x:round(x*100/x.sum(),2))
fig=px.bar(cityWiseSalesDF,x='city',y='percentage',color='main_category',title='Top 5 categories sold in each city')
fig.update_layout(margin=dict(l=0,r=0,b=0),
                 showlegend=False)
fig.show()

Even though Moscow is the city with highest sales some categories like cinema bluray cd's and dvd's are comparatively sold more in Yakutsk city

Delivery of goods(Доставка товара) as a category is found only in the city of Интернет-магазин and accounts for 30% of sales.

PC games and movies accountfor the majority of sale in almost all cities

In [19]:

salesPartitionDict={2013:{'Items Sold Before November':0,'Items Sold After November':0},2014:{'Items Sold Before November':0,'Items Sold After November':0}}

for year in [2013,2014]:
    salesPartitionDict[year]['Items Sold After November']=salesYearDF[(salesYearDF['month'].isin(['Nov','Dec'])) & (salesYearDF['year']==year)]['item_cnt_day'].mean()
    salesPartitionDict[year]['Items Sold Before November']=salesYearDF[(-salesYearDF['month'].isin(['Nov','Dec'])) & (salesYearDF['year']==year)]['item_cnt_day'].mean()
    
salesPartitionDF=pd.DataFrame(salesPartitionDict).transpose()
salesPartitionDF.reset_index(inplace=True)
salesPartitionDF.rename(columns={'index':'year'},inplace=True)

data=[go.Bar(x=salesPartitionDF['year'],y=salesPartitionDF[i],name=i)for i in salesPartitionDF.columns[1:]]
layout=go.Layout(title='Items sold for every year before and after November')
fig=go.Figure(data=data,layout=layout)
fig.update_layout(
    xaxis=dict(
        showline=True,
        showgrid=False,
        linecolor='rgb(204, 204, 204)',
        linewidth=3,
        tickmode='linear',
        dtick=1
    ),
    yaxis=dict(
        showgrid=False,
        zeroline=False,
        showline=True,
        showticklabels=True,
        linecolor='rgb(204, 204, 204)',
        linewidth=3,
    ),
    plot_bgcolor='white'
)
fig.show()

In [20]:

dateBlockNumMonth={0:'Jan 2013',1:'Feb 2013',2:'Mar 2013',3:'Apr 2013',4:'May 2013',5:'Jun 2013',6:'Jul 2013',7:'Aug 2013',8:'Sep 2013',9:'Oct 2013',10:'Nov 2013',11:'Dec 2013',
                  12:'Jan 2014',13:'Feb 2014',14:'Mar 2014',15:'Apr 2014',16:'May 2014',17:'Jun 2014',18:'Jul 2014',19:'Aug 2014',20:'Sep 2014',21:'Oct 2014',22:'Nov 2014',23:'Dec 2014',
                  24:'Jan 2015',25:'Feb 2015',26:'Mar 2015',27:'Apr 2015',28:'May 2015',29:'Jun 2015',30:'Jul 2015',31:'Aug 2015',32:'Sep 2015',33:'Oct 2015'}
a=salesDF.groupby(['date_block_num','item_id'],as_index=False).agg({'item_cnt_day':'sum'})
monthlyHighestDict=OrderedDict()
n=0
for i in range(34):
    b=a[a['date_block_num']==i]
    b=b.sort_values(by='item_cnt_day',ascending=False)[:5]
    b.reset_index(inplace=True)
    for j in range(5):
        #item_name=itemsDF[itemsDF['item_id']==b.loc[j]['item_id']]['item_name'].values[0]
        monthlyHighestDict[n]={'month':dateBlockNumMonth[i],'item_name':str(b.loc[j]['item_id']),'count':b.loc[j]['item_cnt_day']}
        n+=1

In [21]:

fig=plt.figure(figsize=(20,20))
gs=fig.add_gridspec(5,7)
monthlyHighestDF=pd.DataFrame(monthlyHighestDict).transpose()
n=0
for i in range(5):
    for j in range(7):
        if n==34:
            break
        df=monthlyHighestDF[monthlyHighestDF['month']==dateBlockNumMonth[n]]
        ax=fig.add_subplot(gs[i,j])
        img=sns.barplot(data=df,y='item_name',x='count',palette='spring',orient='h',ax=ax)
        ax.set_title(dateBlockNumMonth[n])
        img.set(xlabel=None,ylabel=None,xticks=[])
        for s in ['top','right','left']:
            ax.spines[s].set_visible(False)
        ax.tick_params(left=False)
        n=n+1
    
plt.tight_layout()

The mostly sold products for all months is Branded package T-shirt 1C White,Diablo III(Game), Battlefield IV, Grand Theft Auto V (GTA V), FIFA 14, Sims 4. Most of them are games for PC,XBOX and PS3.

For eg: Battlefield 4 has been one of the top sold products in Nov 2013,Dec 2013, Jan 2014 and Feb 2014. The release date of Battlefield 4 is 29th October 2013.

Similarly GTA V was released on 17th September 2013 and it was top sold products in Sep,Oct,Dec 2013 and Jan 2014. We would be wondering what happened in Nov 2013 and it was ocupied by Battlefield 4. Nov 2013 also saw higher sales of Assassin's creed 4 and Call of Duty Ghosts. But its sales faded in subsequent months.

In [22]:

plt.figure(figsize=(15,6))
categoriesSalesDF=salesDF.groupby(['item_category_id'],as_index=False).agg({'item_cnt_day':'sum'})
categoriesSalesDF.reset_index(inplace=True)
categoriesSalesDF=categoriesSalesDF.sort_values(by='item_cnt_day',ascending=False)
categoriesSalesDF=pd.merge(categoriesSalesDF,itemsCategoriesDF[['item_category_name','item_category_id']],on='item_category_id',how='inner')
fig=px.bar(categoriesSalesDF[:5],x='item_category_name',y='item_cnt_day',title='Most sold categories')
fig.update_layout(plot_bgcolor='#fff',xaxis=dict(showline=True,linecolor='rgb(204, 204, 204)',linewidth=3),
                  yaxis=dict(showticklabels=False))
fig.show()

<Figure size 1080x432 with 0 Axes>

Games CD's, Movies CD's, Music CD's are the most sold categories

In [23]:

shopItemSalesDF=salesDF.groupby(['shop_id','item_id'],as_index=False).agg({'item_cnt_day':'sum'})
shopItemSalesDF.rename(columns={'item_cnt_day':'item_cnt'},inplace=True)
shopItemSalesDF['item_cnt']=shopItemSalesDF['item_cnt'].astype(np.int16)
shopItemSalesDF.sort_values(by='item_cnt',ascending=False,inplace=True)

In [24]:

shopTopItem=OrderedDict()
n=0
for shop in shopItemSalesDF['shop_id'].unique():
    shop_id=shop
    item_id=shopItemSalesDF[shopItemSalesDF['shop_id']==shop][:1]['item_id'].values[0]
    item_cnt=shopItemSalesDF[shopItemSalesDF['shop_id']==shop][:1]['item_cnt'].values[0]
    shopTopItem[n]={'shop_id':shop,'item_id':item_id,'item_cnt':item_cnt}
    n+=1
    
shopTopItemDF=pd.DataFrame(shopTopItem).transpose()

li=[]
for i in range(len(shopTopItemDF)):
    if shopTopItemDF.loc[i]['item_id']==20949:
        li.append('item_20949')
    else:
        li.append('others')
shopTopItemDF['category']=li

In [25]:

fig=px.bar(shopTopItemDF,x='shop_id',y='item_cnt',color='category', # if values in column category = 'item_20949' and 'others'
    color_discrete_map={
        'item_20949': 'yellow',
        'others': 'violet'
    })
fig.update_layout(plot_bgcolor='#fff',xaxis={'showline':True,'linewidth':1,'linecolor':'black'},yaxis={'showticklabels':False})
fig.show()

For almost all shops T-Shirts is the most sold category

In [26]:

yearDayDF=salesDF.groupby(['year','month'],as_index=False).agg({'item_cnt_day':'sum'})
fig=px.bar(yearDayDF,x='month',y='item_cnt_day',facet_row='year',title='Total sales for every month across years')
fig.show()

2015 has lower sales number compared with 2013 and 2014

In [27]:

yearMonthDayDF=salesDF.groupby(['year','month','date','shop_id'],as_index=False).agg({'item_cnt_day':'sum','DateModified':'min'})

shopStats=OrderedDict()
n=0
for i in yearMonthDayDF['shop_id'].unique():
    mad=yearMonthDayDF[yearMonthDayDF['shop_id']==i]['item_cnt_day'].mad()
    mean=yearMonthDayDF[yearMonthDayDF['shop_id']==i]['item_cnt_day'].mean()
    max=yearMonthDayDF[yearMonthDayDF['shop_id']==i]['item_cnt_day'].max()
    shopStats[n]={'shop_id':i,'mean':mean,'max':max,'mad':mad}
    n+=1
    
shopStatsDF=pd.DataFrame(shopStats).transpose()
shopStatsDF['shop_id']=shopStatsDF['shop_id'].astype('str')
data=[go.Bar(x=shopStatsDF['shop_id'],y=shopStatsDF[i],name=i)for i in ['mean','mad']]
layout=go.Layout(title='Mean sales and Deviation in sales for every shop in the list',
                 margin=dict(l=0,r=0,b=0,t=30),
                 width=1500)
fig=go.Figure(data,layout)
fig.show()

Shops with higher sales are predominantly found in Moscow. Atrium Mall in Moscow(Shop id=25) has a good mean sales and lesser mean absolute deviation compared with others.

Categorical Encoding¶

We have certain columns which should be encoded like cities,shops and categories. If we apply one hot encoding then there will be numeral columns. If we apply lable encoding the information may not be correctly represented.

So let's group the cities and categories based on their sales. For eg Moscow city sells the most number of products. Other cities lag behind. Let's create two categories as moscow and other_cities. Now it will be easy to encode.

In [28]:

dicCity=pd.cut(salesDF.groupby('city').agg({'item_cnt_day':'sum'})['item_cnt_day'],2,labels=['other_cities','moscow'])
shopsDF['city'].replace(dicCity,inplace=True)
salesDF['city'].replace(dicCity,inplace=True)

In [29]:

dicMainCategory=pd.cut(salesDF.groupby('main_category').agg({'item_cnt_day':'sum'})['item_cnt_day'],3,labels=['low','medium','high'])
itemsNewDF['main_category'].replace(dicMainCategory,inplace=True)
salesDF['main_category'].replace(dicMainCategory,inplace=True)

In [30]:

dicSubCategory=pd.cut(salesDF.groupby('sub_category').agg({'item_cnt_day':'sum'})['item_cnt_day'],2,labels=['medium','high'])
itemsNewDF['sub_category'].replace(dicSubCategory,inplace=True)
salesDF['sub_category'].replace(dicSubCategory,inplace=True)

In [31]:

dicCategory=pd.cut(salesDF.groupby('category').agg({'item_cnt_day':'sum'})['item_cnt_day'],3,labels=['low','medium','high'])
shopsDF['category'].replace(dicCategory,inplace=True)
salesDF['category'].replace(dicCategory,inplace=True)

In [32]:

for col in salesDF.select_dtypes('object').columns:
    le=LabelEncoder()
    salesDF[col]=le.fit_transform(salesDF[col])
    if col in itemsNewDF.columns:
        itemsNewDF[col]=le.fit_transform(itemsNewDF[col])
    elif col in shopsDF.columns:
        shopsDF[col]=le.fit_transform(shopsDF[col])

In [33]:

salesDF.head()

Out[33]:

	date	date_block_num	shop_id	item_id	item_price	item_cnt_day	item_category_id	main_category	sub_category	city	DateModified	year	month
0	2	0	59	22154	999.0	1.0	37	0	1	1	2013-01-02	2013	1
1	16	4	59	2573	249.0	1.0	55	2	0	1	2013-05-16	2013	5
2	26	0	59	2574	399.0	1.0	55	2	0	1	2013-01-26	2013	1
3	9	0	59	2574	399.0	1.0	55	2	0	1	2013-01-09	2013	1
4	24	1	59	2574	399.0	1.0	55	2	0	1	2013-02-24	2013	2

Feature Engineering¶

Now we want to make sales forecast for a particular item to be sold in a particular shop for the month of 2015 November. So we are creating a dataframe with all possible combinations of month,shop_id and item_id.

In [34]:

cols=['date_block_num', 'shop_id', 'item_id'],
matrix=[]
for i in range(34):
    dup=salesDF[salesDF['date_block_num']==i]
    matrix.append(np.array(list(product([i],dup['shop_id'].unique(),dup['item_id'].unique())), dtype = np.int16))
    
matrix=pd.DataFrame(np.vstack(matrix))
matrix.rename(columns={0:'date_block_num',1:'shop_id',2:'item_id'},inplace=True)

matrix["date_block_num"] = matrix["date_block_num"].astype(np.int8)
matrix["shop_id"] = matrix["shop_id"].astype(np.int8)
matrix["item_id"] = matrix["item_id"].astype(np.int16)

matrix.sort_values(by=['date_block_num', 'shop_id', 'item_id'],inplace=True)
matrix.reset_index(inplace=True)
matrix.drop('index',axis=1,inplace=True)

After creating the dataframe with month,shop_id and item_id we will merge this dataframe with the monthly sales of a item sold in a particular shop calculated from the sales dataframe

In [35]:

group=salesDF.groupby(["date_block_num", "shop_id", "item_id"],as_index=False).agg({'item_cnt_day':'sum'})
group.rename(columns={'item_cnt_day':'item_cnt_month'},inplace=True)
matrix=pd.merge(matrix,group,on=['date_block_num', 'shop_id', 'item_id'],how='left')
matrix['item_cnt_month']=matrix['item_cnt_month'].fillna(0).astype(np.float16)
matrix['item_cnt_month']=matrix['item_cnt_month'].clip(0,20)

In [36]:

group=salesDF.groupby(['date_block_num','shop_id','item_id'],as_index=False).agg({'item_price':'sum'})
group.rename(columns={'item_price':'item_price_month'},inplace=True)
matrix=pd.merge(matrix,group,on=['date_block_num','shop_id','item_id'],how='left')
matrix['item_price_month']=matrix['item_price_month'].fillna(0).astype(np.float16)

In [37]:

matrix.head()

Out[37]:

	shop_id	item_id	item_cnt_month	item_price_month
0	2	19	0.0	0.0
1	2	27	1.0	2500.0
2	2	28	0.0	0.0
3	2	29	0.0	0.0
4	2	32	0.0	0.0

Lets concat the test dataframe to our matrix dataframe. November 2015 is indicated by date block num 34

In [38]:

testDF["date_block_num"] = 34
testDF["date_block_num"] = testDF["date_block_num"].astype(np.int8)
testDF["shop_id"] = testDF.shop_id.astype(np.int8)
testDF["item_id"] = testDF.item_id.astype(np.int16)

In [39]:

matrix=pd.concat([matrix,testDF.drop('ID',axis=1)],ignore_index=True,sort=False,keys=['date_block_num', 'shop_id', 'item_id'])
matrix.fillna(0,inplace=True)

matrix=pd.merge(matrix,itemsNewDF[['item_id','item_category_id','main_category','sub_category']],on='item_id',how='left')
matrix=pd.merge(matrix,shopsDF[['shop_id','city','category']],on='shop_id',how='left')

In [40]:

matrix.head()

Out[40]:

	shop_id	item_id	item_cnt_month	item_price_month	item_category_id	sub_category	city
0	2	19	0.0	0.0	40	0	1
1	2	27	1.0	2500.0	19	1	1
2	2	28	0.0	0.0	30	0	1
3	2	29	0.0	0.0	23	1	1
4	2	32	0.0	0.0	40	0	1

In [ ]:

import gc
del [salesDF,itemsDF,itemsCategoriesDF,itemsNewDF,shopsDF]
gc.collect()

Now we will create the lag columns

item_cnt_month_lag_1 indicates the no of items sold in that particular shop in the previous month and similary

item_cnt_month_lag_12 indicates the no of items sold in that particular shop 12 months before

In [41]:

df=matrix
for col in ['item_cnt_month']:
    tmp = df[["date_block_num", "shop_id","item_id",col ]]
    for i in [1,2,3,12]:
        shifted = tmp.copy()
        shifted.columns = ["date_block_num", "shop_id", "item_id", col + "_lag_"+str(i)]
        shifted.date_block_num = shifted.date_block_num + i
        df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')

matrix=df

In [42]:

group=matrix.groupby(['date_block_num'],as_index=False).agg({'item_cnt_month':'mean'}).rename(columns={'item_cnt_month':'avg_item_cnt_month'})
matrix = pd.merge(matrix, group, on = ["date_block_num"], how = "left")
matrix['avg_item_cnt_month']=matrix['avg_item_cnt_month'].astype(np.float16)

df=matrix[['date_block_num','shop_id','item_id','avg_item_cnt_month']]
shifted=df.copy()
shifted.columns=['date_block_num','shop_id','item_id','prev_month_avg_item_cnt']
shifted['date_block_num']=shifted['date_block_num']+1

matrix=pd.merge(matrix,shifted,on=['date_block_num','shop_id','item_id'],how='left')
matrix.drop('avg_item_cnt_month',axis=1,inplace=True)

In [43]:

group=matrix.groupby(['date_block_num','shop_id'],as_index=False).agg({'item_cnt_month':'mean'}).rename(columns={'item_cnt_month':'shop_avg_cnt_month'})
group['shop_avg_cnt_month']=group['shop_avg_cnt_month'].astype(np.float16)
matrix=pd.merge(matrix,group,on=['date_block_num','shop_id'],how='left')

df=matrix

tmp=df[['date_block_num', 'shop_id','item_id','shop_avg_cnt_month']]

for i in [1,2,3,12]:
    shifted=tmp.copy()
    shifted.columns=['date_block_num','shop_id','item_id','shop_avg_cnt_month_'+'_lag_'+str(i)]
    shifted['date_block_num']=shifted['date_block_num']+i
    df=pd.merge(df,shifted,on=['date_block_num','shop_id','item_id'],how='left')
        
matrix=df
matrix.drop('shop_avg_cnt_month',axis=1,inplace=True)

In [44]:

group=matrix.groupby(['date_block_num','item_id'],as_index=False).agg({'item_cnt_month':'mean'}).rename(columns={'item_cnt_month':'item_avg_cnt_month'})
group['item_avg_cnt_month']=group['item_avg_cnt_month'].astype(np.float16)
matrix=pd.merge(matrix,group,on=['date_block_num','item_id'],how='left')

df=matrix

tmp=df[['date_block_num', 'shop_id','item_id','item_avg_cnt_month']]
for i in [1,2,3,12]:
    shifted=tmp.copy()
    shifted.columns=['date_block_num','shop_id','item_id','item_avg_cnt_month_'+'_lag_'+str(i)]
    shifted['date_block_num']=shifted['date_block_num']+i
    df=pd.merge(df,shifted,on=['date_block_num','shop_id','item_id'],how='left')
        
matrix=df
matrix.drop('item_avg_cnt_month',axis=1,inplace=True)

In [45]:

import gc
del [group,tmp,shifted,df,dup]
gc.collect()

Out[45]:

In [46]:

group=matrix.groupby(['date_block_num','main_category'],as_index=False).agg({'item_cnt_month':'mean'}).rename(columns={'item_cnt_month':'item_category_avg_cnt_month'})
group['item_category_avg_cnt_month']=group['item_category_avg_cnt_month'].astype(np.float16)
matrix=pd.merge(matrix,group,on=['date_block_num','main_category'],how='left')

df=matrix
for col in ['item_category_avg_cnt_month']:
    tmp=df[['date_block_num','shop_id','item_id',col]]
    for i in [1,2,12]:
        shifted=tmp.copy()
        shifted.columns=['date_block_num','shop_id','item_id',col+'_lag_'+str(i)]
        shifted['date_block_num']=shifted['date_block_num']+i
        df=pd.merge(df,shifted,on=['date_block_num','shop_id','item_id'],how='left')
        
matrix=df
matrix.drop('item_category_avg_cnt_month',axis=1,inplace=True)

In [47]:

group=matrix.groupby(['date_block_num','item_id','main_category','sub_category'],as_index=False).agg({'item_cnt_month':'mean'}).rename(columns={'item_cnt_month':'item_categories_avg_cnt_month'})
group['item_categories_avg_cnt_month']=group['item_categories_avg_cnt_month'].astype(np.float16)
matrix=pd.merge(matrix,group,on=['date_block_num','item_id','main_category','sub_category'],how='left')

df=matrix
for col in ['item_categories_avg_cnt_month']:
    tmp=df[['date_block_num','shop_id','item_id',col]]
    for i in [1,2,3,12]:
        shifted=tmp.copy()
        shifted.columns=['date_block_num','shop_id','item_id',col+'_lag_'+str(i)]
        shifted['date_block_num']=shifted['date_block_num']+i
        df=pd.merge(df,shifted,on=['date_block_num','shop_id','item_id'],how='left')
        
matrix=df
matrix.drop('item_categories_avg_cnt_month',axis=1,inplace=True)

In [48]:

del [group,tmp,shifted,df]
gc.collect()

Out[48]:

Since mostly the lag count for the first three months would be null, lets remove the information pertaining to those months from the dataframe.

Preparing the data for modelling¶

In [49]:

matrix=matrix[matrix['date_block_num']>3]

In [50]:

##Filling nan's with zero
for col in matrix.columns:
    if ('lag' in col) & (matrix[col].isnull().any()):
        matrix[col].fillna(0,inplace=True)
        
matrix['prev_month_avg_item_cnt'].fillna(0,inplace=True)

After building the model and finding feature importance this 'item_price_month' was the most important feature. But this was causing overfitting problems and hence this column has been removed.

In [51]:

matrix.drop('item_price_month',axis=1,inplace=True)

In [52]:

x_train = matrix[matrix['date_block_num'] < 33].drop(['item_cnt_month'], axis=1)
y_train = matrix[matrix['date_block_num'] < 33]['item_cnt_month']
x_cv = matrix[(matrix['date_block_num'] == 33)].drop(['item_cnt_month'], axis=1)
y_cv = matrix[(matrix['date_block_num'] == 33)]['item_cnt_month']
x_test = matrix[matrix['date_block_num'] == 34].drop(['item_cnt_month'], axis=1)

Since we have a large dataframe lets randomly select 30,000 samples from each month for building our baseline model.

In [53]:

df2=matrix
df2=df2.drop(df2.index[0:],axis=0) ## We are duplicating the matrix and removing the rows and appending samples for each month
for i in x_train['date_block_num'].unique():
    sampleDF=matrix[matrix['date_block_num']==i].sample(n=30000,replace=False)
    df2=df2.append(sampleDF,ignore_index=True)
    
df2=df2.sample(frac=1)
df2.reset_index(inplace=True)
df2.drop('index',axis=1,inplace=True)
x_train_baseline=df2.drop('item_cnt_month',axis=1)
y_train_baseline=df2['item_cnt_month']

LGBM¶

Initially we fit a simple model to our resampled data and make predictions.

Then we calculate the feature importance and consider the top 20 features for making predictions in our tuned model.

In [54]:

lgbmBaseline=lgb.LGBMRegressor(objective='regression',
                              boosting_type='gbdt',
                              n_estimators=500)
lgbmBaseline.fit(x_train_baseline,y_train_baseline)
lgbmBaselinePred=lgbmBaseline.predict(x_cv)
print(mean_squared_error(lgbmBaselinePred,y_cv))
#predLgbmTestBaseline=lgbmBaseline.predict(x_test)

0.8651736983635989

In [55]:

featureImportanceLgbmDF=pd.DataFrame(sorted(zip(lgbmBaseline.feature_importances_,x_train_baseline.columns)),columns=['values','features'])
featuresLgbm=list(featureImportanceLgbmDF[featureImportanceLgbmDF['values']>featureImportanceLgbmDF['values'].median()]['features'].values) ##selecting features above mean lgbm feature score
sns.barplot(y='features',x='values',data=featureImportanceLgbmDF[-20:])
#featuresLgbm=featureImportanceLgbmDF['features'][-15:].values

Out[55]:

<AxesSubplot:xlabel='values', ylabel='features'>

In [56]:

x_train_lgbm=x_train[featuresLgbm]
x_cv_lgbm=x_cv[featuresLgbm]
x_test_lgbm=x_test[featuresLgbm]

In [57]:

#x_train_lgbm,x_test_lgbm,y_train_lgbm,y_test_lgbm=train_test_split(x_train,y_train,test_size=0.3)
def optimize(trial):
    params={
        'objective':'regression',
        'num_leaves':trial.suggest_int('num_leaves',2,256),
        'feature_fraction':trial.suggest_uniform('feature_fraction',0.4,1.0),
        'boosting_type':'gbdt',
        'reg_alpha':trial.suggest_uniform('alpha',1,10),
        'reg_lambda':trial.suggest_uniform('lambda',1,10),
        'learning_rate':trial.suggest_uniform('learning_rate',0.001,1),
        'subsample':trial.suggest_uniform('subsample',0.5,1),
        'n_estimators':trial.suggest_int('n_estimators',200,1000),
        'min_data_in_leaf':trial.suggest_int('min_data_in_leaf',20,200)
    }
    model=lgb.LGBMRegressor(**params)
    model.fit(x_train_lgbm,y_train,eval_set=[(x_cv_lgbm,y_cv)],early_stopping_rounds=30)
    pred=model.predict(x_cv_lgbm)
    rmse=mean_squared_error(pred,y_cv)
    return rmse
study=optuna.create_study(direction='minimize')
study.optimize(optimize,n_trials=10)
print('Best Trail: ',study.best_trial.params)

[I 2022-06-02 08:52:54,108] A new study created in memory with name: no-name-fe0fc650-9b10-4d3a-b895-b77d227d68d5

[LightGBM] [Warning] feature_fraction is set=0.9102203178665861, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.9102203178665861
[LightGBM] [Warning] min_data_in_leaf is set=142, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=142
[1]	valid_0's l2: 0.972403
Training until validation scores don't improve for 30 rounds
[2]	valid_0's l2: 0.970208
[3]	valid_0's l2: 0.971736
[4]	valid_0's l2: 0.965238
[5]	valid_0's l2: 0.963101
[6]	valid_0's l2: 0.971144
[7]	valid_0's l2: 0.969602
[8]	valid_0's l2: 0.970277
[9]	valid_0's l2: 0.968177
[10]	valid_0's l2: 0.973934
[11]	valid_0's l2: 0.972653
[12]	valid_0's l2: 0.994125
[13]	valid_0's l2: 1.00178
[14]	valid_0's l2: 1.00204
[15]	valid_0's l2: 0.999461
[16]	valid_0's l2: 0.996569
[17]	valid_0's l2: 0.996582
[18]	valid_0's l2: 0.99582
[19]	valid_0's l2: 1.00349
[20]	valid_0's l2: 1.00228
[21]	valid_0's l2: 1.00329
[22]	valid_0's l2: 1.02857
[23]	valid_0's l2: 1.02858
[24]	valid_0's l2: 1.03122
[25]	valid_0's l2: 1.02977
[26]	valid_0's l2: 1.02616
[27]	valid_0's l2: 1.02711
[28]	valid_0's l2: 1.02597
[29]	valid_0's l2: 1.0799
[30]	valid_0's l2: 1.07955
[31]	valid_0's l2: 1.0807
[32]	valid_0's l2: 1.08078
[33]	valid_0's l2: 1.08005
[34]	valid_0's l2: 1.07989

[I 2022-06-02 08:53:11,230] Trial 0 finished with value: 0.9631008540896839 and parameters: {'num_leaves': 224, 'feature_fraction': 0.9102203178665861, 'alpha': 3.996880924517897, 'lambda': 5.846674031958002, 'learning_rate': 0.866560353016595, 'subsample': 0.679832750946725, 'n_estimators': 562, 'min_data_in_leaf': 142}. Best is trial 0 with value: 0.9631008540896839.

[35]	valid_0's l2: 1.07982
Early stopping, best iteration is:
[5]	valid_0's l2: 0.963101
[LightGBM] [Warning] feature_fraction is set=0.7454275976957052, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.7454275976957052
[LightGBM] [Warning] min_data_in_leaf is set=190, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=190
[1]	valid_0's l2: 1.20191
Training until validation scores don't improve for 30 rounds
[2]	valid_0's l2: 1.13364
[3]	valid_0's l2: 1.07946
[4]	valid_0's l2: 1.0453
[5]	valid_0's l2: 1.01273
[6]	valid_0's l2: 0.996017
[7]	valid_0's l2: 0.974551
[8]	valid_0's l2: 0.954498
[9]	valid_0's l2: 0.938911
[10]	valid_0's l2: 0.925632
[11]	valid_0's l2: 0.916351
[12]	valid_0's l2: 0.908513
[13]	valid_0's l2: 0.902315
[14]	valid_0's l2: 0.897798
[15]	valid_0's l2: 0.894903
[16]	valid_0's l2: 0.891604
[17]	valid_0's l2: 0.88681
[18]	valid_0's l2: 0.887867
[19]	valid_0's l2: 0.885622
[20]	valid_0's l2: 0.883912
[21]	valid_0's l2: 0.883232
[22]	valid_0's l2: 0.886163
[23]	valid_0's l2: 0.884985
[24]	valid_0's l2: 0.882824
[25]	valid_0's l2: 0.882499
[26]	valid_0's l2: 0.881551
[27]	valid_0's l2: 0.880901
[28]	valid_0's l2: 0.880238
[29]	valid_0's l2: 0.879559
[30]	valid_0's l2: 0.878465
[31]	valid_0's l2: 0.877818
[32]	valid_0's l2: 0.877153
[33]	valid_0's l2: 0.877071
[34]	valid_0's l2: 0.878122
[35]	valid_0's l2: 0.877775
[36]	valid_0's l2: 0.877279
[37]	valid_0's l2: 0.877827
[38]	valid_0's l2: 0.877896
[39]	valid_0's l2: 0.877795
[40]	valid_0's l2: 0.87779
[41]	valid_0's l2: 0.877617
[42]	valid_0's l2: 0.877178
[43]	valid_0's l2: 0.876903
[44]	valid_0's l2: 0.877643
[45]	valid_0's l2: 0.877991
[46]	valid_0's l2: 0.877873
[47]	valid_0's l2: 0.877622
[48]	valid_0's l2: 0.876938
[49]	valid_0's l2: 0.877701
[50]	valid_0's l2: 0.877584
[51]	valid_0's l2: 0.877389
[52]	valid_0's l2: 0.876904
[53]	valid_0's l2: 0.876699
[54]	valid_0's l2: 0.876598
[55]	valid_0's l2: 0.876105
[56]	valid_0's l2: 0.875717
[57]	valid_0's l2: 0.875908
[58]	valid_0's l2: 0.875898
[59]	valid_0's l2: 0.875681
[60]	valid_0's l2: 0.876015
[61]	valid_0's l2: 0.875548
[62]	valid_0's l2: 0.875346
[63]	valid_0's l2: 0.87506
[64]	valid_0's l2: 0.880141
[65]	valid_0's l2: 0.879919
[66]	valid_0's l2: 0.881039
[67]	valid_0's l2: 0.88091
[68]	valid_0's l2: 0.881529
[69]	valid_0's l2: 0.881162
[70]	valid_0's l2: 0.88078
[71]	valid_0's l2: 0.880762
[72]	valid_0's l2: 0.881328
[73]	valid_0's l2: 0.878195
[74]	valid_0's l2: 0.880276
[75]	valid_0's l2: 0.880079
[76]	valid_0's l2: 0.87866
[77]	valid_0's l2: 0.878194
[78]	valid_0's l2: 0.877595
[79]	valid_0's l2: 0.879907
[80]	valid_0's l2: 0.880093
[81]	valid_0's l2: 0.879935
[82]	valid_0's l2: 0.879396
[83]	valid_0's l2: 0.87951
[84]	valid_0's l2: 0.879322
[85]	valid_0's l2: 0.879314
[86]	valid_0's l2: 0.879631
[87]	valid_0's l2: 0.87958
[88]	valid_0's l2: 0.879713
[89]	valid_0's l2: 0.879575
[90]	valid_0's l2: 0.879606
[91]	valid_0's l2: 0.879552
[92]	valid_0's l2: 0.883556
[93]	valid_0's l2: 0.883389
Early stopping, best iteration is:
[63]	valid_0's l2: 0.87506

[I 2022-06-02 08:53:51,215] Trial 1 finished with value: 0.8750602362861951 and parameters: {'num_leaves': 130, 'feature_fraction': 0.7454275976957052, 'alpha': 6.359645121324666, 'lambda': 9.156457861179456, 'learning_rate': 0.10951433640280277, 'subsample': 0.7040815219507084, 'n_estimators': 717, 'min_data_in_leaf': 190}. Best is trial 1 with value: 0.8750602362861951.

[LightGBM] [Warning] feature_fraction is set=0.5292216805806753, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.5292216805806753
[LightGBM] [Warning] min_data_in_leaf is set=33, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=33
[1]	valid_0's l2: 0.994705
Training until validation scores don't improve for 30 rounds
[2]	valid_0's l2: 1.03327
[3]	valid_0's l2: 1.04375
[4]	valid_0's l2: 1.02898
[5]	valid_0's l2: 1.02363
[6]	valid_0's l2: 1.02119
[7]	valid_0's l2: 1.01709
[8]	valid_0's l2: 1.0173
[9]	valid_0's l2: 1.0124
[10]	valid_0's l2: 1.03684
[11]	valid_0's l2: 1.03823
[12]	valid_0's l2: 1.04329
[13]	valid_0's l2: 1.04429
[14]	valid_0's l2: 1.04665
[15]	valid_0's l2: 1.05028
[16]	valid_0's l2: 1.04802
[17]	valid_0's l2: 1.04818
[18]	valid_0's l2: 1.04513
[19]	valid_0's l2: 1.04551
[20]	valid_0's l2: 1.047
[21]	valid_0's l2: 1.04714
[22]	valid_0's l2: 1.04755
[23]	valid_0's l2: 1.04735
[24]	valid_0's l2: 1.03443
[25]	valid_0's l2: 1.0344
[26]	valid_0's l2: 1.03599
[27]	valid_0's l2: 1.03583
[28]	valid_0's l2: 1.03595
[29]	valid_0's l2: 1.02366
[30]	valid_0's l2: 1.02204

[I 2022-06-02 08:54:09,751] Trial 2 finished with value: 0.9947045280640638 and parameters: {'num_leaves': 249, 'feature_fraction': 0.5292216805806753, 'alpha': 1.9336963912339238, 'lambda': 5.25865683651941, 'learning_rate': 0.9215607548187419, 'subsample': 0.9408304893605361, 'n_estimators': 723, 'min_data_in_leaf': 33}. Best is trial 1 with value: 0.8750602362861951.

[31]	valid_0's l2: 1.02144
Early stopping, best iteration is:
[1]	valid_0's l2: 0.994705
[LightGBM] [Warning] feature_fraction is set=0.5968579579775428, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.5968579579775428
[LightGBM] [Warning] min_data_in_leaf is set=98, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=98
[1]	valid_0's l2: 0.977172
Training until validation scores don't improve for 30 rounds
[2]	valid_0's l2: 0.968616
[3]	valid_0's l2: 0.968485
[4]	valid_0's l2: 0.962483
[5]	valid_0's l2: 0.960139
[6]	valid_0's l2: 0.962693
[7]	valid_0's l2: 0.96082
[8]	valid_0's l2: 0.974497
[9]	valid_0's l2: 0.973253
[10]	valid_0's l2: 0.972131
[11]	valid_0's l2: 0.968583
[12]	valid_0's l2: 0.974582
[13]	valid_0's l2: 0.973634
[14]	valid_0's l2: 0.971765
[15]	valid_0's l2: 0.975925
[16]	valid_0's l2: 0.978866
[17]	valid_0's l2: 0.984165
[18]	valid_0's l2: 0.986456
[19]	valid_0's l2: 0.972289
[20]	valid_0's l2: 0.964962
[21]	valid_0's l2: 0.964349
[22]	valid_0's l2: 0.969214
[23]	valid_0's l2: 0.968586
[24]	valid_0's l2: 0.96902
[25]	valid_0's l2: 0.967084
[26]	valid_0's l2: 0.966492
[27]	valid_0's l2: 0.968832
[28]	valid_0's l2: 0.953124
[29]	valid_0's l2: 0.953323
[30]	valid_0's l2: 0.95341
[31]	valid_0's l2: 0.947037
[32]	valid_0's l2: 0.946965
[33]	valid_0's l2: 0.9487
[34]	valid_0's l2: 0.955344
[35]	valid_0's l2: 0.953638
[36]	valid_0's l2: 0.954892
[37]	valid_0's l2: 0.953625
[38]	valid_0's l2: 0.956807
[39]	valid_0's l2: 0.955462
[40]	valid_0's l2: 0.956294
[41]	valid_0's l2: 0.956595
[42]	valid_0's l2: 0.957393
[43]	valid_0's l2: 0.956647
[44]	valid_0's l2: 0.956619
[45]	valid_0's l2: 0.955511
[46]	valid_0's l2: 0.95917
[47]	valid_0's l2: 0.959216
[48]	valid_0's l2: 0.959095
[49]	valid_0's l2: 0.959876
[50]	valid_0's l2: 0.960512
[51]	valid_0's l2: 0.960421
[52]	valid_0's l2: 0.959837
[53]	valid_0's l2: 0.959778
[54]	valid_0's l2: 0.959788
[55]	valid_0's l2: 0.95774
[56]	valid_0's l2: 0.95787
[57]	valid_0's l2: 0.957937
[58]	valid_0's l2: 0.956977
[59]	valid_0's l2: 0.957054
[60]	valid_0's l2: 0.957617
[61]	valid_0's l2: 0.95911

[I 2022-06-02 08:54:33,228] Trial 3 finished with value: 0.9469650557340977 and parameters: {'num_leaves': 120, 'feature_fraction': 0.5968579579775428, 'alpha': 3.713861555731498, 'lambda': 5.493641685961967, 'learning_rate': 0.7758498892437286, 'subsample': 0.7498461572632822, 'n_estimators': 209, 'min_data_in_leaf': 98}. Best is trial 1 with value: 0.8750602362861951.

[62]	valid_0's l2: 0.959375
Early stopping, best iteration is:
[32]	valid_0's l2: 0.946965
[LightGBM] [Warning] feature_fraction is set=0.9540490660433252, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.9540490660433252
[LightGBM] [Warning] min_data_in_leaf is set=107, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=107
[1]	valid_0's l2: 0.956844
Training until validation scores don't improve for 30 rounds
[2]	valid_0's l2: 0.908076
[3]	valid_0's l2: 0.905727
[4]	valid_0's l2: 0.899157
[5]	valid_0's l2: 0.891824
[6]	valid_0's l2: 0.891818
[7]	valid_0's l2: 0.918196
[8]	valid_0's l2: 0.917572
[9]	valid_0's l2: 0.915554
[10]	valid_0's l2: 0.916916
[11]	valid_0's l2: 0.912245
[12]	valid_0's l2: 0.912917
[13]	valid_0's l2: 0.911858
[14]	valid_0's l2: 0.917131
[15]	valid_0's l2: 1.02226
[16]	valid_0's l2: 1.02176
[17]	valid_0's l2: 1.01579
[18]	valid_0's l2: 1.01453
[19]	valid_0's l2: 1.01504
[20]	valid_0's l2: 1.01473
[21]	valid_0's l2: 1.01396
[22]	valid_0's l2: 1.01312
[23]	valid_0's l2: 1.00959
[24]	valid_0's l2: 1.01473
[25]	valid_0's l2: 1.02351
[26]	valid_0's l2: 1.02351
[27]	valid_0's l2: 1.02221
[28]	valid_0's l2: 1.02213
[29]	valid_0's l2: 1.0179
[30]	valid_0's l2: 1.01748
[31]	valid_0's l2: 1.01722
[32]	valid_0's l2: 1.01752
[33]	valid_0's l2: 1.01791
[34]	valid_0's l2: 1.01805
[35]	valid_0's l2: 1.01752

[I 2022-06-02 08:54:48,589] Trial 4 finished with value: 0.8918175226515949 and parameters: {'num_leaves': 108, 'feature_fraction': 0.9540490660433252, 'alpha': 3.5610786666805136, 'lambda': 5.8145704357205386, 'learning_rate': 0.6010592409499158, 'subsample': 0.5405191216830405, 'n_estimators': 427, 'min_data_in_leaf': 107}. Best is trial 1 with value: 0.8750602362861951.

[36]	valid_0's l2: 1.01818
Early stopping, best iteration is:
[6]	valid_0's l2: 0.891818
[LightGBM] [Warning] feature_fraction is set=0.5428808129882927, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.5428808129882927
[LightGBM] [Warning] min_data_in_leaf is set=96, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=96
[1]	valid_0's l2: 0.997663
Training until validation scores don't improve for 30 rounds
[2]	valid_0's l2: 0.951455
[3]	valid_0's l2: 0.950096
[4]	valid_0's l2: 0.936599
[5]	valid_0's l2: 0.933746
[6]	valid_0's l2: 0.926375
[7]	valid_0's l2: 0.927686
[8]	valid_0's l2: 0.929771
[9]	valid_0's l2: 0.933441
[10]	valid_0's l2: 0.935014
[11]	valid_0's l2: 0.929679
[12]	valid_0's l2: 0.930278
[13]	valid_0's l2: 0.930305
[14]	valid_0's l2: 0.955169
[15]	valid_0's l2: 0.956218
[16]	valid_0's l2: 0.956158
[17]	valid_0's l2: 0.960795
[18]	valid_0's l2: 0.9591
[19]	valid_0's l2: 0.958362
[20]	valid_0's l2: 0.955822
[21]	valid_0's l2: 0.954689
[22]	valid_0's l2: 0.959224
[23]	valid_0's l2: 0.955752
[24]	valid_0's l2: 0.949572
[25]	valid_0's l2: 0.948977
[26]	valid_0's l2: 0.947841
[27]	valid_0's l2: 0.950124
[28]	valid_0's l2: 0.952002
[29]	valid_0's l2: 0.952266
[30]	valid_0's l2: 0.953804
[31]	valid_0's l2: 0.952892
[32]	valid_0's l2: 0.955718
[33]	valid_0's l2: 0.958202
[34]	valid_0's l2: 0.95917
[35]	valid_0's l2: 0.958653

[I 2022-06-02 08:55:05,130] Trial 5 finished with value: 0.9263750057958984 and parameters: {'num_leaves': 113, 'feature_fraction': 0.5428808129882927, 'alpha': 8.890006558560103, 'lambda': 5.880046000468369, 'learning_rate': 0.6146925872258462, 'subsample': 0.5878835922740436, 'n_estimators': 365, 'min_data_in_leaf': 96}. Best is trial 1 with value: 0.8750602362861951.

[36]	valid_0's l2: 0.960647
Early stopping, best iteration is:
[6]	valid_0's l2: 0.926375
[LightGBM] [Warning] feature_fraction is set=0.5507256392112124, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.5507256392112124
[LightGBM] [Warning] min_data_in_leaf is set=36, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=36
[1]	valid_0's l2: 1.11862
Training until validation scores don't improve for 30 rounds
[2]	valid_0's l2: 1.03633
[3]	valid_0's l2: 0.98926
[4]	valid_0's l2: 0.948704
[5]	valid_0's l2: 0.913819
[6]	valid_0's l2: 0.907927
[7]	valid_0's l2: 0.897297
[8]	valid_0's l2: 0.90089
[9]	valid_0's l2: 0.896666
[10]	valid_0's l2: 0.896928
[11]	valid_0's l2: 0.894733
[12]	valid_0's l2: 0.894425
[13]	valid_0's l2: 0.895255
[14]	valid_0's l2: 0.896618
[15]	valid_0's l2: 0.900647
[16]	valid_0's l2: 0.900429
[17]	valid_0's l2: 0.906069
[18]	valid_0's l2: 0.913742
[19]	valid_0's l2: 0.913784
[20]	valid_0's l2: 0.911953
[21]	valid_0's l2: 0.910463
[22]	valid_0's l2: 0.910561
[23]	valid_0's l2: 0.907841
[24]	valid_0's l2: 0.899767
[25]	valid_0's l2: 0.898715
[26]	valid_0's l2: 0.898331
[27]	valid_0's l2: 0.898069
[28]	valid_0's l2: 0.88843
[29]	valid_0's l2: 0.886885
[30]	valid_0's l2: 0.889828
[31]	valid_0's l2: 0.889422
[32]	valid_0's l2: 0.890094
[33]	valid_0's l2: 0.889215
[34]	valid_0's l2: 0.88932
[35]	valid_0's l2: 0.889541
[36]	valid_0's l2: 0.889347
[37]	valid_0's l2: 0.892003
[38]	valid_0's l2: 0.892299
[39]	valid_0's l2: 0.892395
[40]	valid_0's l2: 0.892559
[41]	valid_0's l2: 0.892581
[42]	valid_0's l2: 0.892435
[43]	valid_0's l2: 0.892227
[44]	valid_0's l2: 0.891296
[45]	valid_0's l2: 0.891623
[46]	valid_0's l2: 0.892049
[47]	valid_0's l2: 0.8848
[48]	valid_0's l2: 0.884241
[49]	valid_0's l2: 0.885107
[50]	valid_0's l2: 0.885575
[51]	valid_0's l2: 0.885916
[52]	valid_0's l2: 0.884838
[53]	valid_0's l2: 0.883696
[54]	valid_0's l2: 0.883686
[55]	valid_0's l2: 0.8842
[56]	valid_0's l2: 0.884869
[57]	valid_0's l2: 0.89721
[58]	valid_0's l2: 0.897202
[59]	valid_0's l2: 0.897345
[60]	valid_0's l2: 0.898385
[61]	valid_0's l2: 0.89819
[62]	valid_0's l2: 0.896712
[63]	valid_0's l2: 0.897214
[64]	valid_0's l2: 0.897251
[65]	valid_0's l2: 0.897605
[66]	valid_0's l2: 0.897492
[67]	valid_0's l2: 0.896538
[68]	valid_0's l2: 0.895725
[69]	valid_0's l2: 0.895803
[70]	valid_0's l2: 0.895801
[71]	valid_0's l2: 0.895773
[72]	valid_0's l2: 0.89558
[73]	valid_0's l2: 0.899212
[74]	valid_0's l2: 0.899088
[75]	valid_0's l2: 0.898686
[76]	valid_0's l2: 0.899097
[77]	valid_0's l2: 0.899118
[78]	valid_0's l2: 0.899217
[79]	valid_0's l2: 0.899793
[80]	valid_0's l2: 0.899737
[81]	valid_0's l2: 0.899725
[82]	valid_0's l2: 0.89669
[83]	valid_0's l2: 0.896477
[84]	valid_0's l2: 0.89364
Early stopping, best iteration is:
[54]	valid_0's l2: 0.883686

[I 2022-06-02 08:55:37,869] Trial 6 finished with value: 0.8836858063014148 and parameters: {'num_leaves': 163, 'feature_fraction': 0.5507256392112124, 'alpha': 8.644303544761698, 'lambda': 6.960281615141806, 'learning_rate': 0.2708795447705174, 'subsample': 0.7183056898540603, 'n_estimators': 582, 'min_data_in_leaf': 36}. Best is trial 1 with value: 0.8750602362861951.

[LightGBM] [Warning] feature_fraction is set=0.7448570099238562, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.7448570099238562
[LightGBM] [Warning] min_data_in_leaf is set=45, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=45
[1]	valid_0's l2: 1.2433
Training until validation scores don't improve for 30 rounds
[2]	valid_0's l2: 1.20132
[3]	valid_0's l2: 1.16491
[4]	valid_0's l2: 1.13849
[5]	valid_0's l2: 1.1118
[6]	valid_0's l2: 1.09744
[7]	valid_0's l2: 1.07575
[8]	valid_0's l2: 1.05217
[9]	valid_0's l2: 1.03338
[10]	valid_0's l2: 1.01529
[11]	valid_0's l2: 0.998577
[12]	valid_0's l2: 0.985407
[13]	valid_0's l2: 0.975082
[14]	valid_0's l2: 0.965678
[15]	valid_0's l2: 0.954899
[16]	valid_0's l2: 0.947618
[17]	valid_0's l2: 0.940121
[18]	valid_0's l2: 0.931668
[19]	valid_0's l2: 0.925681
[20]	valid_0's l2: 0.919718
[21]	valid_0's l2: 0.916203
[22]	valid_0's l2: 0.912432
[23]	valid_0's l2: 0.910084
[24]	valid_0's l2: 0.907362
[25]	valid_0's l2: 0.904067
[26]	valid_0's l2: 0.900833
[27]	valid_0's l2: 0.898751
[28]	valid_0's l2: 0.897557
[29]	valid_0's l2: 0.896674
[30]	valid_0's l2: 0.895816
[31]	valid_0's l2: 0.894482
[32]	valid_0's l2: 0.893516
[33]	valid_0's l2: 0.894734
[34]	valid_0's l2: 0.896185
[35]	valid_0's l2: 0.894735
[36]	valid_0's l2: 0.893623
[37]	valid_0's l2: 0.892855
[38]	valid_0's l2: 0.892146
[39]	valid_0's l2: 0.891281
[40]	valid_0's l2: 0.890494
[41]	valid_0's l2: 0.891792
[42]	valid_0's l2: 0.892643
[43]	valid_0's l2: 0.891323
[44]	valid_0's l2: 0.89074
[45]	valid_0's l2: 0.889883
[46]	valid_0's l2: 0.889401
[47]	valid_0's l2: 0.889085
[48]	valid_0's l2: 0.888296
[49]	valid_0's l2: 0.887193
[50]	valid_0's l2: 0.886631
[51]	valid_0's l2: 0.886172
[52]	valid_0's l2: 0.885887
[53]	valid_0's l2: 0.885599
[54]	valid_0's l2: 0.884993
[55]	valid_0's l2: 0.88517
[56]	valid_0's l2: 0.884857
[57]	valid_0's l2: 0.884633
[58]	valid_0's l2: 0.884359
[59]	valid_0's l2: 0.885482
[60]	valid_0's l2: 0.886019
[61]	valid_0's l2: 0.885781
[62]	valid_0's l2: 0.885649
[63]	valid_0's l2: 0.888665
[64]	valid_0's l2: 0.889175
[65]	valid_0's l2: 0.888784
[66]	valid_0's l2: 0.888209
[67]	valid_0's l2: 0.888202
[68]	valid_0's l2: 0.887754
[69]	valid_0's l2: 0.889866
[70]	valid_0's l2: 0.89027
[71]	valid_0's l2: 0.890769
[72]	valid_0's l2: 0.890608
[73]	valid_0's l2: 0.890579
[74]	valid_0's l2: 0.890808
[75]	valid_0's l2: 0.890755
[76]	valid_0's l2: 0.891072
[77]	valid_0's l2: 0.891056
[78]	valid_0's l2: 0.891317
[79]	valid_0's l2: 0.89115
[80]	valid_0's l2: 0.891353
[81]	valid_0's l2: 0.891168
[82]	valid_0's l2: 0.890899
[83]	valid_0's l2: 0.890678
[84]	valid_0's l2: 0.89096
[85]	valid_0's l2: 0.890853
[86]	valid_0's l2: 0.890848
[87]	valid_0's l2: 0.890624
[88]	valid_0's l2: 0.890272
Early stopping, best iteration is:
[58]	valid_0's l2: 0.884359

[I 2022-06-02 08:56:15,538] Trial 7 finished with value: 0.8843588366874461 and parameters: {'num_leaves': 108, 'feature_fraction': 0.7448570099238562, 'alpha': 1.601471298791404, 'lambda': 8.737681215894316, 'learning_rate': 0.05664995661757699, 'subsample': 0.7694103785509377, 'n_estimators': 710, 'min_data_in_leaf': 45}. Best is trial 1 with value: 0.8750602362861951.

[LightGBM] [Warning] feature_fraction is set=0.4686694022827363, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.4686694022827363
[LightGBM] [Warning] min_data_in_leaf is set=68, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=68
[1]	valid_0's l2: 1.043
Training until validation scores don't improve for 30 rounds
[2]	valid_0's l2: 0.962792
[3]	valid_0's l2: 0.952292
[4]	valid_0's l2: 0.926211
[5]	valid_0's l2: 0.90971
[6]	valid_0's l2: 0.908147
[7]	valid_0's l2: 0.903789
[8]	valid_0's l2: 0.936951
[9]	valid_0's l2: 0.928376
[10]	valid_0's l2: 0.930711
[11]	valid_0's l2: 0.929748
[12]	valid_0's l2: 0.933541
[13]	valid_0's l2: 0.935972
[14]	valid_0's l2: 0.937134
[15]	valid_0's l2: 0.941869
[16]	valid_0's l2: 0.944118
[17]	valid_0's l2: 0.943211
[18]	valid_0's l2: 0.944274
[19]	valid_0's l2: 0.971482
[20]	valid_0's l2: 0.971616
[21]	valid_0's l2: 0.971617
[22]	valid_0's l2: 0.975361
[23]	valid_0's l2: 0.976784
[24]	valid_0's l2: 0.976603
[25]	valid_0's l2: 0.9759
[26]	valid_0's l2: 0.975514
[27]	valid_0's l2: 0.98026
[28]	valid_0's l2: 0.97556
[29]	valid_0's l2: 0.977003
[30]	valid_0's l2: 0.977489
[31]	valid_0's l2: 0.977896
[32]	valid_0's l2: 0.978525
[33]	valid_0's l2: 0.97719
[34]	valid_0's l2: 0.975269
[35]	valid_0's l2: 0.975342
[36]	valid_0's l2: 0.972563

[I 2022-06-02 08:56:33,561] Trial 8 finished with value: 0.9037889391854311 and parameters: {'num_leaves': 171, 'feature_fraction': 0.4686694022827363, 'alpha': 4.195235883477049, 'lambda': 9.83212388030402, 'learning_rate': 0.44667951908786035, 'subsample': 0.8803492246911766, 'n_estimators': 248, 'min_data_in_leaf': 68}. Best is trial 1 with value: 0.8750602362861951.

[37]	valid_0's l2: 0.970698
Early stopping, best iteration is:
[7]	valid_0's l2: 0.903789
[LightGBM] [Warning] feature_fraction is set=0.8090043739141928, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8090043739141928
[LightGBM] [Warning] min_data_in_leaf is set=194, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=194
[1]	valid_0's l2: 1.21717
Training until validation scores don't improve for 30 rounds
[2]	valid_0's l2: 1.15741
[3]	valid_0's l2: 1.10954
[4]	valid_0's l2: 1.07509
[5]	valid_0's l2: 1.04315
[6]	valid_0's l2: 1.02376
[7]	valid_0's l2: 1.0028
[8]	valid_0's l2: 0.980399
[9]	valid_0's l2: 0.963368
[10]	valid_0's l2: 0.947227
[11]	valid_0's l2: 0.93414
[12]	valid_0's l2: 0.925416
[13]	valid_0's l2: 0.917877
[14]	valid_0's l2: 0.91103
[15]	valid_0's l2: 0.905008
[16]	valid_0's l2: 0.900753
[17]	valid_0's l2: 0.895205
[18]	valid_0's l2: 0.891027
[19]	valid_0's l2: 0.88722
[20]	valid_0's l2: 0.884073
[21]	valid_0's l2: 0.882542
[22]	valid_0's l2: 0.881883
[23]	valid_0's l2: 0.88096
[24]	valid_0's l2: 0.882051
[25]	valid_0's l2: 0.881686
[26]	valid_0's l2: 0.879948
[27]	valid_0's l2: 0.878743
[28]	valid_0's l2: 0.878873
[29]	valid_0's l2: 0.877722
[30]	valid_0's l2: 0.877031
[31]	valid_0's l2: 0.873893
[32]	valid_0's l2: 0.8731
[33]	valid_0's l2: 0.874335
[34]	valid_0's l2: 0.875042
[35]	valid_0's l2: 0.874569
[36]	valid_0's l2: 0.873649
[37]	valid_0's l2: 0.87346
[38]	valid_0's l2: 0.873781
[39]	valid_0's l2: 0.873296
[40]	valid_0's l2: 0.872856
[41]	valid_0's l2: 0.872614
[42]	valid_0's l2: 0.872431
[43]	valid_0's l2: 0.872191
[44]	valid_0's l2: 0.870676
[45]	valid_0's l2: 0.86997
[46]	valid_0's l2: 0.869711
[47]	valid_0's l2: 0.869313
[48]	valid_0's l2: 0.869326
[49]	valid_0's l2: 0.868823
[50]	valid_0's l2: 0.868629
[51]	valid_0's l2: 0.869576
[52]	valid_0's l2: 0.869271
[53]	valid_0's l2: 0.86949
[54]	valid_0's l2: 0.869616
[55]	valid_0's l2: 0.869614
[56]	valid_0's l2: 0.869421
[57]	valid_0's l2: 0.87098
[58]	valid_0's l2: 0.874341
[59]	valid_0's l2: 0.874316
[60]	valid_0's l2: 0.874328
[61]	valid_0's l2: 0.87537
[62]	valid_0's l2: 0.875101
[63]	valid_0's l2: 0.875502
[64]	valid_0's l2: 0.876237
[65]	valid_0's l2: 0.875942
[66]	valid_0's l2: 0.877103
[67]	valid_0's l2: 0.877183
[68]	valid_0's l2: 0.877146
[69]	valid_0's l2: 0.877895
[70]	valid_0's l2: 0.877841
[71]	valid_0's l2: 0.878101
[72]	valid_0's l2: 0.877957
[73]	valid_0's l2: 0.877726
[74]	valid_0's l2: 0.877647
[75]	valid_0's l2: 0.877268
[76]	valid_0's l2: 0.877483
[77]	valid_0's l2: 0.877425
[78]	valid_0's l2: 0.876827
[79]	valid_0's l2: 0.876873
[80]	valid_0's l2: 0.876733
Early stopping, best iteration is:
[50]	valid_0's l2: 0.868629

[I 2022-06-02 08:57:10,286] Trial 9 finished with value: 0.8686287665692258 and parameters: {'num_leaves': 167, 'feature_fraction': 0.8090043739141928, 'alpha': 7.530721686244508, 'lambda': 4.184295547505293, 'learning_rate': 0.08630422292011555, 'subsample': 0.9145396974096048, 'n_estimators': 447, 'min_data_in_leaf': 194}. Best is trial 9 with value: 0.8686287665692258.

Best Trail:  {'num_leaves': 167, 'feature_fraction': 0.8090043739141928, 'alpha': 7.530721686244508, 'lambda': 4.184295547505293, 'learning_rate': 0.08630422292011555, 'subsample': 0.9145396974096048, 'n_estimators': 447, 'min_data_in_leaf': 194}

In [58]:

lgbrEstimator=lgb.LGBMRegressor(**study.best_trial.params)
lgbrEstimator.fit(x_train_lgbm,y_train)
predLgbmCV=lgbrEstimator.predict(x_cv_lgbm)
print(mean_squared_error(predLgbmCV,y_cv))
predLgbTest=lgbrEstimator.predict(x_test_lgbm)
predLgbTest=predLgbTest.clip(0,20)

[LightGBM] [Warning] lambda_l2 is set with lambda=4.184295547505293, reg_lambda=0.0 will be ignored. Current value: lambda_l2=4.184295547505293
[LightGBM] [Warning] feature_fraction is set=0.8090043739141928, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8090043739141928
[LightGBM] [Warning] min_data_in_leaf is set=194, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=194
0.8568278991960587

Model Explainability¶

SHAP values (SHapley Additive exPlanations) is used to explain the predictions made by your model.

SHAP shows the contribution or the importance of each feature on the prediction of the model, it does not evaluate the quality of the prediction itself.

In [59]:

import shap

explainer(data) ---> Returns data,base_values,shap_values

data - The original data

base_values - The expected value of the target or the mean target value from the data

shap_values - SHAP values for each sample

explainer.shap_values ---> Returns only the shap values

In [60]:

shap_values=shap.TreeExplainer(lgbrEstimator).shap_values(x_cv_lgbm)
shap.summary_plot(shap_values,x_cv_lgbm,plot_type='bar')

From the above plot it evident that item_avg_cnt_month_lag,item_cnt_month_lag,shop_av_cnt_month_lag features have better predicting power compared with other variables. This prediction is also meaningful because previous month's sales is a good indicator for sales in forecoming months.

Another point to be noted is sales one month before(i.e last month) is more detrimental compared with other variables

The below plot is similar to a summary bar plot but this is more interpretable.

In [61]:

shap.summary_plot(shap_values,x_cv_lgbm,feature_names=x_cv_lgbm.columns)

The first 3 features indicates that higher items sold in the previous month means more items would be sold in the coming months.

The plot indicates that lower the item id better would be the average sales. We can confirm this conclusion from pdp plots.

In [62]:

from pdpbox import info_plots
fig=info_plots.target_plot(df=matrix,feature='item_id',feature_name='item_id',target='item_cnt_month')
fig[0]

Out[62]:

Lower item id's have better average item count per month confirming our inference from shap plot.

In [63]:

info_plots.actual_plot(lgbrEstimator,x_cv_lgbm,feature='item_id',feature_name='item_id');

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

The pattern of predictions is maintained i.e compared with other item id's 2708-4838 item id's have more items sold. But the predicted count varies significantly from actual count.

In [64]:

salesA=salesDF[(salesDF['item_id']>2708)&(salesDF['item_id']<7917)]['item_cnt_day'].sum()
salesB=salesDF[(salesDF['item_id']<2708)|(salesDF['item_id']>7917)]['item_cnt_day'].sum()
salesValues=[salesA,salesB]
salesNames=['item ids 2708-7917','other item ids']
fig=px.pie(names=salesNames,values=salesValues,title='Item ids 2708-7917 account for 33% of sales')
fig.update_layout(margin=dict(l=0,r=0,b=0),showlegend=False)
fig.update_traces(textinfo='label+percent')
fig.show()

In [65]:

ax1=info_plots.target_plot(df=df2[(df2['date_block_num']>=10) & (df2['date_block_num']<19)],feature='date_block_num',feature_name='month',target='item_cnt_month')
ax2=info_plots.target_plot(df=df2[(df2['date_block_num']>=19) & (df2['date_block_num']<28)],feature='date_block_num',feature_name='month',target='item_cnt_month')
ax2=info_plots.target_plot(df=df2[df2['date_block_num']>=28],feature='date_block_num',feature_name='month',target='item_cnt_month')

XGBOOST¶

In [66]:

xgbBaseline=xgb.XGBRegressor(objective='reg:squaredlogerror',
                            eval_metric='rmse',
                            booster='gbtree',
                            n_estimators=200)
xgbBaseline.fit(x_train_baseline,y_train_baseline)
predXgbBaseline=xgbBaseline.predict(x_cv)
print(mean_squared_error(predXgbBaseline,y_cv))

0.89054406

In [67]:

featureImportanceXgbDF=pd.DataFrame(sorted(zip(xgbBaseline.feature_importances_,x_train_baseline.columns)),columns=['values','features'])
featuresXgb=featureImportanceXgbDF[featureImportanceXgbDF['values']>=featureImportanceXgbDF['values'].median()]['features']
sns.barplot(x='values',y='features',data=featureImportanceXgbDF[-15:])

Out[67]:

<AxesSubplot:xlabel='values', ylabel='features'>

In [68]:

x_train_xgb=x_train[featuresXgb]
x_cv_xgb=x_cv[featuresXgb]
x_test_xgb=x_test[featuresXgb]

In [ ]:

def optimize(trial):
    params={
        'objective':'reg:squaredlogerror',
        'eval_metric':'rmse',
        'booster':'gbtree',
        'alpha':trial.suggest_loguniform('alpha',1,20),
        'lambda':trial.suggest_loguniform('lambda',1,20),
        'max_depth':trial.suggest_int('max_depth',1,10),
        'learning_rate':trial.suggest_uniform('learning_rate',0.001,1),
        'subsample':trial.suggest_uniform('subsample',0.5,1),
        'n_estimators':trial.suggest_int('n_estimators',10,300)
    }
    model=xgb.XGBRegressor(**params)
    model.fit(x_train_xgb,y_train,eval_set=[(x_cv_xgb,y_cv)],early_stopping_rounds=30)
    pred=model.predict(x_cv_xgb)
    rmse=mean_squared_error(pred,y_cv)
    return rmse
study=optuna.create_study(direction='minimize')
study.optimize(optimize,n_trials=10)
print(study.best_trial.params)

In [70]:

xgbBestParams=study.best_trial.params
xgbBestParams.update({'objective':'reg:squaredlogerror',
           'eval_metric':'rmse',
           'booster':'gbtree'})

In [71]:

#xgbReg=xgb.XGBRegressor(**study.best_trial.params)
xgbReg=xgb.XGBRegressor(**xgbBestParams)
xgbReg.fit(x_train_xgb,y_train)
predXgbCV=xgbReg.predict(x_cv_xgb)
print(mean_squared_error(predXgbCV,y_cv))
predXgbTest=xgbReg.predict(x_test_xgb)
predXgbTest=predXgbTest.clip(0,20)

0.9194968

CATBOOST¶

In [75]:

catboostBaseline=CatBoostRegressor(eval_metric='RMSE',
                            loss_function='RMSE',
                            iterations=500)
catboostBaseline.fit(x_train_baseline,y_train_baseline.astype('float'),early_stopping_rounds=50,verbose=50)
predCatBoostBaseline=xgbBaseline.predict(x_cv)
print(mean_squared_error(predCatBoostBaseline,y_cv))

Learning rate set to 0.209585
0:	learn: 1.1236477	total: 118ms	remaining: 59.1s
50:	learn: 0.8915859	total: 3.97s	remaining: 35s
100:	learn: 0.8722246	total: 7.66s	remaining: 30.3s
150:	learn: 0.8596855	total: 11.7s	remaining: 27.1s
200:	learn: 0.8500771	total: 15.4s	remaining: 22.9s
250:	learn: 0.8416014	total: 19.1s	remaining: 19s
300:	learn: 0.8335103	total: 23s	remaining: 15.2s
350:	learn: 0.8260545	total: 26.8s	remaining: 11.4s
400:	learn: 0.8202651	total: 30.6s	remaining: 7.56s
450:	learn: 0.8137624	total: 34.4s	remaining: 3.73s
499:	learn: 0.8088961	total: 38.1s	remaining: 0us
0.89054406

In [76]:

featureImportanceCatBoostDF=pd.DataFrame(sorted(zip(catboostBaseline.feature_importances_,x_train_baseline.columns)),columns=['values','features'])
featuresCatBoost=featureImportanceCatBoostDF[featureImportanceCatBoostDF['values']>=featureImportanceCatBoostDF['values'].median()]['features']
sns.barplot(x='values',y='features',data=featureImportanceCatBoostDF[-15:])

Out[76]:

<AxesSubplot:xlabel='values', ylabel='features'>

In [77]:

x_train_cat=x_train[featuresCatBoost]
x_cv_cat=x_cv[featuresCatBoost]
x_test_cat=x_test[featuresCatBoost]

In [78]:

def optimize(trial):
    params={
        'depth': trial.suggest_int('depth', 4, 8),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',10,100),
        'eval_metric':'RMSE',
        'loss_function':'RMSE',
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg',5,20),
        'colsample_bylevel':trial.suggest_uniform('colsample_bylevel',0.5,1),
        'learning_rate':trial.suggest_uniform('learning_rate',0.0001,1),
        'subsample':trial.suggest_uniform('subsample',0.5,1),
        'iterations':trial.suggest_int('iterations',100,1000)
    }
    model=CatBoostRegressor(**params)
    model.fit(x_train_cat,y_train.astype('float'),eval_set=[(x_cv_cat,y_cv.astype('float'))],early_stopping_rounds=30,verbose=100)
    pred=model.predict(x_cv_cat)
    rmse=mean_squared_error(pred,y_cv.astype('float'))
    return rmse
study=optuna.create_study(direction='minimize')
study.optimize(optimize,n_trials=10)
print(study.best_trial.params)

[I 2022-06-02 11:46:04,402] A new study created in memory with name: no-name-f2c29064-982e-4ffd-920f-e321ecfbf787

0:	learn: 1.1029261	test: 1.0570314	best: 1.0570314 (0)	total: 891ms	remaining: 13m 49s
100:	learn: 0.8687744	test: 0.9290252	best: 0.9277017 (73)	total: 1m 28s	remaining: 12m 4s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.927701692
bestIteration = 73

Shrink model to first 74 iterations.

[I 2022-06-02 11:48:08,787] Trial 0 finished with value: 0.8606304292551291 and parameters: {'depth': 8, 'min_data_in_leaf': 19, 'l2_leaf_reg': 13, 'colsample_bylevel': 0.5378079736103758, 'learning_rate': 0.2862666355245815, 'subsample': 0.866195462513664, 'iterations': 932}. Best is trial 0 with value: 0.8606304292551291.

0:	learn: 1.0178522	test: 1.0058259	best: 1.0058259 (0)	total: 570ms	remaining: 4m 2s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.9477510699
bestIteration = 44

Shrink model to first 45 iterations.

[I 2022-06-02 11:49:19,893] Trial 1 finished with value: 0.8982320904308819 and parameters: {'depth': 4, 'min_data_in_leaf': 18, 'l2_leaf_reg': 16, 'colsample_bylevel': 0.7141512855989429, 'learning_rate': 0.7080253023373682, 'subsample': 0.5382960589664838, 'iterations': 427}. Best is trial 0 with value: 0.8606304292551291.

0:	learn: 1.0050272	test: 1.0039275	best: 1.0039275 (0)	total: 577ms	remaining: 1m 22s
100:	learn: 0.8930705	test: 0.9633663	best: 0.9424132 (70)	total: 51.7s	remaining: 22s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.9424132232
bestIteration = 70

Shrink model to first 71 iterations.

[I 2022-06-02 11:50:45,978] Trial 2 finished with value: 0.8881426832370868 and parameters: {'depth': 4, 'min_data_in_leaf': 19, 'l2_leaf_reg': 18, 'colsample_bylevel': 0.8027415898193161, 'learning_rate': 0.837994053759473, 'subsample': 0.5726931086646337, 'iterations': 144}. Best is trial 0 with value: 0.8606304292551291.

0:	learn: 1.0863506	test: 1.0466450	best: 1.0466450 (0)	total: 803ms	remaining: 3m 59s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.9378474207
bestIteration = 52

Shrink model to first 53 iterations.

[I 2022-06-02 11:52:25,028] Trial 3 finished with value: 0.8795577846037965 and parameters: {'depth': 7, 'min_data_in_leaf': 67, 'l2_leaf_reg': 7, 'colsample_bylevel': 0.5246033321273523, 'learning_rate': 0.34355209068575415, 'subsample': 0.8021534800608656, 'iterations': 299}. Best is trial 0 with value: 0.8606304292551291.

0:	learn: 1.1275177	test: 1.0747990	best: 1.0747990 (0)	total: 804ms	remaining: 8m 53s
100:	learn: 0.8921188	test: 0.9386004	best: 0.9357526 (93)	total: 1m 15s	remaining: 7m 3s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.9357526101
bestIteration = 93

Shrink model to first 94 iterations.

[I 2022-06-02 11:54:33,083] Trial 4 finished with value: 0.8756329472555682 and parameters: {'depth': 6, 'min_data_in_leaf': 74, 'l2_leaf_reg': 15, 'colsample_bylevel': 0.7912345907866245, 'learning_rate': 0.22689618198240935, 'subsample': 0.924005802770854, 'iterations': 665}. Best is trial 0 with value: 0.8606304292551291.

0:	learn: 1.0108997	test: 1.0008224	best: 1.0008224 (0)	total: 716ms	remaining: 3m 15s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.9448269553
bestIteration = 39

Shrink model to first 40 iterations.

[I 2022-06-02 11:55:48,194] Trial 5 finished with value: 0.8926979755477428 and parameters: {'depth': 5, 'min_data_in_leaf': 100, 'l2_leaf_reg': 5, 'colsample_bylevel': 0.7446539822463221, 'learning_rate': 0.6947668463100837, 'subsample': 0.6161920101568504, 'iterations': 274}. Best is trial 0 with value: 0.8606304292551291.

0:	learn: 1.0269441	test: 1.0111692	best: 1.0111692 (0)	total: 674ms	remaining: 1m 22s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.939723861
bestIteration = 47

Shrink model to first 48 iterations.

[I 2022-06-02 11:57:10,716] Trial 6 finished with value: 0.8830809348565399 and parameters: {'depth': 5, 'min_data_in_leaf': 26, 'l2_leaf_reg': 10, 'colsample_bylevel': 0.878917270514938, 'learning_rate': 0.6039769722579861, 'subsample': 0.7713409817301802, 'iterations': 123}. Best is trial 0 with value: 0.8606304292551291.

0:	learn: 0.9828832	test: 0.9857288	best: 0.9857288 (0)	total: 1.12s	remaining: 8m 14s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.934415575
bestIteration = 26

Shrink model to first 27 iterations.

[I 2022-06-02 11:58:33,509] Trial 7 finished with value: 0.8731324667712639 and parameters: {'depth': 8, 'min_data_in_leaf': 98, 'l2_leaf_reg': 9, 'colsample_bylevel': 0.8299261304989676, 'learning_rate': 0.7911262009004196, 'subsample': 0.8309398836396251, 'iterations': 444}. Best is trial 0 with value: 0.8606304292551291.

0:	learn: 1.0596660	test: 1.0300854	best: 1.0300854 (0)	total: 734ms	remaining: 2m 45s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.9429523361
bestIteration = 33

Shrink model to first 34 iterations.

[I 2022-06-02 11:59:51,542] Trial 8 finished with value: 0.8891591081610171 and parameters: {'depth': 6, 'min_data_in_leaf': 51, 'l2_leaf_reg': 7, 'colsample_bylevel': 0.8307576661697187, 'learning_rate': 0.44239201381919957, 'subsample': 0.7697952150088694, 'iterations': 227}. Best is trial 0 with value: 0.8606304292551291.

0:	learn: 1.1567741	test: 1.0958238	best: 1.0958238 (0)	total: 763ms	remaining: 10m 54s
100:	learn: 0.8920460	test: 0.9351082	best: 0.9344029 (90)	total: 1m 16s	remaining: 9m 30s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.9338687148
bestIteration = 115

Shrink model to first 116 iterations.

[I 2022-06-02 12:02:15,520] Trial 9 finished with value: 0.8721107764498629 and parameters: {'depth': 7, 'min_data_in_leaf': 51, 'l2_leaf_reg': 7, 'colsample_bylevel': 0.9735331717447835, 'learning_rate': 0.14349366648996406, 'subsample': 0.8482159777091005, 'iterations': 858}. Best is trial 0 with value: 0.8606304292551291.

{'depth': 8, 'min_data_in_leaf': 19, 'l2_leaf_reg': 13, 'colsample_bylevel': 0.5378079736103758, 'learning_rate': 0.2862666355245815, 'subsample': 0.866195462513664, 'iterations': 932}

In [79]:

catBestParams=study.best_trial.params
catBestParams.update({
           'eval_metric':'RMSE',
           'loss_function':'RMSE'})

In [80]:

catBoostReg=CatBoostRegressor(**catBestParams)
catBoostReg.fit(x_train_cat,y_train.astype('float'),eval_set=[(x_cv_cat,y_cv.astype('float'))],early_stopping_rounds=50,verbose=50)
predCatCV=catBoostReg.predict(x_cv_cat)
print('MEAN SQUARED ERROR OF VALIDATION SET FOR CATBOOST REGRESSOR ',mean_squared_error(predCatCV,y_cv))
predCatTest=catBoostReg.predict(x_test)
predCatTest=predCatTest.clip(0,20)

0:	learn: 1.1029261	test: 1.0570314	best: 1.0570314 (0)	total: 1.23s	remaining: 19m 6s
50:	learn: 0.8880053	test: 0.9343844	best: 0.9329967 (49)	total: 44.7s	remaining: 12m 52s
100:	learn: 0.8687744	test: 0.9290252	best: 0.9277017 (73)	total: 1m 28s	remaining: 12m 8s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.927701692
bestIteration = 73

Shrink model to first 74 iterations.
MEAN SQUARED ERROR OF VALIDATION SET FOR CATBOOST REGRESSOR  0.8606304292551291

In [81]:

#predLgb=predLgb.clip(0,20)
predFinal1=[]
for i in predLgbTest:
    predFinal1.append(np.math.floor(i))
    
# predXgbr=predXgbr.clip(0,20)
predFinal2=[]
for i in predXgbTest:
    predFinal2.append(np.math.floor(i))
    
predFinal3=[]
for i in predCatTest:
    predFinal3.append(np.math.floor(i))

Averaging the predictions¶

In [85]:

predictions=[]
for i in range(len(predFinal1)):
    val=0.3*predFinal1[i]+0.3*predFinal2[i]+0.4*predFinal3[i]
    predictions.append(int(val))

In [86]:

submissionDF=pd.DataFrame(testDF['ID'],columns=['ID'])
submissionDF['item_cnt_month']=predictions
submissionDF.to_csv('result.csv',index=False)

The RMSE for the testing set is 0.92