Demand Forecasting

ANIL NEBİ ŞENTÜRK
9 min read · Feb 15, 2022

https://vitalflux.com/machine-learning-techniques-demand-forecasting/

Hello! This week I'm back with a demand forecasting model.

We will produce a 3-month, item-level sales forecast for different stores; that is, a forecast 3 months ahead at the store-item level.

This study uses the “Store Item Demand Forecasting Challenge” dataset, taken from Kaggle:

https://www.kaggle.com/c/demand-forecasting-kernels-only

We import the required libraries (the LightGBM algorithm is used).

import time
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import lightgbm as lgb
import warnings

We call set_option so that all columns are displayed in full width, and we suppress warnings.

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
warnings.filterwarnings('ignore')

Loading the data: I read the dataset in as separate train & test files.

train = pd.read_csv('C:/Users/Lenovo/OneDrive/Masaüstü/anş.py/train.csv', parse_dates=['date'])
test = pd.read_csv('C:/Users/Lenovo/OneDrive/Masaüstü/test.csv', parse_dates=['date'])
sample_sub = pd.read_csv('C:/Users/Lenovo/OneDrive/Masaüstü/anş.py/sample_submission.csv')  # taken from Kaggle

We combine the train and test sets with concat and assign the result to df.

df = pd.concat([train, test], sort=False)

When we inspect df, some rows have a NaN value in the id variable, while others have a NaN value in the sales variable. Why?

Because the train set has no id column; and sales is NaN because the test set has no sales column.
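We can confirm this directly (the counts match the train and test shapes):

df["id"].isna().sum()     # 913000 -> all train rows (train has no id)
df["sales"].isna().sum()  # 45000  -> all test rows (test has no sales)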

Exploratory Data Analysis (EDA)

1. Let's observe the minimum and maximum dates in the dataset.
df["date"].min(), df["date"].max()
(Timestamp('2013-01-01 00:00:00'), Timestamp('2018-03-31 00:00:00'))
def check_df(dataframe, head=5):
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Tail #####################")
    print(dataframe.tail(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

check_df(train)
##################### Shape #####################
(913000, 4)
##################### Types #####################
date datetime64[ns]
store int64
item int64
sales int64
dtype: object
##################### Head #####################
date store item sales
0 2013-01-01 1 1 13
1 2013-01-02 1 1 11
2 2013-01-03 1 1 14
3 2013-01-04 1 1 13
4 2013-01-05 1 1 10
##################### Tail #####################
date store item sales
912995 2017-12-27 10 50 63
912996 2017-12-28 10 50 59
912997 2017-12-29 10 50 74
912998 2017-12-30 10 50 62
912999 2017-12-31 10 50 82
##################### NA #####################
date 0
store 0
item 0
sales 0
dtype: int64
##################### Quantiles #####################
0.00 0.05 0.50 0.95 0.99 1.00
store 1.0 1.0 5.5 10.0 10.0 10.0
item 1.0 3.0 25.5 48.0 50.0 50.0
sales 0.0 16.0 47.0 107.0 135.0 231.0

check_df(test)

##################### Shape #####################
(45000, 4)
##################### Types #####################
id int64
date datetime64[ns]
store int64
item int64
dtype: object
##################### Head #####################
id date store item
0 0 2018-01-01 1 1
1 1 2018-01-02 1 1
2 2 2018-01-03 1 1
3 3 2018-01-04 1 1
4 4 2018-01-05 1 1
##################### Tail #####################
id date store item
44995 44995 2018-03-27 10 50
44996 44996 2018-03-28 10 50
44997 44997 2018-03-29 10 50
44998 44998 2018-03-30 10 50
44999 44999 2018-03-31 10 50
##################### NA #####################
id 0
date 0
store 0
item 0
dtype: int64
##################### Quantiles #####################
0.00 0.05 0.50 0.95 0.99 1.00
id 0.0 2249.95 22499.5 42749.05 44549.01 44999.0
store 1.0 1.00 5.5 10.00 10.00 10.0
item 1.0 3.00 25.5 48.00 50.00 50.0

Let's run check_df(df) on the df we obtained by concatenating the train and test sets.

##################### Shape #####################
(958000, 5)
##################### Types #####################
date datetime64[ns]
store int64
item int64
sales float64
id float64
dtype: object
##################### Head #####################
date store item sales id
0 2013-01-01 1 1 13.0 NaN
1 2013-01-02 1 1 11.0 NaN
2 2013-01-03 1 1 14.0 NaN
3 2013-01-04 1 1 13.0 NaN
4 2013-01-05 1 1 10.0 NaN
##################### Tail #####################
date store item sales id
44995 2018-03-27 10 50 NaN 44995.0
44996 2018-03-28 10 50 NaN 44996.0
44997 2018-03-29 10 50 NaN 44997.0
44998 2018-03-30 10 50 NaN 44998.0
44999 2018-03-31 10 50 NaN 44999.0
##################### NA #####################
date 0
store 0
item 0
sales 45000
id 913000
dtype: int64
##################### Quantiles #####################
0.00 0.05 0.50 0.95 0.99 1.00
store 1.0 1.00 5.5 10.00 10.00 10.0
item 1.0 3.00 25.5 48.00 50.00 50.0
sales 0.0 16.00 47.0 107.00 135.00 231.0
id 0.0 2249.95 22499.5 42749.05 44549.01 44999.0

# How many stores are there?
df[["store"]].nunique()
store 10
dtype: int64
# How many items are there?
df[["item"]].nunique()
50
# Does every store carry the same number of unique items?
df.groupby(["store"])["item"].nunique()
store
1 50
2 50
3 50
4 50
5 50
6 50
7 50
8 50
9 50
10 50
Name: item, dtype: int64
# And does every store have an equal amount of sales?
df.groupby(["store", "item"]).agg({"sales": ["sum"]})
sales
sum
store item
1 1 36468.0
2 97050.0
3 60638.0
4 36440.0
5 30335.0
...
10 46 120601.0
47 45204.0
48 105570.0
49 60317.0
50 135192.0
[500 rows x 1 columns]
# sales statistics at the store-item level
df.groupby(["store", "item"]).agg({"sales": ["sum", "mean", "median", "std"]})
sales
sum mean median std
store item
1 1 36468.0 19.971522 19.0 6.741022
2 97050.0 53.148959 52.0 15.005779
3 60638.0 33.208105 33.0 10.072529
4 36440.0 19.956188 20.0 6.640618
5 30335.0 16.612815 16.0 5.672102
... ... ... ...
10 46 120601.0 66.046550 65.0 18.114991
47 45204.0 24.755750 24.0 7.924820
48 105570.0 57.814896 57.0 15.898538
49 60317.0 33.032311 32.0 10.091610
50 135192.0 74.037240 73.0 19.937566
[500 rows x 4 columns]

Feature Engineering: I expect new features to be generated around the date variable.

# Date Features
########################


def create_date_features(df):
    df['month'] = df.date.dt.month               # month
    df['day_of_month'] = df.date.dt.day          # day of the month
    df['day_of_year'] = df.date.dt.dayofyear     # day of the year
    df['week_of_year'] = df.date.dt.weekofyear   # week of the year (on pandas >= 2.0 use df.date.dt.isocalendar().week)
    df['day_of_week'] = df.date.dt.dayofweek     # day of the week
    df['year'] = df.date.dt.year                 # year
    df["is_wknd"] = df.date.dt.weekday // 4      # weekend vs weekday: 1 for Fri/Sat/Sun (weekday >= 4), 0 otherwise
    df['is_month_start'] = df.date.dt.is_month_start.astype(int)  # month start
    df['is_month_end'] = df.date.dt.is_month_end.astype(int)      # month end
    return df

df = create_date_features(df)
df.head(5)
df.groupby(["store", "item", "month"]).agg({"sales": ["sum", "mean", "median", "std"]})
sales
sum mean median std
store item month
1 1 1 2125.0 13.709677 13.0 4.397413
2 2063.0 14.631206 14.0 4.668146
3 2728.0 17.600000 17.0 4.545013
4 3118.0 20.786667 20.0 4.894301
5 3448.0 22.245161 22.0 6.564705
... ... ... ...
10 50 8 13108.0 84.567742 85.0 15.676527
9 11831.0 78.873333 79.0 15.207423
10 11322.0 73.045161 72.0 14.209171
11 11549.0 76.993333 77.0 16.253651
12 8724.0 56.283871 56.0 11.782529
[6000 rows x 4 columns]

Random Noise: We add noise to the new features we will derive from the sales variable. Adding noise protects us against overfitting (memorization).
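The random_noise helper called in the feature functions below isn't shown in the post; a minimal sketch consistent with how it is used (one noise value per row of the dataframe; the scale is an assumption) would be:

def random_noise(dataframe):
    # Small Gaussian noise, one value per row, added to smoothed sales features.
    return np.random.normal(scale=1.6, size=(len(dataframe),))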

# Lag/Shifted Features
########################


df.sort_values(by=['store', 'item', 'date'], axis=0, inplace=True)

check_df(df)
##################### Shape #####################
(958000, 14)
##################### Types #####################
date datetime64[ns]
store int64
item int64
sales float64
id float64
month int64
day_of_month int64
day_of_year int64
week_of_year int64
day_of_week int64
year int64
is_wknd int64
is_month_start int32
is_month_end int32
dtype: object
##################### Head #####################
date store item sales id month day_of_month day_of_year week_of_year day_of_week year is_wknd is_month_start is_month_end
0 2013-01-01 1 1 13.0 NaN 1 1 1 1 1 2013 0 1 0
1 2013-01-02 1 1 11.0 NaN 1 2 2 1 2 2013 0 0 0
2 2013-01-03 1 1 14.0 NaN 1 3 3 1 3 2013 0 0 0
3 2013-01-04 1 1 13.0 NaN 1 4 4 1 4 2013 1 0 0
4 2013-01-05 1 1 10.0 NaN 1 5 5 1 5 2013 1 0 0
##################### Tail #####################
date store item sales id month day_of_month day_of_year week_of_year day_of_week year is_wknd is_month_start is_month_end
44995 2018-03-27 10 50 NaN 44995.0 3 27 86 13 1 2018 0 0 0
44996 2018-03-28 10 50 NaN 44996.0 3 28 87 13 2 2018 0 0 0
44997 2018-03-29 10 50 NaN 44997.0 3 29 88 13 3 2018 0 0 0
44998 2018-03-30 10 50 NaN 44998.0 3 30 89 13 4 2018 1 0 0
44999 2018-03-31 10 50 NaN 44999.0 3 31 90 13 5 2018 1 0 1
##################### NA #####################
date 0
store 0
item 0
sales 45000
id 913000
month 0
day_of_month 0
day_of_year 0
week_of_year 0
day_of_week 0
year 0
is_wknd 0
is_month_start 0
is_month_end 0
dtype: int64
##################### Quantiles #####################
0.00 0.05 0.50 0.95 0.99 1.00
store 1.0 1.00 5.5 10.00 10.00 10.0
item 1.0 3.00 25.5 48.00 50.00 50.0
sales 0.0 16.00 47.0 107.00 135.00 231.0
id 0.0 2249.95 22499.5 42749.05 44549.01 44999.0
month 1.0 1.00 6.0 12.00 12.00 12.0
day_of_month 1.0 2.00 16.0 29.00 31.00 31.0
day_of_year 1.0 16.00 174.0 347.00 362.00 366.0
week_of_year 1.0 3.00 25.0 50.00 52.00 53.0
day_of_week 0.0 0.00 3.0 6.00 6.00 6.0
year 2013.0 2013.00 2015.0 2017.00 2018.00 2018.0
is_wknd 0.0 0.00 0.0 1.00 1.00 1.0
is_month_start 0.0 0.00 0.0 0.00 1.00 1.0
is_month_end 0.0 0.00 0.0 0.00 1.00 1.0

pd.DataFrame({"sales": df["sales"].values[0:10],
"lag1": df["sales"].shift(1).values[0:10],
"lag2": df["sales"].shift(2).values[0:10],
"lag3": df["sales"].shift(3).values[0:10],
"lag4": df["sales"].shift(4).values[0:10]})
def lag_features(dataframe, lags):
for lag in lags:
dataframe['sales_lag_' + str(lag)] = dataframe.groupby(["store", "item"])['sales'].transform(
lambda x: x.shift(lag)) + random_noise(dataframe)
return dataframe
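The call that applies lag_features isn't shown in the post, but the "71 variables" noted after ewm_features implies ten lag columns were created (14 base/date columns + 10 lags + 2 rolling means + 45 ewm features = 71). A plausible lag list, with every lag at least one 3-month forecast horizon in the past, would be:

df = lag_features(df, [91, 98, 105, 112, 119, 126, 182, 364, 546, 728])  # assumed lag set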

Rolling Mean Features (Moving Average): They reflect the trend and carry information from the past. Note the shift(1) in the second example below: without it, the window would include the current day's sales and leak the target.

pd.DataFrame({"sales": df["sales"].values[0:10],
              "roll2": df["sales"].rolling(window=2).mean().values[0:10],
              "roll3": df["sales"].rolling(window=3).mean().values[0:10],
              "roll5": df["sales"].rolling(window=5).mean().values[0:10]})


pd.DataFrame({"sales": df["sales"].values[0:10],
"roll2": df["sales"].shift(1).rolling(window=2).mean().values[0:10],
"roll3": df["sales"].shift(1).rolling(window=3).mean().values[0:10],
"roll5": df["sales"].shift(1).rolling(window=5).mean().values[0:10]})
def roll_mean_features(dataframe, windows):
for window in windows:
dataframe['sales_roll_mean_' + str(window)] = dataframe.groupby(["store", "item"])['sales']. \
transform(
lambda x: x.shift(1).rolling(window=window, min_periods=10, win_type="triang").mean()) + random_noise(
dataframe)
return dataframe


df = roll_mean_features(df, [365, 546])
df.tail()

Exponentially Weighted Mean Features: weighted averages in which a larger alpha gives more weight to the most recent observations.

pd.DataFrame({"sales": df["sales"].values[0:10],
"roll2": df["sales"].shift(1).rolling(window=2).mean().values[0:10],
"ewm099": df["sales"].shift(1).ewm(alpha=0.99).mean().values[0:10],
"ewm095": df["sales"].shift(1).ewm(alpha=0.95).mean().values[0:10],
"ewm07": df["sales"].shift(1).ewm(alpha=0.7).mean().values[0:10],
"ewm01": df["sales"].shift(1).ewm(alpha=0.1).mean().values[0:10]})
def ewm_features(dataframe, alphas, lags):
for alpha in alphas:
for lag in lags:
dataframe['sales_ewm_alpha_' + str(alpha).replace(".", "") + "_lag_" + str(lag)] = \
dataframe.groupby(["store", "item"])['sales'].transform(lambda x: x.shift(lag).ewm(alpha=alpha).mean())
return dataframe
alphas = [0.95, 0.9, 0.8, 0.7, 0.5]
lags = [91, 98, 105, 112, 180, 270, 365, 546, 728]

df = ewm_features(df, alphas, lags)
df.head()
check_df(df)  # we now have 71 variables

One-hot-encoding:

df = pd.get_dummies(df, columns=['store', 'item', 'day_of_week', 'month'])

########################
# Converting sales to log(1+sales)
########################

df['sales'] = np.log1p(df["sales"].values)
check_df(df)
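log1p compresses the scale of the target and is defined at zero sales (unlike log); np.expm1 inverts it exactly, which is why the predictions are transformed back with expm1 later on. A quick sanity check:

x = np.array([0.0, 13.0, 231.0])
np.expm1(np.log1p(x))  # array([  0.,  13., 231.]) -- the round-trip recovers the original sales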
# Custom Cost Function
########################

# MAE: mean absolute error
# MAPE: mean absolute percentage error
# SMAPE: Symmetric mean absolute percentage error (adjusted MAPE)
def smape(preds, target):
    n = len(preds)
    masked_arr = ~((preds == 0) & (target == 0))
    preds, target = preds[masked_arr], target[masked_arr]
    num = np.abs(preds - target)
    denom = np.abs(preds) + np.abs(target)
    smape_val = (200 * np.sum(num / denom)) / n
    return smape_val

def lgbm_smape(preds, train_data):
    labels = train_data.get_label()
    smape_val = smape(np.expm1(preds), np.expm1(labels))
    return 'SMAPE', smape_val, False
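A quick worked example of the metric (toy numbers, not from the dataset):

preds = np.array([100.0, 200.0])
target = np.array([110.0, 180.0])
smape(preds, target)  # 200/2 * (10/210 + 20/380) ≈ 10.03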

Time-Based Validation Sets:

# Train set: up to the beginning of 2017 (i.e., through the end of 2016).
train = df.loc[(df["date"] < "2017-01-01"), :]

# Validation set: the first 3 months of 2017.
val = df.loc[(df["date"] >= "2017-01-01") & (df["date"] < "2017-04-01"), :]

cols = [col for col in train.columns if col not in ['date', 'id', "sales", "year"]]

Using a list comprehension, we collect into cols all train columns except 'date', 'id', 'sales', and 'year'. Recall we noted earlier that the train set has no id variable.

Y_train = train['sales']
X_train = train[cols]

Y_val = val['sales']
X_val = val[cols]

Y_train.shape, X_train.shape, Y_val.shape, X_val.shape
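As a sanity check, the expected row counts follow from the date ranges (the column count of X depends on the feature steps above):

# 2013-2016 spans 1461 days, 2017-01-01..2017-03-31 spans 90 days,
# each day with 10 stores x 50 items = 500 series:
# Y_train: (730500,), Y_val: (45000,)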

# LightGBM parameters
lgb_params = {'metric': {'mae'},
              'num_leaves': 10,
              'learning_rate': 0.02,
              'feature_fraction': 0.8,
              'max_depth': 5,
              'verbose': 0,
              'num_boost_round': 1000,
              'early_stopping_rounds': 200,
              'nthread': -1}

# metric mae: l1, absolute loss, mean_absolute_error, regression_l1
# l2, square loss, mean_squared_error, mse, regression_l2, regression
# rmse, root square loss, root_mean_squared_error, l2_root
# mape, MAPE loss, mean_absolute_percentage_error

# num_leaves: maximum number of leaves in a tree
# learning_rate: shrinkage_rate, eta
# feature_fraction: the random subspace property of random forests; the number of features randomly considered at each iteration.
# max_depth: maximum depth
# num_boost_round: n_estimators, number of boosting iterations. Should be set to at least around 10000-15000.

# early_stopping_rounds: stop training if the metric on the validation set has not improved
# within the given number of rounds, i.e. if the error is no longer decreasing.
# This both shortens training time and helps prevent overfitting.
# nthread: num_thread, nthread, nthreads, n_jobs

lgbtrain = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)
lgbval = lgb.Dataset(data=X_val, label=Y_val, reference=lgbtrain, feature_name=cols)

model = lgb.train(lgb_params, lgbtrain,
                  valid_sets=[lgbtrain, lgbval],
                  num_boost_round=lgb_params['num_boost_round'],
                  early_stopping_rounds=lgb_params['early_stopping_rounds'],
                  feval=lgbm_smape,
                  verbose_eval=100)
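Note that this call uses the older LightGBM API; in LightGBM >= 4.0, early_stopping_rounds and verbose_eval are no longer accepted by lgb.train, so an equivalent call (with 'early_stopping_rounds' dropped from lgb_params) would use callbacks:

model = lgb.train(lgb_params, lgbtrain,
                  valid_sets=[lgbtrain, lgbval],
                  num_boost_round=lgb_params['num_boost_round'],
                  feval=lgbm_smape,
                  callbacks=[lgb.early_stopping(stopping_rounds=200),
                             lgb.log_evaluation(period=100)])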

y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)

smape(np.expm1(y_pred_val), np.expm1(Y_val))


########################
# Feature importance levels
########################

def plot_lgb_importances(model, plot=False, num=10):
    gain = model.feature_importance('gain')
    feat_imp = pd.DataFrame({'feature': model.feature_name(),
                             'split': model.feature_importance('split'),
                             'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
    if plot:
        plt.figure(figsize=(10, 10))
        sns.set(font_scale=1)
        sns.barplot(x="gain", y="feature", data=feat_imp[0:25])
        plt.title('feature')
        plt.tight_layout()
        plt.show()
    else:
        print(feat_imp.head(num))


plot_lgb_importances(model, num=30)
plot_lgb_importances(model, num=30, plot=True)

lgb.plot_importance(model, max_num_features=20, figsize=(10, 10), importance_type="gain")
plt.show()

########################
# Final Model
########################



train = df.loc[~df.sales.isna()]
Y_train = train['sales']
X_train = train[cols]

test = df.loc[df.sales.isna()]
X_test = test[cols]

lgb_params = {'metric': {'mae'},
              'num_leaves': 10,
              'learning_rate': 0.02,
              'feature_fraction': 0.8,
              'max_depth': 5,
              'verbose': 0,
              'nthread': -1,
              "num_boost_round": model.best_iteration}


# LightGBM dataset
lgbtrain_all = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)

model = lgb.train(lgb_params, lgbtrain_all, num_boost_round=model.best_iteration)

test_preds = model.predict(X_test, num_iteration=model.best_iteration)


submission_df = test.loc[:, ['id', 'sales']]
submission_df['sales'] = np.expm1(test_preds)
submission_df['id'] = submission_df.id.astype(int)

submission_df.to_csv('submission_demand.csv', index=False)
submission_df.head(20)

VBO❤
