- import calendar
- import os
- import pickle
- from itertools import product
-
- import numpy as np
- import pandas as pd
- from sklearn.preprocessing import LabelEncoder, MinMaxScaler
-
- from .paths import pfs_data_dir, pfs_split_dir
-
-
- def feature_engineering():
- # read data
- sales = pd.read_csv(os.path.join(pfs_data_dir, "sales_train.csv"))
- shops = pd.read_csv(os.path.join(pfs_data_dir, "shops.csv"))
- items = pd.read_csv(os.path.join(pfs_data_dir, "items.csv"))
- item_cats = pd.read_csv(os.path.join(pfs_data_dir, "item_categories.csv"))
- test = pd.read_csv(os.path.join(pfs_data_dir, "test.csv"))
-
- # remove outliers
- train = sales[(sales.item_price < 10000) & (sales.item_price > 0)].copy()
- train = train[train.item_cnt_day < 1001]
-
- print(train.shape, sales.shape)
- print(train.tail(5))
- print(sales.tail(5))
-
- # combine shops with different id but the same name
- train.loc[train.shop_id == 0, "shop_id"] = 57
- test.loc[test.shop_id == 0, "shop_id"] = 57
-
- train.loc[train.shop_id == 1, "shop_id"] = 58
- test.loc[test.shop_id == 1, "shop_id"] = 58
-
- train.loc[train.shop_id == 40, "shop_id"] = 39
- test.loc[test.shop_id == 40, "shop_id"] = 39
-
- # obtain shop_id, item_id, month information
- index_cols = ["shop_id", "item_id", "date_block_num"]
-
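- # for each month, build the cross product of the shops and items that appear in that
- # month's sales, so (shop, item) pairs with zero sales still get a row in the grid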
- df = []
- for block_num in train["date_block_num"].unique():
- cur_shops = train.loc[train["date_block_num"] == block_num, "shop_id"].unique()
- cur_items = train.loc[train["date_block_num"] == block_num, "item_id"].unique()
- df.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])), dtype="int32"))
-
- df = pd.DataFrame(np.vstack(df), columns=index_cols, dtype=np.int32)
- print("df.shape: ", df.shape)
- print(df.head(5))
-
- # add monthly sales: aggregate item_cnt_day into the target item_cnt_month per (date_block_num, shop_id, item_id)
- group = train.groupby(["date_block_num", "shop_id", "item_id"]).agg({"item_cnt_day": ["sum"]})
- group.columns = ["item_cnt_month"]
- group.reset_index(inplace=True)
- print("group.shape: ", group.shape)
- print(group.head(5))
-
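- # merge the monthly totals into the grid; grid rows with no sales get item_cnt_month = 0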
- df = pd.merge(df, group, on=index_cols, how="left")
- df["item_cnt_month"] = (
- df["item_cnt_month"]
- .fillna(0)
- .astype(np.float32)
- # df['item_cnt_month'].fillna(0).clip(0, 20).astype(np.float32)
- )
-
- # append the test rows (date_block_num = 34) to the grid
- test["date_block_num"] = 34
- test["date_block_num"] = test["date_block_num"].astype(np.int8)
- test["shop_id"] = test["shop_id"].astype(np.int8)
- test["item_id"] = test["item_id"].astype(np.int16)
- df = pd.concat([df, test], ignore_index=True, sort=False)
- df.fillna(0, inplace=True)
-
- # shop location features
- shops["city"] = shops["shop_name"].apply(lambda x: x.split()[0].lower())
- shops.loc[shops.city == "!якутск", "city"] = "якутск"
- shops["city_code"] = LabelEncoder().fit_transform(shops["city"])
-
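- # manually curated city -> (latitude, longitude, region code) lookup; the third value becomes
- # country_part below, and (0, 0, 0) is used for non-geographic "shops" such as the online store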
- coords = dict()
- coords["якутск"] = (62.028098, 129.732555, 4)
- coords["адыгея"] = (44.609764, 40.100516, 3)
- coords["балашиха"] = (55.8094500, 37.9580600, 1)
- coords["волжский"] = (53.4305800, 50.1190000, 3)
- coords["вологда"] = (59.2239000, 39.8839800, 2)
- coords["воронеж"] = (51.6720400, 39.1843000, 3)
- coords["выездная"] = (0, 0, 0)
- coords["жуковский"] = (55.5952800, 38.1202800, 1)
- coords["интернет-магазин"] = (0, 0, 0)
- coords["казань"] = (55.7887400, 49.1221400, 4)
- coords["калуга"] = (54.5293000, 36.2754200, 4)
- coords["коломна"] = (55.0794400, 38.7783300, 4)
- coords["красноярск"] = (56.0183900, 92.8671700, 4)
- coords["курск"] = (51.7373300, 36.1873500, 3)
- coords["москва"] = (55.7522200, 37.6155600, 1)
- coords["мытищи"] = (55.9116300, 37.7307600, 1)
- coords["н.новгород"] = (56.3286700, 44.0020500, 4)
- coords["новосибирск"] = (55.0415000, 82.9346000, 4)
- coords["омск"] = (54.9924400, 73.3685900, 4)
- coords["ростовнадону"] = (47.2313500, 39.7232800, 3)
- coords["спб"] = (59.9386300, 30.3141300, 2)
- coords["самара"] = (53.2000700, 50.1500000, 4)
- coords["сергиев"] = (56.3000000, 38.1333300, 4)
- coords["сургут"] = (61.2500000, 73.4166700, 4)
- coords["томск"] = (56.4977100, 84.9743700, 4)
- coords["тюмень"] = (57.1522200, 65.5272200, 4)
- coords["уфа"] = (54.7430600, 55.9677900, 4)
- coords["химки"] = (55.8970400, 37.4296900, 1)
- coords["цифровой"] = (0, 0, 0)
- coords["чехов"] = (55.1477000, 37.4772800, 4)
- coords["ярославль"] = (57.6298700, 39.8736800, 2)
-
- shops["city_coord_1"] = shops["city"].apply(lambda x: coords[x][0])
- shops["city_coord_2"] = shops["city"].apply(lambda x: coords[x][1])
- shops["country_part"] = shops["city"].apply(lambda x: coords[x][2])
-
- shops = shops[["shop_id", "city_code", "city_coord_1", "city_coord_2", "country_part"]]
-
- df = pd.merge(df, shops, on=["shop_id"], how="left")
-
- # process item category names
- map_dict = {
- "Чистые носители (штучные)": "Чистые носители",
- "Чистые носители (шпиль)": "Чистые носители",
- "PC ": "Аксессуары",
- "Служебные": "Служебные ",
- }
-
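- # broad category = text before the first "-" in the category name (with manual fixes via map_dict);
- # item_category_common encodes this broad group, item_category_code encodes the full category name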
- items = pd.merge(items, item_cats, on="item_category_id")
-
- items["item_category"] = items["item_category_name"].apply(lambda x: x.split("-")[0])
- items["item_category"] = items["item_category"].apply(lambda x: map_dict[x] if x in map_dict.keys() else x)
- items["item_category_common"] = LabelEncoder().fit_transform(items["item_category"])
-
- items["item_category_code"] = LabelEncoder().fit_transform(items["item_category_name"])
- items = items[["item_id", "item_category_common", "item_category_code"]]
-
- df = pd.merge(df, items, on=["item_id"], how="left")
-
- # Calendar features: number of Sundays (weekend count) and number of days in each month
- def count_days(date_block_num):
- year = 2013 + date_block_num // 12
- month = 1 + date_block_num % 12
- weeknd_count = len([1 for i in calendar.monthcalendar(year, month) if i[6] != 0])
- days_in_month = calendar.monthrange(year, month)[1]
- return weeknd_count, days_in_month, month
-
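- # precompute the calendar features once for all 35 month blocks (0 = January 2013, 34 = November 2015)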
- map_dict = {i: count_days(i) for i in range(35)}
-
- df["weeknd_count"] = df["date_block_num"].apply(lambda x: map_dict[x][0])
- df["days_in_month"] = df["date_block_num"].apply(lambda x: map_dict[x][1])
-
- # Interaction features: item is new / item was bought in this shop before
- first_item_block = df.groupby(["item_id"])["date_block_num"].min().reset_index()
- first_item_block["item_first_interaction"] = 1
-
- first_shop_item_buy_block = (
- df[df["date_block_num"] > 0].groupby(["shop_id", "item_id"])["date_block_num"].min().reset_index()
- )
- first_shop_item_buy_block["first_date_block_num"] = first_shop_item_buy_block["date_block_num"]
-
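- # item_first_interaction = 1 in the first month an item appears anywhere;
- # shop_item_sold_before = 1 if this shop already sold this item in an earlier month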
- df = pd.merge(
- df,
- first_item_block[["item_id", "date_block_num", "item_first_interaction"]],
- on=["item_id", "date_block_num"],
- how="left",
- )
- df = pd.merge(
- df,
- first_shop_item_buy_block[["item_id", "shop_id", "first_date_block_num"]],
- on=["item_id", "shop_id"],
- how="left",
- )
-
- df["first_date_block_num"].fillna(100, inplace=True)
- df["shop_item_sold_before"] = (df["first_date_block_num"] < df["date_block_num"]).astype("int8")
- df.drop(["first_date_block_num"], axis=1, inplace=True)
-
- df["item_first_interaction"].fillna(0, inplace=True)
- df["shop_item_sold_before"].fillna(0, inplace=True)
-
- df["item_first_interaction"] = df["item_first_interaction"].astype("int8")
- df["shop_item_sold_before"] = df["shop_item_sold_before"].astype("int8")
-
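- # lag_feature: for each lag i, shift a column forward by i month blocks and merge it back
- # on (date_block_num, shop_id, item_id), so every row sees the value from i months earlier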
- def lag_feature(df, lags, col):
- tmp = df[["date_block_num", "shop_id", "item_id", col]]
- for i in lags:
- shifted = tmp.copy()
- shifted.columns = ["date_block_num", "shop_id", "item_id", col + "_lag_" + str(i)]
- shifted["date_block_num"] += i
- df = pd.merge(df, shifted, on=["date_block_num", "shop_id", "item_id"], how="left")
- lag_name = col + "_lag_" + str(i)
- df[lag_name] = df[lag_name].astype("float32")
- return df
-
- df = lag_feature(df, [1, 2, 3], "item_cnt_month")
-
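- # price features: average price of the item in this shop vs. its overall average price,
- # expressed as a relative deviation; only the 1-3 month lags are kept and the raw columns are dropped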
- index_cols = ["shop_id", "item_id", "date_block_num"]
- group = (
- train.groupby(index_cols)["item_price"]
- .mean()
- .reset_index()
- .rename(columns={"item_price": "avg_shop_price"}, errors="raise")
- )
- df = pd.merge(df, group, on=index_cols, how="left")
-
- df["avg_shop_price"] = df["avg_shop_price"].fillna(0).astype(np.float32)
-
- index_cols = ["item_id", "date_block_num"]
- group = (
- train.groupby(["date_block_num", "item_id"])["item_price"]
- .mean()
- .reset_index()
- .rename(columns={"item_price": "avg_item_price"}, errors="raise")
- )
-
- df = pd.merge(df, group, on=index_cols, how="left")
- df["avg_item_price"] = df["avg_item_price"].fillna(0).astype(np.float32)
-
- df["item_shop_price_avg"] = (df["avg_shop_price"] - df["avg_item_price"]) / df["avg_item_price"]
- df["item_shop_price_avg"].fillna(0, inplace=True)
-
- df = lag_feature(df, [1, 2, 3], "item_shop_price_avg")
- df.drop(["avg_shop_price", "avg_item_price", "item_shop_price_avg"], axis=1, inplace=True)
-
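- # mean target encodings per month: item, item x city, and item x shop averages of item_cnt_month;
- # only their 1-3 month lags are kept, so the current month's target is not leaked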
- item_id_target_mean = (
- df.groupby(["date_block_num", "item_id"])["item_cnt_month"]
- .mean()
- .reset_index()
- .rename(columns={"item_cnt_month": "item_target_enc"}, errors="raise")
- )
- df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_id"], how="left")
-
- df["item_target_enc"] = df["item_target_enc"].fillna(0).astype(np.float32)
-
- df = lag_feature(df, [1, 2, 3], "item_target_enc")
- df.drop(["item_target_enc"], axis=1, inplace=True)
-
- item_id_target_mean = (
- df.groupby(["date_block_num", "item_id", "city_code"])["item_cnt_month"]
- .mean()
- .reset_index()
- .rename(columns={"item_cnt_month": "item_loc_target_enc"}, errors="raise")
- )
- df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_id", "city_code"], how="left")
-
- df["item_loc_target_enc"] = df["item_loc_target_enc"].fillna(0).astype(np.float32)
-
- df = lag_feature(df, [1, 2, 3], "item_loc_target_enc")
- df.drop(["item_loc_target_enc"], axis=1, inplace=True)
-
- item_id_target_mean = (
- df.groupby(["date_block_num", "item_id", "shop_id"])["item_cnt_month"]
- .mean()
- .reset_index()
- .rename(columns={"item_cnt_month": "item_shop_target_enc"}, errors="raise")
- )
-
- df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_id", "shop_id"], how="left")
-
- df["item_shop_target_enc"] = df["item_shop_target_enc"].fillna(0).astype(np.float32)
-
- df = lag_feature(df, [1, 2, 3], "item_shop_target_enc")
- df.drop(["item_shop_target_enc"], axis=1, inplace=True)
-
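- # For new items add avg category sales across all shops, kept as 1-3 month lags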
- item_id_target_mean = (
- df[df["item_first_interaction"] == 1]
- .groupby(["date_block_num", "item_category_code"])["item_cnt_month"]
- .mean()
- .reset_index()
- .rename(columns={"item_cnt_month": "new_item_cat_avg"}, errors="raise")
- )
-
- df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_category_code"], how="left")
-
- df["new_item_cat_avg"] = df["new_item_cat_avg"].fillna(0).astype(np.float32)
-
- df = lag_feature(df, [1, 2, 3], "new_item_cat_avg")
- df.drop(["new_item_cat_avg"], axis=1, inplace=True)
-
- # For new items add avg category sales within each shop, kept as 1-3 month lags
- item_id_target_mean = (
- df[df["item_first_interaction"] == 1]
- .groupby(["date_block_num", "item_category_code", "shop_id"])["item_cnt_month"]
- .mean()
- .reset_index()
- .rename(columns={"item_cnt_month": "new_item_shop_cat_avg"}, errors="raise")
- )
-
- df = pd.merge(df, item_id_target_mean, on=["date_block_num", "item_category_code", "shop_id"], how="left")
-
- df["new_item_shop_cat_avg"] = df["new_item_shop_cat_avg"].fillna(0).astype(np.float32)
-
- df = lag_feature(df, [1, 2, 3], "new_item_shop_cat_avg")
- df.drop(["new_item_shop_cat_avg"], axis=1, inplace=True)
-
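- # lag_feature_adv: same as lag_feature, but the merge is offset by one item_id, so each row
- # receives the lagged sales of item_id + 1; adjacent ids often belong to closely related products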
- def lag_feature_adv(df, lags, col):
- tmp = df[["date_block_num", "shop_id", "item_id", col]]
- for i in lags:
- shifted = tmp.copy()
- shifted.columns = ["date_block_num", "shop_id", "item_id", col + "_lag_" + str(i) + "_adv"]
- shifted["date_block_num"] += i
- shifted["item_id"] -= 1
- df = pd.merge(df, shifted, on=["date_block_num", "shop_id", "item_id"], how="left")
- lag_name = col + "_lag_" + str(i) + "_adv"
- df[lag_name] = df[lag_name].astype("float32")
- return df
-
- df = lag_feature_adv(df, [1, 2, 3], "item_cnt_month")
-
- # df.fillna(0, inplace=True)
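- # drop the first three month blocks, whose 1-3 month lag features cannot be computed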
- df = df[(df["date_block_num"] > 2)]
-
- df.drop(["ID"], axis=1, inplace=True, errors="ignore")
-
- print(df.shape)
- print(df.columns)
- print(df.head(10))
-
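- # global per-column means, used below as a last-resort fill for NaNs that survive the
- # per-item and per-shop fills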
- fill_dict = {}
- for col in df.columns:
- fill_dict[col] = df[col].mean()
-
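- # from here on, each shop is processed independently: remaining NaNs are filled, features are
- # min-max scaled per shop, and a per-shop train/validation split is written to pfs_split_dir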
- group_df = df.groupby(["shop_id"])
-
- for shop_id, shop_df in group_df:
- # drop date_block_num == 34 (November 2015), the competition test month, and work on a copy
- shop_df = shop_df[shop_df.date_block_num <= 33].copy()
-
- # fill remaining nulls: forward/backward fill within each item's history, then the
- # per-shop column mean, then the global column mean computed above
- null_cols = shop_df.columns[shop_df.isnull().any().values].tolist()
- shop_df[null_cols] = shop_df.groupby("item_id", sort=False)[null_cols].ffill()
- shop_df[null_cols] = shop_df.groupby("item_id", sort=False)[null_cols].bfill()
- shop_df[null_cols] = shop_df[null_cols].fillna(shop_df[null_cols].mean())
- for col in null_cols:
- shop_df[col] = shop_df[col].fillna(fill_dict[col])
-
- # min-max scale
- drop_fea_list = [
- "shop_id",
- "city_code",
- "city_coord_1",
- "city_coord_2",
- "country_part",
- "item_cnt_month",
- "date_block_num",
- ]
- fea_list = [col for col in shop_df.columns if col not in drop_fea_list]
- mms = MinMaxScaler()
- shop_df[fea_list] = mms.fit_transform(shop_df[fea_list])
- shop_df = shop_df[fea_list + ["item_cnt_month", "date_block_num"]]
-
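- # choose the train/validation boundary: start at month block 29 and move it earlier until
- # both sides of the split are non-empty for this shop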
- date_split = 29
- split = False
-
- while split is False:
- df1 = shop_df[shop_df["date_block_num"] <= date_split]
-
- df2 = shop_df[shop_df["date_block_num"] > date_split]
-
- if df2.shape[0] > 0 and df1.shape[0] > 0:
- split = True
- else:
- date_split -= 1
-
- if date_split < 0:
- break
-
- if split is True:
- print("ShopID:{}, split block:{}".format(shop_id, date_split))
- print(df1.shape, df2.shape)
-
- # save train csv
- fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-train.csv".format(shop_id))
- df1.to_csv(fpath, index=False)
-
- # save val csv
- fpath = os.path.join(pfs_split_dir, "Shop{:0>2d}-val.csv".format(shop_id))
- df2.to_csv(fpath, index=False)