The following libraries were used for this project:
library(tidyverse)
library(lubridate)
library(readr)
library(timeDate)
The data sets are read in below; they can be downloaded from the Kaggle competition page at https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data
item_categories= read_csv("item_categories.csv")
items= read_csv("items.csv")
sales_train=read_csv("sales_train.csv")
shops=read_csv("shops.csv")
test=read_csv("test.csv")
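As a quick check that everything loaded (a sketch; the exact counts depend on the competition files), the number of rows in each table can be inspected:
sapply(list(item_categories=item_categories, items=items, sales_train=sales_train,
            shops=shops, test=test), nrow)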
#create enhanced sales training data: append the test rows, then join all datasets
#the test rows are treated as month 34 (November 2015) with placeholder price/count values
test$date_block_num=34
test$date="02.11.2015"
test$item_price=0
test$item_cnt_day=0
#placeholder ID so the training rows can be row-bound with the test rows
sales_train$ID=9999999999
sales_train=rbind(sales_train,test)
enh_sales_train=left_join(sales_train, shops)
enh_sales_train=left_join(enh_sales_train, items)
enh_sales_train=left_join(enh_sales_train, item_categories)
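Because shops, items, and item_categories are lookup tables with unique keys (shop_id, item_id, item_category_id), the joins above should only add columns, not rows. A quick sanity check of that assumption:
nrow(enh_sales_train) == nrow(sales_train)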
#remove if price greater than 100000
enh_sales_train=enh_sales_train[enh_sales_train$item_price < 100000, ]
#remove if price is negative
enh_sales_train=enh_sales_train[enh_sales_train$item_price > -0.00001, ]
# remove shops not in the test set
testshopids=test$shop_id
enh_sales_train=filter(enh_sales_train, shop_id %in% testshopids)
# fix shops that have multiple ids
enh_sales_train$shop_id=recode(enh_sales_train$shop_id, `0` = 57, `1` = 58, `40`=39, `10`=11)
A new variable was created to reflect the number of characters in an item name.
# item name length
enh_sales_train$item_name_length=nchar(enh_sales_train$item_name)
A new variable was created to show the first date the item was sold across all shops.
#create item first sales date
firstsaledates=aggregate(x = enh_sales_train$date_block_num,
by = list(enh_sales_train$item_id),
FUN = min)
firstsaledates=rename(firstsaledates, itemfirstsaledate=x, item_id=Group.1)
enh_sales_train=left_join(enh_sales_train, firstsaledates)
A new variable was created to show the first date an item was sold in a specific shop.
#create item-shop first sales date
firstsaledates=aggregate(x = enh_sales_train$date_block_num,
by = list(enh_sales_train$shop_id,enh_sales_train$item_id),
FUN = min)
firstsaledates=rename(firstsaledates, shopfirstsaledate=x, shop_id=Group.1,item_id=Group.2)
enh_sales_train=left_join(enh_sales_train, firstsaledates)
A new variable was created to separate the city name, which was included in the shop name variable.
# Get city names based off of shop names
enh_sales_train=separate(enh_sales_train, shop_name, c("cityname", NA,NA),sep=" ", remove=FALSE)
Given that the original data is in Russian, there may have been some issues inputting the data. One shop name began with “!”, so this character was removed.
#fix one shop whose name starts with "!"
enh_sales_train$cityname=gsub("!", "", enh_sales_train$cityname)
#encode city name as a numeric code for modeling
enh_sales_train$cityname=as.factor(enh_sales_train$cityname)
enh_sales_train$cityname=as.numeric(enh_sales_train$cityname)
Variables were created containing the latitude, longitude, and region of the city in which each shop is located.
#add in city coordinates and region of country
city_coordinates= read_csv("city_coordinates.csv")
city_coordinates=separate(city_coordinates, coord, c("cityname","lat","long" ,"region"),sep=",", remove=TRUE)
city_coordinates$lat=as.numeric(city_coordinates$lat)
city_coordinates$long=as.numeric(city_coordinates$long)
city_coordinates$region=as.numeric(city_coordinates$region)
city_coordinates$cityname=as.factor(city_coordinates$cityname)
city_coordinates$cityname=as.numeric(city_coordinates$cityname)
enh_sales_train=left_join(enh_sales_train, city_coordinates)
A binary variable was created to reflect whether an item was considered “expensive.” An expensive item was one whose price fell within the top 25% of item prices.
#expensive item indicator (price above the 75th percentile of all item prices)
enh_sales_train$EII=ifelse(enh_sales_train$item_price > quantile(enh_sales_train$item_price, probs = 0.75, na.rm = TRUE), 1, 0)
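As a rough check on the flag (a sketch; the share will not be exactly 0.25 because of ties and the placeholder test prices), the proportion of rows marked expensive should be close to 25%:
mean(enh_sales_train$EII, na.rm = TRUE)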
A variable was created to show the average prices of all items in each store.
# mean of item prices in store
temp=enh_sales_train %>% # Specify data frame
group_by(shop_id) %>% # Specify group indicator
summarise_at(vars(item_price), # Specify column
list(storemeanprice = mean)) # Specify function
enh_sales_train=left_join(enh_sales_train, temp)
A variable was created that reflected the average price of each item across all stores.
# mean of item prices across stores
temp=enh_sales_train %>% # Specify data frame
group_by(item_id) %>% # Specify group indicator
summarise_at(vars(item_price), # Specify column
list(itemmeanprice = mean)) # Specify function
enh_sales_train=left_join(enh_sales_train, temp)
A variable was created that showed the difference between an item's price and the average price of items in that store. This was done to determine if selling a high-priced item in a low-end store, or vice versa, would have an impact on sales.
#relative price index
enh_sales_train$RPI=enh_sales_train$item_price-enh_sales_train$storemeanprice
A new variable was created to indicate whether an item in a store was priced above or below the average price of that item across all stores.
#Deviation of price of item dependent on location
enh_sales_train$lessexpensive=ifelse(enh_sales_train$itemmeanprice+1 > enh_sales_train$item_price,1,0)
enh_sales_train$moreexpensive=ifelse(enh_sales_train$itemmeanprice+1 < enh_sales_train$item_price,1,0)
A variable was created to show the percentage of items in a store that were “overpriced.” An overpriced item was one whose price in a store was higher than that item’s average price across all stores.
#percent of overpriced items for a store
temp=enh_sales_train %>% # Specify data frame
group_by(shop_id) %>% # Specify group indicator
summarise_at(vars(moreexpensive), # Specify column
list(percitemsexp = mean)) # Specify function
enh_sales_train=left_join(enh_sales_train, temp)
A new binary variable was created to show if the item was returned.
#if there was a return
enh_sales_train$return=ifelse(enh_sales_train$item_cnt_day<0,1,0)
A new variable was created to show the percentage of sale records at each store that were returns.
#percent of items returned for a store
temp=enh_sales_train %>% # Specify data frame
group_by(shop_id) %>% # Specify group indicator
summarise_at(vars(return), # Specify column
list(percitemsret = mean)) # Specify function
enh_sales_train=left_join(enh_sales_train, temp)
The original data stores dates as day.month.year strings. Three new variables were created to hold the day, month, and year respectively. A fourth column was created that reflects the number of days in the month in which a sale took place.
#proper date format
enh_sales_train$date1=parse_date_time(enh_sales_train$date, orders = c( "dmy", "mdy","ymd"))
#month
enh_sales_train$month=months(enh_sales_train$date1)
enh_sales_train$month=as.factor(enh_sales_train$month)
enh_sales_train$month=as.numeric(enh_sales_train$month)
#year
enh_sales_train$year=year(enh_sales_train$date1)
enh_sales_train$year=as.factor(enh_sales_train$year)
enh_sales_train$year=as.numeric(enh_sales_train$year)
#day
enh_sales_train$day=day(enh_sales_train$date1)
#days in the month
enh_sales_train$numdaysmonth=days_in_month(enh_sales_train$date1)
Finally, a variable was created that reflected the average number of items sold at a shop each month.
# average number of item sales per month per shop
itemsbymonth=aggregate(x = enh_sales_train$item_cnt_day,
by = list(enh_sales_train$shop_id,enh_sales_train$date_block_num ),
FUN = sum)
meanbymonth=aggregate(x = itemsbymonth$x,
by = list(itemsbymonth$Group.1 ),
FUN = mean)
meanbymonth=rename(meanbymonth, avgitemshop=x, shop_id=Group.1)
enh_sales_train=left_join(enh_sales_train, meanbymonth)
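For readability, the same two-step aggregation can be written with dplyr; the sketch below is not assigned to anything and is shown only to make the logic explicit:
enh_sales_train %>%
  group_by(shop_id, date_block_num) %>%
  summarise(monthtotal=sum(item_cnt_day)) %>%   # total items sold per shop per month
  group_by(shop_id) %>%
  summarise(avgitemshop=mean(monthtotal))       # average monthly total per shop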
The original data contains the number of sales of each item in each shop on each day. Since the test data we eventually need to predict covers the entire month of November 2015, the daily data must be aggregated to monthly data. The transformed data will contain the shop id, item id, month, year, and the variables that were created above.
#transform dataset for prediction
predictiondata=enh_sales_train %>%
group_by(date_block_num, shop_id, item_id) %>%
summarise(item_cnt_month = sum(item_cnt_day) )
#make dataset wider
predictiondata_w=pivot_wider(predictiondata,names_from = date_block_num, values_from =item_cnt_month )
#fill in zeroes for NAs
predictiondata_w[is.na(predictiondata_w)] = 0
#make dataset longer again
predictiondata=pivot_longer(predictiondata_w,
cols = `0`:`34`,
names_to = c("date_block_num"))
predictiondata=rename(predictiondata, item_cnt_month=value)
#limit monthly count to fall in between 0 and 20
predictiondata$item_cnt_month=replace(predictiondata$item_cnt_month, predictiondata$item_cnt_month<0,0)
predictiondata$item_cnt_month=replace(predictiondata$item_cnt_month, predictiondata$item_cnt_month>20,20)
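Because the wide/long round trip inserts an explicit zero for every missing shop-item-month combination, each shop-item pair should now have one row per date block (35 blocks, 0 through 34). A quick check (sketch):
predictiondata %>%
  ungroup() %>%
  count(shop_id, item_id) %>%
  summarise(all_have_35_blocks=all(n == 35))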
The sales of each item in each store for each of the previous three months were added as lagged features.
#add in lags
predictiondata_plus <- predictiondata %>% # Add lagged column
group_by(shop_id, item_id) %>%
mutate(countlag1 =lag(item_cnt_month, n = 1, default = NA)) %>%
mutate(countlag2 =lag(item_cnt_month, n = 2, default = NA)) %>%
mutate(countlag3 =lag(item_cnt_month, n = 3, default = NA)) %>%
as.data.frame()
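To confirm the lags line up as intended, a single shop-item series can be inspected; the shop_id and item_id below are placeholders, and any pair present in the data will do:
predictiondata_plus %>%
  filter(shop_id == 5, item_id == 5037) %>%   # illustrative ids only
  select(date_block_num, item_cnt_month, countlag1, countlag2, countlag3) %>%
  head(6)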
#add in other features from the main dataset
#create aggregated monthly item price
addvar=enh_sales_train %>%
group_by(date_block_num, shop_id, item_id) %>%
summarise(item_price_month= mean(item_price) )
#pivot wider
add_var_w=pivot_wider(addvar,names_from = date_block_num,
values_from =c(item_price_month))
#fill in zeroes for NAs
add_var_w[is.na(add_var_w)] = 0
#make dataset longer again
addvar=pivot_longer(add_var_w,
cols = `0`:`34`,
names_to =c("date_block_num"),
values_to="item_price_month")
As we did with the number of items sold, the lagged price of the item from the previous three months was included.
#add in lags
addvar_plus <- addvar %>% # Add lagged column
group_by(shop_id, item_id) %>%
mutate(pricelag1 =lag(item_price_month, n = 1, default = NA)) %>%
mutate(pricelag2 =lag(item_price_month, n = 2, default = NA)) %>%
mutate(pricelag3 =lag(item_price_month, n = 3, default = NA)) %>%
as.data.frame()
predictiondata_plus=left_join(predictiondata_plus,addvar_plus)
Here we add in some variables from our feature engineering that did not translate when aggregating from daily to monthly data.
# add in store and item specific variables
morevars=select(enh_sales_train, c(date_block_num, shop_id, item_id, item_category_id, storemeanprice,
    itemmeanprice, percitemsexp, percitemsret, cityname, month, year, numdaysmonth, avgitemshop,
    lat, long, region, item_name_length, itemfirstsaledate, shopfirstsaledate))
#keep one row per shop/item/month so the join below does not duplicate rows
morevars=distinct(morevars)
predictiondata_plus$date_block_num=as.numeric(predictiondata_plus$date_block_num)
predictiondata_plus=left_join(predictiondata_plus,morevars)
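Since addvar_plus and morevars each contribute at most one row per shop, item, and month (given the distinct() step above), these joins should leave the number of rows unchanged. A quick check (sketch):
nrow(predictiondata_plus) == nrow(predictiondata)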
Lastly we remove some data that does not make sense for our analysis and prediction models.
#Delete rows not in original data
predictiondata_plus=predictiondata_plus %>% drop_na(item_category_id )
#delete rows from the first three months as they don't have the full number of lags
predictiondata_plus1<-predictiondata_plus[!(predictiondata_plus$date_block_num < 3),]
#fill in remaining NAs
predictiondata_plus1[is.na(predictiondata_plus1)] = 0
Our goal is to predict the number of each item sold in each store in a specified month, so the number of items sold is separated into a new dataframe.
#Separate outcomes (item monthly counts)
outcomes=select(predictiondata_plus1, c(date_block_num,item_cnt_month))
prediction=select(predictiondata_plus1, -c(item_cnt_month, item_price_month))
To compare the effectiveness of our models, a validation set is created that contains the 33rd date block (October 2015). The earlier months (date blocks 3 through 32) will be used as our training set.
#create training datasets
X_train=prediction[!(prediction$date_block_num > 32),]
y_train=outcomes[!(outcomes$date_block_num > 32),]
y_train=select(y_train, -c(date_block_num))
#create validation datasets
X_validate=prediction[(prediction$date_block_num== 33),]
Y_validate=outcomes[(outcomes$date_block_num == 33),]
Y_validate=select(Y_validate, -c(date_block_num))
#create test dataset
X_test=prediction[(prediction$date_block_num == 34),]
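As a consistency check (sketch), the training, validation, and test slices should partition the prediction data:
nrow(X_train) + nrow(X_validate) + nrow(X_test) == nrow(prediction)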
#convert to matrices
xtrain<- as.matrix(X_train)
xvalidate<- as.matrix(X_validate)
xtest=as.matrix(X_test)
ytrain=as.matrix(y_train)
yvalidate=as.matrix(Y_validate)
# change test data back to original row order
xtest3 <- left_join(test,as.data.frame(xtest))
xtest3 <- xtest3[,-c(1,5,6,7)]
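The join back onto test should give one row per submission ID, in the original test order; a quick check (sketch):
nrow(xtest3) == nrow(test)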
Due to the size of our dataset, some of the models I wanted to try were untenable in R, so the data was saved to CSV files to be used in Python.
# write preprocessed data to CSVs to be used in Python models
write.csv(as.data.frame(xtrain),"xtrain3.csv",row.names = FALSE)
write.csv(as.data.frame(xvalidate),"xvalidate3.csv",row.names = FALSE)
write.csv(as.data.frame(xtest3),"xtest3.csv",row.names = FALSE)
write.csv(as.data.frame(ytrain),"ytrain3.csv",row.names = FALSE)
write.csv(as.data.frame(yvalidate),"yvalidate3.csv",row.names = FALSE)
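As an optional round-trip check (a sketch), one of the files can be read back and its dimensions compared to the in-memory matrix:
check=read_csv("xtrain3.csv")
dim(check)   # should match dim(xtrain)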