The following libraries were used for this project

library(tidyverse)
library(lubridate)
library(readr)
library(timeDate)

Reading in data sets that can be found on the page for the kaggle competition which can be found at https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data

item_categories= read_csv("item_categories.csv")
items= read_csv("items.csv")
sales_train=read_csv("sales_train.csv")
shops=read_csv("shops.csv")
test=read_csv("test.csv")

#create enhanced sales training data (join all datasets)

test$date_block_num=34
test$date="02.11.2015"
test$item_price=0
test$item_cnt_day=0
sales_train$ID=9999999999
sales_train=rbind(sales_train,test)

enh_sales_train=left_join(sales_train, shops)
enh_sales_train=left_join(enh_sales_train, items)
enh_sales_train=left_join(enh_sales_train, item_categories)

Clean dataset

remove outliers

#remove if price greater than 100000
enh_sales_train=enh_sales_train[enh_sales_train$item_price < 100000, ]  


#remove if price is negative 
enh_sales_train=enh_sales_train[enh_sales_train$item_price > -0.00001, ]  

Remove Data

# remove shops not in the test set 
testshopids=test$shop_id
enh_sales_train=filter(enh_sales_train, shop_id %in% testshopids)


# fix shops that have multiple ids
enh_sales_train$shop_id=recode(enh_sales_train$shop_id, `0` = 57, `1` = 58, `40`=39, `10`=11)

Feature Enginnering

A new variable was created to reflect the number of characters in an item name.

# item name length
enh_sales_train$item_name_length=nchar(enh_sales_train$item_name)

A new variable was created to show the first date the item was sold across all shops.

#create item first sales date
firstsaledates=aggregate(x = enh_sales_train$date_block_num,               
                                        by = list(enh_sales_train$item_id),              
                                        FUN = min) 
firstsaledates=rename(firstsaledates, itemfirstsaledate=x, item_id=Group.1)

enh_sales_train=left_join(enh_sales_train, firstsaledates)

A new variable was created to show the first date an item was sold in a specific shop

#create item-shop first sales date
firstsaledates=aggregate(x = enh_sales_train$date_block_num,               
                         by = list(enh_sales_train$shop_id,enh_sales_train$item_id),              
                         FUN = min) 
firstsaledates=rename(firstsaledates, shopfirstsaledate=x, shop_id=Group.1,item_id=Group.2)

enh_sales_train=left_join(enh_sales_train, firstsaledates)

A new variable was created to separate the city name, which was included in the shop name variable.

# Get city names based off of shop names  

enh_sales_train=separate(enh_sales_train, shop_name, c("cityname", NA,NA),sep=" ", remove=FALSE)

Given that the original data is in Russian, there may have been some issues inputing the data. One shop name began with “!” so this was removed.

#fix one shop that starts with ! 
enh_sales_train$cityname=gsub("!", "", enh_sales_train$cityname)
enh_sales_train$cityname=as.factor(enh_sales_train$cityname)
enh_sales_train$cityname=as.numeric(enh_sales_train$cityname)

Created variables that contained the latitude, longitude, and region of the city each shop was located in.

#add in city coordinates and region of country 
city_coordinates= read_csv("city_coordinates.csv")
city_coordinates=separate(city_coordinates, coord, c("cityname","lat","long" ,"region"),sep=",", remove=TRUE)
city_coordinates$lat=as.numeric(city_coordinates$lat)
city_coordinates$long=as.numeric(city_coordinates$long)
city_coordinates$region=as.numeric(city_coordinates$region)
city_coordinates$cityname=as.factor(city_coordinates$cityname)
city_coordinates$cityname=as.numeric(city_coordinates$cityname)
enh_sales_train=left_join(enh_sales_train, city_coordinates)

A binary variable was created to reflect whether the item was considered “expensive.” An expensive item was one that fell within the top 25% prices of items being sold.

#expensive item indicator 
enh_sales_train$EII=ifelse(IQR(enh_sales_train$item_price, na.rm = TRUE)<enh_sales_train$item_price,1,0 )

A variable was created to show the average prices of all items in each store.

# mean of item prices in store 
temp=enh_sales_train %>%  # Specify data frame
  group_by(shop_id) %>%                         # Specify group indicator
  summarise_at(vars(item_price),              # Specify column
               list(storemeanprice = mean))               # Specify function

enh_sales_train=left_join(enh_sales_train, temp)

A variable was created that reflected the average price of each item across all stores.

# mean of item prices across stores
temp=enh_sales_train %>%  # Specify data frame
  group_by(item_id) %>%                         # Specify group indicator
  summarise_at(vars(item_price),              # Specify column
               list(itemmeanprice = mean))               # Specify function

enh_sales_train=left_join(enh_sales_train, temp)

A variable was created that showed the difference between an item and the average price of items in that store. This was done to determine if selling a high price item in a low end store or vice versa would have an impact on sales.

#relative price index
enh_sales_train$RPI=enh_sales_train$item_price-enh_sales_train$storemeanprice

A new variable was cretaed to determine if an item in a store was more or less expensive than the average price of that item across all stores.

#Deviation of price of item dependent on location
enh_sales_train$lessexpensive=ifelse(enh_sales_train$itemmeanprice+1 > enh_sales_train$item_price,1,0)
enh_sales_train$moreexpensive=ifelse(enh_sales_train$itemmeanprice+1 < enh_sales_train$item_price,1,0)

A variable was created to show the percentage of items in a store were “overpriced.” Over priced was defined as items that had a higher price in a store than the item’s average price across all stores.

#percent of over priced items by day for a store  
temp=enh_sales_train %>%  # Specify data frame
  group_by(shop_id) %>%                         # Specify group indicator
  summarise_at(vars(moreexpensive),              # Specify column
               list(percitemsexp = mean))               # Specify function
enh_sales_train=left_join(enh_sales_train, temp)

A new binary variable was created to show if the item was returned.

#if there was a return 
enh_sales_train$return=ifelse(enh_sales_train$item_cnt_day<0,1,0)

A new variable was created to show the percent of items that were returned at each store everyday.

#percent of items returned day for a store  
temp=enh_sales_train %>%  # Specify data frame
  group_by(shop_id) %>%                         # Specify group indicator
  summarise_at(vars(return),              # Specify column
               list(percitemsret = mean))               # Specify function
enh_sales_train=left_join(enh_sales_train, temp)

The original data was in a D/M/Y format. Three new variables were created that held the day, month, and year respectively. A fourth column was created that reflected the number of days in the month a sale took place.

#proper date format 
enh_sales_train$date1=parse_date_time(enh_sales_train$date, orders = c( "dmy", "mdy","ymd"))  

#month
enh_sales_train$month=months(enh_sales_train$date1)
enh_sales_train$month=as.factor(enh_sales_train$month)
enh_sales_train$month=as.numeric(enh_sales_train$month)
#year
enh_sales_train$year=year(enh_sales_train$date1)
enh_sales_train$year=as.factor(enh_sales_train$year)
enh_sales_train$year=as.numeric(enh_sales_train$year)
#day
enh_sales_train$day=day(enh_sales_train$date1)

#days in the month
enh_sales_train$numdaysmonth=days_in_month(enh_sales_train$date1)

Finally, a variable was created the reflected the average number of items sold at a shop each month.

# average number of item sales per month per shop
itemsbymonth=aggregate(x = enh_sales_train$item_cnt_day,               
          by = list(enh_sales_train$shop_id,enh_sales_train$date_block_num ),              
          FUN = sum)                           

meanbymonth=aggregate(x = itemsbymonth$x,               
                      by = list(itemsbymonth$Group.1 ),              
                                    FUN = mean)                           
meanbymonth=rename(meanbymonth, avgitemshop=x, shop_id=Group.1)

enh_sales_train=left_join(enh_sales_train, meanbymonth)

Transformation

The original data contained information for the number of sales of each item in each shop every day. Since the test data we eventually need to predict is for the entire month of November, 2015, our current data must be aggregated from daily to monthly data. The transformed data will contain the shop id, item id, month, year, and the variables that were created above.

#transform dataset for prediction 
predictiondata=enh_sales_train %>%
  group_by(date_block_num, shop_id, item_id) %>%
  summarise(item_cnt_month = sum(item_cnt_day) )


#make dataset wider 
predictiondata_w=pivot_wider(predictiondata,names_from = date_block_num, values_from =item_cnt_month )

#fill in zeroes for NAs
predictiondata_w[is.na(predictiondata_w)] = 0

#make dataset longer again 
predictiondata=pivot_longer(predictiondata_w,
                            cols = `0`:`34`,
                            names_to = c("date_block_num"))
predictiondata=rename(predictiondata, item_cnt_month=value)

#limit monthly count to fall in between 0 and 20
predictiondata$item_cnt_month=replace(predictiondata$item_cnt_month, predictiondata$item_cnt_month<0,0)
predictiondata$item_cnt_month=replace(predictiondata$item_cnt_month, predictiondata$item_cnt_month>20,20)

The sales of each item in a store for the previous three months were added.

#add in lags
predictiondata_plus <- predictiondata %>%                            # Add lagged column
  group_by(shop_id, item_id) %>%
  mutate(countlag1 =lag(item_cnt_month, n = 1, default = NA)) %>% 
  mutate(countlag2 =lag(item_cnt_month, n = 2, default = NA)) %>% 
  mutate(countlag3 =lag(item_cnt_month, n = 3, default = NA)) %>% 
  as.data.frame()

#add in other features from main dataset

#create in aggregate variables  
addvar=enh_sales_train %>%
  group_by(date_block_num, shop_id, item_id) %>%
  summarise(item_price_month= mean(item_price) ) 
#pivot wider 
add_var_w=pivot_wider(addvar,names_from = date_block_num,
                      values_from =c(item_price_month))
#fill in zeroes for NAs 
add_var_w[is.na(add_var_w)] = 0
#make dataset longer again 
addvar=pivot_longer(add_var_w,
                    cols = `0`:`34`,
                    names_to =c("date_block_num"), 
                    values_to="item_price_month")

As we did with the number of items sold, the lagged price of the item from the previous three months was included.

#add in lags
addvar_plus <- addvar %>%                            # Add lagged column
  group_by(shop_id, item_id) %>%
  mutate(pricelag1 =lag(item_price_month, n = 1, default = NA)) %>% 
  mutate(pricelag2 =lag(item_price_month, n = 2, default = NA)) %>% 
  mutate(pricelag3 =lag(item_price_month, n = 3, default = NA)) %>% 
  as.data.frame()

predictiondata_plus=left_join(predictiondata_plus,addvar_plus)

Here we add in some variables from out feature engineering that did not translate when aggregating from day to month.

# add in store and item specific variables

morevars=select (enh_sales_train,c(date_block_num, shop_id, item_id,item_category_id,storemeanprice
                                   ,itemmeanprice,percitemsexp,percitemsret, cityname,month,year ,numdaysmonth,avgitemshop,lat,long,region,item_name_length,itemfirstsaledate,shopfirstsaledate ))

predictiondata_plus$date_block_num=as.numeric(predictiondata_plus$date_block_num)
predictiondata_plus=left_join(predictiondata_plus,morevars)

Lastly we remove some data that does not make sense for our analysis and prediction models.

#Delete rows not in original data
predictiondata_plus=predictiondata_plus %>% drop_na(item_category_id )

#delete rows from first three months as they don't have th full number of lags 
predictiondata_plus1<-predictiondata_plus[!(predictiondata_plus$date_block_num < 3),]

#fill in remaining NAs
predictiondata_plus1[is.na(predictiondata_plus1)] = 0

Train/Test split

Our goal is to predict the number of each item sold in each store in a specified month, so the number of items sold is separated into a new dataframe.

#Separate outcomes (item monthly counts)

outcomes=select(predictiondata_plus1, c(date_block_num,item_cnt_month))
prediction=select(predictiondata_plus1, -c(item_cnt_month, item_price_month)) 

To compare the effectiveness of our models, a validation set is created that contains the 33rd month of data (October 2015.) The first 32 months of data will be used as our training set.

#create training datasets 
X_train=prediction[!(prediction$date_block_num > 32),]

y_train=outcomes[!(outcomes$date_block_num > 32),]
y_train=select(y_train, -c(date_block_num)) 

#create validation datasets
X_validate=prediction[(prediction$date_block_num== 33),]


Y_validate=outcomes[(outcomes$date_block_num == 33),]
Y_validate=select(Y_validate, -c(date_block_num))

#create test dataset training   
X_test=prediction[(prediction$date_block_num == 34),]  

#convert to matrices 
xtrain<- as.matrix(X_train) 
xvalidate<- as.matrix(X_validate) 
xtest=as.matrix(X_test)

ytrain=as.matrix(y_train) 
yvalidate=as.matrix(Y_validate)
# change test data back to original row order
xtest3 <- left_join(test,as.data.frame(xtest))
xtest3 <- xtest3[,-c(1,5,6,7)]

Due to the size of out dataset some of the models I wanted to try were untenable in R, so the data was saved to csvs in order to be used in python.

# right pre processed data to csvs to be used in python models
write.csv(as.data.frame(xtrain),"xtrain3.csv",row.names = FALSE)
write.csv(as.data.frame(xvalidate),"xvalidate3.csv",row.names = FALSE)
write.csv(as.data.frame(xtest3),"xtest3.csv",row.names = FALSE)
write.csv(as.data.frame(ytrain),"ytrain3.csv",row.names = FALSE)
write.csv(as.data.frame(yvalidate),"yvalidate3.csv",row.names = FALSE)