Forecasting Class - HW3

I had this old time series forecasting problem from an internship interview that I had not finished completely. So for HW3 since we have the choice to pick up time series of choice I will try to forecast the demand for solar panels specifically for “organic” medium only as was my task at hand.

Data Structure

There are 36 files given for each month starting from Dec 2014 until Nov 2017(last month). The excel sheets have similar format containing medium wise breakup of the new user acquisition and related metrics and usage numbers. Understanding the variables:

  • medium - source or medium that directed the user to client website for query.
  • sessions - no. of sessions opened.
  • percent_new_sessions - what percentage of above were done at different times.
  • new_users - total new users
  • bounce_rate - percent of these new users lost.
  • pages_per_session - no. of pages browsed per session/ search activity.
  • avg_session_duration - average time taken during each session/ search activity.
  • goal_1_conv_rate - impression to registration.
  • goal_1_comp -
  • goal_1_value - dollar value.

Data Loading

Following code will start importing the downloaded files and appending them into one dataset.

library(readxl)
## Warning: package 'readxl' was built under R version 3.4.2
library(forecast)
## Warning: package 'forecast' was built under R version 3.4.3
baseDir="D:/Internships/Interview Task"
setwd(baseDir)
rm(dataset)
## Warning in rm(dataset): object 'dataset' not found
file_list <- list.files()
for (file in file_list){
       
  # if the merged dataset doesn't exist, create it
  if (!exists("dataset")){
    dataset<- read_excel(file, sheet=2)
    names(dataset)<-c("medium", "sessions", "percent_new_sessions","new_users", "bounce_rate", "pages_per_session", "avg_session_duration", "goal_1_conv_rate",  "goal_1_comp", "goal_1_value")   
    dataset$source<-file
  }
   
  # if the merged dataset does exist, append to it
  if (exists("dataset")){
    temp_dataset <-read_excel(file, sheet=2)
    names(temp_dataset)<-c("medium", "sessions", "percent_new_sessions","new_users", "bounce_rate", "pages_per_session", "avg_session_duration", "goal_1_conv_rate",  "goal_1_comp", "goal_1_value") 
    temp_dataset$source<-file
    dataset<-rbind(dataset, temp_dataset)
    rm(temp_dataset)
  }
}

The input datafile is ready by reading and appending all the sheets into a data frame.

#dataset1<-read.csv("C:/Users/PGD/Downloads/dataset1_energysage.csv")
#names(dataset1)

Data Pre-processing

# Getting the date information from the name of source file
start=regexpr("20",dataset$source)
dataset$date<-as.Date(substr(dataset$source,start,start+7),format="%Y%m%d")
dataset<-dataset[,-11]
library(psych)
describe(dataset)
## Warning in describe(dataset): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
##                      vars   n     mean       sd  median  trimmed     mad
## medium*                 1 348      NaN       NA      NA      NaN      NA
## sessions                2 385 33985.32 85726.92 1756.00 10876.25 2601.96
## percent_new_sessions    3 385     0.44     0.29    0.50     0.44    0.32
## new_users               4 385 22677.86 60669.50  599.00  6187.83  888.08
## bounce_rate             5 385     0.58     0.24    0.57     0.58    0.22
## pages_per_session       6 385     3.87     2.82    3.53     3.48    2.12
## avg_session_duration    7 385   249.47   237.47  226.58   218.33  164.79
## goal_1_conv_rate        8 385     0.02     0.04    0.00     0.01    0.00
## goal_1_comp             9 385   297.69   793.36    0.00    83.59    0.00
## goal_1_value           10 385     0.00     0.00    0.00     0.00    0.00
## date*                  11 385      NaN       NA      NA      NaN      NA
##                      min       max     range  skew kurtosis      se
## medium*              Inf      -Inf      -Inf    NA       NA      NA
## sessions               1 526509.00 526508.00  3.59    13.03 4369.05
## percent_new_sessions   0      1.00      1.00  0.09    -0.91    0.01
## new_users              0 364649.00 364649.00  3.61    12.90 3092.00
## bounce_rate            0      1.00      1.00 -0.18    -0.20    0.01
## pages_per_session      1     25.00     24.00  2.97    14.40    0.14
## avg_session_duration   0   2178.00   2178.00  3.33    19.68   12.10
## goal_1_conv_rate       0      0.28      0.28  3.31    13.99    0.00
## goal_1_comp            0   5485.00   5485.00  3.54    14.37   40.43
## goal_1_value           0      0.00      0.00   NaN      NaN    0.00
## date*                Inf      -Inf      -Inf    NA       NA      NA
d1<-subset(dataset,dataset["medium"]=="organic")
#d1<-subset(dataset,dataset["medium"]=="organic",select=c("new_users","date"))

library(ggplot2); library(tidyr); library(purrr)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Warning: package 'tidyr' was built under R version 3.4.2
## Warning: package 'purrr' was built under R version 3.4.2
d1[,-1] %>%
  gather() %>%
  ggplot(aes(value)) + 
  facet_wrap(~key, scales = "free") + 
  geom_bar(stat="count", position=position_dodge()) + 
  ggtitle("Count of responses by Questions - Post(Anxiety)") + 
    scale_y_continuous(limits=c(0,30)) +
  scale_x_discrete(drop=F)
## Warning: attributes are not identical across measure variables;
## they will be dropped
## Warning: Removed 1 rows containing missing values (geom_bar).

par(mfrow=c(1,2))
hist(d1$new_users,breaks = 30, main = "New Users")

boxplot(d1$new_users, main="box plots of new users range")

plot(d1$date,d1$new_users)

{plot(d1$date,d1$new_users,type='b',col="green",xlab="Date (Month/Yr)",ylab ="New Users" )
abline(lm(d1$new_users~d1$date))
# add a title and subtitle 
title("Customer Growth", "Households interested in moving to Solar Panels Installation")}

d1$year=substr(d1$date,1,4)
annual_growth<-aggregate(d1$new_users,by=list(d1$year),FUN=mean)
colnames(annual_growth)[1]=("year")
barplot(annual_growth$x,main="Yearly acquisition of new customers", xlab="Year",ylab ="New Users" ,legend=annual_growth$year)

Company has seen exponential growth in the year 2017 owing to its partnerships and pricing and vetting strategies.

boxplot(d1$new_users~format(as.Date(d1$date,format="%Y-%m-%d"), "%m"))

The boxplots of the monthly averages shows the presence of a peculiar pattern, increase in monthly average demand over each month except the holidays of Dec month.

library(tseries) 
## Warning: package 'tseries' was built under R version 3.4.3
adf.test(d1$new_users, alternative="stationary", k=0)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  d1$new_users
## Dickey-Fuller = -1.0156, Lag order = 0, p-value = 0.9221
## alternative hypothesis: stationary
adf.test(diff(d1$new_users), alternative="stationary", k=0)
## Warning in adf.test(diff(d1$new_users), alternative = "stationary", k = 0):
## p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(d1$new_users)
## Dickey-Fuller = -7.2011, Lag order = 0, p-value = 0.01
## alternative hypothesis: stationary
adf.test(log(d1$new_users), alternative="stationary", k=0)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  log(d1$new_users)
## Dickey-Fuller = -1.6957, Lag order = 0, p-value = 0.6919
## alternative hypothesis: stationary
adf.test(diff(log(d1$new_users)), alternative="stationary", k=0)
## Warning in adf.test(diff(log(d1$new_users)), alternative = "stationary", :
## p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(log(d1$new_users))
## Dickey-Fuller = -6.4139, Lag order = 0, p-value = 0.01
## alternative hypothesis: stationary

Statistically confirming the stationarity effect.

tsdisplay(log(d1$new_users))

tsdisplay(diff(d1$new_users))

tsdisplay(diff(log(d1$new_users)))

l<-length(d1$new_users)
train<-data.frame(d1[c(1:(l-6)),4])
test <-data.frame(d1[(l-5):l,4])
myts.tr<-ts(train,frequency = 12,end = c(4,7))
myts.te<-ts(test,frequency = 12,start = c(4,8))
plot(decompose(myts.tr,type = "multiplicative"))

lt.fit <- holt(myts.tr, h=6, initial = "optimal")
et.fit <- holt(myts.tr, h=6, exponential=TRUE)
dt.fit <- hw(myts.tr, h=6, damped=TRUE)

lt.pred <- forecast(lt.fit, h=6)
et.pred <- forecast(et.fit, h=6)
dt.pred <- forecast(dt.fit, h=6)

aa.fit  <- auto.arima(train)
aa.pred <- forecast(aa.fit, h = 6)
#summary(aa.fit)

nn.fit<-nnetar(myts.tr,lambda= 0.5)   # box-cox transformation
nn.pred <- forecast(nn.fit, h = 6)
#summary(nn.fit)

As there is presence of trend we could also try HoltWinters , ARIMA and neural net models.

accuracy(lt.pred$mean,myts.te)
##                 ME     RMSE      MAE       MPE     MAPE      ACF1
## Test set -94086.15 100496.6 94086.15 -34.65927 34.65927 0.2895357
##          Theil's U
## Test set  4.911575
accuracy(et.pred$mean,myts.te)
##                 ME   RMSE      MAE       MPE     MAPE      ACF1 Theil's U
## Test set -149895.5 170162 149895.5 -54.27738 54.27738 0.4022184  8.161312
accuracy(dt.pred$mean,myts.te)
##                 ME     RMSE      MAE       MPE     MAPE        ACF1
## Test set -33762.51 36985.01 33762.51 -12.53272 12.53272 -0.08564009
##          Theil's U
## Test set  1.824607
# accuracy(aa.pred$mean,myts.te)
accuracy(nn.pred$mean,myts.te)
##                 ME     RMSE      MAE       MPE     MAPE      ACF1
## Test set -275501.4 299308.1 275501.4 -100.6244 100.6244 0.4058125
##          Theil's U
## Test set  15.15903

Holt method with damped trend seems a better fit in here and neural nets also doesnt give enough information in this scenario, mostly because the datapoints are very less for forecasting sake.