I had this old time series forecasting problem from an internship interview that I had not finished completely. So for HW3 since we have the choice to pick up time series of choice I will try to forecast the demand for solar panels specifically for “organic” medium only as was my task at hand.
There are 36 files given for each month starting from Dec 2014 until Nov 2017(last month). The excel sheets have similar format containing medium wise breakup of the new user acquisition and related metrics and usage numbers. Understanding the variables:
Following code will start importing the downloaded files and appending them into one dataset.
library(readxl)
## Warning: package 'readxl' was built under R version 3.4.2
library(forecast)
## Warning: package 'forecast' was built under R version 3.4.3
baseDir="D:/Internships/Interview Task"
setwd(baseDir)
rm(dataset)
## Warning in rm(dataset): object 'dataset' not found
file_list <- list.files()
for (file in file_list){
# if the merged dataset doesn't exist, create it
if (!exists("dataset")){
dataset<- read_excel(file, sheet=2)
names(dataset)<-c("medium", "sessions", "percent_new_sessions","new_users", "bounce_rate", "pages_per_session", "avg_session_duration", "goal_1_conv_rate", "goal_1_comp", "goal_1_value")
dataset$source<-file
}
# if the merged dataset does exist, append to it
if (exists("dataset")){
temp_dataset <-read_excel(file, sheet=2)
names(temp_dataset)<-c("medium", "sessions", "percent_new_sessions","new_users", "bounce_rate", "pages_per_session", "avg_session_duration", "goal_1_conv_rate", "goal_1_comp", "goal_1_value")
temp_dataset$source<-file
dataset<-rbind(dataset, temp_dataset)
rm(temp_dataset)
}
}
The input datafile is ready by reading and appending all the sheets into a data frame.
#dataset1<-read.csv("C:/Users/PGD/Downloads/dataset1_energysage.csv")
#names(dataset1)
# Getting the date information from the name of source file
start=regexpr("20",dataset$source)
dataset$date<-as.Date(substr(dataset$source,start,start+7),format="%Y%m%d")
dataset<-dataset[,-11]
library(psych)
describe(dataset)
## Warning in describe(dataset): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## vars n mean sd median trimmed mad
## medium* 1 348 NaN NA NA NaN NA
## sessions 2 385 33985.32 85726.92 1756.00 10876.25 2601.96
## percent_new_sessions 3 385 0.44 0.29 0.50 0.44 0.32
## new_users 4 385 22677.86 60669.50 599.00 6187.83 888.08
## bounce_rate 5 385 0.58 0.24 0.57 0.58 0.22
## pages_per_session 6 385 3.87 2.82 3.53 3.48 2.12
## avg_session_duration 7 385 249.47 237.47 226.58 218.33 164.79
## goal_1_conv_rate 8 385 0.02 0.04 0.00 0.01 0.00
## goal_1_comp 9 385 297.69 793.36 0.00 83.59 0.00
## goal_1_value 10 385 0.00 0.00 0.00 0.00 0.00
## date* 11 385 NaN NA NA NaN NA
## min max range skew kurtosis se
## medium* Inf -Inf -Inf NA NA NA
## sessions 1 526509.00 526508.00 3.59 13.03 4369.05
## percent_new_sessions 0 1.00 1.00 0.09 -0.91 0.01
## new_users 0 364649.00 364649.00 3.61 12.90 3092.00
## bounce_rate 0 1.00 1.00 -0.18 -0.20 0.01
## pages_per_session 1 25.00 24.00 2.97 14.40 0.14
## avg_session_duration 0 2178.00 2178.00 3.33 19.68 12.10
## goal_1_conv_rate 0 0.28 0.28 3.31 13.99 0.00
## goal_1_comp 0 5485.00 5485.00 3.54 14.37 40.43
## goal_1_value 0 0.00 0.00 NaN NaN 0.00
## date* Inf -Inf -Inf NA NA NA
d1<-subset(dataset,dataset["medium"]=="organic")
#d1<-subset(dataset,dataset["medium"]=="organic",select=c("new_users","date"))
library(ggplot2); library(tidyr); library(purrr)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Warning: package 'tidyr' was built under R version 3.4.2
## Warning: package 'purrr' was built under R version 3.4.2
d1[,-1] %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~key, scales = "free") +
geom_bar(stat="count", position=position_dodge()) +
ggtitle("Count of responses by Questions - Post(Anxiety)") +
scale_y_continuous(limits=c(0,30)) +
scale_x_discrete(drop=F)
## Warning: attributes are not identical across measure variables;
## they will be dropped
## Warning: Removed 1 rows containing missing values (geom_bar).
par(mfrow=c(1,2))
hist(d1$new_users,breaks = 30, main = "New Users")
boxplot(d1$new_users, main="box plots of new users range")
plot(d1$date,d1$new_users)
{plot(d1$date,d1$new_users,type='b',col="green",xlab="Date (Month/Yr)",ylab ="New Users" )
abline(lm(d1$new_users~d1$date))
# add a title and subtitle
title("Customer Growth", "Households interested in moving to Solar Panels Installation")}
d1$year=substr(d1$date,1,4)
annual_growth<-aggregate(d1$new_users,by=list(d1$year),FUN=mean)
colnames(annual_growth)[1]=("year")
barplot(annual_growth$x,main="Yearly acquisition of new customers", xlab="Year",ylab ="New Users" ,legend=annual_growth$year)
Company has seen exponential growth in the year 2017 owing to its partnerships and pricing and vetting strategies.
boxplot(d1$new_users~format(as.Date(d1$date,format="%Y-%m-%d"), "%m"))
The boxplots of the monthly averages shows the presence of a peculiar pattern, increase in monthly average demand over each month except the holidays of Dec month.
library(tseries)
## Warning: package 'tseries' was built under R version 3.4.3
adf.test(d1$new_users, alternative="stationary", k=0)
##
## Augmented Dickey-Fuller Test
##
## data: d1$new_users
## Dickey-Fuller = -1.0156, Lag order = 0, p-value = 0.9221
## alternative hypothesis: stationary
adf.test(diff(d1$new_users), alternative="stationary", k=0)
## Warning in adf.test(diff(d1$new_users), alternative = "stationary", k = 0):
## p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: diff(d1$new_users)
## Dickey-Fuller = -7.2011, Lag order = 0, p-value = 0.01
## alternative hypothesis: stationary
adf.test(log(d1$new_users), alternative="stationary", k=0)
##
## Augmented Dickey-Fuller Test
##
## data: log(d1$new_users)
## Dickey-Fuller = -1.6957, Lag order = 0, p-value = 0.6919
## alternative hypothesis: stationary
adf.test(diff(log(d1$new_users)), alternative="stationary", k=0)
## Warning in adf.test(diff(log(d1$new_users)), alternative = "stationary", :
## p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: diff(log(d1$new_users))
## Dickey-Fuller = -6.4139, Lag order = 0, p-value = 0.01
## alternative hypothesis: stationary
Statistically confirming the stationarity effect.
tsdisplay(log(d1$new_users))
tsdisplay(diff(d1$new_users))
tsdisplay(diff(log(d1$new_users)))
l<-length(d1$new_users)
train<-data.frame(d1[c(1:(l-6)),4])
test <-data.frame(d1[(l-5):l,4])
myts.tr<-ts(train,frequency = 12,end = c(4,7))
myts.te<-ts(test,frequency = 12,start = c(4,8))
plot(decompose(myts.tr,type = "multiplicative"))
lt.fit <- holt(myts.tr, h=6, initial = "optimal")
et.fit <- holt(myts.tr, h=6, exponential=TRUE)
dt.fit <- hw(myts.tr, h=6, damped=TRUE)
lt.pred <- forecast(lt.fit, h=6)
et.pred <- forecast(et.fit, h=6)
dt.pred <- forecast(dt.fit, h=6)
aa.fit <- auto.arima(train)
aa.pred <- forecast(aa.fit, h = 6)
#summary(aa.fit)
nn.fit<-nnetar(myts.tr,lambda= 0.5) # box-cox transformation
nn.pred <- forecast(nn.fit, h = 6)
#summary(nn.fit)
As there is presence of trend we could also try HoltWinters , ARIMA and neural net models.
accuracy(lt.pred$mean,myts.te)
## ME RMSE MAE MPE MAPE ACF1
## Test set -94086.15 100496.6 94086.15 -34.65927 34.65927 0.2895357
## Theil's U
## Test set 4.911575
accuracy(et.pred$mean,myts.te)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -149895.5 170162 149895.5 -54.27738 54.27738 0.4022184 8.161312
accuracy(dt.pred$mean,myts.te)
## ME RMSE MAE MPE MAPE ACF1
## Test set -33762.51 36985.01 33762.51 -12.53272 12.53272 -0.08564009
## Theil's U
## Test set 1.824607
# accuracy(aa.pred$mean,myts.te)
accuracy(nn.pred$mean,myts.te)
## ME RMSE MAE MPE MAPE ACF1
## Test set -275501.4 299308.1 275501.4 -100.6244 100.6244 0.4058125
## Theil's U
## Test set 15.15903
Holt method with damped trend seems a better fit in here and neural nets also doesnt give enough information in this scenario, mostly because the datapoints are very less for forecasting sake.