Available Information

There are 36 files given for each month starting from Dec 2014 until Nov 2017(last month). The excel sheets have similar format containing medium wise breakup of the new user acquisition and related metrics and usage numbers. Understanding the variables: -
-

Following code will start importing the downloaded files and appending them into one dataset.

library(readxl)
## Warning: package 'readxl' was built under R version 3.4.2
baseDir="D:/Internships/Interview Task"
setwd(baseDir)
rm(dataset)
## Warning in rm(dataset): object 'dataset' not found
file_list <- list.files()
for (file in file_list){
       
  # if the merged dataset doesn't exist, create it
  if (!exists("dataset")){
    dataset<- read_excel(file, sheet=2)
    names(dataset)<-c("medium", "sessions", "percent_new_sessions","new_users", "bounce_rate", "pages_per_session", "avg_session_duration", "goal_1_conv_rate",  "goal_1_comp", "goal_1_value")   
    dataset$source<-file
  }
   
  # if the merged dataset does exist, append to it
  if (exists("dataset")){
    temp_dataset <-read_excel(file, sheet=2)
    names(temp_dataset)<-c("medium", "sessions", "percent_new_sessions","new_users", "bounce_rate", "pages_per_session", "avg_session_duration", "goal_1_conv_rate",  "goal_1_comp", "goal_1_value") 
    temp_dataset$source<-file
    dataset<-rbind(dataset, temp_dataset)
    rm(temp_dataset)
  }
}
#dataset1<-read.csv("C:/Users/PGD/Downloads/dataset1_energysage.csv")
#names(dataset1)
start=regexpr("20",dataset$source)
dataset$date<-as.Date(substr(dataset$source,start,start+7),format="%Y%m%d")
dataset<-dataset[,-11]
summary(dataset)
##     medium             sessions      percent_new_sessions   new_users     
##  Length:385         Min.   :     1   Min.   :0.0000       Min.   :     0  
##  Class :character   1st Qu.:    17   1st Qu.:0.1922       1st Qu.:     4  
##  Mode  :character   Median :  1756   Median :0.4973       Median :   599  
##                     Mean   : 33985   Mean   :0.4446       Mean   : 22678  
##                     3rd Qu.: 18832   3rd Qu.:0.6667       3rd Qu.:  9226  
##                     Max.   :526509   Max.   :1.0000       Max.   :364649  
##   bounce_rate     pages_per_session avg_session_duration goal_1_conv_rate 
##  Min.   :0.0000   Min.   : 1.000    Min.   :   0.0       Min.   :0.00000  
##  1st Qu.:0.4239   1st Qu.: 2.032    1st Qu.: 105.0       1st Qu.:0.00000  
##  Median :0.5714   Median : 3.525    Median : 226.6       Median :0.00000  
##  Mean   :0.5805   Mean   : 3.868    Mean   : 249.5       Mean   :0.01552  
##  3rd Qu.:0.7294   3rd Qu.: 4.834    3rd Qu.: 330.7       3rd Qu.:0.01500  
##  Max.   :1.0000   Max.   :25.000    Max.   :2178.0       Max.   :0.28400  
##   goal_1_comp      goal_1_value      date           
##  Min.   :   0.0   Min.   :0     Min.   :2014-12-01  
##  1st Qu.:   0.0   1st Qu.:0     1st Qu.:2015-10-01  
##  Median :   0.0   Median :0     Median :2016-08-01  
##  Mean   : 297.7   Mean   :0     Mean   :2016-07-05  
##  3rd Qu.:   3.0   3rd Qu.:0     3rd Qu.:2017-05-01  
##  Max.   :5485.0   Max.   :0     Max.   :2017-11-01
d1<-subset(dataset,dataset["medium"]=="organic",select=c("new_users","date"))
plot(d1$date,d1$new_users)

{plot(d1$date,d1$new_users,type='b',col="green",xlab="Date (Month/Yr)",ylab ="New Users" )
abline(lm(d1$new_users~d1$date))
# add a title and subtitle 
title("Customer Growth", "Households interested in moving to Solar Panels Installation")}

d1$year=substr(d1$date,1,4)
annual_growth<-aggregate(d1$new_users,by=list(d1$year),FUN=mean)
colnames(annual_growth)[1]=("year")
barplot(annual_growth$x,main="Yearly acquisition of new customers", xlab="Year",ylab ="New Users" ,legend=annual_growth$year)

Company has seen exponential growth in the year 2017 owing to its partnerships and pricing and vetting strategies.

boxplot(d1$new_users~format(as.Date(d1$date,format="%Y-%m-%d"), "%m"))

The boxplots of the monthly averages shows the presence of seasonality, if any.

library(tseries) 
## Warning: package 'tseries' was built under R version 3.4.3
adf.test(d1$new_users, alternative="stationary", k=0)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  d1$new_users
## Dickey-Fuller = -1.0156, Lag order = 0, p-value = 0.9221
## alternative hypothesis: stationary
adf.test(diff(d1$new_users), alternative="stationary", k=0)
## Warning in adf.test(diff(d1$new_users), alternative = "stationary", k = 0):
## p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(d1$new_users)
## Dickey-Fuller = -7.2011, Lag order = 0, p-value = 0.01
## alternative hypothesis: stationary

Statistically confirming the stationarity effect.

acf(log(d1$new_users))

acf(diff(d1$new_users))

acf(diff(log(d1$new_users)))

pacf(diff(d1$new_users))

fit <- arima(diff(d1$new_users), c(0, 1, 1),seasonal = list(order = c(0, 1, 1), period = 12))
pred <- predict(fit, n.ahead = 1*12)
#ts.plot(d1$new_users,pred$pred, lty = c(1,3))