There are 36 files given for each month starting from Dec 2014 until Nov 2017(last month). The excel sheets have similar format containing medium wise breakup of the new user acquisition and related metrics and usage numbers. Understanding the variables: -
-
Following code will start importing the downloaded files and appending them into one dataset.
library(readxl)
## Warning: package 'readxl' was built under R version 3.4.2
baseDir="D:/Internships/Interview Task"
setwd(baseDir)
rm(dataset)
## Warning in rm(dataset): object 'dataset' not found
file_list <- list.files()
for (file in file_list){
# if the merged dataset doesn't exist, create it
if (!exists("dataset")){
dataset<- read_excel(file, sheet=2)
names(dataset)<-c("medium", "sessions", "percent_new_sessions","new_users", "bounce_rate", "pages_per_session", "avg_session_duration", "goal_1_conv_rate", "goal_1_comp", "goal_1_value")
dataset$source<-file
}
# if the merged dataset does exist, append to it
if (exists("dataset")){
temp_dataset <-read_excel(file, sheet=2)
names(temp_dataset)<-c("medium", "sessions", "percent_new_sessions","new_users", "bounce_rate", "pages_per_session", "avg_session_duration", "goal_1_conv_rate", "goal_1_comp", "goal_1_value")
temp_dataset$source<-file
dataset<-rbind(dataset, temp_dataset)
rm(temp_dataset)
}
}
#dataset1<-read.csv("C:/Users/PGD/Downloads/dataset1_energysage.csv")
#names(dataset1)
start=regexpr("20",dataset$source)
dataset$date<-as.Date(substr(dataset$source,start,start+7),format="%Y%m%d")
dataset<-dataset[,-11]
summary(dataset)
## medium sessions percent_new_sessions new_users
## Length:385 Min. : 1 Min. :0.0000 Min. : 0
## Class :character 1st Qu.: 17 1st Qu.:0.1922 1st Qu.: 4
## Mode :character Median : 1756 Median :0.4973 Median : 599
## Mean : 33985 Mean :0.4446 Mean : 22678
## 3rd Qu.: 18832 3rd Qu.:0.6667 3rd Qu.: 9226
## Max. :526509 Max. :1.0000 Max. :364649
## bounce_rate pages_per_session avg_session_duration goal_1_conv_rate
## Min. :0.0000 Min. : 1.000 Min. : 0.0 Min. :0.00000
## 1st Qu.:0.4239 1st Qu.: 2.032 1st Qu.: 105.0 1st Qu.:0.00000
## Median :0.5714 Median : 3.525 Median : 226.6 Median :0.00000
## Mean :0.5805 Mean : 3.868 Mean : 249.5 Mean :0.01552
## 3rd Qu.:0.7294 3rd Qu.: 4.834 3rd Qu.: 330.7 3rd Qu.:0.01500
## Max. :1.0000 Max. :25.000 Max. :2178.0 Max. :0.28400
## goal_1_comp goal_1_value date
## Min. : 0.0 Min. :0 Min. :2014-12-01
## 1st Qu.: 0.0 1st Qu.:0 1st Qu.:2015-10-01
## Median : 0.0 Median :0 Median :2016-08-01
## Mean : 297.7 Mean :0 Mean :2016-07-05
## 3rd Qu.: 3.0 3rd Qu.:0 3rd Qu.:2017-05-01
## Max. :5485.0 Max. :0 Max. :2017-11-01
d1<-subset(dataset,dataset["medium"]=="organic",select=c("new_users","date"))
plot(d1$date,d1$new_users)
{plot(d1$date,d1$new_users,type='b',col="green",xlab="Date (Month/Yr)",ylab ="New Users" )
abline(lm(d1$new_users~d1$date))
# add a title and subtitle
title("Customer Growth", "Households interested in moving to Solar Panels Installation")}
d1$year=substr(d1$date,1,4)
annual_growth<-aggregate(d1$new_users,by=list(d1$year),FUN=mean)
colnames(annual_growth)[1]=("year")
barplot(annual_growth$x,main="Yearly acquisition of new customers", xlab="Year",ylab ="New Users" ,legend=annual_growth$year)
Company has seen exponential growth in the year 2017 owing to its partnerships and pricing and vetting strategies.
boxplot(d1$new_users~format(as.Date(d1$date,format="%Y-%m-%d"), "%m"))
The boxplots of the monthly averages shows the presence of seasonality, if any.
library(tseries)
## Warning: package 'tseries' was built under R version 3.4.3
adf.test(d1$new_users, alternative="stationary", k=0)
##
## Augmented Dickey-Fuller Test
##
## data: d1$new_users
## Dickey-Fuller = -1.0156, Lag order = 0, p-value = 0.9221
## alternative hypothesis: stationary
adf.test(diff(d1$new_users), alternative="stationary", k=0)
## Warning in adf.test(diff(d1$new_users), alternative = "stationary", k = 0):
## p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: diff(d1$new_users)
## Dickey-Fuller = -7.2011, Lag order = 0, p-value = 0.01
## alternative hypothesis: stationary
Statistically confirming the stationarity effect.
acf(log(d1$new_users))
acf(diff(d1$new_users))
acf(diff(log(d1$new_users)))
pacf(diff(d1$new_users))
fit <- arima(diff(d1$new_users), c(0, 1, 1),seasonal = list(order = c(0, 1, 1), period = 12))
pred <- predict(fit, n.ahead = 1*12)
#ts.plot(d1$new_users,pred$pred, lty = c(1,3))