chunk like this:
# Load the CSV export and list its columns.
# The path is machine-specific; change `data_path` when running elsewhere.
data_path <- "C:/Users/PGD/Downloads/dataset1_energysage.csv"
# stringsAsFactors = TRUE pins the pre-R-4.0 default, so text columns such as
# `medium` and `month_yr` load as factors and summary() reports per-level
# counts, regardless of the R version used to run this script.
dataset1 <- read.csv(data_path, stringsAsFactors = TRUE)
names(dataset1)
## [1] "percent_new_sessions" "avg_session_duration" "bounce_rate"
## [4] "medium" "new_users" "pages_per_session"
## [7] "goal_1_comp" "goal_1_conv_rate" "goal_1_value"
## [10] "sessions" "month" "month_yr"
## [13] "year"
# Per-column overview: min/quartiles/mean/max for numeric columns and
# per-level counts for factor columns.
summary(dataset1)
## percent_new_sessions avg_session_duration bounce_rate medium
## Min. :0.0000 Min. : 0.0 Min. :0.0000 :180
## 1st Qu.:0.1918 1st Qu.: 105.0 1st Qu.:0.4264 (none) :180
## Median :0.4973 Median : 223.0 Median :0.5731 email :180
## Mean :0.4450 Mean : 248.0 Mean :0.5817 organic :180
## 3rd Qu.:0.6667 3rd Qu.: 329.8 3rd Qu.:0.7298 referral:180
## Max. :1.0000 Max. :2178.0 Max. :1.0000 social :180
## (Other) :805
## new_users pages_per_session goal_1_comp goal_1_conv_rate
## Min. : 0 Min. : 1.000 Min. : 0.0 Min. :0.000000
## 1st Qu.: 4 1st Qu.: 2.032 1st Qu.: 0.0 1st Qu.:0.000000
## Median : 599 Median : 3.506 Median : 0.0 Median :0.000000
## Mean : 23083 Mean : 3.827 Mean : 299.2 Mean :0.015092
## 3rd Qu.: 9226 3rd Qu.: 4.818 3rd Qu.: 2.0 3rd Qu.:0.009709
## Max. :364649 Max. :25.000 Max. :5485.0 Max. :0.284000
##
## goal_1_value sessions month month_yr
## Min. :0 Min. : 1 Min. : 1.000 08_2016: 80
## 1st Qu.:0 1st Qu.: 17 1st Qu.: 4.000 08_2017: 80
## Median :0 Median : 1756 Median : 7.000 11_2017: 80
## Mean :0 Mean : 34578 Mean : 6.631 04_2017: 75
## 3rd Qu.:0 3rd Qu.: 19020 3rd Qu.: 9.000 09_2017: 75
## Max. :0 Max. :526509 Max. :12.000 05_2017: 70
## (Other):1425
## year
## Min. :2014
## 1st Qu.:2015
## Median :2016
## Mean :2016
## 3rd Qu.:2017
## Max. :2017
##
# Keep only the rows whose medium is "organic", and only the three columns
# used by the analysis below.
d1 <- subset(dataset1, medium == "organic", select = c(new_users, month, year))

# d2 is d1 plus a Date column set to the first day of each (year, month) pair.
d2 <- d1
d2[["date"]] <- as.Date(
  paste(d1$year, d1$month, "01", sep = "-"),
  format = "%Y-%m-%d"
)
#d1<-d1[,-4]

# New users against date.
plot(d2$date, d2$new_users)
#abline(lm(d2["new_users"]~d2["date"]))}

# Mean new users per year.
plot(aggregate(d2["new_users"], by = d2["year"], FUN = mean))

# Spread of new users within each calendar month.
boxplot(d2$new_users ~ d2$month)
#plot(aggregate(d2["new_users"],by=c(d2["year"]),FUN=mean))
library(tseries)
## Warning: package 'tseries' was built under R version 3.4.3
# Dickey-Fuller unit-root test on the organic new-user counts in levels.
# k = 0 requests no lagged difference terms (reported as "Lag order = 0").
# NOTE(review): the values are taken in d2's row order — confirm the rows are
# sorted chronologically, one observation per month, before treating them as
# a time series.
adf.test(d2$new_users, alternative="stationary", k=0)
##
## Augmented Dickey-Fuller Test
##
## data: d2$new_users
## Dickey-Fuller = -3.1099, Lag order = 0, p-value = 0.1123
## alternative hypothesis: stationary
# Same test, same settings, applied to the first-differenced series.
adf.test(diff(d2$new_users), alternative="stationary", k=0)
## Warning in adf.test(diff(d2$new_users), alternative = "stationary", k = 0):
## p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: diff(d2$new_users)
## Dickey-Fuller = -12.353, Lag order = 0, p-value = 0.01
## alternative hypothesis: stationary
# Correlograms: ACF of the log series, of the first differences, and of the
# log first differences, then the PACF of the first differences.
acf(log(d2$new_users))
acf(diff(d2$new_users))
acf(diff(log(d2$new_users)))
pacf(diff(d2$new_users))
# Fit ARIMA(0, 1, 0) to the log-transformed series; the outer parentheses
# print the fitted model as well as assigning it to `fit`.
# NOTE(review): log() gives -Inf for zero counts — this assumes every organic
# new_users value is strictly positive; confirm against the data.
(fit <- arima(log(d2$new_users), c(0, 1, 0)))
##
## Call:
## arima(x = log(d2$new_users), order = c(0, 1, 0))
##
##
## sigma^2 estimated as 0.2903: log likelihood = -143.29, aic = 288.58
# Disabled seasonal specification (presumably an extra argument for arima()):
#,seasonal = list(order = c(0, 1, 1), period = 12)
# Forecast twelve steps ahead. `fit` was estimated on log(d2$new_users), so
# pred$pred and pred$se are on the log scale, not in user counts.
pred <- predict(object = fit, n.ahead = 12)
# Disabled plot. As written it would draw the raw counts next to the
# log-scale forecast; exponentiate pred$pred before enabling it.
#ts.plot(d2$new_users,pred$pred, log = "y", lty = c(1,3))