R Markdown

You can embed an R code chunk like this:

# Read dataset1 from CSV and list its column names.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where the file exists at exactly this location.
dataset1 <- read.csv(
  file = "C:/Users/PGD/Downloads/dataset1_energysage.csv"
)
colnames(dataset1)
##  [1] "percent_new_sessions" "avg_session_duration" "bounce_rate"         
##  [4] "medium"               "new_users"            "pages_per_session"   
##  [7] "goal_1_comp"          "goal_1_conv_rate"     "goal_1_value"        
## [10] "sessions"             "month"                "month_yr"            
## [13] "year"
# Per-column summary of dataset1 (quantiles for numeric columns, level counts
# for factor columns).
summary(object = dataset1)
##  percent_new_sessions avg_session_duration  bounce_rate          medium   
##  Min.   :0.0000       Min.   :   0.0       Min.   :0.0000           :180  
##  1st Qu.:0.1918       1st Qu.: 105.0       1st Qu.:0.4264   (none)  :180  
##  Median :0.4973       Median : 223.0       Median :0.5731   email   :180  
##  Mean   :0.4450       Mean   : 248.0       Mean   :0.5817   organic :180  
##  3rd Qu.:0.6667       3rd Qu.: 329.8       3rd Qu.:0.7298   referral:180  
##  Max.   :1.0000       Max.   :2178.0       Max.   :1.0000   social  :180  
##                                                             (Other) :805  
##    new_users      pages_per_session  goal_1_comp     goal_1_conv_rate  
##  Min.   :     0   Min.   : 1.000    Min.   :   0.0   Min.   :0.000000  
##  1st Qu.:     4   1st Qu.: 2.032    1st Qu.:   0.0   1st Qu.:0.000000  
##  Median :   599   Median : 3.506    Median :   0.0   Median :0.000000  
##  Mean   : 23083   Mean   : 3.827    Mean   : 299.2   Mean   :0.015092  
##  3rd Qu.:  9226   3rd Qu.: 4.818    3rd Qu.:   2.0   3rd Qu.:0.009709  
##  Max.   :364649   Max.   :25.000    Max.   :5485.0   Max.   :0.284000  
##                                                                        
##   goal_1_value    sessions          month           month_yr   
##  Min.   :0     Min.   :     1   Min.   : 1.000   08_2016:  80  
##  1st Qu.:0     1st Qu.:    17   1st Qu.: 4.000   08_2017:  80  
##  Median :0     Median :  1756   Median : 7.000   11_2017:  80  
##  Mean   :0     Mean   : 34578   Mean   : 6.631   04_2017:  75  
##  3rd Qu.:0     3rd Qu.: 19020   3rd Qu.: 9.000   09_2017:  75  
##  Max.   :0     Max.   :526509   Max.   :12.000   05_2017:  70  
##                                                  (Other):1425  
##       year     
##  Min.   :2014  
##  1st Qu.:2015  
##  Median :2016  
##  Mean   :2016  
##  3rd Qu.:2017  
##  Max.   :2017  
## 
# d1: rows of dataset1 whose medium is "organic", restricted to the three
# columns used below. which() drops rows where the comparison is NA.
# NOTE(review): the summary above lists 180 "organic" rows while year spans
# 2014-2017, so there appear to be several rows per month. Confirm whether
# they should be aggregated and ordered by date before the time-series steps.
d1 <- dataset1[
  which(dataset1[["medium"]] == "organic"),
  c("new_users", "month", "year")
]

# d2: d1 plus a Date column set to the first day of each row's year/month.
d2 <- d1
d2[["date"]] <- as.Date(
  paste(d1[["year"]], d1[["month"]], "01", sep = "-"),
  format = "%Y-%m-%d"
)

# Disabled column drop, kept as originally written:
# d1<-d1[,-4]

# Scatter plot of new_users against the first-of-month date.
plot(x = d2$date, y = d2$new_users)

# Disabled linear-trend overlay, kept as originally written:
# abline(lm(d2["new_users"]~d2["date"]))}

# Mean new_users per year; the two-column result (year, new_users) is
# plotted as year against mean.
plot(
  aggregate(
    d2["new_users"],
    by = list(year = d2$year),
    FUN = mean
  )
)

# Box plot of new_users grouped by month number.
boxplot(d2$new_users ~ d2$month)

#plot(aggregate(d2["new_users"],by=c(d2["year"]),FUN=mean))
library(tseries) 
## Warning: package 'tseries' was built under R version 3.4.3
# Dickey-Fuller unit-root test on the raw new_users series, with
# "stationary" as the alternative hypothesis. k=0 sets the lag order to 0,
# so no lagged difference terms enter the test regression.
# NOTE(review): this tests the raw series, while the ARIMA model further down
# is fitted to log(d2$new_users) - confirm the scales are meant to differ.
adf.test(d2$new_users, alternative="stationary", k=0)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  d2$new_users
## Dickey-Fuller = -3.1099, Lag order = 0, p-value = 0.1123
## alternative hypothesis: stationary
# Same test on the first-differenced raw series (lag order 0), to check
# whether one round of differencing is enough to reject a unit root.
adf.test(diff(d2$new_users), alternative="stationary", k=0)
## Warning in adf.test(diff(d2$new_users), alternative = "stationary", k = 0):
## p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(d2$new_users)
## Dickey-Fuller = -12.353, Lag order = 0, p-value = 0.01
## alternative hypothesis: stationary
# Autocorrelation of the log-transformed series.
acf(x = log(d2$new_users))

# Autocorrelation of the first-differenced raw series.
acf(x = diff(d2$new_users))

# Autocorrelation of the first-differenced log series.
acf(x = diff(log(d2$new_users)))

# Partial autocorrelation of the first-differenced raw series.
pacf(x = diff(d2$new_users))

# Fit ARIMA(0, 1, 0) to log(new_users): no AR or MA terms, one order of
# differencing. The fitted model is then printed.
fit <- arima(x = log(d2$new_users), order = c(0, 1, 0))
fit
## 
## Call:
## arima(x = log(d2$new_users), order = c(0, 1, 0))
## 
## 
## sigma^2 estimated as 0.2903:  log likelihood = -143.29,  aic = 288.58
# Disabled seasonal argument for the arima() call above, kept as written:
# ,seasonal = list(order = c(0, 1, 1), period = 12)

# Predict 12 steps ahead from the fitted model.
pred <- predict(object = fit, n.ahead = 12)

# Disabled plot, kept as written.
# NOTE(review): fit was estimated on log(d2$new_users), so pred$pred is on
# the log scale, whereas this call plots it next to the raw d2$new_users.
# Back-transform before enabling.
# ts.plot(d2$new_users,pred$pred, log = "y", lty = c(1,3))