R Markdown

Hi I am Phu.

rm(list = ls())

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.0     v readr   1.3.1
## v tibble  2.1.3     v purrr   0.3.2
## v tidyr   0.8.3     v stringr 1.4.0
## v ggplot2 3.2.0     v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readxl)
library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl,
##     flatten_int, flatten_lgl, flatten_raw, invoke, list_along,
##     modify, prepend, splice
library(ggplot2)

#insert theme
Phu_theme_black <- function(...) {
  
  theme(
    
    axis.line = element_blank(), 
    
    axis.text.x = element_text(color = "white", lineheight = 0.9), 
    
    axis.text.y = element_text(color = "white", lineheight = 0.9), 
    
    axis.ticks = element_line(color = "white", size  =  0.2), 
    
    axis.title.x = element_text(color = "white", margin = margin(0, 10, 0, 0)), 
    
    axis.title.y = element_text(color = "white", angle = 90, margin = margin(0, 10, 0, 0)), 
    
    axis.ticks.length = unit(0.3, "lines"),  
    
    legend.background = element_rect(color = NA, fill = "gray10"), 
    
    legend.key = element_rect(color = "white",  fill = "gray10"), 
    
    legend.key.size = unit(1.2, "lines"), 
    
    legend.key.height = NULL, 
    
    legend.key.width = NULL,     
    
    legend.text = element_text(color = "white"), 
    
    legend.title = element_text(face = "bold", hjust = 0, color = "white"), 
    
    legend.text.align = NULL, 
    
    legend.title.align = NULL, 
    
    legend.direction = "vertical", 
    
    legend.box = NULL,
    
    panel.background = element_rect(fill = "gray10", color  =  NA), 
    
    panel.border = element_blank(),
    
    panel.grid.major = element_line(color = "grey35"), 
    
    panel.grid.minor = element_line(color = "grey20"),  
    
    panel.spacing = unit(0.5, "lines"),  
    
    strip.background = element_rect(fill = "grey30", color = "grey10"), 
    
    strip.text.x = element_text(color = "white"), 
    
    strip.text.y = element_text(color = "white", angle = -90), 
    
    plot.background = element_rect(color = "gray10", fill = "gray10"), 
    
    plot.title = element_text(color = "white", hjust = 0, lineheight = 1.25, margin = margin(2, 2, 2, 2)), 
    
    plot.subtitle = element_text(color = "white", hjust = 0, margin = margin(2, 2, 2, 2)), 
    
    plot.caption = element_text(color = "white", hjust = 0), 
    
    plot.margin = unit(rep(1, 4), "lines"))
  
}

#Import data
data <- read.csv("E:/RStudio/RPractices/Practice/Monte_Carlo/Cusnew.csv")

class(data)
## [1] "data.frame"
str(data)
## 'data.frame':    43 obs. of  2 variables:
##  $ MONTH_ID            : Factor w/ 43 levels "2016-01-01","2016-02-01",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ COUNT.DISTINCTCUSID.: int  30561 43674 18519 17591 15833 16177 17414 21560 17209 18521 ...
colnames(data) <- c("Month", "SLKH")

head(data)
##        Month  SLKH
## 1 2016-01-01 30561
## 2 2016-02-01 43674
## 3 2016-03-01 18519
## 4 2016-04-01 17591
## 5 2016-05-01 15833
## 6 2016-06-01 16177
#data$Month <- as.Date(data$Month, format = "%y/%m/%d")

data$SLKH <-  as.numeric(data$SLKH)

#Train and test set

train = data %>% slice(1:43) %>% select(1:2)

test = data %>% slice(37:43) %>% select(1:2)

#Histogram

hist(train$SLKH)

ggplot(train, aes(x=SLKH))+
  
  geom_histogram(fill="white", alpha=.9)+
  
  labs(x=NULL, y=NULL, title = "Histogram of New cus")+
  
  # scale_x_continuous(breaks= seq(0,600000, by=100000))+
  
  Phu_theme_black() + theme(plot.title=element_text(vjust=3, size=15) )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#line

ggplot(train, aes(x=Month, y= SLKH))+
  
  geom_line(color= "red", aes(group=1), size=1.5)+
  
  geom_point(colour="red", size = 3.5, alpha=0.5)+
  
  labs(title="The number of new customers monthly", x=NULL, y="New cus")+
  
  theme( plot.title=element_text(vjust=3, size=15) ) + Phu_theme_black()

#Build model
stats=data.frame(y=log1p(train$SLKH)
                 
                 ,ds=train$Month)

stats=aggregate(stats$y,by=list(stats$ds),FUN=sum)

head(stats)
##      Group.1         x
## 1 2016-01-01 10.327513
## 2 2016-02-01 10.684531
## 3 2016-03-01  9.826607
## 4 2016-04-01  9.775200
## 5 2016-05-01  9.669915
## 6 2016-06-01  9.691408
colnames(stats)<- c("ds","y")

model = prophet(stats, changepoint.prior.scale = 0.01)
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
summary(model)
##                         Length Class      Mode     
## growth                   1     -none-     character
## changepoints            25     POSIXct    numeric  
## n.changepoints           1     -none-     numeric  
## changepoint.range        1     -none-     numeric  
## yearly.seasonality       1     -none-     character
## weekly.seasonality       1     -none-     character
## daily.seasonality        1     -none-     character
## holidays                 0     -none-     NULL     
## seasonality.mode         1     -none-     character
## seasonality.prior.scale  1     -none-     numeric  
## changepoint.prior.scale  1     -none-     numeric  
## holidays.prior.scale     1     -none-     numeric  
## mcmc.samples             1     -none-     numeric  
## interval.width           1     -none-     numeric  
## uncertainty.samples      1     -none-     numeric  
## specified.changepoints   1     -none-     logical  
## start                    1     POSIXct    numeric  
## y.scale                  1     -none-     numeric  
## logistic.floor           1     -none-     logical  
## t.scale                  1     -none-     numeric  
## changepoints.t          25     -none-     numeric  
## seasonalities            1     -none-     list     
## extra_regressors         0     -none-     list     
## country_holidays         0     -none-     NULL     
## stan.fit                 0     -none-     NULL     
## params                   5     -none-     list     
## history                  5     data.frame list     
## history.dates           43     POSIXct    numeric  
## train.holiday.names      0     -none-     NULL     
## train.component.cols     3     data.frame list     
## component.modes          2     -none-     list
future = make_future_dataframe(model, periods = 5, freq = "month")

tail(future)
##            ds
## 43 2019-07-01
## 44 2019-08-01
## 45 2019-09-01
## 46 2019-10-01
## 47 2019-11-01
## 48 2019-12-01
forecast = predict(model, future)

#Visualize the change point

add_changepoints_to_plot <- function(m, threshold = 0.01, cp_color = "red",
                                     cp_linetype = "dashed", trend = TRUE, ...){
  layers <- list()
  if (trend){
    trend_layer <- ggplot2::geom_line(
      ggplot2::aes_string("ds", "trend"), color = cp_color,...)
    layers <- append(layers, trend_layer)
  }
  signif_changepoints <- m$changepoints[abs(m$params$delta) >= threshold]
  cp_layer <- ggplot2::geom_vline(
    xintercept = as.integer(signif_changepoints), color = cp_color,
    linetype = cp_linetype,...
  )
  layers <- append(layers, cp_layer)
  return(layers)
}

plot(model, forecast)+
add_changepoints_to_plot(model)

prophet_plot_components(model, forecast)

#Data processing
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
str(forecast)
## 'data.frame':    48 obs. of  16 variables:
##  $ ds                        : POSIXct, format: "2016-01-01" "2016-02-01" ...
##  $ trend                     : num  9.69 9.73 9.76 9.8 9.84 ...
##  $ additive_terms            : num  0.248 0.1068 0.3663 0.1721 -0.0756 ...
##  $ additive_terms_lower      : num  0.248 0.1068 0.3663 0.1721 -0.0756 ...
##  $ additive_terms_upper      : num  0.248 0.1068 0.3663 0.1721 -0.0756 ...
##  $ yearly                    : num  0.248 0.1068 0.3663 0.1721 -0.0756 ...
##  $ yearly_lower              : num  0.248 0.1068 0.3663 0.1721 -0.0756 ...
##  $ yearly_upper              : num  0.248 0.1068 0.3663 0.1721 -0.0756 ...
##  $ multiplicative_terms      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ multiplicative_terms_lower: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ multiplicative_terms_upper: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ yhat_lower                : num  9.62 9.45 9.76 9.61 9.43 ...
##  $ yhat_upper                : num  10.3 10.2 10.5 10.3 10.1 ...
##  $ trend_lower               : num  9.69 9.73 9.76 9.8 9.84 ...
##  $ trend_upper               : num  9.69 9.73 9.76 9.8 9.84 ...
##  $ yhat                      : num  9.94 9.83 10.13 9.97 9.76 ...
str(stats)
## 'data.frame':    43 obs. of  2 variables:
##  $ ds: Factor w/ 43 levels "2016-01-01","2016-02-01",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ y : num  10.33 10.68 9.83 9.78 9.67 ...
forecast$ds <- as.factor(forecast$ds)
new_result <- sqldf("select a.ds, a.yhat_lower, a.yhat_upper, a.yhat, b.y                                    from forecast a
                            left join stats b on a.ds = b.ds")

# Function visualizes predictions vs actuals: 
new_result <- data.frame(new_result)
new_result$date <- as.Date.character(new_result$ds)
str(new_result)
## 'data.frame':    48 obs. of  6 variables:
##  $ ds        : chr  "2016-01-01" "2016-02-01" "2016-03-01" "2016-04-01" ...
##  $ yhat_lower: num  9.62 9.45 9.76 9.61 9.43 ...
##  $ yhat_upper: num  10.3 10.2 10.5 10.3 10.1 ...
##  $ yhat      : num  9.94 9.83 10.13 9.97 9.76 ...
##  $ y         : num  10.33 10.68 9.83 9.78 9.67 ...
##  $ date      : Date, format: "2016-01-01" "2016-02-01" ...
ggplot(new_result, aes(x=date))+
  geom_line(aes(y = y), color= "red", size=1.5)+
  geom_line(aes(y = yhat), color= "pink", size=1.5)+
  geom_line(aes(y = yhat_lower), color= "orange", size=1.5, linetype = "twodash")+
  geom_line(aes(y = yhat_upper), color= "blue", size=1.5, linetype = "twodash")+
  labs(title="The number of new customers monthly", x="Date", y="New cus")+
  scale_colour_manual("",
                      breaks = c("Act", "Forecast", "Upper", "Lower"),
                      values = c("red", "pink", "orange", "blue"))+
  theme( plot.title=element_text(vjust=3, size=15) )+ 
  labs(x = NULL, y = NULL, subtitle = "Forecast & Backtest", 
       title = "The number of customers")+
    Phu_theme_black()
## Warning: Removed 5 rows containing missing values (geom_path).

#To be continued

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.