Hi I am Phu.
rm(list = ls())
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.0 v readr 1.3.1
## v tibble 2.1.3 v purrr 0.3.2
## v tidyr 0.8.3 v stringr 1.4.0
## v ggplot2 3.2.0 v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readxl)
library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, as_function, flatten, flatten_chr, flatten_dbl,
## flatten_int, flatten_lgl, flatten_raw, invoke, list_along,
## modify, prepend, splice
library(ggplot2)
#insert theme
Phu_theme_black <- function(...) {
theme(
axis.line = element_blank(),
axis.text.x = element_text(color = "white", lineheight = 0.9),
axis.text.y = element_text(color = "white", lineheight = 0.9),
axis.ticks = element_line(color = "white", size = 0.2),
axis.title.x = element_text(color = "white", margin = margin(0, 10, 0, 0)),
axis.title.y = element_text(color = "white", angle = 90, margin = margin(0, 10, 0, 0)),
axis.ticks.length = unit(0.3, "lines"),
legend.background = element_rect(color = NA, fill = "gray10"),
legend.key = element_rect(color = "white", fill = "gray10"),
legend.key.size = unit(1.2, "lines"),
legend.key.height = NULL,
legend.key.width = NULL,
legend.text = element_text(color = "white"),
legend.title = element_text(face = "bold", hjust = 0, color = "white"),
legend.text.align = NULL,
legend.title.align = NULL,
legend.direction = "vertical",
legend.box = NULL,
panel.background = element_rect(fill = "gray10", color = NA),
panel.border = element_blank(),
panel.grid.major = element_line(color = "grey35"),
panel.grid.minor = element_line(color = "grey20"),
panel.spacing = unit(0.5, "lines"),
strip.background = element_rect(fill = "grey30", color = "grey10"),
strip.text.x = element_text(color = "white"),
strip.text.y = element_text(color = "white", angle = -90),
plot.background = element_rect(color = "gray10", fill = "gray10"),
plot.title = element_text(color = "white", hjust = 0, lineheight = 1.25, margin = margin(2, 2, 2, 2)),
plot.subtitle = element_text(color = "white", hjust = 0, margin = margin(2, 2, 2, 2)),
plot.caption = element_text(color = "white", hjust = 0),
plot.margin = unit(rep(1, 4), "lines"))
}
#Import data
data <- read.csv("E:/RStudio/RPractices/Practice/Monte_Carlo/Cusnew.csv")
class(data)
## [1] "data.frame"
str(data)
## 'data.frame': 43 obs. of 2 variables:
## $ MONTH_ID : Factor w/ 43 levels "2016-01-01","2016-02-01",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ COUNT.DISTINCTCUSID.: int 30561 43674 18519 17591 15833 16177 17414 21560 17209 18521 ...
colnames(data) <- c("Month", "SLKH")
head(data)
## Month SLKH
## 1 2016-01-01 30561
## 2 2016-02-01 43674
## 3 2016-03-01 18519
## 4 2016-04-01 17591
## 5 2016-05-01 15833
## 6 2016-06-01 16177
#data$Month <- as.Date(data$Month, format = "%y/%m/%d")
data$SLKH <- as.numeric(data$SLKH)
#Train and test set
train = data %>% slice(1:43) %>% select(1:2)
test = data %>% slice(37:43) %>% select(1:2)
#Histogram
hist(train$SLKH)
ggplot(train, aes(x=SLKH))+
geom_histogram(fill="white", alpha=.9)+
labs(x=NULL, y=NULL, title = "Histogram of New cus")+
# scale_x_continuous(breaks= seq(0,600000, by=100000))+
Phu_theme_black() + theme(plot.title=element_text(vjust=3, size=15) )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#line
ggplot(train, aes(x=Month, y= SLKH))+
geom_line(color= "red", aes(group=1), size=1.5)+
geom_point(colour="red", size = 3.5, alpha=0.5)+
labs(title="The number of new customers monthly", x=NULL, y="New cus")+
theme( plot.title=element_text(vjust=3, size=15) ) + Phu_theme_black()
#Build model
stats=data.frame(y=log1p(train$SLKH)
,ds=train$Month)
stats=aggregate(stats$y,by=list(stats$ds),FUN=sum)
head(stats)
## Group.1 x
## 1 2016-01-01 10.327513
## 2 2016-02-01 10.684531
## 3 2016-03-01 9.826607
## 4 2016-04-01 9.775200
## 5 2016-05-01 9.669915
## 6 2016-06-01 9.691408
colnames(stats)<- c("ds","y")
model = prophet(stats, changepoint.prior.scale = 0.01)
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
summary(model)
## Length Class Mode
## growth 1 -none- character
## changepoints 25 POSIXct numeric
## n.changepoints 1 -none- numeric
## changepoint.range 1 -none- numeric
## yearly.seasonality 1 -none- character
## weekly.seasonality 1 -none- character
## daily.seasonality 1 -none- character
## holidays 0 -none- NULL
## seasonality.mode 1 -none- character
## seasonality.prior.scale 1 -none- numeric
## changepoint.prior.scale 1 -none- numeric
## holidays.prior.scale 1 -none- numeric
## mcmc.samples 1 -none- numeric
## interval.width 1 -none- numeric
## uncertainty.samples 1 -none- numeric
## specified.changepoints 1 -none- logical
## start 1 POSIXct numeric
## y.scale 1 -none- numeric
## logistic.floor 1 -none- logical
## t.scale 1 -none- numeric
## changepoints.t 25 -none- numeric
## seasonalities 1 -none- list
## extra_regressors 0 -none- list
## country_holidays 0 -none- NULL
## stan.fit 0 -none- NULL
## params 5 -none- list
## history 5 data.frame list
## history.dates 43 POSIXct numeric
## train.holiday.names 0 -none- NULL
## train.component.cols 3 data.frame list
## component.modes 2 -none- list
future = make_future_dataframe(model, periods = 5, freq = "month")
tail(future)
## ds
## 43 2019-07-01
## 44 2019-08-01
## 45 2019-09-01
## 46 2019-10-01
## 47 2019-11-01
## 48 2019-12-01
forecast = predict(model, future)
#Visualize the change point
add_changepoints_to_plot <- function(m, threshold = 0.01, cp_color = "red",
cp_linetype = "dashed", trend = TRUE, ...){
layers <- list()
if (trend){
trend_layer <- ggplot2::geom_line(
ggplot2::aes_string("ds", "trend"), color = cp_color,...)
layers <- append(layers, trend_layer)
}
signif_changepoints <- m$changepoints[abs(m$params$delta) >= threshold]
cp_layer <- ggplot2::geom_vline(
xintercept = as.integer(signif_changepoints), color = cp_color,
linetype = cp_linetype,...
)
layers <- append(layers, cp_layer)
return(layers)
}
plot(model, forecast)+
add_changepoints_to_plot(model)
prophet_plot_components(model, forecast)
#Data processing
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
str(forecast)
## 'data.frame': 48 obs. of 16 variables:
## $ ds : POSIXct, format: "2016-01-01" "2016-02-01" ...
## $ trend : num 9.69 9.73 9.76 9.8 9.84 ...
## $ additive_terms : num 0.248 0.1068 0.3663 0.1721 -0.0756 ...
## $ additive_terms_lower : num 0.248 0.1068 0.3663 0.1721 -0.0756 ...
## $ additive_terms_upper : num 0.248 0.1068 0.3663 0.1721 -0.0756 ...
## $ yearly : num 0.248 0.1068 0.3663 0.1721 -0.0756 ...
## $ yearly_lower : num 0.248 0.1068 0.3663 0.1721 -0.0756 ...
## $ yearly_upper : num 0.248 0.1068 0.3663 0.1721 -0.0756 ...
## $ multiplicative_terms : num 0 0 0 0 0 0 0 0 0 0 ...
## $ multiplicative_terms_lower: num 0 0 0 0 0 0 0 0 0 0 ...
## $ multiplicative_terms_upper: num 0 0 0 0 0 0 0 0 0 0 ...
## $ yhat_lower : num 9.62 9.45 9.76 9.61 9.43 ...
## $ yhat_upper : num 10.3 10.2 10.5 10.3 10.1 ...
## $ trend_lower : num 9.69 9.73 9.76 9.8 9.84 ...
## $ trend_upper : num 9.69 9.73 9.76 9.8 9.84 ...
## $ yhat : num 9.94 9.83 10.13 9.97 9.76 ...
str(stats)
## 'data.frame': 43 obs. of 2 variables:
## $ ds: Factor w/ 43 levels "2016-01-01","2016-02-01",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ y : num 10.33 10.68 9.83 9.78 9.67 ...
forecast$ds <- as.factor(forecast$ds)
new_result <- sqldf("select a.ds, a.yhat_lower, a.yhat_upper, a.yhat, b.y from forecast a
left join stats b on a.ds = b.ds")
# Function visualizes predictions vs actuals:
new_result <- data.frame(new_result)
new_result$date <- as.Date.character(new_result$ds)
str(new_result)
## 'data.frame': 48 obs. of 6 variables:
## $ ds : chr "2016-01-01" "2016-02-01" "2016-03-01" "2016-04-01" ...
## $ yhat_lower: num 9.62 9.45 9.76 9.61 9.43 ...
## $ yhat_upper: num 10.3 10.2 10.5 10.3 10.1 ...
## $ yhat : num 9.94 9.83 10.13 9.97 9.76 ...
## $ y : num 10.33 10.68 9.83 9.78 9.67 ...
## $ date : Date, format: "2016-01-01" "2016-02-01" ...
ggplot(new_result, aes(x=date))+
geom_line(aes(y = y), color= "red", size=1.5)+
geom_line(aes(y = yhat), color= "pink", size=1.5)+
geom_line(aes(y = yhat_lower), color= "orange", size=1.5, linetype = "twodash")+
geom_line(aes(y = yhat_upper), color= "blue", size=1.5, linetype = "twodash")+
labs(title="The number of new customers monthly", x="Date", y="New cus")+
scale_colour_manual("",
breaks = c("Act", "Forecast", "Upper", "Lower"),
values = c("red", "pink", "orange", "blue"))+
theme( plot.title=element_text(vjust=3, size=15) )+
labs(x = NULL, y = NULL, subtitle = "Forecast & Backtest",
title = "The number of customers")+
Phu_theme_black()
## Warning: Removed 5 rows containing missing values (geom_path).
#To be continued
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.