Btv Pre-roll Inven Forecast Test

This My first R Markdown Page!!!

’19년 5월~7월 기간의 Btv pre-roll 광고 inven에 대한 Time Series(ETS 및 ARIMA) 예측 테스트 R code

## Loading required package: ggplot2
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: fpp2
## Loading required package: forecast
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
##   method             from    
##   fitted.fracdiff    fracdiff
##   residuals.fracdiff fracdiff
## Loading required package: fma
## Loading required package: expsmooth
## Observations: 577
## Variables: 7
## $ Date            <chr> "2018-01-01", "2018-01-02", "2018-01-03", "2018-…
## $ VOD_시청시간_초 <chr> "8,421,536,616", "7,878,348,464", "7,498,034,394", "7…
## $ VOD_시청건수    <chr> "4,496,473", "4,658,646", "4,535,338", "4,500,479", …
## $ REQ             <chr> "4,567,997", "4,630,415", "4,533,702", "4,490,70…
## $ INV             <chr> "7,377,096", "7,110,607", "6,932,244", "6,839,95…
## $ Res             <chr> "4,787,165", "4,391,051", "4,252,549", "4,101,29…
## $ Imp             <chr> "4,498,714", "4,141,349", "4,011,362", "3,872,03…
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# 이상치 데이터 보정작업 
### >>> VOD 시청건수와 큰 차이 데이터 대체처리, 260만 미만 8개
btv_pre_roll$REQ_chg_flag = FALSE
arrange(btv_pre_roll %>% filter(REQ < 2650000), REQ)
btv_pre_roll$REQ_chg_flag [btv_pre_roll$REQ < 2600000] <- TRUE
#btv_pre_roll$REQ [REQ < 2600000] <- round(btv_pre_roll$VOD_시청건수 [REQ < 2600000] * 0.98, 0)
btv_pre_roll$REQ [btv_pre_roll$REQ < 2600000] <- btv_pre_roll$VOD_시청건수 [btv_pre_roll$REQ < 2600000]
filter(btv_pre_roll, REQ_chg_flag == TRUE)

###2018-05-15 ~ 2018-06-12까지 이상 데이터 대체 처리 >> 29개
btv_pre_roll$REQ_chg_flag [btv_pre_roll$Date >= '2018-05-15' & btv_pre_roll$Date <= '2018-06-12'] <- TRUE
btv_pre_roll$REQ [btv_pre_roll$Date >= '2018-05-15' & btv_pre_roll$Date <= '2018-06-12'] <- 
    btv_pre_roll$VOD_시청건수 [btv_pre_roll$Date >= '2018-05-15' & btv_pre_roll$Date <= '2018-06-12']

# 보정후 raw data 그래프
ggplot(btv_pre_roll, aes(x=Date, y=REQ)) +
  geom_point() + geom_smooth(method = 'lm', color = 'red', linetype =2) + # 직선 추세선 추가
  geom_smooth() + # 곡선 추세선 추가
  ggtitle("Raw Data 추세선")

tail(btv_pre_roll)

# 요일  구하기
Sys.setlocale("LC_TIME", "English")
btv_pre_roll$weekday = factor(weekdays(btv_pre_roll$Date), 
                              levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
summary(btv_pre_roll$weekday)
Sys.setlocale("LC_TIME", "Korean")

# 월(month)  구하기
btv_pre_roll$month <-factor(substr(btv_pre_roll$Date,6,7),
                            levels = c('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'))
summary(btv_pre_roll$month)
length(btv_pre_roll$month)

# 특일 구하기
# 참고 페이지 : https://m.blog.naver.com/hancury/221057426711
# 사이트 : https://www.data.go.kr/
# 메뉴명 : 마이페이지 > OPEN API > 개발계정 상세보기
#인증키 "SD5uTucKQxefBuN79R8TKiBa15fAbw5j1id%2FdBH0SsWFDK4boXqDpgMPtc8QJ8UNyaSO8HpGJf%2FAbh65oBzg7g%3D%3D"
if(!require(glue)){install.packages("glue"); library(glue)}
if(!require(XML)){install.packages("XML"); library(XML)}
if(!require(stringr)){install.packages("stringr"); library(stringr)}

api.key <- "SD5uTucKQxefBuN79R8TKiBa15fAbw5j1id%2FdBH0SsWFDK4boXqDpgMPtc8QJ8UNyaSO8HpGJf%2FAbh65oBzg7g%3D%3D"

url.format <- 
  'http://apis.data.go.kr/B090041/openapi/service/SpcdeInfoService/getRestDeInfo?ServiceKey={key}&solYear={year}&solMonth={month}'

holiday.request <- function(key, year, month) glue(url.format)

df_holiday <- data.frame(dateName=NULL, Date=NULL)

for(m in 1:12){
  holiday_2018 <- xmlToList(holiday.request(api.key, 2018, str_pad(m, 2, pad=0)))
  holiday_2019 <- xmlToList(holiday.request(api.key, 2019, str_pad(m, 2, pad=0)))
  
  items_2018 <- holiday_2018$body$items
  items_2019 <- holiday_2019$body$items
  items_test <- holiday_2018$body$items
  items_test <- holiday_2019$body$items
  
  for(item_2018 in items_2018){
    if(item_2018$isHoliday == 'Y') {
      #print(paste(item_2018$dateName, item_2018$locdate, sep=' : '))
      df_holiday <- rbind(df_holiday, 
                          data.frame(dateName = item_2018$dateName, 
                                     Date = (paste(substr(item_2018$locdate,1,4),
                                                   substr(item_2018$locdate,5,6), 
                                                   substr(item_2018$locdate,7,8), sep = '-')),
                                     stringsAsFactors = FALSE))
    }
  }
  
  for(item_2019 in items_2019){
    if(item_2019$isHoliday == 'Y') {
      #print(paste(item_2019$dateName, item_2019$locdate, sep=' : '))
      df_holiday <- rbind(df_holiday, 
                          data.frame(dateName = item_2019$dateName, 
                                     Date = (paste(substr(item_2019$locdate,1,4),
                                                   substr(item_2019$locdate,5,6), 
                                                   substr(item_2019$locdate,7,8), sep = '-')),
                                     stringsAsFactors = FALSE))
    }
  }
  
}

# 날짜 데이터의 데이터 타입 변환
df_holiday$Date <- as.Date(df_holiday$Date)

### Left 조인하기 : 광고 데이터 + 특일 데이터..
btv_pre_roll <- left_join(btv_pre_roll, df_holiday)

# 특일 데이터 범주화
btv_pre_roll <- mutate(btv_pre_roll, isHolyday = as.numeric(!is.na(dateName)))

# 필요한 데이터 잘라내기
head(btv_pre_roll); tail(btv_pre_roll)
btv_pre_roll2 <- filter(btv_pre_roll, Date <= '2019-04-30')
btv_pre_roll3 <- filter(btv_pre_roll, Date <= '2019-05-31')
btv_pre_roll4 <- filter(btv_pre_roll, Date <= '2019-06-30')

tail(btv_pre_roll2); tail(btv_pre_roll3); tail(btv_pre_roll4)

# 예측 Dataset 생성
# 예측할 날짜로 된 수열 생성하기
s_date2 <- as.Date("2019-05-01")
e_date2 <- as.Date("2019-05-31")
s_date3 <- as.Date("2019-06-01")
e_date3 <- as.Date("2019-06-30")
s_date4 <- as.Date("2019-07-01")
e_date4 <- as.Date("2019-07-31")

add_df2 <- data.frame(Date = seq(from = s_date2, to=e_date2, by=1))
add_df2$Date <- as.Date(add_df2$Date); add_df2$REQ <- NA; add_df2$weekday <- NA; add_df2$month <- NA; add_df2$isHolyday <- NA; 
add_df3 <- data.frame(Date = seq(from = s_date3, to=e_date3, by=1))
add_df3$Date <- as.Date(add_df3$Date); add_df3$REQ <- NA; add_df3$weekday <- NA; add_df3$month <- NA; add_df3$isHolyday <- NA;

add_df4 <- data.frame(Date = seq(from = s_date4, to=e_date4, by=1))
add_df4$Date <- as.Date(add_df4$Date); add_df4$REQ <- NA; add_df4$weekday <- NA; add_df4$month <- NA; add_df4$isHolyday <- NA;


# 요일  구하기
Sys.setlocale("LC_TIME", "English")
add_df2$weekday = factor(weekdays(add_df2$Date), 
                         levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
summary(add_df2$weekday)

add_df3$weekday = factor(weekdays(add_df3$Date), 
                         levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
summary(add_df3$weekday)

add_df4$weekday = factor(weekdays(add_df4$Date), 
                         levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
summary(add_df4$weekday)

Sys.setlocale("LC_TIME", "Korean")

# 월(month)  구하기
add_df2$month <-factor(substr(add_df2$Date,6,7),
                       levels = c('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'))
summary(add_df2$month)

add_df3$month <-factor(substr(add_df3$Date,6,7),
                       levels = c('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'))
summary(add_df3$month)

add_df4$month <-factor(substr(add_df4$Date,6,7),
                       levels = c('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'))
summary(add_df4$month)


# 특일 데이터 범주화 추가
add_df2$isHolyday <- as.numeric(!is.na(left_join(add_df2, df_holiday)$dateName))
add_df3$isHolyday <- as.numeric(!is.na(left_join(add_df3, df_holiday)$dateName))
add_df4$isHolyday <- as.numeric(!is.na(left_join(add_df4, df_holiday)$dateName))

### 시계열 적용 시작
ts2_train <- ts(btv_pre_roll2$REQ, start = 1, frequency = 7)
ts3_train <- ts(btv_pre_roll3$REQ, start = 1, frequency = 7)
time(ts2_train)
time(ts3_train)

ts4_train <- ts(btv_pre_roll4$REQ, start = 1, frequency = 7)
time(ts4_train)

# ETS (지수평활(exponential smoothing))
fit1 <- ets(ts2_train) 
fit1_2 <- ets(ts3_train) 
fit1_3 <- ets(ts4_train) 

fcast1 <- forecast(fit1, h=31); autoplot(fcast1) # 5월 예측
fcast1_2 <- forecast(fit1_2, h=30); autoplot(fcast1_2) # 6월 예측
fcast1_3 <- forecast(fit1_3, h=31); autoplot(fcast1_3) # 7월 예측


checkresiduals(fit1); summary(fcast1) # 5월
checkresiduals(fit1_2); summary(fcast1_2) # 6월
checkresiduals(fit1_3); summary(fcast1_3) # 7월

autoplot(fit1) + ylab("fcast1, ETS 광고인벤 예측(ts2_train)") # 5월
autoplot(fit1_2) + ylab("fcast1_2, ETS 광고인벤 예측(ts3_train)") # 6월
autoplot(fit1_3) + ylab("fcast1_3, ETS 광고인벤 예측(ts4_train)") # 7월


# ARIMA auto (자기회귀누적이동평균(ARIMA, AutoRegressive Integrated Moving Average)
fit2 <- auto.arima(ts2_train) # 5월
fit2_2 <- auto.arima(ts3_train) # 6월
fit2_3 <- auto.arima(ts4_train) # 7월

fcast2 <- forecast(fit2, h=31); autoplot(fcast2) # 5월 
fcast2_2 <- forecast(fit2_2, h=30); autoplot(fcast2_2) # 6월
fcast2_3 <- forecast(fit2_3, h=31); autoplot(fcast2_3) # 7월

checkresiduals(fit2); summary(fcast2)
checkresiduals(fit2_2); summary(fcast2_2)
checkresiduals(fit2_3); summary(fcast2_3)

autoplot(fit2) + ylab("fcast2, ARIMA(auto) 광고인벤 예측(ts2_train)") # 5월
autoplot(fit2_2) + ylab("fcast2_2, ARIMA(auto) 광고인벤 예측(ts3_train)") # 6월
autoplot(fit2_3) + ylab("fcast2_3, ARIMA(auto) 광고인벤 예측(ts4_train)") # 7월

#예측하기 > # ETS, ARIMA auto
#2019년 5월 
t1 <- length(fcast1$fitted)-30; t2 <- length(fcast1$fitted)
paste(sum(fcast1$lower[,"95%"]), "/", sum(fcast1$fitted[t1:t2]), "/", sum(fcast1$upper[,"95%"]))
paste(sum(fcast1$lower[,"80%"]), "/", sum(fcast1$fitted[t1:t2]), "/", sum(fcast1$upper[,"80%"]))
paste(sum(fcast2$lower[,"95%"]), "/", sum(fcast2$fitted[t1:t2]), "/", sum(fcast2$upper[,"95%"]))
paste(sum(fcast2$lower[,"80%"]), "/", sum(fcast2$fitted[t1:t2]), "/", sum(fcast2$upper[,"80%"]))
#2019년 6월
t1 <- length(fcast1_2$fitted)-29; t2 <- length(fcast1_2$fitted)
paste(sum(fcast1_2$lower[,"95%"]), "/", sum(fcast1_2$fitted[t1:t2]), "/", sum(fcast1_2$upper[,"95%"]))
paste(sum(fcast1_2$lower[,"80%"]), "/", sum(fcast1_2$fitted[t1:t2]), "/", sum(fcast1_2$upper[,"80%"]))
paste(sum(fcast2_2$lower[,"95%"]), "/", sum(fcast2_2$fitted[t1:t2]), "/", sum(fcast2_2$upper[,"95%"]))
paste(sum(fcast2_2$lower[,"80%"]), "/", sum(fcast2_2$fitted[t1:t2]), "/", sum(fcast2_2$upper[,"80%"]))
#2019년 7월
t1 <- length(fcast1_3$fitted)-29; t2 <- length(fcast1_3$fitted)
paste(sum(fcast1_3$lower[,"95%"]), "/", sum(fcast1_3$fitted[t1:t2]), "/", sum(fcast1_3$upper[,"95%"]))
paste(sum(fcast1_3$lower[,"80%"]), "/", sum(fcast1_3$fitted[t1:t2]), "/", sum(fcast1_3$upper[,"80%"]))
paste(sum(fcast2_3$lower[,"95%"]), "/", sum(fcast2_3$fitted[t1:t2]), "/", sum(fcast2_3$upper[,"95%"]))
paste(sum(fcast2_3$lower[,"80%"]), "/", sum(fcast2_3$fitted[t1:t2]), "/", sum(fcast2_3$upper[,"80%"]))