This My first R Markdown Page!!!
’19년 5월~7월 기간의 Btv pre-roll 광고 inven에 대한 Time Series(ETS 및 ARIMA) 예측 테스트 R code
## Loading required package: ggplot2
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: fpp2
## Loading required package: forecast
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## fitted.fracdiff fracdiff
## residuals.fracdiff fracdiff
## Loading required package: fma
## Loading required package: expsmooth
## Observations: 577
## Variables: 7
## $ Date <chr> "2018-01-01", "2018-01-02", "2018-01-03", "2018-…
## $ VOD_시청시간_초 <chr> "8,421,536,616", "7,878,348,464", "7,498,034,394", "7…
## $ VOD_시청건수 <chr> "4,496,473", "4,658,646", "4,535,338", "4,500,479", …
## $ REQ <chr> "4,567,997", "4,630,415", "4,533,702", "4,490,70…
## $ INV <chr> "7,377,096", "7,110,607", "6,932,244", "6,839,95…
## $ Res <chr> "4,787,165", "4,391,051", "4,252,549", "4,101,29…
## $ Imp <chr> "4,498,714", "4,141,349", "4,011,362", "3,872,03…
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# 이상치 데이터 보정작업
### >>> VOD 시청건수와 큰 차이 데이터 대체처리, 260만 미만 8개
btv_pre_roll$REQ_chg_flag = FALSE
arrange(btv_pre_roll %>% filter(REQ < 2650000), REQ)
btv_pre_roll$REQ_chg_flag [btv_pre_roll$REQ < 2600000] <- TRUE
#btv_pre_roll$REQ [REQ < 2600000] <- round(btv_pre_roll$VOD_시청건수 [REQ < 2600000] * 0.98, 0)
btv_pre_roll$REQ [btv_pre_roll$REQ < 2600000] <- btv_pre_roll$VOD_시청건수 [btv_pre_roll$REQ < 2600000]
filter(btv_pre_roll, REQ_chg_flag == TRUE)
###2018-05-15 ~ 2018-06-12까지 이상 데이터 대체 처리 >> 29개
btv_pre_roll$REQ_chg_flag [btv_pre_roll$Date >= '2018-05-15' & btv_pre_roll$Date <= '2018-06-12'] <- TRUE
btv_pre_roll$REQ [btv_pre_roll$Date >= '2018-05-15' & btv_pre_roll$Date <= '2018-06-12'] <-
btv_pre_roll$VOD_시청건수 [btv_pre_roll$Date >= '2018-05-15' & btv_pre_roll$Date <= '2018-06-12']
# 보정후 raw data 그래프
ggplot(btv_pre_roll, aes(x=Date, y=REQ)) +
geom_point() + geom_smooth(method = 'lm', color = 'red', linetype =2) + # 직선 추세선 추가
geom_smooth() + # 곡선 추세선 추가
ggtitle("Raw Data 추세선")
tail(btv_pre_roll)
# 요일 구하기
Sys.setlocale("LC_TIME", "English")
btv_pre_roll$weekday = factor(weekdays(btv_pre_roll$Date),
levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
summary(btv_pre_roll$weekday)
Sys.setlocale("LC_TIME", "Korean")
# 월(month) 구하기
btv_pre_roll$month <-factor(substr(btv_pre_roll$Date,6,7),
levels = c('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'))
summary(btv_pre_roll$month)
length(btv_pre_roll$month)
# 특일 구하기
# 참고 페이지 : https://m.blog.naver.com/hancury/221057426711
# 사이트 : https://www.data.go.kr/
# 메뉴명 : 마이페이지 > OPEN API > 개발계정 상세보기
#인증키 "SD5uTucKQxefBuN79R8TKiBa15fAbw5j1id%2FdBH0SsWFDK4boXqDpgMPtc8QJ8UNyaSO8HpGJf%2FAbh65oBzg7g%3D%3D"
if(!require(glue)){install.packages("glue"); library(glue)}
if(!require(XML)){install.packages("XML"); library(XML)}
if(!require(stringr)){install.packages("stringr"); library(stringr)}
api.key <- "SD5uTucKQxefBuN79R8TKiBa15fAbw5j1id%2FdBH0SsWFDK4boXqDpgMPtc8QJ8UNyaSO8HpGJf%2FAbh65oBzg7g%3D%3D"
url.format <-
'http://apis.data.go.kr/B090041/openapi/service/SpcdeInfoService/getRestDeInfo?ServiceKey={key}&solYear={year}&solMonth={month}'
holiday.request <- function(key, year, month) glue(url.format)
df_holiday <- data.frame(dateName=NULL, Date=NULL)
for(m in 1:12){
holiday_2018 <- xmlToList(holiday.request(api.key, 2018, str_pad(m, 2, pad=0)))
holiday_2019 <- xmlToList(holiday.request(api.key, 2019, str_pad(m, 2, pad=0)))
items_2018 <- holiday_2018$body$items
items_2019 <- holiday_2019$body$items
items_test <- holiday_2018$body$items
items_test <- holiday_2019$body$items
for(item_2018 in items_2018){
if(item_2018$isHoliday == 'Y') {
#print(paste(item_2018$dateName, item_2018$locdate, sep=' : '))
df_holiday <- rbind(df_holiday,
data.frame(dateName = item_2018$dateName,
Date = (paste(substr(item_2018$locdate,1,4),
substr(item_2018$locdate,5,6),
substr(item_2018$locdate,7,8), sep = '-')),
stringsAsFactors = FALSE))
}
}
for(item_2019 in items_2019){
if(item_2019$isHoliday == 'Y') {
#print(paste(item_2019$dateName, item_2019$locdate, sep=' : '))
df_holiday <- rbind(df_holiday,
data.frame(dateName = item_2019$dateName,
Date = (paste(substr(item_2019$locdate,1,4),
substr(item_2019$locdate,5,6),
substr(item_2019$locdate,7,8), sep = '-')),
stringsAsFactors = FALSE))
}
}
}
# 날짜 데이터의 데이터 타입 변환
df_holiday$Date <- as.Date(df_holiday$Date)
### Left 조인하기 : 광고 데이터 + 특일 데이터..
btv_pre_roll <- left_join(btv_pre_roll, df_holiday)
# 특일 데이터 범주화
btv_pre_roll <- mutate(btv_pre_roll, isHolyday = as.numeric(!is.na(dateName)))
# 필요한 데이터 잘라내기
head(btv_pre_roll); tail(btv_pre_roll)
btv_pre_roll2 <- filter(btv_pre_roll, Date <= '2019-04-30')
btv_pre_roll3 <- filter(btv_pre_roll, Date <= '2019-05-31')
btv_pre_roll4 <- filter(btv_pre_roll, Date <= '2019-06-30')
tail(btv_pre_roll2); tail(btv_pre_roll3); tail(btv_pre_roll4)
# 예측 Dataset 생성
# 예측할 날짜로 된 수열 생성하기
s_date2 <- as.Date("2019-05-01")
e_date2 <- as.Date("2019-05-31")
s_date3 <- as.Date("2019-06-01")
e_date3 <- as.Date("2019-06-30")
s_date4 <- as.Date("2019-07-01")
e_date4 <- as.Date("2019-07-31")
add_df2 <- data.frame(Date = seq(from = s_date2, to=e_date2, by=1))
add_df2$Date <- as.Date(add_df2$Date); add_df2$REQ <- NA; add_df2$weekday <- NA; add_df2$month <- NA; add_df2$isHolyday <- NA;
add_df3 <- data.frame(Date = seq(from = s_date3, to=e_date3, by=1))
add_df3$Date <- as.Date(add_df3$Date); add_df3$REQ <- NA; add_df3$weekday <- NA; add_df3$month <- NA; add_df3$isHolyday <- NA;
add_df4 <- data.frame(Date = seq(from = s_date4, to=e_date4, by=1))
add_df4$Date <- as.Date(add_df4$Date); add_df4$REQ <- NA; add_df4$weekday <- NA; add_df4$month <- NA; add_df4$isHolyday <- NA;
# 요일 구하기
Sys.setlocale("LC_TIME", "English")
add_df2$weekday = factor(weekdays(add_df2$Date),
levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
summary(add_df2$weekday)
add_df3$weekday = factor(weekdays(add_df3$Date),
levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
summary(add_df3$weekday)
add_df4$weekday = factor(weekdays(add_df4$Date),
levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
summary(add_df4$weekday)
Sys.setlocale("LC_TIME", "Korean")
# 월(month) 구하기
add_df2$month <-factor(substr(add_df2$Date,6,7),
levels = c('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'))
summary(add_df2$month)
add_df3$month <-factor(substr(add_df3$Date,6,7),
levels = c('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'))
summary(add_df3$month)
add_df4$month <-factor(substr(add_df4$Date,6,7),
levels = c('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'))
summary(add_df4$month)
# 특일 데이터 범주화 추가
add_df2$isHolyday <- as.numeric(!is.na(left_join(add_df2, df_holiday)$dateName))
add_df3$isHolyday <- as.numeric(!is.na(left_join(add_df3, df_holiday)$dateName))
add_df4$isHolyday <- as.numeric(!is.na(left_join(add_df4, df_holiday)$dateName))
### 시계열 적용 시작
ts2_train <- ts(btv_pre_roll2$REQ, start = 1, frequency = 7)
ts3_train <- ts(btv_pre_roll3$REQ, start = 1, frequency = 7)
time(ts2_train)
time(ts3_train)
ts4_train <- ts(btv_pre_roll4$REQ, start = 1, frequency = 7)
time(ts4_train)
# ETS (지수평활(exponential smoothing))
fit1 <- ets(ts2_train)
fit1_2 <- ets(ts3_train)
fit1_3 <- ets(ts4_train)
fcast1 <- forecast(fit1, h=31); autoplot(fcast1) # 5월 예측
fcast1_2 <- forecast(fit1_2, h=30); autoplot(fcast1_2) # 6월 예측
fcast1_3 <- forecast(fit1_3, h=31); autoplot(fcast1_3) # 7월 예측
checkresiduals(fit1); summary(fcast1) # 5월
checkresiduals(fit1_2); summary(fcast1_2) # 6월
checkresiduals(fit1_3); summary(fcast1_3) # 7월
autoplot(fit1) + ylab("fcast1, ETS 광고인벤 예측(ts2_train)") # 5월
autoplot(fit1_2) + ylab("fcast1_2, ETS 광고인벤 예측(ts3_train)") # 6월
autoplot(fit1_3) + ylab("fcast1_3, ETS 광고인벤 예측(ts4_train)") # 7월
# ARIMA auto (자기회귀누적이동평균(ARIMA, AutoRegressive Integrated Moving Average)
fit2 <- auto.arima(ts2_train) # 5월
fit2_2 <- auto.arima(ts3_train) # 6월
fit2_3 <- auto.arima(ts4_train) # 7월
fcast2 <- forecast(fit2, h=31); autoplot(fcast2) # 5월
fcast2_2 <- forecast(fit2_2, h=30); autoplot(fcast2_2) # 6월
fcast2_3 <- forecast(fit2_3, h=31); autoplot(fcast2_3) # 7월
checkresiduals(fit2); summary(fcast2)
checkresiduals(fit2_2); summary(fcast2_2)
checkresiduals(fit2_3); summary(fcast2_3)
autoplot(fit2) + ylab("fcast2, ARIMA(auto) 광고인벤 예측(ts2_train)") # 5월
autoplot(fit2_2) + ylab("fcast2_2, ARIMA(auto) 광고인벤 예측(ts3_train)") # 6월
autoplot(fit2_3) + ylab("fcast2_3, ARIMA(auto) 광고인벤 예측(ts4_train)") # 7월
#예측하기 > # ETS, ARIMA auto
#2019년 5월
t1 <- length(fcast1$fitted)-30; t2 <- length(fcast1$fitted)
paste(sum(fcast1$lower[,"95%"]), "/", sum(fcast1$fitted[t1:t2]), "/", sum(fcast1$upper[,"95%"]))
paste(sum(fcast1$lower[,"80%"]), "/", sum(fcast1$fitted[t1:t2]), "/", sum(fcast1$upper[,"80%"]))
paste(sum(fcast2$lower[,"95%"]), "/", sum(fcast2$fitted[t1:t2]), "/", sum(fcast2$upper[,"95%"]))
paste(sum(fcast2$lower[,"80%"]), "/", sum(fcast2$fitted[t1:t2]), "/", sum(fcast2$upper[,"80%"]))
#2019년 6월
t1 <- length(fcast1_2$fitted)-29; t2 <- length(fcast1_2$fitted)
paste(sum(fcast1_2$lower[,"95%"]), "/", sum(fcast1_2$fitted[t1:t2]), "/", sum(fcast1_2$upper[,"95%"]))
paste(sum(fcast1_2$lower[,"80%"]), "/", sum(fcast1_2$fitted[t1:t2]), "/", sum(fcast1_2$upper[,"80%"]))
paste(sum(fcast2_2$lower[,"95%"]), "/", sum(fcast2_2$fitted[t1:t2]), "/", sum(fcast2_2$upper[,"95%"]))
paste(sum(fcast2_2$lower[,"80%"]), "/", sum(fcast2_2$fitted[t1:t2]), "/", sum(fcast2_2$upper[,"80%"]))
#2019년 7월
t1 <- length(fcast1_3$fitted)-29; t2 <- length(fcast1_3$fitted)
paste(sum(fcast1_3$lower[,"95%"]), "/", sum(fcast1_3$fitted[t1:t2]), "/", sum(fcast1_3$upper[,"95%"]))
paste(sum(fcast1_3$lower[,"80%"]), "/", sum(fcast1_3$fitted[t1:t2]), "/", sum(fcast1_3$upper[,"80%"]))
paste(sum(fcast2_3$lower[,"95%"]), "/", sum(fcast2_3$fitted[t1:t2]), "/", sum(fcast2_3$upper[,"95%"]))
paste(sum(fcast2_3$lower[,"80%"]), "/", sum(fcast2_3$fitted[t1:t2]), "/", sum(fcast2_3$upper[,"80%"]))