선형회귀(linear regression)를 통한 inven 예측 R code
if(!require(ggplot2)){install.packages("ggplot2"); library(ggplot2)}
## Loading required package: ggplot2
if(!require(dplyr)){install.packages("dplyr"); library(dplyr)}
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 588166 31.5 1211095 64.7 1211095 64.7
## Vcells 1063434 8.2 8388608 64.0 1753441 13.4
getwd()
## [1] "/home/mjs0428/inven_forcast"
setwd("/home/mjs0428/inven_forcast")
btv_pre_roll <- read.csv("./Btv_Preroll-Req_Inv_Res_Imp-data_20190620.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE, fileEncoding = "euc-kr")
glimpse(btv_pre_roll)
## Observations: 536
## Variables: 7
## $ Date <chr> "2018-01-01", "2018-01-02", "2018-01-03", "2018…
## $ VOD_시청시간.초. <chr> "8,421,536,616", "7,878,348,464", "7,498,034,394", "…
## $ VOD_시청건수 <chr> "4,496,473", "4,658,646", "4,535,338", "4,500,479",…
## $ Request <chr> "4,567,997", "4,630,415", "4,533,702", "4,490,7…
## $ Inv. <chr> "7,377,096", "7,110,607", "6,932,244", "6,839,9…
## $ Response <chr> "4,787,165", "4,391,051", "4,252,549", "4,101,2…
## $ IMP <chr> "4,498,714", "4,141,349", "4,011,362", "3,872,0…
# 숫자형 데이터 변환
btv_pre_roll$Request = as.numeric(gsub(",", "", btv_pre_roll$Request))
btv_pre_roll$Inv. = as.numeric(gsub(",", "", btv_pre_roll$Inv.))
btv_pre_roll$Response = as.numeric(gsub(",", "", btv_pre_roll$Response))
btv_pre_roll$IMP = as.numeric(gsub(",", "", btv_pre_roll$IMP))
# 숫자형 데이터 변환
btv_pre_roll$Request <- as.numeric(gsub(",", "", btv_pre_roll$Request))
btv_pre_roll$Inv. <- as.numeric(gsub(",", "", btv_pre_roll$Inv.))
btv_pre_roll$Response <- as.numeric(gsub(",", "", btv_pre_roll$Response))
btv_pre_roll$IMP <- as.numeric(gsub(",", "", btv_pre_roll$IMP))
# 추가된 필드
btv_pre_roll$VOD_시청시간.초. <- round(as.numeric(gsub(",", "", btv_pre_roll$VOD_시청시간.초.))/3600,0)
btv_pre_roll <- rename(btv_pre_roll, VOD_시청시간 = VOD_시청시간.초.)
btv_pre_roll$VOD_시청건수 <- as.numeric(gsub(",", "", btv_pre_roll$VOD_시청건수))
str(btv_pre_roll)
## 'data.frame': 536 obs. of 7 variables:
## $ Date : chr "2018-01-01" "2018-01-02" "2018-01-03" "2018-01-04" ...
## $ VOD_시청시간: num 2339316 2188430 2082787 2083401 2071845 ...
## $ VOD_시청건수: num 4496473 4658646 4535338 4500479 4488078 ...
## $ Request : num 4567997 4630415 4533702 4490705 4480584 ...
## $ Inv. : num 7377096 7110607 6932244 6839954 6786827 ...
## $ Response : num 4787165 4391051 4252549 4101297 4167156 ...
## $ IMP : num 4498714 4141349 4011362 3872031 3928040 ...
glimpse(btv_pre_roll)
## Observations: 536
## Variables: 7
## $ Date <chr> "2018-01-01", "2018-01-02", "2018-01-03", "2018-01-…
## $ VOD_시청시간 <dbl> 2339316, 2188430, 2082787, 2083401, 2071845, 2217020, 2…
## $ VOD_시청건수 <dbl> 4496473, 4658646, 4535338, 4500479, 4488078, 4558005, 4…
## $ Request <dbl> 4567997, 4630415, 4533702, 4490705, 4480584, 458342…
## $ Inv. <dbl> 7377096, 7110607, 6932244, 6839954, 6786827, 706520…
## $ Response <dbl> 4787165, 4391051, 4252549, 4101297, 4167156, 446999…
## $ IMP <dbl> 4498714, 4141349, 4011362, 3872031, 3928040, 420822…
tail(btv_pre_roll)
## Date VOD_시청시간 VOD_시청건수 Request Inv. Response IMP
## 531 2019-06-15 1764021 3392494 3336975 5865433 3596532 3489441
## 532 2019-06-16 1787598 3351160 3296535 5824550 3489334 3388260
## 533 2019-06-17 1519081 2849966 2819099 4847480 3020758 2930798
## 534 2019-06-18 1467465 2870217 2844401 4884775 3162402 3068361
## 535 2019-06-19 1446935 2836474 2800606 4851506 3325569 3223069
## 536 2019-06-20 1442873 2794911 2762179 4794207 3257581 3158809
ggplot(btv_pre_roll, aes(x=Date)) + ggtitle("Request(grey)/ VOD_watch_cnt(blue) / VOD_watch_time(orange)") +
geom_point(aes(y = Request), color = "black", alpha = 0.3) +
geom_point(aes(y = VOD_시청건수), color = "blue", alpha = 0.3) +
geom_point(aes(y = VOD_시청시간), color = "orange", alpha = 0.3)
# 날짜형 타입으로 변환
btv_pre_roll$Date <- as.Date(btv_pre_roll$Date)
# 요일 구하기
btv_pre_roll$weekday = factor(weekdays(btv_pre_roll$Date),
levels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"),
order = T)
summary(btv_pre_roll$weekday)
## Monday Tuesday Wednesday Thursday Friday Saturday Sunday
## 77 77 77 77 76 76 76
# 특일 구하기
# 참고 페이지 : https://m.blog.naver.com/hancury/221057426711
# 사이트 : https://www.data.go.kr/
# 메뉴명 : 마이페이지 > OPEN API > 개발계정 상세보기
#인증키 "SD5uTucKQxefBuN79R8TKiBa15fAbw5j1id%2FdBH0SsWFDK4boXqDpgMPtc8QJ8UNyaSO8HpGJf%2FAbh65oBzg7g%3D%3D"
if(!require(glue)){install.packages("glue"); library(glue)}
## Loading required package: glue
##
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
##
## collapse
if(!require(XML)){install.packages("XML"); library(XML)}
## Loading required package: XML
if(!require(stringr)){install.packages("stringr"); library(stringr)}
## Loading required package: stringr
api.key <- "SD5uTucKQxefBuN79R8TKiBa15fAbw5j1id%2FdBH0SsWFDK4boXqDpgMPtc8QJ8UNyaSO8HpGJf%2FAbh65oBzg7g%3D%3D"
url.format <-
'http://apis.data.go.kr/B090041/openapi/service/SpcdeInfoService/getRestDeInfo?ServiceKey={key}&solYear={year}&solMonth={month}'
holiday.request <- function(key, year, month) glue(url.format)
df_holiday <- data.frame(dateName=NULL, Date=NULL)
for(m in 1:12){
holiday_2018 <- xmlToList(holiday.request(api.key, 2018, str_pad(m, 2, pad=0)))
holiday_2019 <- xmlToList(holiday.request(api.key, 2019, str_pad(m, 2, pad=0)))
items_2018 <- holiday_2018$body$items
items_2019 <- holiday_2019$body$items
items_test <- holiday_2018$body$items
items_test <- holiday_2019$body$items
for(item_2018 in items_2018){
if(item_2018$isHoliday == 'Y') {
#print(paste(item_2018$dateName, item_2018$locdate, sep=' : '))
df_holiday <- rbind(df_holiday,
data.frame(dateName = item_2018$dateName,
Date = (paste(substr(item_2018$locdate,1,4),
substr(item_2018$locdate,5,6),
substr(item_2018$locdate,7,8), sep = '-')),
stringsAsFactors = FALSE))
}
}
for(item_2019 in items_2019){
if(item_2019$isHoliday == 'Y') {
#print(paste(item_2019$dateName, item_2019$locdate, sep=' : '))
df_holiday <- rbind(df_holiday,
data.frame(dateName = item_2019$dateName,
Date = (paste(substr(item_2019$locdate,1,4),
substr(item_2019$locdate,5,6),
substr(item_2019$locdate,7,8), sep = '-')),
stringsAsFactors = FALSE))
}
}
}
# 날짜 데이터의 데이터 타입 변환
df_holiday$Date <- as.Date(df_holiday$Date)
### Left 조인하기 : 광고 데이터 + 특일 데이터..
btv_pre_roll_week_holy <- left_join(btv_pre_roll, df_holiday)
## Joining, by = "Date"
# 요일 데이터 범주화
btv_pre_roll_week_holy$isMon <-as.numeric(btv_pre_roll_week_holy$weekday == 'Monday') # 요일 범주화
btv_pre_roll_week_holy$isTue <-as.numeric(btv_pre_roll_week_holy$weekday == 'Tuesday') # 요일 범주화
btv_pre_roll_week_holy$isWed <-as.numeric(btv_pre_roll_week_holy$weekday == 'Wednesday') # 요일 범주화
btv_pre_roll_week_holy$isThu <-as.numeric(btv_pre_roll_week_holy$weekday == 'Thursday') # 요일 범주화
btv_pre_roll_week_holy$isFri <-as.numeric(btv_pre_roll_week_holy$weekday == 'Friday') # 요일 범주화
btv_pre_roll_week_holy$isSat <-as.numeric(btv_pre_roll_week_holy$weekday == 'Saturday') # 요일 범주화
btv_pre_roll_week_holy$isSun <-as.numeric(btv_pre_roll_week_holy$weekday == 'Sunday') # 요일 범주화
# 특일 데이터 범주화
btv_pre_roll_week_holy <- mutate(btv_pre_roll_week_holy, isHolyday = as.numeric(!is.na(dateName)))
# 방학 데이터 범주화 (여름방학 : 7워~8월, 겨울방학 1월~2월)
btv_pre_roll_week_holy <-
mutate(btv_pre_roll_week_holy,
isVacation = as.numeric(substr(btv_pre_roll_week_holy$Date,6,7) %in% c("07", "08", "01", "02")))
# 월별 범주화
btv_pre_roll_week_holy$isJan <-as.numeric(substr(btv_pre_roll_week_holy$Date,6,7) == '01') # 1월
btv_pre_roll_week_holy$isFeb <-as.numeric(substr(btv_pre_roll_week_holy$Date,6,7) == '02') # 2월
btv_pre_roll_week_holy$isMar <-as.numeric(substr(btv_pre_roll_week_holy$Date,6,7) == '03') # 3월
btv_pre_roll_week_holy$isApr <-as.numeric(substr(btv_pre_roll_week_holy$Date,6,7) == '04') # 4월
btv_pre_roll_week_holy$isMay <-as.numeric(substr(btv_pre_roll_week_holy$Date,6,7) == '05') # 5월
btv_pre_roll_week_holy$isJun <-as.numeric(substr(btv_pre_roll_week_holy$Date,6,7) == '06') # 6월
btv_pre_roll_week_holy$isJul <-as.numeric(substr(btv_pre_roll_week_holy$Date,6,7) == '07') # 7월
btv_pre_roll_week_holy$isAug <-as.numeric(substr(btv_pre_roll_week_holy$Date,6,7) == '08') # 8월
btv_pre_roll_week_holy$isSep <-as.numeric(substr(btv_pre_roll_week_holy$Date,6,7) == '09') # 9월
btv_pre_roll_week_holy$isOct <-as.numeric(substr(btv_pre_roll_week_holy$Date,6,7) == '10') # 10월
btv_pre_roll_week_holy$isNov <-as.numeric(substr(btv_pre_roll_week_holy$Date,6,7) == '11') # 11월
btv_pre_roll_week_holy$isDec <-as.numeric(substr(btv_pre_roll_week_holy$Date,6,7) == '12') # 12월
# 예측하기...
# 예측할 날짜로 된 수열 생성하기
s_date <- as.Date("2019-06-21")
e_date <- as.Date("2019-12-31")
add_df <- data.frame(Date = seq(from = s_date, to=e_date, by=1))
add_df$Date <- as.Date(add_df$Date)
add_df$VOD_시청시간 <- NA; add_df$VOD_시청건수 <- NA;
add_df$Request <- NA; add_df$Inv. <- NA; add_df$Response <- NA; add_df$IMP <- NA;
add_df$weekday <- NA; add_df$isMon <- NA; add_df$isTue <- NA; add_df$isWed<- NA;
add_df$isThu <- NA; add_df$isFri <- NA; add_df$isSat <- NA; add_df$isSun <- NA;
add_df$isHolyday <- NA; add_df$isVacation <- NA;
add_df$isJan <- NA; add_df$isFeb <- NA; add_df$isMar <- NA; add_df$isApr <- NA;
add_df$isMay <- NA; add_df$isJun <- NA; add_df$isJul <- NA; add_df$isAug <- NA;
add_df$isSep <- NA; add_df$isOct <- NA; add_df$isNov <- NA; add_df$isDec <- NA;
# 예측할 날짜에 요일데이터 및 특일데이터추가....
# 예측할 날짜에 요일 데이터 추가
add_df$weekday = weekdays(add_df$Date)
# 예측할 날짜에 요일 데이터 범주화 추가
add_df$isMon <-as.numeric(add_df$weekday == 'Monday') # 요일 범주화
add_df$isTue <-as.numeric(add_df$weekday == 'Tuesday') # 요일 범주화
add_df$isWed <-as.numeric(add_df$weekday == 'Wednesday') # 요일 범주화
add_df$isThu <-as.numeric(add_df$weekday == 'Thursday') # 요일 범주화
add_df$isFri <-as.numeric(add_df$weekday == 'Friday') # 요일 범주화
add_df$isSat <-as.numeric(add_df$weekday == 'Saturday') # 요일 범주화
add_df$isSun <-as.numeric(add_df$weekday == 'Sunday') # 요일 범주화
# 특일 데이터 추가
add_df <- left_join(add_df, df_holiday)
## Joining, by = "Date"
# 특일 데이터 범주화 추가
add_df$isHolyday <- as.numeric(!is.na(add_df$dateName))
# 방학 데이터 범주화 추가 (여름방학 : 7워~8월, 겨울방학 12월~2월)
add_df$isVacation <- as.numeric(substr(add_df$Date,6,7) %in% c("07", "08", "01", "02"))
# 월별 범주화
add_df$isJan <-as.numeric(substr(add_df$Date,6,7) == '01') # 1월
add_df$isFeb <-as.numeric(substr(add_df$Date,6,7) == '02') # 2월
add_df$isMar <-as.numeric(substr(add_df$Date,6,7) == '03') # 3월
add_df$isApr <-as.numeric(substr(add_df$Date,6,7) == '04') # 4월
add_df$isMay <-as.numeric(substr(add_df$Date,6,7) == '05') # 5월
add_df$isJun <-as.numeric(substr(add_df$Date,6,7) == '06') # 6월
add_df$isJul <-as.numeric(substr(add_df$Date,6,7) == '07') # 7월
add_df$isAug <-as.numeric(substr(add_df$Date,6,7) == '08') # 8월
add_df$isSep <-as.numeric(substr(add_df$Date,6,7) == '09') # 9월
add_df$isOct <-as.numeric(substr(add_df$Date,6,7) == '10') # 10월
add_df$isNov <-as.numeric(substr(add_df$Date,6,7) == '11') # 11월
add_df$isDec <-as.numeric(substr(add_df$Date,6,7) == '12') # 12월
# 기존 데이터와 예측치 결합
btv_pre_roll_week_holy2 <- rbind(btv_pre_roll_week_holy, add_df)
# ★★★★★ interation model
old_btv_pre_roll <- lm(Request ~ Date + isMon + isTue + isWed + isThu + isFri + isSat + isSun + isHolyday + isVacation +
isMon:Date + isTue:Date + isWed:Date + isThu:Date + isFri:Date + isSat:Date + isSun:Date + isHolyday:Date + isVacation:Date +
isMon:isHolyday + isTue:isHolyday + isWed:isHolyday + isThu:isHolyday + isFri:isHolyday + isSat:isHolyday + isSun:isHolyday + isVacation:isHolyday +
isMon:isVacation + isTue:isVacation + isWed:isVacation + isThu:isVacation + isFri:isVacation + isSat:isVacation + isSun:isVacation,
data = btv_pre_roll_week_holy)
int_btv_pre_roll <- lm(Request ~ Date + isMon + isTue + isWed + isThu + isFri + isSat + isHolyday +
isJan + isFeb + isMar + isApr + isMay + isJun + isJul + isAug + isSep +
isOct + isNov + isMon:isHolyday + isTue:isHolyday + isWed:isHolyday +
isThu:isHolyday + isFri:isHolyday + isSat:isHolyday + isMon:isJan +
isMon:isFeb + isMon:isMar + isMon:isApr + isMon:isMay + isMon:isJun +
isMon:isJul + isMon:isSep + isMon:isOct + isMon:isNov + isTue:isFeb +
isTue:isMar + isTue:isApr + isTue:isMay + isTue:isJun + isTue:isJul +
isTue:isSep + isTue:isOct + isTue:isNov + isWed:isFeb + isWed:isMar +
isWed:isApr + isWed:isJun + isWed:isSep + isWed:isOct + isWed:isNov +
isThu:isFeb + isThu:isMar + isThu:isApr + isThu:isSep + isThu:isOct +
isFri:isFeb + isFri:isMar + isFri:isApr + isFri:isAug + isFri:isSep +
isSat:isFeb + isSat:isSep,
data = btv_pre_roll_week_holy)
summary(int_btv_pre_roll) ## interaction effect
##
## Call:
## lm(formula = Request ~ Date + isMon + isTue + isWed + isThu +
## isFri + isSat + isHolyday + isJan + isFeb + isMar + isApr +
## isMay + isJun + isJul + isAug + isSep + isOct + isNov + isMon:isHolyday +
## isTue:isHolyday + isWed:isHolyday + isThu:isHolyday + isFri:isHolyday +
## isSat:isHolyday + isMon:isJan + isMon:isFeb + isMon:isMar +
## isMon:isApr + isMon:isMay + isMon:isJun + isMon:isJul + isMon:isSep +
## isMon:isOct + isMon:isNov + isTue:isFeb + isTue:isMar + isTue:isApr +
## isTue:isMay + isTue:isJun + isTue:isJul + isTue:isSep + isTue:isOct +
## isTue:isNov + isWed:isFeb + isWed:isMar + isWed:isApr + isWed:isJun +
## isWed:isSep + isWed:isOct + isWed:isNov + isThu:isFeb + isThu:isMar +
## isThu:isApr + isThu:isSep + isThu:isOct + isFri:isFeb + isFri:isMar +
## isFri:isApr + isFri:isAug + isFri:isSep + isSat:isFeb + isSat:isSep,
## data = btv_pre_roll_week_holy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2628097 -118726 -562 148608 1219153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.499e+07 1.774e+06 8.453 3.55e-16 ***
## Date -6.216e+02 9.912e+01 -6.271 8.11e-10 ***
## isMon -1.261e+05 1.296e+05 -0.973 0.331078
## isTue -2.591e+05 9.847e+04 -2.632 0.008776 **
## isWed -3.010e+05 7.772e+04 -3.872 0.000123 ***
## isThu -3.605e+05 7.007e+04 -5.145 3.93e-07 ***
## isFri -3.982e+05 7.092e+04 -5.615 3.36e-08 ***
## isSat 3.862e+04 6.039e+04 0.640 0.522761
## isHolyday 7.708e+04 2.070e+05 0.372 0.709783
## isJan 6.092e+05 8.001e+04 7.614 1.47e-13 ***
## isFeb 1.022e+05 1.418e+05 0.720 0.471674
## isMar 1.321e+05 1.046e+05 1.262 0.207564
## isApr 4.226e+04 1.079e+05 0.392 0.695420
## isMay -3.724e+05 8.299e+04 -4.487 9.09e-06 ***
## isJun -4.103e+05 9.142e+04 -4.488 9.06e-06 ***
## isJul -1.223e+05 9.915e+04 -1.233 0.218137
## isAug -9.386e+04 9.092e+04 -1.032 0.302447
## isSep -1.077e+06 1.725e+05 -6.243 9.57e-10 ***
## isOct 2.929e+04 1.186e+05 0.247 0.804974
## isNov 1.681e+04 1.034e+05 0.162 0.870997
## isMon:isHolyday 9.055e+04 2.664e+05 0.340 0.734094
## isTue:isHolyday 2.390e+05 2.550e+05 0.937 0.349087
## isWed:isHolyday 5.607e+05 2.584e+05 2.170 0.030504 *
## isThu:isHolyday 1.589e+05 2.916e+05 0.545 0.586211
## isFri:isHolyday -3.719e+05 3.263e+05 -1.140 0.254908
## isSat:isHolyday -7.589e+05 3.211e+05 -2.363 0.018524 *
## isMon:isJan -1.483e+05 1.731e+05 -0.857 0.391872
## isMon:isFeb -4.503e+04 2.132e+05 -0.211 0.832833
## isMon:isMar -5.135e+05 1.896e+05 -2.709 0.006996 **
## isMon:isApr -4.797e+05 1.839e+05 -2.609 0.009382 **
## isMon:isMay -6.268e+04 1.835e+05 -0.342 0.732753
## isMon:isJun -3.149e+05 1.882e+05 -1.673 0.095057 .
## isMon:isJul -6.286e+05 2.076e+05 -3.028 0.002597 **
## isMon:isSep 2.765e+05 2.656e+05 1.041 0.298418
## isMon:isOct -7.044e+05 2.183e+05 -3.227 0.001337 **
## isMon:isNov -3.643e+05 2.233e+05 -1.631 0.103558
## isTue:isFeb -2.513e+04 1.940e+05 -0.130 0.896987
## isTue:isMar -4.555e+05 1.698e+05 -2.683 0.007554 **
## isTue:isApr -4.212e+05 1.672e+05 -2.518 0.012118 *
## isTue:isMay -6.350e+04 1.511e+05 -0.420 0.674495
## isTue:isJun -2.133e+05 1.681e+05 -1.269 0.204982
## isTue:isJul -2.013e+05 1.895e+05 -1.063 0.288516
## isTue:isSep 5.299e+05 2.486e+05 2.132 0.033562 *
## isTue:isOct -2.205e+05 2.008e+05 -1.098 0.272819
## isTue:isNov -3.479e+05 2.066e+05 -1.684 0.092782 .
## isWed:isFeb 2.199e+05 1.856e+05 1.185 0.236798
## isWed:isMar -3.877e+05 1.586e+05 -2.444 0.014896 *
## isWed:isApr -3.902e+05 1.608e+05 -2.426 0.015629 *
## isWed:isJun -2.136e+05 1.615e+05 -1.323 0.186432
## isWed:isSep 5.906e+05 2.427e+05 2.433 0.015326 *
## isWed:isOct -2.952e+05 1.934e+05 -1.527 0.127511
## isWed:isNov -2.062e+05 1.974e+05 -1.045 0.296685
## isThu:isFeb 2.439e+05 1.831e+05 1.332 0.183571
## isThu:isMar -2.437e+05 1.509e+05 -1.615 0.106931
## isThu:isApr -3.231e+05 1.573e+05 -2.055 0.040435 *
## isThu:isSep 7.272e+05 2.381e+05 3.054 0.002388 **
## isThu:isOct -2.841e+05 2.030e+05 -1.400 0.162257
## isFri:isFeb 2.360e+05 1.850e+05 1.275 0.202887
## isFri:isMar -1.308e+05 1.482e+05 -0.883 0.377941
## isFri:isApr -2.548e+05 1.576e+05 -1.616 0.106736
## isFri:isAug 2.557e+05 1.757e+05 1.455 0.146298
## isFri:isSep 7.937e+05 2.384e+05 3.329 0.000941 ***
## isSat:isFeb 1.286e+05 1.805e+05 0.712 0.476591
## isSat:isSep 9.338e+05 2.235e+05 4.177 3.51e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 336000 on 472 degrees of freedom
## Multiple R-squared: 0.6431, Adjusted R-squared: 0.5954
## F-statistic: 13.5 on 63 and 472 DF, p-value: < 2.2e-16
# 점 추정 예측
#predict(int_btv_pre_roll, newdata=btv_pre_roll_week_holy2, type='response')
#interval이 confidence로 주어진 경우 회귀 계수에 대한 신뢰 구간을 고려하여 종속 변수의 신뢰 구간(confidence interval)을 찾는다.
#predict(int_btv_pre_roll, newdata=btv_pre_roll_week_holy2, type='response', interval="confidence")
#interval이 prediction인 경우 회귀 계수의 신뢰 구간과 오차항을 고려한 종속 변수의 예측 구간(prediction interval)을 찾는다.
#(구간 크기 : confidence < prediction)
# predict(int_btv_pre_roll, newdata=btv_pre_roll_week_holy2, type='response', interval="prediction")
# 구간 추정 예측 : fit = 예측값, lwr = 예측하한값, upr = 예측 상한값
#interval="prediction" 구간 추정
pred_interval <- predict(int_btv_pre_roll, newdata=btv_pre_roll_week_holy2, type='response', interval="prediction")
conf_interval <- predict(int_btv_pre_roll, newdata=btv_pre_roll_week_holy2, type='response', interval="confidence")
str(pred_interval)
## num [1:730, 1:3] 4598051 4445108 4402664 4342467 4304146 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:730] "1" "2" "3" "4" ...
## ..$ : chr [1:3] "fit" "lwr" "upr"
str(conf_interval)
## num [1:730, 1:3] 4598051 4445108 4402664 4342467 4304146 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:730] "1" "2" "3" "4" ...
## ..$ : chr [1:3] "fit" "lwr" "upr"
btv_pre_roll_week_holy2$prd_fit <- pred_interval[,1] # fit
btv_pre_roll_week_holy2$prd_lwr <- pred_interval[,2] # lwr
btv_pre_roll_week_holy2$prd_upr <- pred_interval[,3] # upr
btv_pre_roll_week_holy2$cnf_fit <- conf_interval[,1] # fit
btv_pre_roll_week_holy2$cnf_lwr <- conf_interval[,2] # lwr
btv_pre_roll_week_holy2$cnf_upr <- conf_interval[,3] # upr
#btv_pre_roll_week_holy2$old_pred <- round(predict(old_btv_pre_roll, newdata=btv_pre_roll_week_holy2, type='response'), 0)
#btv_pre_roll_week_holy2$int_pred <- round(predict(int_btv_pre_roll, newdata=btv_pre_roll_week_holy2, type='response'), 0)
btv_pre_roll_week_holy2$old_pred <- round(predict(old_btv_pre_roll, newdata=btv_pre_roll_week_holy2), 0)
## Warning in predict.lm(old_btv_pre_roll, newdata = btv_pre_roll_week_holy2):
## prediction from a rank-deficient fit may be misleading
btv_pre_roll_week_holy2$int_pred <- round(predict(int_btv_pre_roll, newdata=btv_pre_roll_week_holy2), 0)
as.numeric(as.Date('2018-01-01'))
## [1] 17532
summary(int_btv_pre_roll)
##
## Call:
## lm(formula = Request ~ Date + isMon + isTue + isWed + isThu +
## isFri + isSat + isHolyday + isJan + isFeb + isMar + isApr +
## isMay + isJun + isJul + isAug + isSep + isOct + isNov + isMon:isHolyday +
## isTue:isHolyday + isWed:isHolyday + isThu:isHolyday + isFri:isHolyday +
## isSat:isHolyday + isMon:isJan + isMon:isFeb + isMon:isMar +
## isMon:isApr + isMon:isMay + isMon:isJun + isMon:isJul + isMon:isSep +
## isMon:isOct + isMon:isNov + isTue:isFeb + isTue:isMar + isTue:isApr +
## isTue:isMay + isTue:isJun + isTue:isJul + isTue:isSep + isTue:isOct +
## isTue:isNov + isWed:isFeb + isWed:isMar + isWed:isApr + isWed:isJun +
## isWed:isSep + isWed:isOct + isWed:isNov + isThu:isFeb + isThu:isMar +
## isThu:isApr + isThu:isSep + isThu:isOct + isFri:isFeb + isFri:isMar +
## isFri:isApr + isFri:isAug + isFri:isSep + isSat:isFeb + isSat:isSep,
## data = btv_pre_roll_week_holy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2628097 -118726 -562 148608 1219153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.499e+07 1.774e+06 8.453 3.55e-16 ***
## Date -6.216e+02 9.912e+01 -6.271 8.11e-10 ***
## isMon -1.261e+05 1.296e+05 -0.973 0.331078
## isTue -2.591e+05 9.847e+04 -2.632 0.008776 **
## isWed -3.010e+05 7.772e+04 -3.872 0.000123 ***
## isThu -3.605e+05 7.007e+04 -5.145 3.93e-07 ***
## isFri -3.982e+05 7.092e+04 -5.615 3.36e-08 ***
## isSat 3.862e+04 6.039e+04 0.640 0.522761
## isHolyday 7.708e+04 2.070e+05 0.372 0.709783
## isJan 6.092e+05 8.001e+04 7.614 1.47e-13 ***
## isFeb 1.022e+05 1.418e+05 0.720 0.471674
## isMar 1.321e+05 1.046e+05 1.262 0.207564
## isApr 4.226e+04 1.079e+05 0.392 0.695420
## isMay -3.724e+05 8.299e+04 -4.487 9.09e-06 ***
## isJun -4.103e+05 9.142e+04 -4.488 9.06e-06 ***
## isJul -1.223e+05 9.915e+04 -1.233 0.218137
## isAug -9.386e+04 9.092e+04 -1.032 0.302447
## isSep -1.077e+06 1.725e+05 -6.243 9.57e-10 ***
## isOct 2.929e+04 1.186e+05 0.247 0.804974
## isNov 1.681e+04 1.034e+05 0.162 0.870997
## isMon:isHolyday 9.055e+04 2.664e+05 0.340 0.734094
## isTue:isHolyday 2.390e+05 2.550e+05 0.937 0.349087
## isWed:isHolyday 5.607e+05 2.584e+05 2.170 0.030504 *
## isThu:isHolyday 1.589e+05 2.916e+05 0.545 0.586211
## isFri:isHolyday -3.719e+05 3.263e+05 -1.140 0.254908
## isSat:isHolyday -7.589e+05 3.211e+05 -2.363 0.018524 *
## isMon:isJan -1.483e+05 1.731e+05 -0.857 0.391872
## isMon:isFeb -4.503e+04 2.132e+05 -0.211 0.832833
## isMon:isMar -5.135e+05 1.896e+05 -2.709 0.006996 **
## isMon:isApr -4.797e+05 1.839e+05 -2.609 0.009382 **
## isMon:isMay -6.268e+04 1.835e+05 -0.342 0.732753
## isMon:isJun -3.149e+05 1.882e+05 -1.673 0.095057 .
## isMon:isJul -6.286e+05 2.076e+05 -3.028 0.002597 **
## isMon:isSep 2.765e+05 2.656e+05 1.041 0.298418
## isMon:isOct -7.044e+05 2.183e+05 -3.227 0.001337 **
## isMon:isNov -3.643e+05 2.233e+05 -1.631 0.103558
## isTue:isFeb -2.513e+04 1.940e+05 -0.130 0.896987
## isTue:isMar -4.555e+05 1.698e+05 -2.683 0.007554 **
## isTue:isApr -4.212e+05 1.672e+05 -2.518 0.012118 *
## isTue:isMay -6.350e+04 1.511e+05 -0.420 0.674495
## isTue:isJun -2.133e+05 1.681e+05 -1.269 0.204982
## isTue:isJul -2.013e+05 1.895e+05 -1.063 0.288516
## isTue:isSep 5.299e+05 2.486e+05 2.132 0.033562 *
## isTue:isOct -2.205e+05 2.008e+05 -1.098 0.272819
## isTue:isNov -3.479e+05 2.066e+05 -1.684 0.092782 .
## isWed:isFeb 2.199e+05 1.856e+05 1.185 0.236798
## isWed:isMar -3.877e+05 1.586e+05 -2.444 0.014896 *
## isWed:isApr -3.902e+05 1.608e+05 -2.426 0.015629 *
## isWed:isJun -2.136e+05 1.615e+05 -1.323 0.186432
## isWed:isSep 5.906e+05 2.427e+05 2.433 0.015326 *
## isWed:isOct -2.952e+05 1.934e+05 -1.527 0.127511
## isWed:isNov -2.062e+05 1.974e+05 -1.045 0.296685
## isThu:isFeb 2.439e+05 1.831e+05 1.332 0.183571
## isThu:isMar -2.437e+05 1.509e+05 -1.615 0.106931
## isThu:isApr -3.231e+05 1.573e+05 -2.055 0.040435 *
## isThu:isSep 7.272e+05 2.381e+05 3.054 0.002388 **
## isThu:isOct -2.841e+05 2.030e+05 -1.400 0.162257
## isFri:isFeb 2.360e+05 1.850e+05 1.275 0.202887
## isFri:isMar -1.308e+05 1.482e+05 -0.883 0.377941
## isFri:isApr -2.548e+05 1.576e+05 -1.616 0.106736
## isFri:isAug 2.557e+05 1.757e+05 1.455 0.146298
## isFri:isSep 7.937e+05 2.384e+05 3.329 0.000941 ***
## isSat:isFeb 1.286e+05 1.805e+05 0.712 0.476591
## isSat:isSep 9.338e+05 2.235e+05 4.177 3.51e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 336000 on 472 degrees of freedom
## Multiple R-squared: 0.6431, Adjusted R-squared: 0.5954
## F-statistic: 13.5 on 63 and 472 DF, p-value: < 2.2e-16
round(coef(int_btv_pre_roll),4) # 절편과 기울기를 보여주는 함수는 coef()이다.
## (Intercept) Date isMon isTue
## 14993480.7174 -621.5944 -126117.7131 -259127.3872
## isWed isThu isFri isSat
## -300950.0486 -360524.8677 -398224.5950 38621.8208
## isHolyday isJan isFeb isMar
## 77079.9411 609169.3094 102157.7658 132069.5337
## isApr isMay isJun isJul
## 42257.0060 -372362.9828 -410254.8696 -122265.1892
## isAug isSep isOct isNov
## -93857.2221 -1076852.3234 29293.9191 16806.3765
## isMon:isHolyday isTue:isHolyday isWed:isHolyday isThu:isHolyday
## 90551.8062 238999.7573 560724.9489 158854.7147
## isFri:isHolyday isSat:isHolyday isMon:isJan isMon:isFeb
## -371915.6405 -758876.0621 -148320.0972 -45027.7859
## isMon:isMar isMon:isApr isMon:isMay isMon:isJun
## -513508.6919 -479653.9902 -62679.2893 -314870.4363
## isMon:isJul isMon:isSep isMon:isOct isMon:isNov
## -628584.6865 276477.4784 -704411.5033 -364270.8368
## isTue:isFeb isTue:isMar isTue:isApr isTue:isMay
## -25130.3863 -455512.4234 -421187.6038 -63503.2404
## isTue:isJun isTue:isJul isTue:isSep isTue:isOct
## -213314.3106 -201306.4179 529883.7592 -220465.5744
## isTue:isNov isWed:isFeb isWed:isMar isWed:isApr
## -347949.5683 219867.9706 -387718.6675 -390246.2463
## isWed:isJun isWed:isSep isWed:isOct isWed:isNov
## -213628.8805 590634.2171 -295228.7569 -206184.3124
## isThu:isFeb isThu:isMar isThu:isApr isThu:isSep
## 243874.2079 -243693.9753 -323146.0828 727180.3531
## isThu:isOct isFri:isFeb isFri:isMar isFri:isApr
## -284114.2459 235957.3686 -130787.6928 -254768.3861
## isFri:isAug isFri:isSep isSat:isFeb isSat:isSep
## 255722.8495 793687.6748 128606.4749 933844.7731
#write.csv(btv_pre_roll_week_holy2, '111.csv')
p1 <- ggplot(btv_pre_roll_week_holy, aes(x=Date, y=Request)) +
geom_point() + geom_smooth(method = 'lm', color = 'red', linetype =2) + # 직선 추세선 추가
geom_smooth() + # 곡선 추세선 추가
ggtitle("Raw Data 추세선")
p2 <- ggplot(btv_pre_roll_week_holy2, aes(x=Date)) +
geom_point(aes(y = Request), color = "grey") +
geom_jitter(aes(y = int_pred), color = "red", alpha = 0.5, size = 1) +
ggtitle("interation 예측")
if(!require(gridExtra)){install.packages("gridExtra"); library(gridExtra)} # ggplot2 그래프를 하나의 그림 안에 위치시킬 수 있게 해준다.
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(p1, p2, nrow =2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 194 rows containing missing values (geom_point).
grid.arrange(p1, p2, ncol =2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 194 rows containing missing values (geom_point).
as.numeric(as.Date('2018-01-01')); as.numeric(as.Date('2018-01-02')); as.numeric(as.Date('2019-12-31'))
## [1] 17532
## [1] 17533
## [1] 18261
as.numeric(as.Date('2019-12-31'))-as.numeric(as.Date('2018-01-01'))
## [1] 729