This report is a summary of a lesson by Nina Zumel & John Mount on DataCamp.
library(tidyverse)
theme_set(theme_bw())
houseprice <- read_rds("houseprice.rds")
sparrow <- read_rds("sparrow.rds")
load("Income.RData")
load("Bikes.RData")
load("Soybean.RData")
skipped
GainCurvePlot(frame, xvar, truthvar, title)
xvar: the independent variable (the prediction column)
truthvar: the dependent variable (the actual outcome column)
library(WVPlots)
houseprice_model <- lm(price ~ size, data = houseprice)
houseprice$prediction <- predict(houseprice_model, houseprice)
GainCurvePlot(houseprice, "prediction", "price", "Home price model")
A measure of how well the model fits or explains the data
\(R^2\) is the variance explained
by the model.
\(R^2 = 1 - \frac{RSS}{SS_{Tot}}\)
where \(RSS = \sum{(y - prediction)^2}\) is the residual sum of squares and \(SS_{Tot} = \sum{(y - \bar{y})^2}\) is the total sum of squares.
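As a quick check, \(R^2\) can be computed directly from its definition using the house price model above (a minimal sketch reusing houseprice and the prediction column added earlier):
# Residual sum of squares and total sum of squares for the model above
rss <- sum((houseprice$price - houseprice$prediction)^2)
ss_tot <- sum((houseprice$price - mean(houseprice$price))^2)
1 - rss / ss_tot
summary(houseprice_model)$r.squared # should agree with the value above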
Example of an additive relationship:
plant_height ~ bacteria + sun
An interaction: the simultaneous influence of two variables on the outcome is not additive.
plant_height ~ bacteria + sun + bacteria:sun
Expressing an interaction with a colon: y ~ a:b
Expressing the product with I(): y ~ I(a*b)
Main effects plus the interaction: y ~ a + b + a:b
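A minimal sketch of fitting the interaction formula, using simulated (hypothetical) data purely for illustration:
# Simulate hypothetical plant data with an interaction between bacteria and sun
set.seed(1)
plants <- data.frame(bacteria = runif(50, 0, 10), sun = runif(50, 0, 8))
plants$plant_height <- 2 + 0.5 * plants$bacteria + 1.2 * plants$sun +
  0.3 * plants$bacteria * plants$sun + rnorm(50)
# Fit main effects plus the interaction term
model_interaction <- lm(plant_height ~ bacteria + sun + bacteria:sun, data = plants)
summary(model_interaction) # the bacteria:sun row estimates the interaction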
ggplot(incometrain, aes(x = Income2005)) +
geom_density() +
geom_vline(aes(xintercept = mean(incometrain$Income2005), col = "Mean")) +
geom_vline(aes(xintercept = median(incometrain$Income2005), col = "Median")) +
scale_color_manual(values = c("Mean" = "red", "Median" = "blue")) +
theme(legend.position = "bottom",
legend.title = element_blank()) +
labs(title = "Distribution of Income Data",
subtitle = "Mean: 49,894 Median: 39,000")
Reducing multiplicative error reduces relative error
RMS-relative error = \(\sqrt{\mathrm{mean}\left(\left(\frac{pred-y}{y}\right)^2\right)}\)
modIncome <- lm(Income2005 ~ AFQT + Math, data = incometrain)
incometrain %>%
mutate(pred = predict(modIncome, .),
err = pred - Income2005) %>%
summarise(rmse = sqrt(mean(err^2)),
rms.relerr = sqrt(mean((err/Income2005)^2)))
## rmse rms.relerr
## 1 45812.71 25.54613
modIncome <- lm(log(Income2005) ~ AFQT + Math, data = incometrain)
incometrain %>%
mutate(predlog = predict(modIncome, .),
pred = exp(predlog),
err = pred - Income2005) %>%
summarise(rmse = sqrt(mean(err^2)),
rms.relerr = sqrt(mean((err/Income2005)^2)))
## rmse rms.relerr
## 1 48165.51 17.93455
log(Income2005) model: smaller RMS-relative error, larger RMSE
\(pseudo~R^2 = 1- \frac{deviance}{null.deviance}\)
Compute it from the deviances reported by broom::glance():
library(broom)
sparrow$has_status <- ifelse(sparrow$status == "Survived", 1, 0)
mld_sparrow <- glm(has_status ~ total_length + weight + humerus, data = sparrow, family = "binomial")
glance(mld_sparrow) %>%
summarize(pR2 = 1 - deviance/null.deviance)
## # A tibble: 1 × 1
## pR2
## <dbl>
## 1 0.364
Alternatively, sigr::wrapChiSqTest() reports the pseudo-\(R^2\) directly:
library(sigr)
wrapChiSqTest(mld_sparrow)
## [1] "<b>Chi-Square Test</b> summary: <i>pseudo-<i>R<sup>2</sup></i></i>=0.3637 (<i>χ<sup>2</sup></i>(3,<i>N</i>=87)=42.91, <i>p</i><1e-05)."
sparrow$pred <- predict(mld_sparrow, sparrow, type = "response")
GainCurvePlot(sparrow, "pred", "has_status", "Sparrow model")
For count outcomes, use Poisson or quasipoisson regression instead of ordinary linear regression.
glm(formula, data, family)
family: poisson or quasipoisson
Poisson regression assumes mean(y) = var(y); if var(y) is much different from mean(y), use quasipoisson.
bikesAugust %>%
summarise(mean = mean(cnt),
var = var(cnt))
## mean var
## 1 288.3105 51927.01
Since var(cnt) > mean(cnt), use quasipoisson.
fmla <- cnt ~ hr + holiday + workingday + weathersit + temp + atemp + hum + windspeed
model <- glm(fmla, data = bikesAugust, family = quasipoisson)
glance(model) %>%
summarize(pseudoR2 = 1 - deviance / null.deviance)
## # A tibble: 1 × 1
## pseudoR2
## <dbl>
## 1 0.832
bikesAugust$pred <- predict(model, bikesAugust, type = "response")
ggplot(bikesAugust, aes(x = pred, y = cnt)) +
geom_point() +
geom_abline(color = "darkblue") +
ggtitle("Predict bike rentals on new data")
bikesAugust %>%
mutate(instant = (instant - min(instant)) / 24) %>%
pivot_longer(cols = c("cnt", "pred"), names_to = "value_type", values_to = "value") %>%
filter(instant < 14) %>%
ggplot(aes(x = instant, y = value, color = value_type, linetype = value_type)) +
geom_point() +
geom_line() +
scale_x_continuous("Day", breaks = 0:14, labels = 0:14) +
scale_color_brewer(palette = "Dark2") +
ggtitle("Predicted August bike rentals, Quasipoisson model")
gam() in the mgcv package
gam(formula, family, data)
GAMs can capture non-linear relationships without explicit squared or log transformations, which is especially useful when you lack domain knowledge about the dataset.
family: gaussian (default) for regular regression, binomial for probabilities, poisson or quasipoisson for counts
Best for larger datasets
The s() function:
anx ~ s(hassles)
s() designates that a variable should be treated as non-linear, so the model learns the non-linear relationship from the data.
Use s() with continuous variables.
plot(model)
Passing the fitted model directly to plot() draws the variable transformations learned during fitting.
predict(model, type = "terms")
Set type = "terms" to obtain the y-values shown in those plots.
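A minimal sketch of predict() with type = "terms", assuming the model.gam fit in the soybean example below:
# Each column of the returned matrix is the contribution of one s() term,
# i.e. the y-values drawn by plot(model.gam)
terms_matrix <- predict(model.gam, type = "terms")
head(terms_matrix)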
summary(soybean_train)
## Plot Variety Year Time weight
## 1988F6 : 10 F:161 1988:124 Min. :14.00 Min. : 0.0290
## 1988F7 : 9 P:169 1989:102 1st Qu.:27.00 1st Qu.: 0.6663
## 1988P1 : 9 1990:104 Median :42.00 Median : 3.5233
## 1988P8 : 9 Mean :43.56 Mean : 6.1645
## 1988P2 : 9 3rd Qu.:56.00 3rd Qu.:10.3808
## 1988F3 : 8 Max. :84.00 Max. :27.3700
## (Other):276
ggplot(soybean_train, aes(x = Time, y = weight)) +
geom_point()
The relationship between Time and weight is non-linear.
library(mgcv)
fmla.gam <- weight ~ s(Time)
model.gam <- gam(fmla.gam, data = soybean_train, family = gaussian)
summary(model.gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## weight ~ s(Time)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.1645 0.1143 53.93 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Time) 8.495 8.93 338.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.902 Deviance explained = 90.4%
## GCV = 4.4395 Scale est. = 4.3117 n = 330
plot(model.gam)
fmla.lin <- weight ~ Time
model.lin <- lm(fmla.lin, data = soybean_train)
# Get predictions from linear model
soybean_test$pred.lin <- predict(model.lin, newdata = soybean_test)
# Get predictions from gam model
soybean_test$pred.gam <- as.numeric(predict(model.gam, newdata = soybean_test))
# Pivot the predictions into a "long" dataset
soybean_long <- soybean_test %>%
pivot_longer(cols = c("pred.lin", "pred.gam"), names_to = "modeltype", values_to = "pred")
# Calculate the rmse
soybean_long %>%
mutate(residual = weight - pred) %>% # residuals
group_by(modeltype) %>% # group by modeltype
summarize(rmse = sqrt(mean(residual^2))) # calculate the RMSE
## # A tibble: 2 × 2
## modeltype rmse
## <chr> <dbl>
## 1 pred.gam 2.29
## 2 pred.lin 3.19
# Compare the predictions against actual weights on the test data
soybean_long %>%
ggplot(aes(x = Time)) + # the column for the x axis
geom_point(aes(y = weight)) + # the y-column for the scatterplot
geom_point(aes(y = pred, color = modeltype)) + # the y-column for the point-and-line plot
geom_line(aes(y = pred, color = modeltype, linetype = modeltype)) + # the y-column for the point-and-line plot
scale_color_brewer(palette = "Dark2")
Decision trees learn rules of the form: IF a AND b AND c THEN y.
When a decision tree predicts, the output comes out "chunky": instead of predicting a continuous outcome smoothly, it predicts in a stepwise fashion (in the example above there are only six possible output values). This also makes it hard to express linear relationships.
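A minimal sketch of this stepwise behaviour, using the rpart package (an assumption; it is not used elsewhere in this report) and the soybean_train data from above:
library(rpart)
# A single regression tree for weight as a function of Time
tree_model <- rpart(weight ~ Time, data = soybean_train)
# The tree can only output one value per leaf, so predictions are stepwise
length(unique(predict(tree_model, soybean_train)))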
If a tree is too deep, the model becomes complex and can overfit; if it is too shallow, the predictions are coarse-grained. Rather than a single tree, an ensemble of several trees can be used to address both problems.
ranger()
model <- ranger(fmla, data, num.trees = 500, respect.unordered.factors = "order")
formula, data
num.trees (default 500): use at least 200
mtry: number of variables to try at each node
respect.unordered.factors: recommended to set to "order"
# The outcome column
(outcome <- "cnt")
## [1] "cnt"
# The input variables
(vars <- c("hr", "holiday", "workingday", "weathersit", "temp", "atemp", "hum", "windspeed"))
## [1] "hr" "holiday" "workingday" "weathersit" "temp"
## [6] "atemp" "hum" "windspeed"
# Create the formula string for bikes rented as a function of the inputs
(fmla <- paste(outcome, "~", paste(vars, collapse = " + ")))
## [1] "cnt ~ hr + holiday + workingday + weathersit + temp + atemp + hum + windspeed"
# Load the package ranger
library(ranger)
# Fit and print the random forest model
(bike_model_rf <- ranger(fmla, # formula
bikesJuly, # data
num.trees = 500,
respect.unordered.factors = "order",
seed = 1234))
## Ranger result
##
## Call:
## ranger(fmla, bikesJuly, num.trees = 500, respect.unordered.factors = "order", seed = 1234)
##
## Type: Regression
## Number of trees: 500
## Sample size: 744
## Number of independent variables: 8
## Mtry: 2
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 8305.032
## R squared (OOB): 0.8189198
Unlike most model types, calling predict() on a ranger model returns an object, so you have to extract the $predictions element yourself.
# Make predictions on the August data
bikesAugust$pred <- predict(bike_model_rf, bikesAugust)$predictions
# Calculate the RMSE of the predictions
bikesAugust %>%
mutate(residual = cnt - pred) %>% # calculate the residual
summarize(rmse = sqrt(mean(residual^2))) # calculate rmse
## rmse
## 1 96.75403
# Plot actual outcome vs predictions (predictions on x-axis)
ggplot(bikesAugust, aes(x = pred, y = cnt)) +
geom_point() +
geom_abline()
How to safely convert categorical variables to indicator variables:
Most R modeling functions handle categorical variables automatically, but the xgboost package (covered next) wraps a library that is not R-specific, so it cannot handle factors directly; they must first be converted to numeric columns. This is one-hot encoding: converting a categorical variable into several 0/1 dummy variables.
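As a quick illustration of what one-hot encoding produces, base R's model.matrix() can expand a factor into indicator columns (shown only for intuition; this report uses the vtreat approach described below):
# Expand the factor column 'color' into one 0/1 indicator column per level
df <- data.frame(color = factor(c("r", "b", "g")), size = c(11, 15, 14))
model.matrix(~ color + size - 1, data = df)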
vtreat basic idea:
designTreatmentsZ(dframe, varlist, verbose = TRUE/FALSE) designs a treatment plan from the training data.
dframe: training data
varlist: list of input variable names
verbose: suppress progress messages
prepare(treatmentplan, dframe, varRestriction = newvars) creates the "clean" data; use prepare() with the treatment plan on all future data.
treatmentplan: the treatment plan from designTreatmentsZ()
dframe: the data to prepare
varRestriction: list of variables to prepare (optional)
library(vtreat)
library(magrittr)
dframe <- data.frame(
color = c("r", "b", "g", "b", "r", "r", "g", "b", "g", "g"),
size = c(11, 15, 14, 13, 11, 9, 12, 7, 12, 11),
popularity = c(1.3956245, 0.9217988, 1.2025453, 1.0838662, 0.8043527,
1.1035440, 0.8746332, 0.6947058, 0.8832502, 1.0201361)
)
vars <- c("size", "color")
treatplan <- designTreatmentsZ(dframe = dframe, vars)
## [1] "vtreat 1.6.5 inspecting inputs Fri Feb 21 17:02:39 2025"
## [1] "designing treatments Fri Feb 21 17:02:39 2025"
## [1] " have initial level statistics Fri Feb 21 17:02:39 2025"
## [1] " scoring treatments Fri Feb 21 17:02:39 2025"
## [1] "have treatment plan Fri Feb 21 17:02:39 2025"
scoreFrame <- treatplan %>%
use_series(scoreFrame) %>%
select(varName, origName, code)
newvars <- scoreFrame %>%
filter(code %in% c("clean", "lev")) %>%
use_series(varName)
dframe.treat <- prepare(treatplan, dframe, varRestriction = newvars)
Data treated with vtreat automatically handles category levels that did not appear in the training data, so new data can be converted safely.
outcome <- "cnt"
vars <- colnames(bikesJuly)[1:8]
treatplan <- designTreatmentsZ(bikesJuly, vars, verbose = FALSE)
newvars <- treatplan %>%
use_series(scoreFrame) %>%
filter(code %in% c("clean", "lev")) %>%
use_series(varName)
bikesJuly.treat <- prepare(treatplan, bikesJuly, varRestriction = newvars)
bikesAugust.treat <- prepare(treatplan, bikesAugust, varRestriction = newvars)
str(bikesJuly.treat)
## 'data.frame': 744 obs. of 33 variables:
## $ holiday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ workingday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ temp : num 0.76 0.74 0.72 0.72 0.7 0.68 0.7 0.74 0.78 0.82 ...
## $ atemp : num 0.727 0.697 0.697 0.712 0.667 ...
## $ hum : num 0.66 0.7 0.74 0.84 0.79 0.79 0.79 0.7 0.62 0.56 ...
## $ windspeed : num 0 0.1343 0.0896 0.1343 0.194 ...
## $ hr_lev_x_0 : num 1 0 0 0 0 0 0 0 0 0 ...
## $ hr_lev_x_1 : num 0 1 0 0 0 0 0 0 0 0 ...
## $ hr_lev_x_10 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_lev_x_11 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_lev_x_12 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_lev_x_13 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_lev_x_14 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_lev_x_15 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_lev_x_16 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_lev_x_17 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_lev_x_18 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_lev_x_19 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_lev_x_2 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ hr_lev_x_20 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_lev_x_21 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_lev_x_22 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_lev_x_23 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hr_lev_x_3 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ hr_lev_x_4 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ hr_lev_x_5 : num 0 0 0 0 0 1 0 0 0 0 ...
## $ hr_lev_x_6 : num 0 0 0 0 0 0 1 0 0 0 ...
## $ hr_lev_x_7 : num 0 0 0 0 0 0 0 1 0 0 ...
## $ hr_lev_x_8 : num 0 0 0 0 0 0 0 0 1 0 ...
## $ hr_lev_x_9 : num 0 0 0 0 0 0 0 0 0 1 ...
## $ weathersit_lev_x_Clear_to_partly_cloudy: num 1 1 1 1 1 1 1 1 1 1 ...
## $ weathersit_lev_x_Light_Precipitation : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weathersit_lev_x_Misty : num 0 0 0 0 0 0 0 0 0 0 ...
Because XGBoost repeatedly fits trees to minimize the residuals on the training data, the risk of overfitting is fairly high. Estimating the out-of-sample error with cross-validation reduces that risk.
Run xgb.cv() with a large number of rounds (trees):
cv <- xgb.cv(data = as.matrix(bikesJan.treat), label = bikesJan$cnt, objective = "reg:squarederror", nrounds = 100, nfold = 5, eta = 0.3, max_depth = 6)
Key arguments to xgb.cv() and xgboost():
data: input data as a matrix
label: the outcome
objective: for regression, "reg:squarederror"
nrounds: maximum number of trees to fit
eta: learning rate
max_depth: maximum depth of individual trees
nfold (xgb.cv() only): number of folds for cross-validation
xgb.cv()$evaluation_log records the estimated RMSE for each round:
elog <- as.data.frame(cv$evaluation_log)
(nrounds <- which.min(elog$test_rmse_mean))
Fit the final model with xgboost(), setting nrounds = \(n_{best}\):
nrounds <- 78
model <- xgboost(data = as.matrix(bikesJan.treat),
label = bikesJan$cnt,
nrounds = nrounds,
objective = "reg:squarederror",
eta = 0.3,
max_depth = 6)
bikesFeb.treat <- prepare(treatplan, bikesFeb, varRestriction = newvars)
bikesFeb$pred <- predict(model, as.matrix(bikesFeb.treat))
library(xgboost)
cv <- xgb.cv(data = as.matrix(bikesJuly.treat),
label = bikesJuly$cnt,
nrounds = 50,
nfold = 5,
objective = "reg:squarederror",
eta = 0.75,
max_depth = 5,
early_stopping_rounds = 5,
verbose = FALSE)
elog <- as.data.frame(cv$evaluation_log)
elog %>%
summarise(ntrees.train = which.min(train_rmse_mean),
ntrees.test = which.min(test_rmse_mean))
## ntrees.train ntrees.test
## 1 41 36
ntrees <- which.min(elog$test_rmse_mean)
bike_model_xgb <- xgboost(data = as.matrix(bikesJuly.treat),
label = bikesJuly$cnt,
nrounds = ntrees,
objective = "reg:squarederror",
eta = 0.75,
max_depth = 5,
verbose = FALSE)
bikesAugust$pred <- predict(bike_model_xgb, as.matrix(bikesAugust.treat))
ggplot(bikesAugust, aes(x = pred, y = cnt)) +
geom_point() +
geom_abline()
The model appears to fit well, but some of the predictions in the lower left are negative.
bikesAugust %>%
mutate(res = cnt - pred) %>%
summarize(rmse = sqrt(mean(res^2)))
## rmse
## 1 83.95859