This report summarizes a DataCamp lesson by Dima.
### List Column Workflow
gap_nested <- gapminder %>%
  group_by(country) %>%
  nest()
# Explore gap_nested
head(gap_nested)
## # A tibble: 6 × 2
## # Groups: country [6]
## country data
## <fct> <list>
## 1 Algeria <tibble [52 × 6]>
## 2 Argentina <tibble [52 × 6]>
## 3 Australia <tibble [52 × 6]>
## 4 Austria <tibble [52 × 6]>
## 5 Bangladesh <tibble [52 × 6]>
## 6 Belgium <tibble [52 × 6]>
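Each cell of the data column is an ordinary tibble, so a single country's rows can be pulled out by position (a quick sketch; output omitted):
# Extract the nested tibble for the first country (Algeria)
algeria_df <- gap_nested$data[[1]]
head(algeria_df)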
# Calculate the mean population for each country
pop_nested <- gap_nested %>%
mutate(mean_pop = map(data, ~mean(.x$population)))
# Take a look at pop_nested
head(pop_nested)
## # A tibble: 6 × 3
## # Groups: country [6]
## country data mean_pop
## <fct> <list> <list>
## 1 Algeria <tibble [52 × 6]> <dbl [1]>
## 2 Argentina <tibble [52 × 6]> <dbl [1]>
## 3 Australia <tibble [52 × 6]> <dbl [1]>
## 4 Austria <tibble [52 × 6]> <dbl [1]>
## 5 Bangladesh <tibble [52 × 6]> <dbl [1]>
## 6 Belgium <tibble [52 × 6]> <dbl [1]>
# Extract the mean_pop value by using unnest
pop_mean <- pop_nested %>%
unnest(mean_pop)
# Take a look at pop_mean
head(pop_mean)
## # A tibble: 6 × 3
## # Groups: country [6]
## country data mean_pop
## <fct> <list> <dbl>
## 1 Algeria <tibble [52 × 6]> 23129438.
## 2 Argentina <tibble [52 × 6]> 30783053.
## 3 Australia <tibble [52 × 6]> 16074837.
## 4 Austria <tibble [52 × 6]> 7746272.
## 5 Bangladesh <tibble [52 × 6]> 97649407.
## 6 Belgium <tibble [52 × 6]> 9983596.
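As an aside, purrr's map_dbl() returns a numeric vector directly, skipping the unnest() step; a minimal sketch of the same computation:
# Same result in one step: map_dbl() simplifies the result to doubles
pop_mean_direct <- gap_nested %>%
  mutate(mean_pop = map_dbl(data, ~mean(.x$population)))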
# Build a linear model for each country
gap_models <- gap_nested %>%
mutate(model = map(data, ~lm(formula = life_expectancy~year, data = .x)))
# Extract the model for Algeria
algeria_model <- gap_models$model[[1]]
# View the summary for the Algeria model
summary(algeria_model)
##
## Call:
## lm(formula = life_expectancy ~ year, data = .x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.044 -1.577 -0.543 1.700 3.843
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.197e+03 3.994e+01 -29.96 <2e-16 ***
## year 6.349e-01 2.011e-02 31.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.177 on 50 degrees of freedom
## Multiple R-squared: 0.9522, Adjusted R-squared: 0.9513
## F-statistic: 996.2 on 1 and 50 DF, p-value: < 2.2e-16
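The printed summary above is awkward to work with programmatically; the broom package returns the same information as tidy tibbles through the three verbs listed next. A minimal sketch of the first two on the Algeria fit (output omitted):
library(broom)
tidy(algeria_model)    # coefficient table as a tibble (one row per term)
glance(algeria_model)  # one-row model summary (r.squared, sigma, ...)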
* tidy(): returns the statistical findings of the model (such as coefficients)
* glance(): returns a concise one-row summary of the model
* augment(): adds prediction columns to the data being modeled
# Build the augmented data frame
algeria_fitted <- augment(algeria_model)
# Compare the predicted values with the actual values of life expectancy
algeria_fitted %>%
ggplot(aes(x = year)) +
geom_point(aes(y = life_expectancy)) +
geom_line(aes(y = .fitted), color = "red")
# Extract the coefficient statistics of each model into nested data frames
model_coef_nested <- gap_models %>%
mutate(coef = map(model, ~tidy(.x)))
# Simplify the coef data frames for each model
model_coef <- model_coef_nested %>%
unnest(coef)
# Plot a histogram of the coefficient estimates for year
model_coef %>%
filter(term == "year") %>%
ggplot(aes(x = estimate)) +
geom_histogram()
### Evaluating the fit of many models
# Extract the fit statistics of each model into data frames
model_perf_nested <- gap_models %>%
mutate(fit = map(model, ~glance(.x)))
# Simplify the fit data frames for each model
model_perf <- model_perf_nested %>%
unnest(fit)
# Look at the first six rows of model_perf
head(model_perf, 6)
## # A tibble: 6 × 15
## # Groups: country [6]
## country data model r.squared adj.r.squared sigma statistic p.value df
## <fct> <list> <lis> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Algeria <tibble> <lm> 0.952 0.951 2.18 996. 1.11e-34 1
## 2 Argenti… <tibble> <lm> 0.984 0.984 0.431 3137. 8.78e-47 1
## 3 Austral… <tibble> <lm> 0.983 0.983 0.511 2905. 5.83e-46 1
## 4 Austria <tibble> <lm> 0.987 0.986 0.438 3702. 1.48e-48 1
## 5 Banglad… <tibble> <lm> 0.949 0.947 1.83 921. 7.10e-34 1
## 6 Belgium <tibble> <lm> 0.990 0.990 0.331 5094. 5.54e-52 1
## # ℹ 6 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>,
## # df.residual <int>, nobs <int>
# Plot a histogram of rsquared for the 77 models
model_perf %>%
ggplot(aes(x = r.squared)) +
geom_histogram()
# Extract the 4 best fitting models
best_fit <- model_perf %>%
ungroup() %>%
slice_max(r.squared, n = 4)
# Extract the 4 models with the worst fit
worst_fit <- model_perf %>%
ungroup() %>%
slice_min(r.squared, n = 4)
best_augmented <- best_fit %>%
# Build the augmented data frame for each country model
mutate(augmented = map(model, ~ augment(.x))) %>%
# Expand the augmented data frames
unnest(augmented)
worst_augmented <- worst_fit %>%
# Build the augmented data frame for each country model
mutate(augmented = map(model, ~ augment(.x))) %>%
# Expand the augmented data frames
unnest(augmented)
head(best_augmented)
## # A tibble: 6 × 23
## country data model r.squared adj.r.squared sigma statistic p.value df
## <fct> <list> <list> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Italy <tibble> <lm> 0.997 0.997 0.226 15665. 4.14e-64 1
## 2 Italy <tibble> <lm> 0.997 0.997 0.226 15665. 4.14e-64 1
## 3 Italy <tibble> <lm> 0.997 0.997 0.226 15665. 4.14e-64 1
## 4 Italy <tibble> <lm> 0.997 0.997 0.226 15665. 4.14e-64 1
## 5 Italy <tibble> <lm> 0.997 0.997 0.226 15665. 4.14e-64 1
## 6 Italy <tibble> <lm> 0.997 0.997 0.226 15665. 4.14e-64 1
## # ℹ 14 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>,
## # df.residual <int>, nobs <int>, life_expectancy <dbl>, year <int>,
## # .fitted <dbl>, .resid <dbl>, .hat <dbl>, .sigma <dbl>, .cooksd <dbl>,
## # .std.resid <dbl>
best_augmented %>%
ggplot(aes(x = year)) +
geom_point(aes(y = life_expectancy)) +
geom_line(aes(y = .fitted), color = "red") +
facet_wrap(~country, scales = "free_y")
##### Worst fitting
worst_augmented %>%
ggplot(aes(x = year)) +
geom_point(aes(y = life_expectancy)) +
geom_line(aes(y = .fitted), color = "red") +
facet_wrap(~country, scales = "free_y")
#### Improve the fit of your models
# Build a linear model for each country using all features
gap_fullmodel <- gap_nested %>%
mutate(model = map(data, ~lm(formula = life_expectancy ~ ., data = .x)))
fullmodel_perf <- gap_fullmodel %>%
# Extract the fit statistics of each model into data frames
mutate(fit = map(model, ~glance(.x))) %>%
# Simplify the fit data frames for each model
unnest(fit)
# View the full-model performance for the four countries whose simple models fit worst
fullmodel_perf %>%
filter(country %in% worst_fit$country) %>%
select(country, adj.r.squared)
## # A tibble: 4 × 2
## # Groups: country [4]
## country adj.r.squared
## <fct> <dbl>
## 1 Botswana 0.844
## 2 Lesotho 0.908
## 3 Zambia 0.706
## 4 Zimbabwe 0.978
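To see the improvement side by side, the simple-model and full-model adjusted R-squared values can be joined on country (a sketch; simple_adj_r2 and full_adj_r2 are made-up column names):
# Compare the year-only models against the all-feature models
worst_fit %>%
  select(country, simple_adj_r2 = adj.r.squared) %>%
  inner_join(fullmodel_perf %>%
               select(country, full_adj_r2 = adj.r.squared),
             by = "country")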
gap_split <- initial_split(gapminder, prop = 0.75)
training_data <- training(gap_split)
testing_data <- testing(gap_split)
nrow(training_data)
## [1] 3003
nrow(testing_data)
## [1] 1001
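Note that initial_split() samples rows at random, so the split above is not reproducible as written; seeding the RNG first fixes that (a sketch repeating the same call):
set.seed(42)  # fix the RNG so the 75/25 split is reproducible
gap_split <- initial_split(gapminder, prop = 0.75)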
vfold_cv(data = ..., v = ...)
cv_split <- vfold_cv(training_data, v = 5)
cv_split
## # 5-fold cross-validation
## # A tibble: 5 × 2
## splits id
## <list> <chr>
## 1 <split [2402/601]> Fold1
## 2 <split [2402/601]> Fold2
## 3 <split [2402/601]> Fold3
## 4 <split [2403/600]> Fold4
## 5 <split [2403/600]> Fold5
cv_data <- cv_split %>%
mutate(train = map(splits, ~training(.x)),
validate = map(splits, ~testing(.x)))
head(cv_data)
## # 5-fold cross-validation
## # A tibble: 5 × 4
## splits id train validate
## <list> <chr> <list> <list>
## 1 <split [2402/601]> Fold1 <tibble [2,402 × 7]> <tibble [601 × 7]>
## 2 <split [2402/601]> Fold2 <tibble [2,402 × 7]> <tibble [601 × 7]>
## 3 <split [2402/601]> Fold3 <tibble [2,402 × 7]> <tibble [601 × 7]>
## 4 <split [2403/600]> Fold4 <tibble [2,403 × 7]> <tibble [600 × 7]>
## 5 <split [2403/600]> Fold5 <tibble [2,403 × 7]> <tibble [600 × 7]>
MAE (Mean Absolute Error), \(\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n}\lvert y_i - \hat{y}_i \rvert\), measures how far the model's predictions are from the actual values on average; smaller is better.
# Build a model using the train data for each fold of the cross validation
cv_models_lm <- cv_data %>%
mutate(model = map(train, ~lm(life_expectancy ~ ., data = .x)))
cv_prep_lm <- cv_models_lm %>%
mutate(
# Extract the recorded life expectancy for the records in the validate data frames
validate_actual = map(validate, ~.x$life_expectancy),
# Predict life expectancy for each validate set using its corresponding model
validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y))
)
cv_prep_lm
## # 5-fold cross-validation
## # A tibble: 5 × 7
## splits id train validate model validate_actual
## <list> <chr> <list> <list> <list> <list>
## 1 <split [2402/601]> Fold1 <tibble [2,402 × 7]> <tibble> <lm> <dbl [601]>
## 2 <split [2402/601]> Fold2 <tibble [2,402 × 7]> <tibble> <lm> <dbl [601]>
## 3 <split [2402/601]> Fold3 <tibble [2,402 × 7]> <tibble> <lm> <dbl [601]>
## 4 <split [2403/600]> Fold4 <tibble [2,403 × 7]> <tibble> <lm> <dbl [600]>
## 5 <split [2403/600]> Fold5 <tibble [2,403 × 7]> <tibble> <lm> <dbl [600]>
## # ℹ 1 more variable: validate_predicted <list>
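The map2() call above walks two list columns in parallel, pairing each model with its own validate set; a toy illustration of the pattern (hypothetical values):
# map2() applies a two-argument function elementwise over two lists
purrr::map2(list(1, 2), list(10, 20), ~ .x + .y)  # returns list(11, 22)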
# Calculate the mean absolute error for each validate fold
cv_eval_lm <- cv_prep_lm %>%
mutate(validate_mae = map2_dbl(validate_actual, validate_predicted, ~mae(actual = .x, predicted = .y)))
# Print the validate_mae column
cv_eval_lm$validate_mae
## [1] 1.544342 1.500521 1.505744 1.516214 1.546751
# Calculate the mean of validate_mae column
mean(cv_eval_lm$validate_mae)
## [1] 1.522714
The cv_eval_lm results show that the linear model's predictions are off by about 1.5 years on average. Can a more accurate model do better?
A random forest is an ensemble learning method that trains many decision trees and combines their predictions.
rf_model <- ranger::ranger(formula = …, data = …, seed = …, mtry = …, num.trees = …)
* seed: set so that results are reproducible
* mtry: ranges from 1 to the number of features (default: regression - number of features / 3, classification - \(\sqrt{\text{number of features}}\)). At each split, mtry variables are randomly selected and the best split among them is chosen; a smaller mtry gives a more random model (lower performance), while a larger mtry gives a more deterministic model (higher risk of overfitting).
* num.trees: ranges from 1 to \(\infty\) (default: 500)
prediction <- predict(rf_model, new_data)$predictions
library(ranger)
# Build a random forest model for each fold
cv_model_rf <- cv_data %>%
mutate(model = map(train, ~ ranger(formula = life_expectancy ~ .,
data = .x,
seed = 42,
num.trees = 100)))
# Generate predictions using the random forest model
cv_prep_rf <- cv_model_rf %>%
mutate(validate_predicted = map2(model, validate,
~ predict(.x, .y)$predictions),
validate_actual = map(validate, ~.x$life_expectancy))
# Calculate validate MAE for each fold
cv_eval_rf <- cv_prep_rf %>%
mutate(validate_mae = map2_dbl(validate_actual, validate_predicted, ~ mae(actual = .x, predicted = .y)))
# Print the validate_mae column
cv_eval_rf$validate_mae
## [1] 0.7884257 0.8071857 0.7737300 0.7586612 0.9464448
# Calculate the mean of validate_mae column
mean(cv_eval_rf$validate_mae)
## [1] 0.8148895
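Putting the two cross-validated errors side by side (values taken from the outputs above):
# Mean validate MAE: the random forest roughly halves the linear model's error
c(lm = mean(cv_eval_lm$validate_mae),  # ~1.52 years
  rf = mean(cv_eval_rf$validate_mae))  # ~0.81 years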
To tune mtry, use tidyr's crossing() to pair every fold with every candidate mtry value.
# Prepare for tuning your cross validation folds by varying mtry
cv_tune <- cv_data %>%
crossing(mtry = 2:5)
cv_tune
## # A tibble: 20 × 5
## splits id train validate mtry
## <list> <chr> <list> <list> <int>
## 1 <split [2402/601]> Fold1 <tibble [2,402 × 7]> <tibble [601 × 7]> 2
## 2 <split [2402/601]> Fold1 <tibble [2,402 × 7]> <tibble [601 × 7]> 3
## 3 <split [2402/601]> Fold1 <tibble [2,402 × 7]> <tibble [601 × 7]> 4
## 4 <split [2402/601]> Fold1 <tibble [2,402 × 7]> <tibble [601 × 7]> 5
## 5 <split [2402/601]> Fold2 <tibble [2,402 × 7]> <tibble [601 × 7]> 2
## 6 <split [2402/601]> Fold2 <tibble [2,402 × 7]> <tibble [601 × 7]> 3
## 7 <split [2402/601]> Fold2 <tibble [2,402 × 7]> <tibble [601 × 7]> 4
## 8 <split [2402/601]> Fold2 <tibble [2,402 × 7]> <tibble [601 × 7]> 5
## 9 <split [2402/601]> Fold3 <tibble [2,402 × 7]> <tibble [601 × 7]> 2
## 10 <split [2402/601]> Fold3 <tibble [2,402 × 7]> <tibble [601 × 7]> 3
## 11 <split [2402/601]> Fold3 <tibble [2,402 × 7]> <tibble [601 × 7]> 4
## 12 <split [2402/601]> Fold3 <tibble [2,402 × 7]> <tibble [601 × 7]> 5
## 13 <split [2403/600]> Fold4 <tibble [2,403 × 7]> <tibble [600 × 7]> 2
## 14 <split [2403/600]> Fold4 <tibble [2,403 × 7]> <tibble [600 × 7]> 3
## 15 <split [2403/600]> Fold4 <tibble [2,403 × 7]> <tibble [600 × 7]> 4
## 16 <split [2403/600]> Fold4 <tibble [2,403 × 7]> <tibble [600 × 7]> 5
## 17 <split [2403/600]> Fold5 <tibble [2,403 × 7]> <tibble [600 × 7]> 2
## 18 <split [2403/600]> Fold5 <tibble [2,403 × 7]> <tibble [600 × 7]> 3
## 19 <split [2403/600]> Fold5 <tibble [2,403 × 7]> <tibble [600 × 7]> 4
## 20 <split [2403/600]> Fold5 <tibble [2,403 × 7]> <tibble [600 × 7]> 5
# Build a model for each fold & mtry combination
cv_model_tunerf <- cv_tune %>%
mutate(model = map2(train, mtry, ~ ranger(formula = life_expectancy ~ .,
data = .x,
mtry = .y,
num.trees = 100)))
# Generate predictions using the random forest model
cv_prep_tunerf <- cv_model_tunerf %>%
mutate(validate_predicted = map2(model, validate,
~ predict(.x, .y)$predictions),
validate_actual = map(validate, ~.x$life_expectancy))
# Calculate validate MAE for each fold
cv_eval_tunerf <- cv_prep_tunerf %>%
mutate(validate_mae = map2_dbl(validate_actual, validate_predicted, ~ mae(actual = .x, predicted = .y)))
# Calculate the mean validate_mae for each mtry used
cv_eval_tunerf %>%
group_by(mtry) %>%
summarise(mean_mae = mean(validate_mae))
## # A tibble: 4 × 2
## mtry mean_mae
## <int> <dbl>
## 1 2 0.821
## 2 3 0.808
## 3 4 0.807
## 4 5 0.805
The mean MAE barely changes across these mtry values (0.805-0.821); we take ranger with mtry = 4 and num.trees = 100 as the best model and refit it on all the training data.
best_model <- ranger(formula = life_expectancy ~ .,
data = training_data,
mtry = 4,
num.trees = 100,
seed = 42)
test_actual <- testing_data$life_expectancy
test_predict <- predict(best_model, testing_data)$predictions
mae(test_actual, test_predict)
## [1] 0.6326096
For binary classification (predicting Attrition in the attrition data), fit a logistic regression with glm():
glm(formula = …, data = …, family = "binomial")
set.seed(42)
# Prepare the initial split object
data_split <- initial_split(attrition, prop = 0.75)
# Extract the training data frame
training_data <- training(data_split)
# Extract the testing data frame
testing_data <- testing(data_split)
cv_split <- vfold_cv(training_data, v = 5)
cv_data <- cv_split %>%
mutate(
# Extract the train data frame for each split
train = map(splits, ~training(.x)),
# Extract the validate data frame for each split
validate = map(splits, ~testing(.x))
)
cv_data
## # 5-fold cross-validation
## # A tibble: 5 × 4
## splits id train validate
## <list> <chr> <list> <list>
## 1 <split [881/221]> Fold1 <df [881 × 31]> <df [221 × 31]>
## 2 <split [881/221]> Fold2 <df [881 × 31]> <df [221 × 31]>
## 3 <split [882/220]> Fold3 <df [882 × 31]> <df [220 × 31]>
## 4 <split [882/220]> Fold4 <df [882 × 31]> <df [220 × 31]>
## 5 <split [882/220]> Fold5 <df [882 × 31]> <df [220 × 31]>
cv_models_lr <- cv_data %>%
mutate(model = map(train, ~ glm(formula = Attrition ~ .,
data = .x,
family = "binomial")))
cv_models_lr
## # 5-fold cross-validation
## # A tibble: 5 × 5
## splits id train validate model
## <list> <chr> <list> <list> <list>
## 1 <split [881/221]> Fold1 <df [881 × 31]> <df [221 × 31]> <glm>
## 2 <split [881/221]> Fold2 <df [881 × 31]> <df [221 × 31]> <glm>
## 3 <split [882/220]> Fold3 <df [882 × 31]> <df [220 × 31]> <glm>
## 4 <split [882/220]> Fold4 <df [882 × 31]> <df [220 × 31]> <glm>
## 5 <split [882/220]> Fold5 <df [882 × 31]> <df [220 × 31]> <glm>
Because the Attrition values are the strings "Yes" and "No", they must be converted to TRUE and FALSE:
validate_actual <- validate$Attrition == "Yes"
Since predict() returns a vector of probabilities, it must be converted to a binary vector:
validate_prob <- predict(model, validate, type = "response")
validate_predicted <- validate_prob > 0.5
* table(validate_actual, validate_predicted): compare the actual and predicted classes in a confusion table
* accuracy(validate_actual, validate_predicted): how well your model predicted both the TRUE and FALSE classes
* precision(validate_actual, validate_predicted): how often the model is correct at predicting the TRUE class (of the observations predicted TRUE, how many are actually TRUE?)
* recall(validate_actual, validate_predicted): compares the number of observations the model has correctly identified as TRUE to the total number of actual TRUE observations (of the actual TRUE observations, how many did the model predict as TRUE?)
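A toy example of these metrics on hand-made vectors, assuming they come from the Metrics package (the source of mae() used earlier):
library(Metrics)
actual    <- c(TRUE, TRUE, FALSE, FALSE, TRUE)
predicted <- c(TRUE, FALSE, FALSE, TRUE, TRUE)
table(actual, predicted)     # 2 TP, 1 TN, 1 FP, 1 FN
accuracy(actual, predicted)  # (2 + 1) / 5 = 0.6
precision(actual, predicted) # 2 / (2 + 1) = 0.667 (TP / predicted TRUE)
recall(actual, predicted)    # 2 / (2 + 1) = 0.667 (TP / actual TRUE)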
cv_prep_lr <- cv_models_lr %>%
mutate(
# Prepare binary vector of actual Attrition values in validate
validate_actual = map(validate, ~.x$Attrition == "Yes"),
# Prepare binary vector of predicted Attrition values for validate
validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y, type = "response") > 0.5)
)
cv_prep_lr
## # 5-fold cross-validation
## # A tibble: 5 × 7
## splits id train validate model validate_actual
## <list> <chr> <list> <list> <list> <list>
## 1 <split [881/221]> Fold1 <df [881 × 31]> <df [221 × 31]> <glm> <lgl [221]>
## 2 <split [881/221]> Fold2 <df [881 × 31]> <df [221 × 31]> <glm> <lgl [221]>
## 3 <split [882/220]> Fold3 <df [882 × 31]> <df [220 × 31]> <glm> <lgl [220]>
## 4 <split [882/220]> Fold4 <df [882 × 31]> <df [220 × 31]> <glm> <lgl [220]>
## 5 <split [882/220]> Fold5 <df [882 × 31]> <df [220 × 31]> <glm> <lgl [220]>
## # ℹ 1 more variable: validate_predicted <list>
# Calculate the validate recall for each cross validation fold
cv_perf_recall <- cv_prep_lr %>%
mutate(validate_recall = map2_dbl(.x = validate_actual,
.y = validate_predicted,
.f = ~ recall(actual = .x, predicted = .y)))
# Print the validate_recall column
cv_perf_recall$validate_recall
## [1] 0.3157895 0.2727273 0.4651163 0.4705882 0.5172414
# Calculate the average of the validate_recall column
mean(cv_perf_recall$validate_recall)
## [1] 0.4082925
library(ranger)
# Prepare for tuning your cross validation folds by varying mtry
cv_tune <- cv_data %>%
crossing(mtry = c(2, 4, 8, 16))
# Build a cross validation model for each fold & mtry combination
cv_models_rf <- cv_tune %>%
mutate(model = map2(train, mtry, ~ranger(formula = Attrition~.,
data = .x, mtry = .y,
num.trees = 100, seed = 42)))
cv_prep_rf <- cv_models_rf %>%
mutate(
# Prepare binary vector of actual Attrition values in validate
validate_actual = map(validate, ~.x$Attrition == "Yes"),
# Prepare binary vector of predicted Attrition values for validate
validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y, type = "response")$predictions == "Yes")
)
# Calculate the validate recall for each cross validation fold
cv_perf_recall <- cv_prep_rf %>%
mutate(recall = map2_dbl(.x = validate_actual, .y = validate_predicted, ~recall(actual = .x, predicted = .y)))
# Calculate the mean recall for each mtry used
cv_perf_recall %>%
group_by(mtry) %>%
summarise(mean_recall = mean(recall))
## # A tibble: 4 × 2
## mtry mean_recall
## <dbl> <dbl>
## 1 2 0.0818
## 2 4 0.137
## 3 8 0.155
## 4 16 0.177
With a recall of roughly 0.41, the logistic regression model clearly outperforms the best random forest (0.177), so we use logistic regression as the final model.
# Build the logistic regression model using all training data
best_model <- glm(formula = Attrition ~ .,
data = training_data, family = "binomial")
# Prepare binary vector of actual Attrition values for testing_data
test_actual <- testing_data$Attrition == "Yes"
# Prepare binary vector of predicted Attrition values for testing_data
test_predicted <- predict(best_model, testing_data, type = "response") > 0.5
# Compare the actual & predicted performance visually using a table
table(test_actual, test_predicted)
## test_predicted
## test_actual FALSE TRUE
## FALSE 301 7
## TRUE 33 27
# Calculate the test accuracy
accuracy(test_actual, test_predicted)
## [1] 0.8913043
# Calculate the test precision
precision(test_actual, test_predicted)
## [1] 0.7941176
# Calculate the test recall
recall(test_actual, test_predicted)
## [1] 0.45
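As a check, all three test metrics can be read straight off the confusion table above:
(301 + 27) / (301 + 7 + 33 + 27)  # accuracy  = 0.891 (correct / total)
27 / (7 + 27)                     # precision = 0.794 (within predicted TRUE)
27 / (33 + 27)                    # recall    = 0.45  (within actual TRUE)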