数据大小: 80个预测变量, 2930观测值
## [1] 2930 81
因变量前几个
## [1] 215000 105000 172000 244000 189900 195500
数据前几行
## # A tibble: 6 x 81
## MS_SubClass MS_Zoning Lot_Frontage Lot_Area Street Alley Lot_Shape
## <fct> <fct> <dbl> <int> <fct> <fct> <fct>
## 1 One_Story_1946_and_New~ Resident~ 141 31770 Pave No_A~ Slightly~
## 2 One_Story_1946_and_New~ Resident~ 80 11622 Pave No_A~ Regular
## 3 One_Story_1946_and_New~ Resident~ 81 14267 Pave No_A~ Slightly~
## 4 One_Story_1946_and_New~ Resident~ 93 11160 Pave No_A~ Regular
## 5 Two_Story_1946_and_New~ Resident~ 74 13830 Pave No_A~ Slightly~
## 6 Two_Story_1946_and_New~ Resident~ 78 9978 Pave No_A~ Slightly~
## # i 74 more variables: Land_Contour <fct>, Utilities <fct>, Lot_Config <fct>,
## # Land_Slope <fct>, Neighborhood <fct>, Condition_1 <fct>, Condition_2 <fct>,
## # Bldg_Type <fct>, House_Style <fct>, Overall_Qual <fct>, Overall_Cond <fct>,
## # Year_Built <int>, Year_Remod_Add <int>, Roof_Style <fct>, Roof_Matl <fct>,
## # Exterior_1st <fct>, Exterior_2nd <fct>, Mas_Vnr_Type <fct>,
## # Mas_Vnr_Area <dbl>, Exter_Qual <fct>, Exter_Cond <fct>, Foundation <fct>,
## # Bsmt_Qual <fct>, Bsmt_Cond <fct>, Bsmt_Exposure <fct>, ...
各变量基本情况
离职数据
## Warning: 程辑包'modeldata'是用R版本4.3.3 来建造的
##
## 载入程辑包:'modeldata'
## The following object is masked _by_ '.GlobalEnv':
##
## ames
30个变量 1470观测值
## [1] 1470 31
## # A tibble: 6 x 31
## Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education
## <int> <fct> <fct> <int> <fct> <int> <ord>
## 1 41 Yes Travel_Rarely 1102 Sales 1 College
## 2 49 No Travel_Freque~ 279 Research_~ 8 Below_Co~
## 3 37 Yes Travel_Rarely 1373 Research_~ 2 College
## 4 33 No Travel_Freque~ 1392 Research_~ 3 Master
## 5 27 No Travel_Rarely 591 Research_~ 2 Below_Co~
## 6 32 No Travel_Freque~ 1005 Research_~ 2 College
## # i 24 more variables: EducationField <fct>, EnvironmentSatisfaction <ord>,
## # Gender <fct>, HourlyRate <int>, JobInvolvement <ord>, JobLevel <int>,
## # JobRole <fct>, JobSatisfaction <ord>, MaritalStatus <fct>,
## # MonthlyIncome <int>, MonthlyRate <int>, NumCompaniesWorked <int>,
## # OverTime <fct>, PercentSalaryHike <int>, PerformanceRating <ord>,
## # RelationshipSatisfaction <ord>, StockOptionLevel <int>,
## # TotalWorkingYears <int>, TrainingTimesLastYear <int>, ...
y的分布: 16%离职
##
## No Yes
## 0.8387755 0.1612245
拆分钻石数据
set.seed(123) # for reproducibility, 设置种子,可再现再生
split <- initial_split(diamonds, strata = "price", prop = 0.7)
train <- training(split)
test <- testing(split)检查训练集与测试集密度函数
# Do the distributions line up?
ggplot(train, aes(x = price)) +
geom_line(stat = "density",
trim = TRUE) +
geom_line(data = test,
stat = "density",
trim = TRUE, col = "red")Lab 动手尝试 连续型Y
拆分ames数据
set.seed(123)
ames_split <- initial_split(ames, prop = ___, strata = ___)
ames_train <- training(___)
ames_test <- testing(___)
# Do the distributions line up?
ggplot(ames_train, aes(x = ___)) +
geom_line(stat = "density",
trim = TRUE) +
geom_line(data = ames_test,
stat = "density",
trim = TRUE, col = "red")Lab 分类数据抽样
# Variables + interactions 变量名称+交叉项
model_fn(Sale_Price ~ Neighborhood + Year_Sold + Neighborhood:Year_Sold, data = ames_train)
# Shorthand for all predictors 所有预测变量的简写
model_fn(Sale_Price ~ ., data = ames_train)# Usually, the variables must all be numeric 通常需要数值型,不能是其他的
features <- c("Year_Sold", "Longitude", "Latitude")
model_fn(x = ames_train[, features], y = ames_train$Sale_Price)直接engine: 交叉验证
meta engine: 交叉验证
# example of 10 fold CV in caret
caret_cv <- train(
Sale_Price ~ .,
data = ames_train,
method = "lm",
trControl = trainControl(method = "cv", number = 10) #<<
)meta engine: bootstrap自助法
超参控制模型复杂度,权衡bias偏差和variance方差
k近邻法
产生一些数据并画图
set.seed(123) # for reproducibility
x <- seq(from = 0, to = 2 * pi, length = 500)
y <- sin(x) + rnorm(length(x), sd = 0.3)
df <- data.frame(x, y) %>%
filter(x < 4.5)
ggplot() +
geom_point(data = df, aes(x, y), alpha = .3)Grid搜索
各种超参的组合
# resampling procedure 设置重复抽样:重复10次5折交叉验证
cv <- trainControl(method = "repeatedcv", number = 10, repeats = 10)
# define grid search 设置超参网格
hyper_grid <- expand.grid(k = seq(2, 150, by = 2))
# perform grid search with caret 网格搜索
knn_fit <- train(
x ~ y,
data = df,
method = "knn",
trControl = cv,
tuneGrid = hyper_grid
)结果作图
ggplot() +
geom_line(data = knn_fit$results, aes(k, RMSE)) +
geom_point(data = knn_fit$results, aes(k, RMSE)) +
geom_point(data = filter(knn_fit$results, k == as.numeric(knn_fit$bestTune)),
aes(k, RMSE),
shape = 21,
fill = "yellow",
color = "black",
stroke = 1,
size = 2) +
scale_y_continuous("Error (RMSE)")最佳k
## k
## 20 40
用最佳k拟合
# single model fit
best_model <- knnreg(y ~ x, k = knn_fit$bestTune$k, data = df)
df$predictions <- predict(best_model, df)
df$truth <- sin(df$x)
ggplot(df, aes(x, y)) +
geom_point(alpha = .3) +
geom_line(aes(y = truth), lty = "dashed", size = 1.5) +
geom_line(aes(x, predictions), size = 1.5, color = "dodgerblue") +
scale_y_continuous("Response", limits = c(-1.75, 1.75), expand = c(0, 0)) +
scale_x_continuous(limits = c(0, 4.5), expand = c(0, 0)) +
ggtitle("Optimal KNN model")整个过程大约2分钟
# 1. stratified sampling with the rsample package
set.seed(123)
split <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train <- training(split)
ames_test <- testing(split)
# 2. create a resampling method
cv <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 5
)
# 3. create a hyperparameter grid search
hyper_grid <- expand.grid(k = seq(2, 26, by = 2))
# 4. execute grid search with knn model
# use RMSE as preferred metric
knn_fit <- train(
Sale_Price ~ .,
data = ames_train,
method = "knn",
trControl = cv,
tuneGrid = hyper_grid,
metric = "RMSE"
)## k-Nearest Neighbors
##
## 2049 samples
## 80 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 1844, 1844, 1843, 1844, 1844, 1845, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 2 47206.74 0.6596344 31133.25
## 4 44771.97 0.6901038 29276.09
## 6 43846.05 0.7045166 28895.48
## 8 44181.13 0.7033657 29055.05
## 10 44332.46 0.7053884 29129.62
## 12 44486.34 0.7075253 29155.41
## 14 44790.79 0.7077073 29307.11
## 16 45119.37 0.7073844 29484.00
## 18 45366.02 0.7072968 29641.92
## 20 45746.63 0.7052851 29907.89
## 22 46187.92 0.7017539 30192.39
## 24 46605.01 0.6986611 30483.94
## 26 47023.74 0.6956317 30750.09
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 6.
# 43846.05
# plot cross validation results
ggplot(knn_fit$results, aes(k, RMSE)) +
geom_line() +
geom_point() +
scale_y_continuous(labels = scales::dollar)attrition数据
训练测试集划分
set.seed(123)
churn_split <- initial_split(churn, prop = 0.7, strata = "Attrition")
churn_train <- training(churn_split)
churn_test <- testing(churn_split)逻辑回归
关注accuracy
ctrl <- trainControl(
method = "cv",
number = 10
)
set.seed(123)
cv_model1 <- train(
Attrition ~ MonthlyIncome,
data = churn_train,
method = "glm",
family = "binomial",
trControl = ctrl,
)
set.seed(123)
cv_model2 <- train(
Attrition ~ .,
data = churn_train,
method = "glm",
family = "binomial",
trControl = ctrl,
)
# extract out of sample performance measures
summary(
resamples(
list(
model1 = cv_model1,
model2 = cv_model2
)
)
)$statistics$Accuracy## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## model1 0.8349515 0.8349515 0.8398379 0.8395076 0.8431373 0.8446602 0
## model2 0.8058252 0.8529412 0.8786765 0.8715662 0.8907767 0.9320388 0
关注ROC
ctrl <- trainControl(
method = "cv",
number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary # needed for AUC/ROC
)
set.seed(123)
cv_model3 <- train(
Attrition ~ MonthlyIncome,
data = churn_train,
method = "glm",
family = "binomial",
trControl = ctrl,
metric = "ROC"
)
set.seed(123)
cv_model4 <- train(
Attrition ~ .,
data = churn_train,
method = "glm",
family = "binomial",
trControl = ctrl,
metric = "ROC"
)
# extract out of sample performance measures
summary(
resamples(
list(
model3 = cv_model3,
model4 = cv_model4
)
)
)$statistics## $ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## model3 0.4165527 0.6321265 0.6588364 0.6598116 0.7421341 0.8006466 0
## model4 0.7592339 0.7850341 0.8324641 0.8227533 0.8471593 0.8879310 0
##
## $Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## model3 1.0000000 1.000000 1.0000000 1.0000000 1.0000000 1.0000000 0
## model4 0.8953488 0.936648 0.9593023 0.9536087 0.9738372 0.9885057 0
##
## $Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## model3 0.0000 0.0000000 0.0000 0.0000000 0.0000000 0.000 0
## model4 0.3125 0.3529412 0.4375 0.4419118 0.5147059 0.625 0
结果检视
## parameter Accuracy Kappa AccuracySD KappaSD
## 1 none 0.8715662 0.4539185 0.03610858 0.1452058
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.8227533 0.9536087 0.4419118 0.04194381 0.02792027 0.1119403
混淆矩阵与各种评价标准
下列代码给出的是训练集的各种指标,如果需要测试集的表现,把churn_train改成churn_test即可
# predict class
pred_class <- predict(cv_model2, churn_train)
# create confusion matrix
confusionMatrix(
data = relevel(pred_class, ref = "Yes"),
reference = relevel(churn_train$Attrition, ref = "Yes"),
mode = "everything"
)## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 83 20
## No 82 843
##
## Accuracy : 0.9008
## 95% CI : (0.8809, 0.9184)
## No Information Rate : 0.8395
## P-Value [Acc > NIR] : 8.982e-09
##
## Kappa : 0.5658
##
## Mcnemar's Test P-Value : 1.542e-09
##
## Sensitivity : 0.50303
## Specificity : 0.97683
## Pos Pred Value : 0.80583
## Neg Pred Value : 0.91135
## Precision : 0.80583
## Recall : 0.50303
## F1 : 0.61940
## Prevalence : 0.16051
## Detection Rate : 0.08074
## Detection Prevalence : 0.10019
## Balanced Accuracy : 0.73993
##
## 'Positive' Class : Yes
##
ROC图
library(ROCR)
# Compute predicted probabilities
m1_prob <- predict(cv_model1, churn_train, type = "prob")$Yes
m2_prob <- predict(cv_model2, churn_train, type = "prob")$Yes
# Compute AUC metrics for cv_model1 and cv_model2
perf1 <- prediction(m1_prob, churn_train$Attrition) %>%
performance(measure = "tpr", x.measure = "fpr")
perf2 <- prediction(m2_prob, churn_train$Attrition) %>%
performance(measure = "tpr", x.measure = "fpr")
# Plot ROC curves for cv_model1 and cv_model2
plot(perf1, col = "black", lty = 2)
plot(perf2, add = TRUE, col = "blue")
legend(0.8, 0.2, legend = c("cv_model1", "cv_model2"),
col = c("black", "blue"), lty = 2:1, cex = 0.6)library(ROCit)
# Compute predicted probabilities
m1_prob <- predict(cv_model1, churn_train, type = "prob")$Yes
m2_prob <- predict(cv_model2, churn_train, type = "prob")$Yes
# Compute AUC metrics for cv_model1 and cv_model2
ROCit_obj1 <- rocit(m1_prob,churn_train$Attrition)
ROCit_obj2 <- rocit(m2_prob,churn_train$Attrition)
# Plot ROC curves for cv_model1 and cv_model2
plot(ROCit_obj1)