大数据3-流程

准备数据

数据大小： 80个预测变量， 2930观测值

# access data
ames <- AmesHousing::make_ames()

# initial dimension
dim(ames)

## [1] 2930   81

因变量前几个

# response variable
head(ames$Sale_Price)

## [1] 215000 105000 172000 244000 189900 195500

数据前几行

# first few observations
head(ames)

## # A tibble: 6 x 81
##   MS_SubClass             MS_Zoning Lot_Frontage Lot_Area Street Alley Lot_Shape
##   <fct>                   <fct>            <dbl>    <int> <fct>  <fct> <fct>    
## 1 One_Story_1946_and_New~ Resident~          141    31770 Pave   No_A~ Slightly~
## 2 One_Story_1946_and_New~ Resident~           80    11622 Pave   No_A~ Regular  
## 3 One_Story_1946_and_New~ Resident~           81    14267 Pave   No_A~ Slightly~
## 4 One_Story_1946_and_New~ Resident~           93    11160 Pave   No_A~ Regular  
## 5 Two_Story_1946_and_New~ Resident~           74    13830 Pave   No_A~ Slightly~
## 6 Two_Story_1946_and_New~ Resident~           78     9978 Pave   No_A~ Slightly~
## # i 74 more variables: Land_Contour <fct>, Utilities <fct>, Lot_Config <fct>,
## #   Land_Slope <fct>, Neighborhood <fct>, Condition_1 <fct>, Condition_2 <fct>,
## #   Bldg_Type <fct>, House_Style <fct>, Overall_Qual <fct>, Overall_Cond <fct>,
## #   Year_Built <int>, Year_Remod_Add <int>, Roof_Style <fct>, Roof_Matl <fct>,
## #   Exterior_1st <fct>, Exterior_2nd <fct>, Mas_Vnr_Type <fct>,
## #   Mas_Vnr_Area <dbl>, Exter_Qual <fct>, Exter_Cond <fct>, Foundation <fct>,
## #   Bsmt_Qual <fct>, Bsmt_Cond <fct>, Bsmt_Exposure <fct>, ...

各变量基本情况

summary(ames)

str(ames)

离职数据

library(modeldata)

## Warning: 程辑包'modeldata'是用R版本4.3.3 来建造的

## 
## 载入程辑包：'modeldata'

## The following object is masked _by_ '.GlobalEnv':
## 
##     ames

data(attrition)
churn <- as_tibble(attrition)

30个变量 1470观测值

dim(churn)

## [1] 1470   31

head(churn)

## # A tibble: 6 x 31
##     Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education
##   <int> <fct>     <fct>              <int> <fct>                 <int> <ord>    
## 1    41 Yes       Travel_Rarely       1102 Sales                     1 College  
## 2    49 No        Travel_Freque~       279 Research_~                8 Below_Co~
## 3    37 Yes       Travel_Rarely       1373 Research_~                2 College  
## 4    33 No        Travel_Freque~      1392 Research_~                3 Master   
## 5    27 No        Travel_Rarely        591 Research_~                2 Below_Co~
## 6    32 No        Travel_Freque~      1005 Research_~                2 College  
## # i 24 more variables: EducationField <fct>, EnvironmentSatisfaction <ord>,
## #   Gender <fct>, HourlyRate <int>, JobInvolvement <ord>, JobLevel <int>,
## #   JobRole <fct>, JobSatisfaction <ord>, MaritalStatus <fct>,
## #   MonthlyIncome <int>, MonthlyRate <int>, NumCompaniesWorked <int>,
## #   OverTime <fct>, PercentSalaryHike <int>, PerformanceRating <ord>,
## #   RelationshipSatisfaction <ord>, StockOptionLevel <int>,
## #   TotalWorkingYears <int>, TrainingTimesLastYear <int>, ...

y的分布： 16%离职

table(churn$Attrition) %>% prop.table()

## 
##        No       Yes 
## 0.8387755 0.1612245

数据拆分

拆分钻石数据

set.seed(123) # for reproducibility， 设置种子，可再现再生
split <- initial_split(diamonds, strata = "price", prop = 0.7)
train <- training(split)
test  <- testing(split)

检查训练集与测试集密度函数

# Do the distributions line up? 
ggplot(train, aes(x = price)) + 
  geom_line(stat = "density", 
            trim = TRUE) + 
  geom_line(data = test, 
            stat = "density", 
            trim = TRUE, col = "red")

Lab 动手尝试连续型Y

拆分ames数据

set.seed(123)
ames_split <- initial_split(ames, prop = ___, strata = ___)
ames_train <- training(___)
ames_test  <- testing(___)

# Do the distributions line up? 
ggplot(ames_train, aes(x = ___)) + 
  geom_line(stat = "density", 
            trim = TRUE) + 
  geom_line(data = ames_test, 
            stat = "density", 
            trim = TRUE, col = "red")

Lab 分类数据抽样

set.seed(123)
churn_split <- initial_split(churn, prop = ___, strata = ___)
churn_train <- training(___)
churn_test  <- testing(___)

# consistent response ratio between train & test
table(___) %>% prop.table()
table(___) %>% prop.table()

模型 interfaces

Formula 公式+整个数据 interface

# Variables + interactions 变量名称+交叉项
model_fn(Sale_Price ~ Neighborhood + Year_Sold + Neighborhood:Year_Sold, data = ames_train)

# Shorthand for all predictors 所有预测变量的简写
model_fn(Sale_Price ~ ., data = ames_train)

XY interface： x和y数据

# Usually, the variables must all be numeric 通常需要数值型，不能是其他的
features <- c("Year_Sold", "Longitude", "Latitude")
model_fn(x = ames_train[, features], y = ames_train$Sale_Price)

变量名称 interface

# specify x & y by character strings字符串形式
model_fn(
  x = c("Year_Sold", "Longitude", "Latitude"),
  y = "Sale_Price",
  data = ames.h2o
  )

许多 engines

direct直接engine 和meta engine

线性回归的engine

lm_lm    <- lm(Sale_Price ~ ., data = ames_train)
lm_glm   <- glm(Sale_Price ~ ., data = ames_train, family = gaussian)
lm_caret <- train(Sale_Price ~ ., data = ames_train, method = "lm")

lm() and glm() 直接
caret::train() meta engine

随机抽样

直接engine: 交叉验证

glmnet_cv <- cv.glmnet(x, y, alpha=0, nfolds=10, lambda = 10^seq(-5,5,length=100))

meta engine: 交叉验证

# example of 10 fold CV in caret
caret_cv <- train(
  Sale_Price ~ .,
  data = ames_train,
  method = "lm",
  trControl = trainControl(method = "cv",  number = 10) #<<
  )

meta engine: bootstrap自助法

# example of 10 fold CV in caret
caret_cv <- train(
  Sale_Price ~ .,
  data = ames_train,
  method = "lm",
  trControl = trainControl(method = "boot",  number = 10) #<<
  )

调整超参

超参控制模型复杂度，权衡bias偏差和variance方差

k近邻法

k大：variance方差小bias偏差大
k小：variance方差大bias偏差小

产生一些数据并画图

set.seed(123)  # for reproducibility
x <- seq(from = 0, to = 2 * pi, length = 500)
y <- sin(x) + rnorm(length(x), sd = 0.3)
df <- data.frame(x, y) %>%
  filter(x < 4.5)
ggplot() +
  geom_point(data = df, aes(x, y), alpha = .3)

Grid搜索

各种超参的组合

# resampling procedure 设置重复抽样：重复10次5折交叉验证
cv <- trainControl(method = "repeatedcv", number = 10, repeats = 10)

# define grid search 设置超参网格
hyper_grid <- expand.grid(k = seq(2, 150, by = 2))

# perform grid search with caret 网格搜索
knn_fit <- train(
  x ~ y, 
  data = df, 
  method = "knn", 
  trControl = cv, 
  tuneGrid = hyper_grid
  )

结果作图

ggplot() +
  geom_line(data = knn_fit$results, aes(k, RMSE)) +
  geom_point(data = knn_fit$results, aes(k, RMSE)) +
  geom_point(data = filter(knn_fit$results, k == as.numeric(knn_fit$bestTune)),
             aes(k, RMSE),
             shape = 21,
             fill = "yellow",
             color = "black",
             stroke = 1,
             size = 2) +
  scale_y_continuous("Error (RMSE)")

最佳k

print(knn_fit$bestTune)

##     k
## 20 40

用最佳k拟合

# single model fit
best_model <- knnreg(y ~ x, k = knn_fit$bestTune$k, data = df)
df$predictions <- predict(best_model, df)
df$truth <- sin(df$x)

ggplot(df, aes(x, y)) +
  geom_point(alpha = .3) +
  geom_line(aes(y = truth), lty = "dashed", size = 1.5) +
  geom_line(aes(x, predictions), size = 1.5, color = "dodgerblue") +
  scale_y_continuous("Response", limits = c(-1.75, 1.75), expand = c(0, 0)) +
  scale_x_continuous(limits = c(0, 4.5), expand = c(0, 0)) +
  ggtitle("Optimal KNN model")

回归问题整个流程

拆分Train and test
抽样
超级参数grid
调参
模型评估

整个过程大约2分钟

# 1. stratified sampling with the rsample package
set.seed(123)
split  <- initial_split(ames, prop = 0.7, strata = "Sale_Price")
ames_train  <- training(split)
ames_test   <- testing(split)

# 2. create a resampling method
cv <- trainControl(
  method = "repeatedcv", 
  number = 10, 
  repeats = 5
  )

# 3. create a hyperparameter grid search
hyper_grid <- expand.grid(k = seq(2, 26, by = 2))

# 4. execute grid search with knn model
#    use RMSE as preferred metric
knn_fit <- train(
  Sale_Price ~ ., 
  data = ames_train, 
  method = "knn", 
  trControl = cv, 
  tuneGrid = hyper_grid,
  metric = "RMSE"
  )

# 5. evaluate results
# print model results
knn_fit

## k-Nearest Neighbors 
## 
## 2049 samples
##   80 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 1844, 1844, 1843, 1844, 1844, 1845, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    2  47206.74  0.6596344  31133.25
##    4  44771.97  0.6901038  29276.09
##    6  43846.05  0.7045166  28895.48
##    8  44181.13  0.7033657  29055.05
##   10  44332.46  0.7053884  29129.62
##   12  44486.34  0.7075253  29155.41
##   14  44790.79  0.7077073  29307.11
##   16  45119.37  0.7073844  29484.00
##   18  45366.02  0.7072968  29641.92
##   20  45746.63  0.7052851  29907.89
##   22  46187.92  0.7017539  30192.39
##   24  46605.01  0.6986611  30483.94
##   26  47023.74  0.6956317  30750.09
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 6.

# 43846.05
# plot cross validation results
ggplot(knn_fit$results, aes(k, RMSE)) + 
  geom_line() +
  geom_point() +
  scale_y_continuous(labels = scales::dollar)

分类问题整个流程：逻辑回归回顾与caret执行

attrition数据

library(modeldata)
data(attrition)
churn <- as_tibble(attrition)

训练测试集划分

set.seed(123)
churn_split <- initial_split(churn, prop = 0.7, strata = "Attrition")
churn_train <- training(churn_split)
churn_test  <- testing(churn_split)

逻辑回归

关注accuracy

ctrl <- trainControl(
  method = "cv",
  number = 10
)

set.seed(123)
cv_model1 <- train(
  Attrition ~ MonthlyIncome, 
  data = churn_train, 
  method = "glm",
  family = "binomial",
  trControl = ctrl,
)

set.seed(123)
cv_model2 <- train(
  Attrition ~ ., 
  data = churn_train, 
  method = "glm",
  family = "binomial",
  trControl = ctrl,
)

# extract out of sample performance measures
summary(
  resamples(
    list(
      model1 = cv_model1, 
      model2 = cv_model2
    )
  )
)$statistics$Accuracy

##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## model1 0.8349515 0.8349515 0.8398379 0.8395076 0.8431373 0.8446602    0
## model2 0.8058252 0.8529412 0.8786765 0.8715662 0.8907767 0.9320388    0

关注ROC

ctrl <- trainControl(
  method = "cv",
  number = 10,
  classProbs = TRUE,
  summaryFunction = twoClassSummary # needed for AUC/ROC
)

set.seed(123)
cv_model3 <- train(
  Attrition ~ MonthlyIncome, 
  data = churn_train, 
  method = "glm",
  family = "binomial",
  trControl = ctrl,
  metric = "ROC"
)

set.seed(123)
cv_model4 <- train(
  Attrition ~ ., 
  data = churn_train, 
  method = "glm",
  family = "binomial",
  trControl = ctrl,
  metric = "ROC"
)

# extract out of sample performance measures
summary(
  resamples(
    list(
      model3 = cv_model3, 
      model4 = cv_model4
    )
  )
)$statistics

## $ROC
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## model3 0.4165527 0.6321265 0.6588364 0.6598116 0.7421341 0.8006466    0
## model4 0.7592339 0.7850341 0.8324641 0.8227533 0.8471593 0.8879310    0
## 
## $Sens
##             Min.  1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## model3 1.0000000 1.000000 1.0000000 1.0000000 1.0000000 1.0000000    0
## model4 0.8953488 0.936648 0.9593023 0.9536087 0.9738372 0.9885057    0
## 
## $Spec
##          Min.   1st Qu. Median      Mean   3rd Qu.  Max. NA's
## model3 0.0000 0.0000000 0.0000 0.0000000 0.0000000 0.000    0
## model4 0.3125 0.3529412 0.4375 0.4419118 0.5147059 0.625    0

结果检视

cv_model2$results

##   parameter  Accuracy     Kappa AccuracySD   KappaSD
## 1      none 0.8715662 0.4539185 0.03610858 0.1452058

cv_model4$results

##   parameter       ROC      Sens      Spec      ROCSD     SensSD    SpecSD
## 1      none 0.8227533 0.9536087 0.4419118 0.04194381 0.02792027 0.1119403

混淆矩阵与各种评价标准

下列代码给出的是训练集的各种指标，如果需要测试集的表现，把churn_train改成churn_test即可

# predict class
pred_class <- predict(cv_model2, churn_train)

# create confusion matrix
confusionMatrix(
  data = relevel(pred_class, ref = "Yes"), 
  reference = relevel(churn_train$Attrition, ref = "Yes"),
  mode = "everything"
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Yes  No
##        Yes  83  20
##        No   82 843
##                                           
##                Accuracy : 0.9008          
##                  95% CI : (0.8809, 0.9184)
##     No Information Rate : 0.8395          
##     P-Value [Acc > NIR] : 8.982e-09       
##                                           
##                   Kappa : 0.5658          
##                                           
##  Mcnemar's Test P-Value : 1.542e-09       
##                                           
##             Sensitivity : 0.50303         
##             Specificity : 0.97683         
##          Pos Pred Value : 0.80583         
##          Neg Pred Value : 0.91135         
##               Precision : 0.80583         
##                  Recall : 0.50303         
##                      F1 : 0.61940         
##              Prevalence : 0.16051         
##          Detection Rate : 0.08074         
##    Detection Prevalence : 0.10019         
##       Balanced Accuracy : 0.73993         
##                                           
##        'Positive' Class : Yes             
##

ROC图

library(ROCR)

# Compute predicted probabilities
m1_prob <- predict(cv_model1, churn_train, type = "prob")$Yes
m2_prob <- predict(cv_model2, churn_train, type = "prob")$Yes

# Compute AUC metrics for cv_model1 and cv_model2
perf1 <- prediction(m1_prob, churn_train$Attrition) %>%
  performance(measure = "tpr", x.measure = "fpr")
perf2 <- prediction(m2_prob, churn_train$Attrition) %>%
  performance(measure = "tpr", x.measure = "fpr")

# Plot ROC curves for cv_model1 and cv_model2
plot(perf1, col = "black", lty = 2)
plot(perf2, add = TRUE, col = "blue")
legend(0.8, 0.2, legend = c("cv_model1", "cv_model2"),
       col = c("black", "blue"), lty = 2:1, cex = 0.6)

library(ROCit)

# Compute predicted probabilities
m1_prob <- predict(cv_model1, churn_train, type = "prob")$Yes
m2_prob <- predict(cv_model2, churn_train, type = "prob")$Yes

# Compute AUC metrics for cv_model1 and cv_model2
ROCit_obj1 <- rocit(m1_prob,churn_train$Attrition)
ROCit_obj2 <- rocit(m2_prob,churn_train$Attrition)

# Plot ROC curves for cv_model1 and cv_model2
plot(ROCit_obj1)

plot(ROCit_obj2)