数据导入和预处理

首先导入car.test.frame数据集,为了增加可读性,然后将其变量名转换成中文。

数据导入和预处理

特别说明:j48()函数的使用需要先安装Java软件

首先导入car.test.frame数据集,为了增加可读性,然后将其变量名转换成中文。

数据读取

library(tidyverse)
library(rpart)
data(car.test.frame)

car<-tibble(car.test.frame)
car
## # A tibble: 60 × 8
##    Price Country   Reliability Mileage Type  Weight Disp.    HP
##    <int> <fct>           <int>   <int> <fct>  <int> <int> <int>
##  1  8895 USA                 4      33 Small   2560    97   113
##  2  7402 USA                 2      33 Small   2345   114    90
##  3  6319 Korea               4      37 Small   1845    81    63
##  4  6635 Japan/USA           5      32 Small   2260    91    92
##  5  6599 Japan               5      32 Small   2440   113   103
##  6  8672 Mexico              4      26 Small   2285    97    82
##  7  7399 Japan/USA           5      33 Small   2275    97    90
##  8  7254 Korea               1      28 Small   2350    98    74
##  9  9599 Japan               5      25 Small   2295   109    90
## 10  5866 Japan              NA      34 Small   1900    73    73
## # ℹ 50 more rows
car$Mileage <- 100*4.546/(1.6*car$Mileage)
names(car) <- c("价格","产地","可靠性","油耗","类型","车重",
                        "发动机功率","净马力")
tibble(car)
## # A tibble: 60 × 8
##     价格 产地      可靠性  油耗 类型   车重 发动机功率 净马力
##    <int> <fct>      <int> <dbl> <fct> <int>      <int>  <int>
##  1  8895 USA            4  8.61 Small  2560         97    113
##  2  7402 USA            2  8.61 Small  2345        114     90
##  3  6319 Korea          4  7.68 Small  1845         81     63
##  4  6635 Japan/USA      5  8.88 Small  2260         91     92
##  5  6599 Japan          5  8.88 Small  2440        113    103
##  6  8672 Mexico         4 10.9  Small  2285         97     82
##  7  7399 Japan/USA      5  8.61 Small  2275         97     90
##  8  7254 Korea          1 10.1  Small  2350         98     74
##  9  9599 Japan          5 11.4  Small  2295        109     90
## 10  5866 Japan         NA  8.36 Small  1900         73     73
## # ℹ 50 more rows
summary(car) #获取数据集的概要信息
##       价格              产地        可靠性           油耗             类型   
##  Min.   : 5866   USA      :26   Min.   :1.000   Min.   : 7.679   Compact:15  
##  1st Qu.: 9932   Japan    :19   1st Qu.:2.000   1st Qu.:10.523   Large  : 3  
##  Median :12216   Japan/USA: 7   Median :3.000   Median :12.353   Medium :13  
##  Mean   :12616   Korea    : 3   Mean   :3.388   Mean   :11.962   Small  :13  
##  3rd Qu.:14933   Germany  : 2   3rd Qu.:5.000   3rd Qu.:13.530   Sporty : 9  
##  Max.   :24760   France   : 1   Max.   :5.000   Max.   :15.785   Van    : 7  
##                  (Other)  : 2   NA's   :11                                   
##       车重        发动机功率        净马力     
##  Min.   :1845   Min.   : 73.0   Min.   : 63.0  
##  1st Qu.:2571   1st Qu.:113.8   1st Qu.:101.5  
##  Median :2885   Median :144.5   Median :111.5  
##  Mean   :2901   Mean   :152.1   Mean   :122.3  
##  3rd Qu.:3231   3rd Qu.:180.0   3rd Qu.:142.8  
##  Max.   :3855   Max.   :305.0   Max.   :225.0  
## 

将目标变量转换成分类型变量

#传统方式利用现有变量生成新的变量
#Group_Mileage <- matrix(0,60,1) #设矩阵Group_Mileage用来存放新变量
#Group_Mileage[which(car$"油耗">=11.6)] <- "A"
#Group_Mileage[which(car$"油耗"<=9)] <- "C"
#Group_Mileage[which(Group_Mileage==0)] <- "B"

#car$"分组油耗" <- Group_Mileage
#car[1:10,c(4,9)]
#tibble(car)
#利用mutate()函数生成新的变量
library(dplyr)

car <- car %>%
  mutate(分组油耗 = case_when(
    油耗 >= 11.6 ~ "A",
    油耗 <= 9 ~ "C",
    TRUE ~ "B"  # 其他情况赋值为 "B"
  ))

# 查看修改后的前10行数据的指定列
print(car[1:10, c(4, 9)])  # 油耗在第四列,分组油耗是第九列
## # A tibble: 10 × 2
##     油耗 分组油耗
##    <dbl> <chr>   
##  1  8.61 C       
##  2  8.61 C       
##  3  7.68 C       
##  4  8.88 C       
##  5  8.88 C       
##  6 10.9  B       
##  7  8.61 C       
##  8 10.1  B       
##  9 11.4  B       
## 10  8.36 C
# 查看整个数据框
tibble(car)
## # A tibble: 60 × 9
##     价格 产地      可靠性  油耗 类型   车重 发动机功率 净马力 分组油耗
##    <int> <fct>      <int> <dbl> <fct> <int>      <int>  <int> <chr>   
##  1  8895 USA            4  8.61 Small  2560         97    113 C       
##  2  7402 USA            2  8.61 Small  2345        114     90 C       
##  3  6319 Korea          4  7.68 Small  1845         81     63 C       
##  4  6635 Japan/USA      5  8.88 Small  2260         91     92 C       
##  5  6599 Japan          5  8.88 Small  2440        113    103 C       
##  6  8672 Mexico         4 10.9  Small  2285         97     82 B       
##  7  7399 Japan/USA      5  8.61 Small  2275         97     90 C       
##  8  7254 Korea          1 10.1  Small  2350         98     74 B       
##  9  9599 Japan          5 11.4  Small  2295        109     90 B       
## 10  5866 Japan         NA  8.36 Small  1900         73     73 C       
## # ℹ 50 more rows

注:R对中文识别不太完善,所以我们还原变量名称为英文

names(car)=c("Price","Country","Reliability","Mileage",
"Type","Weight","Disp.","HP","Oil_Consumption") 
car$Oil_Consumption<-as.factor(car$Oil_Consumption)

随机森林插值

rfImpute函数:利用随机森林模型中的临近矩阵来对将要进行模型建立的预测数据中存在缺失值进行插值,经过不断的迭代修正所插入的缺失值,尽可能的得到最优的样本拟合值

library(randomForest)

tibble(car)
## # A tibble: 60 × 9
##    Price Country   Reliability Mileage Type  Weight Disp.    HP Oil_Consumption
##    <int> <fct>           <int>   <dbl> <fct>  <int> <int> <int> <fct>          
##  1  8895 USA                 4    8.61 Small   2560    97   113 C              
##  2  7402 USA                 2    8.61 Small   2345   114    90 C              
##  3  6319 Korea               4    7.68 Small   1845    81    63 C              
##  4  6635 Japan/USA           5    8.88 Small   2260    91    92 C              
##  5  6599 Japan               5    8.88 Small   2440   113   103 C              
##  6  8672 Mexico              4   10.9  Small   2285    97    82 B              
##  7  7399 Japan/USA           5    8.61 Small   2275    97    90 C              
##  8  7254 Korea               1   10.1  Small   2350    98    74 B              
##  9  9599 Japan               5   11.4  Small   2295   109    90 B              
## 10  5866 Japan              NA    8.36 Small   1900    73    73 C              
## # ℹ 50 more rows
car[which(complete.cases(car)==FALSE),]
## # A tibble: 11 × 9
##    Price Country Reliability Mileage Type    Weight Disp.    HP Oil_Consumption
##    <int> <fct>         <int>   <dbl> <fct>    <int> <int> <int> <fct>          
##  1  5866 Japan            NA    8.36 Small     1900    73    73 C              
##  2 10855 USA              NA   10.9  Sporty    2840   107    92 B              
##  3 13071 Japan            NA   10.1  Sporty    2485   109    97 B              
##  4 18900 Germany          NA   10.5  Compact   2670   121   108 B              
##  5 15930 France           NA   11.8  Compact   2575   116   120 A              
##  6  9999 Korea            NA   12.4  Medium    2885   143   110 A              
##  7 14495 USA              NA   13.5  Medium    3220   189   135 A              
##  8 13995 USA              NA   15.8  Van       3195   151   110 A              
##  9 14929 Japan            NA   14.2  Van       3415   143   107 A              
## 10 13949 Japan            NA   14.2  Van       3185   146   138 A              
## 11 14799 Japan            NA   15.0  Van       3690   146   106 A
set.seed(111)       # 设置随机数生成器初始值

# 对数据集car缺失值进行插值
car.imputed <- rfImpute(Oil_Consumption ~ .,data = car)   
## ntree      OOB      1      2      3
##   300:   5.00%  0.00% 12.50% 11.11%
## ntree      OOB      1      2      3
##   300:   6.67%  2.86%  6.25% 22.22%
## ntree      OOB      1      2      3
##   300:   6.67%  5.71% 12.50%  0.00%
## ntree      OOB      1      2      3
##   300:   3.33%  2.86%  6.25%  0.00%
## ntree      OOB      1      2      3
##   300:   5.00%  0.00% 12.50% 11.11%
car.imputed[which(complete.cases(car)==FALSE),]  
##    Oil_Consumption Price Country Reliability   Mileage    Type Weight Disp.  HP
## 10               C  5866   Japan    4.245769  8.356618   Small   1900    73  73
## 21               B 10855     USA    3.515819 10.927885  Sporty   2840   107  92
## 22               B 13071   Japan    3.524643 10.147321  Sporty   2485   109  97
## 23               B 18900 Germany    3.581661 10.523148 Compact   2670   121 108
## 34               A 15930  France    3.104609 11.838542 Compact   2575   116 120
## 45               A  9999   Korea    2.995523 12.353261  Medium   2885   143 110
## 49               A 14495     USA    2.996719 13.529762  Medium   3220   189 135
## 54               A 13995     USA    2.982237 15.784722     Van   3195   151 110
## 58               A 14929   Japan    2.965870 14.206250     Van   3415   143 107
## 59               A 13949   Japan    2.993517 14.206250     Van   3185   146 138
## 60               A 14799   Japan    2.980623 14.953947     Van   3690   146 106
car<-car.imputed 
tibble(car)
## # A tibble: 60 × 9
##    Oil_Consumption Price Country   Reliability Mileage Type  Weight Disp.    HP
##    <fct>           <int> <fct>           <dbl>   <dbl> <fct>  <int> <int> <int>
##  1 C                8895 USA              4       8.61 Small   2560    97   113
##  2 C                7402 USA              2       8.61 Small   2345   114    90
##  3 C                6319 Korea            4       7.68 Small   1845    81    63
##  4 C                6635 Japan/USA        5       8.88 Small   2260    91    92
##  5 C                6599 Japan            5       8.88 Small   2440   113   103
##  6 B                8672 Mexico           4      10.9  Small   2285    97    82
##  7 C                7399 Japan/USA        5       8.61 Small   2275    97    90
##  8 B                7254 Korea            1      10.1  Small   2350    98    74
##  9 B                9599 Japan            5      11.4  Small   2295   109    90
## 10 C                5866 Japan            4.25    8.36 Small   1900    73    73
## # ℹ 50 more rows

生成训练样本集和测试样本集

library(sampling)
a <- round(1/4*sum(car$Oil_Consumption=="A"))
b <- round(1/4*sum(car$Oil_Consumption=="B"))
c <- round(1/4*sum(car$Oil_Consumption=="C"))

a;b;c
## [1] 9
## [1] 4
## [1] 2
#使用strata函数对car.test.frame中的“分组油耗”变量进行分层抽样
set.seed(12345)
sub <- car %>% 
  strata(stratanames="Oil_Consumption",size=c(c,b,a),method="srswor")

tibble(sub)   #输出抽样信息
## # A tibble: 15 × 4
##    Oil_Consumption ID_unit  Prob Stratum
##    <fct>             <int> <dbl>   <int>
##  1 C                     3 0.222       1
##  2 C                    12 0.222       1
##  3 B                    19 0.25        2
##  4 B                    22 0.25        2
##  5 B                    25 0.25        2
##  6 B                    36 0.25        2
##  7 A                    16 0.257       3
##  8 A                    29 0.257       3
##  9 A                    31 0.257       3
## 10 A                    35 0.257       3
## 11 A                    47 0.257       3
## 12 A                    49 0.257       3
## 13 A                    54 0.257       3
## 14 A                    57 0.257       3
## 15 A                    59 0.257       3
Train_Car <- car[-sub$ID_unit,]   #生成训练样本集
Test_Car <- car[sub$ID_unit,]           #生成测试样本集
dim(Train_Car);dim(Test_Car)      #显示训练集和测试集行数,检查比例是否为3:1
## [1] 45  9
## [1] 15  9

利用ipred包中的bagging建立组合分类树模型

# bagging算法
library(ipred)
set.seed(42)

#公式指明目标变量和自变量
formula1=Oil_Consumption~Price+Country+Reliability+Type+Weight+Disp.+HP


#设置基学习器的参数
ctl = rpart.control(minsplit = 3, maxcompete = 3,
                    maxdepth = 3, cp =0.001, xval = 5)

#建立装袋模型
bagM1 = bagging(formula=formula1, data = Train_Car, 
                nbagg = 400, coob = TRUE, control = ctl)
names(bagM1)
## [1] "y"      "X"      "mtrees" "OOB"    "comb"   "err"    "call"
# 基于测试样本预测
cFit1 = predict(bagM1, Test_Car, type = "class")
# 计算混淆矩阵
confM1 = table(Test_Car$Oil_Consumption, cFit1)
error1 = (sum(confM1)-sum(diag(confM1)))/sum(confM1);error1
## [1] 0.1333333

boosting算法实现

# adaboost
library(adabag)
set.seed(42)

#公式指明目标变量和自变量
formula1=Oil_Consumption~Price+Country+Reliability+Type+Weight+Disp.+HP


#设置基学习器的参数
ctl = rpart.control(minsplit = 3, maxcompete = 3,
                    maxdepth = 3, cp =0.001, xval = 5)


boostM = boosting(formula=formula1, data =  Train_Car, boos = TRUE, 
                  mfinal = 400, coeflearn = "Freund", control = ctl)
names(boostM)
## [1] "formula"    "trees"      "weights"    "votes"      "prob"      
## [6] "class"      "importance" "terms"      "call"
tibble(boostM$importance)
## # A tibble: 7 × 1
##   `boostM$importance`
##                 <dbl>
## 1               15.0 
## 2               13.2 
## 3                7.31
## 4               36.6 
## 5                9.05
## 6                8.78
## 7               10.0
# 绘制各变量重要性柱状图
par(las=2)
barplot(sort(boostM$importance),
        cex.names = 0.6,
        names.arg = names(sort(boostM$importance)), horiz = T)

# 预测
adaboost_pred = predict.boosting(boostM, Test_Car, type = "class")
adaboost_pred$confusion
##                Observed Class
## Predicted Class A B C
##               A 7 0 0
##               B 2 4 0
##               C 0 0 2
error2<-adaboost_pred$error
error2
## [1] 0.1333333

随机森林算法实现

# randomForest
library(randomForest)

#公式指明目标变量和自变量
formula1=Oil_Consumption~Price+Country+Reliability+Type+Weight+Disp.+HP

set.seed(123456)
err<-as.numeric()
for(i in 1:7){  
  mtry_test <- randomForest(formula=formula1, data= Train_Car, mtry=i)    
  err<- append(err, mean(mtry_test$err.rate ))
} 
print(err)
## [1] 0.3673575 0.4284538 0.4286064 0.3892693 0.3989968 0.4086314 0.3998921
mtry<-which.min(err)
ntree_fit<-randomForest(formula=formula1, data= Train_Car, mtry=mtry, ntree=500)
names(ntree_fit)
##  [1] "call"            "type"            "predicted"       "err.rate"       
##  [5] "confusion"       "votes"           "oob.times"       "classes"        
##  [9] "importance"      "importanceSD"    "localImportance" "proximity"      
## [13] "ntree"           "mtry"            "forest"          "y"              
## [17] "test"            "inbag"           "terms"
oobList<-ntree_fit[["err.rate"]] %>% head()
oobList
##            OOB          A         B         C
## [1,] 0.3529412 0.00000000 0.6666667 0.6666667
## [2,] 0.3214286 0.06250000 0.6250000 0.7500000
## [3,] 0.3421053 0.04545455 0.6666667 0.8571429
## [4,] 0.3488372 0.12000000 0.5454545 0.8571429
## [5,] 0.3953488 0.16000000 0.6363636 0.8571429
## [6,] 0.3181818 0.04000000 0.5833333 0.8571429
# Rstudio 绘图不能显示中文解决方案
library(showtext)
showtext_auto()
plot(ntree_fit,lty=1:4,col=1:4,main="随机森林OOB错判率和决策树棵数")  #发现ntree取100时误差稳定
legend("topright",lty=1:4,col=1:4,c("OOB","A","B","C"))

#计算属性重要性
set.seed(12345)
rf<-randomForest(formula=formula1, data= Train_Car, 
                 mtry=mtry, ntree=400, importance=T)
names(rf)
##  [1] "call"            "type"            "predicted"       "err.rate"       
##  [5] "confusion"       "votes"           "oob.times"       "classes"        
##  [9] "importance"      "importanceSD"    "localImportance" "proximity"      
## [13] "ntree"           "mtry"            "forest"          "y"              
## [17] "test"            "inbag"           "terms"
rf$importance
##                      A            B            C MeanDecreaseAccuracy
## Price       0.05681833 -0.012071429  0.123750000          0.048152329
## Country     0.01916658 -0.009625992 -0.043708333          0.001571375
## Reliability 0.04390660 -0.018503968  0.012708333          0.022438986
## Type        0.03085724  0.015500000  0.048333333          0.027730990
## Weight      0.04464988 -0.017113095  0.107833333          0.036676868
## Disp.       0.09722219  0.027008929  0.071083333          0.071386699
## HP          0.05586813  0.020571429 -0.009166667          0.036238523
##             MeanDecreaseGini
## Price               3.757830
## Country             2.510776
## Reliability         3.074495
## Type                3.117578
## Weight              3.731996
## Disp.               4.392771
## HP                  3.196797
varImpPlot(rf,main = "The importance of variables")

#测试样本
pre_RF<-predict(rf,newdata=Test_Car)
confM3<-table(pre_RF,Test_Car$Oil_Consumption)

#计算准确率
Accuracy_RF<-sum(diag(confM3))/sum(confM3)*100
error3 = (100-Accuracy_RF)/100;error3
## [1] 0.1333333

结论

errorList = cbind(error1,error2,error3 )
pander::pander(errorList*100)
error1 error2 error3
13.33 13.33 13.33