特别说明:j48()函数的使用需要先安装Java软件
首先导入car.test.frame数据集,为了增加可读性,然后将其变量名转换成中文。library(tidyverse)
library(rpart)
data(car.test.frame)
car<-tibble(car.test.frame)
car
## # A tibble: 60 × 8
## Price Country Reliability Mileage Type Weight Disp. HP
## <int> <fct> <int> <int> <fct> <int> <int> <int>
## 1 8895 USA 4 33 Small 2560 97 113
## 2 7402 USA 2 33 Small 2345 114 90
## 3 6319 Korea 4 37 Small 1845 81 63
## 4 6635 Japan/USA 5 32 Small 2260 91 92
## 5 6599 Japan 5 32 Small 2440 113 103
## 6 8672 Mexico 4 26 Small 2285 97 82
## 7 7399 Japan/USA 5 33 Small 2275 97 90
## 8 7254 Korea 1 28 Small 2350 98 74
## 9 9599 Japan 5 25 Small 2295 109 90
## 10 5866 Japan NA 34 Small 1900 73 73
## # ℹ 50 more rows
car$Mileage <- 100*4.546/(1.6*car$Mileage)
names(car) <- c("价格","产地","可靠性","油耗","类型","车重",
"发动机功率","净马力")
tibble(car)
## # A tibble: 60 × 8
## 价格 产地 可靠性 油耗 类型 车重 发动机功率 净马力
## <int> <fct> <int> <dbl> <fct> <int> <int> <int>
## 1 8895 USA 4 8.61 Small 2560 97 113
## 2 7402 USA 2 8.61 Small 2345 114 90
## 3 6319 Korea 4 7.68 Small 1845 81 63
## 4 6635 Japan/USA 5 8.88 Small 2260 91 92
## 5 6599 Japan 5 8.88 Small 2440 113 103
## 6 8672 Mexico 4 10.9 Small 2285 97 82
## 7 7399 Japan/USA 5 8.61 Small 2275 97 90
## 8 7254 Korea 1 10.1 Small 2350 98 74
## 9 9599 Japan 5 11.4 Small 2295 109 90
## 10 5866 Japan NA 8.36 Small 1900 73 73
## # ℹ 50 more rows
summary(car) #获取数据集的概要信息
## 价格 产地 可靠性 油耗 类型
## Min. : 5866 USA :26 Min. :1.000 Min. : 7.679 Compact:15
## 1st Qu.: 9932 Japan :19 1st Qu.:2.000 1st Qu.:10.523 Large : 3
## Median :12216 Japan/USA: 7 Median :3.000 Median :12.353 Medium :13
## Mean :12616 Korea : 3 Mean :3.388 Mean :11.962 Small :13
## 3rd Qu.:14933 Germany : 2 3rd Qu.:5.000 3rd Qu.:13.530 Sporty : 9
## Max. :24760 France : 1 Max. :5.000 Max. :15.785 Van : 7
## (Other) : 2 NA's :11
## 车重 发动机功率 净马力
## Min. :1845 Min. : 73.0 Min. : 63.0
## 1st Qu.:2571 1st Qu.:113.8 1st Qu.:101.5
## Median :2885 Median :144.5 Median :111.5
## Mean :2901 Mean :152.1 Mean :122.3
## 3rd Qu.:3231 3rd Qu.:180.0 3rd Qu.:142.8
## Max. :3855 Max. :305.0 Max. :225.0
##
#传统方式利用现有变量生成新的变量
#Group_Mileage <- matrix(0,60,1) #设矩阵Group_Mileage用来存放新变量
#Group_Mileage[which(car$"油耗">=11.6)] <- "A"
#Group_Mileage[which(car$"油耗"<=9)] <- "C"
#Group_Mileage[which(Group_Mileage==0)] <- "B"
#car$"分组油耗" <- Group_Mileage
#car[1:10,c(4,9)]
#tibble(car)
#利用mutate()函数生成新的变量
library(dplyr)
car <- car %>%
mutate(分组油耗 = case_when(
油耗 >= 11.6 ~ "A",
油耗 <= 9 ~ "C",
TRUE ~ "B" # 其他情况赋值为 "B"
))
# 查看修改后的前10行数据的指定列
print(car[1:10, c(4, 9)]) # 油耗在第四列,分组油耗是第九列
## # A tibble: 10 × 2
## 油耗 分组油耗
## <dbl> <chr>
## 1 8.61 C
## 2 8.61 C
## 3 7.68 C
## 4 8.88 C
## 5 8.88 C
## 6 10.9 B
## 7 8.61 C
## 8 10.1 B
## 9 11.4 B
## 10 8.36 C
# 查看整个数据框
tibble(car)
## # A tibble: 60 × 9
## 价格 产地 可靠性 油耗 类型 车重 发动机功率 净马力 分组油耗
## <int> <fct> <int> <dbl> <fct> <int> <int> <int> <chr>
## 1 8895 USA 4 8.61 Small 2560 97 113 C
## 2 7402 USA 2 8.61 Small 2345 114 90 C
## 3 6319 Korea 4 7.68 Small 1845 81 63 C
## 4 6635 Japan/USA 5 8.88 Small 2260 91 92 C
## 5 6599 Japan 5 8.88 Small 2440 113 103 C
## 6 8672 Mexico 4 10.9 Small 2285 97 82 B
## 7 7399 Japan/USA 5 8.61 Small 2275 97 90 C
## 8 7254 Korea 1 10.1 Small 2350 98 74 B
## 9 9599 Japan 5 11.4 Small 2295 109 90 B
## 10 5866 Japan NA 8.36 Small 1900 73 73 C
## # ℹ 50 more rows
注:R对中文识别不太完善,所以我们还原变量名称为英文
names(car)=c("Price","Country","Reliability","Mileage",
"Type","Weight","Disp.","HP","Oil_Consumption")
car$Oil_Consumption<-as.factor(car$Oil_Consumption)
rfImpute函数:利用随机森林模型中的临近矩阵来对将要进行模型建立的预测数据中存在缺失值进行插值,经过不断的迭代修正所插入的缺失值,尽可能的得到最优的样本拟合值
library(randomForest)
tibble(car)
## # A tibble: 60 × 9
## Price Country Reliability Mileage Type Weight Disp. HP Oil_Consumption
## <int> <fct> <int> <dbl> <fct> <int> <int> <int> <fct>
## 1 8895 USA 4 8.61 Small 2560 97 113 C
## 2 7402 USA 2 8.61 Small 2345 114 90 C
## 3 6319 Korea 4 7.68 Small 1845 81 63 C
## 4 6635 Japan/USA 5 8.88 Small 2260 91 92 C
## 5 6599 Japan 5 8.88 Small 2440 113 103 C
## 6 8672 Mexico 4 10.9 Small 2285 97 82 B
## 7 7399 Japan/USA 5 8.61 Small 2275 97 90 C
## 8 7254 Korea 1 10.1 Small 2350 98 74 B
## 9 9599 Japan 5 11.4 Small 2295 109 90 B
## 10 5866 Japan NA 8.36 Small 1900 73 73 C
## # ℹ 50 more rows
car[which(complete.cases(car)==FALSE),]
## # A tibble: 11 × 9
## Price Country Reliability Mileage Type Weight Disp. HP Oil_Consumption
## <int> <fct> <int> <dbl> <fct> <int> <int> <int> <fct>
## 1 5866 Japan NA 8.36 Small 1900 73 73 C
## 2 10855 USA NA 10.9 Sporty 2840 107 92 B
## 3 13071 Japan NA 10.1 Sporty 2485 109 97 B
## 4 18900 Germany NA 10.5 Compact 2670 121 108 B
## 5 15930 France NA 11.8 Compact 2575 116 120 A
## 6 9999 Korea NA 12.4 Medium 2885 143 110 A
## 7 14495 USA NA 13.5 Medium 3220 189 135 A
## 8 13995 USA NA 15.8 Van 3195 151 110 A
## 9 14929 Japan NA 14.2 Van 3415 143 107 A
## 10 13949 Japan NA 14.2 Van 3185 146 138 A
## 11 14799 Japan NA 15.0 Van 3690 146 106 A
set.seed(111) # 设置随机数生成器初始值
# 对数据集car缺失值进行插值
car.imputed <- rfImpute(Oil_Consumption ~ .,data = car)
## ntree OOB 1 2 3
## 300: 5.00% 0.00% 12.50% 11.11%
## ntree OOB 1 2 3
## 300: 6.67% 2.86% 6.25% 22.22%
## ntree OOB 1 2 3
## 300: 6.67% 5.71% 12.50% 0.00%
## ntree OOB 1 2 3
## 300: 3.33% 2.86% 6.25% 0.00%
## ntree OOB 1 2 3
## 300: 5.00% 0.00% 12.50% 11.11%
car.imputed[which(complete.cases(car)==FALSE),]
## Oil_Consumption Price Country Reliability Mileage Type Weight Disp. HP
## 10 C 5866 Japan 4.245769 8.356618 Small 1900 73 73
## 21 B 10855 USA 3.515819 10.927885 Sporty 2840 107 92
## 22 B 13071 Japan 3.524643 10.147321 Sporty 2485 109 97
## 23 B 18900 Germany 3.581661 10.523148 Compact 2670 121 108
## 34 A 15930 France 3.104609 11.838542 Compact 2575 116 120
## 45 A 9999 Korea 2.995523 12.353261 Medium 2885 143 110
## 49 A 14495 USA 2.996719 13.529762 Medium 3220 189 135
## 54 A 13995 USA 2.982237 15.784722 Van 3195 151 110
## 58 A 14929 Japan 2.965870 14.206250 Van 3415 143 107
## 59 A 13949 Japan 2.993517 14.206250 Van 3185 146 138
## 60 A 14799 Japan 2.980623 14.953947 Van 3690 146 106
car<-car.imputed
tibble(car)
## # A tibble: 60 × 9
## Oil_Consumption Price Country Reliability Mileage Type Weight Disp. HP
## <fct> <int> <fct> <dbl> <dbl> <fct> <int> <int> <int>
## 1 C 8895 USA 4 8.61 Small 2560 97 113
## 2 C 7402 USA 2 8.61 Small 2345 114 90
## 3 C 6319 Korea 4 7.68 Small 1845 81 63
## 4 C 6635 Japan/USA 5 8.88 Small 2260 91 92
## 5 C 6599 Japan 5 8.88 Small 2440 113 103
## 6 B 8672 Mexico 4 10.9 Small 2285 97 82
## 7 C 7399 Japan/USA 5 8.61 Small 2275 97 90
## 8 B 7254 Korea 1 10.1 Small 2350 98 74
## 9 B 9599 Japan 5 11.4 Small 2295 109 90
## 10 C 5866 Japan 4.25 8.36 Small 1900 73 73
## # ℹ 50 more rows
library(sampling)
a <- round(1/4*sum(car$Oil_Consumption=="A"))
b <- round(1/4*sum(car$Oil_Consumption=="B"))
c <- round(1/4*sum(car$Oil_Consumption=="C"))
a;b;c
## [1] 9
## [1] 4
## [1] 2
#使用strata函数对car.test.frame中的“分组油耗”变量进行分层抽样
set.seed(12345)
sub <- car %>%
strata(stratanames="Oil_Consumption",size=c(c,b,a),method="srswor")
tibble(sub) #输出抽样信息
## # A tibble: 15 × 4
## Oil_Consumption ID_unit Prob Stratum
## <fct> <int> <dbl> <int>
## 1 C 3 0.222 1
## 2 C 12 0.222 1
## 3 B 19 0.25 2
## 4 B 22 0.25 2
## 5 B 25 0.25 2
## 6 B 36 0.25 2
## 7 A 16 0.257 3
## 8 A 29 0.257 3
## 9 A 31 0.257 3
## 10 A 35 0.257 3
## 11 A 47 0.257 3
## 12 A 49 0.257 3
## 13 A 54 0.257 3
## 14 A 57 0.257 3
## 15 A 59 0.257 3
Train_Car <- car[-sub$ID_unit,] #生成训练样本集
Test_Car <- car[sub$ID_unit,] #生成测试样本集
dim(Train_Car);dim(Test_Car) #显示训练集和测试集行数,检查比例是否为3:1
## [1] 45 9
## [1] 15 9
# bagging算法
library(ipred)
set.seed(42)
#公式指明目标变量和自变量
formula1=Oil_Consumption~Price+Country+Reliability+Type+Weight+Disp.+HP
#设置基学习器的参数
ctl = rpart.control(minsplit = 3, maxcompete = 3,
maxdepth = 3, cp =0.001, xval = 5)
#建立装袋模型
bagM1 = bagging(formula=formula1, data = Train_Car,
nbagg = 400, coob = TRUE, control = ctl)
names(bagM1)
## [1] "y" "X" "mtrees" "OOB" "comb" "err" "call"
# 基于测试样本预测
cFit1 = predict(bagM1, Test_Car, type = "class")
# 计算混淆矩阵
confM1 = table(Test_Car$Oil_Consumption, cFit1)
error1 = (sum(confM1)-sum(diag(confM1)))/sum(confM1);error1
## [1] 0.1333333
# adaboost
library(adabag)
set.seed(42)
#公式指明目标变量和自变量
formula1=Oil_Consumption~Price+Country+Reliability+Type+Weight+Disp.+HP
#设置基学习器的参数
ctl = rpart.control(minsplit = 3, maxcompete = 3,
maxdepth = 3, cp =0.001, xval = 5)
boostM = boosting(formula=formula1, data = Train_Car, boos = TRUE,
mfinal = 400, coeflearn = "Freund", control = ctl)
names(boostM)
## [1] "formula" "trees" "weights" "votes" "prob"
## [6] "class" "importance" "terms" "call"
tibble(boostM$importance)
## # A tibble: 7 × 1
## `boostM$importance`
## <dbl>
## 1 15.0
## 2 13.2
## 3 7.31
## 4 36.6
## 5 9.05
## 6 8.78
## 7 10.0
# 绘制各变量重要性柱状图
par(las=2)
barplot(sort(boostM$importance),
cex.names = 0.6,
names.arg = names(sort(boostM$importance)), horiz = T)
# 预测
adaboost_pred = predict.boosting(boostM, Test_Car, type = "class")
adaboost_pred$confusion
## Observed Class
## Predicted Class A B C
## A 7 0 0
## B 2 4 0
## C 0 0 2
error2<-adaboost_pred$error
error2
## [1] 0.1333333
# randomForest
library(randomForest)
#公式指明目标变量和自变量
formula1=Oil_Consumption~Price+Country+Reliability+Type+Weight+Disp.+HP
set.seed(123456)
err<-as.numeric()
for(i in 1:7){
mtry_test <- randomForest(formula=formula1, data= Train_Car, mtry=i)
err<- append(err, mean(mtry_test$err.rate ))
}
print(err)
## [1] 0.3673575 0.4284538 0.4286064 0.3892693 0.3989968 0.4086314 0.3998921
mtry<-which.min(err)
ntree_fit<-randomForest(formula=formula1, data= Train_Car, mtry=mtry, ntree=500)
names(ntree_fit)
## [1] "call" "type" "predicted" "err.rate"
## [5] "confusion" "votes" "oob.times" "classes"
## [9] "importance" "importanceSD" "localImportance" "proximity"
## [13] "ntree" "mtry" "forest" "y"
## [17] "test" "inbag" "terms"
oobList<-ntree_fit[["err.rate"]] %>% head()
oobList
## OOB A B C
## [1,] 0.3529412 0.00000000 0.6666667 0.6666667
## [2,] 0.3214286 0.06250000 0.6250000 0.7500000
## [3,] 0.3421053 0.04545455 0.6666667 0.8571429
## [4,] 0.3488372 0.12000000 0.5454545 0.8571429
## [5,] 0.3953488 0.16000000 0.6363636 0.8571429
## [6,] 0.3181818 0.04000000 0.5833333 0.8571429
# Rstudio 绘图不能显示中文解决方案
library(showtext)
showtext_auto()
plot(ntree_fit,lty=1:4,col=1:4,main="随机森林OOB错判率和决策树棵数") #发现ntree取100时误差稳定
legend("topright",lty=1:4,col=1:4,c("OOB","A","B","C"))
#计算属性重要性
set.seed(12345)
rf<-randomForest(formula=formula1, data= Train_Car,
mtry=mtry, ntree=400, importance=T)
names(rf)
## [1] "call" "type" "predicted" "err.rate"
## [5] "confusion" "votes" "oob.times" "classes"
## [9] "importance" "importanceSD" "localImportance" "proximity"
## [13] "ntree" "mtry" "forest" "y"
## [17] "test" "inbag" "terms"
rf$importance
## A B C MeanDecreaseAccuracy
## Price 0.05681833 -0.012071429 0.123750000 0.048152329
## Country 0.01916658 -0.009625992 -0.043708333 0.001571375
## Reliability 0.04390660 -0.018503968 0.012708333 0.022438986
## Type 0.03085724 0.015500000 0.048333333 0.027730990
## Weight 0.04464988 -0.017113095 0.107833333 0.036676868
## Disp. 0.09722219 0.027008929 0.071083333 0.071386699
## HP 0.05586813 0.020571429 -0.009166667 0.036238523
## MeanDecreaseGini
## Price 3.757830
## Country 2.510776
## Reliability 3.074495
## Type 3.117578
## Weight 3.731996
## Disp. 4.392771
## HP 3.196797
varImpPlot(rf,main = "The importance of variables")
#测试样本
pre_RF<-predict(rf,newdata=Test_Car)
confM3<-table(pre_RF,Test_Car$Oil_Consumption)
#计算准确率
Accuracy_RF<-sum(diag(confM3))/sum(confM3)*100
error3 = (100-Accuracy_RF)/100;error3
## [1] 0.1333333
errorList = cbind(error1,error2,error3 )
pander::pander(errorList*100)
error1 | error2 | error3 |
---|---|---|
13.33 | 13.33 | 13.33 |