library(tidyverse)
# Rstudio绘图不能显示中文解决方案
library(showtext)
showtext_auto()
dir<-'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
wdbc.data <- read.csv(dir,header = F)
dim(wdbc.data)
## [1] 569 32
names(wdbc.data) <- c('ID','Diagnosis','radius_mean',
'texture_mean',
'perimeter_mean','area_mean','smoothness_mean',
'compactness_mean',
'concavity_mean','concave points_mean','symmetry_mean',
'fractal dimension_mean',
'radius_sd','texture_sd','perimeter_sd','area_sd',
'smoothness_sd','compactness_sd',
'concavity_sd','concave points_sd','symmetry_sd',
'fractal dimension_sd','radius_max_mean',
'texture_max_mean','perimeter_max_mean',
'area_max_mean','smoothness_max_mean',
'compactness_max_mean',
'concavity_max_mean','concave points_max_mean',
'symmetry_max_mean','fractal dimension_max_mean')
## M = malignant 恶性的, B = benign 良性的
# 将目标属性编码因子类型
wdbc.data$Diagnosis
## [1] "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M"
## [19] "M" "B" "B" "B" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M"
## [37] "M" "B" "M" "M" "M" "M" "M" "M" "M" "M" "B" "M" "B" "B" "B" "B" "B" "M"
## [55] "M" "B" "M" "M" "B" "B" "B" "B" "M" "B" "M" "M" "B" "B" "B" "B" "M" "B"
## [73] "M" "M" "B" "M" "B" "M" "M" "B" "B" "B" "M" "M" "B" "M" "M" "M" "B" "B"
## [91] "B" "M" "B" "B" "M" "M" "B" "B" "B" "M" "M" "B" "B" "B" "B" "M" "B" "B"
## [109] "M" "B" "B" "B" "B" "B" "B" "B" "B" "M" "M" "M" "B" "M" "M" "B" "B" "B"
## [127] "M" "M" "B" "M" "B" "M" "M" "B" "M" "M" "B" "B" "M" "B" "B" "M" "B" "B"
## [145] "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "M"
## [163] "M" "B" "M" "B" "B" "M" "M" "B" "B" "M" "M" "B" "B" "B" "B" "M" "B" "B"
## [181] "M" "M" "M" "B" "M" "B" "M" "B" "B" "B" "M" "B" "B" "M" "M" "B" "M" "M"
## [199] "M" "M" "B" "M" "M" "M" "B" "M" "B" "M" "B" "B" "M" "B" "M" "M" "M" "M"
## [217] "B" "B" "M" "M" "B" "B" "B" "M" "B" "B" "B" "B" "B" "M" "M" "B" "B" "M"
## [235] "B" "B" "M" "M" "B" "M" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "M" "B"
## [253] "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "B" "B" "B" "B"
## [271] "B" "B" "M" "B" "M" "B" "B" "M" "B" "B" "M" "B" "M" "M" "B" "B" "B" "B"
## [289] "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "B" "B"
## [307] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "M" "B" "M"
## [325] "B" "B" "B" "B" "M" "M" "M" "B" "B" "B" "B" "M" "B" "M" "B" "M" "B" "B"
## [343] "B" "M" "B" "B" "B" "B" "B" "B" "B" "M" "M" "M" "B" "B" "B" "B" "B" "B"
## [361] "B" "B" "B" "B" "B" "M" "M" "B" "M" "M" "M" "B" "M" "M" "B" "B" "B" "B"
## [379] "B" "M" "B" "B" "B" "B" "B" "M" "B" "B" "B" "M" "B" "B" "M" "M" "B" "B"
## [397] "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B"
## [415] "M" "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B"
## [433] "M" "M" "B" "M" "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "B" "M"
## [451] "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "M" "M" "B" "B" "B" "B" "B" "B"
## [469] "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "B"
## [487] "B" "M" "B" "M" "B" "B" "M" "B" "B" "B" "B" "B" "M" "M" "B" "M" "B" "M"
## [505] "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "M" "M" "B" "B" "B" "M"
## [523] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "M" "M" "B" "B" "B"
## [541] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B"
## [559] "B" "B" "B" "B" "M" "M" "M" "M" "M" "M" "B"
wdbc.data$Diagnosis<-factor(wdbc.data$Diagnosis,
levels=c("M","B"),
labels=c("malignant","benign"))
write.csv(wdbc.data,"wdbc.csv")
wdbc<-read.csv("wdbc.csv")
tibble(wdbc)
## # A tibble: 569 × 33
## X ID Diagnosis radius_mean texture_mean perimeter_mean area_mean
## <int> <int> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 842302 malignant 18.0 10.4 123. 1001
## 2 2 842517 malignant 20.6 17.8 133. 1326
## 3 3 84300903 malignant 19.7 21.2 130 1203
## 4 4 84348301 malignant 11.4 20.4 77.6 386.
## 5 5 84358402 malignant 20.3 14.3 135. 1297
## 6 6 843786 malignant 12.4 15.7 82.6 477.
## 7 7 844359 malignant 18.2 20.0 120. 1040
## 8 8 84458202 malignant 13.7 20.8 90.2 578.
## 9 9 844981 malignant 13 21.8 87.5 520.
## 10 10 84501001 malignant 12.5 24.0 84.0 476.
## # … with 559 more rows, and 26 more variables: smoothness_mean <dbl>,
## # compactness_mean <dbl>, concavity_mean <dbl>, concave.points_mean <dbl>,
## # symmetry_mean <dbl>, fractal.dimension_mean <dbl>, radius_sd <dbl>,
## # texture_sd <dbl>, perimeter_sd <dbl>, area_sd <dbl>, smoothness_sd <dbl>,
## # compactness_sd <dbl>, concavity_sd <dbl>, concave.points_sd <dbl>,
## # symmetry_sd <dbl>, fractal.dimension_sd <dbl>, radius_max_mean <dbl>,
## # texture_max_mean <dbl>, perimeter_max_mean <dbl>, area_max_mean <dbl>, …
wdbc$Diagnosis %>% table()
## .
## benign malignant
## 357 212
## prop.table():计算table各列的占比
options(digits=4)
wdbc$Diagnosis %>% table() %>% prop.table()*100
## .
## benign malignant
## 62.74 37.26
data<-wdbc[,-c(1,2)]
tibble(data)
## # A tibble: 569 × 31
## Diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 malignant 18.0 10.4 123. 1001 0.118
## 2 malignant 20.6 17.8 133. 1326 0.0847
## 3 malignant 19.7 21.2 130 1203 0.110
## 4 malignant 11.4 20.4 77.6 386. 0.142
## 5 malignant 20.3 14.3 135. 1297 0.100
## 6 malignant 12.4 15.7 82.6 477. 0.128
## 7 malignant 18.2 20.0 120. 1040 0.0946
## 8 malignant 13.7 20.8 90.2 578. 0.119
## 9 malignant 13 21.8 87.5 520. 0.127
## 10 malignant 12.5 24.0 84.0 476. 0.119
## # … with 559 more rows, and 25 more variables: compactness_mean <dbl>,
## # concavity_mean <dbl>, concave.points_mean <dbl>, symmetry_mean <dbl>,
## # fractal.dimension_mean <dbl>, radius_sd <dbl>, texture_sd <dbl>,
## # perimeter_sd <dbl>, area_sd <dbl>, smoothness_sd <dbl>,
## # compactness_sd <dbl>, concavity_sd <dbl>, concave.points_sd <dbl>,
## # symmetry_sd <dbl>, fractal.dimension_sd <dbl>, radius_max_mean <dbl>,
## # texture_max_mean <dbl>, perimeter_max_mean <dbl>, area_max_mean <dbl>, …
which(is.na(data)) #判断是否有缺失值
## integer(0)
summary(data)
## Diagnosis radius_mean texture_mean perimeter_mean
## Length:569 Min. : 6.98 Min. : 9.71 Min. : 43.8
## Class :character 1st Qu.:11.70 1st Qu.:16.17 1st Qu.: 75.2
## Mode :character Median :13.37 Median :18.84 Median : 86.2
## Mean :14.13 Mean :19.29 Mean : 92.0
## 3rd Qu.:15.78 3rd Qu.:21.80 3rd Qu.:104.1
## Max. :28.11 Max. :39.28 Max. :188.5
## area_mean smoothness_mean compactness_mean concavity_mean
## Min. : 144 Min. :0.0526 Min. :0.0194 Min. :0.0000
## 1st Qu.: 420 1st Qu.:0.0864 1st Qu.:0.0649 1st Qu.:0.0296
## Median : 551 Median :0.0959 Median :0.0926 Median :0.0615
## Mean : 655 Mean :0.0964 Mean :0.1043 Mean :0.0888
## 3rd Qu.: 783 3rd Qu.:0.1053 3rd Qu.:0.1304 3rd Qu.:0.1307
## Max. :2501 Max. :0.1634 Max. :0.3454 Max. :0.4268
## concave.points_mean symmetry_mean fractal.dimension_mean radius_sd
## Min. :0.0000 Min. :0.106 Min. :0.0500 Min. :0.112
## 1st Qu.:0.0203 1st Qu.:0.162 1st Qu.:0.0577 1st Qu.:0.232
## Median :0.0335 Median :0.179 Median :0.0615 Median :0.324
## Mean :0.0489 Mean :0.181 Mean :0.0628 Mean :0.405
## 3rd Qu.:0.0740 3rd Qu.:0.196 3rd Qu.:0.0661 3rd Qu.:0.479
## Max. :0.2012 Max. :0.304 Max. :0.0974 Max. :2.873
## texture_sd perimeter_sd area_sd smoothness_sd
## Min. :0.360 Min. : 0.757 Min. : 6.8 Min. :0.00171
## 1st Qu.:0.834 1st Qu.: 1.606 1st Qu.: 17.9 1st Qu.:0.00517
## Median :1.108 Median : 2.287 Median : 24.5 Median :0.00638
## Mean :1.217 Mean : 2.866 Mean : 40.3 Mean :0.00704
## 3rd Qu.:1.474 3rd Qu.: 3.357 3rd Qu.: 45.2 3rd Qu.:0.00815
## Max. :4.885 Max. :21.980 Max. :542.2 Max. :0.03113
## compactness_sd concavity_sd concave.points_sd symmetry_sd
## Min. :0.00225 Min. :0.0000 Min. :0.00000 Min. :0.00788
## 1st Qu.:0.01308 1st Qu.:0.0151 1st Qu.:0.00764 1st Qu.:0.01516
## Median :0.02045 Median :0.0259 Median :0.01093 Median :0.01873
## Mean :0.02548 Mean :0.0319 Mean :0.01180 Mean :0.02054
## 3rd Qu.:0.03245 3rd Qu.:0.0420 3rd Qu.:0.01471 3rd Qu.:0.02348
## Max. :0.13540 Max. :0.3960 Max. :0.05279 Max. :0.07895
## fractal.dimension_sd radius_max_mean texture_max_mean perimeter_max_mean
## Min. :0.000895 Min. : 7.93 Min. :12.0 Min. : 50.4
## 1st Qu.:0.002248 1st Qu.:13.01 1st Qu.:21.1 1st Qu.: 84.1
## Median :0.003187 Median :14.97 Median :25.4 Median : 97.7
## Mean :0.003795 Mean :16.27 Mean :25.7 Mean :107.3
## 3rd Qu.:0.004558 3rd Qu.:18.79 3rd Qu.:29.7 3rd Qu.:125.4
## Max. :0.029840 Max. :36.04 Max. :49.5 Max. :251.2
## area_max_mean smoothness_max_mean compactness_max_mean concavity_max_mean
## Min. : 185 Min. :0.0712 Min. :0.0273 Min. :0.000
## 1st Qu.: 515 1st Qu.:0.1166 1st Qu.:0.1472 1st Qu.:0.115
## Median : 686 Median :0.1313 Median :0.2119 Median :0.227
## Mean : 881 Mean :0.1324 Mean :0.2543 Mean :0.272
## 3rd Qu.:1084 3rd Qu.:0.1460 3rd Qu.:0.3391 3rd Qu.:0.383
## Max. :4254 Max. :0.2226 Max. :1.0580 Max. :1.252
## concave.points_max_mean symmetry_max_mean fractal.dimension_max_mean
## Min. :0.0000 Min. :0.157 Min. :0.0550
## 1st Qu.:0.0649 1st Qu.:0.250 1st Qu.:0.0715
## Median :0.0999 Median :0.282 Median :0.0800
## Mean :0.1146 Mean :0.290 Mean :0.0839
## 3rd Qu.:0.1614 3rd Qu.:0.318 3rd Qu.:0.0921
## Max. :0.2910 Max. :0.664 Max. :0.2075
由于wdbc.data中的指标id并无有效的实际作用,因此采用data进行操作。我将diagnosis指标设置为0-1变量,即malignant设置成1,benign设置为0。并且为了防止量纲对数据分析的影响,我将各指标进行了最大最小标准化,使所有数据都落在0~1之间。
#将肿瘤类型恶性或良性转化成0-1变量
data$Diagnosis <- ifelse(data$Diagnosis=="malignant", 1, 0)
#进行数据标准化
for (i in 2:31) {
data[,i] = (data[,i]-min(data[,i]))/(max(data[,i])-min(data[,i]))
}
tibble(data)
# A tibble: 569 × 31
Diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0.521 0.0227 0.546 0.364 0.594
2 1 0.643 0.273 0.616 0.502 0.290
3 1 0.601 0.390 0.596 0.449 0.514
4 1 0.210 0.361 0.234 0.103 0.811
5 1 0.630 0.157 0.631 0.489 0.430
6 1 0.259 0.203 0.268 0.142 0.679
7 1 0.533 0.347 0.524 0.380 0.379
8 1 0.318 0.376 0.321 0.184 0.598
9 1 0.285 0.410 0.302 0.160 0.674
10 1 0.259 0.485 0.278 0.141 0.596
# … with 559 more rows, and 25 more variables: compactness_mean <dbl>,
# concavity_mean <dbl>, concave.points_mean <dbl>, symmetry_mean <dbl>,
# fractal.dimension_mean <dbl>, radius_sd <dbl>, texture_sd <dbl>,
# perimeter_sd <dbl>, area_sd <dbl>, smoothness_sd <dbl>,
# compactness_sd <dbl>, concavity_sd <dbl>, concave.points_sd <dbl>,
# symmetry_sd <dbl>, fractal.dimension_sd <dbl>, radius_max_mean <dbl>,
# texture_max_mean <dbl>, perimeter_max_mean <dbl>, area_max_mean <dbl>, …
set.seed(111)
sampleId <- sample(nrow(data),nrow(data)*1/4)
test <- data[sampleId,]
train <- data[-sampleId,]
rbind(dim(train),dim(test))
[,1] [,2]
[1,] 427 31
[2,] 142 31
library(class)
Warning: package 'class' was built under R version 4.1.2
options(digits=3)
#基于全体观测样本建模
errRatio<-vector()
for(i in 1:30){
KnnFit<-knn(train=data[,2:31],test=data[,2:31],
cl=data[,1],k=i)
CT<-table(data[,1],KnnFit)
errRatio<-c(errRatio,(1-sum(diag(CT))/sum(CT))*100)
}
plot(errRatio,type="l",col="blue",
xlab="K:number of neighbors",
ylab="errRatio(%)",
main="errRatio and number of neighbors K",
ylim=c(0,10))
points(1:30,errRatio,pch=8,col=1)
#旁置法KNN
errRatio1<-vector()
for(i in 1:30){
KnnFit<-knn(train=train[,2:31],
test=test[,2:31],
cl=train[,1],k=i)
CT<-table(test[,1],KnnFit) #计算混淆矩阵
#计算分类错误率
errRatio1<-c(errRatio1,(1-sum(diag(CT))/sum(CT))*100)
}
lines(1:30,errRatio1,lty=2,col="red")
points(1:30,errRatio1,pch=10,col=2)
# 留一法交叉验证KNN
set.seed(12345)
errRatio2<-vector()
for(i in 1:30){
KnnFit<-knn.cv(train=data[,2:31],cl=data[,1],k=i)
CT<-table(data[,1],KnnFit)
errRatio2<-c(errRatio2,(1-sum(diag(CT))/sum(CT))*100)
}
lines(1:30,errRatio2,col="black")
points(1:30,errRatio2,pch=15,col=3)
legend("topright",
c("Total","train","leave-one-out method"),
lty=1:2,
col=c("blue","red","black"),cex=0.6)
which.min(errRatio1)
[1] 3
which.min(errRatio2[1:10])
[1] 10
由图像可知,其中的蓝线为全部观测进入训练样本集时,参数K取1~30下的错判率曲线。K=1时错判率为0.上方的红色虚线为旁置法的错判率曲线,K=2时达到最小。黑色实线为留一法的错判率曲线,K=3时达到最小。此外,图中基于整个样本的训练误差曲线在最下方,可见是对预测误差偏低的乐观估计,并且容易出现过拟合的现象。因此,取了留一法曲线前十个K值当中的最小值,留一法曲线基本位于三条线中间,K=3或9比较合适。
# 五折交叉检验
len = length(data[,1])
size = round(len/5)
set.seed(222)
kId = sample(1:len) #将数据随机打乱次序
testGroup = list()
trainGroup = list()
for (i in 1:5) {
if(i==5){
addTestGroup = data[kId[(1+(i-1)*size):len],]
addTrainGroup = data[-kId[(1+(i-1)*size):len],]
}else{
addTestGroup = data[kId[(1+(i-1)*size):(i*size)],]
addTrainGroup = data[-kId[(1+(i-1)*size):(i*size)],]
}
testGroup[[i]] = addTestGroup
trainGroup[[i]] = addTrainGroup
}
names(testGroup) = c("testGroup1", "testGroup2",
"testGroup3", "testGroup4",
"testGroup5")
names(trainGroup) = c("trainGroup1", "trainGroup2",
"trainGroup3", "trainGroup4",
"trainGroup5")
err = vector()
for (i in 1:30) {
errRatioVector<-vector()
for (j in 1:5) {
testGroupData = testGroup[[j]]
trainGroupData = trainGroup[[j]]
fit<-knn(train=trainGroupData[,2:31],
test=testGroupData[,2:31],
cl=trainGroupData[,1],k=i,prob=FALSE)
CT<-table(testGroupData[,1],fit)
errRatio<-(1-sum(diag(CT))/sum(CT))*100
errRatioVector<-c(errRatioVector,errRatio)
}
index = mean(errRatioVector)
err = c(err, index)
}
err
[1] 4.39 4.22 3.16 2.99 2.63 2.99 2.81 2.81 2.11 2.63 3.34 2.81 2.81 3.16 3.16
[16] 2.99 3.34 3.52 3.52 3.69 3.69 3.87 4.22 4.04 4.04 3.87 4.04 4.40 4.40 4.22
which.min(err)
[1] 9
plot(x=1:30, y=err, type = 'l', ylab = 'err', xlab = 'k')
points(1:30,err,pch=15,col=2)
通常来说,都会以五折或者十折交叉检验,本模型将采取五折进行交叉检验。将k取1~30的所有交叉检验结果取平均值画出折线图,可以发现k值取到9时f分类错误率达到最小,与上述结论相同,k=9时是比较客观的估计。
# 判断变量的重要性
library(class)
errRatio4 = vector()
fit<-knn(train=train[,2:31],
test=test[,2:31],
cl=train[,1],k=9)
CT<-table(test[,1],fit)
errRatio4<-c(errRatio4,(1-sum(diag(CT))/sum(CT))*100)
errRatio4
[1] 4.23
errDelteX = vector()
for(i in -1:-30){
fit<-knn(train=train[,2:31][,i],
test=test[,2:31][,i],
cl=train[,1],k=9)
CT<-table(test[,1],fit)
errDelteX<-c(errDelteX,
(1-sum(diag(CT))/sum(CT))*100)
}
errDelteX
[1] 4.23 3.52 4.23 4.23 3.52 2.82 3.52 3.52 3.52 4.23 4.23 4.23 4.23 4.23 4.23
[16] 4.23 5.63 4.23 4.23 4.23 3.52 2.82 4.23 4.23 4.23 3.52 3.52 4.23 4.93 3.52
plot(errDelteX,type="l",xlab="variable",
ylab="error",
cex.main=0.8)
points(1:30,errDelteX,pch=15,col=2)
abline(h=errRatio4, col="red", lwd=4)
which.max(errDelteX)
[1] 17
names(data)
[1] "Diagnosis" "radius_mean"
[3] "texture_mean" "perimeter_mean"
[5] "area_mean" "smoothness_mean"
[7] "compactness_mean" "concavity_mean"
[9] "concave.points_mean" "symmetry_mean"
[11] "fractal.dimension_mean" "radius_sd"
[13] "texture_sd" "perimeter_sd"
[15] "area_sd" "smoothness_sd"
[17] "compactness_sd" "concavity_sd"
[19] "concave.points_sd" "symmetry_sd"
[21] "fractal.dimension_sd" "radius_max_mean"
[23] "texture_max_mean" "perimeter_max_mean"
[25] "area_max_mean" "smoothness_max_mean"
[27] "compactness_max_mean" "concavity_max_mean"
[29] "concave.points_max_mean" "symmetry_max_mean"
[31] "fractal.dimension_max_mean"
tibble(data[,c(18,30)])
# A tibble: 569 × 2
concavity_sd symmetry_max_mean
<dbl> <dbl>
1 0.136 0.598
2 0.0470 0.234
3 0.0968 0.404
4 0.143 1
5 0.144 0.158
6 0.0927 0.477
7 0.0569 0.295
8 0.0628 0.322
9 0.0897 0.555
10 0.196 0.552
# … with 559 more rows
#计算各个变量重要性FI
FI<-errDelteX+1/30
FI
## [1] 4.26 3.55 4.26 4.26 3.55 2.85 3.55 3.55 3.55 4.26 4.26 4.26 4.26 4.26 4.26
## [16] 4.26 5.67 4.26 4.26 4.26 3.55 2.85 4.26 4.26 4.26 3.55 3.55 4.26 4.96 3.55
#计算各个输入变量权重wi
wi<-FI/sum(FI)
wi
## [1] 0.0353 0.0294 0.0353 0.0353 0.0294 0.0236 0.0294 0.0294 0.0294 0.0353
## [11] 0.0353 0.0353 0.0353 0.0353 0.0353 0.0353 0.0469 0.0353 0.0353 0.0353
## [21] 0.0294 0.0236 0.0353 0.0353 0.0353 0.0294 0.0294 0.0353 0.0411 0.0294
weightdist<-function(x,y){
if(length(x)!=length(y))
{('不符合要求!');
} else {
a=wi*(x-y)^2
weightdist<-a %>% sum() %>% sqrt()
}
return(weightdist)
}
library(e1071)
errRatio9<-vector()
for(i in 1:30){
KnnFit<-gknn(Diagnosis~.,data=train,k=i,method=weightdist)
fit<-predict(KnnFit,test,type='class')
CT<-table(test[,1],fit) #计算混淆矩阵
#计算分类错误率
errRatio9<-c(errRatio9,(1-sum(diag(CT))/sum(CT))*100)
}
plot(x=1:30, y= errRatio9, type = 'l', ylab = 'errRatio9', xlab = 'k')
points(1:30,errRatio9,pch=10,col=2)
model<-gknn(Diagnosis~.,data=train,k=3,method=weightdist)
fit<-predict(model,test,type='class')
CT<-table(test[,1],fit)
errRatio<-(1-sum(diag(CT))/sum(CT))*100
errRatio
## [1] 39.4
model<-gknn(Diagnosis~.,data=train,k=5,method=weightdist)
fit<-predict(model,test,type='class')
CT<-table(test[,1],fit)
errRatio<-(1-sum(diag(CT))/sum(CT))*100
errRatio
## [1] 44.4
将分别剔除各指标后的误差绘制成折线图,并且与未剔除时的误差进行对比。误差值越高说明该指标更为重要;若误差值没有影响,则说明该指标对结果毫无影响;若误差值小于原始值,则说明该变量应该被剔除。
data1<-data[,-c(3,6:10,22,23,27,28,31)]
library(e1071)
model<-gknn(Diagnosis~.,data=train[,-c(3,6:10,22,23,27,28,31)],k=5,method=weightdist)
fit<-predict(model,test[,-c(3,6:10,22,23,27,28,31)],type='class')
CT<-table(test[,1],fit)
errRatio<-(1-sum(diag(CT))/sum(CT))*100
errRatio
## [1] 40.1
model<-gknn(Diagnosis~.,data=train[,-c(3,6:10,22,23,27,28,31)],k=3,method=weightdist)
fit<-predict(model,test[,-c(3,6:10,22,23,27,28,31)],type='class')
CT<-table(test[,1],fit)
errRatio<-(1-sum(diag(CT))/sum(CT))*100
errRatio
## [1] 37.3
#基于加权欧式距离的knn算法
library(kknn)
par(mfrow=c(2,1))
getwd()
## [1] "/Users/huanghuilin/Desktop/360安全云盘同步版/2022/2022/2022年数据挖掘课程/上机/第5次上机 KNN和加权KNN算法实现/上机作业"
tibble(data)
## # A tibble: 569 × 31
## Diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.521 0.0227 0.546 0.364 0.594
## 2 1 0.643 0.273 0.616 0.502 0.290
## 3 1 0.601 0.390 0.596 0.449 0.514
## 4 1 0.210 0.361 0.234 0.103 0.811
## 5 1 0.630 0.157 0.631 0.489 0.430
## 6 1 0.259 0.203 0.268 0.142 0.679
## 7 1 0.533 0.347 0.524 0.380 0.379
## 8 1 0.318 0.376 0.321 0.184 0.598
## 9 1 0.285 0.410 0.302 0.160 0.674
## 10 1 0.259 0.485 0.278 0.141 0.596
## # … with 559 more rows, and 25 more variables: compactness_mean <dbl>,
## # concavity_mean <dbl>, concave.points_mean <dbl>, symmetry_mean <dbl>,
## # fractal.dimension_mean <dbl>, radius_sd <dbl>, texture_sd <dbl>,
## # perimeter_sd <dbl>, area_sd <dbl>, smoothness_sd <dbl>,
## # compactness_sd <dbl>, concavity_sd <dbl>, concave.points_sd <dbl>,
## # symmetry_sd <dbl>, fractal.dimension_sd <dbl>, radius_max_mean <dbl>,
## # texture_max_mean <dbl>, perimeter_max_mean <dbl>, area_max_mean <dbl>, …
train$Diagnosis<-as.factor(train$Diagnosis)
fit<-train.kknn(formula=Diagnosis~.,
data=train, kmax=20,distance=2,
kernel=c("rectangular","triangular","gaussian"),
na.action=na.omit())
#fit
plot(fit$MISCLASS[,1]*100,type="l", ylim = c(2,5),
main="different errRatio",
cex.main=0.8,xlab="the number of K",ylab="errRatio")
lines(fit$MISCLASS[,2]*100,lty=2,col=1)
lines(fit$MISCLASS[,3]*100,lty=3,col=2)
#给出图例
legend("topleft",
legend=c("rectangular","triangular","gaussian"),
lty=c(1,2,3),col=c(1,1,2),cex=0.7)
fit<-kknn(formula=Diagnosis~.,train=train,
test=test,k=9,distance=2,
kernel="gaussian",na.action=na.omit())
CT<-table(test[,1],fit$fitted.values)
errRatio5 = (1-sum(diag(CT))/sum(CT))*100
errRatio5
## [1] 3.52
errRatio4
## [1] 4.23
errlist = c(errRatio5, errRatio4)
errlist
## [1] 3.52 4.23
rate = (errRatio4-errRatio5)/errRatio4
rate
## [1] 0.167
errGraph<-barplot(errlist,
main="Contrast figure(K=9)",
cex.main=0.8,xlab="way",
ylab="errRatio(%)",axes=FALSE)
axis(side=1,at=c(0,errGraph,3),
labels=c(" ", "加权K-近邻法","K-近邻法",""),
tcl=0.25)
axis(side=2,tcl=0.25)
根据交叉检验的结果,最佳的k值为9,在k=9时,未通过欧式距离加权的模型错判概率为4.23,经过了基于加权KNN算法后,错判率达到了3.52。提升率为16.7%,提升效果显著。由此可见,加权KNN算法对于模型的优化有着不错的结果。