导入数据

 library(tidyverse)

# Rstudio绘图不能显示中文解决方案
library(showtext)
showtext_auto()

 dir<-'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
 wdbc.data <- read.csv(dir,header = F)
 dim(wdbc.data)
## [1] 569  32
 names(wdbc.data) <- c('ID','Diagnosis','radius_mean',
                       'texture_mean',
'perimeter_mean','area_mean','smoothness_mean',
'compactness_mean',
'concavity_mean','concave points_mean','symmetry_mean',
'fractal dimension_mean',
'radius_sd','texture_sd','perimeter_sd','area_sd',
'smoothness_sd','compactness_sd',
'concavity_sd','concave points_sd','symmetry_sd',
'fractal dimension_sd','radius_max_mean',
'texture_max_mean','perimeter_max_mean',
'area_max_mean','smoothness_max_mean',
'compactness_max_mean',
'concavity_max_mean','concave points_max_mean',
'symmetry_max_mean','fractal dimension_max_mean')
 
 
## M = malignant 恶性的, B = benign 良性的
# 将目标属性编码因子类型
wdbc.data$Diagnosis
##   [1] "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M"
##  [19] "M" "B" "B" "B" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M"
##  [37] "M" "B" "M" "M" "M" "M" "M" "M" "M" "M" "B" "M" "B" "B" "B" "B" "B" "M"
##  [55] "M" "B" "M" "M" "B" "B" "B" "B" "M" "B" "M" "M" "B" "B" "B" "B" "M" "B"
##  [73] "M" "M" "B" "M" "B" "M" "M" "B" "B" "B" "M" "M" "B" "M" "M" "M" "B" "B"
##  [91] "B" "M" "B" "B" "M" "M" "B" "B" "B" "M" "M" "B" "B" "B" "B" "M" "B" "B"
## [109] "M" "B" "B" "B" "B" "B" "B" "B" "B" "M" "M" "M" "B" "M" "M" "B" "B" "B"
## [127] "M" "M" "B" "M" "B" "M" "M" "B" "M" "M" "B" "B" "M" "B" "B" "M" "B" "B"
## [145] "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "M"
## [163] "M" "B" "M" "B" "B" "M" "M" "B" "B" "M" "M" "B" "B" "B" "B" "M" "B" "B"
## [181] "M" "M" "M" "B" "M" "B" "M" "B" "B" "B" "M" "B" "B" "M" "M" "B" "M" "M"
## [199] "M" "M" "B" "M" "M" "M" "B" "M" "B" "M" "B" "B" "M" "B" "M" "M" "M" "M"
## [217] "B" "B" "M" "M" "B" "B" "B" "M" "B" "B" "B" "B" "B" "M" "M" "B" "B" "M"
## [235] "B" "B" "M" "M" "B" "M" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "M" "B"
## [253] "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "B" "B" "B" "B"
## [271] "B" "B" "M" "B" "M" "B" "B" "M" "B" "B" "M" "B" "M" "M" "B" "B" "B" "B"
## [289] "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "B" "B"
## [307] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "M" "B" "M"
## [325] "B" "B" "B" "B" "M" "M" "M" "B" "B" "B" "B" "M" "B" "M" "B" "M" "B" "B"
## [343] "B" "M" "B" "B" "B" "B" "B" "B" "B" "M" "M" "M" "B" "B" "B" "B" "B" "B"
## [361] "B" "B" "B" "B" "B" "M" "M" "B" "M" "M" "M" "B" "M" "M" "B" "B" "B" "B"
## [379] "B" "M" "B" "B" "B" "B" "B" "M" "B" "B" "B" "M" "B" "B" "M" "M" "B" "B"
## [397] "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B"
## [415] "M" "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B"
## [433] "M" "M" "B" "M" "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "B" "M"
## [451] "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "M" "M" "B" "B" "B" "B" "B" "B"
## [469] "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "B"
## [487] "B" "M" "B" "M" "B" "B" "M" "B" "B" "B" "B" "B" "M" "M" "B" "M" "B" "M"
## [505] "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "M" "M" "B" "B" "B" "M"
## [523] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "M" "M" "B" "B" "B"
## [541] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B"
## [559] "B" "B" "B" "B" "M" "M" "M" "M" "M" "M" "B"
wdbc.data$Diagnosis<-factor(wdbc.data$Diagnosis,
                           levels=c("M","B"),
               labels=c("malignant","benign"))

write.csv(wdbc.data,"wdbc.csv")
wdbc<-read.csv("wdbc.csv")
tibble(wdbc)
## # A tibble: 569 × 33
##        X       ID Diagnosis radius_mean texture_mean perimeter_mean area_mean
##    <int>    <int> <chr>           <dbl>        <dbl>          <dbl>     <dbl>
##  1     1   842302 malignant        18.0         10.4          123.      1001 
##  2     2   842517 malignant        20.6         17.8          133.      1326 
##  3     3 84300903 malignant        19.7         21.2          130       1203 
##  4     4 84348301 malignant        11.4         20.4           77.6      386.
##  5     5 84358402 malignant        20.3         14.3          135.      1297 
##  6     6   843786 malignant        12.4         15.7           82.6      477.
##  7     7   844359 malignant        18.2         20.0          120.      1040 
##  8     8 84458202 malignant        13.7         20.8           90.2      578.
##  9     9   844981 malignant        13           21.8           87.5      520.
## 10    10 84501001 malignant        12.5         24.0           84.0      476.
## # … with 559 more rows, and 26 more variables: smoothness_mean <dbl>,
## #   compactness_mean <dbl>, concavity_mean <dbl>, concave.points_mean <dbl>,
## #   symmetry_mean <dbl>, fractal.dimension_mean <dbl>, radius_sd <dbl>,
## #   texture_sd <dbl>, perimeter_sd <dbl>, area_sd <dbl>, smoothness_sd <dbl>,
## #   compactness_sd <dbl>, concavity_sd <dbl>, concave.points_sd <dbl>,
## #   symmetry_sd <dbl>, fractal.dimension_sd <dbl>, radius_max_mean <dbl>,
## #   texture_max_mean <dbl>, perimeter_max_mean <dbl>, area_max_mean <dbl>, …
wdbc$Diagnosis %>% table() 
## .
##    benign malignant 
##       357       212
## prop.table():计算table各列的占比
options(digits=4)
wdbc$Diagnosis %>% table() %>% prop.table()*100 
## .
##    benign malignant 
##     62.74     37.26
data<-wdbc[,-c(1,2)]
tibble(data)
## # A tibble: 569 × 31
##    Diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean
##    <chr>           <dbl>        <dbl>          <dbl>     <dbl>           <dbl>
##  1 malignant        18.0         10.4          123.      1001           0.118 
##  2 malignant        20.6         17.8          133.      1326           0.0847
##  3 malignant        19.7         21.2          130       1203           0.110 
##  4 malignant        11.4         20.4           77.6      386.          0.142 
##  5 malignant        20.3         14.3          135.      1297           0.100 
##  6 malignant        12.4         15.7           82.6      477.          0.128 
##  7 malignant        18.2         20.0          120.      1040           0.0946
##  8 malignant        13.7         20.8           90.2      578.          0.119 
##  9 malignant        13           21.8           87.5      520.          0.127 
## 10 malignant        12.5         24.0           84.0      476.          0.119 
## # … with 559 more rows, and 25 more variables: compactness_mean <dbl>,
## #   concavity_mean <dbl>, concave.points_mean <dbl>, symmetry_mean <dbl>,
## #   fractal.dimension_mean <dbl>, radius_sd <dbl>, texture_sd <dbl>,
## #   perimeter_sd <dbl>, area_sd <dbl>, smoothness_sd <dbl>,
## #   compactness_sd <dbl>, concavity_sd <dbl>, concave.points_sd <dbl>,
## #   symmetry_sd <dbl>, fractal.dimension_sd <dbl>, radius_max_mean <dbl>,
## #   texture_max_mean <dbl>, perimeter_max_mean <dbl>, area_max_mean <dbl>, …
which(is.na(data))  #判断是否有缺失值
## integer(0)
summary(data)
##   Diagnosis          radius_mean     texture_mean   perimeter_mean 
##  Length:569         Min.   : 6.98   Min.   : 9.71   Min.   : 43.8  
##  Class :character   1st Qu.:11.70   1st Qu.:16.17   1st Qu.: 75.2  
##  Mode  :character   Median :13.37   Median :18.84   Median : 86.2  
##                     Mean   :14.13   Mean   :19.29   Mean   : 92.0  
##                     3rd Qu.:15.78   3rd Qu.:21.80   3rd Qu.:104.1  
##                     Max.   :28.11   Max.   :39.28   Max.   :188.5  
##    area_mean    smoothness_mean  compactness_mean concavity_mean  
##  Min.   : 144   Min.   :0.0526   Min.   :0.0194   Min.   :0.0000  
##  1st Qu.: 420   1st Qu.:0.0864   1st Qu.:0.0649   1st Qu.:0.0296  
##  Median : 551   Median :0.0959   Median :0.0926   Median :0.0615  
##  Mean   : 655   Mean   :0.0964   Mean   :0.1043   Mean   :0.0888  
##  3rd Qu.: 783   3rd Qu.:0.1053   3rd Qu.:0.1304   3rd Qu.:0.1307  
##  Max.   :2501   Max.   :0.1634   Max.   :0.3454   Max.   :0.4268  
##  concave.points_mean symmetry_mean   fractal.dimension_mean   radius_sd    
##  Min.   :0.0000      Min.   :0.106   Min.   :0.0500         Min.   :0.112  
##  1st Qu.:0.0203      1st Qu.:0.162   1st Qu.:0.0577         1st Qu.:0.232  
##  Median :0.0335      Median :0.179   Median :0.0615         Median :0.324  
##  Mean   :0.0489      Mean   :0.181   Mean   :0.0628         Mean   :0.405  
##  3rd Qu.:0.0740      3rd Qu.:0.196   3rd Qu.:0.0661         3rd Qu.:0.479  
##  Max.   :0.2012      Max.   :0.304   Max.   :0.0974         Max.   :2.873  
##    texture_sd     perimeter_sd       area_sd      smoothness_sd    
##  Min.   :0.360   Min.   : 0.757   Min.   :  6.8   Min.   :0.00171  
##  1st Qu.:0.834   1st Qu.: 1.606   1st Qu.: 17.9   1st Qu.:0.00517  
##  Median :1.108   Median : 2.287   Median : 24.5   Median :0.00638  
##  Mean   :1.217   Mean   : 2.866   Mean   : 40.3   Mean   :0.00704  
##  3rd Qu.:1.474   3rd Qu.: 3.357   3rd Qu.: 45.2   3rd Qu.:0.00815  
##  Max.   :4.885   Max.   :21.980   Max.   :542.2   Max.   :0.03113  
##  compactness_sd     concavity_sd    concave.points_sd  symmetry_sd     
##  Min.   :0.00225   Min.   :0.0000   Min.   :0.00000   Min.   :0.00788  
##  1st Qu.:0.01308   1st Qu.:0.0151   1st Qu.:0.00764   1st Qu.:0.01516  
##  Median :0.02045   Median :0.0259   Median :0.01093   Median :0.01873  
##  Mean   :0.02548   Mean   :0.0319   Mean   :0.01180   Mean   :0.02054  
##  3rd Qu.:0.03245   3rd Qu.:0.0420   3rd Qu.:0.01471   3rd Qu.:0.02348  
##  Max.   :0.13540   Max.   :0.3960   Max.   :0.05279   Max.   :0.07895  
##  fractal.dimension_sd radius_max_mean texture_max_mean perimeter_max_mean
##  Min.   :0.000895     Min.   : 7.93   Min.   :12.0     Min.   : 50.4     
##  1st Qu.:0.002248     1st Qu.:13.01   1st Qu.:21.1     1st Qu.: 84.1     
##  Median :0.003187     Median :14.97   Median :25.4     Median : 97.7     
##  Mean   :0.003795     Mean   :16.27   Mean   :25.7     Mean   :107.3     
##  3rd Qu.:0.004558     3rd Qu.:18.79   3rd Qu.:29.7     3rd Qu.:125.4     
##  Max.   :0.029840     Max.   :36.04   Max.   :49.5     Max.   :251.2     
##  area_max_mean  smoothness_max_mean compactness_max_mean concavity_max_mean
##  Min.   : 185   Min.   :0.0712      Min.   :0.0273       Min.   :0.000     
##  1st Qu.: 515   1st Qu.:0.1166      1st Qu.:0.1472       1st Qu.:0.115     
##  Median : 686   Median :0.1313      Median :0.2119       Median :0.227     
##  Mean   : 881   Mean   :0.1324      Mean   :0.2543       Mean   :0.272     
##  3rd Qu.:1084   3rd Qu.:0.1460      3rd Qu.:0.3391       3rd Qu.:0.383     
##  Max.   :4254   Max.   :0.2226      Max.   :1.0580       Max.   :1.252     
##  concave.points_max_mean symmetry_max_mean fractal.dimension_max_mean
##  Min.   :0.0000          Min.   :0.157     Min.   :0.0550            
##  1st Qu.:0.0649          1st Qu.:0.250     1st Qu.:0.0715            
##  Median :0.0999          Median :0.282     Median :0.0800            
##  Mean   :0.1146          Mean   :0.290     Mean   :0.0839            
##  3rd Qu.:0.1614          3rd Qu.:0.318     3rd Qu.:0.0921            
##  Max.   :0.2910          Max.   :0.664     Max.   :0.2075

划分训练样本集和测试样本集(比例3:1)

由于wdbc.data中的指标id并无有效的实际作用,因此采用data进行操作。我将diagnosis指标设置为0-1变量,即malignant设置成1,benign设置为0。并且为了防止量纲对数据分析的影响,我将各指标进行了最大最小标准化,使所有数据都落在0~1之间。

#将肿瘤类型恶性或良性转化成0-1变量
data$Diagnosis <- ifelse(data$Diagnosis=="malignant", 1, 0)
#进行数据标准化
for (i in 2:31) {
  data[,i] = (data[,i]-min(data[,i]))/(max(data[,i])-min(data[,i]))
}

tibble(data)
# A tibble: 569 × 31
   Diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean
       <dbl>       <dbl>        <dbl>          <dbl>     <dbl>           <dbl>
 1         1       0.521       0.0227          0.546     0.364           0.594
 2         1       0.643       0.273           0.616     0.502           0.290
 3         1       0.601       0.390           0.596     0.449           0.514
 4         1       0.210       0.361           0.234     0.103           0.811
 5         1       0.630       0.157           0.631     0.489           0.430
 6         1       0.259       0.203           0.268     0.142           0.679
 7         1       0.533       0.347           0.524     0.380           0.379
 8         1       0.318       0.376           0.321     0.184           0.598
 9         1       0.285       0.410           0.302     0.160           0.674
10         1       0.259       0.485           0.278     0.141           0.596
# … with 559 more rows, and 25 more variables: compactness_mean <dbl>,
#   concavity_mean <dbl>, concave.points_mean <dbl>, symmetry_mean <dbl>,
#   fractal.dimension_mean <dbl>, radius_sd <dbl>, texture_sd <dbl>,
#   perimeter_sd <dbl>, area_sd <dbl>, smoothness_sd <dbl>,
#   compactness_sd <dbl>, concavity_sd <dbl>, concave.points_sd <dbl>,
#   symmetry_sd <dbl>, fractal.dimension_sd <dbl>, radius_max_mean <dbl>,
#   texture_max_mean <dbl>, perimeter_max_mean <dbl>, area_max_mean <dbl>, …
set.seed(111)
sampleId <- sample(nrow(data),nrow(data)*1/4)
test <- data[sampleId,]
train <- data[-sampleId,]
rbind(dim(train),dim(test))
     [,1] [,2]
[1,]  427   31
[2,]  142   31

分别基于全体样本、旁置法和留一法建立knn模型,并择出最优模型

 library(class)
Warning: package 'class' was built under R version 4.1.2
options(digits=3)
#基于全体观测样本建模
errRatio<-vector()
for(i in 1:30){
  KnnFit<-knn(train=data[,2:31],test=data[,2:31],
              cl=data[,1],k=i)
  CT<-table(data[,1],KnnFit)
  errRatio<-c(errRatio,(1-sum(diag(CT))/sum(CT))*100)
}
plot(errRatio,type="l",col="blue",
     xlab="K:number of neighbors",
     ylab="errRatio(%)",
     main="errRatio and number of neighbors K",
     ylim=c(0,10))

points(1:30,errRatio,pch=8,col=1)

#旁置法KNN
errRatio1<-vector()
for(i in 1:30){
  KnnFit<-knn(train=train[,2:31],
              test=test[,2:31],
              cl=train[,1],k=i)
  CT<-table(test[,1],KnnFit) #计算混淆矩阵
  #计算分类错误率
  errRatio1<-c(errRatio1,(1-sum(diag(CT))/sum(CT))*100)
}
lines(1:30,errRatio1,lty=2,col="red")
points(1:30,errRatio1,pch=10,col=2)


# 留一法交叉验证KNN

set.seed(12345)
errRatio2<-vector()
for(i in 1:30){
  KnnFit<-knn.cv(train=data[,2:31],cl=data[,1],k=i)
  CT<-table(data[,1],KnnFit)
  errRatio2<-c(errRatio2,(1-sum(diag(CT))/sum(CT))*100)
}
lines(1:30,errRatio2,col="black")
points(1:30,errRatio2,pch=15,col=3)
legend("topright",
       c("Total","train","leave-one-out method"),
       lty=1:2,
       col=c("blue","red","black"),cex=0.6)

which.min(errRatio1)
[1] 3
which.min(errRatio2[1:10])
[1] 10

由图像可知,其中的蓝线为全部观测进入训练样本集时,参数K取1~30下的错判率曲线。K=1时错判率为0.上方的红色虚线为旁置法的错判率曲线,K=2时达到最小。黑色实线为留一法的错判率曲线,K=3时达到最小。此外,图中基于整个样本的训练误差曲线在最下方,可见是对预测误差偏低的乐观估计,并且容易出现过拟合的现象。因此,取了留一法曲线前十个K值当中的最小值,留一法曲线基本位于三条线中间,K=3或9比较合适。

自编交叉验证函数,寻找最优参数k

# 五折交叉检验
len = length(data[,1])
size = round(len/5) 
set.seed(222)
kId = sample(1:len)  #将数据随机打乱次序
testGroup = list()
trainGroup = list()
for (i in 1:5) {
  if(i==5){
    addTestGroup = data[kId[(1+(i-1)*size):len],]  
    addTrainGroup = data[-kId[(1+(i-1)*size):len],]
  }else{
    addTestGroup = data[kId[(1+(i-1)*size):(i*size)],]
    addTrainGroup = data[-kId[(1+(i-1)*size):(i*size)],]
  }
  testGroup[[i]] = addTestGroup
  trainGroup[[i]] = addTrainGroup
}
names(testGroup) = c("testGroup1", "testGroup2", 
                     "testGroup3", "testGroup4", 
                     "testGroup5")
names(trainGroup) = c("trainGroup1", "trainGroup2", 
                      "trainGroup3", "trainGroup4", 
                      "trainGroup5")
err = vector()
for (i in 1:30) {
 errRatioVector<-vector()
  for (j in 1:5) {  
    testGroupData = testGroup[[j]]
    trainGroupData = trainGroup[[j]]
    fit<-knn(train=trainGroupData[,2:31],
                test=testGroupData[,2:31],
                cl=trainGroupData[,1],k=i,prob=FALSE)
   
    CT<-table(testGroupData[,1],fit)
    errRatio<-(1-sum(diag(CT))/sum(CT))*100
    errRatioVector<-c(errRatioVector,errRatio)
  }
  index = mean(errRatioVector)
  err = c(err, index)
}

err
 [1] 4.39 4.22 3.16 2.99 2.63 2.99 2.81 2.81 2.11 2.63 3.34 2.81 2.81 3.16 3.16
[16] 2.99 3.34 3.52 3.52 3.69 3.69 3.87 4.22 4.04 4.04 3.87 4.04 4.40 4.40 4.22
which.min(err)
[1] 9
plot(x=1:30, y=err, type = 'l', ylab = 'err', xlab = 'k')
points(1:30,err,pch=15,col=2)

通常来说,都会以五折或者十折交叉检验,本模型将采取五折进行交叉检验。将k取1~30的所有交叉检验结果取平均值画出折线图,可以发现k值取到9时f分类错误率达到最小,与上述结论相同,k=9时是比较客观的估计。

讨论wdbc 变量重要性,通过比较计算剔除该变量前后的errRatio来衡量重要性,k=9

# 判断变量的重要性
library(class)
errRatio4 = vector()
fit<-knn(train=train[,2:31],
         test=test[,2:31],
         cl=train[,1],k=9)
CT<-table(test[,1],fit)
errRatio4<-c(errRatio4,(1-sum(diag(CT))/sum(CT))*100)
errRatio4
[1] 4.23
errDelteX = vector()
for(i in -1:-30){
  fit<-knn(train=train[,2:31][,i],
           test=test[,2:31][,i],
           cl=train[,1],k=9)
  CT<-table(test[,1],fit)
  errDelteX<-c(errDelteX,
               (1-sum(diag(CT))/sum(CT))*100)
}

errDelteX
 [1] 4.23 3.52 4.23 4.23 3.52 2.82 3.52 3.52 3.52 4.23 4.23 4.23 4.23 4.23 4.23
[16] 4.23 5.63 4.23 4.23 4.23 3.52 2.82 4.23 4.23 4.23 3.52 3.52 4.23 4.93 3.52
plot(errDelteX,type="l",xlab="variable",
     ylab="error",
     cex.main=0.8)
points(1:30,errDelteX,pch=15,col=2)
abline(h=errRatio4, col="red", lwd=4)

which.max(errDelteX)
[1] 17
names(data)
 [1] "Diagnosis"                  "radius_mean"               
 [3] "texture_mean"               "perimeter_mean"            
 [5] "area_mean"                  "smoothness_mean"           
 [7] "compactness_mean"           "concavity_mean"            
 [9] "concave.points_mean"        "symmetry_mean"             
[11] "fractal.dimension_mean"     "radius_sd"                 
[13] "texture_sd"                 "perimeter_sd"              
[15] "area_sd"                    "smoothness_sd"             
[17] "compactness_sd"             "concavity_sd"              
[19] "concave.points_sd"          "symmetry_sd"               
[21] "fractal.dimension_sd"       "radius_max_mean"           
[23] "texture_max_mean"           "perimeter_max_mean"        
[25] "area_max_mean"              "smoothness_max_mean"       
[27] "compactness_max_mean"       "concavity_max_mean"        
[29] "concave.points_max_mean"    "symmetry_max_mean"         
[31] "fractal.dimension_max_mean"
tibble(data[,c(18,30)])
# A tibble: 569 × 2
   concavity_sd symmetry_max_mean
          <dbl>             <dbl>
 1       0.136              0.598
 2       0.0470             0.234
 3       0.0968             0.404
 4       0.143              1    
 5       0.144              0.158
 6       0.0927             0.477
 7       0.0569             0.295
 8       0.0628             0.322
 9       0.0897             0.555
10       0.196              0.552
# … with 559 more rows
#计算各个变量重要性FI
FI<-errDelteX+1/30
FI
##  [1] 4.26 3.55 4.26 4.26 3.55 2.85 3.55 3.55 3.55 4.26 4.26 4.26 4.26 4.26 4.26
## [16] 4.26 5.67 4.26 4.26 4.26 3.55 2.85 4.26 4.26 4.26 3.55 3.55 4.26 4.96 3.55
#计算各个输入变量权重wi
wi<-FI/sum(FI)
wi
##  [1] 0.0353 0.0294 0.0353 0.0353 0.0294 0.0236 0.0294 0.0294 0.0294 0.0353
## [11] 0.0353 0.0353 0.0353 0.0353 0.0353 0.0353 0.0469 0.0353 0.0353 0.0353
## [21] 0.0294 0.0236 0.0353 0.0353 0.0353 0.0294 0.0294 0.0353 0.0411 0.0294
weightdist<-function(x,y){
  if(length(x)!=length(y))
  {('不符合要求!');
  } else {
    a=wi*(x-y)^2
    weightdist<-a %>% sum() %>% sqrt()
  }
  return(weightdist)
}

    library(e1071)

 errRatio9<-vector()
   for(i in 1:30){
  KnnFit<-gknn(Diagnosis~.,data=train,k=i,method=weightdist)
   fit<-predict(KnnFit,test,type='class')
   CT<-table(test[,1],fit) #计算混淆矩阵
  #计算分类错误率
  errRatio9<-c(errRatio9,(1-sum(diag(CT))/sum(CT))*100)
   }
plot(x=1:30, y= errRatio9, type = 'l', ylab = 'errRatio9', xlab = 'k')
points(1:30,errRatio9,pch=10,col=2)

model<-gknn(Diagnosis~.,data=train,k=3,method=weightdist)
fit<-predict(model,test,type='class')
CT<-table(test[,1],fit)
errRatio<-(1-sum(diag(CT))/sum(CT))*100
errRatio
## [1] 39.4
model<-gknn(Diagnosis~.,data=train,k=5,method=weightdist)
fit<-predict(model,test,type='class')
CT<-table(test[,1],fit)
errRatio<-(1-sum(diag(CT))/sum(CT))*100
errRatio
## [1] 44.4

将分别剔除各指标后的误差绘制成折线图,并且与未剔除时的误差进行对比。误差值越高说明该指标更为重要;若误差值没有影响,则说明该指标对结果毫无影响;若误差值小于原始值,则说明该变量应该被剔除。

data1<-data[,-c(3,6:10,22,23,27,28,31)]

 library(e1071)
 model<-gknn(Diagnosis~.,data=train[,-c(3,6:10,22,23,27,28,31)],k=5,method=weightdist)
fit<-predict(model,test[,-c(3,6:10,22,23,27,28,31)],type='class')
CT<-table(test[,1],fit)
errRatio<-(1-sum(diag(CT))/sum(CT))*100
errRatio
## [1] 40.1
model<-gknn(Diagnosis~.,data=train[,-c(3,6:10,22,23,27,28,31)],k=3,method=weightdist)
fit<-predict(model,test[,-c(3,6:10,22,23,27,28,31)],type='class')
CT<-table(test[,1],fit)
errRatio<-(1-sum(diag(CT))/sum(CT))*100
errRatio
## [1] 37.3

基于加权欧式距离的knn算法(核函数)

#基于加权欧式距离的knn算法
library(kknn)
par(mfrow=c(2,1))
getwd()
## [1] "/Users/huanghuilin/Desktop/360安全云盘同步版/2022/2022/2022年数据挖掘课程/上机/第5次上机 KNN和加权KNN算法实现/上机作业"
tibble(data)
## # A tibble: 569 × 31
##    Diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean
##        <dbl>       <dbl>        <dbl>          <dbl>     <dbl>           <dbl>
##  1         1       0.521       0.0227          0.546     0.364           0.594
##  2         1       0.643       0.273           0.616     0.502           0.290
##  3         1       0.601       0.390           0.596     0.449           0.514
##  4         1       0.210       0.361           0.234     0.103           0.811
##  5         1       0.630       0.157           0.631     0.489           0.430
##  6         1       0.259       0.203           0.268     0.142           0.679
##  7         1       0.533       0.347           0.524     0.380           0.379
##  8         1       0.318       0.376           0.321     0.184           0.598
##  9         1       0.285       0.410           0.302     0.160           0.674
## 10         1       0.259       0.485           0.278     0.141           0.596
## # … with 559 more rows, and 25 more variables: compactness_mean <dbl>,
## #   concavity_mean <dbl>, concave.points_mean <dbl>, symmetry_mean <dbl>,
## #   fractal.dimension_mean <dbl>, radius_sd <dbl>, texture_sd <dbl>,
## #   perimeter_sd <dbl>, area_sd <dbl>, smoothness_sd <dbl>,
## #   compactness_sd <dbl>, concavity_sd <dbl>, concave.points_sd <dbl>,
## #   symmetry_sd <dbl>, fractal.dimension_sd <dbl>, radius_max_mean <dbl>,
## #   texture_max_mean <dbl>, perimeter_max_mean <dbl>, area_max_mean <dbl>, …
train$Diagnosis<-as.factor(train$Diagnosis)
fit<-train.kknn(formula=Diagnosis~.,
                data=train, kmax=20,distance=2,
                kernel=c("rectangular","triangular","gaussian"),
                na.action=na.omit())

#fit
plot(fit$MISCLASS[,1]*100,type="l", ylim = c(2,5),
     main="different errRatio",
     cex.main=0.8,xlab="the number of K",ylab="errRatio")
lines(fit$MISCLASS[,2]*100,lty=2,col=1)
lines(fit$MISCLASS[,3]*100,lty=3,col=2)

#给出图例
legend("topleft",
       legend=c("rectangular","triangular","gaussian"),
       lty=c(1,2,3),col=c(1,1,2),cex=0.7)  



fit<-kknn(formula=Diagnosis~.,train=train,
          test=test,k=9,distance=2,
          kernel="gaussian",na.action=na.omit())
CT<-table(test[,1],fit$fitted.values)
errRatio5 = (1-sum(diag(CT))/sum(CT))*100
errRatio5
## [1] 3.52
errRatio4
## [1] 4.23
errlist = c(errRatio5, errRatio4)
errlist
## [1] 3.52 4.23
rate = (errRatio4-errRatio5)/errRatio4
rate
## [1] 0.167
errGraph<-barplot(errlist,
                  main="Contrast figure(K=9)",
                  cex.main=0.8,xlab="way",
                  ylab="errRatio(%)",axes=FALSE)
axis(side=1,at=c(0,errGraph,3),
     labels=c(" ", "加权K-近邻法","K-近邻法",""),
     tcl=0.25)
axis(side=2,tcl=0.25)

根据交叉检验的结果,最佳的k值为9,在k=9时,未通过欧式距离加权的模型错判概率为4.23,经过了基于加权KNN算法后,错判率达到了3.52。提升率为16.7%,提升效果显著。由此可见,加权KNN算法对于模型的优化有着不错的结果。