library(dplyr)
## Warning: 패키지 'dplyr'는 R 버전 4.2.3에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: 패키지 'caret'는 R 버전 4.2.3에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: ggplot2
## Warning: 패키지 'ggplot2'는 R 버전 4.2.3에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: lattice
# Read wbc.csv through an explicit path instead of calling setwd():
# changing the working directory is a session-wide side effect that
# silently alters how every later relative path is resolved.
data_dir <- "c:/data"
wbc <- read.csv(file.path(data_dir, "wbc.csv"))
# Compact overview of the column names, types and first values.
glimpse(wbc)
## Rows: 569
## Columns: 32
## $ id <int> 842302, 842517, 84300903, 84348301, 84358402, …
## $ diagnosis <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ radius_mean <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ concave.points_mean <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ concave.points_se <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ concave.points_worst <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…
# Class counts of the raw diagnosis codes.
table(wbc[["diagnosis"]])
##
## B M
## 357 212
# Drop the id column so it is not among the columns used for modelling.
wbc[["id"]] <- NULL
# Classification models require a categorical (factor) target variable.
wbc[["diagnosis"]] <- factor(
  wbc[["diagnosis"]],
  levels = c("B", "M"),
  labels = c("Benign", "Malignant")
)
# Same counts again, now shown under the relabelled factor levels.
table(wbc[["diagnosis"]])
##
## Benign Malignant
## 357 212
# Note: the %>% pipe can be typed with Shift+Ctrl+M.

# Fix the RNG seed so the random partition and the cross-validation folds
# are reproducible from run to run.
set.seed(123)

# 70/30 split on the target; `test` is held out for the predictions made
# later in the script.
idx <- createDataPartition(wbc$diagnosis, p = 0.7, list = FALSE)
train <- wbc[idx, ]
test <- wbc[-idx, ]

tc <- trainControl(method = "cv", number = 10) # 10-fold cross-validation

# Fit k-NN on the training partition only. Fitting on the full `wbc` data
# would let the model see the held-out rows, so any evaluation on `test`
# would be optimistically biased. Predictors are centered and scaled, and
# the tuning value is chosen by cross-validated accuracy.
knnfit <- train(
  diagnosis ~ .,
  data = train,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = tc,
  metric = "Accuracy"
)
# NOTE: the console output pasted below was produced by a fit on all 569
# rows; its sample counts and accuracy figures will not match a rerun.
knnfit
## k-Nearest Neighbors
##
## 569 samples
## 30 predictor
## 2 classes: 'Benign', 'Malignant'
##
## Pre-processing: centered (30), scaled (30)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 512, 512, 512, 512, 511, 513, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9649425 0.9227634
## 7 0.9631568 0.9189172
## 9 0.9684513 0.9309400
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
# Cross-validated confusion matrix for the fitted model; cells are
# percentages averaged across the resamples.
knn_cv_confusion <- confusionMatrix(knnfit)
knn_cv_confusion
## Cross-Validated (10 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction Benign Malignant
## Benign 62.2 2.6
## Malignant 0.5 34.6
##
## Accuracy (average) : 0.9684
# Per-class probabilities for the rows of `test` (one column per class).
prefit1 <- predict(knnfit, newdata = test, type = "prob")
head(prefit1)
## Benign Malignant
## 1 0.0000000 1.0000000
## 2 0.0000000 1.0000000
## 3 0.1111111 0.8888889
## 4 0.0000000 1.0000000
## 5 0.2222222 0.7777778
## 6 1.0000000 0.0000000
# Predicted class labels (a factor) for the rows of `test`.
prefit2 <- predict(knnfit, newdata = test, type = "raw")
head(prefit2)
## [1] Malignant Malignant Malignant Malignant Malignant Benign
## Levels: Benign Malignant