library(dplyr)
## Warning: 패키지 'dplyr'는 R 버전 4.2.3에서 작성되었습니다
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Warning: 패키지 'caret'는 R 버전 4.2.3에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: ggplot2
## Warning: 패키지 'ggplot2'는 R 버전 4.2.3에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: lattice
# Point the session at the folder that holds the input file.
# NOTE(review): this absolute path only resolves on a machine that has this
# directory; it is kept because later steps may rely on the working directory.
setwd('c:/data')

# Load the data set from the working directory and print a compact,
# column-by-column overview of it.
wbc <- read.csv(file = "wbc.csv")
glimpse(wbc)
## Rows: 569
## Columns: 32
## $ id                      <int> 842302, 842517, 84300903, 84348301, 84358402, …
## $ diagnosis               <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ radius_mean             <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean            <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean          <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean               <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean         <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean        <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean          <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ concave.points_mean     <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean           <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean  <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se               <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se              <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se            <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se                 <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se           <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se          <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se            <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ concave.points_se       <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se             <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se    <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst            <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst           <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst         <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst              <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst        <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst       <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst         <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ concave.points_worst    <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst          <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…
# Count how many rows carry each raw diagnosis code.
table(wbc[["diagnosis"]])
## 
##   B   M 
## 357 212
# Remove the identifier column from the data frame.
wbc[["id"]] <- NULL

# Classification models only accept a categorical (factor) target variable,
# so recode the diagnosis codes into a two-level factor with readable labels.
wbc[["diagnosis"]] <- factor(
  wbc[["diagnosis"]],
  levels = c("B", "M"),
  labels = c("Benign", "Malignant")
)

# Tabulate again to confirm the class counts after recoding.
table(wbc[["diagnosis"]])
## 
##    Benign Malignant 
##       357       212
# Tip: in RStudio, Shift+Ctrl+M inserts the %>% pipe operator.


# Split the data 70/30 into training and test partitions based on the outcome.
# createDataPartition() samples rows at random, so the RNG seed is fixed first;
# without it `train` and `test` would contain different rows on every run.
set.seed(1234)
idx <- createDataPartition(wbc$diagnosis, p = 0.7, list = FALSE)

# `idx` holds the row positions selected for training; the remaining rows
# form the test partition.
train <- wbc[idx, ]
test <- wbc[-idx, ]


# Resampling scheme: 10-fold cross-validation.
tc <- trainControl(method = "cv", number = 10)

# Fit a k-nearest-neighbours classifier on the training partition only, so
# that the rows in `test` stay unseen and later predictions on `test` are a
# genuine held-out check. Predictors are centered and scaled, and the
# candidate models are compared by accuracy.
#
# The seed makes the random fold assignment reproducible.
#
# In the call below, `train(...)` resolves to caret's function (R skips
# non-function objects when looking up a call), while `data = train` refers
# to the training data frame of the same name.
#
# NOTE: the printed results recorded further down in this file (e.g.
# "569 samples") were produced by a fit on the full data set and will not
# match a fit on the training partition.
set.seed(1234)
knnfit <- train(
  diagnosis ~ .,
  data = train,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = tc,
  metric = "Accuracy"
)

# Auto-print the fitted object: resampling summary, results per candidate k,
# and the k that was finally selected.
knnfit
## k-Nearest Neighbors 
## 
## 569 samples
##  30 predictor
##   2 classes: 'Benign', 'Malignant' 
## 
## Pre-processing: centered (30), scaled (30) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 512, 512, 512, 512, 511, 513, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.9649425  0.9227634
##   7  0.9631568  0.9189172
##   9  0.9684513  0.9309400
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
# Confusion matrix of the cross-validated fit; cells are percentages averaged
# across the resamples.
confusionMatrix(data = knnfit)
## Cross-Validated (10 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##            Reference
## Prediction  Benign Malignant
##   Benign      62.2       2.6
##   Malignant    0.5      34.6
##                             
##  Accuracy (average) : 0.9684
# Class-membership probabilities for every row of `test`
# (one column per class); preview the first six rows.
prefit1 <- predict(knnfit, newdata = test, type = 'prob')
head(prefit1, n = 6L)
##      Benign Malignant
## 1 0.0000000 1.0000000
## 2 0.0000000 1.0000000
## 3 0.1111111 0.8888889
## 4 0.0000000 1.0000000
## 5 0.2222222 0.7777778
## 6 1.0000000 0.0000000
# Predicted class label (a factor) for every row of `test`;
# preview the first six predictions.
prefit2 <- predict(knnfit, newdata = test, type = 'raw')
head(prefit2, n = 6L)
## [1] Malignant Malignant Malignant Malignant Malignant Benign   
## Levels: Benign Malignant