ls()
## character(0)
rm(list = ls())
getwd()
## [1] "D:/data"
#암진단 데이터입니다, 양성과 악성를 분류하는 데이터입니다.
#유의사항 독립변수중 id,X 데이터는 제외한다. 데이터분할은 7:3으로 한다.
#train/test test 데이터의 ROC_ACU 결과를 rmarkdown 단톡방으로 6월18일까지 제출해주세요!
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.9
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.0 v forcats 0.5.1
## Warning: 패키지 'tidyr'는 R 버전 4.1.3에서 작성되었습니다
## Warning: 패키지 'dplyr'는 R 버전 4.1.3에서 작성되었습니다
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## 필요한 패키지를 로딩중입니다: lattice
##
## 다음의 패키지를 부착합니다: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(recipes)
## Warning: 패키지 'recipes'는 R 버전 4.1.3에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'recipes'
## The following object is masked from 'package:stringr':
##
## fixed
## The following object is masked from 'package:stats':
##
## step
library(proxy)
##
## 다음의 패키지를 부착합니다: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## 다음의 패키지를 부착합니다: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
df<-read.csv("wbc.csv")
#1) ID 번호, 2) 진단 (M = 악성, B = 양성) 3-32) 각 세포 핵에 대해 10 개의 실제 값 특징이 계산됩니다
# a) 반경 (경계의 중심에서 점까지의 거리의 평균) b) 질감 (회색 음영 값의 표준 편차) c) 경계
# d) 영역 e) 평활도 (반경 길이의 국부적 변화) f) 컴팩트 함 (둘레 ^ 2 / 면적 - 1.0)
# g) 오목한 부분 (윤곽선의 오목한 부분의 심각도) h) 오목점 (윤곽선의 오목한 부분 수)
# i) 대칭, j) 프랙탈 치수 ( "해안선 근사치"- 1)
train_list<-createDataPartition(y=df$diagnosis,p=0.7,list=FALSE)
head(train_list)
## Resample1
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 6
## [5,] 7
## [6,] 8
df_train<-df[train_list,]
df_test<-df[-train_list,]
NROW(df_train)
## [1] 399
NROW(df_test)
## [1] 170
df_train %>% glimpse
## Rows: 399
## Columns: 33
## $ id <int> 842302, 842517, 84300903, 843786, 844359, 8445~
## $ diagnosis <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "~
## $ radius_mean <dbl> 17.99, 20.57, 19.69, 12.45, 18.25, 13.71, 13.0~
## $ texture_mean <dbl> 10.38, 17.77, 21.25, 15.70, 19.98, 20.83, 21.8~
## $ perimeter_mean <dbl> 122.80, 132.90, 130.00, 82.57, 119.60, 90.20, ~
## $ area_mean <dbl> 1001.0, 1326.0, 1203.0, 477.1, 1040.0, 577.9, ~
## $ smoothness_mean <dbl> 0.11840, 0.08474, 0.10960, 0.12780, 0.09463, 0~
## $ compactness_mean <dbl> 0.27760, 0.07864, 0.15990, 0.17000, 0.10900, 0~
## $ concavity_mean <dbl> 0.30010, 0.08690, 0.19740, 0.15780, 0.11270, 0~
## $ concave.points_mean <dbl> 0.14710, 0.07017, 0.12790, 0.08089, 0.07400, 0~
## $ symmetry_mean <dbl> 0.2419, 0.1812, 0.2069, 0.2087, 0.1794, 0.2196~
## $ fractal_dimension_mean <dbl> 0.07871, 0.05667, 0.05999, 0.07613, 0.05742, 0~
## $ radius_se <dbl> 1.0950, 0.5435, 0.7456, 0.3345, 0.4467, 0.5835~
## $ texture_se <dbl> 0.9053, 0.7339, 0.7869, 0.8902, 0.7732, 1.3770~
## $ perimeter_se <dbl> 8.589, 3.398, 4.585, 2.217, 3.180, 3.856, 2.40~
## $ area_se <dbl> 153.40, 74.08, 94.03, 27.19, 53.91, 50.96, 24.~
## $ smoothness_se <dbl> 0.006399, 0.005225, 0.006150, 0.007510, 0.0043~
## $ compactness_se <dbl> 0.049040, 0.013080, 0.040060, 0.033450, 0.0138~
## $ concavity_se <dbl> 0.05373, 0.01860, 0.03832, 0.03672, 0.02254, 0~
## $ concave.points_se <dbl> 0.015870, 0.013400, 0.020580, 0.011370, 0.0103~
## $ symmetry_se <dbl> 0.03003, 0.01389, 0.02250, 0.02165, 0.01369, 0~
## $ fractal_dimension_se <dbl> 0.006193, 0.003532, 0.004571, 0.005082, 0.0021~
## $ radius_worst <dbl> 25.38, 24.99, 23.57, 15.47, 22.88, 17.06, 15.4~
## $ texture_worst <dbl> 17.33, 23.41, 25.53, 23.75, 27.66, 28.14, 30.7~
## $ perimeter_worst <dbl> 184.6, 158.8, 152.5, 103.4, 153.2, 110.6, 106.~
## $ area_worst <dbl> 2019.0, 1956.0, 1709.0, 741.6, 1606.0, 897.0, ~
## $ smoothness_worst <dbl> 0.1622, 0.1238, 0.1444, 0.1791, 0.1442, 0.1654~
## $ compactness_worst <dbl> 0.6656, 0.1866, 0.4245, 0.5249, 0.2576, 0.3682~
## $ concavity_worst <dbl> 0.7119, 0.2416, 0.4504, 0.5355, 0.3784, 0.2678~
## $ concave.points_worst <dbl> 0.26540, 0.18600, 0.24300, 0.17410, 0.19320, 0~
## $ symmetry_worst <dbl> 0.4601, 0.2750, 0.3613, 0.3985, 0.3063, 0.3196~
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.12440, 0.08368, 0~
## $ X <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
df_test %>% glimpse
## Rows: 170
## Columns: 33
## $ id <int> 84348301, 84358402, 84501001, 84862001, 851065~
## $ diagnosis <chr> "M", "M", "M", "M", "B", "B", "M", "M", "M", "~
## $ radius_mean <dbl> 11.420, 20.290, 12.460, 16.130, 13.080, 9.504,~
## $ texture_mean <dbl> 20.38, 14.34, 24.04, 20.68, 15.71, 12.44, 21.3~
## $ perimeter_mean <dbl> 77.58, 135.10, 83.97, 108.10, 85.63, 60.34, 11~
## $ area_mean <dbl> 386.1, 1297.0, 475.9, 798.8, 520.0, 273.9, 904~
## $ smoothness_mean <dbl> 0.14250, 0.10030, 0.11860, 0.11700, 0.10750, 0~
## $ compactness_mean <dbl> 0.28390, 0.13280, 0.23960, 0.20220, 0.12700, 0~
## $ concavity_mean <dbl> 0.241400, 0.198000, 0.227300, 0.172200, 0.0456~
## $ concave.points_mean <dbl> 0.105200, 0.104300, 0.085430, 0.102800, 0.0311~
## $ symmetry_mean <dbl> 0.2597, 0.1809, 0.2030, 0.2164, 0.1967, 0.1815~
## $ fractal_dimension_mean <dbl> 0.09744, 0.05883, 0.08243, 0.07356, 0.06811, 0~
## $ radius_se <dbl> 0.4956, 0.7572, 0.2976, 0.5692, 0.1852, 0.2773~
## $ texture_se <dbl> 1.1560, 0.7813, 1.5990, 1.0730, 0.7477, 0.9768~
## $ perimeter_se <dbl> 3.445, 5.438, 2.039, 3.854, 1.383, 1.909, 5.45~
## $ area_se <dbl> 27.230, 94.440, 23.940, 54.180, 14.670, 15.700~
## $ smoothness_se <dbl> 0.009110, 0.011490, 0.007149, 0.007026, 0.0040~
## $ compactness_se <dbl> 0.074580, 0.024610, 0.072170, 0.025010, 0.0189~
## $ concavity_se <dbl> 0.056610, 0.056880, 0.077430, 0.031880, 0.0169~
## $ concave.points_se <dbl> 0.018670, 0.018850, 0.014320, 0.012970, 0.0064~
## $ symmetry_se <dbl> 0.059630, 0.017560, 0.017890, 0.016890, 0.0167~
## $ fractal_dimension_se <dbl> 0.009208, 0.005115, 0.010080, 0.004142, 0.0024~
## $ radius_worst <dbl> 14.910, 22.540, 15.090, 20.960, 14.500, 10.230~
## $ texture_worst <dbl> 26.50, 16.67, 40.68, 31.48, 20.49, 15.66, 31.5~
## $ perimeter_worst <dbl> 98.87, 152.20, 97.65, 136.80, 96.09, 65.13, 17~
## $ area_worst <dbl> 567.7, 1575.0, 711.4, 1315.0, 630.5, 314.9, 22~
## $ smoothness_worst <dbl> 0.20980, 0.13740, 0.18530, 0.17890, 0.13120, 0~
## $ compactness_worst <dbl> 0.86630, 0.20500, 1.05800, 0.42330, 0.27760, 0~
## $ concavity_worst <dbl> 0.686900, 0.400000, 1.105000, 0.478400, 0.1890~
## $ concave.points_worst <dbl> 0.25750, 0.16250, 0.22100, 0.20730, 0.07283, 0~
## $ symmetry_worst <dbl> 0.6638, 0.2364, 0.4366, 0.3706, 0.3184, 0.2450~
## $ fractal_dimension_worst <dbl> 0.17300, 0.07678, 0.20750, 0.11420, 0.08183, 0~
## $ X <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
df_train %>% mutate(index="train")->df_train
df_test %>% mutate(index="test")->df_test
bind_rows(df_train,df_test)->full
full %>% select(-id,-X)->full
full %>% glimpse
## Rows: 569
## Columns: 32
## $ diagnosis <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "~
## $ radius_mean <dbl> 17.99, 20.57, 19.69, 12.45, 18.25, 13.71, 13.0~
## $ texture_mean <dbl> 10.38, 17.77, 21.25, 15.70, 19.98, 20.83, 21.8~
## $ perimeter_mean <dbl> 122.80, 132.90, 130.00, 82.57, 119.60, 90.20, ~
## $ area_mean <dbl> 1001.0, 1326.0, 1203.0, 477.1, 1040.0, 577.9, ~
## $ smoothness_mean <dbl> 0.11840, 0.08474, 0.10960, 0.12780, 0.09463, 0~
## $ compactness_mean <dbl> 0.27760, 0.07864, 0.15990, 0.17000, 0.10900, 0~
## $ concavity_mean <dbl> 0.30010, 0.08690, 0.19740, 0.15780, 0.11270, 0~
## $ concave.points_mean <dbl> 0.14710, 0.07017, 0.12790, 0.08089, 0.07400, 0~
## $ symmetry_mean <dbl> 0.2419, 0.1812, 0.2069, 0.2087, 0.1794, 0.2196~
## $ fractal_dimension_mean <dbl> 0.07871, 0.05667, 0.05999, 0.07613, 0.05742, 0~
## $ radius_se <dbl> 1.0950, 0.5435, 0.7456, 0.3345, 0.4467, 0.5835~
## $ texture_se <dbl> 0.9053, 0.7339, 0.7869, 0.8902, 0.7732, 1.3770~
## $ perimeter_se <dbl> 8.589, 3.398, 4.585, 2.217, 3.180, 3.856, 2.40~
## $ area_se <dbl> 153.40, 74.08, 94.03, 27.19, 53.91, 50.96, 24.~
## $ smoothness_se <dbl> 0.006399, 0.005225, 0.006150, 0.007510, 0.0043~
## $ compactness_se <dbl> 0.049040, 0.013080, 0.040060, 0.033450, 0.0138~
## $ concavity_se <dbl> 0.05373, 0.01860, 0.03832, 0.03672, 0.02254, 0~
## $ concave.points_se <dbl> 0.015870, 0.013400, 0.020580, 0.011370, 0.0103~
## $ symmetry_se <dbl> 0.03003, 0.01389, 0.02250, 0.02165, 0.01369, 0~
## $ fractal_dimension_se <dbl> 0.006193, 0.003532, 0.004571, 0.005082, 0.0021~
## $ radius_worst <dbl> 25.38, 24.99, 23.57, 15.47, 22.88, 17.06, 15.4~
## $ texture_worst <dbl> 17.33, 23.41, 25.53, 23.75, 27.66, 28.14, 30.7~
## $ perimeter_worst <dbl> 184.6, 158.8, 152.5, 103.4, 153.2, 110.6, 106.~
## $ area_worst <dbl> 2019.0, 1956.0, 1709.0, 741.6, 1606.0, 897.0, ~
## $ smoothness_worst <dbl> 0.1622, 0.1238, 0.1444, 0.1791, 0.1442, 0.1654~
## $ compactness_worst <dbl> 0.6656, 0.1866, 0.4245, 0.5249, 0.2576, 0.3682~
## $ concavity_worst <dbl> 0.7119, 0.2416, 0.4504, 0.5355, 0.3784, 0.2678~
## $ concave.points_worst <dbl> 0.26540, 0.18600, 0.24300, 0.17410, 0.19320, 0~
## $ symmetry_worst <dbl> 0.4601, 0.2750, 0.3613, 0.3985, 0.3063, 0.3196~
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.12440, 0.08368, 0~
## $ index <chr> "train", "train", "train", "train", "train", "~
full$diagnosis<-as.factor(full$diagnosis)
full %>% glimpse
## Rows: 569
## Columns: 32
## $ diagnosis <fct> M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, B~
## $ radius_mean <dbl> 17.99, 20.57, 19.69, 12.45, 18.25, 13.71, 13.0~
## $ texture_mean <dbl> 10.38, 17.77, 21.25, 15.70, 19.98, 20.83, 21.8~
## $ perimeter_mean <dbl> 122.80, 132.90, 130.00, 82.57, 119.60, 90.20, ~
## $ area_mean <dbl> 1001.0, 1326.0, 1203.0, 477.1, 1040.0, 577.9, ~
## $ smoothness_mean <dbl> 0.11840, 0.08474, 0.10960, 0.12780, 0.09463, 0~
## $ compactness_mean <dbl> 0.27760, 0.07864, 0.15990, 0.17000, 0.10900, 0~
## $ concavity_mean <dbl> 0.30010, 0.08690, 0.19740, 0.15780, 0.11270, 0~
## $ concave.points_mean <dbl> 0.14710, 0.07017, 0.12790, 0.08089, 0.07400, 0~
## $ symmetry_mean <dbl> 0.2419, 0.1812, 0.2069, 0.2087, 0.1794, 0.2196~
## $ fractal_dimension_mean <dbl> 0.07871, 0.05667, 0.05999, 0.07613, 0.05742, 0~
## $ radius_se <dbl> 1.0950, 0.5435, 0.7456, 0.3345, 0.4467, 0.5835~
## $ texture_se <dbl> 0.9053, 0.7339, 0.7869, 0.8902, 0.7732, 1.3770~
## $ perimeter_se <dbl> 8.589, 3.398, 4.585, 2.217, 3.180, 3.856, 2.40~
## $ area_se <dbl> 153.40, 74.08, 94.03, 27.19, 53.91, 50.96, 24.~
## $ smoothness_se <dbl> 0.006399, 0.005225, 0.006150, 0.007510, 0.0043~
## $ compactness_se <dbl> 0.049040, 0.013080, 0.040060, 0.033450, 0.0138~
## $ concavity_se <dbl> 0.05373, 0.01860, 0.03832, 0.03672, 0.02254, 0~
## $ concave.points_se <dbl> 0.015870, 0.013400, 0.020580, 0.011370, 0.0103~
## $ symmetry_se <dbl> 0.03003, 0.01389, 0.02250, 0.02165, 0.01369, 0~
## $ fractal_dimension_se <dbl> 0.006193, 0.003532, 0.004571, 0.005082, 0.0021~
## $ radius_worst <dbl> 25.38, 24.99, 23.57, 15.47, 22.88, 17.06, 15.4~
## $ texture_worst <dbl> 17.33, 23.41, 25.53, 23.75, 27.66, 28.14, 30.7~
## $ perimeter_worst <dbl> 184.6, 158.8, 152.5, 103.4, 153.2, 110.6, 106.~
## $ area_worst <dbl> 2019.0, 1956.0, 1709.0, 741.6, 1606.0, 897.0, ~
## $ smoothness_worst <dbl> 0.1622, 0.1238, 0.1444, 0.1791, 0.1442, 0.1654~
## $ compactness_worst <dbl> 0.6656, 0.1866, 0.4245, 0.5249, 0.2576, 0.3682~
## $ concavity_worst <dbl> 0.7119, 0.2416, 0.4504, 0.5355, 0.3784, 0.2678~
## $ concave.points_worst <dbl> 0.26540, 0.18600, 0.24300, 0.17410, 0.19320, 0~
## $ symmetry_worst <dbl> 0.4601, 0.2750, 0.3613, 0.3985, 0.3063, 0.3196~
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.12440, 0.08368, 0~
## $ index <chr> "train", "train", "train", "train", "train", "~
colSums(is.na(full))
## diagnosis radius_mean texture_mean
## 0 0 0
## perimeter_mean area_mean smoothness_mean
## 0 0 0
## compactness_mean concavity_mean concave.points_mean
## 0 0 0
## symmetry_mean fractal_dimension_mean radius_se
## 0 0 0
## texture_se perimeter_se area_se
## 0 0 0
## smoothness_se compactness_se concavity_se
## 0 0 0
## concave.points_se symmetry_se fractal_dimension_se
## 0 0 0
## radius_worst texture_worst perimeter_worst
## 0 0 0
## area_worst smoothness_worst compactness_worst
## 0 0 0
## concavity_worst concave.points_worst symmetry_worst
## 0 0 0
## fractal_dimension_worst index
## 0 0
summary(is.na(full))
## diagnosis radius_mean texture_mean perimeter_mean
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:569 FALSE:569 FALSE:569 FALSE:569
## area_mean smoothness_mean compactness_mean concavity_mean
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:569 FALSE:569 FALSE:569 FALSE:569
## concave.points_mean symmetry_mean fractal_dimension_mean radius_se
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:569 FALSE:569 FALSE:569 FALSE:569
## texture_se perimeter_se area_se smoothness_se
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:569 FALSE:569 FALSE:569 FALSE:569
## compactness_se concavity_se concave.points_se symmetry_se
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:569 FALSE:569 FALSE:569 FALSE:569
## fractal_dimension_se radius_worst texture_worst perimeter_worst
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:569 FALSE:569 FALSE:569 FALSE:569
## area_worst smoothness_worst compactness_worst concavity_worst
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:569 FALSE:569 FALSE:569 FALSE:569
## concave.points_worst symmetry_worst fractal_dimension_worst index
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:569 FALSE:569 FALSE:569 FALSE:569
names(full)
## [1] "diagnosis" "radius_mean"
## [3] "texture_mean" "perimeter_mean"
## [5] "area_mean" "smoothness_mean"
## [7] "compactness_mean" "concavity_mean"
## [9] "concave.points_mean" "symmetry_mean"
## [11] "fractal_dimension_mean" "radius_se"
## [13] "texture_se" "perimeter_se"
## [15] "area_se" "smoothness_se"
## [17] "compactness_se" "concavity_se"
## [19] "concave.points_se" "symmetry_se"
## [21] "fractal_dimension_se" "radius_worst"
## [23] "texture_worst" "perimeter_worst"
## [25] "area_worst" "smoothness_worst"
## [27] "compactness_worst" "concavity_worst"
## [29] "concave.points_worst" "symmetry_worst"
## [31] "fractal_dimension_worst" "index"
full %>% glimpse
## Rows: 569
## Columns: 32
## $ diagnosis <fct> M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, B~
## $ radius_mean <dbl> 17.99, 20.57, 19.69, 12.45, 18.25, 13.71, 13.0~
## $ texture_mean <dbl> 10.38, 17.77, 21.25, 15.70, 19.98, 20.83, 21.8~
## $ perimeter_mean <dbl> 122.80, 132.90, 130.00, 82.57, 119.60, 90.20, ~
## $ area_mean <dbl> 1001.0, 1326.0, 1203.0, 477.1, 1040.0, 577.9, ~
## $ smoothness_mean <dbl> 0.11840, 0.08474, 0.10960, 0.12780, 0.09463, 0~
## $ compactness_mean <dbl> 0.27760, 0.07864, 0.15990, 0.17000, 0.10900, 0~
## $ concavity_mean <dbl> 0.30010, 0.08690, 0.19740, 0.15780, 0.11270, 0~
## $ concave.points_mean <dbl> 0.14710, 0.07017, 0.12790, 0.08089, 0.07400, 0~
## $ symmetry_mean <dbl> 0.2419, 0.1812, 0.2069, 0.2087, 0.1794, 0.2196~
## $ fractal_dimension_mean <dbl> 0.07871, 0.05667, 0.05999, 0.07613, 0.05742, 0~
## $ radius_se <dbl> 1.0950, 0.5435, 0.7456, 0.3345, 0.4467, 0.5835~
## $ texture_se <dbl> 0.9053, 0.7339, 0.7869, 0.8902, 0.7732, 1.3770~
## $ perimeter_se <dbl> 8.589, 3.398, 4.585, 2.217, 3.180, 3.856, 2.40~
## $ area_se <dbl> 153.40, 74.08, 94.03, 27.19, 53.91, 50.96, 24.~
## $ smoothness_se <dbl> 0.006399, 0.005225, 0.006150, 0.007510, 0.0043~
## $ compactness_se <dbl> 0.049040, 0.013080, 0.040060, 0.033450, 0.0138~
## $ concavity_se <dbl> 0.05373, 0.01860, 0.03832, 0.03672, 0.02254, 0~
## $ concave.points_se <dbl> 0.015870, 0.013400, 0.020580, 0.011370, 0.0103~
## $ symmetry_se <dbl> 0.03003, 0.01389, 0.02250, 0.02165, 0.01369, 0~
## $ fractal_dimension_se <dbl> 0.006193, 0.003532, 0.004571, 0.005082, 0.0021~
## $ radius_worst <dbl> 25.38, 24.99, 23.57, 15.47, 22.88, 17.06, 15.4~
## $ texture_worst <dbl> 17.33, 23.41, 25.53, 23.75, 27.66, 28.14, 30.7~
## $ perimeter_worst <dbl> 184.6, 158.8, 152.5, 103.4, 153.2, 110.6, 106.~
## $ area_worst <dbl> 2019.0, 1956.0, 1709.0, 741.6, 1606.0, 897.0, ~
## $ smoothness_worst <dbl> 0.1622, 0.1238, 0.1444, 0.1791, 0.1442, 0.1654~
## $ compactness_worst <dbl> 0.6656, 0.1866, 0.4245, 0.5249, 0.2576, 0.3682~
## $ concavity_worst <dbl> 0.7119, 0.2416, 0.4504, 0.5355, 0.3784, 0.2678~
## $ concave.points_worst <dbl> 0.26540, 0.18600, 0.24300, 0.17410, 0.19320, 0~
## $ symmetry_worst <dbl> 0.4601, 0.2750, 0.3613, 0.3985, 0.3063, 0.3196~
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.12440, 0.08368, 0~
## $ index <chr> "train", "train", "train", "train", "train", "~
recipe(diagnosis~.,data=full) %>%
step_YeoJohnson(radius_mean,texture_mean,perimeter_mean,area_mean,
smoothness_mean, compactness_mean, concavity_mean,
concave.points_mean, symmetry_mean, fractal_dimension_mean,
radius_se, texture_se, perimeter_se, area_se, smoothness_se,
compactness_se, concavity_se, concave.points_se, symmetry_se,
fractal_dimension_se,
radius_worst, texture_worst, perimeter_worst, area_worst,
smoothness_worst, compactness_worst, concavity_worst,
concave.points_worst, symmetry_worst, fractal_dimension_worst) %>%
step_center(radius_mean,texture_mean,perimeter_mean,area_mean,
smoothness_mean, compactness_mean, concavity_mean,
concave.points_mean, symmetry_mean, fractal_dimension_mean,
radius_se, texture_se, perimeter_se, area_se, smoothness_se,
compactness_se, concavity_se, concave.points_se, symmetry_se,
fractal_dimension_se,
radius_worst, texture_worst, perimeter_worst, area_worst,
smoothness_worst, compactness_worst, concavity_worst,
concave.points_worst, symmetry_worst, fractal_dimension_worst) %>%
step_scale(radius_mean,texture_mean,perimeter_mean,area_mean,
smoothness_mean, compactness_mean, concavity_mean,
concave.points_mean, symmetry_mean, fractal_dimension_mean,
radius_se, texture_se, perimeter_se, area_se, smoothness_se,
compactness_se, concavity_se, concave.points_se, symmetry_se,
fractal_dimension_se,
radius_worst, texture_worst, perimeter_worst, area_worst,
smoothness_worst, compactness_worst, concavity_worst,
concave.points_worst, symmetry_worst, fractal_dimension_worst) %>%
prep() %>% juice()->data
data %>% glimpse
## Rows: 569
## Columns: 32
## $ radius_mean <dbl> 1.133883785, 1.617924232, 1.463509328, -0.3677~
## $ texture_mean <dbl> -2.6763108, -0.2641451, 0.5473245, -0.8241080,~
## $ perimeter_mean <dbl> 1.25871524, 1.52738014, 1.45338613, -0.2510152~
## $ area_mean <dbl> 1.1254308234, 1.6325123335, 1.4603611261, -0.3~
## $ smoothness_mean <dbl> 1.56708746, -0.82623545, 0.94138212, 2.2354545~
## $ compactness_mean <dbl> 3.28062806, -0.48664348, 1.05199990, 1.2432415~
## $ concavity_mean <dbl> 2.65054179, -0.02382489, 1.36227979, 0.8655400~
## $ concave.points_mean <dbl> 2.53024886, 0.54766227, 2.03543978, 0.82393067~
## $ symmetry_mean <dbl> 2.215565542, 0.001391139, 0.938858720, 1.00451~
## $ fractal_dimension_mean <dbl> 2.2537638, -0.8678888, -0.3976580, 1.8883435, ~
## $ radius_se <dbl> 1.891121233, 0.957773674, 1.456535070, 0.00581~
## $ texture_se <dbl> -0.49776327, -0.99743439, -0.83324887, -0.5383~
## $ perimeter_se <dbl> 1.97134979, 0.73176211, 1.21196297, -0.0841302~
## $ area_se <dbl> 1.8744596, 1.2757387, 1.4947914, 0.0432361, 0.~
## $ smoothness_se <dbl> -0.21381351, -0.60481867, -0.29674391, 0.15620~
## $ compactness_se <dbl> 1.31570389, -0.69231710, 0.81425704, 0.4451519~
## $ concavity_se <dbl> 0.72338965, -0.44039256, 0.21288911, 0.1598845~
## $ concave.points_se <dbl> 0.66023900, 0.25993335, 1.42357487, -0.0690627~
## $ symmetry_se <dbl> 1.14774677, -0.80474229, 0.23682715, 0.1340009~
## $ fractal_dimension_se <dbl> 0.90628565, -0.09935632, 0.29330133, 0.4864178~
## $ radius_worst <dbl> 1.61821085, 1.57730024, 1.41850851, 0.05600571~
## $ texture_worst <dbl> -1.48705971, -0.28812729, 0.07134428, -0.22876~
## $ perimeter_worst <dbl> 1.808915062, 1.429358435, 1.320779419, 0.11951~
## $ area_worst <dbl> 1.650761073, 1.608609957, 1.424055444, 0.07894~
## $ smoothness_worst <dbl> 1.30653666, -0.37528175, 0.52694375, 2.0467119~
## $ compactness_worst <dbl> 1.94102968, -0.29631980, 1.20863781, 1.5744313~
## $ concavity_worst <dbl> 1.72866091, 0.07068341, 1.00462771, 1.28100735~
## $ concave.points_worst <dbl> 1.93395138, 1.10062517, 1.72122879, 0.95470737~
## $ symmetry_worst <dbl> 2.7482041, -0.2436753, 1.1512420, 1.7525273, 0~
## $ fractal_dimension_worst <dbl> 1.93531174, 0.28094279, 0.20121416, 2.23983079~
## $ index <fct> train, train, train, train, train, train, trai~
## $ diagnosis <fct> M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, B~
names(data)
## [1] "radius_mean" "texture_mean"
## [3] "perimeter_mean" "area_mean"
## [5] "smoothness_mean" "compactness_mean"
## [7] "concavity_mean" "concave.points_mean"
## [9] "symmetry_mean" "fractal_dimension_mean"
## [11] "radius_se" "texture_se"
## [13] "perimeter_se" "area_se"
## [15] "smoothness_se" "compactness_se"
## [17] "concavity_se" "concave.points_se"
## [19] "symmetry_se" "fractal_dimension_se"
## [21] "radius_worst" "texture_worst"
## [23] "perimeter_worst" "area_worst"
## [25] "smoothness_worst" "compactness_worst"
## [27] "concavity_worst" "concave.points_worst"
## [29] "symmetry_worst" "fractal_dimension_worst"
## [31] "index" "diagnosis"
data %>% filter(index=="train") %>% select(-index)->train
data %>% filter(index=="test") %>% select(-index)->test
train %>% glimpse
## Rows: 399
## Columns: 31
## $ radius_mean <dbl> 1.133883785, 1.617924232, 1.463509328, -0.3677~
## $ texture_mean <dbl> -2.6763108, -0.2641451, 0.5473245, -0.8241080,~
## $ perimeter_mean <dbl> 1.25871524, 1.52738014, 1.45338613, -0.2510152~
## $ area_mean <dbl> 1.1254308234, 1.6325123335, 1.4603611261, -0.3~
## $ smoothness_mean <dbl> 1.56708746, -0.82623545, 0.94138212, 2.2354545~
## $ compactness_mean <dbl> 3.28062806, -0.48664348, 1.05199990, 1.2432415~
## $ concavity_mean <dbl> 2.65054179, -0.02382489, 1.36227979, 0.8655400~
## $ concave.points_mean <dbl> 2.53024886, 0.54766227, 2.03543978, 0.82393067~
## $ symmetry_mean <dbl> 2.215565542, 0.001391139, 0.938858720, 1.00451~
## $ fractal_dimension_mean <dbl> 2.2537638, -0.8678888, -0.3976580, 1.8883435, ~
## $ radius_se <dbl> 1.891121233, 0.957773674, 1.456535070, 0.00581~
## $ texture_se <dbl> -0.49776327, -0.99743439, -0.83324887, -0.5383~
## $ perimeter_se <dbl> 1.97134979, 0.73176211, 1.21196297, -0.0841302~
## $ area_se <dbl> 1.8744596, 1.2757387, 1.4947914, 0.0432361, 0.~
## $ smoothness_se <dbl> -0.21381351, -0.60481867, -0.29674391, 0.15620~
## $ compactness_se <dbl> 1.31570389, -0.69231710, 0.81425704, 0.4451519~
## $ concavity_se <dbl> 0.72338965, -0.44039256, 0.21288911, 0.1598845~
## $ concave.points_se <dbl> 0.66023900, 0.25993335, 1.42357487, -0.0690627~
## $ symmetry_se <dbl> 1.14774677, -0.80474229, 0.23682715, 0.1340009~
## $ fractal_dimension_se <dbl> 0.90628565, -0.09935632, 0.29330133, 0.4864178~
## $ radius_worst <dbl> 1.61821085, 1.57730024, 1.41850851, 0.05600571~
## $ texture_worst <dbl> -1.48705971, -0.28812729, 0.07134428, -0.22876~
## $ perimeter_worst <dbl> 1.808915062, 1.429358435, 1.320779419, 0.11951~
## $ area_worst <dbl> 1.650761073, 1.608609957, 1.424055444, 0.07894~
## $ smoothness_worst <dbl> 1.30653666, -0.37528175, 0.52694375, 2.0467119~
## $ compactness_worst <dbl> 1.94102968, -0.29631980, 1.20863781, 1.5744313~
## $ concavity_worst <dbl> 1.72866091, 0.07068341, 1.00462771, 1.28100735~
## $ concave.points_worst <dbl> 1.93395138, 1.10062517, 1.72122879, 0.95470737~
## $ symmetry_worst <dbl> 2.7482041, -0.2436753, 1.1512420, 1.7525273, 0~
## $ fractal_dimension_worst <dbl> 1.93531174, 0.28094279, 0.20121416, 2.23983079~
## $ diagnosis <fct> M, M, M, M, M, M, M, M, M, M, M, M, M, M, M, B~
test %>% glimpse
## Rows: 170
## Columns: 31
## $ radius_mean <dbl> -0.75859441, 1.56988048, -0.36416505, 0.715646~
## $ texture_mean <dbl> 0.35740629, -1.23243596, 1.10845128, 0.4237809~
## $ perimeter_mean <dbl> -0.51443375, 1.58194943, -0.18125733, 0.803825~
## $ area_mean <dbl> -0.83550310, 1.59371956, -0.37066306, 0.696029~
## $ smoothness_mean <dbl> 3.28066684, 0.28012535, 1.58130803, 1.46754343~
## $ compactness_mean <dbl> 3.39991742, 0.53886631, 2.56110495, 1.85294273~
## $ concavity_mean <dbl> 1.9142129, 1.3698061, 1.7373434, 1.0461727, -0~
## $ concave.points_mean <dbl> 1.4504311, 1.4272370, 0.9409324, 1.3885800, -0~
## $ symmetry_mean <dbl> 2.864862154, -0.009552062, 0.796597103, 1.2853~
## $ fractal_dimension_mean <dbl> 4.90660199, -0.56195552, 2.78064892, 1.5243395~
## $ radius_se <dbl> 0.790980012, 1.477705479, -0.241027598, 1.0379~
## $ texture_se <dbl> 0.09693632, -0.85015993, 0.88023086, -0.084873~
## $ perimeter_se <dbl> 0.7554983, 1.4504094, -0.2602392, 0.9431884, -~
## $ area_se <dbl> 0.0454752, 1.4985659, -0.1561116, 0.9491189, -~
## $ smoothness_se <dbl> 0.689095329, 1.481763364, 0.035976834, -0.0049~
## $ compactness_se <dbl> 2.74186785, -0.04847723, 2.60729247, -0.026141~
## $ concavity_se <dbl> 0.8187979283, 0.8277424542, 1.5085202629, -0.0~
## $ concave.points_se <dbl> 1.114026779, 1.143198850, 0.409035052, 0.19024~
## $ symmetry_se <dbl> 4.72851977, -0.36077483, -0.32085405, -0.44182~
## $ fractal_dimension_se <dbl> 2.045710868, 0.498889164, 2.375256073, 0.13117~
## $ radius_worst <dbl> -0.08361851, 1.29258941, -0.03778559, 1.078894~
## $ texture_worst <dbl> 0.22788904, -1.63644398, 2.11071757, 0.9658412~
## $ perimeter_worst <dbl> -0.03944504, 1.31542433, -0.08432038, 1.014820~
## $ area_worst <dbl> -0.436477155, 1.308335943, 0.001718328, 1.0412~
## $ smoothness_worst <dbl> 3.391290721, 0.220362270, 2.318255541, 2.03795~
## $ compactness_worst <dbl> 2.28027115, -0.13171342, 2.48549100, 1.2035759~
## $ concavity_worst <dbl> 1.673614727, 0.816755340, 2.353607940, 1.10082~
## $ concave.points_worst <dbl> 1.8607404, 0.8063672, 1.4960429, 1.3469812, -0~
## $ symmetry_worst <dbl> 6.04072615, -0.86758960, 2.36835989, 1.3015633~
## $ fractal_dimension_worst <dbl> 4.93067187, -0.39675052, 6.84083682, 1.6750863~
## $ diagnosis <fct> M, M, M, M, B, B, M, M, M, M, M, M, B, M, M, B~
ctrl<-trainControl(method="cv",summaryFunction=twoClassSummary,classProbs=TRUE)
train(diagnosis~.,data=train, method="rpart",metric="ROC",trControl=ctrl)->rpfit
rpfit
## CART
##
## 399 samples
## 30 predictor
## 2 classes: 'B', 'M'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 359, 359, 359, 359, 359, 359, ...
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.02013423 0.9303429 0.928 0.8990476
## 0.04026846 0.9251429 0.928 0.9123810
## 0.82550336 0.7766667 0.940 0.6133333
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.02013423.
confusionMatrix(rpfit)
## Cross-Validated (10 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction B M
## B 58.1 3.8
## M 4.5 33.6
##
## Accuracy (average) : 0.9173
test %>% glimpse
## Rows: 170
## Columns: 31
## $ radius_mean <dbl> -0.75859441, 1.56988048, -0.36416505, 0.715646~
## $ texture_mean <dbl> 0.35740629, -1.23243596, 1.10845128, 0.4237809~
## $ perimeter_mean <dbl> -0.51443375, 1.58194943, -0.18125733, 0.803825~
## $ area_mean <dbl> -0.83550310, 1.59371956, -0.37066306, 0.696029~
## $ smoothness_mean <dbl> 3.28066684, 0.28012535, 1.58130803, 1.46754343~
## $ compactness_mean <dbl> 3.39991742, 0.53886631, 2.56110495, 1.85294273~
## $ concavity_mean <dbl> 1.9142129, 1.3698061, 1.7373434, 1.0461727, -0~
## $ concave.points_mean <dbl> 1.4504311, 1.4272370, 0.9409324, 1.3885800, -0~
## $ symmetry_mean <dbl> 2.864862154, -0.009552062, 0.796597103, 1.2853~
## $ fractal_dimension_mean <dbl> 4.90660199, -0.56195552, 2.78064892, 1.5243395~
## $ radius_se <dbl> 0.790980012, 1.477705479, -0.241027598, 1.0379~
## $ texture_se <dbl> 0.09693632, -0.85015993, 0.88023086, -0.084873~
## $ perimeter_se <dbl> 0.7554983, 1.4504094, -0.2602392, 0.9431884, -~
## $ area_se <dbl> 0.0454752, 1.4985659, -0.1561116, 0.9491189, -~
## $ smoothness_se <dbl> 0.689095329, 1.481763364, 0.035976834, -0.0049~
## $ compactness_se <dbl> 2.74186785, -0.04847723, 2.60729247, -0.026141~
## $ concavity_se <dbl> 0.8187979283, 0.8277424542, 1.5085202629, -0.0~
## $ concave.points_se <dbl> 1.114026779, 1.143198850, 0.409035052, 0.19024~
## $ symmetry_se <dbl> 4.72851977, -0.36077483, -0.32085405, -0.44182~
## $ fractal_dimension_se <dbl> 2.045710868, 0.498889164, 2.375256073, 0.13117~
## $ radius_worst <dbl> -0.08361851, 1.29258941, -0.03778559, 1.078894~
## $ texture_worst <dbl> 0.22788904, -1.63644398, 2.11071757, 0.9658412~
## $ perimeter_worst <dbl> -0.03944504, 1.31542433, -0.08432038, 1.014820~
## $ area_worst <dbl> -0.436477155, 1.308335943, 0.001718328, 1.0412~
## $ smoothness_worst <dbl> 3.391290721, 0.220362270, 2.318255541, 2.03795~
## $ compactness_worst <dbl> 2.28027115, -0.13171342, 2.48549100, 1.2035759~
## $ concavity_worst <dbl> 1.673614727, 0.816755340, 2.353607940, 1.10082~
## $ concave.points_worst <dbl> 1.8607404, 0.8063672, 1.4960429, 1.3469812, -0~
## $ symmetry_worst <dbl> 6.04072615, -0.86758960, 2.36835989, 1.3015633~
## $ fractal_dimension_worst <dbl> 4.93067187, -0.39675052, 6.84083682, 1.6750863~
## $ diagnosis <fct> M, M, M, M, B, B, M, M, M, M, M, M, B, M, M, B~
predict(rpfit,test,type="prob")->rffit1
predict(rpfit,test,type="raw")->rffit2
head(rffit1)
## B M
## 1 0.962809917 0.03719008
## 2 0.008403361 0.99159664
## 3 0.962809917 0.03719008
## 4 0.008403361 0.99159664
## 5 0.962809917 0.03719008
## 6 0.962809917 0.03719008
head(rffit2)
## [1] B M B M B B
## Levels: B M
confusionMatrix(rffit2,test$diagnosis)
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 102 11
## M 5 52
##
## Accuracy : 0.9059
## 95% CI : (0.8517, 0.9452)
## No Information Rate : 0.6294
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7942
##
## Mcnemar's Test P-Value : 0.2113
##
## Sensitivity : 0.9533
## Specificity : 0.8254
## Pos Pred Value : 0.9027
## Neg Pred Value : 0.9123
## Prevalence : 0.6294
## Detection Rate : 0.6000
## Detection Prevalence : 0.6647
## Balanced Accuracy : 0.8893
##
## 'Positive' Class : B
##
importance<-varImp(rpfit,scale=FALSE)
print(importance)
## rpart variable importance
##
## only 20 most important variables shown (out of 30)
##
## Overall
## perimeter_worst 148.877
## radius_worst 145.295
## concave.points_worst 137.543
## area_worst 133.384
## concave.points_mean 132.678
## texture_worst 20.175
## smoothness_worst 9.799
## texture_mean 8.431
## symmetry_worst 6.225
## area_mean 6.225
## smoothness_se 0.000
## radius_mean 0.000
## perimeter_mean 0.000
## fractal_dimension_mean 0.000
## compactness_se 0.000
## fractal_dimension_worst 0.000
## compactness_mean 0.000
## area_se 0.000
## compactness_worst 0.000
## radius_se 0.000
rffit2_num<-as.numeric(rffit2)
result<-roc(test$diagnosis,rffit2_num)
## Setting levels: control = B, case = M
## Setting direction: controls < cases
result
##
## Call:
## roc.default(response = test$diagnosis, predictor = rffit2_num)
##
## Data: rffit2_num in 107 controls (test$diagnosis B) < 63 cases (test$diagnosis M).
## Area under the curve: 0.8893
result$auc
## Area under the curve: 0.8893