library(readr)
# Read the breast cancer CSV into a tibble (readr reports 569 rows and 32
# columns: one character column, diagnosis, and 31 numeric columns).
# NOTE(review): the absolute path below is specific to one machine; point it
# at your own copy of the file before running the script elsewhere.
wisc_bc_data <- read_csv(
  file = "C:/Users/dnred/Downloads/wisc_bc_data.csv"
)
## Rows: 569 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): diagnosis
## dbl (31): id, radius_mean, texture_mean, perimeter_mean, area_mean, smoothne...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style viewer only when a person is driving the session.
# View() needs a display, so calling it unconditionally breaks or is wasted
# when the script is run non-interactively (Rscript, batch jobs).
if (interactive()) {
  View(wisc_bc_data)
}
# Short working alias for the loaded data.
wd <- wisc_bc_data
# Inspect the column names, types and first few values.
str(wd)
## spc_tbl_ [569 × 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : num [1:569] 842302 842517 84300903 84348301 84358402 ...
## $ diagnosis : chr [1:569] "M" "M" "M" "M" ...
## $ radius_mean : num [1:569] 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num [1:569] 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num [1:569] 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num [1:569] 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num [1:569] 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num [1:569] 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num [1:569] 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave points_mean : num [1:569] 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num [1:569] 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num [1:569] 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num [1:569] 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num [1:569] 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num [1:569] 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num [1:569] 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num [1:569] 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num [1:569] 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num [1:569] 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave points_se : num [1:569] 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num [1:569] 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num [1:569] 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num [1:569] 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num [1:569] 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num [1:569] 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num [1:569] 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num [1:569] 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num [1:569] 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num [1:569] 0.712 0.242 0.45 0.687 0.4 ...
## $ concave points_worst : num [1:569] 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num [1:569] 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num [1:569] 0.1189 0.089 0.0876 0.173 0.0768 ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_double(),
## .. diagnosis = col_character(),
## .. radius_mean = col_double(),
## .. texture_mean = col_double(),
## .. perimeter_mean = col_double(),
## .. area_mean = col_double(),
## .. smoothness_mean = col_double(),
## .. compactness_mean = col_double(),
## .. concavity_mean = col_double(),
## .. `concave points_mean` = col_double(),
## .. symmetry_mean = col_double(),
## .. fractal_dimension_mean = col_double(),
## .. radius_se = col_double(),
## .. texture_se = col_double(),
## .. perimeter_se = col_double(),
## .. area_se = col_double(),
## .. smoothness_se = col_double(),
## .. compactness_se = col_double(),
## .. concavity_se = col_double(),
## .. `concave points_se` = col_double(),
## .. symmetry_se = col_double(),
## .. fractal_dimension_se = col_double(),
## .. radius_worst = col_double(),
## .. texture_worst = col_double(),
## .. perimeter_worst = col_double(),
## .. area_worst = col_double(),
## .. smoothness_worst = col_double(),
## .. compactness_worst = col_double(),
## .. concavity_worst = col_double(),
## .. `concave points_worst` = col_double(),
## .. symmetry_worst = col_double(),
## .. fractal_dimension_worst = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Work on a copy so the original data stays untouched, and drop the first
# column (the id) from that copy.
wd1 <- wd[-1]
# Only open the viewer in interactive sessions; View() needs a display and
# would fail or be wasted when the script runs non-interactively.
if (interactive()) {
  View(wd1)
}
# Print every value of the diagnosis column.
wd1$diagnosis
## [1] "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M"
## [19] "M" "B" "B" "B" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M"
## [37] "M" "B" "M" "M" "M" "M" "M" "M" "M" "M" "B" "M" "B" "B" "B" "B" "B" "M"
## [55] "M" "B" "M" "M" "B" "B" "B" "B" "M" "B" "M" "M" "B" "B" "B" "B" "M" "B"
## [73] "M" "M" "B" "M" "B" "M" "M" "B" "B" "B" "M" "M" "B" "M" "M" "M" "B" "B"
## [91] "B" "M" "B" "B" "M" "M" "B" "B" "B" "M" "M" "B" "B" "B" "B" "M" "B" "B"
## [109] "M" "B" "B" "B" "B" "B" "B" "B" "B" "M" "M" "M" "B" "M" "M" "B" "B" "B"
## [127] "M" "M" "B" "M" "B" "M" "M" "B" "M" "M" "B" "B" "M" "B" "B" "M" "B" "B"
## [145] "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "M"
## [163] "M" "B" "M" "B" "B" "M" "M" "B" "B" "M" "M" "B" "B" "B" "B" "M" "B" "B"
## [181] "M" "M" "M" "B" "M" "B" "M" "B" "B" "B" "M" "B" "B" "M" "M" "B" "M" "M"
## [199] "M" "M" "B" "M" "M" "M" "B" "M" "B" "M" "B" "B" "M" "B" "M" "M" "M" "M"
## [217] "B" "B" "M" "M" "B" "B" "B" "M" "B" "B" "B" "B" "B" "M" "M" "B" "B" "M"
## [235] "B" "B" "M" "M" "B" "M" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "M" "B"
## [253] "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "B" "B" "B" "B"
## [271] "B" "B" "M" "B" "M" "B" "B" "M" "B" "B" "M" "B" "M" "M" "B" "B" "B" "B"
## [289] "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "B" "B"
## [307] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "M" "B" "M"
## [325] "B" "B" "B" "B" "M" "M" "M" "B" "B" "B" "B" "M" "B" "M" "B" "M" "B" "B"
## [343] "B" "M" "B" "B" "B" "B" "B" "B" "B" "M" "M" "M" "B" "B" "B" "B" "B" "B"
## [361] "B" "B" "B" "B" "B" "M" "M" "B" "M" "M" "M" "B" "M" "M" "B" "B" "B" "B"
## [379] "B" "M" "B" "B" "B" "B" "B" "M" "B" "B" "B" "M" "B" "B" "M" "M" "B" "B"
## [397] "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B"
## [415] "M" "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B"
## [433] "M" "M" "B" "M" "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "B" "M"
## [451] "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "M" "M" "B" "B" "B" "B" "B" "B"
## [469] "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "B"
## [487] "B" "M" "B" "M" "B" "B" "M" "B" "B" "B" "B" "B" "M" "M" "B" "M" "B" "M"
## [505] "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "M" "M" "B" "B" "B" "M"
## [523] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "M" "M" "B" "B" "B"
## [541] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B"
## [559] "B" "B" "B" "B" "M" "M" "M" "M" "M" "M" "B"
# Count how many observations fall into each diagnosis category.
table(wd1[["diagnosis"]])
##
## B M
## 357 212
# Proportion of observations in each diagnosis category.
wd1$diagnosis |>
  table() |>
  prop.table()
##
## B M
## 0.6274165 0.3725835
# Summary statistics (min, quartiles, mean, max) for three numeric features.
summary(wd1[, c("radius_mean", "area_mean", "smoothness_mean")])
## radius_mean area_mean smoothness_mean
## Min. : 6.981 Min. : 143.5 Min. :0.05263
## 1st Qu.:11.700 1st Qu.: 420.3 1st Qu.:0.08637
## Median :13.370 Median : 551.1 Median :0.09587
## Mean :14.127 Mean : 654.9 Mean :0.09636
## 3rd Qu.:15.780 3rd Qu.: 782.7 3rd Qu.:0.10530
## Max. :28.110 Max. :2501.0 Max. :0.16340
# These features are on very different scales, so they must be rescaled to a common range.
# Min-max normalization: rescales the values of x linearly onto [0, 1].
#
# Arguments:
#   x     - numeric vector to rescale.
#   na.rm - if TRUE, missing values are ignored when finding the minimum and
#           maximum and stay NA in the result. The default (FALSE) keeps the
#           earlier behaviour: any NA in x makes the whole result NA.
#
# Returns a numeric vector of the same length as x. A vector whose values are
# all equal has no spread to rescale, so it is returned as zeros instead of
# the NaN that dividing 0 by 0 would produce.
normalize <- function(x, na.rm = FALSE) {
  # Nothing to rescale; also avoids the warnings range() gives on empty input.
  if (length(x) == 0L) {
    return(numeric(0))
  }
  limits <- range(x, na.rm = na.rm)
  span <- limits[2] - limits[1]
  # Constant input: every value sits at the minimum, so map it to 0.
  if (!is.na(span) && span == 0) {
    return(x - limits[1])
  }
  (x - limits[1]) / span
}
# Sanity check: inputs on different scales map to the same [0, 1] values.
normalize(1:5)
## [1] 0.00 0.25 0.50 0.75 1.00
normalize(seq(10, 50, by = 10))
## [1] 0.00 0.25 0.50 0.75 1.00
# Rescale every feature column of wd1 (columns 2 to 31; column 1 is the
# diagnosis label) to [0, 1] and collect the results in a plain data frame.
wd_n <- wd1[2:31] |>
  lapply(normalize) |>
  as.data.frame()
# Check the result: each feature should now run from 0 to 1.
summary(wd_n[, c("radius_mean", "area_mean", "smoothness_mean")])
## radius_mean area_mean smoothness_mean
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2233 1st Qu.:0.1174 1st Qu.:0.3046
## Median :0.3024 Median :0.1729 Median :0.3904
## Mean :0.3382 Mean :0.2169 Mean :0.3948
## 3rd Qu.:0.4164 3rd Qu.:0.2711 3rd Qu.:0.4755
## Max. :1.0000 Max. :1.0000 Max. :1.0000
# Recode diagnosis as a factor: it is stored as integer codes with a matching
# set of character labels ("Benign", "Malignant") used when it is displayed.
wd1[["diagnosis"]] <- factor(
  wd1[["diagnosis"]],
  levels = c("B", "M"),
  labels = c("Benign", "Malignant")
)
str(wd1[["diagnosis"]])
## Factor w/ 2 levels "Benign","Malignant": 2 2 2 2 2 2 2 2 2 2 ...
# Split the normalized data so the classifier can be evaluated on rows it has
# not seen:
#   - the training set is what the kNN model is built from;
#   - the test set shows how well the model performs.
# The first 469 rows form the training set.
wd1_train <- wd_n[seq_len(469), ]
head(wd1_train, n = 6L)
## radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## 1 0.5210374 0.0226581 0.5459885 0.3637328 0.5937528
## 2 0.6431445 0.2725736 0.6157833 0.5015907 0.2898799
## 3 0.6014956 0.3902604 0.5957432 0.4494168 0.5143089
## 4 0.2100904 0.3608387 0.2335015 0.1029056 0.8113208
## 5 0.6298926 0.1565776 0.6309861 0.4892895 0.4303512
## 6 0.2588386 0.2025702 0.2679842 0.1415058 0.6786133
## compactness_mean concavity_mean concave.points_mean symmetry_mean
## 1 0.7920373 0.7031396 0.7311133 0.6863636
## 2 0.1817680 0.2036082 0.3487575 0.3797980
## 3 0.4310165 0.4625117 0.6356859 0.5095960
## 4 0.8113613 0.5656045 0.5228628 0.7762626
## 5 0.3478928 0.4639175 0.5183897 0.3782828
## 6 0.4619962 0.3697282 0.4020378 0.5186869
## fractal_dimension_mean radius_se texture_se perimeter_se area_se
## 1 0.6055181 0.35614702 0.12046941 0.36903360 0.27381126
## 2 0.1413227 0.15643672 0.08258929 0.12444047 0.12565979
## 3 0.2112468 0.22962158 0.09430251 0.18037035 0.16292179
## 4 1.0000000 0.13909107 0.17587518 0.12665504 0.03815479
## 5 0.1868155 0.23382220 0.09306489 0.22056260 0.16368757
## 6 0.5511794 0.08075321 0.11713225 0.06879329 0.03808008
## smoothness_se compactness_se concavity_se concave.points_se symmetry_se
## 1 0.1592956 0.35139844 0.13568182 0.3006251 0.31164518
## 2 0.1193867 0.08132304 0.04696970 0.2538360 0.08453875
## 3 0.1508312 0.28395470 0.09676768 0.3898466 0.20569032
## 4 0.2514532 0.54321507 0.14295455 0.3536655 0.72814769
## 5 0.3323588 0.16791841 0.14363636 0.3570752 0.13617943
## 6 0.1970629 0.23431069 0.09272727 0.2153817 0.19372995
## fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
## 1 0.1830424 0.6207755 0.1415245 0.6683102 0.45069799
## 2 0.0911101 0.6069015 0.3035714 0.5398177 0.43521431
## 3 0.1270055 0.5563856 0.3600746 0.5084417 0.37450845
## 4 0.2872048 0.2483102 0.3859275 0.2413467 0.09400806
## 5 0.1457996 0.5197439 0.1239339 0.5069476 0.34157491
## 6 0.1446596 0.2682319 0.3126333 0.2639076 0.13674794
## smoothness_worst compactness_worst concavity_worst concave.points_worst
## 1 0.6011358 0.6192916 0.5686102 0.9120275
## 2 0.3475533 0.1545634 0.1929712 0.6391753
## 3 0.4835898 0.3853751 0.3597444 0.8350515
## 4 0.9154725 0.8140117 0.5486422 0.8848797
## 5 0.4373638 0.1724151 0.3194888 0.5584192
## 6 0.7127386 0.4827837 0.4277157 0.5982818
## symmetry_worst fractal_dimension_worst
## 1 0.5984624 0.4188640
## 2 0.2335896 0.2228781
## 3 0.4037059 0.2134330
## 4 1.0000000 0.7737111
## 5 0.1575005 0.1425948
## 6 0.4770353 0.4549390
# Rows 470 to 569 of the normalized data form the test set.
wd1_test <- wd_n[seq.int(470, 569), ]
head(wd1_test, n = 6L)
## radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## 470 0.2195561 0.2864390 0.2252090 0.11253446 0.5856279
## 471 0.1271239 0.2969226 0.1223136 0.06176034 0.3324907
## 472 0.2394340 0.6232668 0.2284569 0.12996819 0.3149770
## 473 0.3757395 0.1765303 0.3639002 0.23049841 0.2559357
## 474 0.2503195 0.6851539 0.2323958 0.13654295 0.2199151
## 475 0.1845331 0.1998647 0.1839541 0.09136797 0.4339623
## compactness_mean concavity_mean concave.points_mean symmetry_mean
## 470 0.39543586 0.23898782 0.27654076 0.4530303
## 471 0.13250721 0.06907216 0.07524851 0.5949495
## 472 0.12459358 0.05545923 0.11814115 0.4010101
## 473 0.20277897 0.12977976 0.16008946 0.3166667
## 474 0.04478253 0.00000000 0.00000000 0.3237374
## 475 0.26844979 0.11984536 0.07808151 0.4045455
## fractal_dimension_mean radius_se texture_se perimeter_se area_se
## 470 0.4757793 0.10812964 0.30494165 0.10695943 0.039312810
## 471 0.2984414 0.09636067 0.21875000 0.08537907 0.029749831
## 472 0.1478517 0.17910556 0.50450849 0.15747067 0.071270345
## 473 0.1417439 0.04819844 0.01617751 0.05036988 0.030833137
## 474 0.2030329 0.12094876 0.72639675 0.10022146 0.052910171
## 475 0.3877422 0.01328988 0.03929455 0.02563257 0.005220415
## smoothness_se compactness_se concavity_se concave.points_se symmetry_se
## 470 0.43774008 0.22387118 0.12527778 0.3487403 0.14335566
## 471 0.19675698 0.13246913 0.06853535 0.1872135 0.16488434
## 472 0.19730088 0.09987382 0.03699495 0.2240955 0.17712613
## 473 0.05296257 0.11602127 0.05833333 0.1591021 0.05062757
## 474 0.19124996 0.04499504 0.00000000 0.0000000 0.33106321
## 475 0.09385729 0.21538438 0.06962121 0.1267475 0.05963303
## fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
## 470 0.14932355 0.1931697 0.3566098 0.1879078 0.08427546
## 471 0.10427290 0.1141942 0.3624733 0.1019473 0.04915454
## 472 0.10323646 0.2017076 0.5679638 0.1834255 0.09398348
## 473 0.05127620 0.3290644 0.1652452 0.3067384 0.17730043
## 474 0.07742907 0.1963714 0.6937633 0.1726680 0.09184526
## 475 0.13049487 0.1426539 0.1953625 0.1512526 0.06092705
## smoothness_worst compactness_worst concavity_worst concave.points_worst
## 470 0.7054745 0.25274811 0.25447284 0.4865979
## 471 0.3449118 0.12312872 0.10199681 0.2254296
## 472 0.2174602 0.06788524 0.04412141 0.1906186
## 473 0.2333091 0.24430732 0.25167732 0.3941581
## 474 0.1522155 0.02409989 0.00000000 0.0000000
## 475 0.4096282 0.35170902 0.26876997 0.2737457
## symmetry_worst fractal_dimension_worst
## 470 0.2158486 0.24701561
## 471 0.3171693 0.19808474
## 472 0.1653854 0.07444576
## 473 0.2213680 0.18162141
## 474 0.1663710 0.08126722
## 475 0.2002760 0.34736980
# The normalized train/test sets hold only the features, so the diagnosis
# labels needed by kNN are taken separately from wd1, using the same row
# ranges as the split. They are kept apart from the feature sets. Because wd1
# is a tibble, each result is a one-column tibble rather than a bare vector;
# the labels are reached later through `$diagnosis`.
wd1_train_labels <- wd1[seq_len(469), "diagnosis"]
wd1_test_labels <- wd1[seq.int(470, 569), "diagnosis"]
head(wd1_test_labels, n = 6L)
## # A tibble: 6 × 1
## diagnosis
## <fct>
## 1 Benign
## 2 Benign
## 3 Benign
## 4 Benign
## 5 Benign
## 6 Benign
# Rule of thumb: choose k close to the square root of the number of training
# rows. The count is read from the training set itself so this stays correct
# if the split is ever changed.
sqrt(nrow(wd1_train))
## [1] 21.65641
library(class)
## Warning: package 'class' was built under R version 4.3.3
# Train and classify in one step with knn() from the class package: each test
# row is given a label based on its k nearest rows in the training set.
# k = 21 is the rule-of-thumb value sqrt(469) (about 21.66) rounded down.
wd1_test_pred <- knn(
  train = wd1_train,
  test = wd1_test,
  cl = wd1_train_labels[["diagnosis"]],
  k = 21
)
wd1_test_pred
## [1] Benign Benign Benign Benign Benign Benign Benign
## [8] Benign Benign Benign Malignant Benign Benign Benign
## [15] Benign Benign Benign Benign Malignant Benign Benign
## [22] Benign Benign Malignant Benign Benign Benign Benign
## [29] Benign Malignant Malignant Benign Malignant Benign Malignant
## [36] Benign Benign Benign Benign Benign Malignant Benign
## [43] Benign Malignant Benign Benign Benign Malignant Malignant
## [50] Benign Benign Benign Malignant Benign Benign Benign
## [57] Benign Benign Benign Benign Benign Benign Benign
## [64] Benign Malignant Benign Malignant Malignant Benign Benign
## [71] Benign Benign Benign Benign Benign Benign Benign
## [78] Benign Benign Benign Benign Benign Benign Benign
## [85] Benign Benign Benign Benign Benign Benign Benign
## [92] Benign Benign Malignant Malignant Malignant Malignant Malignant
## [99] Malignant Benign
## Levels: Benign Malignant
# Evaluate the model: how well do the predicted classes match the actual ones?
# Create a cross-tabulation of actual vs. predicted labels.
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.3.3
# Cross-tabulate the actual test labels (x, rows) against the kNN predictions
# (y, columns). prop.chisq=FALSE leaves the per-cell chi-square contribution
# out of the printed table. The row and column headings in the output are
# taken from the argument expressions as written here.
CrossTable(x=wd1_test_labels$diagnosis, y=wd1_test_pred, prop.chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | wd1_test_pred
## wd1_test_labels$diagnosis | Benign | Malignant | Row Total |
## --------------------------|-----------|-----------|-----------|
## Benign | 77 | 0 | 77 |
## | 1.000 | 0.000 | 0.770 |
## | 0.975 | 0.000 | |
## | 0.770 | 0.000 | |
## --------------------------|-----------|-----------|-----------|
## Malignant | 2 | 21 | 23 |
## | 0.087 | 0.913 | 0.230 |
## | 0.025 | 1.000 | |
## | 0.020 | 0.210 | |
## --------------------------|-----------|-----------|-----------|
## Column Total | 79 | 21 | 100 |
## | 0.790 | 0.210 | |
## --------------------------|-----------|-----------|-----------|
##
##
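# This table is a confusion matrix. Treating Malignant as the positive class,
# it shows 77 true negatives (TN) and 21 true positives (TP),
# with 0 false positives (FP) and 2 false negatives (FN): 98 of 100 test rows are classified correctly.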