library(readr)
# Read the Wisconsin breast cancer CSV from its local path into a tibble.
wisc_bc_data <- read_csv(file = "C:/Users/dnred/Downloads/wisc_bc_data.csv")
## Rows: 569 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): diagnosis
## dbl (31): id, radius_mean, texture_mean, perimeter_mean, area_mean, smoothne...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#open the loaded data in the spreadsheet-style data viewer for a quick visual check
View(wisc_bc_data)

# Work under a shorter alias, then inspect the structure and column types.
wd <- wisc_bc_data
str(wd)
## spc_tbl_ [569 × 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id                     : num [1:569] 842302 842517 84300903 84348301 84358402 ...
##  $ diagnosis              : chr [1:569] "M" "M" "M" "M" ...
##  $ radius_mean            : num [1:569] 18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num [1:569] 10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num [1:569] 122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num [1:569] 1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num [1:569] 0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num [1:569] 0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num [1:569] 0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave points_mean    : num [1:569] 0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num [1:569] 0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num [1:569] 0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num [1:569] 1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num [1:569] 0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num [1:569] 8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num [1:569] 153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num [1:569] 0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num [1:569] 0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num [1:569] 0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave points_se      : num [1:569] 0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num [1:569] 0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num [1:569] 0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num [1:569] 25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num [1:569] 17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num [1:569] 184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num [1:569] 2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num [1:569] 0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num [1:569] 0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num [1:569] 0.712 0.242 0.45 0.687 0.4 ...
##  $ concave points_worst   : num [1:569] 0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num [1:569] 0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num [1:569] 0.1189 0.089 0.0876 0.173 0.0768 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_double(),
##   ..   diagnosis = col_character(),
##   ..   radius_mean = col_double(),
##   ..   texture_mean = col_double(),
##   ..   perimeter_mean = col_double(),
##   ..   area_mean = col_double(),
##   ..   smoothness_mean = col_double(),
##   ..   compactness_mean = col_double(),
##   ..   concavity_mean = col_double(),
##   ..   `concave points_mean` = col_double(),
##   ..   symmetry_mean = col_double(),
##   ..   fractal_dimension_mean = col_double(),
##   ..   radius_se = col_double(),
##   ..   texture_se = col_double(),
##   ..   perimeter_se = col_double(),
##   ..   area_se = col_double(),
##   ..   smoothness_se = col_double(),
##   ..   compactness_se = col_double(),
##   ..   concavity_se = col_double(),
##   ..   `concave points_se` = col_double(),
##   ..   symmetry_se = col_double(),
##   ..   fractal_dimension_se = col_double(),
##   ..   radius_worst = col_double(),
##   ..   texture_worst = col_double(),
##   ..   perimeter_worst = col_double(),
##   ..   area_worst = col_double(),
##   ..   smoothness_worst = col_double(),
##   ..   compactness_worst = col_double(),
##   ..   concavity_worst = col_double(),
##   ..   `concave points_worst` = col_double(),
##   ..   symmetry_worst = col_double(),
##   ..   fractal_dimension_worst = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Work on a copy so the original data is not overwritten; the copy drops
# the first column (id).
wd1 <- wd[-1]
View(wd1)

# Print every value of the diagnosis feature.
wd1[["diagnosis"]]
##   [1] "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M"
##  [19] "M" "B" "B" "B" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M"
##  [37] "M" "B" "M" "M" "M" "M" "M" "M" "M" "M" "B" "M" "B" "B" "B" "B" "B" "M"
##  [55] "M" "B" "M" "M" "B" "B" "B" "B" "M" "B" "M" "M" "B" "B" "B" "B" "M" "B"
##  [73] "M" "M" "B" "M" "B" "M" "M" "B" "B" "B" "M" "M" "B" "M" "M" "M" "B" "B"
##  [91] "B" "M" "B" "B" "M" "M" "B" "B" "B" "M" "M" "B" "B" "B" "B" "M" "B" "B"
## [109] "M" "B" "B" "B" "B" "B" "B" "B" "B" "M" "M" "M" "B" "M" "M" "B" "B" "B"
## [127] "M" "M" "B" "M" "B" "M" "M" "B" "M" "M" "B" "B" "M" "B" "B" "M" "B" "B"
## [145] "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "M"
## [163] "M" "B" "M" "B" "B" "M" "M" "B" "B" "M" "M" "B" "B" "B" "B" "M" "B" "B"
## [181] "M" "M" "M" "B" "M" "B" "M" "B" "B" "B" "M" "B" "B" "M" "M" "B" "M" "M"
## [199] "M" "M" "B" "M" "M" "M" "B" "M" "B" "M" "B" "B" "M" "B" "M" "M" "M" "M"
## [217] "B" "B" "M" "M" "B" "B" "B" "M" "B" "B" "B" "B" "B" "M" "M" "B" "B" "M"
## [235] "B" "B" "M" "M" "B" "M" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "M" "B"
## [253] "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "B" "B" "B" "B"
## [271] "B" "B" "M" "B" "M" "B" "B" "M" "B" "B" "M" "B" "M" "M" "B" "B" "B" "B"
## [289] "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "B" "B"
## [307] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "M" "B" "M"
## [325] "B" "B" "B" "B" "M" "M" "M" "B" "B" "B" "B" "M" "B" "M" "B" "M" "B" "B"
## [343] "B" "M" "B" "B" "B" "B" "B" "B" "B" "M" "M" "M" "B" "B" "B" "B" "B" "B"
## [361] "B" "B" "B" "B" "B" "M" "M" "B" "M" "M" "M" "B" "M" "M" "B" "B" "B" "B"
## [379] "B" "M" "B" "B" "B" "B" "B" "M" "B" "B" "B" "M" "B" "B" "M" "M" "B" "B"
## [397] "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B"
## [415] "M" "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B"
## [433] "M" "M" "B" "M" "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "B" "M"
## [451] "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "M" "M" "B" "B" "B" "B" "B" "B"
## [469] "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "B"
## [487] "B" "M" "B" "M" "B" "B" "M" "B" "B" "B" "B" "B" "M" "M" "B" "M" "B" "M"
## [505] "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "M" "M" "B" "B" "B" "M"
## [523] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "M" "M" "B" "B" "B"
## [541] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B"
## [559] "B" "B" "B" "B" "M" "M" "M" "M" "M" "M" "B"
# Count how many observations fall into each diagnosis category.
table(wd1[["diagnosis"]])
## 
##   B   M 
## 357 212
# Express the same category counts as proportions of the whole.
wd1$diagnosis |>
  table() |>
  prop.table()
## 
##         B         M 
## 0.6274165 0.3725835
# Summarise three numeric features to compare their ranges.
summary(
  wd1[c("radius_mean", "area_mean", "smoothness_mean")]
)
##   radius_mean       area_mean      smoothness_mean  
##  Min.   : 6.981   Min.   : 143.5   Min.   :0.05263  
##  1st Qu.:11.700   1st Qu.: 420.3   1st Qu.:0.08637  
##  Median :13.370   Median : 551.1   Median :0.09587  
##  Mean   :14.127   Mean   : 654.9   Mean   :0.09636  
##  3rd Qu.:15.780   3rd Qu.: 782.7   3rd Qu.:0.10530  
##  Max.   :28.110   Max.   :2501.0   Max.   :0.16340
#we observe that these features are on very different scales, so we must rescale them
#create normalization function (used for supervised learning classification problems)
#' Min-max rescale a numeric vector to the range [0, 1].
#'
#' @param x A numeric vector.
#' @param na.rm If TRUE, ignore NA values when finding the minimum and
#'   maximum; NA entries stay NA in the result. Defaults to FALSE, which
#'   keeps the original behaviour (any NA makes the whole result NA).
#' @return A numeric vector the same length as `x`. When every value is
#'   identical the result is all zeros rather than NaN (0 / 0).
normalize <- function(x, na.rm = FALSE) {
  # min()/max() of an empty vector warn and return Inf/-Inf; short-circuit.
  if (length(x) == 0L) {
    return(numeric(0))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # A constant vector has zero span; dividing would yield NaN, which
  # distance-based code downstream cannot handle. Return zeros instead
  # (multiplying by 0 keeps NAs and names and yields doubles).
  if (!is.na(span) && span == 0) {
    return((x - rng[1]) * 0)
  }
  (x - rng[1]) / span
}
# Sanity check: inputs on different scales map to the same 0-1 values.
normalize(1:5)
## [1] 0.00 0.25 0.50 0.75 1.00
normalize(seq(10, 50, by = 10))
## [1] 0.00 0.25 0.50 0.75 1.00
# Apply normalize() to columns 2 through 31 of wd1 (everything after the
# diagnosis column) and collect the results in a data frame.
wd_n <- wd1[2:31] |>
  lapply(normalize) |>
  as.data.frame()

# Re-run the summary on the normalized data to confirm the rescaling.
summary(
  wd_n[c("radius_mean", "area_mean", "smoothness_mean")]
)
##   radius_mean       area_mean      smoothness_mean 
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.2233   1st Qu.:0.1174   1st Qu.:0.3046  
##  Median :0.3024   Median :0.1729   Median :0.3904  
##  Mean   :0.3382   Mean   :0.2169   Mean   :0.3948  
##  3rd Qu.:0.4164   3rd Qu.:0.2711   3rd Qu.:0.4755  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000
# Recode diagnosis as a factor with readable labels. A factor is stored as
# integer codes plus the character labels shown when it is displayed.
wd1$diagnosis <- factor(
  wd1$diagnosis,
  levels = c("B", "M"),
  labels = c("Benign", "Malignant")
)
str(wd1$diagnosis)
##  Factor w/ 2 levels "Benign","Malignant": 2 2 2 2 2 2 2 2 2 2 ...
# Split the data so we can evaluate how good the classification is:
#   - the training set is what we use to build the kNN model
#   - the test set lets us determine how well the model performs
# The first 469 normalized rows form the training set.
wd1_train <- wd_n[seq_len(469), ]
head(wd1_train)
##   radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## 1   0.5210374    0.0226581      0.5459885 0.3637328       0.5937528
## 2   0.6431445    0.2725736      0.6157833 0.5015907       0.2898799
## 3   0.6014956    0.3902604      0.5957432 0.4494168       0.5143089
## 4   0.2100904    0.3608387      0.2335015 0.1029056       0.8113208
## 5   0.6298926    0.1565776      0.6309861 0.4892895       0.4303512
## 6   0.2588386    0.2025702      0.2679842 0.1415058       0.6786133
##   compactness_mean concavity_mean concave.points_mean symmetry_mean
## 1        0.7920373      0.7031396           0.7311133     0.6863636
## 2        0.1817680      0.2036082           0.3487575     0.3797980
## 3        0.4310165      0.4625117           0.6356859     0.5095960
## 4        0.8113613      0.5656045           0.5228628     0.7762626
## 5        0.3478928      0.4639175           0.5183897     0.3782828
## 6        0.4619962      0.3697282           0.4020378     0.5186869
##   fractal_dimension_mean  radius_se texture_se perimeter_se    area_se
## 1              0.6055181 0.35614702 0.12046941   0.36903360 0.27381126
## 2              0.1413227 0.15643672 0.08258929   0.12444047 0.12565979
## 3              0.2112468 0.22962158 0.09430251   0.18037035 0.16292179
## 4              1.0000000 0.13909107 0.17587518   0.12665504 0.03815479
## 5              0.1868155 0.23382220 0.09306489   0.22056260 0.16368757
## 6              0.5511794 0.08075321 0.11713225   0.06879329 0.03808008
##   smoothness_se compactness_se concavity_se concave.points_se symmetry_se
## 1     0.1592956     0.35139844   0.13568182         0.3006251  0.31164518
## 2     0.1193867     0.08132304   0.04696970         0.2538360  0.08453875
## 3     0.1508312     0.28395470   0.09676768         0.3898466  0.20569032
## 4     0.2514532     0.54321507   0.14295455         0.3536655  0.72814769
## 5     0.3323588     0.16791841   0.14363636         0.3570752  0.13617943
## 6     0.1970629     0.23431069   0.09272727         0.2153817  0.19372995
##   fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
## 1            0.1830424    0.6207755     0.1415245       0.6683102 0.45069799
## 2            0.0911101    0.6069015     0.3035714       0.5398177 0.43521431
## 3            0.1270055    0.5563856     0.3600746       0.5084417 0.37450845
## 4            0.2872048    0.2483102     0.3859275       0.2413467 0.09400806
## 5            0.1457996    0.5197439     0.1239339       0.5069476 0.34157491
## 6            0.1446596    0.2682319     0.3126333       0.2639076 0.13674794
##   smoothness_worst compactness_worst concavity_worst concave.points_worst
## 1        0.6011358         0.6192916       0.5686102            0.9120275
## 2        0.3475533         0.1545634       0.1929712            0.6391753
## 3        0.4835898         0.3853751       0.3597444            0.8350515
## 4        0.9154725         0.8140117       0.5486422            0.8848797
## 5        0.4373638         0.1724151       0.3194888            0.5584192
## 6        0.7127386         0.4827837       0.4277157            0.5982818
##   symmetry_worst fractal_dimension_worst
## 1      0.5984624               0.4188640
## 2      0.2335896               0.2228781
## 3      0.4037059               0.2134330
## 4      1.0000000               0.7737111
## 5      0.1575005               0.1425948
## 6      0.4770353               0.4549390
# Rows 470 through 569 of the normalized data form the test set.
wd1_test <- wd_n[seq(470, 569), ]
head(wd1_test)
##     radius_mean texture_mean perimeter_mean  area_mean smoothness_mean
## 470   0.2195561    0.2864390      0.2252090 0.11253446       0.5856279
## 471   0.1271239    0.2969226      0.1223136 0.06176034       0.3324907
## 472   0.2394340    0.6232668      0.2284569 0.12996819       0.3149770
## 473   0.3757395    0.1765303      0.3639002 0.23049841       0.2559357
## 474   0.2503195    0.6851539      0.2323958 0.13654295       0.2199151
## 475   0.1845331    0.1998647      0.1839541 0.09136797       0.4339623
##     compactness_mean concavity_mean concave.points_mean symmetry_mean
## 470       0.39543586     0.23898782          0.27654076     0.4530303
## 471       0.13250721     0.06907216          0.07524851     0.5949495
## 472       0.12459358     0.05545923          0.11814115     0.4010101
## 473       0.20277897     0.12977976          0.16008946     0.3166667
## 474       0.04478253     0.00000000          0.00000000     0.3237374
## 475       0.26844979     0.11984536          0.07808151     0.4045455
##     fractal_dimension_mean  radius_se texture_se perimeter_se     area_se
## 470              0.4757793 0.10812964 0.30494165   0.10695943 0.039312810
## 471              0.2984414 0.09636067 0.21875000   0.08537907 0.029749831
## 472              0.1478517 0.17910556 0.50450849   0.15747067 0.071270345
## 473              0.1417439 0.04819844 0.01617751   0.05036988 0.030833137
## 474              0.2030329 0.12094876 0.72639675   0.10022146 0.052910171
## 475              0.3877422 0.01328988 0.03929455   0.02563257 0.005220415
##     smoothness_se compactness_se concavity_se concave.points_se symmetry_se
## 470    0.43774008     0.22387118   0.12527778         0.3487403  0.14335566
## 471    0.19675698     0.13246913   0.06853535         0.1872135  0.16488434
## 472    0.19730088     0.09987382   0.03699495         0.2240955  0.17712613
## 473    0.05296257     0.11602127   0.05833333         0.1591021  0.05062757
## 474    0.19124996     0.04499504   0.00000000         0.0000000  0.33106321
## 475    0.09385729     0.21538438   0.06962121         0.1267475  0.05963303
##     fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
## 470           0.14932355    0.1931697     0.3566098       0.1879078 0.08427546
## 471           0.10427290    0.1141942     0.3624733       0.1019473 0.04915454
## 472           0.10323646    0.2017076     0.5679638       0.1834255 0.09398348
## 473           0.05127620    0.3290644     0.1652452       0.3067384 0.17730043
## 474           0.07742907    0.1963714     0.6937633       0.1726680 0.09184526
## 475           0.13049487    0.1426539     0.1953625       0.1512526 0.06092705
##     smoothness_worst compactness_worst concavity_worst concave.points_worst
## 470        0.7054745        0.25274811      0.25447284            0.4865979
## 471        0.3449118        0.12312872      0.10199681            0.2254296
## 472        0.2174602        0.06788524      0.04412141            0.1906186
## 473        0.2333091        0.24430732      0.25167732            0.3941581
## 474        0.1522155        0.02409989      0.00000000            0.0000000
## 475        0.4096282        0.35170902      0.26876997            0.2737457
##     symmetry_worst fractal_dimension_worst
## 470      0.2158486              0.24701561
## 471      0.3171693              0.19808474
## 472      0.1653854              0.07444576
## 473      0.2213680              0.18162141
## 474      0.1663710              0.08126722
## 475      0.2002760              0.34736980
# The normalized train/test data sets do not contain the diagnosis label,
# but the kNN model needs it. Take the diagnosis column of wd1 for the same
# row ranges and keep it separate from the feature data. Because wd1 is a
# tibble, each result is a one-column tibble rather than a bare vector.
wd1_train_labels <- wd1[seq_len(469), "diagnosis"]
wd1_test_labels <- wd1[seq(470, 569), "diagnosis"]
head(wd1_test_labels)
## # A tibble: 6 × 1
##   diagnosis
##   <fct>    
## 1 Benign   
## 2 Benign   
## 3 Benign   
## 4 Benign   
## 5 Benign   
## 6 Benign
#rule of thumb: take the square root of the number of training rows (469)
#as a starting value for k, the number of neighbours
sqrt(469)
## [1] 21.65641
library(class)
## Warning: package 'class' was built under R version 4.3.3
# Train and apply the model in one step: knn() from the class package
# classifies each test row using k = 21 neighbours from the training data,
# with the training diagnoses supplied as the class labels.
wd1_test_pred <- knn(
  train = wd1_train,
  test = wd1_test,
  cl = wd1_train_labels[["diagnosis"]],
  k = 21
)
wd1_test_pred
##   [1] Benign    Benign    Benign    Benign    Benign    Benign    Benign   
##   [8] Benign    Benign    Benign    Malignant Benign    Benign    Benign   
##  [15] Benign    Benign    Benign    Benign    Malignant Benign    Benign   
##  [22] Benign    Benign    Malignant Benign    Benign    Benign    Benign   
##  [29] Benign    Malignant Malignant Benign    Malignant Benign    Malignant
##  [36] Benign    Benign    Benign    Benign    Benign    Malignant Benign   
##  [43] Benign    Malignant Benign    Benign    Benign    Malignant Malignant
##  [50] Benign    Benign    Benign    Malignant Benign    Benign    Benign   
##  [57] Benign    Benign    Benign    Benign    Benign    Benign    Benign   
##  [64] Benign    Malignant Benign    Malignant Malignant Benign    Benign   
##  [71] Benign    Benign    Benign    Benign    Benign    Benign    Benign   
##  [78] Benign    Benign    Benign    Benign    Benign    Benign    Benign   
##  [85] Benign    Benign    Benign    Benign    Benign    Benign    Benign   
##  [92] Benign    Benign    Malignant Malignant Malignant Malignant Malignant
##  [99] Malignant Benign   
## Levels: Benign Malignant
#test model; evaluate how well the predicted classes are
#create a cross tabulation of predicted vs. actual
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.3.3
# Cross-tabulate actual labels (rows) against predicted labels (columns).
# The x expression is kept as written because it appears as the row label
# in the printed table; prop.chisq = FALSE leaves chi-square out of cells.
CrossTable(
  x = wd1_test_labels$diagnosis,
  y = wd1_test_pred,
  prop.chisq = FALSE
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                           | wd1_test_pred 
## wd1_test_labels$diagnosis |    Benign | Malignant | Row Total | 
## --------------------------|-----------|-----------|-----------|
##                    Benign |        77 |         0 |        77 | 
##                           |     1.000 |     0.000 |     0.770 | 
##                           |     0.975 |     0.000 |           | 
##                           |     0.770 |     0.000 |           | 
## --------------------------|-----------|-----------|-----------|
##                 Malignant |         2 |        21 |        23 | 
##                           |     0.087 |     0.913 |     0.230 | 
##                           |     0.025 |     1.000 |           | 
##                           |     0.020 |     0.210 |           | 
## --------------------------|-----------|-----------|-----------|
##              Column Total |        79 |        21 |       100 | 
##                           |     0.790 |     0.210 |           | 
## --------------------------|-----------|-----------|-----------|
## 
## 
#this is called a confusion matrix; treating Malignant as the positive class, we have
#77 true negatives (TN) and 21 true positives (TP), with 0 false positives (FP) and 2 false negatives (FN)