The dataset used is the Wisconsin Breast Cancer Dataset.
library(readr)
wisc_bc_data <- read_csv("~/Downloads/CS 583/wisc_bc_data.csv")
## Rows: 569 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): diagnosis
## dbl (31): id, radius_mean, texture_mean, perimeter_mean, area_mean, smoothne...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
wisc_bc_data
## # A tibble: 569 × 32
## id diagnosis radius_mean texture_mean perimeter_mean area_mean
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 842302 M 18.0 10.4 123. 1001
## 2 842517 M 20.6 17.8 133. 1326
## 3 84300903 M 19.7 21.2 130 1203
## 4 84348301 M 11.4 20.4 77.6 386.
## 5 84358402 M 20.3 14.3 135. 1297
## 6 843786 M 12.4 15.7 82.6 477.
## 7 844359 M 18.2 20.0 120. 1040
## 8 84458202 M 13.7 20.8 90.2 578.
## 9 844981 M 13 21.8 87.5 520.
## 10 84501001 M 12.5 24.0 84.0 476.
## # ℹ 559 more rows
## # ℹ 26 more variables: smoothness_mean <dbl>, compactness_mean <dbl>,
## # concavity_mean <dbl>, `concave points_mean` <dbl>, symmetry_mean <dbl>,
## # fractal_dimension_mean <dbl>, radius_se <dbl>, texture_se <dbl>,
## # perimeter_se <dbl>, area_se <dbl>, smoothness_se <dbl>,
## # compactness_se <dbl>, concavity_se <dbl>, `concave points_se` <dbl>,
## # symmetry_se <dbl>, fractal_dimension_se <dbl>, radius_worst <dbl>, …
The imported dataset is copied into a new object so that the original data is not overwritten. The id column (the first column) is then dropped, since it is not needed in the analysis.
wd<-wisc_bc_data
str(wd)
## spc_tbl_ [569 × 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : num [1:569] 842302 842517 84300903 84348301 84358402 ...
## $ diagnosis : chr [1:569] "M" "M" "M" "M" ...
## $ radius_mean : num [1:569] 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num [1:569] 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num [1:569] 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num [1:569] 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num [1:569] 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num [1:569] 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num [1:569] 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave points_mean : num [1:569] 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num [1:569] 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num [1:569] 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num [1:569] 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num [1:569] 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num [1:569] 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num [1:569] 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num [1:569] 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num [1:569] 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num [1:569] 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave points_se : num [1:569] 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num [1:569] 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num [1:569] 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num [1:569] 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num [1:569] 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num [1:569] 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num [1:569] 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num [1:569] 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num [1:569] 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num [1:569] 0.712 0.242 0.45 0.687 0.4 ...
## $ concave points_worst : num [1:569] 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num [1:569] 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num [1:569] 0.1189 0.089 0.0876 0.173 0.0768 ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_double(),
## .. diagnosis = col_character(),
## .. radius_mean = col_double(),
## .. texture_mean = col_double(),
## .. perimeter_mean = col_double(),
## .. area_mean = col_double(),
## .. smoothness_mean = col_double(),
## .. compactness_mean = col_double(),
## .. concavity_mean = col_double(),
## .. `concave points_mean` = col_double(),
## .. symmetry_mean = col_double(),
## .. fractal_dimension_mean = col_double(),
## .. radius_se = col_double(),
## .. texture_se = col_double(),
## .. perimeter_se = col_double(),
## .. area_se = col_double(),
## .. smoothness_se = col_double(),
## .. compactness_se = col_double(),
## .. concavity_se = col_double(),
## .. `concave points_se` = col_double(),
## .. symmetry_se = col_double(),
## .. fractal_dimension_se = col_double(),
## .. radius_worst = col_double(),
## .. texture_worst = col_double(),
## .. perimeter_worst = col_double(),
## .. area_worst = col_double(),
## .. smoothness_worst = col_double(),
## .. compactness_worst = col_double(),
## .. concavity_worst = col_double(),
## .. `concave points_worst` = col_double(),
## .. symmetry_worst = col_double(),
## .. fractal_dimension_worst = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
wd1<-wd[-1]
wd1
## # A tibble: 569 × 31
## diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M 18.0 10.4 123. 1001 0.118
## 2 M 20.6 17.8 133. 1326 0.0847
## 3 M 19.7 21.2 130 1203 0.110
## 4 M 11.4 20.4 77.6 386. 0.142
## 5 M 20.3 14.3 135. 1297 0.100
## 6 M 12.4 15.7 82.6 477. 0.128
## 7 M 18.2 20.0 120. 1040 0.0946
## 8 M 13.7 20.8 90.2 578. 0.119
## 9 M 13 21.8 87.5 520. 0.127
## 10 M 12.5 24.0 84.0 476. 0.119
## # ℹ 559 more rows
## # ℹ 25 more variables: compactness_mean <dbl>, concavity_mean <dbl>,
## # `concave points_mean` <dbl>, symmetry_mean <dbl>,
## # fractal_dimension_mean <dbl>, radius_se <dbl>, texture_se <dbl>,
## # perimeter_se <dbl>, area_se <dbl>, smoothness_se <dbl>,
## # compactness_se <dbl>, concavity_se <dbl>, `concave points_se` <dbl>,
## # symmetry_se <dbl>, fractal_dimension_se <dbl>, radius_worst <dbl>, …
The values of the diagnosis variable are viewed, its class distribution is tabulated, and three of the numeric features are summarized.
wd1$diagnosis
## [1] "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M"
## [19] "M" "B" "B" "B" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M"
## [37] "M" "B" "M" "M" "M" "M" "M" "M" "M" "M" "B" "M" "B" "B" "B" "B" "B" "M"
## [55] "M" "B" "M" "M" "B" "B" "B" "B" "M" "B" "M" "M" "B" "B" "B" "B" "M" "B"
## [73] "M" "M" "B" "M" "B" "M" "M" "B" "B" "B" "M" "M" "B" "M" "M" "M" "B" "B"
## [91] "B" "M" "B" "B" "M" "M" "B" "B" "B" "M" "M" "B" "B" "B" "B" "M" "B" "B"
## [109] "M" "B" "B" "B" "B" "B" "B" "B" "B" "M" "M" "M" "B" "M" "M" "B" "B" "B"
## [127] "M" "M" "B" "M" "B" "M" "M" "B" "M" "M" "B" "B" "M" "B" "B" "M" "B" "B"
## [145] "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "M"
## [163] "M" "B" "M" "B" "B" "M" "M" "B" "B" "M" "M" "B" "B" "B" "B" "M" "B" "B"
## [181] "M" "M" "M" "B" "M" "B" "M" "B" "B" "B" "M" "B" "B" "M" "M" "B" "M" "M"
## [199] "M" "M" "B" "M" "M" "M" "B" "M" "B" "M" "B" "B" "M" "B" "M" "M" "M" "M"
## [217] "B" "B" "M" "M" "B" "B" "B" "M" "B" "B" "B" "B" "B" "M" "M" "B" "B" "M"
## [235] "B" "B" "M" "M" "B" "M" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "M" "B"
## [253] "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "M" "B" "B" "B" "B"
## [271] "B" "B" "M" "B" "M" "B" "B" "M" "B" "B" "M" "B" "M" "M" "B" "B" "B" "B"
## [289] "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "B" "B"
## [307] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "M" "B" "M"
## [325] "B" "B" "B" "B" "M" "M" "M" "B" "B" "B" "B" "M" "B" "M" "B" "M" "B" "B"
## [343] "B" "M" "B" "B" "B" "B" "B" "B" "B" "M" "M" "M" "B" "B" "B" "B" "B" "B"
## [361] "B" "B" "B" "B" "B" "M" "M" "B" "M" "M" "M" "B" "M" "M" "B" "B" "B" "B"
## [379] "B" "M" "B" "B" "B" "B" "B" "M" "B" "B" "B" "M" "B" "B" "M" "M" "B" "B"
## [397] "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B"
## [415] "M" "B" "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B"
## [433] "M" "M" "B" "M" "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "B" "M"
## [451] "B" "M" "B" "B" "B" "B" "B" "B" "B" "B" "M" "M" "B" "B" "B" "B" "B" "B"
## [469] "M" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "B" "B" "B" "B" "B"
## [487] "B" "M" "B" "M" "B" "B" "M" "B" "B" "B" "B" "B" "M" "M" "B" "M" "B" "M"
## [505] "B" "B" "B" "B" "B" "M" "B" "B" "M" "B" "M" "B" "M" "M" "B" "B" "B" "M"
## [523] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "M" "B" "M" "M" "B" "B" "B"
## [541] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B"
## [559] "B" "B" "B" "B" "M" "M" "M" "M" "M" "M" "B"
table(wd1$diagnosis)
##
## B M
## 357 212
prop.table(table(wd1$diagnosis))
##
## B M
## 0.6274165 0.3725835
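For readability, these class proportions can also be shown as rounded percentages (an optional extra step, not part of the original analysis):
round(prop.table(table(wd1$diagnosis)) * 100, digits = 1)
##
##    B    M
## 62.7 37.3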
summary(wd1[c("radius_mean", "area_mean", "smoothness_mean")])
## radius_mean area_mean smoothness_mean
## Min. : 6.981 Min. : 143.5 Min. :0.05263
## 1st Qu.:11.700 1st Qu.: 420.3 1st Qu.:0.08637
## Median :13.370 Median : 551.1 Median :0.09587
## Mean :14.127 Mean : 654.9 Mean :0.09636
## 3rd Qu.:15.780 3rd Qu.: 782.7 3rd Qu.:0.10530
## Max. :28.110 Max. :2501.0 Max. :0.16340
The summary output shows that the features are measured on very different scales (area_mean runs into the thousands while smoothness_mean never exceeds 0.17), which would let the large-scale features dominate the distance calculation in kNN. To fix this, a min-max normalization function is created to rescale the data.
normalize<-function(x){
return((x-min(x))/(max(x)-min(x)))
}
#example of how the function works
normalize(c(1,2,3,4,5))
## [1] 0.00 0.25 0.50 0.75 1.00
normalize(c(10,20,30,40,50))
## [1] 0.00 0.25 0.50 0.75 1.00
One more example shows that the function also maps a highly skewed vector onto the [0, 1] range:
normalize(c(1, 10, 100, 1000))
## [1] 0.000000000 0.009009009 0.099099099 1.000000000
All 30 numeric features are then normalized using lapply(), and as.data.frame() is used to convert the result back into a data frame, because lapply() returns a list.
wd_n<-as.data.frame(lapply(wd1[2:31], normalize))
summary(wd_n[c("radius_mean", "area_mean", "smoothness_mean")])
## radius_mean area_mean smoothness_mean
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2233 1st Qu.:0.1174 1st Qu.:0.3046
## Median :0.3024 Median :0.1729 Median :0.3904
## Mean :0.3382 Mean :0.2169 Mean :0.3948
## 3rd Qu.:0.4164 3rd Qu.:0.2711 3rd Qu.:0.4755
## Max. :1.0000 Max. :1.0000 Max. :1.0000
The next step is to recode the target variable, diagnosis, as a factor. Notice that the wd1 dataset is used here instead of the normalized dataset wd_n. This is because wd_n does not include the diagnosis column at all: it contains only the normalized data from columns 2 through 31, while wd1 still has all of its columns intact.
wd1$diagnosis<-factor(wd1$diagnosis, levels = c("B", "M"),labels=c("Benign", "Malignant"))
str(wd1$diagnosis)
## Factor w/ 2 levels "Benign","Malignant": 2 2 2 2 2 2 2 2 2 2 ...
The dataset is split into training and test sets, and matching sets of class labels are created. The training and test sets are taken from the normalized wd_n data, while the labels are taken from wd1, which is the only data frame that still contains the diagnosis column. Note that subsetting a tibble with [ returns a one-column tibble rather than a vector, which is why the labels are later extracted with $diagnosis before being passed to knn().
wd1_train<-wd_n[1:469, ]
wd1_test<-wd_n[470:569, ]
wd1_train_labels<-wd1[1:469, 1]
wd1_test_labels<-wd1[470:569, 1]
head(wd1_test_labels)
## # A tibble: 6 × 1
## diagnosis
## <fct>
## 1 Benign
## 2 Benign
## 3 Benign
## 4 Benign
## 5 Benign
## 6 Benign
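This fixed 469/100 split assumes the rows are already in random order with respect to diagnosis; if that were not certain, a randomized split would be safer. A minimal sketch of such a split (the rand_* names are illustrative and the seed is arbitrary):
set.seed(123)                         # arbitrary seed, for reproducibility
train_idx <- sample(nrow(wd_n), 469)  # draw 469 row indices at random
rand_train <- wd_n[train_idx, ]
rand_test  <- wd_n[-train_idx, ]
rand_train_labels <- wd1[train_idx, 1]
rand_test_labels  <- wd1[-train_idx, 1]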
After the dataset has been split, we move on to training a model on the data, which requires choosing the value of k for the kNN algorithm. As a rule of thumb, k is taken to be roughly the square root of the number of training examples. Alternatively, several values of k can be tested on a variety of test datasets in order to choose the one that delivers the best classification performance.
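Applying this rule of thumb here, the square root of the training-set size is about 21.7, and the odd value k = 21 is used so that a two-class vote can never tie:
# rule of thumb: k is roughly the square root of the number of training rows
sqrt(nrow(wd1_train))
## [1] 21.65641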
library(class)
wd1_test_pred<-knn(train=wd1_train, test=wd1_test, cl=wd1_train_labels$diagnosis, k=21)
wd1_test_pred
## [1] Benign Benign Benign Benign Benign Benign Benign
## [8] Benign Benign Benign Malignant Benign Benign Benign
## [15] Benign Benign Benign Benign Malignant Benign Benign
## [22] Benign Benign Malignant Benign Benign Benign Benign
## [29] Benign Malignant Malignant Benign Malignant Benign Malignant
## [36] Benign Benign Benign Benign Benign Malignant Benign
## [43] Benign Malignant Benign Benign Benign Malignant Malignant
## [50] Benign Benign Benign Malignant Benign Benign Benign
## [57] Benign Benign Benign Benign Benign Benign Benign
## [64] Benign Malignant Benign Malignant Malignant Benign Benign
## [71] Benign Benign Benign Benign Benign Benign Benign
## [78] Benign Benign Benign Benign Benign Benign Benign
## [85] Benign Benign Benign Benign Benign Benign Benign
## [92] Benign Benign Malignant Malignant Malignant Malignant Malignant
## [99] Malignant Benign
## Levels: Benign Malignant
library(gmodels)
CrossTable(x = wd1_test_labels$diagnosis, y=wd1_test_pred, prop.chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | wd1_test_pred
## wd1_test_labels$diagnosis | Benign | Malignant | Row Total |
## --------------------------|-----------|-----------|-----------|
## Benign | 77 | 0 | 77 |
## | 1.000 | 0.000 | 0.770 |
## | 0.975 | 0.000 | |
## | 0.770 | 0.000 | |
## --------------------------|-----------|-----------|-----------|
## Malignant | 2 | 21 | 23 |
## | 0.087 | 0.913 | 0.230 |
## | 0.025 | 1.000 | |
## | 0.020 | 0.210 | |
## --------------------------|-----------|-----------|-----------|
## Column Total | 79 | 21 | 100 |
## | 0.790 | 0.210 | |
## --------------------------|-----------|-----------|-----------|
##
##
Based on this evaluation, the model classified 77 benign cases as Benign (true negatives) and 0 benign cases as Malignant (false positives), and it classified 2 malignant cases as Benign (false negatives) and 21 malignant cases as Malignant (true positives). The cross-tabulation produced by this evaluation is a confusion matrix.
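From these four counts, the usual summary metrics follow directly (computed from the table above):
# accuracy: proportion of all test cases classified correctly
(77 + 21) / 100
## [1] 0.98
# sensitivity (recall for the Malignant class): TP / (TP + FN)
21 / (21 + 2)
## [1] 0.9130435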
Note: a false negative is the most dangerous kind of error in this setting, because it means the model tells a patient who has cancer that they do not, which delays treatment and could lead to serious complications.