library(MASS)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(lattice)
library(ggplot2)
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-5
library(readr)
library(ROCR)
library(readxl)
library(e1071)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
cancer_data = read.csv("CancerData.csv")
head(cancer_data)
## id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1 842302 M 17.99 10.38 122.80 1001.0
## 2 842517 M 20.57 17.77 132.90 1326.0
## 3 84300903 M 19.69 21.25 130.00 1203.0
## 4 84348301 M 11.42 20.38 77.58 386.1
## 5 84358402 M 20.29 14.34 135.10 1297.0
## 6 843786 M 12.45 15.70 82.57 477.1
## smoothness_mean compactness_mean concavity_mean concave.points_mean
## 1 0.11840 0.27760 0.3001 0.14710
## 2 0.08474 0.07864 0.0869 0.07017
## 3 0.10960 0.15990 0.1974 0.12790
## 4 0.14250 0.28390 0.2414 0.10520
## 5 0.10030 0.13280 0.1980 0.10430
## 6 0.12780 0.17000 0.1578 0.08089
## symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1 0.2419 0.07871 1.0950 0.9053 8.589
## 2 0.1812 0.05667 0.5435 0.7339 3.398
## 3 0.2069 0.05999 0.7456 0.7869 4.585
## 4 0.2597 0.09744 0.4956 1.1560 3.445
## 5 0.1809 0.05883 0.7572 0.7813 5.438
## 6 0.2087 0.07613 0.3345 0.8902 2.217
## area_se smoothness_se compactness_se concavity_se concave.points_se
## 1 153.40 0.006399 0.04904 0.05373 0.01587
## 2 74.08 0.005225 0.01308 0.01860 0.01340
## 3 94.03 0.006150 0.04006 0.03832 0.02058
## 4 27.23 0.009110 0.07458 0.05661 0.01867
## 5 94.44 0.011490 0.02461 0.05688 0.01885
## 6 27.19 0.007510 0.03345 0.03672 0.01137
## symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst
## 1 0.03003 0.006193 25.38 17.33 184.60
## 2 0.01389 0.003532 24.99 23.41 158.80
## 3 0.02250 0.004571 23.57 25.53 152.50
## 4 0.05963 0.009208 14.91 26.50 98.87
## 5 0.01756 0.005115 22.54 16.67 152.20
## 6 0.02165 0.005082 15.47 23.75 103.40
## area_worst smoothness_worst compactness_worst concavity_worst
## 1 2019.0 0.1622 0.6656 0.7119
## 2 1956.0 0.1238 0.1866 0.2416
## 3 1709.0 0.1444 0.4245 0.4504
## 4 567.7 0.2098 0.8663 0.6869
## 5 1575.0 0.1374 0.2050 0.4000
## 6 741.6 0.1791 0.5249 0.5355
## concave.points_worst symmetry_worst fractal_dimension_worst
## 1 0.2654 0.4601 0.11890
## 2 0.1860 0.2750 0.08902
## 3 0.2430 0.3613 0.08758
## 4 0.2575 0.6638 0.17300
## 5 0.1625 0.2364 0.07678
## 6 0.1741 0.3985 0.12440
diagnosis, if the cancer
is M/malignent or B/benignstr(cancer_data)
## 'data.frame': 569 obs. of 32 variables:
## $ id : int 842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
## $ diagnosis : chr "M" "M" "M" "M" ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
summary(cancer_data)
## id diagnosis radius_mean texture_mean
## Min. : 8670 Length:569 Min. : 6.981 Min. : 9.71
## 1st Qu.: 869218 Class :character 1st Qu.:11.700 1st Qu.:16.17
## Median : 906024 Mode :character Median :13.370 Median :18.84
## Mean : 30371831 Mean :14.127 Mean :19.29
## 3rd Qu.: 8813129 3rd Qu.:15.780 3rd Qu.:21.80
## Max. :911320502 Max. :28.110 Max. :39.28
## perimeter_mean area_mean smoothness_mean compactness_mean
## Min. : 43.79 Min. : 143.5 Min. :0.05263 Min. :0.01938
## 1st Qu.: 75.17 1st Qu.: 420.3 1st Qu.:0.08637 1st Qu.:0.06492
## Median : 86.24 Median : 551.1 Median :0.09587 Median :0.09263
## Mean : 91.97 Mean : 654.9 Mean :0.09636 Mean :0.10434
## 3rd Qu.:104.10 3rd Qu.: 782.7 3rd Qu.:0.10530 3rd Qu.:0.13040
## Max. :188.50 Max. :2501.0 Max. :0.16340 Max. :0.34540
## concavity_mean concave.points_mean symmetry_mean fractal_dimension_mean
## Min. :0.00000 Min. :0.00000 Min. :0.1060 Min. :0.04996
## 1st Qu.:0.02956 1st Qu.:0.02031 1st Qu.:0.1619 1st Qu.:0.05770
## Median :0.06154 Median :0.03350 Median :0.1792 Median :0.06154
## Mean :0.08880 Mean :0.04892 Mean :0.1812 Mean :0.06280
## 3rd Qu.:0.13070 3rd Qu.:0.07400 3rd Qu.:0.1957 3rd Qu.:0.06612
## Max. :0.42680 Max. :0.20120 Max. :0.3040 Max. :0.09744
## radius_se texture_se perimeter_se area_se
## Min. :0.1115 Min. :0.3602 Min. : 0.757 Min. : 6.802
## 1st Qu.:0.2324 1st Qu.:0.8339 1st Qu.: 1.606 1st Qu.: 17.850
## Median :0.3242 Median :1.1080 Median : 2.287 Median : 24.530
## Mean :0.4052 Mean :1.2169 Mean : 2.866 Mean : 40.337
## 3rd Qu.:0.4789 3rd Qu.:1.4740 3rd Qu.: 3.357 3rd Qu.: 45.190
## Max. :2.8730 Max. :4.8850 Max. :21.980 Max. :542.200
## smoothness_se compactness_se concavity_se concave.points_se
## Min. :0.001713 Min. :0.002252 Min. :0.00000 Min. :0.000000
## 1st Qu.:0.005169 1st Qu.:0.013080 1st Qu.:0.01509 1st Qu.:0.007638
## Median :0.006380 Median :0.020450 Median :0.02589 Median :0.010930
## Mean :0.007041 Mean :0.025478 Mean :0.03189 Mean :0.011796
## 3rd Qu.:0.008146 3rd Qu.:0.032450 3rd Qu.:0.04205 3rd Qu.:0.014710
## Max. :0.031130 Max. :0.135400 Max. :0.39600 Max. :0.052790
## symmetry_se fractal_dimension_se radius_worst texture_worst
## Min. :0.007882 Min. :0.0008948 Min. : 7.93 Min. :12.02
## 1st Qu.:0.015160 1st Qu.:0.0022480 1st Qu.:13.01 1st Qu.:21.08
## Median :0.018730 Median :0.0031870 Median :14.97 Median :25.41
## Mean :0.020542 Mean :0.0037949 Mean :16.27 Mean :25.68
## 3rd Qu.:0.023480 3rd Qu.:0.0045580 3rd Qu.:18.79 3rd Qu.:29.72
## Max. :0.078950 Max. :0.0298400 Max. :36.04 Max. :49.54
## perimeter_worst area_worst smoothness_worst compactness_worst
## Min. : 50.41 Min. : 185.2 Min. :0.07117 Min. :0.02729
## 1st Qu.: 84.11 1st Qu.: 515.3 1st Qu.:0.11660 1st Qu.:0.14720
## Median : 97.66 Median : 686.5 Median :0.13130 Median :0.21190
## Mean :107.26 Mean : 880.6 Mean :0.13237 Mean :0.25427
## 3rd Qu.:125.40 3rd Qu.:1084.0 3rd Qu.:0.14600 3rd Qu.:0.33910
## Max. :251.20 Max. :4254.0 Max. :0.22260 Max. :1.05800
## concavity_worst concave.points_worst symmetry_worst fractal_dimension_worst
## Min. :0.0000 Min. :0.00000 Min. :0.1565 Min. :0.05504
## 1st Qu.:0.1145 1st Qu.:0.06493 1st Qu.:0.2504 1st Qu.:0.07146
## Median :0.2267 Median :0.09993 Median :0.2822 Median :0.08004
## Mean :0.2722 Mean :0.11461 Mean :0.2901 Mean :0.08395
## 3rd Qu.:0.3829 3rd Qu.:0.16140 3rd Qu.:0.3179 3rd Qu.:0.09208
## Max. :1.2520 Max. :0.29100 Max. :0.6638 Max. :0.20750
colSums(is.na(cancer_data)) #the code book says there are no missing, but double checking
## id diagnosis radius_mean
## 0 0 0
## texture_mean perimeter_mean area_mean
## 0 0 0
## smoothness_mean compactness_mean concavity_mean
## 0 0 0
## concave.points_mean symmetry_mean fractal_dimension_mean
## 0 0 0
## radius_se texture_se perimeter_se
## 0 0 0
## area_se smoothness_se compactness_se
## 0 0 0
## concavity_se concave.points_se symmetry_se
## 0 0 0
## fractal_dimension_se radius_worst texture_worst
## 0 0 0
## perimeter_worst area_worst smoothness_worst
## 0 0 0
## compactness_worst concavity_worst concave.points_worst
## 0 0 0
## symmetry_worst fractal_dimension_worst
## 0 0
diagnosis
levelsggplot(cancer_data, aes(x = diagnosis, fill = diagnosis)) +
geom_bar() +
scale_fill_manual(values = c("B" = "blue", "M" = "red")) + # Swaps default colors
labs(title = "Distribution of Diagnosis", x = "Diagnosis", y = "Frequency") +
theme_minimal()
table(cancer_data$diagnosis)
##
## B M
## 357 212
cancer_data_copy = cancer_data
cancer_data_copy <- dplyr::select(cancer_data_copy, -id, -diagnosis)
corr_matrix <- cor(cancer_data_copy)
print(corr_matrix)
## radius_mean texture_mean perimeter_mean area_mean
## radius_mean 1.000000000 0.323781891 0.997855281 0.987357170
## texture_mean 0.323781891 1.000000000 0.329533059 0.321085696
## perimeter_mean 0.997855281 0.329533059 1.000000000 0.986506804
## area_mean 0.987357170 0.321085696 0.986506804 1.000000000
## smoothness_mean 0.170581187 -0.023388516 0.207278164 0.177028377
## compactness_mean 0.506123578 0.236702222 0.556936211 0.498501682
## concavity_mean 0.676763550 0.302417828 0.716135650 0.685982829
## concave.points_mean 0.822528522 0.293464051 0.850977041 0.823268869
## symmetry_mean 0.147741242 0.071400980 0.183027212 0.151293079
## fractal_dimension_mean -0.311630826 -0.076437183 -0.261476908 -0.283109812
## radius_se 0.679090388 0.275868676 0.691765014 0.732562227
## texture_se -0.097317443 0.386357623 -0.086761078 -0.066280214
## perimeter_se 0.674171616 0.281673115 0.693134890 0.726628328
## area_se 0.735863663 0.259844987 0.744982694 0.800085921
## smoothness_se -0.222600125 0.006613777 -0.202694026 -0.166776667
## compactness_se 0.205999980 0.191974611 0.250743681 0.212582551
## concavity_se 0.194203623 0.143293077 0.228082345 0.207660060
## concave.points_se 0.376168956 0.163851025 0.407216916 0.372320282
## symmetry_se -0.104320881 0.009127168 -0.081629327 -0.072496588
## fractal_dimension_se -0.042641269 0.054457520 -0.005523391 -0.019886963
## radius_worst 0.969538973 0.352572947 0.969476363 0.962746086
## texture_worst 0.297007644 0.912044589 0.303038372 0.287488627
## perimeter_worst 0.965136514 0.358039575 0.970386887 0.959119574
## area_worst 0.941082460 0.343545947 0.941549808 0.959213326
## smoothness_worst 0.119616140 0.077503359 0.150549404 0.123522939
## compactness_worst 0.413462823 0.277829592 0.455774228 0.390410309
## concavity_worst 0.526911462 0.301025224 0.563879263 0.512605920
## concave.points_worst 0.744214198 0.295315843 0.771240789 0.722016626
## symmetry_worst 0.163953335 0.105007910 0.189115040 0.143569914
## fractal_dimension_worst 0.007065886 0.119205351 0.051018530 0.003737597
## smoothness_mean compactness_mean concavity_mean
## radius_mean 0.17058119 0.50612358 0.67676355
## texture_mean -0.02338852 0.23670222 0.30241783
## perimeter_mean 0.20727816 0.55693621 0.71613565
## area_mean 0.17702838 0.49850168 0.68598283
## smoothness_mean 1.00000000 0.65912322 0.52198377
## compactness_mean 0.65912322 1.00000000 0.88312067
## concavity_mean 0.52198377 0.88312067 1.00000000
## concave.points_mean 0.55369517 0.83113504 0.92139103
## symmetry_mean 0.55777479 0.60264105 0.50066662
## fractal_dimension_mean 0.58479200 0.56536866 0.33678336
## radius_se 0.30146710 0.49747345 0.63192482
## texture_se 0.06840645 0.04620483 0.07621835
## perimeter_se 0.29609193 0.54890526 0.66039079
## area_se 0.24655243 0.45565285 0.61742681
## smoothness_se 0.33237544 0.13529927 0.09856375
## compactness_se 0.31894330 0.73872179 0.67027882
## concavity_se 0.24839568 0.57051687 0.69127021
## concave.points_se 0.38067569 0.64226185 0.68325992
## symmetry_se 0.20077438 0.22997659 0.17800921
## fractal_dimension_se 0.28360670 0.50731813 0.44930075
## radius_worst 0.21312014 0.53531540 0.68823641
## texture_worst 0.03607180 0.24813283 0.29987889
## perimeter_worst 0.23885263 0.59021043 0.72956492
## area_worst 0.20671836 0.50960381 0.67598723
## smoothness_worst 0.80532420 0.56554117 0.44882204
## compactness_worst 0.47246844 0.86580904 0.75496802
## concavity_worst 0.43492571 0.81627525 0.88410264
## concave.points_worst 0.50305335 0.81557322 0.86132303
## symmetry_worst 0.39430948 0.51022343 0.40946413
## fractal_dimension_worst 0.49931637 0.68738232 0.51492989
## concave.points_mean symmetry_mean
## radius_mean 0.82252852 0.14774124
## texture_mean 0.29346405 0.07140098
## perimeter_mean 0.85097704 0.18302721
## area_mean 0.82326887 0.15129308
## smoothness_mean 0.55369517 0.55777479
## compactness_mean 0.83113504 0.60264105
## concavity_mean 0.92139103 0.50066662
## concave.points_mean 1.00000000 0.46249739
## symmetry_mean 0.46249739 1.00000000
## fractal_dimension_mean 0.16691738 0.47992133
## radius_se 0.69804983 0.30337926
## texture_se 0.02147958 0.12805293
## perimeter_se 0.71064987 0.31389276
## area_se 0.69029854 0.22397022
## smoothness_se 0.02765331 0.18732117
## compactness_se 0.49042425 0.42165915
## concavity_se 0.43916707 0.34262702
## concave.points_se 0.61563413 0.39329787
## symmetry_se 0.09535079 0.44913654
## fractal_dimension_se 0.25758375 0.33178615
## radius_worst 0.83031763 0.18572775
## texture_worst 0.29275171 0.09065069
## perimeter_worst 0.85592313 0.21916856
## area_worst 0.80962962 0.17719338
## smoothness_worst 0.45275305 0.42667503
## compactness_worst 0.66745368 0.47320001
## concavity_worst 0.75239950 0.43372101
## concave.points_worst 0.91015531 0.43029661
## symmetry_worst 0.37574415 0.69982580
## fractal_dimension_worst 0.36866113 0.43841350
## fractal_dimension_mean radius_se texture_se
## radius_mean -0.3116308263 0.6790903880 -0.09731744
## texture_mean -0.0764371834 0.2758686762 0.38635762
## perimeter_mean -0.2614769081 0.6917650135 -0.08676108
## area_mean -0.2831098117 0.7325622270 -0.06628021
## smoothness_mean 0.5847920019 0.3014670983 0.06840645
## compactness_mean 0.5653686634 0.4974734461 0.04620483
## concavity_mean 0.3367833594 0.6319248221 0.07621835
## concave.points_mean 0.1669173832 0.6980498336 0.02147958
## symmetry_mean 0.4799213301 0.3033792632 0.12805293
## fractal_dimension_mean 1.0000000000 0.0001109951 0.16417397
## radius_se 0.0001109951 1.0000000000 0.21324734
## texture_se 0.1641739659 0.2132473373 1.00000000
## perimeter_se 0.0398299316 0.9727936770 0.22317073
## area_se -0.0901702475 0.9518301121 0.11156725
## smoothness_se 0.4019644254 0.1645142198 0.39724285
## compactness_se 0.5598366906 0.3560645755 0.23169970
## concavity_se 0.4466303217 0.3323575376 0.19499846
## concave.points_se 0.3411980444 0.5133464414 0.23028340
## symmetry_se 0.3450073971 0.2405673625 0.41162068
## fractal_dimension_se 0.6881315775 0.2277535327 0.27972275
## radius_worst -0.2536914949 0.7150651951 -0.11169031
## texture_worst -0.0512692020 0.1947985568 0.40900277
## perimeter_worst -0.2051512113 0.7196838037 -0.10224192
## area_worst -0.2318544512 0.7515484761 -0.08319499
## smoothness_worst 0.5049420754 0.1419185529 -0.07365766
## compactness_worst 0.4587981567 0.2871031656 -0.09243935
## concavity_worst 0.3462338763 0.3805846346 -0.06895622
## concave.points_worst 0.1753254492 0.5310623278 -0.11963752
## symmetry_worst 0.3340186839 0.0945428304 -0.12821476
## fractal_dimension_worst 0.7672967792 0.0495594325 -0.04565457
## perimeter_se area_se smoothness_se compactness_se
## radius_mean 0.67417162 0.73586366 -0.222600125 0.2060000
## texture_mean 0.28167311 0.25984499 0.006613777 0.1919746
## perimeter_mean 0.69313489 0.74498269 -0.202694026 0.2507437
## area_mean 0.72662833 0.80008592 -0.166776667 0.2125826
## smoothness_mean 0.29609193 0.24655243 0.332375443 0.3189433
## compactness_mean 0.54890526 0.45565285 0.135299268 0.7387218
## concavity_mean 0.66039079 0.61742681 0.098563746 0.6702788
## concave.points_mean 0.71064987 0.69029854 0.027653308 0.4904242
## symmetry_mean 0.31389276 0.22397022 0.187321165 0.4216591
## fractal_dimension_mean 0.03982993 -0.09017025 0.401964425 0.5598367
## radius_se 0.97279368 0.95183011 0.164514220 0.3560646
## texture_se 0.22317073 0.11156725 0.397242853 0.2316997
## perimeter_se 1.00000000 0.93765541 0.151075331 0.4163224
## area_se 0.93765541 1.00000000 0.075150338 0.2848401
## smoothness_se 0.15107533 0.07515034 1.000000000 0.3366961
## compactness_se 0.41632237 0.28484006 0.336696081 1.0000000
## concavity_se 0.36248158 0.27089473 0.268684760 0.8012683
## concave.points_se 0.55626408 0.41572957 0.328429499 0.7440827
## symmetry_se 0.26648709 0.13410898 0.413506125 0.3947128
## fractal_dimension_se 0.24414277 0.12707090 0.427374207 0.8032688
## radius_worst 0.69720059 0.75737319 -0.230690710 0.2046072
## texture_worst 0.20037085 0.19649665 -0.074742965 0.1430026
## perimeter_worst 0.72103131 0.76121264 -0.217303755 0.2605158
## area_worst 0.73071297 0.81140796 -0.182195478 0.1993713
## smoothness_worst 0.13005439 0.12538943 0.314457456 0.2273942
## compactness_worst 0.34191945 0.28325654 -0.055558139 0.6787804
## concavity_worst 0.41889882 0.38510014 -0.058298387 0.6391467
## concave.points_worst 0.55489723 0.53816631 -0.102006796 0.4832083
## symmetry_worst 0.10993043 0.07412629 -0.107342098 0.2778784
## fractal_dimension_worst 0.08543257 0.01753930 0.101480315 0.5909728
## concavity_se concave.points_se symmetry_se
## radius_mean 0.1942036 0.37616896 -0.104320881
## texture_mean 0.1432931 0.16385103 0.009127168
## perimeter_mean 0.2280823 0.40721692 -0.081629327
## area_mean 0.2076601 0.37232028 -0.072496588
## smoothness_mean 0.2483957 0.38067569 0.200774376
## compactness_mean 0.5705169 0.64226185 0.229976591
## concavity_mean 0.6912702 0.68325992 0.178009208
## concave.points_mean 0.4391671 0.61563413 0.095350787
## symmetry_mean 0.3426270 0.39329787 0.449136542
## fractal_dimension_mean 0.4466303 0.34119804 0.345007397
## radius_se 0.3323575 0.51334644 0.240567362
## texture_se 0.1949985 0.23028340 0.411620680
## perimeter_se 0.3624816 0.55626408 0.266487092
## area_se 0.2708947 0.41572957 0.134108980
## smoothness_se 0.2686848 0.32842950 0.413506125
## compactness_se 0.8012683 0.74408267 0.394712835
## concavity_se 1.0000000 0.77180399 0.309428578
## concave.points_se 0.7718040 1.00000000 0.312780223
## symmetry_se 0.3094286 0.31278022 1.000000000
## fractal_dimension_se 0.7273722 0.61104414 0.369078083
## radius_worst 0.1869035 0.35812667 -0.128120769
## texture_worst 0.1002410 0.08674121 -0.077473420
## perimeter_worst 0.2266804 0.39499925 -0.103753044
## area_worst 0.1883527 0.34227116 -0.110342743
## smoothness_worst 0.1684813 0.21535060 -0.012661800
## compactness_worst 0.4848578 0.45288838 0.060254879
## concavity_worst 0.6625641 0.54959238 0.037119049
## concave.points_worst 0.4404723 0.60244961 -0.030413396
## symmetry_worst 0.1977878 0.14311567 0.389402485
## fractal_dimension_worst 0.4393293 0.31065455 0.078079476
## fractal_dimension_se radius_worst texture_worst
## radius_mean -0.042641269 0.96953897 0.297007644
## texture_mean 0.054457520 0.35257295 0.912044589
## perimeter_mean -0.005523391 0.96947636 0.303038372
## area_mean -0.019886963 0.96274609 0.287488627
## smoothness_mean 0.283606699 0.21312014 0.036071799
## compactness_mean 0.507318127 0.53531540 0.248132833
## concavity_mean 0.449300749 0.68823641 0.299878889
## concave.points_mean 0.257583746 0.83031763 0.292751713
## symmetry_mean 0.331786146 0.18572775 0.090650688
## fractal_dimension_mean 0.688131577 -0.25369149 -0.051269202
## radius_se 0.227753533 0.71506520 0.194798557
## texture_se 0.279722748 -0.11169031 0.409002766
## perimeter_se 0.244142773 0.69720059 0.200370854
## area_se 0.127070903 0.75737319 0.196496649
## smoothness_se 0.427374207 -0.23069071 -0.074742965
## compactness_se 0.803268818 0.20460717 0.143002583
## concavity_se 0.727372184 0.18690352 0.100240984
## concave.points_se 0.611044139 0.35812667 0.086741210
## symmetry_se 0.369078083 -0.12812077 -0.077473420
## fractal_dimension_se 1.000000000 -0.03748762 -0.003195029
## radius_worst -0.037487618 1.00000000 0.359920754
## texture_worst -0.003195029 0.35992075 1.000000000
## perimeter_worst -0.001000398 0.99370792 0.365098245
## area_worst -0.022736147 0.98401456 0.345842283
## smoothness_worst 0.170568316 0.21657443 0.225429415
## compactness_worst 0.390158842 0.47582004 0.360832339
## concavity_worst 0.379974661 0.57397471 0.368365607
## concave.points_worst 0.215204013 0.78742385 0.359754610
## symmetry_worst 0.111093956 0.24352920 0.233027461
## fractal_dimension_worst 0.591328066 0.09349198 0.219122425
## perimeter_worst area_worst smoothness_worst
## radius_mean 0.965136514 0.94108246 0.11961614
## texture_mean 0.358039575 0.34354595 0.07750336
## perimeter_mean 0.970386887 0.94154981 0.15054940
## area_mean 0.959119574 0.95921333 0.12352294
## smoothness_mean 0.238852626 0.20671836 0.80532420
## compactness_mean 0.590210428 0.50960381 0.56554117
## concavity_mean 0.729564917 0.67598723 0.44882204
## concave.points_mean 0.855923128 0.80962962 0.45275305
## symmetry_mean 0.219168559 0.17719338 0.42667503
## fractal_dimension_mean -0.205151211 -0.23185445 0.50494208
## radius_se 0.719683804 0.75154848 0.14191855
## texture_se -0.102241922 -0.08319499 -0.07365766
## perimeter_se 0.721031310 0.73071297 0.13005439
## area_se 0.761212636 0.81140796 0.12538943
## smoothness_se -0.217303755 -0.18219548 0.31445746
## compactness_se 0.260515840 0.19937133 0.22739423
## concavity_se 0.226680426 0.18835265 0.16848132
## concave.points_se 0.394999252 0.34227116 0.21535060
## symmetry_se -0.103753044 -0.11034274 -0.01266180
## fractal_dimension_se -0.001000398 -0.02273615 0.17056832
## radius_worst 0.993707916 0.98401456 0.21657443
## texture_worst 0.365098245 0.34584228 0.22542941
## perimeter_worst 1.000000000 0.97757809 0.23677460
## area_worst 0.977578091 1.00000000 0.20914533
## smoothness_worst 0.236774604 0.20914533 1.00000000
## compactness_worst 0.529407690 0.43829628 0.56818652
## concavity_worst 0.618344080 0.54333053 0.51852329
## concave.points_worst 0.816322102 0.74741880 0.54769090
## symmetry_worst 0.269492769 0.20914551 0.49383833
## fractal_dimension_worst 0.138956862 0.07964703 0.61762419
## compactness_worst concavity_worst concave.points_worst
## radius_mean 0.41346282 0.52691146 0.7442142
## texture_mean 0.27782959 0.30102522 0.2953158
## perimeter_mean 0.45577423 0.56387926 0.7712408
## area_mean 0.39041031 0.51260592 0.7220166
## smoothness_mean 0.47246844 0.43492571 0.5030534
## compactness_mean 0.86580904 0.81627525 0.8155732
## concavity_mean 0.75496802 0.88410264 0.8613230
## concave.points_mean 0.66745368 0.75239950 0.9101553
## symmetry_mean 0.47320001 0.43372101 0.4302966
## fractal_dimension_mean 0.45879816 0.34623388 0.1753254
## radius_se 0.28710317 0.38058463 0.5310623
## texture_se -0.09243935 -0.06895622 -0.1196375
## perimeter_se 0.34191945 0.41889882 0.5548972
## area_se 0.28325654 0.38510014 0.5381663
## smoothness_se -0.05555814 -0.05829839 -0.1020068
## compactness_se 0.67878035 0.63914670 0.4832083
## concavity_se 0.48485780 0.66256413 0.4404723
## concave.points_se 0.45288838 0.54959238 0.6024496
## symmetry_se 0.06025488 0.03711905 -0.0304134
## fractal_dimension_se 0.39015884 0.37997466 0.2152040
## radius_worst 0.47582004 0.57397471 0.7874239
## texture_worst 0.36083234 0.36836561 0.3597546
## perimeter_worst 0.52940769 0.61834408 0.8163221
## area_worst 0.43829628 0.54333053 0.7474188
## smoothness_worst 0.56818652 0.51852329 0.5476909
## compactness_worst 1.00000000 0.89226090 0.8010804
## concavity_worst 0.89226090 1.00000000 0.8554339
## concave.points_worst 0.80108036 0.85543386 1.0000000
## symmetry_worst 0.61444050 0.53251973 0.5025285
## fractal_dimension_worst 0.81045486 0.68651092 0.5111141
## symmetry_worst fractal_dimension_worst
## radius_mean 0.16395333 0.007065886
## texture_mean 0.10500791 0.119205351
## perimeter_mean 0.18911504 0.051018530
## area_mean 0.14356991 0.003737597
## smoothness_mean 0.39430948 0.499316369
## compactness_mean 0.51022343 0.687382323
## concavity_mean 0.40946413 0.514929891
## concave.points_mean 0.37574415 0.368661134
## symmetry_mean 0.69982580 0.438413498
## fractal_dimension_mean 0.33401868 0.767296779
## radius_se 0.09454283 0.049559432
## texture_se -0.12821476 -0.045654569
## perimeter_se 0.10993043 0.085432572
## area_se 0.07412629 0.017539295
## smoothness_se -0.10734210 0.101480315
## compactness_se 0.27787843 0.590972763
## concavity_se 0.19778782 0.439329269
## concave.points_se 0.14311567 0.310654551
## symmetry_se 0.38940248 0.078079476
## fractal_dimension_se 0.11109396 0.591328066
## radius_worst 0.24352920 0.093491979
## texture_worst 0.23302746 0.219122425
## perimeter_worst 0.26949277 0.138956862
## area_worst 0.20914551 0.079647034
## smoothness_worst 0.49383833 0.617624192
## compactness_worst 0.61444050 0.810454856
## concavity_worst 0.53251973 0.686510921
## concave.points_worst 0.50252849 0.511114146
## symmetry_worst 1.00000000 0.537848206
## fractal_dimension_worst 0.53784821 1.000000000
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
# Adjust margins to give labels more space
par(mar = c(5, 5, 5, 5))
# Plot correlation matrix with larger text and better spacing
corrplot(corr_matrix, method="circle", type="upper", order="hclust",
tl.col="black", tl.srt=45, tl.cex=0.6) # Adjust tl.cex for text size
id because it is irrelevant and would skew
the data while providing no significant value, will be done further
down# Load necessary libraries
library(caret)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(e1071) # For SVM
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(dplyr)
library(car)
cancer_data = read.csv("CancerData.csv")
# Remove 'id' column
cancer_data <- cancer_data[, !(names(cancer_data) %in% c("id"))]
# Encode 'diagnosis' as factor (B = 0, M = 1)
cancer_data$diagnosis <- as.numeric(factor(cancer_data$diagnosis, levels = c("B", "M"))) - 1
# Keep a copy of the full dataset before feature selection
cancer_data_full <- cancer_data
# Remove multicollinear variables for logistic regression
cancer_data <- dplyr::select(cancer_data, -radius_mean, -perimeter_mean, -area_mean, -compactness_mean,
-concave.points_mean, -area_worst, -smoothness_worst, -concavity_worst,
-fractal_dimension_se, -concave.points_worst, -perimeter_worst, -radius_worst,
-compactness_se, -texture_worst, -symmetry_worst, -radius_se, -concavity_se,
-fractal_dimension_worst)
# Split into training (80%) and testing (20%) sets for both datasets
set.seed(2025)
trainIndex <- createDataPartition(cancer_data$diagnosis, p = 0.8, list = FALSE)
train_data <- cancer_data[trainIndex, ]
test_data <- cancer_data[-trainIndex, ]
train_data_full <- cancer_data_full[trainIndex, ] # Full dataset (all features)
test_data_full <- cancer_data_full[-trainIndex, ]
print("trainig set number of Benign and Malignant observations")
## [1] "trainig set number of Benign and Malignant observations"
table(train_data$diagnosis)
##
## 0 1
## 279 177
print("testing set number of Benign and Malignant observations")
## [1] "testing set number of Benign and Malignant observations"
table(test_data$diagnosis)
##
## 0 1
## 78 35
# Standardize numeric features (excluding 'diagnosis')
preprocess_params <- preProcess(train_data[, -1], method = c("center", "scale"))
train_data[, -1] <- predict(preprocess_params, train_data[, -1])
test_data[, -1] <- predict(preprocess_params, test_data[, -1])
preprocess_params_full <- preProcess(train_data_full[, -1], method = c("center", "scale"))
train_data_full[, -1] <- predict(preprocess_params_full, train_data_full[, -1])
test_data_full[, -1] <- predict(preprocess_params_full, test_data_full[, -1])
# Convert diagnosis to factor
train_data$diagnosis <- factor(train_data$diagnosis, levels = c(0, 1), labels = c("Benign", "Malignant"))
test_data$diagnosis <- factor(test_data$diagnosis, levels = c(0, 1), labels = c("Benign", "Malignant"))
train_data_full$diagnosis <- factor(train_data_full$diagnosis, levels = c(0, 1), labels = c("Benign", "Malignant"))
test_data_full$diagnosis <- factor(test_data_full$diagnosis, levels = c(0, 1), labels = c("Benign", "Malignant"))
# Define training control for cross-validation
train_control <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary)
# Train Logistic Regression (using reduced dataset)
logistic_model <- train(diagnosis ~ ., data = train_data, method = "glm",
family = binomial, trControl = train_control, metric = "ROC")
# Train Random Forest (using full dataset)
rf_model <- train(diagnosis ~ ., data = train_data_full, method = "rf",
trControl = train_control, metric = "ROC", ntree = 100)
# Train SVM (using full dataset)
svm_model <- train(diagnosis ~ ., data = train_data_full, method = "svmRadial",
trControl = train_control, metric = "ROC")
# Train XGBoost (using full dataset)
xgb_grid <- expand.grid(
nrounds = 100, # Number of boosting iterations
max_depth = 3,
eta = 0.1,
gamma = 0,
colsample_bytree = 0.8,
min_child_weight = 1,
subsample = 0.8
)
xgb_model <- train(
diagnosis ~ .,
data = train_data_full,
method = "xgbTree",
trControl = train_control,
metric = "ROC",
tuneGrid = xgb_grid
)
# Function to evaluate models
evaluate_model <- function(model, test_data) {
predictions <- predict(model, test_data)
prob_predictions <- predict(model, test_data, type = "prob")[, 2]
accuracy <- mean(predictions == test_data$diagnosis)
auc <- roc(test_data$diagnosis, prob_predictions)$auc
cat("\nModel:", model$method)
cat("\nAccuracy:", round(accuracy, 4))
cat("\nAUC-ROC:", round(auc, 4))
cat("\nConfusion Matrix:\n")
print(confusionMatrix(predictions, test_data$diagnosis))
}
# Evaluate models
evaluate_model(logistic_model, test_data) # Using reduced dataset
## Setting levels: control = Benign, case = Malignant
## Setting direction: controls < cases
##
## Model: glm
## Accuracy: 0.9469
## AUC-ROC: 0.9901
## Confusion Matrix:
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 75 3
## Malignant 3 32
##
## Accuracy : 0.9469
## 95% CI : (0.888, 0.9803)
## No Information Rate : 0.6903
## P-Value [Acc > NIR] : 1.511e-11
##
## Kappa : 0.8758
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9615
## Specificity : 0.9143
## Pos Pred Value : 0.9615
## Neg Pred Value : 0.9143
## Prevalence : 0.6903
## Detection Rate : 0.6637
## Detection Prevalence : 0.6903
## Balanced Accuracy : 0.9379
##
## 'Positive' Class : Benign
##
evaluate_model(rf_model, test_data_full) # Using full dataset
## Setting levels: control = Benign, case = Malignant
## Setting direction: controls < cases
##
## Model: rf
## Accuracy: 0.9823
## AUC-ROC: 0.9828
## Confusion Matrix:
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 77 1
## Malignant 1 34
##
## Accuracy : 0.9823
## 95% CI : (0.9375, 0.9978)
## No Information Rate : 0.6903
## P-Value [Acc > NIR] : 8.537e-16
##
## Kappa : 0.9586
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9872
## Specificity : 0.9714
## Pos Pred Value : 0.9872
## Neg Pred Value : 0.9714
## Prevalence : 0.6903
## Detection Rate : 0.6814
## Detection Prevalence : 0.6903
## Balanced Accuracy : 0.9793
##
## 'Positive' Class : Benign
##
evaluate_model(svm_model, test_data_full) # Using full dataset
## Setting levels: control = Benign, case = Malignant
## Setting direction: controls < cases
##
## Model: svmRadial
## Accuracy: 0.9646
## AUC-ROC: 0.9938
## Confusion Matrix:
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 75 1
## Malignant 3 34
##
## Accuracy : 0.9646
## 95% CI : (0.9118, 0.9903)
## No Information Rate : 0.6903
## P-Value [Acc > NIR] : 1.826e-13
##
## Kappa : 0.9185
##
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.9615
## Specificity : 0.9714
## Pos Pred Value : 0.9868
## Neg Pred Value : 0.9189
## Prevalence : 0.6903
## Detection Rate : 0.6637
## Detection Prevalence : 0.6726
## Balanced Accuracy : 0.9665
##
## 'Positive' Class : Benign
##
evaluate_model(xgb_model, test_data_full) # Using full dataset
## Setting levels: control = Benign, case = Malignant
## Setting direction: controls < cases
##
## Model: xgbTree
## Accuracy: 0.9823
## AUC-ROC: 0.9912
## Confusion Matrix:
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 77 1
## Malignant 1 34
##
## Accuracy : 0.9823
## 95% CI : (0.9375, 0.9978)
## No Information Rate : 0.6903
## P-Value [Acc > NIR] : 8.537e-16
##
## Kappa : 0.9586
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9872
## Specificity : 0.9714
## Pos Pred Value : 0.9872
## Neg Pred Value : 0.9714
## Prevalence : 0.6903
## Detection Rate : 0.6814
## Detection Prevalence : 0.6903
## Balanced Accuracy : 0.9793
##
## 'Positive' Class : Benign
##
# Load required libraries
library(caTools)
library(rpart)
## Warning: package 'rpart' was built under R version 4.3.3
library(rpart.plot) # For better tree visualization
## Warning: package 'rpart.plot' was built under R version 4.3.3
# Ensure 'diagnosis' is a factor
cancer_data$diagnosis <- factor(cancer_data$diagnosis, levels = c(0, 1), labels = c("Benign", "Malignant"))
# Set seed for reproducibility
set.seed(2025)
# Perform stratified split
split_2 <- sample.split(cancer_data$diagnosis, SplitRatio = 0.8)
# Create training and test sets
cancer_train_2 <- subset(cancer_data, split_2 == TRUE)
cancer_test_2 <- subset(cancer_data, split_2 == FALSE)
# Train Decision Tree Model
tree_model <- rpart(diagnosis ~ ., data = cancer_train_2, method = "class")
# Plot the tree using an improved visualization
rpart.plot(tree_model, type = 2, extra = 104, tweak = 1.2, box.palette = "RdYlGn", shadow.col = "gray", nn = TRUE)
# Get variable importance
rf_importance <- varImp(rf_model)
# Print importance
print(rf_importance)
## rf variable importance
##
## only 20 most important variables shown (out of 30)
##
## Overall
## concave.points_worst 100.000
## area_worst 80.975
## area_mean 71.959
## radius_worst 62.302
## concave.points_mean 59.854
## perimeter_mean 55.627
## concavity_worst 42.759
## area_se 41.396
## radius_mean 38.970
## perimeter_worst 33.813
## concavity_mean 27.732
## compactness_worst 27.351
## radius_se 25.590
## perimeter_se 22.055
## compactness_mean 16.623
## texture_mean 15.874
## fractal_dimension_worst 9.994
## texture_worst 8.734
## concave.points_se 8.183
## smoothness_worst 7.095
# Plot importance
plot(rf_importance, main = "Variable Importance - Random Forest")