library(MASS)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(lattice)
library(ggplot2)
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-5
library(readr)
library(ROCR)
library(readxl)
library(e1071)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
cancer_data = read.csv("CancerData.csv")
head(cancer_data)
##         id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1   842302         M       17.99        10.38         122.80    1001.0
## 2   842517         M       20.57        17.77         132.90    1326.0
## 3 84300903         M       19.69        21.25         130.00    1203.0
## 4 84348301         M       11.42        20.38          77.58     386.1
## 5 84358402         M       20.29        14.34         135.10    1297.0
## 6   843786         M       12.45        15.70          82.57     477.1
##   smoothness_mean compactness_mean concavity_mean concave.points_mean
## 1         0.11840          0.27760         0.3001             0.14710
## 2         0.08474          0.07864         0.0869             0.07017
## 3         0.10960          0.15990         0.1974             0.12790
## 4         0.14250          0.28390         0.2414             0.10520
## 5         0.10030          0.13280         0.1980             0.10430
## 6         0.12780          0.17000         0.1578             0.08089
##   symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1        0.2419                0.07871    1.0950     0.9053        8.589
## 2        0.1812                0.05667    0.5435     0.7339        3.398
## 3        0.2069                0.05999    0.7456     0.7869        4.585
## 4        0.2597                0.09744    0.4956     1.1560        3.445
## 5        0.1809                0.05883    0.7572     0.7813        5.438
## 6        0.2087                0.07613    0.3345     0.8902        2.217
##   area_se smoothness_se compactness_se concavity_se concave.points_se
## 1  153.40      0.006399        0.04904      0.05373           0.01587
## 2   74.08      0.005225        0.01308      0.01860           0.01340
## 3   94.03      0.006150        0.04006      0.03832           0.02058
## 4   27.23      0.009110        0.07458      0.05661           0.01867
## 5   94.44      0.011490        0.02461      0.05688           0.01885
## 6   27.19      0.007510        0.03345      0.03672           0.01137
##   symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst
## 1     0.03003             0.006193        25.38         17.33          184.60
## 2     0.01389             0.003532        24.99         23.41          158.80
## 3     0.02250             0.004571        23.57         25.53          152.50
## 4     0.05963             0.009208        14.91         26.50           98.87
## 5     0.01756             0.005115        22.54         16.67          152.20
## 6     0.02165             0.005082        15.47         23.75          103.40
##   area_worst smoothness_worst compactness_worst concavity_worst
## 1     2019.0           0.1622            0.6656          0.7119
## 2     1956.0           0.1238            0.1866          0.2416
## 3     1709.0           0.1444            0.4245          0.4504
## 4      567.7           0.2098            0.8663          0.6869
## 5     1575.0           0.1374            0.2050          0.4000
## 6      741.6           0.1791            0.5249          0.5355
##   concave.points_worst symmetry_worst fractal_dimension_worst
## 1               0.2654         0.4601                 0.11890
## 2               0.1860         0.2750                 0.08902
## 3               0.2430         0.3613                 0.08758
## 4               0.2575         0.6638                 0.17300
## 5               0.1625         0.2364                 0.07678
## 6               0.1741         0.3985                 0.12440

Only one character variable, diagnosis, if the cancer is M/malignent or B/benign

str(cancer_data)
## 'data.frame':    569 obs. of  32 variables:
##  $ id                     : int  842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
##  $ diagnosis              : chr  "M" "M" "M" "M" ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...
summary(cancer_data)
##        id             diagnosis          radius_mean      texture_mean  
##  Min.   :     8670   Length:569         Min.   : 6.981   Min.   : 9.71  
##  1st Qu.:   869218   Class :character   1st Qu.:11.700   1st Qu.:16.17  
##  Median :   906024   Mode  :character   Median :13.370   Median :18.84  
##  Mean   : 30371831                      Mean   :14.127   Mean   :19.29  
##  3rd Qu.:  8813129                      3rd Qu.:15.780   3rd Qu.:21.80  
##  Max.   :911320502                      Max.   :28.110   Max.   :39.28  
##  perimeter_mean     area_mean      smoothness_mean   compactness_mean 
##  Min.   : 43.79   Min.   : 143.5   Min.   :0.05263   Min.   :0.01938  
##  1st Qu.: 75.17   1st Qu.: 420.3   1st Qu.:0.08637   1st Qu.:0.06492  
##  Median : 86.24   Median : 551.1   Median :0.09587   Median :0.09263  
##  Mean   : 91.97   Mean   : 654.9   Mean   :0.09636   Mean   :0.10434  
##  3rd Qu.:104.10   3rd Qu.: 782.7   3rd Qu.:0.10530   3rd Qu.:0.13040  
##  Max.   :188.50   Max.   :2501.0   Max.   :0.16340   Max.   :0.34540  
##  concavity_mean    concave.points_mean symmetry_mean    fractal_dimension_mean
##  Min.   :0.00000   Min.   :0.00000     Min.   :0.1060   Min.   :0.04996       
##  1st Qu.:0.02956   1st Qu.:0.02031     1st Qu.:0.1619   1st Qu.:0.05770       
##  Median :0.06154   Median :0.03350     Median :0.1792   Median :0.06154       
##  Mean   :0.08880   Mean   :0.04892     Mean   :0.1812   Mean   :0.06280       
##  3rd Qu.:0.13070   3rd Qu.:0.07400     3rd Qu.:0.1957   3rd Qu.:0.06612       
##  Max.   :0.42680   Max.   :0.20120     Max.   :0.3040   Max.   :0.09744       
##    radius_se        texture_se      perimeter_se       area_se       
##  Min.   :0.1115   Min.   :0.3602   Min.   : 0.757   Min.   :  6.802  
##  1st Qu.:0.2324   1st Qu.:0.8339   1st Qu.: 1.606   1st Qu.: 17.850  
##  Median :0.3242   Median :1.1080   Median : 2.287   Median : 24.530  
##  Mean   :0.4052   Mean   :1.2169   Mean   : 2.866   Mean   : 40.337  
##  3rd Qu.:0.4789   3rd Qu.:1.4740   3rd Qu.: 3.357   3rd Qu.: 45.190  
##  Max.   :2.8730   Max.   :4.8850   Max.   :21.980   Max.   :542.200  
##  smoothness_se      compactness_se      concavity_se     concave.points_se 
##  Min.   :0.001713   Min.   :0.002252   Min.   :0.00000   Min.   :0.000000  
##  1st Qu.:0.005169   1st Qu.:0.013080   1st Qu.:0.01509   1st Qu.:0.007638  
##  Median :0.006380   Median :0.020450   Median :0.02589   Median :0.010930  
##  Mean   :0.007041   Mean   :0.025478   Mean   :0.03189   Mean   :0.011796  
##  3rd Qu.:0.008146   3rd Qu.:0.032450   3rd Qu.:0.04205   3rd Qu.:0.014710  
##  Max.   :0.031130   Max.   :0.135400   Max.   :0.39600   Max.   :0.052790  
##   symmetry_se       fractal_dimension_se  radius_worst   texture_worst  
##  Min.   :0.007882   Min.   :0.0008948    Min.   : 7.93   Min.   :12.02  
##  1st Qu.:0.015160   1st Qu.:0.0022480    1st Qu.:13.01   1st Qu.:21.08  
##  Median :0.018730   Median :0.0031870    Median :14.97   Median :25.41  
##  Mean   :0.020542   Mean   :0.0037949    Mean   :16.27   Mean   :25.68  
##  3rd Qu.:0.023480   3rd Qu.:0.0045580    3rd Qu.:18.79   3rd Qu.:29.72  
##  Max.   :0.078950   Max.   :0.0298400    Max.   :36.04   Max.   :49.54  
##  perimeter_worst    area_worst     smoothness_worst  compactness_worst
##  Min.   : 50.41   Min.   : 185.2   Min.   :0.07117   Min.   :0.02729  
##  1st Qu.: 84.11   1st Qu.: 515.3   1st Qu.:0.11660   1st Qu.:0.14720  
##  Median : 97.66   Median : 686.5   Median :0.13130   Median :0.21190  
##  Mean   :107.26   Mean   : 880.6   Mean   :0.13237   Mean   :0.25427  
##  3rd Qu.:125.40   3rd Qu.:1084.0   3rd Qu.:0.14600   3rd Qu.:0.33910  
##  Max.   :251.20   Max.   :4254.0   Max.   :0.22260   Max.   :1.05800  
##  concavity_worst  concave.points_worst symmetry_worst   fractal_dimension_worst
##  Min.   :0.0000   Min.   :0.00000      Min.   :0.1565   Min.   :0.05504        
##  1st Qu.:0.1145   1st Qu.:0.06493      1st Qu.:0.2504   1st Qu.:0.07146        
##  Median :0.2267   Median :0.09993      Median :0.2822   Median :0.08004        
##  Mean   :0.2722   Mean   :0.11461      Mean   :0.2901   Mean   :0.08395        
##  3rd Qu.:0.3829   3rd Qu.:0.16140      3rd Qu.:0.3179   3rd Qu.:0.09208        
##  Max.   :1.2520   Max.   :0.29100      Max.   :0.6638   Max.   :0.20750

Shows no missing or NA values

colSums(is.na(cancer_data)) #the code book says there are no missing, but double checking
##                      id               diagnosis             radius_mean 
##                       0                       0                       0 
##            texture_mean          perimeter_mean               area_mean 
##                       0                       0                       0 
##         smoothness_mean        compactness_mean          concavity_mean 
##                       0                       0                       0 
##     concave.points_mean           symmetry_mean  fractal_dimension_mean 
##                       0                       0                       0 
##               radius_se              texture_se            perimeter_se 
##                       0                       0                       0 
##                 area_se           smoothness_se          compactness_se 
##                       0                       0                       0 
##            concavity_se       concave.points_se             symmetry_se 
##                       0                       0                       0 
##    fractal_dimension_se            radius_worst           texture_worst 
##                       0                       0                       0 
##         perimeter_worst              area_worst        smoothness_worst 
##                       0                       0                       0 
##       compactness_worst         concavity_worst    concave.points_worst 
##                       0                       0                       0 
##          symmetry_worst fractal_dimension_worst 
##                       0                       0

Visually we can see the distribution of diagnosis levels

ggplot(cancer_data, aes(x = diagnosis, fill = diagnosis)) +
  geom_bar() +
  scale_fill_manual(values = c("B" = "blue", "M" = "red")) +  # Swaps default colors
  labs(title = "Distribution of Diagnosis", x = "Diagnosis", y = "Frequency") +
  theme_minimal()

table(cancer_data$diagnosis)
## 
##   B   M 
## 357 212

We can see that there are 357 benign observations, and 212 Malignent obvservations (the goal of this study is to see if we can classify certain results, so early detection can catch it, but we ultimatley want benign, we will focus mostly of malignent, so we can see the data that causes the malignent results)

cancer_data_copy = cancer_data
cancer_data_copy <- dplyr::select(cancer_data_copy, -id, -diagnosis)
corr_matrix <- cor(cancer_data_copy)
print(corr_matrix)
##                          radius_mean texture_mean perimeter_mean    area_mean
## radius_mean              1.000000000  0.323781891    0.997855281  0.987357170
## texture_mean             0.323781891  1.000000000    0.329533059  0.321085696
## perimeter_mean           0.997855281  0.329533059    1.000000000  0.986506804
## area_mean                0.987357170  0.321085696    0.986506804  1.000000000
## smoothness_mean          0.170581187 -0.023388516    0.207278164  0.177028377
## compactness_mean         0.506123578  0.236702222    0.556936211  0.498501682
## concavity_mean           0.676763550  0.302417828    0.716135650  0.685982829
## concave.points_mean      0.822528522  0.293464051    0.850977041  0.823268869
## symmetry_mean            0.147741242  0.071400980    0.183027212  0.151293079
## fractal_dimension_mean  -0.311630826 -0.076437183   -0.261476908 -0.283109812
## radius_se                0.679090388  0.275868676    0.691765014  0.732562227
## texture_se              -0.097317443  0.386357623   -0.086761078 -0.066280214
## perimeter_se             0.674171616  0.281673115    0.693134890  0.726628328
## area_se                  0.735863663  0.259844987    0.744982694  0.800085921
## smoothness_se           -0.222600125  0.006613777   -0.202694026 -0.166776667
## compactness_se           0.205999980  0.191974611    0.250743681  0.212582551
## concavity_se             0.194203623  0.143293077    0.228082345  0.207660060
## concave.points_se        0.376168956  0.163851025    0.407216916  0.372320282
## symmetry_se             -0.104320881  0.009127168   -0.081629327 -0.072496588
## fractal_dimension_se    -0.042641269  0.054457520   -0.005523391 -0.019886963
## radius_worst             0.969538973  0.352572947    0.969476363  0.962746086
## texture_worst            0.297007644  0.912044589    0.303038372  0.287488627
## perimeter_worst          0.965136514  0.358039575    0.970386887  0.959119574
## area_worst               0.941082460  0.343545947    0.941549808  0.959213326
## smoothness_worst         0.119616140  0.077503359    0.150549404  0.123522939
## compactness_worst        0.413462823  0.277829592    0.455774228  0.390410309
## concavity_worst          0.526911462  0.301025224    0.563879263  0.512605920
## concave.points_worst     0.744214198  0.295315843    0.771240789  0.722016626
## symmetry_worst           0.163953335  0.105007910    0.189115040  0.143569914
## fractal_dimension_worst  0.007065886  0.119205351    0.051018530  0.003737597
##                         smoothness_mean compactness_mean concavity_mean
## radius_mean                  0.17058119       0.50612358     0.67676355
## texture_mean                -0.02338852       0.23670222     0.30241783
## perimeter_mean               0.20727816       0.55693621     0.71613565
## area_mean                    0.17702838       0.49850168     0.68598283
## smoothness_mean              1.00000000       0.65912322     0.52198377
## compactness_mean             0.65912322       1.00000000     0.88312067
## concavity_mean               0.52198377       0.88312067     1.00000000
## concave.points_mean          0.55369517       0.83113504     0.92139103
## symmetry_mean                0.55777479       0.60264105     0.50066662
## fractal_dimension_mean       0.58479200       0.56536866     0.33678336
## radius_se                    0.30146710       0.49747345     0.63192482
## texture_se                   0.06840645       0.04620483     0.07621835
## perimeter_se                 0.29609193       0.54890526     0.66039079
## area_se                      0.24655243       0.45565285     0.61742681
## smoothness_se                0.33237544       0.13529927     0.09856375
## compactness_se               0.31894330       0.73872179     0.67027882
## concavity_se                 0.24839568       0.57051687     0.69127021
## concave.points_se            0.38067569       0.64226185     0.68325992
## symmetry_se                  0.20077438       0.22997659     0.17800921
## fractal_dimension_se         0.28360670       0.50731813     0.44930075
## radius_worst                 0.21312014       0.53531540     0.68823641
## texture_worst                0.03607180       0.24813283     0.29987889
## perimeter_worst              0.23885263       0.59021043     0.72956492
## area_worst                   0.20671836       0.50960381     0.67598723
## smoothness_worst             0.80532420       0.56554117     0.44882204
## compactness_worst            0.47246844       0.86580904     0.75496802
## concavity_worst              0.43492571       0.81627525     0.88410264
## concave.points_worst         0.50305335       0.81557322     0.86132303
## symmetry_worst               0.39430948       0.51022343     0.40946413
## fractal_dimension_worst      0.49931637       0.68738232     0.51492989
##                         concave.points_mean symmetry_mean
## radius_mean                      0.82252852    0.14774124
## texture_mean                     0.29346405    0.07140098
## perimeter_mean                   0.85097704    0.18302721
## area_mean                        0.82326887    0.15129308
## smoothness_mean                  0.55369517    0.55777479
## compactness_mean                 0.83113504    0.60264105
## concavity_mean                   0.92139103    0.50066662
## concave.points_mean              1.00000000    0.46249739
## symmetry_mean                    0.46249739    1.00000000
## fractal_dimension_mean           0.16691738    0.47992133
## radius_se                        0.69804983    0.30337926
## texture_se                       0.02147958    0.12805293
## perimeter_se                     0.71064987    0.31389276
## area_se                          0.69029854    0.22397022
## smoothness_se                    0.02765331    0.18732117
## compactness_se                   0.49042425    0.42165915
## concavity_se                     0.43916707    0.34262702
## concave.points_se                0.61563413    0.39329787
## symmetry_se                      0.09535079    0.44913654
## fractal_dimension_se             0.25758375    0.33178615
## radius_worst                     0.83031763    0.18572775
## texture_worst                    0.29275171    0.09065069
## perimeter_worst                  0.85592313    0.21916856
## area_worst                       0.80962962    0.17719338
## smoothness_worst                 0.45275305    0.42667503
## compactness_worst                0.66745368    0.47320001
## concavity_worst                  0.75239950    0.43372101
## concave.points_worst             0.91015531    0.43029661
## symmetry_worst                   0.37574415    0.69982580
## fractal_dimension_worst          0.36866113    0.43841350
##                         fractal_dimension_mean    radius_se  texture_se
## radius_mean                      -0.3116308263 0.6790903880 -0.09731744
## texture_mean                     -0.0764371834 0.2758686762  0.38635762
## perimeter_mean                   -0.2614769081 0.6917650135 -0.08676108
## area_mean                        -0.2831098117 0.7325622270 -0.06628021
## smoothness_mean                   0.5847920019 0.3014670983  0.06840645
## compactness_mean                  0.5653686634 0.4974734461  0.04620483
## concavity_mean                    0.3367833594 0.6319248221  0.07621835
## concave.points_mean               0.1669173832 0.6980498336  0.02147958
## symmetry_mean                     0.4799213301 0.3033792632  0.12805293
## fractal_dimension_mean            1.0000000000 0.0001109951  0.16417397
## radius_se                         0.0001109951 1.0000000000  0.21324734
## texture_se                        0.1641739659 0.2132473373  1.00000000
## perimeter_se                      0.0398299316 0.9727936770  0.22317073
## area_se                          -0.0901702475 0.9518301121  0.11156725
## smoothness_se                     0.4019644254 0.1645142198  0.39724285
## compactness_se                    0.5598366906 0.3560645755  0.23169970
## concavity_se                      0.4466303217 0.3323575376  0.19499846
## concave.points_se                 0.3411980444 0.5133464414  0.23028340
## symmetry_se                       0.3450073971 0.2405673625  0.41162068
## fractal_dimension_se              0.6881315775 0.2277535327  0.27972275
## radius_worst                     -0.2536914949 0.7150651951 -0.11169031
## texture_worst                    -0.0512692020 0.1947985568  0.40900277
## perimeter_worst                  -0.2051512113 0.7196838037 -0.10224192
## area_worst                       -0.2318544512 0.7515484761 -0.08319499
## smoothness_worst                  0.5049420754 0.1419185529 -0.07365766
## compactness_worst                 0.4587981567 0.2871031656 -0.09243935
## concavity_worst                   0.3462338763 0.3805846346 -0.06895622
## concave.points_worst              0.1753254492 0.5310623278 -0.11963752
## symmetry_worst                    0.3340186839 0.0945428304 -0.12821476
## fractal_dimension_worst           0.7672967792 0.0495594325 -0.04565457
##                         perimeter_se     area_se smoothness_se compactness_se
## radius_mean               0.67417162  0.73586366  -0.222600125      0.2060000
## texture_mean              0.28167311  0.25984499   0.006613777      0.1919746
## perimeter_mean            0.69313489  0.74498269  -0.202694026      0.2507437
## area_mean                 0.72662833  0.80008592  -0.166776667      0.2125826
## smoothness_mean           0.29609193  0.24655243   0.332375443      0.3189433
## compactness_mean          0.54890526  0.45565285   0.135299268      0.7387218
## concavity_mean            0.66039079  0.61742681   0.098563746      0.6702788
## concave.points_mean       0.71064987  0.69029854   0.027653308      0.4904242
## symmetry_mean             0.31389276  0.22397022   0.187321165      0.4216591
## fractal_dimension_mean    0.03982993 -0.09017025   0.401964425      0.5598367
## radius_se                 0.97279368  0.95183011   0.164514220      0.3560646
## texture_se                0.22317073  0.11156725   0.397242853      0.2316997
## perimeter_se              1.00000000  0.93765541   0.151075331      0.4163224
## area_se                   0.93765541  1.00000000   0.075150338      0.2848401
## smoothness_se             0.15107533  0.07515034   1.000000000      0.3366961
## compactness_se            0.41632237  0.28484006   0.336696081      1.0000000
## concavity_se              0.36248158  0.27089473   0.268684760      0.8012683
## concave.points_se         0.55626408  0.41572957   0.328429499      0.7440827
## symmetry_se               0.26648709  0.13410898   0.413506125      0.3947128
## fractal_dimension_se      0.24414277  0.12707090   0.427374207      0.8032688
## radius_worst              0.69720059  0.75737319  -0.230690710      0.2046072
## texture_worst             0.20037085  0.19649665  -0.074742965      0.1430026
## perimeter_worst           0.72103131  0.76121264  -0.217303755      0.2605158
## area_worst                0.73071297  0.81140796  -0.182195478      0.1993713
## smoothness_worst          0.13005439  0.12538943   0.314457456      0.2273942
## compactness_worst         0.34191945  0.28325654  -0.055558139      0.6787804
## concavity_worst           0.41889882  0.38510014  -0.058298387      0.6391467
## concave.points_worst      0.55489723  0.53816631  -0.102006796      0.4832083
## symmetry_worst            0.10993043  0.07412629  -0.107342098      0.2778784
## fractal_dimension_worst   0.08543257  0.01753930   0.101480315      0.5909728
##                         concavity_se concave.points_se  symmetry_se
## radius_mean                0.1942036        0.37616896 -0.104320881
## texture_mean               0.1432931        0.16385103  0.009127168
## perimeter_mean             0.2280823        0.40721692 -0.081629327
## area_mean                  0.2076601        0.37232028 -0.072496588
## smoothness_mean            0.2483957        0.38067569  0.200774376
## compactness_mean           0.5705169        0.64226185  0.229976591
## concavity_mean             0.6912702        0.68325992  0.178009208
## concave.points_mean        0.4391671        0.61563413  0.095350787
## symmetry_mean              0.3426270        0.39329787  0.449136542
## fractal_dimension_mean     0.4466303        0.34119804  0.345007397
## radius_se                  0.3323575        0.51334644  0.240567362
## texture_se                 0.1949985        0.23028340  0.411620680
## perimeter_se               0.3624816        0.55626408  0.266487092
## area_se                    0.2708947        0.41572957  0.134108980
## smoothness_se              0.2686848        0.32842950  0.413506125
## compactness_se             0.8012683        0.74408267  0.394712835
## concavity_se               1.0000000        0.77180399  0.309428578
## concave.points_se          0.7718040        1.00000000  0.312780223
## symmetry_se                0.3094286        0.31278022  1.000000000
## fractal_dimension_se       0.7273722        0.61104414  0.369078083
## radius_worst               0.1869035        0.35812667 -0.128120769
## texture_worst              0.1002410        0.08674121 -0.077473420
## perimeter_worst            0.2266804        0.39499925 -0.103753044
## area_worst                 0.1883527        0.34227116 -0.110342743
## smoothness_worst           0.1684813        0.21535060 -0.012661800
## compactness_worst          0.4848578        0.45288838  0.060254879
## concavity_worst            0.6625641        0.54959238  0.037119049
## concave.points_worst       0.4404723        0.60244961 -0.030413396
## symmetry_worst             0.1977878        0.14311567  0.389402485
## fractal_dimension_worst    0.4393293        0.31065455  0.078079476
##                         fractal_dimension_se radius_worst texture_worst
## radius_mean                     -0.042641269   0.96953897   0.297007644
## texture_mean                     0.054457520   0.35257295   0.912044589
## perimeter_mean                  -0.005523391   0.96947636   0.303038372
## area_mean                       -0.019886963   0.96274609   0.287488627
## smoothness_mean                  0.283606699   0.21312014   0.036071799
## compactness_mean                 0.507318127   0.53531540   0.248132833
## concavity_mean                   0.449300749   0.68823641   0.299878889
## concave.points_mean              0.257583746   0.83031763   0.292751713
## symmetry_mean                    0.331786146   0.18572775   0.090650688
## fractal_dimension_mean           0.688131577  -0.25369149  -0.051269202
## radius_se                        0.227753533   0.71506520   0.194798557
## texture_se                       0.279722748  -0.11169031   0.409002766
## perimeter_se                     0.244142773   0.69720059   0.200370854
## area_se                          0.127070903   0.75737319   0.196496649
## smoothness_se                    0.427374207  -0.23069071  -0.074742965
## compactness_se                   0.803268818   0.20460717   0.143002583
## concavity_se                     0.727372184   0.18690352   0.100240984
## concave.points_se                0.611044139   0.35812667   0.086741210
## symmetry_se                      0.369078083  -0.12812077  -0.077473420
## fractal_dimension_se             1.000000000  -0.03748762  -0.003195029
## radius_worst                    -0.037487618   1.00000000   0.359920754
## texture_worst                   -0.003195029   0.35992075   1.000000000
## perimeter_worst                 -0.001000398   0.99370792   0.365098245
## area_worst                      -0.022736147   0.98401456   0.345842283
## smoothness_worst                 0.170568316   0.21657443   0.225429415
## compactness_worst                0.390158842   0.47582004   0.360832339
## concavity_worst                  0.379974661   0.57397471   0.368365607
## concave.points_worst             0.215204013   0.78742385   0.359754610
## symmetry_worst                   0.111093956   0.24352920   0.233027461
## fractal_dimension_worst          0.591328066   0.09349198   0.219122425
##                         perimeter_worst  area_worst smoothness_worst
## radius_mean                 0.965136514  0.94108246       0.11961614
## texture_mean                0.358039575  0.34354595       0.07750336
## perimeter_mean              0.970386887  0.94154981       0.15054940
## area_mean                   0.959119574  0.95921333       0.12352294
## smoothness_mean             0.238852626  0.20671836       0.80532420
## compactness_mean            0.590210428  0.50960381       0.56554117
## concavity_mean              0.729564917  0.67598723       0.44882204
## concave.points_mean         0.855923128  0.80962962       0.45275305
## symmetry_mean               0.219168559  0.17719338       0.42667503
## fractal_dimension_mean     -0.205151211 -0.23185445       0.50494208
## radius_se                   0.719683804  0.75154848       0.14191855
## texture_se                 -0.102241922 -0.08319499      -0.07365766
## perimeter_se                0.721031310  0.73071297       0.13005439
## area_se                     0.761212636  0.81140796       0.12538943
## smoothness_se              -0.217303755 -0.18219548       0.31445746
## compactness_se              0.260515840  0.19937133       0.22739423
## concavity_se                0.226680426  0.18835265       0.16848132
## concave.points_se           0.394999252  0.34227116       0.21535060
## symmetry_se                -0.103753044 -0.11034274      -0.01266180
## fractal_dimension_se       -0.001000398 -0.02273615       0.17056832
## radius_worst                0.993707916  0.98401456       0.21657443
## texture_worst               0.365098245  0.34584228       0.22542941
## perimeter_worst             1.000000000  0.97757809       0.23677460
## area_worst                  0.977578091  1.00000000       0.20914533
## smoothness_worst            0.236774604  0.20914533       1.00000000
## compactness_worst           0.529407690  0.43829628       0.56818652
## concavity_worst             0.618344080  0.54333053       0.51852329
## concave.points_worst        0.816322102  0.74741880       0.54769090
## symmetry_worst              0.269492769  0.20914551       0.49383833
## fractal_dimension_worst     0.138956862  0.07964703       0.61762419
##                         compactness_worst concavity_worst concave.points_worst
## radius_mean                    0.41346282      0.52691146            0.7442142
## texture_mean                   0.27782959      0.30102522            0.2953158
## perimeter_mean                 0.45577423      0.56387926            0.7712408
## area_mean                      0.39041031      0.51260592            0.7220166
## smoothness_mean                0.47246844      0.43492571            0.5030534
## compactness_mean               0.86580904      0.81627525            0.8155732
## concavity_mean                 0.75496802      0.88410264            0.8613230
## concave.points_mean            0.66745368      0.75239950            0.9101553
## symmetry_mean                  0.47320001      0.43372101            0.4302966
## fractal_dimension_mean         0.45879816      0.34623388            0.1753254
## radius_se                      0.28710317      0.38058463            0.5310623
## texture_se                    -0.09243935     -0.06895622           -0.1196375
## perimeter_se                   0.34191945      0.41889882            0.5548972
## area_se                        0.28325654      0.38510014            0.5381663
## smoothness_se                 -0.05555814     -0.05829839           -0.1020068
## compactness_se                 0.67878035      0.63914670            0.4832083
## concavity_se                   0.48485780      0.66256413            0.4404723
## concave.points_se              0.45288838      0.54959238            0.6024496
## symmetry_se                    0.06025488      0.03711905           -0.0304134
## fractal_dimension_se           0.39015884      0.37997466            0.2152040
## radius_worst                   0.47582004      0.57397471            0.7874239
## texture_worst                  0.36083234      0.36836561            0.3597546
## perimeter_worst                0.52940769      0.61834408            0.8163221
## area_worst                     0.43829628      0.54333053            0.7474188
## smoothness_worst               0.56818652      0.51852329            0.5476909
## compactness_worst              1.00000000      0.89226090            0.8010804
## concavity_worst                0.89226090      1.00000000            0.8554339
## concave.points_worst           0.80108036      0.85543386            1.0000000
## symmetry_worst                 0.61444050      0.53251973            0.5025285
## fractal_dimension_worst        0.81045486      0.68651092            0.5111141
##                         symmetry_worst fractal_dimension_worst
## radius_mean                 0.16395333             0.007065886
## texture_mean                0.10500791             0.119205351
## perimeter_mean              0.18911504             0.051018530
## area_mean                   0.14356991             0.003737597
## smoothness_mean             0.39430948             0.499316369
## compactness_mean            0.51022343             0.687382323
## concavity_mean              0.40946413             0.514929891
## concave.points_mean         0.37574415             0.368661134
## symmetry_mean               0.69982580             0.438413498
## fractal_dimension_mean      0.33401868             0.767296779
## radius_se                   0.09454283             0.049559432
## texture_se                 -0.12821476            -0.045654569
## perimeter_se                0.10993043             0.085432572
## area_se                     0.07412629             0.017539295
## smoothness_se              -0.10734210             0.101480315
## compactness_se              0.27787843             0.590972763
## concavity_se                0.19778782             0.439329269
## concave.points_se           0.14311567             0.310654551
## symmetry_se                 0.38940248             0.078079476
## fractal_dimension_se        0.11109396             0.591328066
## radius_worst                0.24352920             0.093491979
## texture_worst               0.23302746             0.219122425
## perimeter_worst             0.26949277             0.138956862
## area_worst                  0.20914551             0.079647034
## smoothness_worst            0.49383833             0.617624192
## compactness_worst           0.61444050             0.810454856
## concavity_worst             0.53251973             0.686510921
## concave.points_worst        0.50252849             0.511114146
## symmetry_worst              1.00000000             0.537848206
## fractal_dimension_worst     0.53784821             1.000000000

Hard to see, so will make visual plot

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
# Adjust margins to give labels more space
par(mar = c(5, 5, 5, 5)) 

# Plot correlation matrix with larger text and better spacing
corrplot(corr_matrix, method="circle", type="upper", order="hclust",
         tl.col="black", tl.srt=45, tl.cex=0.6)  # Adjust tl.cex for text size

Almost all variables are correlated, very heavily, creating a dataset that is hard to navigate and properly process, as indicated by the VIF function, these high values skews and implodes the results, giving false results.

Removing heavily correlated variables, is needed to process the remaining variables, scrutinizing variables in the correlation matrix that have values at 0.8 or above, working toward VIF values of below 10.

cancer_data_copy_2 = cancer_data
cancer_data_copy_2 <- dplyr::select(cancer_data_copy_2, -id, -diagnosis, -radius_mean, -perimeter_mean, -area_mean, -compactness_mean, -concave.points_mean, -area_worst, -smoothness_worst, -concavity_worst, -fractal_dimension_se, -concave.points_worst, -perimeter_worst, -radius_worst, -compactness_se, -texture_worst, -symmetry_worst, -radius_se, -concavity_se, -fractal_dimension_worst)
corr_matrix_2 <- cor(cancer_data_copy_2)
print(corr_matrix_2)
##                        texture_mean smoothness_mean concavity_mean
## texture_mean            1.000000000     -0.02338852     0.30241783
## smoothness_mean        -0.023388516      1.00000000     0.52198377
## concavity_mean          0.302417828      0.52198377     1.00000000
## symmetry_mean           0.071400980      0.55777479     0.50066662
## fractal_dimension_mean -0.076437183      0.58479200     0.33678336
## texture_se              0.386357623      0.06840645     0.07621835
## perimeter_se            0.281673115      0.29609193     0.66039079
## area_se                 0.259844987      0.24655243     0.61742681
## smoothness_se           0.006613777      0.33237544     0.09856375
## concave.points_se       0.163851025      0.38067569     0.68325992
## symmetry_se             0.009127168      0.20077438     0.17800921
## compactness_worst       0.277829592      0.47246844     0.75496802
##                        symmetry_mean fractal_dimension_mean  texture_se
## texture_mean              0.07140098            -0.07643718  0.38635762
## smoothness_mean           0.55777479             0.58479200  0.06840645
## concavity_mean            0.50066662             0.33678336  0.07621835
## symmetry_mean             1.00000000             0.47992133  0.12805293
## fractal_dimension_mean    0.47992133             1.00000000  0.16417397
## texture_se                0.12805293             0.16417397  1.00000000
## perimeter_se              0.31389276             0.03982993  0.22317073
## area_se                   0.22397022            -0.09017025  0.11156725
## smoothness_se             0.18732117             0.40196443  0.39724285
## concave.points_se         0.39329787             0.34119804  0.23028340
## symmetry_se               0.44913654             0.34500740  0.41162068
## compactness_worst         0.47320001             0.45879816 -0.09243935
##                        perimeter_se     area_se smoothness_se concave.points_se
## texture_mean             0.28167311  0.25984499   0.006613777         0.1638510
## smoothness_mean          0.29609193  0.24655243   0.332375443         0.3806757
## concavity_mean           0.66039079  0.61742681   0.098563746         0.6832599
## symmetry_mean            0.31389276  0.22397022   0.187321165         0.3932979
## fractal_dimension_mean   0.03982993 -0.09017025   0.401964425         0.3411980
## texture_se               0.22317073  0.11156725   0.397242853         0.2302834
## perimeter_se             1.00000000  0.93765541   0.151075331         0.5562641
## area_se                  0.93765541  1.00000000   0.075150338         0.4157296
## smoothness_se            0.15107533  0.07515034   1.000000000         0.3284295
## concave.points_se        0.55626408  0.41572957   0.328429499         1.0000000
## symmetry_se              0.26648709  0.13410898   0.413506125         0.3127802
## compactness_worst        0.34191945  0.28325654  -0.055558139         0.4528884
##                        symmetry_se compactness_worst
## texture_mean           0.009127168        0.27782959
## smoothness_mean        0.200774376        0.47246844
## concavity_mean         0.178009208        0.75496802
## symmetry_mean          0.449136542        0.47320001
## fractal_dimension_mean 0.345007397        0.45879816
## texture_se             0.411620680       -0.09243935
## perimeter_se           0.266487092        0.34191945
## area_se                0.134108980        0.28325654
## smoothness_se          0.413506125       -0.05555814
## concave.points_se      0.312780223        0.45288838
## symmetry_se            1.000000000        0.06025488
## compactness_worst      0.060254879        1.00000000

We can now see a majority of values that fall below 0.8

library(corrplot)
# Adjust margins to give labels more space
par(mar = c(5, 5, 5, 5)) 

# Plot correlation matrix with larger text and better spacing
corrplot(corr_matrix_2, method="circle", type="upper", order="hclust",
         tl.col="black", tl.srt=45, tl.cex=0.6)  # Adjust tl.cex for text size

cancer_data$diagnosis <- as.factor(cancer_data$diagnosis)
cols_to_remove <- c("id", "radius_mean", "perimeter_mean", "area_mean", "compactness_mean",
                    "concave.points_mean", "area_worst", "smoothness_worst", "concavity_worst", 
                    "fractal_dimension_se", "concave.points_worst", "perimeter_worst", 
                    "radius_worst", "compactness_se", "texture_worst", "symmetry_worst", 
                    "radius_se", "concavity_se", "fractal_dimension_worst")

cancer_data <- dplyr::select(cancer_data, -all_of(cols_to_remove))
# sample.split used instead of just sample, to keep an even split between benign and malignant values
library(caTools)
set.seed(2025)
split <- sample.split(cancer_data$diagnosis, SplitRatio = 0.8)
cancer_train <- subset(cancer_data, split == TRUE)
cancer_test <- subset(cancer_data, split == FALSE)
glm_model <- glm(diagnosis ~ ., data = cancer_train, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm_model)
## 
## Call:
## glm(formula = diagnosis ~ ., family = binomial, data = cancer_train)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -11.40427    5.10549  -2.234 0.025501 *  
## texture_mean              0.43537    0.10727   4.059 4.94e-05 ***
## smoothness_mean         117.85202   40.95343   2.878 0.004006 ** 
## concavity_mean           42.36850   12.08800   3.505 0.000457 ***
## symmetry_mean            23.52253   19.65488   1.197 0.231393    
## fractal_dimension_mean -318.16610   88.23909  -3.606 0.000311 ***
## texture_se               -0.88536    0.88585  -0.999 0.317577    
## perimeter_se             -1.36216    0.73491  -1.854 0.063810 .  
## area_se                   0.24501    0.06166   3.973 7.09e-05 ***
## smoothness_se           110.53026  174.59289   0.633 0.526685    
## concave.points_se      -233.80006  107.72999  -2.170 0.029989 *  
## symmetry_se            -111.31879   57.53932  -1.935 0.053033 .  
## compactness_worst        14.44933    4.95394   2.917 0.003537 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 602.315  on 455  degrees of freedom
## Residual deviance:  88.727  on 443  degrees of freedom
## AIC: 114.73
## 
## Number of Fisher Scoring iterations: 9
library(car)
vif(glm_model)
##           texture_mean        smoothness_mean         concavity_mean 
##               2.694404               3.163111               3.334797 
##          symmetry_mean fractal_dimension_mean             texture_se 
##               2.775519               3.695366               3.053795 
##           perimeter_se                area_se          smoothness_se 
##               8.670477               9.899194               2.129733 
##      concave.points_se            symmetry_se      compactness_worst 
##               4.064652               2.951036               4.302383

The above logistic model, is to just show what was taken out/ what was left as well as show casing the VIF values, letting us know that all the values are below 10

Removing the values that inflated analysis via correlation, while also removing id because it is irrelevant and would skew the data while providing no significant value, will be done further down

Will be running through multiple models to find the best one

# Load necessary libraries
library(caret)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(e1071)  # For SVM
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(dplyr)
library(car)
cancer_data = read.csv("CancerData.csv")

# Remove 'id' column
cancer_data <- cancer_data[, !(names(cancer_data) %in% c("id"))]

# Encode 'diagnosis' as factor (B = 0, M = 1)
cancer_data$diagnosis <- as.numeric(factor(cancer_data$diagnosis, levels = c("B", "M"))) - 1

# Keep a copy of the full dataset before feature selection
cancer_data_full <- cancer_data  

# Remove multicollinear variables for logistic regression
cancer_data <- dplyr::select(cancer_data, -radius_mean, -perimeter_mean, -area_mean, -compactness_mean, 
                             -concave.points_mean, -area_worst, -smoothness_worst, -concavity_worst, 
                             -fractal_dimension_se, -concave.points_worst, -perimeter_worst, -radius_worst, 
                             -compactness_se, -texture_worst, -symmetry_worst, -radius_se, -concavity_se, 
                             -fractal_dimension_worst)

# Split into training (80%) and testing (20%) sets for both datasets
set.seed(2025)
trainIndex <- createDataPartition(cancer_data$diagnosis, p = 0.8, list = FALSE)

train_data <- cancer_data[trainIndex, ]
test_data <- cancer_data[-trainIndex, ]

train_data_full <- cancer_data_full[trainIndex, ]  # Full dataset (all features)
test_data_full <- cancer_data_full[-trainIndex, ]  

print("trainig set number of Benign and Malignant observations")
## [1] "trainig set number of Benign and Malignant observations"
table(train_data$diagnosis)
## 
##   0   1 
## 279 177
print("testing set number of Benign and Malignant observations")
## [1] "testing set number of Benign and Malignant observations"
table(test_data$diagnosis)
## 
##  0  1 
## 78 35
# Standardize numeric features (excluding 'diagnosis')
preprocess_params <- preProcess(train_data[, -1], method = c("center", "scale"))
train_data[, -1] <- predict(preprocess_params, train_data[, -1])
test_data[, -1] <- predict(preprocess_params, test_data[, -1])

preprocess_params_full <- preProcess(train_data_full[, -1], method = c("center", "scale"))
train_data_full[, -1] <- predict(preprocess_params_full, train_data_full[, -1])
test_data_full[, -1] <- predict(preprocess_params_full, test_data_full[, -1])

# Convert diagnosis to factor
train_data$diagnosis <- factor(train_data$diagnosis, levels = c(0, 1), labels = c("Benign", "Malignant"))
test_data$diagnosis <- factor(test_data$diagnosis, levels = c(0, 1), labels = c("Benign", "Malignant"))

train_data_full$diagnosis <- factor(train_data_full$diagnosis, levels = c(0, 1), labels = c("Benign", "Malignant"))
test_data_full$diagnosis <- factor(test_data_full$diagnosis, levels = c(0, 1), labels = c("Benign", "Malignant"))

# Define training control for cross-validation
train_control <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary)

# Train Logistic Regression (using reduced dataset)
logistic_model <- train(diagnosis ~ ., data = train_data, method = "glm",
                        family = binomial, trControl = train_control, metric = "ROC")

# Train Random Forest (using full dataset)
rf_model <- train(diagnosis ~ ., data = train_data_full, method = "rf",
                  trControl = train_control, metric = "ROC", ntree = 100)

# Train SVM (using full dataset)
svm_model <- train(diagnosis ~ ., data = train_data_full, method = "svmRadial",
                   trControl = train_control, metric = "ROC")

# Train XGBoost (using full dataset)
xgb_grid <- expand.grid(
  nrounds = 100,  # Number of boosting iterations
  max_depth = 3,  
  eta = 0.1,      
  gamma = 0,      
  colsample_bytree = 0.8,  
  min_child_weight = 1,
  subsample = 0.8
)

xgb_model <- train(
  diagnosis ~ ., 
  data = train_data_full, 
  method = "xgbTree",
  trControl = train_control, 
  metric = "ROC",
  tuneGrid = xgb_grid
)

# Function to evaluate models
evaluate_model <- function(model, test_data) {
  predictions <- predict(model, test_data)
  prob_predictions <- predict(model, test_data, type = "prob")[, 2]
  
  accuracy <- mean(predictions == test_data$diagnosis)
  auc <- roc(test_data$diagnosis, prob_predictions)$auc
  
  cat("\nModel:", model$method)
  cat("\nAccuracy:", round(accuracy, 4))
  cat("\nAUC-ROC:", round(auc, 4))
  cat("\nConfusion Matrix:\n")
  print(confusionMatrix(predictions, test_data$diagnosis))
}

# Evaluate models
evaluate_model(logistic_model, test_data)       # Using reduced dataset
## Setting levels: control = Benign, case = Malignant
## Setting direction: controls < cases
## 
## Model: glm
## Accuracy: 0.9469
## AUC-ROC: 0.9901
## Confusion Matrix:
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        75         3
##   Malignant      3        32
##                                          
##                Accuracy : 0.9469         
##                  95% CI : (0.888, 0.9803)
##     No Information Rate : 0.6903         
##     P-Value [Acc > NIR] : 1.511e-11      
##                                          
##                   Kappa : 0.8758         
##                                          
##  Mcnemar's Test P-Value : 1              
##                                          
##             Sensitivity : 0.9615         
##             Specificity : 0.9143         
##          Pos Pred Value : 0.9615         
##          Neg Pred Value : 0.9143         
##              Prevalence : 0.6903         
##          Detection Rate : 0.6637         
##    Detection Prevalence : 0.6903         
##       Balanced Accuracy : 0.9379         
##                                          
##        'Positive' Class : Benign         
## 
evaluate_model(rf_model, test_data_full)        # Using full dataset
## Setting levels: control = Benign, case = Malignant
## Setting direction: controls < cases
## 
## Model: rf
## Accuracy: 0.9823
## AUC-ROC: 0.9828
## Confusion Matrix:
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        77         1
##   Malignant      1        34
##                                           
##                Accuracy : 0.9823          
##                  95% CI : (0.9375, 0.9978)
##     No Information Rate : 0.6903          
##     P-Value [Acc > NIR] : 8.537e-16       
##                                           
##                   Kappa : 0.9586          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9872          
##             Specificity : 0.9714          
##          Pos Pred Value : 0.9872          
##          Neg Pred Value : 0.9714          
##              Prevalence : 0.6903          
##          Detection Rate : 0.6814          
##    Detection Prevalence : 0.6903          
##       Balanced Accuracy : 0.9793          
##                                           
##        'Positive' Class : Benign          
## 
evaluate_model(svm_model, test_data_full)       # Using full dataset
## Setting levels: control = Benign, case = Malignant
## Setting direction: controls < cases
## 
## Model: svmRadial
## Accuracy: 0.9646
## AUC-ROC: 0.9938
## Confusion Matrix:
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        75         1
##   Malignant      3        34
##                                           
##                Accuracy : 0.9646          
##                  95% CI : (0.9118, 0.9903)
##     No Information Rate : 0.6903          
##     P-Value [Acc > NIR] : 1.826e-13       
##                                           
##                   Kappa : 0.9185          
##                                           
##  Mcnemar's Test P-Value : 0.6171          
##                                           
##             Sensitivity : 0.9615          
##             Specificity : 0.9714          
##          Pos Pred Value : 0.9868          
##          Neg Pred Value : 0.9189          
##              Prevalence : 0.6903          
##          Detection Rate : 0.6637          
##    Detection Prevalence : 0.6726          
##       Balanced Accuracy : 0.9665          
##                                           
##        'Positive' Class : Benign          
## 
evaluate_model(xgb_model, test_data_full)       # Using full dataset
## Setting levels: control = Benign, case = Malignant
## Setting direction: controls < cases
## 
## Model: xgbTree
## Accuracy: 0.9823
## AUC-ROC: 0.9912
## Confusion Matrix:
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Benign Malignant
##   Benign        77         1
##   Malignant      1        34
##                                           
##                Accuracy : 0.9823          
##                  95% CI : (0.9375, 0.9978)
##     No Information Rate : 0.6903          
##     P-Value [Acc > NIR] : 8.537e-16       
##                                           
##                   Kappa : 0.9586          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9872          
##             Specificity : 0.9714          
##          Pos Pred Value : 0.9872          
##          Neg Pred Value : 0.9714          
##              Prevalence : 0.6903          
##          Detection Rate : 0.6814          
##    Detection Prevalence : 0.6903          
##       Balanced Accuracy : 0.9793          
##                                           
##        'Positive' Class : Benign          
## 
# Load required libraries
library(caTools)
library(rpart)
## Warning: package 'rpart' was built under R version 4.3.3
library(rpart.plot)  # For better tree visualization
## Warning: package 'rpart.plot' was built under R version 4.3.3
# Ensure 'diagnosis' is a factor
cancer_data$diagnosis <- factor(cancer_data$diagnosis, levels = c(0, 1), labels = c("Benign", "Malignant"))

# Set seed for reproducibility
set.seed(2025)

# Perform stratified split
split_2 <- sample.split(cancer_data$diagnosis, SplitRatio = 0.8)

# Create training and test sets
cancer_train_2 <- subset(cancer_data, split_2 == TRUE)
cancer_test_2 <- subset(cancer_data, split_2 == FALSE)

# Train Decision Tree Model
tree_model <- rpart(diagnosis ~ ., data = cancer_train_2, method = "class")

# Plot the tree using an improved visualization
rpart.plot(tree_model, type = 2, extra = 104, tweak = 1.2, box.palette = "RdYlGn", shadow.col = "gray", nn = TRUE)

# Get variable importance
rf_importance <- varImp(rf_model)

# Print importance
print(rf_importance)
## rf variable importance
## 
##   only 20 most important variables shown (out of 30)
## 
##                         Overall
## concave.points_worst    100.000
## area_worst               80.975
## area_mean                71.959
## radius_worst             62.302
## concave.points_mean      59.854
## perimeter_mean           55.627
## concavity_worst          42.759
## area_se                  41.396
## radius_mean              38.970
## perimeter_worst          33.813
## concavity_mean           27.732
## compactness_worst        27.351
## radius_se                25.590
## perimeter_se             22.055
## compactness_mean         16.623
## texture_mean             15.874
## fractal_dimension_worst   9.994
## texture_worst             8.734
## concave.points_se         8.183
## smoothness_worst          7.095
# Plot importance
plot(rf_importance, main = "Variable Importance - Random Forest")