library("rmarkdown")
library("naniar")
library("visdat")
library("kableExtra")
library("corrplot")
## corrplot 0.92 loaded
library("rpart.plot")
## Loading required package: rpart
library("rpart")
library("ggplot2")
library("gridExtra")
library(car)
## Loading required package: carData
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:car':
## 
##     logit
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(tidyverse)  
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%()        masks ggplot2::%+%()
## ✖ psych::alpha()      masks ggplot2::alpha()
## ✖ dplyr::combine()    masks gridExtra::combine()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::group_rows() masks kableExtra::group_rows()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dplyr::recode()     masks car::recode()
## ✖ purrr::some()       masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(devtools)
## Loading required package: usethis
library("StepReg")
# Read the raw data set from the working directory and print a compact
# summary of its structure (dimensions, column types, leading values).
data_orig <- utils::read.csv(file = "breast-cancer.csv")
utils::str(data_orig)
## 'data.frame':    569 obs. of  32 variables:
##  $ id                     : int  842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
##  $ diagnosis              : chr  "M" "M" "M" "M" ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...
# Plot the missingness pattern of every column in the raw data.
visdat::vis_miss(x = data_orig, warn_large_data = FALSE)

# Plot the class of every column in the raw data.
visdat::vis_dat(x = data_orig, warn_large_data = FALSE)

# Build the modelling data set: remove the `id` column and recode the
# diagnosis label as a two-level factor ("B" -> "0", "M" -> "1").
#
# The identifier is dropped by name rather than by position so a change in
# column order cannot silently remove a predictor instead.
data1 <- data_orig[, names(data_orig) != "id", drop = FALSE]

# Map the two known labels explicitly. Any unexpected label (typo, blank)
# becomes NA instead of being silently counted as class "1".
data1$diagnosis <- factor(
  data1$diagnosis,
  levels = c("B", "M"),
  labels = c("0", "1")
)
str(data1)
## 'data.frame':    569 obs. of  31 variables:
##  $ diagnosis              : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...
# Show the first 20 rows as a table with a small font, wrapped in a
# scrollable box that is 500px tall.
preview_rows <- head(data1, n = 20)
styled_preview <- kable_styling(kable(preview_rows), font_size = 10)
scroll_box(styled_preview, height = "500px")
diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave.points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se concavity_se concave.points_se symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave.points_worst symmetry_worst fractal_dimension_worst
1 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.30010 0.14710 0.2419 0.07871 1.0950 0.9053 8.589 153.40 0.006399 0.049040 0.05373 0.015870 0.03003 0.006193 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.26540 0.4601 0.11890
1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 0.1812 0.05667 0.5435 0.7339 3.398 74.08 0.005225 0.013080 0.01860 0.013400 0.01389 0.003532 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.18600 0.2750 0.08902
1 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 0.2069 0.05999 0.7456 0.7869 4.585 94.03 0.006150 0.040060 0.03832 0.020580 0.02250 0.004571 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.24300 0.3613 0.08758
1 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 0.2597 0.09744 0.4956 1.1560 3.445 27.23 0.009110 0.074580 0.05661 0.018670 0.05963 0.009208 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.25750 0.6638 0.17300
1 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 0.1809 0.05883 0.7572 0.7813 5.438 94.44 0.011490 0.024610 0.05688 0.018850 0.01756 0.005115 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.16250 0.2364 0.07678
1 12.45 15.70 82.57 477.1 0.12780 0.17000 0.15780 0.08089 0.2087 0.07613 0.3345 0.8902 2.217 27.19 0.007510 0.033450 0.03672 0.011370 0.02165 0.005082 15.47 23.75 103.40 741.6 0.1791 0.5249 0.5355 0.17410 0.3985 0.12440
1 18.25 19.98 119.60 1040.0 0.09463 0.10900 0.11270 0.07400 0.1794 0.05742 0.4467 0.7732 3.180 53.91 0.004314 0.013820 0.02254 0.010390 0.01369 0.002179 22.88 27.66 153.20 1606.0 0.1442 0.2576 0.3784 0.19320 0.3063 0.08368
1 13.71 20.83 90.20 577.9 0.11890 0.16450 0.09366 0.05985 0.2196 0.07451 0.5835 1.3770 3.856 50.96 0.008805 0.030290 0.02488 0.014480 0.01486 0.005412 17.06 28.14 110.60 897.0 0.1654 0.3682 0.2678 0.15560 0.3196 0.11510
1 13.00 21.82 87.50 519.8 0.12730 0.19320 0.18590 0.09353 0.2350 0.07389 0.3063 1.0020 2.406 24.32 0.005731 0.035020 0.03553 0.012260 0.02143 0.003749 15.49 30.73 106.20 739.3 0.1703 0.5401 0.5390 0.20600 0.4378 0.10720
1 12.46 24.04 83.97 475.9 0.11860 0.23960 0.22730 0.08543 0.2030 0.08243 0.2976 1.5990 2.039 23.94 0.007149 0.072170 0.07743 0.014320 0.01789 0.010080 15.09 40.68 97.65 711.4 0.1853 1.0580 1.1050 0.22100 0.4366 0.20750
1 16.02 23.24 102.70 797.8 0.08206 0.06669 0.03299 0.03323 0.1528 0.05697 0.3795 1.1870 2.466 40.51 0.004029 0.009269 0.01101 0.007591 0.01460 0.003042 19.19 33.88 123.80 1150.0 0.1181 0.1551 0.1459 0.09975 0.2948 0.08452
1 15.78 17.89 103.60 781.0 0.09710 0.12920 0.09954 0.06606 0.1842 0.06082 0.5058 0.9849 3.564 54.16 0.005771 0.040610 0.02791 0.012820 0.02008 0.004144 20.42 27.28 136.50 1299.0 0.1396 0.5609 0.3965 0.18100 0.3792 0.10480
1 19.17 24.80 132.40 1123.0 0.09740 0.24580 0.20650 0.11180 0.2397 0.07800 0.9555 3.5680 11.070 116.20 0.003139 0.082970 0.08890 0.040900 0.04484 0.012840 20.96 29.94 151.70 1332.0 0.1037 0.3903 0.3639 0.17670 0.3176 0.10230
1 15.85 23.95 103.70 782.7 0.08401 0.10020 0.09938 0.05364 0.1847 0.05338 0.4033 1.0780 2.903 36.58 0.009769 0.031260 0.05051 0.019920 0.02981 0.003002 16.84 27.66 112.00 876.5 0.1131 0.1924 0.2322 0.11190 0.2809 0.06287
1 13.73 22.61 93.60 578.3 0.11310 0.22930 0.21280 0.08025 0.2069 0.07682 0.2121 1.1690 2.061 19.21 0.006429 0.059360 0.05501 0.016280 0.01961 0.008093 15.03 32.01 108.80 697.7 0.1651 0.7725 0.6943 0.22080 0.3596 0.14310
1 14.54 27.54 96.73 658.8 0.11390 0.15950 0.16390 0.07364 0.2303 0.07077 0.3700 1.0330 2.879 32.55 0.005607 0.042400 0.04741 0.010900 0.01857 0.005466 17.46 37.13 124.10 943.2 0.1678 0.6577 0.7026 0.17120 0.4218 0.13410
1 14.68 20.13 94.74 684.5 0.09867 0.07200 0.07395 0.05259 0.1586 0.05922 0.4727 1.2400 3.195 45.40 0.005718 0.011620 0.01998 0.011090 0.01410 0.002085 19.07 30.88 123.40 1138.0 0.1464 0.1871 0.2914 0.16090 0.3029 0.08216
1 16.13 20.68 108.10 798.8 0.11700 0.20220 0.17220 0.10280 0.2164 0.07356 0.5692 1.0730 3.854 54.18 0.007026 0.025010 0.03188 0.012970 0.01689 0.004142 20.96 31.48 136.80 1315.0 0.1789 0.4233 0.4784 0.20730 0.3706 0.11420
1 19.81 22.15 130.00 1260.0 0.09831 0.10270 0.14790 0.09498 0.1582 0.05395 0.7582 1.0170 5.865 112.40 0.006494 0.018930 0.03391 0.015210 0.01356 0.001997 27.32 30.88 186.80 2398.0 0.1512 0.3150 0.5372 0.23880 0.2768 0.07615
0 13.54 14.36 87.46 566.3 0.09779 0.08129 0.06664 0.04781 0.1885 0.05766 0.2699 0.7886 2.058 23.56 0.008462 0.014600 0.02387 0.013150 0.01980 0.002300 15.11 19.26 99.70 711.2 0.1440 0.1773 0.2390 0.12880 0.2977 0.07259

Correlation plot

# Pairwise correlations between all predictors (first column, the
# diagnosis, is excluded).
data_matrix <- cor(x = data1[, -1])

# Upper-triangle correlation plot with variables ordered by hierarchical
# clustering; labels are shrunk and rotated 45 degrees to fit.
corrplot(
  data_matrix,
  type = "upper",
  order = "hclust",
  tl.cex = 0.6,
  tl.srt = 45
)

The correlation plot shows that many variables are highly correlated with one another. For example, radius mean and area mean are highly correlated, and area mean and perimeter mean are also highly correlated.

Distribution of all numerical independent variables

The original dataset has 30 independent variables. The code below creates a histogram for each of the 30 independent variables. You may have to enlarge the output to view all of the graphs.

# Build one 30-bin histogram per predictor (every column of `data1` except
# the first, which holds the diagnosis) and lay them out in a 5-column grid.
#
# `aes_string()` is deprecated since ggplot2 3.0.0; the `.data[[name]]`
# pronoun is the supported way to map a column whose name is held in a
# string. `lapply()` also avoids growing a list inside a loop and is safe
# when there are no predictor columns.
number_of_variables <- ncol(data1)
predictor_names <- names(data1)[-1]

plot_list <- lapply(predictor_names, function(col_name) {
  ggplot(data1, aes(x = .data[[col_name]])) +
    geom_histogram(bins = 30) +
    # Keep the plain column name as the axis label.
    xlab(col_name) +
    ggtitle(paste("Histogram of", col_name))
})

do.call(grid.arrange, c(plot_list, ncol = 5))

# Radius mean against texture mean, points coloured by diagnosis class.
ggplot(
  data = data1,
  mapping = aes(x = radius_mean, y = texture_mean, color = diagnosis)
) +
  geom_point() +
  labs(title = "Scatter Plot of Radius Mean vs Texture Mean")

# Distribution of radius mean within each diagnosis class (box plot).
ggplot(data = data1, mapping = aes(x = diagnosis, y = radius_mean)) +
  geom_boxplot() +
  labs(title = "Box Plot of Radius Mean by Diagnosis")

# Same comparison drawn as a violin plot.
ggplot(data = data1, mapping = aes(x = diagnosis, y = radius_mean)) +
  geom_violin() +
  labs(title = "Violin Plot of Radius Mean by Diagnosis")

# Radius mean against texture mean, one panel per diagnosis class.
ggplot(data = data1, mapping = aes(x = radius_mean, y = texture_mean)) +
  geom_point() +
  facet_wrap(~ diagnosis) +
  labs(title = "Faceted Scatter Plots for Radius Mean vs Texture Mean")

VIF

# Logistic regression of diagnosis on every other column of `data1`; the
# fit is used below only to compute variance inflation factors.
model <- glm(
  formula = diagnosis ~ .,
  family = binomial(),
  data = data1
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Variance inflation factor of each predictor in the full model.
car::vif(model)
##             radius_mean            texture_mean          perimeter_mean 
##             4318063.764              140816.298             1691913.733 
##               area_mean         smoothness_mean        compactness_mean 
##             6331653.442              213016.610              415839.480 
##          concavity_mean     concave.points_mean           symmetry_mean 
##              105593.174              192381.201               11851.056 
##  fractal_dimension_mean               radius_se              texture_se 
##                3513.136              610335.622              356773.142 
##            perimeter_se                 area_se           smoothness_se 
##               49276.836             1109444.175               41333.693 
##          compactness_se            concavity_se       concave.points_se 
##              473636.453              778242.166             1574955.419 
##             symmetry_se    fractal_dimension_se            radius_worst 
##               24678.711              463075.232             3511808.081 
##           texture_worst         perimeter_worst              area_worst 
##              823081.427              617752.398             4767645.798 
##        smoothness_worst       compactness_worst         concavity_worst 
##               58738.790               91238.174             1705825.343 
##    concave.points_worst          symmetry_worst fractal_dimension_worst 
##              561062.026               10409.339              193961.537

Factor Analysis

# Predictors only (diagnosis column removed) and their correlation matrix,
# followed by the Kaiser-Meyer-Olkin sampling-adequacy measure.
data_fa <- data1[-1]
datamatrix <- cor(x = data_fa)
KMO(r = datamatrix)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = datamatrix)
## Overall MSA =  0.83
## MSA for each item = 
##             radius_mean            texture_mean          perimeter_mean 
##                    0.83                    0.64                    0.85 
##               area_mean         smoothness_mean        compactness_mean 
##                    0.86                    0.81                    0.88 
##          concavity_mean     concave.points_mean           symmetry_mean 
##                    0.89                    0.90                    0.83 
##  fractal_dimension_mean               radius_se              texture_se 
##                    0.83                    0.83                    0.48 
##            perimeter_se                 area_se           smoothness_se 
##                    0.84                    0.85                    0.64 
##          compactness_se            concavity_se       concave.points_se 
##                    0.87                    0.83                    0.84 
##             symmetry_se    fractal_dimension_se            radius_worst 
##                    0.58                    0.81                    0.82 
##           texture_worst         perimeter_worst              area_worst 
##                    0.60                    0.88                    0.82 
##        smoothness_worst       compactness_worst         concavity_worst 
##                    0.75                    0.85                    0.90 
##    concave.points_worst          symmetry_worst fractal_dimension_worst 
##                    0.89                    0.69                    0.81

Since MSA = 0.83 > 0.5, we can run Factor Analysis.

# Bartlett's test on the predictor correlation matrix, using the number of
# rows in `data1` as the sample size.
cortest.bartlett(R = datamatrix, n = nrow(data1))
## $chisq
## [1] 39362.12
## 
## $p.value
## [1] 0
## 
## $df
## [1] 435

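With a chi-square value of 39362.12 on 435 degrees of freedom (p < 0.001), Bartlett's test is significant at the 0.05 level, so the correlation matrix differs significantly from an identity matrix and factor analysis is appropriate.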

# Eigen decomposition of the predictor correlation matrix; the eigenvalues
# are printed for the scree analysis.
fa_correlations <- cor(data_fa)
ev <- eigen(fa_correlations)
ev[["values"]]
##  [1] 1.328161e+01 5.691355e+00 2.817949e+00 1.980640e+00 1.648731e+00
##  [6] 1.207357e+00 6.752201e-01 4.766171e-01 4.168948e-01 3.506935e-01
## [11] 2.939157e-01 2.611614e-01 2.413575e-01 1.570097e-01 9.413497e-02
## [16] 7.986280e-02 5.939904e-02 5.261878e-02 4.947759e-02 3.115940e-02
## [21] 2.997289e-02 2.743940e-02 2.434084e-02 1.805501e-02 1.548127e-02
## [26] 8.177640e-03 6.900464e-03 1.589338e-03 7.488031e-04 1.330448e-04
# Scree plot of the eigenvalues in `ev`, with a reference line at 1.
#
# The factor index is derived from the eigenvalue vector instead of the
# hard-coded 1:30, so the plot stays correct if the number of variables
# changes.
Eigen_Values <- ev$values
Factor <- seq_along(Eigen_Values)
Scree <- data.frame(Factor, Eigen_Values)

# NOTE(review): ylim stops at 4, so eigenvalues above 4 fall outside the
# visible area -- confirm this zoom on the elbow is intended.
plot(Scree, main = "Scree Plot", col = "Blue", ylim = c(0, 4))
lines(Scree, col = "Red")
abline(h = 1, col = "Green")

Diagram

# Four-factor solution on the predictors: principal-axis factoring
# (fm = "pa") with a varimax rotation.
fa_var <- fa(
  r = data_fa,
  nfactors = 4,
  rotate = "varimax",
  fm = "pa"
)
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully
# Path diagram of the fitted factor structure.
fa.diagram(fa_var)

PCA

# Apply `scale()` to every predictor column (diagnosis excluded), then
# show the first rows of the result as a striped, hover-highlighted table.
scaled_df <- apply(X = data1[, -1], MARGIN = 2, FUN = scale)
dt <- head(scaled_df)
kable_styling(kbl(dt), bootstrap_options = c("striped", "hover"))
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave.points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se concavity_se concave.points_se symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave.points_worst symmetry_worst fractal_dimension_worst
1.0960995 -2.0715123 1.2688173 0.9835095 1.5670875 3.2806281 2.6505418 2.5302489 2.2155655 2.2537638 2.4875451 -0.5647681 2.8305403 2.4853907 -0.2138135 1.3157039 0.7233897 0.6602390 1.1477468 0.9062856 1.8850310 -1.3580985 2.3015755 1.9994782 1.3065367 2.6143647 2.1076718 2.2940576 2.7482041 1.9353117
1.8282120 -0.3533215 1.6844726 1.9070303 -0.8262354 -0.4866435 -0.0238249 0.5476623 0.0013911 -0.8678888 0.4988157 -0.8754733 0.2630955 0.7417493 -0.6048187 -0.6923171 -0.4403926 0.2599334 -0.8047423 -0.0993563 1.8043398 -0.3688786 1.5337764 1.8888270 -0.3752817 -0.4300658 -0.1466200 1.0861286 -0.2436753 0.2809428
1.5784992 0.4557859 1.5651260 1.5575132 0.9413821 1.0519999 1.3622798 2.0354398 0.9388587 -0.3976580 1.2275958 -0.7793976 0.8501802 1.1802975 -0.2967439 0.8142570 0.2128891 1.4235749 0.2368272 0.2933013 1.5105411 -0.0239533 1.3462906 1.4550043 0.5269438 1.0819801 0.8542223 1.9532817 1.1512420 0.2012142
-0.7682333 0.2535091 -0.5921661 -0.7637917 3.2806668 3.3999174 1.9142129 1.4504311 2.8648622 4.9066020 0.3260865 -0.1103120 0.2863415 -0.2881246 0.6890953 2.7418679 0.8187979 1.1140268 4.7285198 2.0457109 -0.2812170 0.1338663 -0.2497196 -0.5495377 3.3912907 3.8899747 1.9878392 2.1738732 6.0407261 4.9306719
1.7487579 -1.1508038 1.7750113 1.8246238 0.2801253 0.5388663 1.3698061 1.4272370 -0.0095521 -0.5619555 1.2694258 -0.7895490 1.2720701 1.1893103 1.4817634 -0.0484772 0.8277425 1.1431989 -0.3607748 0.4988892 1.2974336 -1.4654809 1.3373627 1.2196511 0.2203623 -0.3131190 0.6126397 0.7286181 -0.8675896 -0.3967505
-0.4759559 -0.8346009 -0.3868077 -0.5052059 2.2354545 1.2432416 0.8655400 0.8239307 1.0045179 1.8883435 -0.2548461 -0.5921406 -0.3210217 -0.2890039 0.1562093 0.4451520 0.1598845 -0.0690628 0.1340009 0.4864178 -0.1653528 -0.3135604 -0.1149083 -0.2441054 2.0467119 1.7201029 1.2621327 0.9050914 1.7525273 2.2398308
# Covariance matrix of the scaled predictors and its eigen decomposition;
# `str()` prints the layout of the resulting list.
data.cov <- cov(x = scaled_df)
data.eigen <- eigen(x = data.cov)
utils::str(data.eigen)
## List of 2
##  $ values : num [1:30] 13.28 5.69 2.82 1.98 1.65 ...
##  $ vectors: num [1:30, 1:30] -0.219 -0.104 -0.228 -0.221 -0.143 ...
##  - attr(*, "class")= chr "eigen"
# Keep the first two eigenvectors (columns 1 and 2) as the loading matrix
# and print them.
phi <- data.eigen[["vectors"]][, seq_len(2)]
print(phi)
##              [,1]         [,2]
##  [1,] -0.21890244 -0.233857132
##  [2,] -0.10372458 -0.059706088
##  [3,] -0.22753729 -0.215181361
##  [4,] -0.22099499 -0.231076711
##  [5,] -0.14258969  0.186113023
##  [6,] -0.23928535  0.151891610
##  [7,] -0.25840048  0.060165363
##  [8,] -0.26085376 -0.034767500
##  [9,] -0.13816696  0.190348770
## [10,] -0.06436335  0.366575471
## [11,] -0.20597878 -0.105552152
## [12,] -0.01742803  0.089979682
## [13,] -0.21132592 -0.089457234
## [14,] -0.20286964 -0.152292628
## [15,] -0.01453145  0.204430453
## [16,] -0.17039345  0.232715896
## [17,] -0.15358979  0.197207283
## [18,] -0.18341740  0.130321560
## [19,] -0.04249842  0.183848000
## [20,] -0.10256832  0.280092027
## [21,] -0.22799663 -0.219866379
## [22,] -0.10446933 -0.045467298
## [23,] -0.23663968 -0.199878428
## [24,] -0.22487053 -0.219351858
## [25,] -0.12795256  0.172304352
## [26,] -0.21009588  0.143593173
## [27,] -0.22876753  0.097964114
## [28,] -0.25088597 -0.008257235
## [29,] -0.12290456  0.141883349
## [30,] -0.13178394  0.275339469
# Project the scaled observations onto the first two loading vectors.
scaled_matrix <- as.matrix(scaled_df)
PC1 <- scaled_matrix %*% phi[, 1]
PC2 <- scaled_matrix %*% phi[, 2]

# Combine the scores with the row names of `data1` (used as point labels)
# and show the first rows.
PC <- data.frame(x = row.names(data1), PC1, PC2)
head(PC)
##   x       PC1       PC2
## 1 1 -9.184755  1.946870
## 2 2 -2.385703 -3.764859
## 3 3 -5.728855 -1.074229
## 4 4 -7.116691 10.266556
## 5 5 -3.931842 -1.946359
## 6 6 -2.378155  3.946456
# Plot the observations in the plane of the first two principal
# components, labelled by row name, with reference lines through zero.
ggplot(data = PC, mapping = aes(x = PC1, y = PC2)) +
  modelr::geom_ref_line(h = 0) +
  modelr::geom_ref_line(v = 0) +
  geom_text(mapping = aes(label = x), size = 3) +
  labs(
    x = "First Principal Component",
    y = "Second Principal Component",
    title = "First Two Principal Components of Breast Cancer"
  )

Logistic Regression

# Reduced data set: the diagnosis plus the first six predictors.
data2 <- data1[, seq_len(7)]

# Seeded 70/30 split; `dt` holds the sorted row indices of the training set.
set.seed(123)
n_obs <- nrow(data2)
dt <- sort(sample(n_obs, floor(n_obs * 0.70)))
train <- data2[dt, ]
test <- data2[-dt, ]

# Logistic regression of diagnosis on all predictors in the training set.
model <- glm(
  formula = diagnosis ~ .,
  family = binomial(link = "logit"),
  data = train
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model)
## 
## Call:
## glm(formula = diagnosis ~ ., family = binomial(link = "logit"), 
##     data = train)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -13.05199   10.25871  -1.272   0.2033    
## radius_mean       -4.89296    3.44654  -1.420   0.1557    
## texture_mean       0.33265    0.06893   4.826 1.39e-06 ***
## perimeter_mean     0.42229    0.47672   0.886   0.3757    
## area_mean          0.03948    0.01689   2.337   0.0195 *  
## smoothness_mean  114.03299   26.57068   4.292 1.77e-05 ***
## compactness_mean   3.21523   16.04871   0.200   0.8412    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 515.0  on 397  degrees of freedom
## Residual deviance: 124.1  on 391  degrees of freedom
## AIC: 138.1
## 
## Number of Fisher Scoring iterations: 8
# Variance inflation factor of each predictor in the current `model`.
car::vif(model)
##      radius_mean     texture_mean   perimeter_mean        area_mean 
##       617.457109         1.495914       501.578328       106.257441 
##  smoothness_mean compactness_mean 
##         3.122056         9.734957

Forward Selection Method

# Forward selection on the training set, using the significance level
# ("SL") as the criterion with an entry threshold of 0.05.
stepwiseLogit(
  diagnosis ~ .,
  data = train,
  selection = "forward",
  select = "SL",
  sle = 0.05
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##       Table 1. Summary of Parameters      
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##            Paramters              Value   
## ——————————————————————————————————————————
## Response Variable              diagnosis   
## Included Variable              NULL        
## Selection Method               forward     
## Select Criterion               SL          
## Entry Significance Level(sle)  0.05        
## Variable significance test     Rao         
## Multicollinearity Terms        NULL        
## Intercept                      1           
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## 
##                                    Table 2. Variables Type                                   
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##   class                                        variable                                      
## —————————————————————————————————————————————————————————————————————————————————————————————
## factor   diagnosis                                                                            
## numeric  radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean   
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## 
##                       Table 3. Process of Selection                       
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##  Step   EnteredEffect   RemovedEffect  DF  NumberIn           SL          
## ——————————————————————————————————————————————————————————————————————————
## 0     1                               1   1         1                      
## 1     perimeter_mean                  1   2         2.24513488967356e-49   
## 2     smoothness_mean                 1   3         1.08217259042434e-12   
## 3     texture_mean                    1   4         1.18573248420783e-08   
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## 
##                 Table 4. Selected Varaibles                
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##  variables1    variables2      variables3      variables4  
## ———————————————————————————————————————————————————————————
## 1           perimeter_mean  smoothness_mean  texture_mean   
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## 
##                          Table 5. Coefficients of the Selected Variables                         
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##     Variable          Estimate            StdError            t.value             P.value        
## —————————————————————————————————————————————————————————————————————————————————————————————————
## (Intercept)      -39.0321512304377  4.99432978394651    -7.81529312619692  5.48352688524868e-15   
## perimeter_mean   0.205417660846271  0.0268940585535758  7.63803129367987   2.20568259640575e-14   
## smoothness_mean  130.566610524472   21.1401454504411    6.17623993319062   6.56462014341027e-10   
## texture_mean     0.337461681104516  0.0671347998059276  5.02662824764572   4.99178694892396e-07   
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗

Classification Tree

# Classification tree for diagnosis on the reduced data set `data2`.
#
# Fixes relative to the previous version:
#   * the tree is fitted on `train` instead of the full `data2`, so the
#     held-out `test` rows no longer leak into training and the split
#     created here is actually used;
#   * the split is seeded so this chunk gives the same partition when it is
#     run on its own.
set.seed(123)
n_obs <- nrow(data2)
dt <- sort(sample(n_obs, floor(n_obs * 0.7)))
train <- data2[dt, ]
test <- data2[-dt, ]

rtree <- rpart(diagnosis ~ ., data = train, method = "class")
rpart.plot(rtree)