library("rmarkdown")
library("naniar")
library("visdat")
library("kableExtra")
library("corrplot")

## corrplot 0.92 loaded

library("rpart.plot")

## Loading required package: rpart

library("rpart")
library("ggplot2")
library("gridExtra")
library(car)

## Loading required package: carData

library(psych)

## 
## Attaching package: 'psych'

## The following object is masked from 'package:car':
## 
##     logit

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%()        masks ggplot2::%+%()
## ✖ psych::alpha()      masks ggplot2::alpha()
## ✖ dplyr::combine()    masks gridExtra::combine()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::group_rows() masks kableExtra::group_rows()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dplyr::recode()     masks car::recode()
## ✖ purrr::some()       masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(devtools)

## Loading required package: usethis

library("StepReg")

# Read the raw data from "breast-cancer.csv" (path is relative to the current
# working directory) and print the structure of the resulting data frame.
data_orig <- read.csv("breast-cancer.csv")
str(data_orig)

## 'data.frame':    569 obs. of  32 variables:
##  $ id                     : int  842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
##  $ diagnosis              : chr  "M" "M" "M" "M" ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...

# Plot the missing values of data_orig, column by column.
vis_miss(data_orig, warn_large_data = FALSE)

# Plot the contents of data_orig by column class.
vis_dat(data_orig, warn_large_data = FALSE)

# Remove the first column (id), then recode diagnosis so that "B" becomes 0
# and every other value becomes 1, stored as a factor.
data1 <- data_orig[, -1]
data1$diagnosis <- as.factor(ifelse(data1$diagnosis == "B", 0, 1))
str(data1)

## 'data.frame':    569 obs. of  31 variables:
##  $ diagnosis              : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
##  $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
##  $ area_mean              : num  1001 1326 1203 386 1297 ...
##  $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
##  $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
##  $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ area_worst             : num  2019 1956 1709 568 1575 ...
##  $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...

# Render the first 20 rows of data1 as a small-font table inside a scroll box
# that is 500px high.
scroll_box(
  kable_styling(kable(head(data1, n = 20)), font_size = 10),
  height = "500px"
)

diagnosis	radius_mean	texture_mean	perimeter_mean	area_mean	smoothness_mean	compactness_mean	concavity_mean	concave.points_mean	symmetry_mean	fractal_dimension_mean	radius_se	texture_se	perimeter_se	area_se	smoothness_se	compactness_se	concavity_se	concave.points_se	symmetry_se	fractal_dimension_se	radius_worst	texture_worst	perimeter_worst	area_worst	smoothness_worst	compactness_worst	concavity_worst	concave.points_worst	symmetry_worst	fractal_dimension_worst
1	17.99	10.38	122.80	1001.0	0.11840	0.27760	0.30010	0.14710	0.2419	0.07871	1.0950	0.9053	8.589	153.40	0.006399	0.049040	0.05373	0.015870	0.03003	0.006193	25.38	17.33	184.60	2019.0	0.1622	0.6656	0.7119	0.26540	0.4601	0.11890
1	20.57	17.77	132.90	1326.0	0.08474	0.07864	0.08690	0.07017	0.1812	0.05667	0.5435	0.7339	3.398	74.08	0.005225	0.013080	0.01860	0.013400	0.01389	0.003532	24.99	23.41	158.80	1956.0	0.1238	0.1866	0.2416	0.18600	0.2750	0.08902
1	19.69	21.25	130.00	1203.0	0.10960	0.15990	0.19740	0.12790	0.2069	0.05999	0.7456	0.7869	4.585	94.03	0.006150	0.040060	0.03832	0.020580	0.02250	0.004571	23.57	25.53	152.50	1709.0	0.1444	0.4245	0.4504	0.24300	0.3613	0.08758
1	11.42	20.38	77.58	386.1	0.14250	0.28390	0.24140	0.10520	0.2597	0.09744	0.4956	1.1560	3.445	27.23	0.009110	0.074580	0.05661	0.018670	0.05963	0.009208	14.91	26.50	98.87	567.7	0.2098	0.8663	0.6869	0.25750	0.6638	0.17300
1	20.29	14.34	135.10	1297.0	0.10030	0.13280	0.19800	0.10430	0.1809	0.05883	0.7572	0.7813	5.438	94.44	0.011490	0.024610	0.05688	0.018850	0.01756	0.005115	22.54	16.67	152.20	1575.0	0.1374	0.2050	0.4000	0.16250	0.2364	0.07678
1	12.45	15.70	82.57	477.1	0.12780	0.17000	0.15780	0.08089	0.2087	0.07613	0.3345	0.8902	2.217	27.19	0.007510	0.033450	0.03672	0.011370	0.02165	0.005082	15.47	23.75	103.40	741.6	0.1791	0.5249	0.5355	0.17410	0.3985	0.12440
1	18.25	19.98	119.60	1040.0	0.09463	0.10900	0.11270	0.07400	0.1794	0.05742	0.4467	0.7732	3.180	53.91	0.004314	0.013820	0.02254	0.010390	0.01369	0.002179	22.88	27.66	153.20	1606.0	0.1442	0.2576	0.3784	0.19320	0.3063	0.08368
1	13.71	20.83	90.20	577.9	0.11890	0.16450	0.09366	0.05985	0.2196	0.07451	0.5835	1.3770	3.856	50.96	0.008805	0.030290	0.02488	0.014480	0.01486	0.005412	17.06	28.14	110.60	897.0	0.1654	0.3682	0.2678	0.15560	0.3196	0.11510
1	13.00	21.82	87.50	519.8	0.12730	0.19320	0.18590	0.09353	0.2350	0.07389	0.3063	1.0020	2.406	24.32	0.005731	0.035020	0.03553	0.012260	0.02143	0.003749	15.49	30.73	106.20	739.3	0.1703	0.5401	0.5390	0.20600	0.4378	0.10720
1	12.46	24.04	83.97	475.9	0.11860	0.23960	0.22730	0.08543	0.2030	0.08243	0.2976	1.5990	2.039	23.94	0.007149	0.072170	0.07743	0.014320	0.01789	0.010080	15.09	40.68	97.65	711.4	0.1853	1.0580	1.1050	0.22100	0.4366	0.20750
1	16.02	23.24	102.70	797.8	0.08206	0.06669	0.03299	0.03323	0.1528	0.05697	0.3795	1.1870	2.466	40.51	0.004029	0.009269	0.01101	0.007591	0.01460	0.003042	19.19	33.88	123.80	1150.0	0.1181	0.1551	0.1459	0.09975	0.2948	0.08452
1	15.78	17.89	103.60	781.0	0.09710	0.12920	0.09954	0.06606	0.1842	0.06082	0.5058	0.9849	3.564	54.16	0.005771	0.040610	0.02791	0.012820	0.02008	0.004144	20.42	27.28	136.50	1299.0	0.1396	0.5609	0.3965	0.18100	0.3792	0.10480
1	19.17	24.80	132.40	1123.0	0.09740	0.24580	0.20650	0.11180	0.2397	0.07800	0.9555	3.5680	11.070	116.20	0.003139	0.082970	0.08890	0.040900	0.04484	0.012840	20.96	29.94	151.70	1332.0	0.1037	0.3903	0.3639	0.17670	0.3176	0.10230
1	15.85	23.95	103.70	782.7	0.08401	0.10020	0.09938	0.05364	0.1847	0.05338	0.4033	1.0780	2.903	36.58	0.009769	0.031260	0.05051	0.019920	0.02981	0.003002	16.84	27.66	112.00	876.5	0.1131	0.1924	0.2322	0.11190	0.2809	0.06287
1	13.73	22.61	93.60	578.3	0.11310	0.22930	0.21280	0.08025	0.2069	0.07682	0.2121	1.1690	2.061	19.21	0.006429	0.059360	0.05501	0.016280	0.01961	0.008093	15.03	32.01	108.80	697.7	0.1651	0.7725	0.6943	0.22080	0.3596	0.14310
1	14.54	27.54	96.73	658.8	0.11390	0.15950	0.16390	0.07364	0.2303	0.07077	0.3700	1.0330	2.879	32.55	0.005607	0.042400	0.04741	0.010900	0.01857	0.005466	17.46	37.13	124.10	943.2	0.1678	0.6577	0.7026	0.17120	0.4218	0.13410
1	14.68	20.13	94.74	684.5	0.09867	0.07200	0.07395	0.05259	0.1586	0.05922	0.4727	1.2400	3.195	45.40	0.005718	0.011620	0.01998	0.011090	0.01410	0.002085	19.07	30.88	123.40	1138.0	0.1464	0.1871	0.2914	0.16090	0.3029	0.08216
1	16.13	20.68	108.10	798.8	0.11700	0.20220	0.17220	0.10280	0.2164	0.07356	0.5692	1.0730	3.854	54.18	0.007026	0.025010	0.03188	0.012970	0.01689	0.004142	20.96	31.48	136.80	1315.0	0.1789	0.4233	0.4784	0.20730	0.3706	0.11420
1	19.81	22.15	130.00	1260.0	0.09831	0.10270	0.14790	0.09498	0.1582	0.05395	0.7582	1.0170	5.865	112.40	0.006494	0.018930	0.03391	0.015210	0.01356	0.001997	27.32	30.88	186.80	2398.0	0.1512	0.3150	0.5372	0.23880	0.2768	0.07615
0	13.54	14.36	87.46	566.3	0.09779	0.08129	0.06664	0.04781	0.1885	0.05766	0.2699	0.7886	2.058	23.56	0.008462	0.014600	0.02387	0.013150	0.01980	0.002300	15.11	19.26	99.70	711.2	0.1440	0.1773	0.2390	0.12880	0.2977	0.07259

Correlation plot

# Pairwise correlations between all predictors (diagnosis, column 1, excluded).
data_matrix <- cor(data1[, -1])

# Upper-triangle correlogram with variables ordered by hierarchical clustering.
corrplot(
  data_matrix,
  order = "hclust",
  type = "upper",
  tl.srt = 45,
  tl.cex = 0.6
)

The correlation plot shows that many variables are highly correlated with one another. For example, radius mean and area mean are highly correlated, as are area mean and perimeter mean.

Distribution of all numerical independent variables

The original dataset has 30 independent variables. The for loop below creates a histogram for each of them. You may have to enlarge the output to view all of the graphs.

# Build one histogram per predictor (every column of data1 except the first,
# diagnosis) and collect them in plot_list, in column order.
number_of_variables <- ncol(data1)
predictor_names <- names(data1)[-1]

# Preallocate the result list instead of growing it inside the loop.
plot_list <- vector("list", length(predictor_names))

# seq_along() yields an empty sequence when there are no predictors, whereas
# 2:ncol(data1) would count backwards (2, 1) for a single-column data frame.
for (i in seq_along(predictor_names)) {
  var_name <- predictor_names[[i]]

  # aes_string() is deprecated since ggplot2 3.0.0; use the .data pronoun.
  # The column name is injected with `!!` so that each plot keeps its own
  # column: aes() is evaluated lazily, and a bare `var_name` would resolve to
  # the loop's last value by the time the plots are drawn.
  plot_list[[i]] <- ggplot(data1, aes(x = .data[[!!var_name]])) +
    geom_histogram(bins = 30) +
    labs(x = var_name) +
    ggtitle(paste("Histogram of", var_name))
}

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Draw every histogram in plot_list on one page, arranged in five columns.
grid.arrange(grobs = plot_list, ncol = 5)

Plot 1 is a scatter plot of radius mean and texture mean.
Plot 2 is a box plot of the variable radius mean by diagnosis.
Plot 3 is a violin plot of the variable radius mean by diagnosis.
Plot 4 is a faceted scatter plot for radius mean vs texture mean by diagnosis.

# Plot 1: radius mean against texture mean, points coloured by diagnosis.
ggplot(data1, aes(x = radius_mean, y = texture_mean, color = diagnosis)) +
  geom_point() +
  labs(title = "Scatter Plot of Radius Mean vs Texture Mean")

# Plot 2: distribution of radius mean within each diagnosis level (box plot).
ggplot(data1, aes(x = diagnosis, y = radius_mean)) +
  geom_boxplot() +
  labs(title = "Box Plot of Radius Mean by Diagnosis")

# Plot 3: the same comparison drawn as a violin plot.
ggplot(data1, aes(x = diagnosis, y = radius_mean)) +
  geom_violin() +
  labs(title = "Violin Plot of Radius Mean by Diagnosis")

# Plot 4: radius mean against texture mean, one panel per diagnosis level.
ggplot(data1, aes(x = radius_mean, y = texture_mean)) +
  geom_point() +
  facet_wrap(vars(diagnosis)) +
  labs(title = "Faceted Scatter Plots for Radius Mean vs Texture Mean")

VIF

# Fit a logistic regression of diagnosis on every other column of data1.
# The two warnings printed below were raised by this fit.
model <- glm(diagnosis ~., data = data1, family = binomial())

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# Variance inflation factor of each predictor in the fitted model.
vif(model)

##             radius_mean            texture_mean          perimeter_mean 
##             4318063.764              140816.298             1691913.733 
##               area_mean         smoothness_mean        compactness_mean 
##             6331653.442              213016.610              415839.480 
##          concavity_mean     concave.points_mean           symmetry_mean 
##              105593.174              192381.201               11851.056 
##  fractal_dimension_mean               radius_se              texture_se 
##                3513.136              610335.622              356773.142 
##            perimeter_se                 area_se           smoothness_se 
##               49276.836             1109444.175               41333.693 
##          compactness_se            concavity_se       concave.points_se 
##              473636.453              778242.166             1574955.419 
##             symmetry_se    fractal_dimension_se            radius_worst 
##               24678.711              463075.232             3511808.081 
##           texture_worst         perimeter_worst              area_worst 
##              823081.427              617752.398             4767645.798 
##        smoothness_worst       compactness_worst         concavity_worst 
##               58738.790               91238.174             1705825.343 
##    concave.points_worst          symmetry_worst fractal_dimension_worst 
##              561062.026               10409.339              193961.537

Factor Analysis

# Predictors only (diagnosis, column 1, dropped) and their correlation matrix.
data_fa <- data1[,-1]
datamatrix <- cor(data_fa)
# Kaiser-Meyer-Olkin measure of sampling adequacy from the correlation matrix.
KMO(r=datamatrix)

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = datamatrix)
## Overall MSA =  0.83
## MSA for each item = 
##             radius_mean            texture_mean          perimeter_mean 
##                    0.83                    0.64                    0.85 
##               area_mean         smoothness_mean        compactness_mean 
##                    0.86                    0.81                    0.88 
##          concavity_mean     concave.points_mean           symmetry_mean 
##                    0.89                    0.90                    0.83 
##  fractal_dimension_mean               radius_se              texture_se 
##                    0.83                    0.83                    0.48 
##            perimeter_se                 area_se           smoothness_se 
##                    0.84                    0.85                    0.64 
##          compactness_se            concavity_se       concave.points_se 
##                    0.87                    0.83                    0.84 
##             symmetry_se    fractal_dimension_se            radius_worst 
##                    0.58                    0.81                    0.82 
##           texture_worst         perimeter_worst              area_worst 
##                    0.60                    0.88                    0.82 
##        smoothness_worst       compactness_worst         concavity_worst 
##                    0.75                    0.85                    0.90 
##    concave.points_worst          symmetry_worst fractal_dimension_worst 
##                    0.89                    0.69                    0.81

Since the overall MSA of 0.83 is greater than 0.5, we can run Factor Analysis.

# Bartlett's test on the correlation matrix; the second argument passes the
# number of rows of data1 as the sample size.
cortest.bartlett(datamatrix, nrow(data1))

## $chisq
## [1] 39362.12
## 
## $p.value
## [1] 0
## 
## $df
## [1] 435

With a chi-square value of 39362.12 on 435 degrees of freedom and a p-value of 0, the test is significant at an alpha level of 0.05.

# Eigen-decomposition of the predictor correlation matrix, then print the
# eigenvalues.
ev <- eigen(cor(data_fa))
ev$values

##  [1] 1.328161e+01 5.691355e+00 2.817949e+00 1.980640e+00 1.648731e+00
##  [6] 1.207357e+00 6.752201e-01 4.766171e-01 4.168948e-01 3.506935e-01
## [11] 2.939157e-01 2.611614e-01 2.413575e-01 1.570097e-01 9.413497e-02
## [16] 7.986280e-02 5.939904e-02 5.261878e-02 4.947759e-02 3.115940e-02
## [21] 2.997289e-02 2.743940e-02 2.434084e-02 1.805501e-02 1.548127e-02
## [26] 8.177640e-03 6.900464e-03 1.589338e-03 7.488031e-04 1.330448e-04

# Scree plot: eigenvalue against factor index, with a reference line at 1.
# The factor index is derived from the number of eigenvalues instead of being
# hard-coded to 30, so the plot keeps working if the predictor count changes.
Eigen_Values <- ev$values
Factor <- seq_along(Eigen_Values)
Scree <- data.frame(Factor, Eigen_Values)

# NOTE(review): ylim = c(0, 4) leaves any eigenvalue above 4 outside the
# plotting area -- confirm this zoom is intended.
plot(Scree, main = "Scree Plot", col = "Blue", ylim = c(0, 4))
lines(Scree, col = "Red")
abline(h = 1, col = "Green")

Diagram

# Four-factor solution on the predictors using principal-axis factoring
# (fm = "pa") with a varimax rotation. The warnings below were raised by
# this call.
fa_var <-  fa(r=data_fa, nfactors = 4, rotate="varimax",fm="pa")

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

# Draw the diagram of the fitted factor solution.
fa.diagram(fa_var)

PCA

# Standardise each predictor column (diagnosis, column 1, is excluded).
scaled_df <- apply(data1[, -1], MARGIN = 2, FUN = scale)

# Preview the first rows of the scaled matrix as a striped, hoverable table.
dt <- head(scaled_df)
kable_styling(kbl(dt), bootstrap_options = c("striped", "hover"))

radius_mean	texture_mean	perimeter_mean	area_mean	smoothness_mean	compactness_mean	concavity_mean	concave.points_mean	symmetry_mean	fractal_dimension_mean	radius_se	texture_se	perimeter_se	area_se	smoothness_se	compactness_se	concavity_se	concave.points_se	symmetry_se	fractal_dimension_se	radius_worst	texture_worst	perimeter_worst	area_worst	smoothness_worst	compactness_worst	concavity_worst	concave.points_worst	symmetry_worst	fractal_dimension_worst
1.0960995	-2.0715123	1.2688173	0.9835095	1.5670875	3.2806281	2.6505418	2.5302489	2.2155655	2.2537638	2.4875451	-0.5647681	2.8305403	2.4853907	-0.2138135	1.3157039	0.7233897	0.6602390	1.1477468	0.9062856	1.8850310	-1.3580985	2.3015755	1.9994782	1.3065367	2.6143647	2.1076718	2.2940576	2.7482041	1.9353117
1.8282120	-0.3533215	1.6844726	1.9070303	-0.8262354	-0.4866435	-0.0238249	0.5476623	0.0013911	-0.8678888	0.4988157	-0.8754733	0.2630955	0.7417493	-0.6048187	-0.6923171	-0.4403926	0.2599334	-0.8047423	-0.0993563	1.8043398	-0.3688786	1.5337764	1.8888270	-0.3752817	-0.4300658	-0.1466200	1.0861286	-0.2436753	0.2809428
1.5784992	0.4557859	1.5651260	1.5575132	0.9413821	1.0519999	1.3622798	2.0354398	0.9388587	-0.3976580	1.2275958	-0.7793976	0.8501802	1.1802975	-0.2967439	0.8142570	0.2128891	1.4235749	0.2368272	0.2933013	1.5105411	-0.0239533	1.3462906	1.4550043	0.5269438	1.0819801	0.8542223	1.9532817	1.1512420	0.2012142
-0.7682333	0.2535091	-0.5921661	-0.7637917	3.2806668	3.3999174	1.9142129	1.4504311	2.8648622	4.9066020	0.3260865	-0.1103120	0.2863415	-0.2881246	0.6890953	2.7418679	0.8187979	1.1140268	4.7285198	2.0457109	-0.2812170	0.1338663	-0.2497196	-0.5495377	3.3912907	3.8899747	1.9878392	2.1738732	6.0407261	4.9306719
1.7487579	-1.1508038	1.7750113	1.8246238	0.2801253	0.5388663	1.3698061	1.4272370	-0.0095521	-0.5619555	1.2694258	-0.7895490	1.2720701	1.1893103	1.4817634	-0.0484772	0.8277425	1.1431989	-0.3607748	0.4988892	1.2974336	-1.4654809	1.3373627	1.2196511	0.2203623	-0.3131190	0.6126397	0.7286181	-0.8675896	-0.3967505
-0.4759559	-0.8346009	-0.3868077	-0.5052059	2.2354545	1.2432416	0.8655400	0.8239307	1.0045179	1.8883435	-0.2548461	-0.5921406	-0.3210217	-0.2890039	0.1562093	0.4451520	0.1598845	-0.0690628	0.1340009	0.4864178	-0.1653528	-0.3135604	-0.1149083	-0.2441054	2.0467119	1.7201029	1.2621327	0.9050914	1.7525273	2.2398308

# Covariance matrix of the standardised predictors and its
# eigen-decomposition; str() prints the structure of the result.
data.cov <- cov(scaled_df)
data.eigen <- eigen(data.cov)
str(data.eigen)

## List of 2
##  $ values : num [1:30] 13.28 5.69 2.82 1.98 1.65 ...
##  $ vectors: num [1:30, 1:30] -0.219 -0.104 -0.228 -0.221 -0.143 ...
##  - attr(*, "class")= chr "eigen"

# Keep the first two eigenvectors (columns 1 and 2) and print them.
phi <- data.eigen$vectors[,1:2]
print(phi)

##              [,1]         [,2]
##  [1,] -0.21890244 -0.233857132
##  [2,] -0.10372458 -0.059706088
##  [3,] -0.22753729 -0.215181361
##  [4,] -0.22099499 -0.231076711
##  [5,] -0.14258969  0.186113023
##  [6,] -0.23928535  0.151891610
##  [7,] -0.25840048  0.060165363
##  [8,] -0.26085376 -0.034767500
##  [9,] -0.13816696  0.190348770
## [10,] -0.06436335  0.366575471
## [11,] -0.20597878 -0.105552152
## [12,] -0.01742803  0.089979682
## [13,] -0.21132592 -0.089457234
## [14,] -0.20286964 -0.152292628
## [15,] -0.01453145  0.204430453
## [16,] -0.17039345  0.232715896
## [17,] -0.15358979  0.197207283
## [18,] -0.18341740  0.130321560
## [19,] -0.04249842  0.183848000
## [20,] -0.10256832  0.280092027
## [21,] -0.22799663 -0.219866379
## [22,] -0.10446933 -0.045467298
## [23,] -0.23663968 -0.199878428
## [24,] -0.22487053 -0.219351858
## [25,] -0.12795256  0.172304352
## [26,] -0.21009588  0.143593173
## [27,] -0.22876753  0.097964114
## [28,] -0.25088597 -0.008257235
## [29,] -0.12290456  0.141883349
## [30,] -0.13178394  0.275339469

# Project the scaled data onto each of the two retained eigenvectors.
PC1 <- as.matrix(scaled_df) %*% phi[,1]
PC2 <- as.matrix(scaled_df) %*% phi[,2]

# Combine the two score columns with the row names of data1 in column x.
PC <- data.frame(x = row.names(data1), PC1, PC2)
head(PC)

##   x       PC1       PC2
## 1 1 -9.184755  1.946870
## 2 2 -2.385703 -3.764859
## 3 3 -5.728855 -1.074229
## 4 4 -7.116691 10.266556
## 5 5 -3.931842 -1.946359
## 6 6 -2.378155  3.946456

# Plot the first two principal-component scores, labelling each point with
# its x value and drawing reference lines at 0 on both axes.
ggplot(PC, aes(x = PC1, y = PC2)) +
  modelr::geom_ref_line(h = 0) +
  modelr::geom_ref_line(v = 0) +
  geom_text(aes(label = x), size = 3) +
  labs(
    x = "First Principal Component",
    y = "Second Principal Component",
    title = "First Two Principal Components of Breast Cancer"
  )

Logistic Regression

# Work with diagnosis plus the first six predictors (columns 1 to 7 of data1).
data2 <- data1[, 1:7]

# Seeded 70/30 split: the sampled, sorted row indices form the training set
# and the remaining rows form the test set.
set.seed(123)
dt <- sort(sample(nrow(data2), size = nrow(data2) * 0.70))
train <- data2[dt, ]
test <- data2[-dt, ]

# Logistic regression of diagnosis on all predictors in the training set.
model <- glm(diagnosis ~ ., data = train, family = binomial(link = "logit"))

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(model)

## 
## Call:
## glm(formula = diagnosis ~ ., family = binomial(link = "logit"), 
##     data = train)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -13.05199   10.25871  -1.272   0.2033    
## radius_mean       -4.89296    3.44654  -1.420   0.1557    
## texture_mean       0.33265    0.06893   4.826 1.39e-06 ***
## perimeter_mean     0.42229    0.47672   0.886   0.3757    
## area_mean          0.03948    0.01689   2.337   0.0195 *  
## smoothness_mean  114.03299   26.57068   4.292 1.77e-05 ***
## compactness_mean   3.21523   16.04871   0.200   0.8412    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 515.0  on 397  degrees of freedom
## Residual deviance: 124.1  on 391  degrees of freedom
## AIC: 138.1
## 
## Number of Fisher Scoring iterations: 8

# Variance inflation factors for the predictors of the model fitted on train.
vif(model)

##      radius_mean     texture_mean   perimeter_mean        area_mean 
##       617.457109         1.495914       501.578328       106.257441 
##  smoothness_mean compactness_mean 
##         3.122056         9.734957

Forward Selection Method

# Forward selection for the logistic model on the training set, selecting on
# significance level (select = "SL") with an entry threshold of 0.05.
stepwiseLogit(diagnosis ~ ., data = train, selection = "forward", select = "SL", sle = 0.05)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

##       Table 1. Summary of Parameters      
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##            Paramters              Value   
## ——————————————————————————————————————————
## Response Variable              diagnosis   
## Included Variable              NULL        
## Selection Method               forward     
## Select Criterion               SL          
## Entry Significance Level(sle)  0.05        
## Variable significance test     Rao         
## Multicollinearity Terms        NULL        
## Intercept                      1           
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## 
##                                    Table 2. Variables Type                                   
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##   class                                        variable                                      
## —————————————————————————————————————————————————————————————————————————————————————————————
## factor   diagnosis                                                                            
## numeric  radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean   
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## 
##                       Table 3. Process of Selection                       
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##  Step   EnteredEffect   RemovedEffect  DF  NumberIn           SL          
## ——————————————————————————————————————————————————————————————————————————
## 0     1                               1   1         1                      
## 1     perimeter_mean                  1   2         2.24513488967356e-49   
## 2     smoothness_mean                 1   3         1.08217259042434e-12   
## 3     texture_mean                    1   4         1.18573248420783e-08   
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## 
##                 Table 4. Selected Varaibles                
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##  variables1    variables2      variables3      variables4  
## ———————————————————————————————————————————————————————————
## 1           perimeter_mean  smoothness_mean  texture_mean   
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## 
##                          Table 5. Coefficients of the Selected Variables                         
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##     Variable          Estimate            StdError            t.value             P.value        
## —————————————————————————————————————————————————————————————————————————————————————————————————
## (Intercept)      -39.0321512304377  4.99432978394651    -7.81529312619692  5.48352688524868e-15   
## perimeter_mean   0.205417660846271  0.0268940585535758  7.63803129367987   2.20568259640575e-14   
## smoothness_mean  130.566610524472   21.1401454504411    6.17623993319062   6.56462014341027e-10   
## texture_mean     0.337461681104516  0.0671347998059276  5.02662824764572   4.99178694892396e-07   
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗

Classification Tree

# Draw a 70/30 train/test split of data2 and fit the classification tree on
# the training rows only. The tree was previously fitted on all of data2,
# which made the split pointless and left no unseen rows for evaluation.
dt <- sort(sample(nrow(data2), nrow(data2) * 0.7))
train <- data2[dt, ]
test <- data2[-dt, ]

# method = "class" requests a classification tree for the factor response.
rtree <- rpart(diagnosis ~ ., data = train, method = "class")
rpart.plot(rtree)

Untitled

Kelvin Anderson

2023-11-12