library("rmarkdown")
library("naniar")
library("visdat")
library("kableExtra")
library("corrplot")
## corrplot 0.92 loaded
library("rpart.plot")
## Loading required package: rpart
library("rpart")
library("ggplot2")
library("gridExtra")
library(car)
## Loading required package: carData
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ psych::alpha() masks ggplot2::alpha()
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::group_rows() masks kableExtra::group_rows()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(devtools)
## Loading required package: usethis
library("StepReg")
# Read the raw breast-cancer CSV and print a compact overview of its columns.
data_orig <- utils::read.csv(file = "breast-cancer.csv")
utils::str(data_orig)
## 'data.frame': 569 obs. of 32 variables:
## $ id : int 842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
## $ diagnosis : chr "M" "M" "M" "M" ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
# Visual overview of missing values and column types in the raw data.
visdat::vis_miss(data_orig, warn_large_data = FALSE)
visdat::vis_dat(data_orig, warn_large_data = FALSE)

# Drop the first column (`id`) and recode the outcome as a 0/1 factor:
# "B" becomes 0 and every other label becomes 1.
data1 <- data_orig[, -1]
data1$diagnosis <- factor(as.integer(data1$diagnosis != "B"))
str(data1)
## 'data.frame': 569 obs. of 31 variables:
## $ diagnosis : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
# Render the first 20 rows as a small-font table inside a scrollable box.
head(data1, 20) %>%
  kable() %>%
  kable_styling(font_size = 10) %>%
  scroll_box(height = "500px")
| diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave.points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave.points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave.points_worst | symmetry_worst | fractal_dimension_worst |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.30010 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.049040 | 0.05373 | 0.015870 | 0.03003 | 0.006193 | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.26540 | 0.4601 | 0.11890 |
| 1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.08690 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.013080 | 0.01860 | 0.013400 | 0.01389 | 0.003532 | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.18600 | 0.2750 | 0.08902 |
| 1 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.19740 | 0.12790 | 0.2069 | 0.05999 | 0.7456 | 0.7869 | 4.585 | 94.03 | 0.006150 | 0.040060 | 0.03832 | 0.020580 | 0.02250 | 0.004571 | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.24300 | 0.3613 | 0.08758 |
| 1 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.24140 | 0.10520 | 0.2597 | 0.09744 | 0.4956 | 1.1560 | 3.445 | 27.23 | 0.009110 | 0.074580 | 0.05661 | 0.018670 | 0.05963 | 0.009208 | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.25750 | 0.6638 | 0.17300 |
| 1 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.19800 | 0.10430 | 0.1809 | 0.05883 | 0.7572 | 0.7813 | 5.438 | 94.44 | 0.011490 | 0.024610 | 0.05688 | 0.018850 | 0.01756 | 0.005115 | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.16250 | 0.2364 | 0.07678 |
| 1 | 12.45 | 15.70 | 82.57 | 477.1 | 0.12780 | 0.17000 | 0.15780 | 0.08089 | 0.2087 | 0.07613 | 0.3345 | 0.8902 | 2.217 | 27.19 | 0.007510 | 0.033450 | 0.03672 | 0.011370 | 0.02165 | 0.005082 | 15.47 | 23.75 | 103.40 | 741.6 | 0.1791 | 0.5249 | 0.5355 | 0.17410 | 0.3985 | 0.12440 |
| 1 | 18.25 | 19.98 | 119.60 | 1040.0 | 0.09463 | 0.10900 | 0.11270 | 0.07400 | 0.1794 | 0.05742 | 0.4467 | 0.7732 | 3.180 | 53.91 | 0.004314 | 0.013820 | 0.02254 | 0.010390 | 0.01369 | 0.002179 | 22.88 | 27.66 | 153.20 | 1606.0 | 0.1442 | 0.2576 | 0.3784 | 0.19320 | 0.3063 | 0.08368 |
| 1 | 13.71 | 20.83 | 90.20 | 577.9 | 0.11890 | 0.16450 | 0.09366 | 0.05985 | 0.2196 | 0.07451 | 0.5835 | 1.3770 | 3.856 | 50.96 | 0.008805 | 0.030290 | 0.02488 | 0.014480 | 0.01486 | 0.005412 | 17.06 | 28.14 | 110.60 | 897.0 | 0.1654 | 0.3682 | 0.2678 | 0.15560 | 0.3196 | 0.11510 |
| 1 | 13.00 | 21.82 | 87.50 | 519.8 | 0.12730 | 0.19320 | 0.18590 | 0.09353 | 0.2350 | 0.07389 | 0.3063 | 1.0020 | 2.406 | 24.32 | 0.005731 | 0.035020 | 0.03553 | 0.012260 | 0.02143 | 0.003749 | 15.49 | 30.73 | 106.20 | 739.3 | 0.1703 | 0.5401 | 0.5390 | 0.20600 | 0.4378 | 0.10720 |
| 1 | 12.46 | 24.04 | 83.97 | 475.9 | 0.11860 | 0.23960 | 0.22730 | 0.08543 | 0.2030 | 0.08243 | 0.2976 | 1.5990 | 2.039 | 23.94 | 0.007149 | 0.072170 | 0.07743 | 0.014320 | 0.01789 | 0.010080 | 15.09 | 40.68 | 97.65 | 711.4 | 0.1853 | 1.0580 | 1.1050 | 0.22100 | 0.4366 | 0.20750 |
| 1 | 16.02 | 23.24 | 102.70 | 797.8 | 0.08206 | 0.06669 | 0.03299 | 0.03323 | 0.1528 | 0.05697 | 0.3795 | 1.1870 | 2.466 | 40.51 | 0.004029 | 0.009269 | 0.01101 | 0.007591 | 0.01460 | 0.003042 | 19.19 | 33.88 | 123.80 | 1150.0 | 0.1181 | 0.1551 | 0.1459 | 0.09975 | 0.2948 | 0.08452 |
| 1 | 15.78 | 17.89 | 103.60 | 781.0 | 0.09710 | 0.12920 | 0.09954 | 0.06606 | 0.1842 | 0.06082 | 0.5058 | 0.9849 | 3.564 | 54.16 | 0.005771 | 0.040610 | 0.02791 | 0.012820 | 0.02008 | 0.004144 | 20.42 | 27.28 | 136.50 | 1299.0 | 0.1396 | 0.5609 | 0.3965 | 0.18100 | 0.3792 | 0.10480 |
| 1 | 19.17 | 24.80 | 132.40 | 1123.0 | 0.09740 | 0.24580 | 0.20650 | 0.11180 | 0.2397 | 0.07800 | 0.9555 | 3.5680 | 11.070 | 116.20 | 0.003139 | 0.082970 | 0.08890 | 0.040900 | 0.04484 | 0.012840 | 20.96 | 29.94 | 151.70 | 1332.0 | 0.1037 | 0.3903 | 0.3639 | 0.17670 | 0.3176 | 0.10230 |
| 1 | 15.85 | 23.95 | 103.70 | 782.7 | 0.08401 | 0.10020 | 0.09938 | 0.05364 | 0.1847 | 0.05338 | 0.4033 | 1.0780 | 2.903 | 36.58 | 0.009769 | 0.031260 | 0.05051 | 0.019920 | 0.02981 | 0.003002 | 16.84 | 27.66 | 112.00 | 876.5 | 0.1131 | 0.1924 | 0.2322 | 0.11190 | 0.2809 | 0.06287 |
| 1 | 13.73 | 22.61 | 93.60 | 578.3 | 0.11310 | 0.22930 | 0.21280 | 0.08025 | 0.2069 | 0.07682 | 0.2121 | 1.1690 | 2.061 | 19.21 | 0.006429 | 0.059360 | 0.05501 | 0.016280 | 0.01961 | 0.008093 | 15.03 | 32.01 | 108.80 | 697.7 | 0.1651 | 0.7725 | 0.6943 | 0.22080 | 0.3596 | 0.14310 |
| 1 | 14.54 | 27.54 | 96.73 | 658.8 | 0.11390 | 0.15950 | 0.16390 | 0.07364 | 0.2303 | 0.07077 | 0.3700 | 1.0330 | 2.879 | 32.55 | 0.005607 | 0.042400 | 0.04741 | 0.010900 | 0.01857 | 0.005466 | 17.46 | 37.13 | 124.10 | 943.2 | 0.1678 | 0.6577 | 0.7026 | 0.17120 | 0.4218 | 0.13410 |
| 1 | 14.68 | 20.13 | 94.74 | 684.5 | 0.09867 | 0.07200 | 0.07395 | 0.05259 | 0.1586 | 0.05922 | 0.4727 | 1.2400 | 3.195 | 45.40 | 0.005718 | 0.011620 | 0.01998 | 0.011090 | 0.01410 | 0.002085 | 19.07 | 30.88 | 123.40 | 1138.0 | 0.1464 | 0.1871 | 0.2914 | 0.16090 | 0.3029 | 0.08216 |
| 1 | 16.13 | 20.68 | 108.10 | 798.8 | 0.11700 | 0.20220 | 0.17220 | 0.10280 | 0.2164 | 0.07356 | 0.5692 | 1.0730 | 3.854 | 54.18 | 0.007026 | 0.025010 | 0.03188 | 0.012970 | 0.01689 | 0.004142 | 20.96 | 31.48 | 136.80 | 1315.0 | 0.1789 | 0.4233 | 0.4784 | 0.20730 | 0.3706 | 0.11420 |
| 1 | 19.81 | 22.15 | 130.00 | 1260.0 | 0.09831 | 0.10270 | 0.14790 | 0.09498 | 0.1582 | 0.05395 | 0.7582 | 1.0170 | 5.865 | 112.40 | 0.006494 | 0.018930 | 0.03391 | 0.015210 | 0.01356 | 0.001997 | 27.32 | 30.88 | 186.80 | 2398.0 | 0.1512 | 0.3150 | 0.5372 | 0.23880 | 0.2768 | 0.07615 |
| 0 | 13.54 | 14.36 | 87.46 | 566.3 | 0.09779 | 0.08129 | 0.06664 | 0.04781 | 0.1885 | 0.05766 | 0.2699 | 0.7886 | 2.058 | 23.56 | 0.008462 | 0.014600 | 0.02387 | 0.013150 | 0.01980 | 0.002300 | 15.11 | 19.26 | 99.70 | 711.2 | 0.1440 | 0.1773 | 0.2390 | 0.12880 | 0.2977 | 0.07259 |
# Pairwise correlations between all predictors (the outcome column is
# excluded), drawn as an upper-triangle plot ordered by hierarchical clustering.
data_matrix <- cor(x = data1[, -1])
corrplot(
  corr = data_matrix,
  order = "hclust",
  type = "upper",
  tl.srt = 45,
  tl.cex = 0.6
)
The correlation plot shows that many variables are highly correlated
with one another. For example, radius mean and area mean are highly
correlated, and area mean and perimeter mean are also highly correlated.
The original dataset has 30 independent variables. The code below creates a histogram for each of them. You may have to enlarge the output to view all graphs.
# Build one histogram per predictor, i.e. every column of `data1` except the
# first one (`diagnosis`).
#
# `aes_string()` is deprecated, so the column is looked up through the `.data`
# pronoun instead. Each plot is created inside its own function call so that
# `var_name` is bound per plot; a plain `for` loop would let every plot see
# only the last value of the loop variable when it is finally drawn.
number_of_variables <- ncol(data1)
predictor_names <- names(data1)[-1]
plot_list <- lapply(predictor_names, function(var_name) {
  ggplot(data1, aes(x = .data[[var_name]])) +
    geom_histogram(bins = 30) +
    xlab(var_name) + # keep the bare column name as the axis label
    ggtitle(paste("Histogram of", var_name))
})
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Lay out all histograms on one page, five plots per row.
do.call(what = grid.arrange, args = c(plot_list, list(ncol = 5)))
# Radius vs. texture, with points coloured by diagnosis.
ggplot(
  data = data1,
  mapping = aes(x = radius_mean, y = texture_mean, color = diagnosis)
) +
  geom_point() +
  labs(title = "Scatter Plot of Radius Mean vs Texture Mean")

# Distribution of radius_mean within each diagnosis class (box plot).
ggplot(data = data1, mapping = aes(x = diagnosis, y = radius_mean)) +
  geom_boxplot() +
  labs(title = "Box Plot of Radius Mean by Diagnosis")

# Same comparison as a violin plot.
ggplot(data = data1, mapping = aes(x = diagnosis, y = radius_mean)) +
  geom_violin() +
  labs(title = "Violin Plot of Radius Mean by Diagnosis")

# Radius vs. texture again, one panel per diagnosis class.
ggplot(data = data1, mapping = aes(x = radius_mean, y = texture_mean)) +
  geom_point() +
  facet_wrap(~ diagnosis) +
  labs(title = "Faceted Scatter Plots for Radius Mean vs Texture Mean")
# Logistic regression of diagnosis on every predictor in `data1`.
model <- glm(
  formula = diagnosis ~ .,
  family = binomial(),
  data = data1
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Variance inflation factors for the all-predictor model.
car::vif(mod = model)
## radius_mean texture_mean perimeter_mean
## 4318063.764 140816.298 1691913.733
## area_mean smoothness_mean compactness_mean
## 6331653.442 213016.610 415839.480
## concavity_mean concave.points_mean symmetry_mean
## 105593.174 192381.201 11851.056
## fractal_dimension_mean radius_se texture_se
## 3513.136 610335.622 356773.142
## perimeter_se area_se smoothness_se
## 49276.836 1109444.175 41333.693
## compactness_se concavity_se concave.points_se
## 473636.453 778242.166 1574955.419
## symmetry_se fractal_dimension_se radius_worst
## 24678.711 463075.232 3511808.081
## texture_worst perimeter_worst area_worst
## 823081.427 617752.398 4767645.798
## smoothness_worst compactness_worst concavity_worst
## 58738.790 91238.174 1705825.343
## concave.points_worst symmetry_worst fractal_dimension_worst
## 561062.026 10409.339 193961.537
# Predictors only (outcome column dropped), their correlation matrix, and the
# Kaiser-Meyer-Olkin sampling-adequacy measure computed from that matrix.
data_fa <- data1[-1]
datamatrix <- cor(x = data_fa)
KMO(r = datamatrix)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = datamatrix)
## Overall MSA = 0.83
## MSA for each item =
## radius_mean texture_mean perimeter_mean
## 0.83 0.64 0.85
## area_mean smoothness_mean compactness_mean
## 0.86 0.81 0.88
## concavity_mean concave.points_mean symmetry_mean
## 0.89 0.90 0.83
## fractal_dimension_mean radius_se texture_se
## 0.83 0.83 0.48
## perimeter_se area_se smoothness_se
## 0.84 0.85 0.64
## compactness_se concavity_se concave.points_se
## 0.87 0.83 0.84
## symmetry_se fractal_dimension_se radius_worst
## 0.58 0.81 0.82
## texture_worst perimeter_worst area_worst
## 0.60 0.88 0.82
## smoothness_worst compactness_worst concavity_worst
## 0.75 0.85 0.90
## concave.points_worst symmetry_worst fractal_dimension_worst
## 0.89 0.69 0.81
Since MSA = 0.83 > 0.5, we can run Factor Analysis.
# Bartlett's test on the correlation matrix, using the number of rows in
# `data1` as the sample size.
psych::cortest.bartlett(R = datamatrix, n = nrow(data1))
## $chisq
## [1] 39362.12
##
## $p.value
## [1] 0
##
## $df
## [1] 435
Bartlett's test gives a chi-square value of 39362.12 with 435 degrees of freedom and a p-value that prints as 0, so the result is significant at an alpha level of 0.05.
# Eigen decomposition of the predictor correlation matrix; print the
# eigenvalues.
ev <- eigen(x = cor(data_fa))
ev[["values"]]
## [1] 1.328161e+01 5.691355e+00 2.817949e+00 1.980640e+00 1.648731e+00
## [6] 1.207357e+00 6.752201e-01 4.766171e-01 4.168948e-01 3.506935e-01
## [11] 2.939157e-01 2.611614e-01 2.413575e-01 1.570097e-01 9.413497e-02
## [16] 7.986280e-02 5.939904e-02 5.261878e-02 4.947759e-02 3.115940e-02
## [21] 2.997289e-02 2.743940e-02 2.434084e-02 1.805501e-02 1.548127e-02
## [26] 8.177640e-03 6.900464e-03 1.589338e-03 7.488031e-04 1.330448e-04
# Scree plot: eigenvalue against factor index, with a reference line at 1.
Eigen_Values <- ev$values
Factor <- seq_along(Eigen_Values)
Scree <- data.frame(Factor = Factor, Eigen_Values = Eigen_Values)
# NOTE(review): ylim = c(0, 4) clips the first two eigenvalues (about 13.3 and
# 5.7 in the printed output) -- confirm the zoom is intentional.
plot(Scree, main = "Scree Plot", col = "Blue", ylim = c(0, 4))
lines(Scree, col = "Red")
abline(h = 1, col = "Green")

# Four-factor principal-axis factor analysis with varimax rotation.
fa_var <- fa(r = data_fa, nfactors = 4, rotate = "varimax", fm = "pa")
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
# Diagram of the factor-analysis solution.
fa.diagram(fa_var)

# Standardise every predictor (centre, then divide by the standard deviation).
# `scale()` works on the whole table in one call, so there is no need to push
# the data frame through `apply()`. The result is a numeric matrix that also
# carries the centring/scaling values as attributes.
scaled_df <- scale(data1[, -1])

# Show the first rows of the standardised data as a styled table.
dt <- head(scaled_df)
kbl(dt) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
| radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave.points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave.points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave.points_worst | symmetry_worst | fractal_dimension_worst |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1.0960995 | -2.0715123 | 1.2688173 | 0.9835095 | 1.5670875 | 3.2806281 | 2.6505418 | 2.5302489 | 2.2155655 | 2.2537638 | 2.4875451 | -0.5647681 | 2.8305403 | 2.4853907 | -0.2138135 | 1.3157039 | 0.7233897 | 0.6602390 | 1.1477468 | 0.9062856 | 1.8850310 | -1.3580985 | 2.3015755 | 1.9994782 | 1.3065367 | 2.6143647 | 2.1076718 | 2.2940576 | 2.7482041 | 1.9353117 |
| 1.8282120 | -0.3533215 | 1.6844726 | 1.9070303 | -0.8262354 | -0.4866435 | -0.0238249 | 0.5476623 | 0.0013911 | -0.8678888 | 0.4988157 | -0.8754733 | 0.2630955 | 0.7417493 | -0.6048187 | -0.6923171 | -0.4403926 | 0.2599334 | -0.8047423 | -0.0993563 | 1.8043398 | -0.3688786 | 1.5337764 | 1.8888270 | -0.3752817 | -0.4300658 | -0.1466200 | 1.0861286 | -0.2436753 | 0.2809428 |
| 1.5784992 | 0.4557859 | 1.5651260 | 1.5575132 | 0.9413821 | 1.0519999 | 1.3622798 | 2.0354398 | 0.9388587 | -0.3976580 | 1.2275958 | -0.7793976 | 0.8501802 | 1.1802975 | -0.2967439 | 0.8142570 | 0.2128891 | 1.4235749 | 0.2368272 | 0.2933013 | 1.5105411 | -0.0239533 | 1.3462906 | 1.4550043 | 0.5269438 | 1.0819801 | 0.8542223 | 1.9532817 | 1.1512420 | 0.2012142 |
| -0.7682333 | 0.2535091 | -0.5921661 | -0.7637917 | 3.2806668 | 3.3999174 | 1.9142129 | 1.4504311 | 2.8648622 | 4.9066020 | 0.3260865 | -0.1103120 | 0.2863415 | -0.2881246 | 0.6890953 | 2.7418679 | 0.8187979 | 1.1140268 | 4.7285198 | 2.0457109 | -0.2812170 | 0.1338663 | -0.2497196 | -0.5495377 | 3.3912907 | 3.8899747 | 1.9878392 | 2.1738732 | 6.0407261 | 4.9306719 |
| 1.7487579 | -1.1508038 | 1.7750113 | 1.8246238 | 0.2801253 | 0.5388663 | 1.3698061 | 1.4272370 | -0.0095521 | -0.5619555 | 1.2694258 | -0.7895490 | 1.2720701 | 1.1893103 | 1.4817634 | -0.0484772 | 0.8277425 | 1.1431989 | -0.3607748 | 0.4988892 | 1.2974336 | -1.4654809 | 1.3373627 | 1.2196511 | 0.2203623 | -0.3131190 | 0.6126397 | 0.7286181 | -0.8675896 | -0.3967505 |
| -0.4759559 | -0.8346009 | -0.3868077 | -0.5052059 | 2.2354545 | 1.2432416 | 0.8655400 | 0.8239307 | 1.0045179 | 1.8883435 | -0.2548461 | -0.5921406 | -0.3210217 | -0.2890039 | 0.1562093 | 0.4451520 | 0.1598845 | -0.0690628 | 0.1340009 | 0.4864178 | -0.1653528 | -0.3135604 | -0.1149083 | -0.2441054 | 2.0467119 | 1.7201029 | 1.2621327 | 0.9050914 | 1.7525273 | 2.2398308 |
# Covariance matrix of the standardised predictors and its eigen
# decomposition (eigenvalues plus eigenvectors).
data.cov <- cov(x = scaled_df)
data.eigen <- eigen(x = data.cov)
utils::str(data.eigen)
## List of 2
## $ values : num [1:30] 13.28 5.69 2.82 1.98 1.65 ...
## $ vectors: num [1:30, 1:30] -0.219 -0.104 -0.228 -0.221 -0.143 ...
## - attr(*, "class")= chr "eigen"
# Keep the first two eigenvectors and display them.
phi <- data.eigen[["vectors"]][, 1:2]
print(x = phi)
## [,1] [,2]
## [1,] -0.21890244 -0.233857132
## [2,] -0.10372458 -0.059706088
## [3,] -0.22753729 -0.215181361
## [4,] -0.22099499 -0.231076711
## [5,] -0.14258969 0.186113023
## [6,] -0.23928535 0.151891610
## [7,] -0.25840048 0.060165363
## [8,] -0.26085376 -0.034767500
## [9,] -0.13816696 0.190348770
## [10,] -0.06436335 0.366575471
## [11,] -0.20597878 -0.105552152
## [12,] -0.01742803 0.089979682
## [13,] -0.21132592 -0.089457234
## [14,] -0.20286964 -0.152292628
## [15,] -0.01453145 0.204430453
## [16,] -0.17039345 0.232715896
## [17,] -0.15358979 0.197207283
## [18,] -0.18341740 0.130321560
## [19,] -0.04249842 0.183848000
## [20,] -0.10256832 0.280092027
## [21,] -0.22799663 -0.219866379
## [22,] -0.10446933 -0.045467298
## [23,] -0.23663968 -0.199878428
## [24,] -0.22487053 -0.219351858
## [25,] -0.12795256 0.172304352
## [26,] -0.21009588 0.143593173
## [27,] -0.22876753 0.097964114
## [28,] -0.25088597 -0.008257235
## [29,] -0.12290456 0.141883349
## [30,] -0.13178394 0.275339469
# Project the standardised data onto the first two eigenvectors and collect
# the scores, labelled by row name, in one data frame.
PC1 <- as.matrix(scaled_df) %*% phi[, 1]
PC2 <- as.matrix(scaled_df) %*% phi[, 2]
PC <- data.frame(x = rownames(data1), PC1, PC2)
head(PC, n = 6L)
## x PC1 PC2
## 1 1 -9.184755 1.946870
## 2 2 -2.385703 -3.764859
## 3 3 -5.728855 -1.074229
## 4 4 -7.116691 10.266556
## 5 5 -3.931842 -1.946359
## 6 6 -2.378155 3.946456
# Plot the observations on the first two component scores, each drawn as its
# row label, with reference lines through zero on both axes.
ggplot(data = PC, mapping = aes(x = PC1, y = PC2)) +
  modelr::geom_ref_line(h = 0) +
  modelr::geom_ref_line(v = 0) +
  geom_text(mapping = aes(label = x), size = 3) +
  labs(
    x = "First Principal Component",
    y = "Second Principal Component",
    title = "First Two Principal Components of Breast Cancer"
  )
# Keep the outcome plus the next six columns of `data1`.
data2 <- data1[, 1:7]

# Reproducible 70/30 split: `dt` holds the sorted training row indices.
set.seed(123)
dt <- sort(sample(x = nrow(data2), size = nrow(data2) * .70))
train <- data2[dt, ]
test <- data2[-dt, ]

# Logistic regression on the training rows.
model <- glm(
  formula = diagnosis ~ .,
  family = binomial(link = "logit"),
  data = train
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table and fit statistics for the training-set model.
summary(object = model)
##
## Call:
## glm(formula = diagnosis ~ ., family = binomial(link = "logit"),
## data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.05199 10.25871 -1.272 0.2033
## radius_mean -4.89296 3.44654 -1.420 0.1557
## texture_mean 0.33265 0.06893 4.826 1.39e-06 ***
## perimeter_mean 0.42229 0.47672 0.886 0.3757
## area_mean 0.03948 0.01689 2.337 0.0195 *
## smoothness_mean 114.03299 26.57068 4.292 1.77e-05 ***
## compactness_mean 3.21523 16.04871 0.200 0.8412
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 515.0 on 397 degrees of freedom
## Residual deviance: 124.1 on 391 degrees of freedom
## AIC: 138.1
##
## Number of Fisher Scoring iterations: 8
# Variance inflation factors for the training-set model.
car::vif(mod = model)
## radius_mean texture_mean perimeter_mean area_mean
## 617.457109 1.495914 501.578328 106.257441
## smoothness_mean compactness_mean
## 3.122056 9.734957
# Forward stepwise logistic regression on the training rows, using a
# significance-level criterion with an entry threshold of 0.05.
stepwiseLogit(
  formula = diagnosis ~ .,
  data = train,
  selection = "forward",
  select = "SL",
  sle = 0.05
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Table 1. Summary of Parameters
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## Paramters Value
## ——————————————————————————————————————————
## Response Variable diagnosis
## Included Variable NULL
## Selection Method forward
## Select Criterion SL
## Entry Significance Level(sle) 0.05
## Variable significance test Rao
## Multicollinearity Terms NULL
## Intercept 1
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##
## Table 2. Variables Type
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## class variable
## —————————————————————————————————————————————————————————————————————————————————————————————
## factor diagnosis
## numeric radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##
## Table 3. Process of Selection
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## Step EnteredEffect RemovedEffect DF NumberIn SL
## ——————————————————————————————————————————————————————————————————————————
## 0 1 1 1 1
## 1 perimeter_mean 1 2 2.24513488967356e-49
## 2 smoothness_mean 1 3 1.08217259042434e-12
## 3 texture_mean 1 4 1.18573248420783e-08
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##
## Table 4. Selected Varaibles
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## variables1 variables2 variables3 variables4
## ———————————————————————————————————————————————————————————
## 1 perimeter_mean smoothness_mean texture_mean
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
##
## Table 5. Coefficients of the Selected Variables
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
## Variable Estimate StdError t.value P.value
## —————————————————————————————————————————————————————————————————————————————————————————————————
## (Intercept) -39.0321512304377 4.99432978394651 -7.81529312619692 5.48352688524868e-15
## perimeter_mean 0.205417660846271 0.0268940585535758 7.63803129367987 2.20568259640575e-14
## smoothness_mean 130.566610524472 21.1401454504411 6.17623993319062 6.56462014341027e-10
## texture_mean 0.337461681104516 0.0671347998059276 5.02662824764572 4.99178694892396e-07
## ‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗‗
# Draw a new 70/30 split of `data2` (no seed is set here) and fit a
# classification tree for diagnosis.
dt <- sort(sample(nrow(data2), nrow(data2) * .7))
train <- data2[dt, ]
test <- data2[-dt, ]

# Fit on the training rows only. The tree was previously fitted on all of
# `data2`, which ignored the split above and let the held-out `test` rows
# influence the model.
rtree <- rpart(diagnosis ~ ., data = train, method = "class")
rpart.plot(rtree)