When building a model, we usually start by fitting it with all of the predictor variables to see which ones are most significant for predicting the outcome we care about. We then often remove or add variables to see whether the model improves. For this blog post, I'll use a dataset from Kaggle/UCI: first I'll fit a logistic regression with glm, and then I'll use the “MASS” package to narrow down the most important predictor variables.
library(dplyr)
library(MASS)
library(tidyr)
library(tidyverse)
library(caret)
library(car)
library(pROC)
# Load the data and drop columns 1 and 33, which leaves `diagnosis` as the
# first column. (Column 1 is presumably a row id and column 33 the extra
# column readr reports in its parsing problems -- confirm against data.csv.)
data <- read_csv("data.csv")[, c(-1, -33)]

# Outer split: about 80% of the rows go to `df` for modelling, and the rest
# are held out in `eval.data`. `diagnosis` is still a label column here, so
# the partition is stratified by class.
index <- createDataPartition(data$diagnosis, p = 0.8, list = FALSE, times = 1)
df <- data[index, ]
eval.data <- data[-index, ]  # diagnosis is left as the original labels here

# Recode the outcome for glm(): "M" becomes 1, every other label becomes 0.
df$diagnosis <- ifelse(df$diagnosis == "M", 1, 0)

# Select the outcome by name. After the column drop above, position 2 is
# `radius_mean`, not `diagnosis`, so indexing by position picked the wrong
# variable. Kept as a one-column matrix so `diagnosis[i, ]` still works.
diagnosis <- data.matrix(df[, "diagnosis"])

# Inner split of `df` into train and test. The outcome is passed as a factor
# because createDataPartition() stratifies factors by class level, whereas
# numeric input is grouped by percentiles.
index2 <- createDataPartition(
  factor(df$diagnosis),
  p = 0.8,
  list = FALSE,
  times = 1
)
df.train <- df[index2, ]
df.test <- df[-index2, ]
diagnosis.train <- diagnosis[index2, ]
diagnosis.test <- diagnosis[-index2, ]
# Inspect the structure of the training split. The `##` lines that follow
# are the recorded console output.
str(df.train)
## Classes 'tbl_df', 'tbl' and 'data.frame': 367 obs. of 31 variables:
## $ diagnosis : num 1 1 1 1 1 1 1 1 1 1 ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
## - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 569 obs. of 5 variables:
## ..$ row : int 1 2 3 4 5 6 7 8 9 10 ...
## ..$ col : chr NA NA NA NA ...
## ..$ expected: chr "33 columns" "33 columns" "33 columns" "33 columns" ...
## ..$ actual : chr "32 columns" "32 columns" "32 columns" "32 columns" ...
## ..$ file : chr "'data.csv'" "'data.csv'" "'data.csv'" "'data.csv'" ...
# Per-column summary statistics of the training split. The `##` lines that
# follow are the recorded console output.
summary(df.train)
## diagnosis radius_mean texture_mean perimeter_mean
## Min. :0.000 Min. : 6.981 Min. : 9.71 Min. : 43.79
## 1st Qu.:0.000 1st Qu.:11.705 1st Qu.:16.00 1st Qu.: 74.86
## Median :0.000 Median :13.280 Median :18.89 Median : 85.98
## Mean :0.376 Mean :14.061 Mean :19.16 Mean : 91.57
## 3rd Qu.:1.000 3rd Qu.:15.475 3rd Qu.:21.79 3rd Qu.:103.45
## Max. :1.000 Max. :28.110 Max. :32.47 Max. :188.50
## area_mean smoothness_mean compactness_mean concavity_mean
## Min. : 143.5 Min. :0.05263 Min. :0.02650 Min. :0.00000
## 1st Qu.: 420.1 1st Qu.:0.08679 1st Qu.:0.06298 1st Qu.:0.02699
## Median : 546.3 Median :0.09579 Median :0.09509 Median :0.06155
## Mean : 649.6 Mean :0.09635 Mean :0.10475 Mean :0.08878
## 3rd Qu.: 746.8 3rd Qu.:0.10520 3rd Qu.:0.13035 3rd Qu.:0.13000
## Max. :2501.0 Max. :0.16340 Max. :0.34540 Max. :0.42640
## concave points_mean symmetry_mean fractal_dimension_mean radius_se
## Min. :0.00000 Min. :0.1305 Min. :0.05024 Min. :0.1144
## 1st Qu.:0.01969 1st Qu.:0.1623 1st Qu.:0.05788 1st Qu.:0.2329
## Median :0.03370 Median :0.1794 Median :0.06140 Median :0.3305
## Mean :0.04875 Mean :0.1821 Mean :0.06305 Mean :0.4140
## 3rd Qu.:0.07214 3rd Qu.:0.1966 3rd Qu.:0.06630 3rd Qu.:0.5108
## Max. :0.18450 Max. :0.2906 Max. :0.09744 Max. :2.8730
## texture_se perimeter_se area_se smoothness_se
## Min. :0.3602 Min. : 0.757 Min. : 7.326 Min. :0.002887
## 1st Qu.:0.8274 1st Qu.: 1.609 1st Qu.: 17.855 1st Qu.:0.005288
## Median :1.1110 Median : 2.344 Median : 24.790 Median :0.006470
## Mean :1.2263 Mean : 2.951 Mean : 41.518 Mean :0.007138
## 3rd Qu.:1.5045 3rd Qu.: 3.489 3rd Qu.: 47.715 3rd Qu.:0.008183
## Max. :3.6470 Max. :21.980 Max. :542.200 Max. :0.031130
## compactness_se concavity_se concave points_se symmetry_se
## Min. :0.00371 Min. :0.00000 Min. :0.000000 Min. :0.007882
## 1st Qu.:0.01281 1st Qu.:0.01443 1st Qu.:0.007654 1st Qu.:0.015210
## Median :0.02025 Median :0.02586 Median :0.010930 Median :0.018970
## Mean :0.02608 Mean :0.03231 Mean :0.011991 Mean :0.021047
## 3rd Qu.:0.03291 3rd Qu.:0.04414 3rd Qu.:0.015135 3rd Qu.:0.024340
## Max. :0.13540 Max. :0.39600 Max. :0.052790 Max. :0.078950
## fractal_dimension_se radius_worst texture_worst perimeter_worst
## Min. :0.0008948 Min. : 7.93 Min. :12.02 Min. : 50.41
## 1st Qu.:0.0022115 1st Qu.:12.99 1st Qu.:20.79 1st Qu.: 84.09
## Median :0.0032240 Median :14.91 Median :25.50 Median : 97.65
## Mean :0.0039349 Mean :16.18 Mean :25.49 Mean :106.83
## 3rd Qu.:0.0046290 3rd Qu.:18.23 3rd Qu.:29.80 3rd Qu.:123.45
## Max. :0.0298400 Max. :36.04 Max. :47.16 Max. :251.20
## area_worst smoothness_worst compactness_worst concavity_worst
## Min. : 185.2 Min. :0.08484 Min. :0.04327 Min. :0.0000
## 1st Qu.: 514.0 1st Qu.:0.11810 1st Qu.:0.14360 1st Qu.:0.1090
## Median : 683.4 Median :0.13110 Median :0.21010 Median :0.2249
## Mean : 872.7 Mean :0.13202 Mean :0.25324 Mean :0.2694
## 3rd Qu.:1031.0 3rd Qu.:0.14430 3rd Qu.:0.33950 3rd Qu.:0.3790
## Max. :4254.0 Max. :0.20980 Max. :1.05800 Max. :1.1050
## concave points_worst symmetry_worst fractal_dimension_worst
## Min. :0.00000 Min. :0.1565 Min. :0.05504
## 1st Qu.:0.06363 1st Qu.:0.2508 1st Qu.:0.07106
## Median :0.10150 Median :0.2833 Median :0.07999
## Mean :0.11387 Mean :0.2917 Mean :0.08418
## 3rd Qu.:0.16190 3rd Qu.:0.3169 3rd Qu.:0.09330
## Max. :0.29100 Max. :0.6638 Max. :0.20750
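As you can see below, when we fit a glm with all of the variables, we can't clearly tell which variable is the most important for our model: every coefficient has a p-value close to 1, and the variance inflation factors are very large, which points to strong collinearity among the predictors.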
# Full logistic regression: `diagnosis` modelled on every other column of
# the training split.
# NOTE(review): the recorded output below shows 25 Fisher scoring iterations
# and a residual deviance close to zero -- check glm()'s warnings for
# non-convergence / fitted probabilities of 0 or 1.
model1 <- glm(
  diagnosis ~ .,
  family = "binomial",
  data = df.train
)
summary(model1)
##
## Call:
## glm(formula = diagnosis ~ ., family = "binomial", data = df.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.025e-04 -2.100e-08 -2.100e-08 2.100e-08 2.663e-04
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.112e+03 2.974e+05 -0.004 0.997
## radius_mean -2.016e+02 9.001e+04 -0.002 0.998
## texture_mean 2.489e+01 3.210e+03 0.008 0.994
## perimeter_mean -6.327e+00 1.087e+04 -0.001 1.000
## area_mean 1.578e+00 4.480e+02 0.004 0.997
## smoothness_mean 3.766e+03 5.858e+05 0.006 0.995
## compactness_mean -6.862e+03 5.756e+05 -0.012 0.990
## concavity_mean 5.553e+03 4.091e+05 0.014 0.989
## `concave points_mean` 3.446e+03 7.171e+05 0.005 0.996
## symmetry_mean -2.241e+03 2.981e+05 -0.008 0.994
## fractal_dimension_mean 1.626e+03 1.545e+06 0.001 0.999
## radius_se 1.067e+03 2.474e+05 0.004 0.997
## texture_se 4.790e+01 2.054e+04 0.002 0.998
## perimeter_se -1.152e+02 2.029e+04 -0.006 0.995
## area_se 6.431e+00 2.068e+03 0.003 0.998
## smoothness_se 1.389e+03 2.252e+06 0.001 1.000
## compactness_se 5.961e+03 5.487e+05 0.011 0.991
## concavity_se -4.060e+02 5.113e+05 -0.001 0.999
## `concave points_se` 2.064e+04 2.348e+06 0.009 0.993
## symmetry_se -1.426e+04 1.149e+06 -0.012 0.990
## fractal_dimension_se -1.349e+05 1.006e+07 -0.013 0.989
## radius_worst 3.387e+01 2.480e+04 0.001 0.999
## texture_worst 1.742e-01 2.374e+03 0.000 1.000
## perimeter_worst 6.695e+00 3.931e+03 0.002 0.999
## area_worst 1.580e-01 2.198e+02 0.001 0.999
## smoothness_worst -3.201e+03 4.079e+05 -0.008 0.994
## compactness_worst -5.779e+02 2.032e+05 -0.003 0.998
## concavity_worst -2.618e+02 7.819e+04 -0.003 0.997
## `concave points_worst` 2.969e+03 3.980e+05 0.007 0.994
## symmetry_worst 2.746e+03 2.470e+05 0.011 0.991
## fractal_dimension_worst 1.490e+04 1.117e+06 0.013 0.989
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4.8597e+02 on 366 degrees of freedom
## Residual deviance: 9.1058e-07 on 336 degrees of freedom
## AIC: 62
##
## Number of Fisher Scoring iterations: 25
# Variance inflation factor for each predictor of the full model. The `##`
# lines that follow are the recorded console output.
vif(model1)
## radius_mean texture_mean perimeter_mean
## 22975.16459 274.73834 15107.44106
## area_mean smoothness_mean compactness_mean
## 4199.30939 43.05623 843.43936
## concavity_mean `concave points_mean` symmetry_mean
## 462.86267 194.22665 71.78244
## fractal_dimension_mean radius_se texture_se
## 120.33742 1270.15352 142.56353
## perimeter_se area_se smoothness_se
## 374.93267 641.56364 51.42448
## compactness_se concavity_se `concave points_se`
## 179.49248 240.08701 170.84898
## symmetry_se fractal_dimension_se radius_worst
## 68.65317 605.75771 2153.13481
## texture_worst perimeter_worst area_worst
## 303.57638 2722.45984 1558.88043
## smoothness_worst compactness_worst concavity_worst
## 51.01320 1289.18809 245.41268
## `concave points_worst` symmetry_worst fractal_dimension_worst
## 212.62260 259.71846 506.36665
# Stepwise model selection by AIC, starting from the full model.
# `trace = FALSE` suppresses the per-step log; it is spelled out because `F`
# is an ordinary variable that can be reassigned.
model2 <- stepAIC(model1, trace = FALSE)
summary(model2)
##
## Call:
## glm(formula = diagnosis ~ radius_mean + texture_mean + compactness_mean +
## concavity_mean + `concave points_mean` + symmetry_mean +
## radius_se + texture_se + perimeter_se + compactness_se +
## `concave points_se` + symmetry_se + fractal_dimension_se +
## perimeter_worst + compactness_worst + `concave points_worst` +
## symmetry_worst + fractal_dimension_worst, family = "binomial",
## data = df.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.872e-04 -2.000e-08 -2.000e-08 2.000e-08 9.034e-04
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.641e+03 2.686e+05 -0.028 0.977
## radius_mean -3.707e+02 1.171e+04 -0.032 0.975
## texture_mean 8.697e+01 2.761e+03 0.031 0.975
## compactness_mean -1.781e+04 5.914e+05 -0.030 0.976
## concavity_mean 1.061e+04 2.785e+05 0.038 0.970
## `concave points_mean` 9.329e+03 3.722e+05 0.025 0.980
## symmetry_mean -6.644e+03 2.137e+05 -0.031 0.975
## radius_se 6.414e+03 1.695e+05 0.038 0.970
## texture_se 2.060e+02 7.517e+03 0.027 0.978
## perimeter_se -4.548e+02 1.116e+04 -0.041 0.967
## compactness_se 2.269e+04 5.904e+05 0.038 0.969
## `concave points_se` 5.486e+04 3.783e+06 0.014 0.988
## symmetry_se -4.796e+04 1.958e+06 -0.024 0.980
## fractal_dimension_se -3.571e+05 1.057e+07 -0.034 0.973
## perimeter_worst 5.938e+01 2.023e+03 0.029 0.977
## compactness_worst -3.120e+03 9.447e+04 -0.033 0.974
## `concave points_worst` 1.024e+04 3.229e+05 0.032 0.975
## symmetry_worst 8.618e+03 3.824e+05 0.023 0.982
## fractal_dimension_worst 3.976e+04 1.322e+06 0.030 0.976
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4.8597e+02 on 366 degrees of freedom
## Residual deviance: 6.7515e-06 on 348 degrees of freedom
## AIC: 38
##
## Number of Fisher Scoring iterations: 25
# Variance inflation factor for each predictor kept by the stepwise model.
# The `##` lines that follow are the recorded console output.
vif(model2)
## radius_mean texture_mean compactness_mean
## 2727.4606 1663.6218 4528.4794
## concavity_mean `concave points_mean` symmetry_mean
## 1813.1617 311.7594 356.8702
## radius_se texture_se perimeter_se
## 4427.4354 161.8938 784.1154
## compactness_se `concave points_se` symmetry_se
## 1955.7570 7737.3427 1897.0334
## fractal_dimension_se perimeter_worst compactness_worst
## 9437.6825 4491.7922 1627.7298
## `concave points_worst` symmetry_worst fractal_dimension_worst
## 743.7115 3712.0842 3668.7667
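We can see how powerful the stepAIC function from the “MASS” package is. It gives us the best model, as judged by AIC, by doing stepwise selection: here the AIC drops from 62 to 38 and the model shrinks from 30 predictors to 18. Using this package saves time, because it quickly produces a second model from the first one when you are not sure which combination of variables will give you the best outcome. Note, however, that in the second model the p-values are still close to 1 and the variance inflation factors are still very large, so the selected variables should not yet be read as individually significant.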