### Required Packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.84 loaded
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
library(ggplot2)
library(readxl)
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(LogicReg)
### Import the data into R
# NOTE(review): absolute, machine-specific path; the script only runs where this file exists.
BreastCancer <- read_excel("C:/Users/Dhruva/Desktop/GRAD 699/Assignments/BC.xlsx")
### Recode Classification: 1 -> "Healthy Control", 2 -> "Patient"
# Lookup table pairing each numeric code with its label.
Classification2 <- c("Healthy Control", "Patient")
Classification <- c(1, 2)
# stringsAsFactors is set explicitly: since R 4.0.0 data.frame() no longer
# converts strings to factors by default, which would leave the recoded column
# as character. The binomial glm() fitted later in this script needs a factor
# (or 0/1) response.
dat <- data.frame(Classification2, Classification, stringsAsFactors = TRUE)
# match() finds the lookup row for each observed code; indexing the label
# factor with it yields a two-level factor response.
BreastCancer$Classification <- dat$Classification2[match(BreastCancer$Classification, dat$Classification)]
str(BreastCancer)
## tibble [116 x 10] (S3: tbl_df/tbl/data.frame)
## $ Age : num [1:116] 48 83 82 68 86 49 89 76 73 75 ...
## $ BMI : num [1:116] 23.5 20.7 23.1 21.4 21.1 ...
## $ Glucose : num [1:116] 70 92 91 77 92 92 77 118 97 83 ...
## $ Insulin : num [1:116] 2.71 3.12 4.5 3.23 3.55 ...
## $ HOMA : num [1:116] 0.467 0.707 1.01 0.613 0.805 ...
## $ Leptin : num [1:116] 8.81 8.84 17.94 9.88 6.7 ...
## $ Adiponectin : num [1:116] 9.7 5.43 22.43 7.17 4.82 ...
## $ Resistin : num [1:116] 8 4.06 9.28 12.77 10.58 ...
## $ MCP.1 : num [1:116] 417 469 555 928 774 ...
## $ Classification: Factor w/ 2 levels "Healthy Control",..: 1 1 1 1 1 1 1 1 1 1 ...
The breast cancer dataset consists of 10 attributes and 116 observations: 9 independent attributes used as predictors and a classification variable. The predictors are Age, body mass index in kg/m2 (BMI), Glucose (mg/dL), Insulin (µU/mL), Homeostatic Model Assessment (HOMA), Leptin (ng/mL), Adiponectin (µg/mL), Resistin (ng/mL), and Monocyte Chemoattractant Protein-1 (MCP-1) (pg/dL). The classification variable takes the values 1 and 2, where 1 = Healthy Control and 2 = Patient.
# Correlation analysis on the numeric predictors only.
# The response is dropped by name rather than by position (column 10), so the
# selection still holds if columns are added or reordered.
BreastCancer_X <- dplyr::select(BreastCancer, -Classification)
# Correlation matrix of the predictors and its clustered upper-triangle plot
datamatrix <- cor(BreastCancer_X)
corrplot(datamatrix, order = "hclust", type = "upper", tl.srt = 45)
# rcorr() result holds the Pearson correlations (r) and their p-values (P)
res2 <- rcorr(as.matrix(BreastCancer_X), type = "pearson")
# Extract the correlation coefficients
res2$r
## Age BMI Glucose Insulin HOMA
## Age 1.000000000 0.008529857 0.2301056 0.03249535 0.12703259
## BMI 0.008529857 1.000000000 0.1388452 0.14529526 0.11448013
## Glucose 0.230105617 0.138845189 1.0000000 0.50465307 0.69621182
## Insulin 0.032495353 0.145295260 0.5046531 1.00000000 0.93219777
## HOMA 0.127032593 0.114480131 0.6962118 0.93219777 1.00000000
## Leptin 0.102626049 0.569592606 0.3050799 0.30146162 0.32720986
## Adiponectin -0.219812891 -0.302734758 -0.1221213 -0.03129608 -0.05633712
## Resistin 0.002741708 0.195350206 0.2913275 0.14673099 0.23110123
## MCP.1 0.013461678 0.224038215 0.2648793 0.17435580 0.25952919
## Leptin Adiponectin Resistin MCP.1
## Age 0.10262605 -0.21981289 0.002741708 0.01346168
## BMI 0.56959261 -0.30273476 0.195350206 0.22403821
## Glucose 0.30507994 -0.12212131 0.291327462 0.26487927
## Insulin 0.30146162 -0.03129608 0.146730986 0.17435580
## HOMA 0.32720986 -0.05633712 0.231101229 0.25952919
## Leptin 1.00000000 -0.09538874 0.256233522 0.01400862
## Adiponectin -0.09538874 1.00000000 -0.252363303 -0.20069450
## Resistin 0.25623352 -0.25236330 1.000000000 0.36647421
## MCP.1 0.01400862 -0.20069450 0.366474210 1.00000000
# Matrix of p-values for the pairwise correlations
res2[["P"]]
## Age BMI Glucose Insulin HOMA
## Age NA 9.275910e-01 1.295826e-02 7.291271e-01 0.1741852421
## BMI 0.92759104 NA 1.371672e-01 1.196603e-01 0.2210792307
## Glucose 0.01295826 1.371672e-01 NA 7.636710e-09 0.0000000000
## Insulin 0.72912710 1.196603e-01 7.636710e-09 NA 0.0000000000
## HOMA 0.17418524 2.210792e-01 0.000000e+00 0.000000e+00 NA
## Leptin 0.27297221 2.514744e-11 8.685101e-04 1.007112e-03 0.0003368068
## Adiponectin 0.01774517 9.561926e-04 1.915741e-01 7.387565e-01 0.5480560517
## Resistin 0.97669762 3.559778e-02 1.509606e-03 1.160094e-01 0.0125614612
## MCP.1 0.88595609 1.562110e-02 4.060124e-03 6.122075e-02 0.0049033192
## Leptin Adiponectin Resistin MCP.1
## Age 2.729722e-01 0.0177451672 9.766976e-01 8.859561e-01
## BMI 2.514744e-11 0.0009561926 3.559778e-02 1.562110e-02
## Glucose 8.685101e-04 0.1915741395 1.509606e-03 4.060124e-03
## Insulin 1.007112e-03 0.7387565182 1.160094e-01 6.122075e-02
## HOMA 3.368068e-04 0.5480560517 1.256146e-02 4.903319e-03
## Leptin NA 0.3084086254 5.497439e-03 8.813557e-01
## Adiponectin 3.084086e-01 NA 6.276340e-03 3.075981e-02
## Resistin 5.497439e-03 0.0062763402 NA 5.210132e-05
## MCP.1 8.813557e-01 0.0307598108 5.210132e-05 NA
# Correlation plot in which correlations that are not significant at the
# 0.01 level are left blank
corrplot(
  res2$r,
  p.mat = res2$P,
  sig.level = 0.01,
  insig = "blank",
  type = "upper",
  order = "hclust"
)
# Logistic regression of Classification on every other column of BreastCancer.
# The response is given as a bare column name so that it is resolved through
# `data`, like the predictors; writing `BreastCancer$Classification ~ .` ties
# the response to the global object instead of the `data` argument.
BC_lr <- glm(Classification ~ ., data = BreastCancer, family = "binomial")
# Coefficient table, deviances and AIC of the fitted model
summary(BC_lr)
##
## Call:
## glm(formula = BreastCancer$Classification ~ ., family = "binomial",
## data = BreastCancer)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2992 -0.8548 0.1847 0.7429 2.1632
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.6512154 3.3580998 -1.683 0.09240 .
## Age -0.0233524 0.0156230 -1.495 0.13498
## BMI -0.1501231 0.0674938 -2.224 0.02613 *
## Glucose 0.1055941 0.0348082 3.034 0.00242 **
## Insulin 0.2071782 0.2629802 0.788 0.43081
## HOMA -0.5978147 1.0898156 -0.549 0.58332
## Leptin -0.0101709 0.0172662 -0.589 0.55582
## Adiponectin -0.0052619 0.0375568 -0.140 0.88858
## Resistin 0.0585546 0.0298523 1.961 0.04982 *
## MCP.1 0.0006975 0.0008068 0.865 0.38730
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 159.57 on 115 degrees of freedom
## Residual deviance: 111.73 on 106 degrees of freedom
## AIC: 131.73
##
## Number of Fisher Scoring iterations: 7
# Variance inflation factor of each predictor in the fitted logistic model
car::vif(BC_lr)
## Age BMI Glucose Insulin HOMA Leptin
## 1.182857 2.290273 2.999550 51.962636 57.737228 1.976384
## Adiponectin Resistin MCP.1
## 1.243850 1.221740 1.312666
From the correlation plot, Glucose and HOMA are highly correlated, and Insulin and HOMA are also highly correlated. In addition, the Variance Inflation Factor (VIF) test of multicollinearity shows a VIF greater than 2.5 for Glucose (3.0), Insulin (52.0), and HOMA (57.7).
### Testing suitability for factor analysis with the Kaiser-Meyer-Olkin (KMO) test.
# KMO() is applied to `datamatrix`, the predictor correlation matrix from cor(),
# and reports the overall and per-variable measure of sampling adequacy (MSA).
KMO(r=datamatrix)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = datamatrix)
## Overall MSA = 0.53
## MSA for each item =
## Age BMI Glucose Insulin HOMA Leptin
## 0.47 0.46 0.59 0.50 0.52 0.56
## Adiponectin Resistin MCP.1
## 0.57 0.69 0.53
Since the overall MSA is 0.53 (less than 0.6), the data is inadequate for factor analysis. Therefore, step 3 (Number of factors) and step 4 (Factor analysis) were not performed.
# Logistic regression on the full, unreduced data (factor analysis was not
# performed): Classification modelled on every other column of BreastCancer.
# The response is given as a bare column name so that it is resolved through
# `data`, like the predictors, instead of via `BreastCancer$Classification`.
BC_lr <- glm(Classification ~ ., data = BreastCancer, family = "binomial")
# Coefficient table, deviances and AIC of the fitted model
summary(BC_lr)
##
## Call:
## glm(formula = BreastCancer$Classification ~ ., family = "binomial",
## data = BreastCancer)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2992 -0.8548 0.1847 0.7429 2.1632
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.6512154 3.3580998 -1.683 0.09240 .
## Age -0.0233524 0.0156230 -1.495 0.13498
## BMI -0.1501231 0.0674938 -2.224 0.02613 *
## Glucose 0.1055941 0.0348082 3.034 0.00242 **
## Insulin 0.2071782 0.2629802 0.788 0.43081
## HOMA -0.5978147 1.0898156 -0.549 0.58332
## Leptin -0.0101709 0.0172662 -0.589 0.55582
## Adiponectin -0.0052619 0.0375568 -0.140 0.88858
## Resistin 0.0585546 0.0298523 1.961 0.04982 *
## MCP.1 0.0006975 0.0008068 0.865 0.38730
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 159.57 on 115 degrees of freedom
## Residual deviance: 111.73 on 106 degrees of freedom
## AIC: 131.73
##
## Number of Fisher Scoring iterations: 7
# Multicollinearity check: variance inflation factor for each predictor
car::vif(BC_lr)
## Age BMI Glucose Insulin HOMA Leptin
## 1.182857 2.290273 2.999550 51.962636 57.737228 1.976384
## Adiponectin Resistin MCP.1
## 1.243850 1.221740 1.312666
Since the data is inadequate for factor analysis, factor analysis was not performed. Therefore, the logistic regression model was built on the same data. BMI, Glucose, and Resistin (p-value < 0.05) were found to be statistically significant; all other predictors are non-significant. In addition, the Variance Inflation Factor (VIF) test of multicollinearity shows a VIF greater than 2.5 for Glucose (3.0), Insulin (52.0), and HOMA (57.7). Since Insulin and HOMA are non-significant and highly correlated, they can be dropped from the model.