### Required Packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readxl)
library(caret)
## Loading required package: lattice
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(LogicReg)
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster

Step 1. Scale or normalize your data. Make sure to apply imputation if needed

### Import the data into R
BreastCancer <- read_excel("C:/Users/Dhruva/Desktop/GRAD 699/Assignments/BC.xlsx")

### Assign "Healthy Control", "Patient" for 1 and 2 values in Classification variable
Classification2 <- c("Healthy Control", "Patient")
Classification <- c(1, 2)
dat <- data.frame(Classification2,Classification)
BreastCancer$Classification <- dat$Classification2[match(BreastCancer$Classification, dat$Classification)]
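A quick sanity check on the recoding (a sketch, not part of the original analysis; it should show the two labels with their class counts):

### Confirm the recoded labels and their counts
table(BreastCancer$Classification)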


### Plot of Glucose vs MCP.1
ggplot(data = BreastCancer, aes(x = Glucose, y = MCP.1)) + geom_point(aes(color = Classification))

### Scale or normalize your data by columns
BreastCancer_norm <- as.data.frame(apply(BreastCancer[, 1:9], 2, function(x) (x - min(x))/(max(x)-min(x))))
BreastCancer_norm$Classification <- BreastCancer$Classification
str(BreastCancer_norm)
## 'data.frame':    116 obs. of  10 variables:
##  $ Age           : num  0.369 0.908 0.892 0.677 0.954 ...
##  $ BMI           : num  0.254 0.115 0.235 0.148 0.136 ...
##  $ Glucose       : num  0.0709 0.227 0.2199 0.1206 0.227 ...
##  $ Insulin       : num  0.00491 0.01219 0.03687 0.01417 0.01994 ...
##  $ HOMA          : num  0 0.00974 0.02206 0.00591 0.01375 ...
##  $ Leptin        : num  0.0523 0.0527 0.1585 0.0648 0.0278 ...
##  $ Adiponectin   : num  0.2212 0.1037 0.571 0.1515 0.0869 ...
##  $ Resistin      : num  0.0607 0.0108 0.0769 0.1211 0.0934 ...
##  $ MCP.1         : num  0.225 0.256 0.308 0.534 0.441 ...
##  $ Classification: Factor w/ 2 levels "Healthy Control",..: 1 1 1 1 1 1 1 1 1 1 ...
### Plot of the normalized Glucose vs MCP.1
ggplot(data = BreastCancer_norm, aes(x = Glucose, y = MCP.1)) + geom_point(aes(color = Classification))

The breast cancer data has predictor variables on different scales, so each was min-max normalized to the common range [0, 1]. Since there are no missing values, imputation was not performed. The plots of the actual and normalized data show the same pattern; normalization only rescales the axes.
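Had there been missing values, a simple column-wise imputation could be applied before normalizing. A minimal sketch using the column median (caret's preProcess() with method = "medianImpute" would be an equivalent option); it is a no-op on this dataset:

### Median imputation sketch (this dataset contains no NAs)
impute_median <- function(x) {
  x[is.na(x)] <- median(x, na.rm = TRUE)  # replace missing entries with the column median
  x
}
BreastCancer_imp <- as.data.frame(apply(BreastCancer[, 1:9], 2, impute_median))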

Step 2. Build a multiple linear regression or logistic regression model

### Select the Age, BMI, Glucose, Resistin, Insulin and Classification variables
BreastCancer1 <- dplyr::select(BreastCancer_norm, Age, BMI, Glucose, Resistin, Insulin, Classification)


### Create a Validation Dataset

### Create a list of 70% of the rows in the original dataset to use for training
BreastCancer_index <- createDataPartition(BreastCancer1$Classification, p = 0.70, list = FALSE)
dim(BreastCancer_index)
## [1] 82  1
### Select 30% of the data for validation
BreastCancer_Vald <- BreastCancer1[-BreastCancer_index, ]
dim(BreastCancer_Vald)
## [1] 34  6
### Use the remaining 70% of the data for training and testing the models
BreastCancer2 <- BreastCancer1[BreastCancer_index, ]
dim(BreastCancer2)
## [1] 82  6
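createDataPartition() samples within each class by default, so the 70/30 split should preserve the class balance. A quick check (a sketch, not in the original output):

### Class proportions in the training and validation sets
prop.table(table(BreastCancer2$Classification))
prop.table(table(BreastCancer_Vald$Classification))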
### Run algorithms using 10-fold cross validation
control <- trainControl(method="cv", number=10)
metric <- "Accuracy"

### Lists to collect performance metrics for each model
AUC = list()
Accuracy = list()

### Logistic Regression (LR)
set.seed(10)
fit.lr <- train(Classification ~ ., data = BreastCancer2, method = "glm", metric = metric, trControl = control, family = "binomial")

Step 3. Print summary and interpret table

summary(fit.lr)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.25693  -0.56845   0.01503   0.42325   2.00342  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -4.454      1.502  -2.966 0.003018 ** 
## Age           -2.405      1.459  -1.649 0.099243 .  
## BMI           -4.462      1.479  -3.018 0.002546 ** 
## Glucose       23.679      6.237   3.796 0.000147 ***
## Resistin      15.484      5.327   2.907 0.003651 ** 
## Insulin        3.149      2.470   1.275 0.202323    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 112.894  on 81  degrees of freedom
## Residual deviance:  57.953  on 76  degrees of freedom
## AIC: 69.953
## 
## Number of Fisher Scoring iterations: 7
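Because the second factor level ("Patient") is the modeled outcome, exponentiating the coefficients gives odds ratios; since the predictors were min-max scaled, each ratio reflects the change in odds across a predictor's full observed range. A quick sketch using the glm object caret stores:

### Odds ratios from the fitted logistic regression
exp(coef(fit.lr$finalModel))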
predictions = predict(fit.lr, BreastCancer_Vald)
confusionMatrix(predictions, BreastCancer_Vald$Classification)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Healthy Control Patient
##   Healthy Control              10       7
##   Patient                       5      12
##                                           
##                Accuracy : 0.6471          
##                  95% CI : (0.4649, 0.8025)
##     No Information Rate : 0.5588          
##     P-Value [Acc > NIR] : 0.1946          
##                                           
##                   Kappa : 0.2941          
##                                           
##  Mcnemar's Test P-Value : 0.7728          
##                                           
##             Sensitivity : 0.6667          
##             Specificity : 0.6316          
##          Pos Pred Value : 0.5882          
##          Neg Pred Value : 0.7059          
##              Prevalence : 0.4412          
##          Detection Rate : 0.2941          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.6491          
##                                           
##        'Positive' Class : Healthy Control 
## 

The breast cancer dataset (with predictors Age, BMI, Glucose, Resistin, Insulin and response variable Classification) was split into training and validation sets and a logistic regression model was built. Glucose, BMI, and Resistin (p value < 0.05) were found to be statistically significant. On the validation set, accuracy, sensitivity, and specificity were 0.6471, 0.6667, and 0.6316 respectively.
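Since pROC is already loaded, the validation-set AUC offers a threshold-free complement to accuracy. A sketch, using the predicted probability of the "Patient" class as the score:

### ROC curve and AUC for the logistic regression on the validation set
lr_probs <- predict(fit.lr, BreastCancer_Vald, type = "prob")
roc_lr <- roc(BreastCancer_Vald$Classification, lr_probs$Patient)
auc(roc_lr)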

Step 4. Build another model and evaluate which model performs better

### Support Vector Machine (SVM)
set.seed(10)
fit.svm <- train(Classification ~ ., data = BreastCancer2, method = "svmRadial", metric = metric, trControl = control)

summary(fit.svm)
## Length  Class   Mode 
##      1   ksvm     S4
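summary() is uninformative for the underlying ksvm object; a sketch of how to inspect the tuning caret actually performed (the selected sigma and C):

### Tuning parameters selected by caret's cross-validated grid search
fit.svm$bestTune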
predictions = predict(fit.svm, BreastCancer_Vald)
confusionMatrix(predictions, BreastCancer_Vald$Classification)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction        Healthy Control Patient
##   Healthy Control              12       8
##   Patient                       3      11
##                                           
##                Accuracy : 0.6765          
##                  95% CI : (0.4947, 0.8261)
##     No Information Rate : 0.5588          
##     P-Value [Acc > NIR] : 0.1125          
##                                           
##                   Kappa : 0.3661          
##                                           
##  Mcnemar's Test P-Value : 0.2278          
##                                           
##             Sensitivity : 0.8000          
##             Specificity : 0.5789          
##          Pos Pred Value : 0.6000          
##          Neg Pred Value : 0.7857          
##              Prevalence : 0.4412          
##          Detection Rate : 0.3529          
##    Detection Prevalence : 0.5882          
##       Balanced Accuracy : 0.6895          
##                                           
##        'Positive' Class : Healthy Control 
## 

A support vector machine model was built using the same predictors. On the validation set its accuracy, sensitivity, and specificity were 0.6765, 0.8000, and 0.5789 respectively. Compared with the logistic regression model (accuracy 0.6471, sensitivity 0.6667, specificity 0.6316), the SVM achieved higher accuracy and sensitivity but lower specificity. Judged on overall accuracy, the support vector machine model performs better.
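Because both fits used set.seed(10) and the same 10-fold trainControl, their resampling folds line up and caret's resamples() can compare cross-validation accuracy directly, rather than relying on a single 34-row validation split. A sketch:

### Compare the two models across the shared cross-validation folds
results <- resamples(list(LR = fit.lr, SVM = fit.svm))
summary(results)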