# Radhe Radhe
library(randomForest); library(dplyr); library(caTools)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# The modeling techniques applied to this data are Random Forest, Logistic Regression and SVM.
# Importing the dataset and recoding the class attribute into 0 and 1
dataset = read.csv('diab_1.csv',stringsAsFactors=FALSE)
df <- dataset$class
df[df == "tested_positive"] <-"1"
df[df == "tested_negative"] <-"0"
dataset$class <- df
dataset$class = as.numeric(as.character(dataset$class))
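# A more compact equivalent of the recoding above (illustrative sketch only, not part of the run):
# dataset$class <- ifelse(dataset$class == "tested_positive", 1, 0)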
# Splitting the dataset into the Training set and Test set
#install.packages('caTools')
set.seed(789) # fix the random seed so the split is reproducible
split = sample.split(dataset$class, SplitRatio = 0.76)
tran_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)
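# Optional sanity check on the split (sketch, not run here): the outcome proportions should be similar in both sets
# prop.table(table(tran_set$class))
# prop.table(table(test_set$class))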
#Fitting the random forest
rf_pima <- randomForest(class ~., data = tran_set, mtry = 8, ntree=171, importance = TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
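# The warning appears because class is numeric, so randomForest fits a regression forest.
# A classification forest could be fitted instead by passing the response as a factor
# (illustrative sketch, not run here; predict() would then return class labels directly):
# rf_class <- randomForest(as.factor(class) ~ ., data = tran_set, mtry = 8, ntree = 171, importance = TRUE)
# rf_labels <- predict(rf_class, newdata = test_set)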
# Testing the Model
#install.packages("caret")
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
rf_probs <- predict(rf_pima, newdata = test_set)
rf_pred <- ifelse(rf_probs > 0.5, 1, 0)
confusionMatrix(as.factor(rf_pred), as.factor(test_set$class))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 99 23
## 1 21 41
##
## Accuracy : 0.7609
## 95% CI : (0.6926, 0.8206)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 0.0009686
##
## Kappa : 0.469
##
## Mcnemar's Test P-Value : 0.8801685
##
## Sensitivity : 0.8250
## Specificity : 0.6406
## Pos Pred Value : 0.8115
## Neg Pred Value : 0.6613
## Prevalence : 0.6522
## Detection Rate : 0.5380
## Detection Prevalence : 0.6630
## Balanced Accuracy : 0.7328
##
## 'Positive' Class : 0
##
ACC_RandomForest <- confusionMatrix(as.factor(rf_pred), as.factor(test_set$class))$overall['Accuracy']
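# Per-class metrics can also be pulled out programmatically from the same object (sketch, not run here):
# confusionMatrix(as.factor(rf_pred), as.factor(test_set$class))$byClass[c('Sensitivity', 'Specificity')]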
# Random forest graphs
par(mfrow = c(1, 2))
varImpPlot(rf_pima, type = 2, main = "Variable Importance",col = 'black')
plot(rf_pima, main = "Error vs no. of trees grown")

# Logistic Regression Model
set.seed(123)
split = sample.split(dataset$class, SplitRatio = 0.75)
Traindata = subset(dataset, split == TRUE)
Testdata = subset(dataset, split == FALSE)
dataset$class <- as.factor(dataset$class) # note: this converts the full dataset only; Traindata and Testdata were subset above and keep the numeric 0/1 coding
# Training The Model
glm_Model1 <- glm(class ~., data = Traindata, family = binomial)
summary(glm_Model1)
##
## Call:
## glm(formula = class ~ ., family = binomial, data = Traindata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5786 -0.7009 -0.4046 0.6694 2.8366
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.597820 0.836126 -10.283 < 2e-16 ***
## preg 0.107268 0.038914 2.756 0.00584 **
## plas 0.040055 0.004599 8.709 < 2e-16 ***
## pres -0.018938 0.006538 -2.897 0.00377 **
## skin 0.008982 0.008373 1.073 0.28342
## insu -0.003051 0.001179 -2.588 0.00966 **
## mass 0.088903 0.017876 4.973 6.58e-07 ***
## pedi 0.794833 0.364194 2.182 0.02908 *
## age 0.020087 0.011341 1.771 0.07652 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 745.11 on 575 degrees of freedom
## Residual deviance: 527.03 on 567 degrees of freedom
## AIC: 545.03
##
## Number of Fisher Scoring iterations: 5
# Dropping the less significant predictors skin and age (p > 0.05), along with insu, and refitting
glm_Model2 <- update(glm_Model1, ~. - skin - insu - age )
summary(glm_Model2)
##
## Call:
## glm(formula = class ~ preg + plas + pres + mass + pedi, family = binomial,
## data = Traindata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7239 -0.7192 -0.4141 0.6964 2.8894
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.821602 0.770056 -10.157 < 2e-16 ***
## preg 0.151722 0.033154 4.576 4.73e-06 ***
## plas 0.036884 0.004024 9.165 < 2e-16 ***
## pres -0.015398 0.006118 -2.517 0.0118 *
## mass 0.084551 0.016569 5.103 3.35e-07 ***
## pedi 0.690636 0.354260 1.950 0.0512 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 745.11 on 575 degrees of freedom
## Residual deviance: 538.28 on 570 degrees of freedom
## AIC: 550.28
##
## Number of Fisher Scoring iterations: 5
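# The dropped terms can be checked with a likelihood-ratio test, and the remaining coefficients
# read as odds ratios on the exponential scale (illustrative sketch, not part of the run):
# anova(glm_Model2, glm_Model1, test = "Chisq")  # reduced vs full model
# exp(coef(glm_Model2))                          # odds ratios for the retained predictors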
# Testing the Model
glm_probs <- predict(glm_Model2, newdata = Testdata, type = "response")
glm_pred <- ifelse(glm_probs > 0.5, 1, 0)
#print("Confusion Matrix for logistic regression");
table(Predicted = glm_pred, Actual = Testdata$class)
## Actual
## Predicted 0 1
## 0 102 29
## 1 23 38
confusionMatrix(as.factor(glm_pred), as.factor(Testdata$class) ) # Confusion Matrix for logistic regression
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 102 29
## 1 23 38
##
## Accuracy : 0.7292
## 95% CI : (0.6605, 0.7906)
## No Information Rate : 0.651
## P-Value [Acc > NIR] : 0.01287
##
## Kappa : 0.3913
##
## Mcnemar's Test P-Value : 0.48807
##
## Sensitivity : 0.8160
## Specificity : 0.5672
## Pos Pred Value : 0.7786
## Neg Pred Value : 0.6230
## Prevalence : 0.6510
## Detection Rate : 0.5312
## Detection Prevalence : 0.6823
## Balanced Accuracy : 0.6916
##
## 'Positive' Class : 0
##
#Accuracy of the GLM
Accur_GLM <- confusionMatrix(as.factor(glm_pred), as.factor(Testdata$class) )$overall['Accuracy']
Accur_GLM
## Accuracy
## 0.7291667
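# A threshold-free summary of the logistic model could be obtained from an ROC curve
# (sketch only; assumes the pROC package is installed, which is not loaded above):
# library(pROC)
# roc_glm <- roc(Testdata$class, glm_probs)
# auc(roc_glm)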
# SVM Model
dataset = read.csv('diab_1.csv',stringsAsFactors=FALSE)
df <- dataset$class
df[df == "tested_positive"] <-"1"
df[df == "tested_negative"] <-"0"
dataset$class <- df
dataset$class = as.numeric(as.character(dataset$class))
set.seed(123)
split = sample.split(dataset$class, SplitRatio = 0.75)
Traindata = subset(dataset, split == TRUE)
Testdata = subset(dataset, split == FALSE)
dataset$class <- as.factor(dataset$class)
library(e1071)
tuned <- tune.svm(class ~., data =Traindata, gamma = 10^(-6:-1), cost = 10^(-1:1))
summary(tuned)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 0.01 1
##
## - best performance: 0.1648667
##
## - Detailed performance results:
## gamma cost error dispersion
## 1 1e-06 0.1 0.3181150 0.04238603
## 2 1e-05 0.1 0.3179717 0.04236925
## 3 1e-04 0.1 0.3168878 0.04225689
## 4 1e-03 0.1 0.3061051 0.04098718
## 5 1e-02 0.1 0.2375736 0.03176193
## 6 1e-01 0.1 0.1737100 0.02317924
## 7 1e-06 1.0 0.3179722 0.04238000
## 8 1e-05 1.0 0.3168919 0.04225986
## 9 1e-04 1.0 0.3059737 0.04094285
## 10 1e-03 1.0 0.2345881 0.03171840
## 11 1e-02 1.0 0.1648667 0.02418833
## 12 1e-01 1.0 0.1661992 0.02837449
## 13 1e-06 10.0 0.3168940 0.04225442
## 14 1e-05 10.0 0.3059606 0.04094162
## 15 1e-04 10.0 0.2344471 0.03184534
## 16 1e-03 10.0 0.1674606 0.02720375
## 17 1e-02 10.0 0.1729512 0.02660160
## 18 1e-01 10.0 0.1909880 0.04194911
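# The tuning object already stores the winning settings and a refitted model, so they
# need not be retyped by hand (sketch, not run here):
# tuned$best.parameters  # best gamma/cost combination from the grid above
# tuned$best.model       # SVM refitted on Traindata with those parameters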
# Fitting the model (note: cost = 10 is used here, while the tuning above selected cost = 1)
svm_model <- svm(class ~., data = Traindata, kernel = "radial", gamma = 0.01, cost = 10)
summary(svm_model)
##
## Call:
## svm(formula = class ~ ., data = Traindata, kernel = "radial", gamma = 0.01,
## cost = 10)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 10
## gamma: 0.01
## epsilon: 0.1
##
##
## Number of Support Vectors: 450
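# As with the random forest, the numeric response makes svm() default to eps-regression.
# A classification SVM could be requested explicitly (illustrative sketch, not run here):
# svm_class <- svm(as.factor(class) ~ ., data = Traindata, kernel = "radial",
#                  gamma = 0.01, cost = 10, type = "C-classification")
# predict(svm_class, newdata = Testdata)  # returns class labels, no 0.5 threshold needed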
# Testing the model
svm_pred <- predict(svm_model, newdata = Testdata)
svm_pred <- ifelse(svm_pred > 0.5, 1, 0)
(CF <- confusionMatrix(as.factor(svm_pred), as.factor(Testdata$class)))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 113 36
## 1 12 31
##
## Accuracy : 0.75
## 95% CI : (0.6826, 0.8096)
## No Information Rate : 0.651
## P-Value [Acc > NIR] : 0.0020761
##
## Kappa : 0.3999
##
## Mcnemar's Test P-Value : 0.0009009
##
## Sensitivity : 0.9040
## Specificity : 0.4627
## Pos Pred Value : 0.7584
## Neg Pred Value : 0.7209
## Prevalence : 0.6510
## Detection Rate : 0.5885
## Detection Prevalence : 0.7760
## Balanced Accuracy : 0.6833
##
## 'Positive' Class : 0
##
#Accuracy of the SVM
(Accur_SVM <- CF$overall['Accuracy'])
## Accuracy
## 0.75
# Model Comparison
modelsname <- c('GLM','SVM','RandomForest')
modelsvalue <- c(Accur_GLM,Accur_SVM,ACC_RandomForest)
model_compare <- data.frame(modelsname, modelsvalue)
ggplot(model_compare, aes(x = modelsname, y = modelsvalue)) + geom_bar(stat = 'identity') + theme_bw() + ggtitle('Comparison of Model Accuracy')
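# The bars could be labelled with the accuracies themselves (sketch, not part of the run):
# ggplot(model_compare, aes(x = modelsname, y = modelsvalue)) +
#   geom_bar(stat = 'identity') +
#   geom_text(aes(label = round(modelsvalue, 3)), vjust = -0.5) +
#   theme_bw() + ggtitle('Comparison of Model Accuracy')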
