# SRI GANESHYA NAMAH 
# HAR HAR MAHADEV

# Study the data and engineer features first, so that a model built on a carefully selected set of attributes gives the best result.
# Model 1 Random forest
#install.packages('randomForest')
#install.packages('dplyr')
library(randomForest);library(dplyr);library(caTools);library(caret);library(randomForest);library(dplyr);library(corrplot)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## corrplot 0.84 loaded
# Set the working directory used to import the dataset.
# The path below is specific to one machine, so only switch to it when it
# actually exists; otherwise keep the current working directory instead of
# aborting (diab_1.csv must then be reachable from wherever the script runs).
data_dir <- "C:/Users/vipin.dwivedi.IN/Desktop/Guru_Naman/Guru/Krapa/Machine Learning A-Z New/Part 3 - Classification/Section 20 - Random Forest Classification"
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  message("Data directory not found; staying in ", getwd())
}
getwd()
## [1] "C:/Users/vipin.dwivedi.IN/Desktop/Guru_Naman/Guru/Krapa/Machine Learning A-Z New/Part 3 - Classification/Section 20 - Random Forest Classification"
# Import the dataset and recode the text labels in `class` as numeric 0 / 1.

dataset <- read.csv("diab_1.csv", stringsAsFactors = FALSE)

# Lookup from each text label to its code (still text at this point).
label_codes <- c(tested_positive = "1", tested_negative = "0")

# Replace only the labels found in the lookup; anything else is left as-is
# and therefore becomes NA in the numeric conversion below.
df <- dataset$class
known <- df %in% names(label_codes)
df[known] <- label_codes[df[known]]
dataset$class <- as.numeric(df)

# Split the dataset into a training set and a test set.
#install.packages('caTools')

set.seed(789) # fix the RNG state so the split is reproducible
# Logical flag per row; SplitRatio = 0.76 requests about 76% flagged TRUE.
split <- sample.split(dataset$class, SplitRatio = 0.76)
tran_set <- dataset[split, ]
test_set <- dataset[!split, ]

# Fit the random forest as a classifier.
# `class` is numeric 0/1 in tran_set, which makes randomForest() run in
# regression mode (it warns "The response has five or fewer unique values").
# Converting the response to a factor on a copy of the training data selects
# classification without modifying tran_set itself.
# NOTE: the console output recorded below this block was captured from the
# earlier regression fit; re-run the script to refresh it.
rf_train <- tran_set
rf_train$class <- factor(rf_train$class, levels = c(0, 1))
rf_pima <- randomForest(class ~ ., data = rf_train, mtry = 8, ntree = 171,
                        importance = TRUE)

# Testing the Model
#install.packages("caret")
library(caret)
# type = "prob" returns one column of class probabilities per level; keep the
# column for class "1" so rf_probs stays a numeric vector that can be cut at 0.5.
rf_probs <- predict(rf_pima, newdata = test_set, type = "prob")[, "1"]
rf_pred <- ifelse(rf_probs > 0.5, 1, 0)
confusionMatrix(as.factor(rf_pred), as.factor(test_set$class))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 99 23
##          1 21 41
##                                           
##                Accuracy : 0.7609          
##                  95% CI : (0.6926, 0.8206)
##     No Information Rate : 0.6522          
##     P-Value [Acc > NIR] : 0.0009686       
##                                           
##                   Kappa : 0.469           
##                                           
##  Mcnemar's Test P-Value : 0.8801685       
##                                           
##             Sensitivity : 0.8250          
##             Specificity : 0.6406          
##          Pos Pred Value : 0.8115          
##          Neg Pred Value : 0.6613          
##              Prevalence : 0.6522          
##          Detection Rate : 0.5380          
##    Detection Prevalence : 0.6630          
##       Balanced Accuracy : 0.7328          
##                                           
##        'Positive' Class : 0               
## 
# Keep only the overall accuracy of the random forest for the final comparison.
rf_confusion <- confusionMatrix(as.factor(rf_pred), as.factor(test_set$class))
ACC_RandomForest <- rf_confusion$overall["Accuracy"]

# Random forest graphs

# Draw the two diagnostic plots side by side. par() returns the previous
# settings, which are restored afterwards so the 1 x 2 layout does not leak
# into later base-graphics plots in this script.
old_par <- par(mfrow = c(1, 2))
varImpPlot(rf_pima, type = 2, main = "Variable Importance", col = "black")
plot(rf_pima, main = "Error vs no. of trees grown")
par(old_par)

# EDA
# install.packages("corrplot")

library(corrplot)
library(caret)
# Read a fresh copy of the raw data (class labels stay as text here).
# FALSE is spelled out because the shorthand `F` is an ordinary variable that
# can be reassigned.
diadata <- read.csv("diab_1.csv", stringsAsFactors = FALSE)
# First rows, then the column types.
head(diadata)
str(diadata)
##   preg plas pres skin insu mass  pedi age           class
## 1    6  148   72   35    0 33.6 0.627  50 tested_positive
## 2    1   85   66   29    0 26.6 0.351  31 tested_negative
## 3    8  183   64    0    0 23.3 0.672  32 tested_positive
## 4    1   89   66   23   94 28.1 0.167  21 tested_negative
## 5    0  137   40   35  168 43.1 2.288  33 tested_positive
## 6    5  116   74    0    0 25.6 0.201  30 tested_negative
## 'data.frame':    768 obs. of  9 variables:
##  $ preg : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ plas : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ pres : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ skin : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ insu : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedi : num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age  : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ class: chr  "tested_positive" "tested_negative" "tested_positive" "tested_negative" ...
# Tabulate is.na() over every cell of diadata to count missing vs. present values.
table(is.na(diadata))
## 
## FALSE 
##  6912
# Correlation plot of the predictors.
# cor() needs numeric input, so pick the numeric columns by type rather than
# dropping column 9 by position; this keeps working if the column order of the
# CSV changes and excludes the text `class` column.
numeric_cols <- vapply(diadata, is.numeric, logical(1))
corrplot(cor(diadata[, numeric_cols, drop = FALSE]), type = "lower",
         method = "number")

 # SVM Model

# Reload the raw data and recode the text labels in `class` as numeric 0 / 1.
dataset <- read.csv("diab_1.csv", stringsAsFactors = FALSE)
df <- dataset$class
df <- replace(df, which(df == "tested_positive"), "1")
df <- replace(df, which(df == "tested_negative"), "0")
# Any label other than the two above would turn into NA here.
dataset$class <- as.numeric(df)


set.seed(123) # fix the RNG state so the split is reproducible
# Logical flag per row; SplitRatio = 0.75 requests about 75% flagged TRUE.
split <- sample.split(dataset$class, SplitRatio = 0.75)
Traindata <- dataset[split, ]
Testdata <- dataset[!split, ]
# This conversion runs after the subsets were taken, so Traindata and Testdata
# keep a numeric `class`; only `dataset` itself carries the factor from here on.
dataset$class <- as.factor(dataset$class)

library(e1071)

# Grid of candidate hyper-parameters: gamma in 1e-6 .. 1e-1, cost in 0.1 .. 10.
gamma_grid <- 10^(-6:-1)
cost_grid <- 10^(-1:1)
# Search the grid on the training data and print the tuning summary.
tuned <- tune.svm(class ~ ., data = Traindata, gamma = gamma_grid,
                  cost = cost_grid)
summary(tuned)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  gamma cost
##   0.01    1
## 
## - best performance: 0.1648667 
## 
## - Detailed performance results:
##    gamma cost     error dispersion
## 1  1e-06  0.1 0.3181150 0.04238603
## 2  1e-05  0.1 0.3179717 0.04236925
## 3  1e-04  0.1 0.3168878 0.04225689
## 4  1e-03  0.1 0.3061051 0.04098718
## 5  1e-02  0.1 0.2375736 0.03176193
## 6  1e-01  0.1 0.1737100 0.02317924
## 7  1e-06  1.0 0.3179722 0.04238000
## 8  1e-05  1.0 0.3168919 0.04225986
## 9  1e-04  1.0 0.3059737 0.04094285
## 10 1e-03  1.0 0.2345881 0.03171840
## 11 1e-02  1.0 0.1648667 0.02418833
## 12 1e-01  1.0 0.1661992 0.02837449
## 13 1e-06 10.0 0.3168940 0.04225442
## 14 1e-05 10.0 0.3059606 0.04094162
## 15 1e-04 10.0 0.2344471 0.03184534
## 16 1e-03 10.0 0.1674606 0.02720375
## 17 1e-02 10.0 0.1729512 0.02660160
## 18 1e-01 10.0 0.1909880 0.04194911
# Fitting the model
# Use the gamma/cost pair selected by tune.svm() instead of hand-typed values,
# so the fitted model always matches the tuning result. The previous
# hard-coded cost = 10 did not match the recorded best parameters
# (gamma = 0.01, cost = 1).
# NOTE: the console output recorded below this block was captured with
# cost = 10; re-run the script to refresh it.
best_params <- tuned$best.parameters
svm_model <- svm(class ~ ., data = Traindata, kernel = "radial",
                 gamma = best_params$gamma, cost = best_params$cost)
summary(svm_model)
## 
## Call:
## svm(formula = class ~ ., data = Traindata, kernel = "radial", gamma = 0.01, 
##     cost = 10)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  10 
##       gamma:  0.01 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  450
# Score the held-out rows with the fitted SVM.
svm_scores <- predict(svm_model, newdata = Testdata)
# Cut the numeric predictions at 0.5 to obtain 0 / 1 labels.
svm_pred <- ifelse(svm_scores > 0.5, 1, 0)
confusionMatrix(as.factor(svm_pred), as.factor(Testdata$class))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 113  36
##          1  12  31
##                                           
##                Accuracy : 0.75            
##                  95% CI : (0.6826, 0.8096)
##     No Information Rate : 0.651           
##     P-Value [Acc > NIR] : 0.0020761       
##                                           
##                   Kappa : 0.3999          
##                                           
##  Mcnemar's Test P-Value : 0.0009009       
##                                           
##             Sensitivity : 0.9040          
##             Specificity : 0.4627          
##          Pos Pred Value : 0.7584          
##          Neg Pred Value : 0.7209          
##              Prevalence : 0.6510          
##          Detection Rate : 0.5885          
##    Detection Prevalence : 0.7760          
##       Balanced Accuracy : 0.6833          
##                                           
##        'Positive' Class : 0               
## 
# Accuracy of the SVM: keep only the overall accuracy for the final comparison.
svm_confusion <- confusionMatrix(as.factor(svm_pred), as.factor(Testdata$class))
Accur_SVM <- svm_confusion$overall["Accuracy"]
Accur_SVM
## Accuracy 
##     0.75
# Model: logistic regression

set.seed(123) # fix the RNG state so the split is reproducible
# Logical flag per row; SplitRatio = 0.75 requests about 75% flagged TRUE.
split <- sample.split(dataset$class, SplitRatio = 0.75)
Traindata <- dataset[split, ]
Testdata <- dataset[!split, ]
dataset$class <- as.factor(dataset$class)

# Train the full logistic regression: `class` against every other column.

glm_Model1 <- glm(class ~ ., family = binomial, data = Traindata)
summary(glm_Model1)
## 
## Call:
## glm(formula = class ~ ., family = binomial, data = Traindata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5786  -0.7009  -0.4046   0.6694   2.8366  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.597820   0.836126 -10.283  < 2e-16 ***
## preg         0.107268   0.038914   2.756  0.00584 ** 
## plas         0.040055   0.004599   8.709  < 2e-16 ***
## pres        -0.018938   0.006538  -2.897  0.00377 ** 
## skin         0.008982   0.008373   1.073  0.28342    
## insu        -0.003051   0.001179  -2.588  0.00966 ** 
## mass         0.088903   0.017876   4.973 6.58e-07 ***
## pedi         0.794833   0.364194   2.182  0.02908 *  
## age          0.020087   0.011341   1.771  0.07652 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 745.11  on 575  degrees of freedom
## Residual deviance: 527.03  on 567  degrees of freedom
## AIC: 545.03
## 
## Number of Fisher Scoring iterations: 5
# Variables with p-values greater than 0.01 are treated as insignificant.
# NOTE(review): the terms dropped below do not match that rule as stated. In
# the summary of glm_Model1, insu has p = 0.00966 (below 0.01) yet is removed,
# while pedi has p = 0.02908 (above 0.01) yet is kept. Confirm the intended
# selection.

glm_Model2 <- update(glm_Model1, . ~ . - skin - insu - age)
summary(glm_Model2)
## 
## Call:
## glm(formula = class ~ preg + plas + pres + mass + pedi, family = binomial, 
##     data = Traindata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7239  -0.7192  -0.4141   0.6964   2.8894  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.821602   0.770056 -10.157  < 2e-16 ***
## preg         0.151722   0.033154   4.576 4.73e-06 ***
## plas         0.036884   0.004024   9.165  < 2e-16 ***
## pres        -0.015398   0.006118  -2.517   0.0118 *  
## mass         0.084551   0.016569   5.103 3.35e-07 ***
## pedi         0.690636   0.354260   1.950   0.0512 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 745.11  on 575  degrees of freedom
## Residual deviance: 538.28  on 570  degrees of freedom
## AIC: 550.28
## 
## Number of Fisher Scoring iterations: 5
# Score the held-out rows with the reduced logistic model.

# type = "response" requests predictions on the response scale, which are then
# cut at 0.5 to obtain 0 / 1 labels.
glm_probs <- predict(glm_Model2, type = "response", newdata = Testdata)
glm_pred <- ifelse(glm_probs > 0.5, 1, 0)

#print("Confusion Matrix for logistic regression"); 

# Cross-tabulate the thresholded predictions against the actual test labels.
table(Predicted = glm_pred, Actual = Testdata$class)
##          Actual
## Predicted   0   1
##         0 102  29
##         1  23  38
# Confusion Matrix for logistic regression
confusionMatrix(as.factor(glm_pred), as.factor(Testdata$class))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 102  29
##          1  23  38
##                                           
##                Accuracy : 0.7292          
##                  95% CI : (0.6605, 0.7906)
##     No Information Rate : 0.651           
##     P-Value [Acc > NIR] : 0.01287         
##                                           
##                   Kappa : 0.3913          
##                                           
##  Mcnemar's Test P-Value : 0.48807         
##                                           
##             Sensitivity : 0.8160          
##             Specificity : 0.5672          
##          Pos Pred Value : 0.7786          
##          Neg Pred Value : 0.6230          
##              Prevalence : 0.6510          
##          Detection Rate : 0.5312          
##    Detection Prevalence : 0.6823          
##       Balanced Accuracy : 0.6916          
##                                           
##        'Positive' Class : 0               
## 
# Accuracy of the GLM: keep only the overall accuracy for the final comparison.
glm_confusion <- confusionMatrix(as.factor(glm_pred), as.factor(Testdata$class))
Accur_GLM <- glm_confusion$overall["Accuracy"]
Accur_GLM
##  Accuracy 
## 0.7291667
# Model comparison: bar chart of the test-set accuracy of the three models.

modelsname <- c("GLM", "SVM", "RandomForest")
modelsvalue <- c(Accur_GLM, Accur_SVM, ACC_RandomForest)
model_comapre <- data.frame(modelsname, modelsvalue)

accuracy_plot <- ggplot(model_comapre, aes(x = modelsname, y = modelsvalue)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  ggtitle("Comparison of Model Accuracy")
accuracy_plot


# # Radhe radhe
# installed.packages()
# install.packages('xgboost')