This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(knitr)
library(tidyverse) #Required for analysis, visualization
## -- Attaching packages ------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 1.0.2
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(plotly) # Required for interactive plotting of graphs
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggsci) #Themes package for the plots
diab<-read_csv("diabetes.csv") # It reads the CSV file and assigns to diab object
## Parsed with column specification:
## cols(
## Pregnancies = col_double(),
## Glucose = col_double(),
## BloodPressure = col_double(),
## SkinThickness = col_double(),
## Insulin = col_double(),
## BMI = col_double(),
## DiabetesPedigreeFunction = col_double(),
## Age = col_double(),
## Outcome = col_double()
## )
You can also embed plots, for example:
head(diab) # Elements of first few rows
## # A tibble: 6 x 9
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## # ... with 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>,
## # Outcome <dbl>
tail(diab)
## # A tibble: 6 x 9
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9 89 62 0 0 22.5
## 2 10 101 76 48 180 32.9
## 3 2 122 70 27 0 36.8
## 4 5 121 72 23 112 26.2
## 5 1 126 60 0 0 30.1
## 6 1 93 70 31 0 30.4
## # ... with 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>,
## # Outcome <dbl>
colnames(diab)
## [1] "Pregnancies" "Glucose"
## [3] "BloodPressure" "SkinThickness"
## [5] "Insulin" "BMI"
## [7] "DiabetesPedigreeFunction" "Age"
## [9] "Outcome"
str(diab)
## tibble [768 x 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Pregnancies : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : num [1:768] 1 0 1 0 1 0 1 0 1 1 ...
## - attr(*, "spec")=
## .. cols(
## .. Pregnancies = col_double(),
## .. Glucose = col_double(),
## .. BloodPressure = col_double(),
## .. SkinThickness = col_double(),
## .. Insulin = col_double(),
## .. BMI = col_double(),
## .. DiabetesPedigreeFunction = col_double(),
## .. Age = col_double(),
## .. Outcome = col_double()
## .. )
summary(diab)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
diab$Outcome<-factor(diab$Outcome)
class(diab$Outcome)
## [1] "factor"
p1<-ggplot(diab,aes(x=Age,y=Pregnancies,col=Outcome))+geom_point()+geom_smooth(method="loess", se=T)+facet_grid(.~Outcome)
ggplotly(p1)
## `geom_smooth()` using formula 'y ~ x'
p2<-ggplot(diab,aes(x=Age,y=Pregnancies))+geom_boxplot(aes(fill=Outcome))+facet_wrap(Outcome~.)
ggplotly(p2)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
p3<-ggplot(diab,aes(x=Pregnancies))+geom_density(aes(fill=Outcome),alpha=0.6)+
geom_vline(aes(xintercept=mean(Pregnancies)),
color="blue", linetype="dashed", size=1)+facet_grid(.~Outcome)+scale_fill_aaas()
ggplotly(p3)
p3<-ggplot(diab,aes(x=Age, y=Pregnancies, size=Glucose, fill=BloodPressure))+geom_point(alpha=0.2)+
facet_grid(.~Outcome)+geom_jitter(width = 0.4)+scale_x_continuous(limits = c(18, 80))+scale_fill_material("red")
ggplotly(p3)
## Warning in L$marker$color[idx] <- aes2plotly(data, params, "fill")[idx]:
## number of items to replace is not a multiple of replacement length
## Warning in L$marker$color[idx] <- aes2plotly(data, params, "fill")[idx]:
## number of items to replace is not a multiple of replacement length
## Warning in L$marker$color[idx] <- aes2plotly(data, params, "fill")[idx]:
## number of items to replace is not a multiple of replacement length
## Warning in L$marker$color[idx] <- aes2plotly(data, params, "fill")[idx]:
## number of items to replace is not a multiple of replacement length
library(caret) # ML package for various methods
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Create the training and test datasets
set.seed(100)
hci<-diab
# Step 1: Get row numbers for the training data
trainRowNumbers <- createDataPartition(hci$Outcome, p=0.8, list=FALSE) # Data partition for dividing the dataset into training and testing data set. This is useful for cross validation
# Step 2: Create the training dataset
trainData <- hci[trainRowNumbers,]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Step 3: Create the test dataset
testData <- hci[-trainRowNumbers,]
# Store X and Y for later use.
x = trainData[, 1:8]
y=trainData$Outcome
xt= testData[, 1:8]
yt=testData$Outcome
# # See the structure of the new dataset
preProcess_range_modeltr <- preProcess(trainData, method='range')
preProcess_range_modelts <- preProcess(testData, method='range')
trainData <- predict(preProcess_range_modeltr, newdata = trainData)
testData <- predict(preProcess_range_modelts, newdata = testData)
# Append the Y variable
trainData$Outcome <- y
testData$Outcome<-yt
levels(trainData$Outcome) <- c("Class0", "Class1") # Convert binary outcome into character for caret package
levels(testData$Outcome) <- c("Class0", "Class1")
#apply(trainData[, 1:8], 2, FUN=function(x){c('min'=min(x), 'max'=max(x))})
#str(trainData)
#fit control
fitControl <- trainControl(
method = 'cv', # k-fold cross validation
number = 5, # number of folds
savePredictions = 'final', # saves predictions for optimal tuning parameter
classProbs = T, # should class probabilities be returned
summaryFunction=twoClassSummary # results summary function
)
# Step 1: Tune hyper parameters by setting tuneLength
set.seed(100)
model1 = train(Outcome ~ ., data=trainData, method='lda', tuneLength = 5, metric='ROC', trControl = fitControl)
model2 = train(Outcome ~ ., data=trainData, method='knn', tuneLength=2, trControl = fitControl) #KNN Model
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
model3 = train(Outcome ~ ., data=trainData, method='svmRadial', tuneLength=2, trControl = fitControl) #SVM
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
model4 = train(Outcome ~ ., data=trainData, method='rpart', tuneLength=2, trControl = fitControl) #RandomForest
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
model5 = train(Outcome ~ ., data=trainData, method='adaboost', tuneLength=2, trControl = fitControl) #Adaboost
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
# Compare model performances using resample()
models_compare <- resamples(list(LDA=model1,KNN=model2,SVM=model3,RF=model4, ADA=model5))
# Summary of the models performances
summary(models_compare)
##
## Call:
## summary.resamples(object = models_compare)
##
## Models: LDA, KNN, SVM, RF, ADA
## Number of resamples: 5
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LDA 0.7886628 0.8267442 0.8284884 0.8309884 0.8441860 0.8668605 0
## KNN 0.7204942 0.7507267 0.7848837 0.7715116 0.7976744 0.8037791 0
## SVM 0.8305233 0.8328488 0.8372093 0.8363372 0.8377907 0.8433140 0
## RF 0.6478198 0.6702035 0.6879360 0.6915698 0.6879360 0.7639535 0
## ADA 0.7311047 0.7436047 0.7938953 0.7763372 0.8029070 0.8101744 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LDA 0.8125 0.8750 0.8750 0.8725 0.8875 0.9125 0
## KNN 0.8000 0.8000 0.8125 0.8300 0.8375 0.9000 0
## SVM 0.8375 0.8375 0.8875 0.8725 0.8875 0.9125 0
## RF 0.7125 0.7375 0.8625 0.8175 0.8875 0.8875 0
## ADA 0.7750 0.7750 0.7750 0.7950 0.8125 0.8375 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LDA 0.5581395 0.5813953 0.5813953 0.5860465 0.6046512 0.6046512 0
## KNN 0.3720930 0.3953488 0.5348837 0.5069767 0.5813953 0.6511628 0
## SVM 0.5348837 0.5348837 0.6046512 0.6046512 0.6511628 0.6976744 0
## RF 0.4883721 0.4883721 0.5581395 0.5534884 0.6046512 0.6279070 0
## ADA 0.5348837 0.5581395 0.6046512 0.5953488 0.6279070 0.6511628 0
# Draw box plots to compare models
scales <- list(x=list(relation="free"), y=list(relation="free"))
bwplot(models_compare, scales=scales)
# Step 2: Predict on testData and Compute the confusion matrix
# Using LDA Model
predicted <- predict(model1, testData[,1:8])
confusionMatrix(reference = testData$Outcome, data = predicted, mode='everything')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Class0 Class1
## Class0 88 29
## Class1 12 24
##
## Accuracy : 0.732
## 95% CI : (0.6545, 0.8003)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : 0.02367
##
## Kappa : 0.36
##
## Mcnemar's Test P-Value : 0.01246
##
## Sensitivity : 0.8800
## Specificity : 0.4528
## Pos Pred Value : 0.7521
## Neg Pred Value : 0.6667
## Precision : 0.7521
## Recall : 0.8800
## F1 : 0.8111
## Prevalence : 0.6536
## Detection Rate : 0.5752
## Detection Prevalence : 0.7647
## Balanced Accuracy : 0.6664
##
## 'Positive' Class : Class0
##