Indian Diabetes Data Analysis

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(knitr)
library(tidyverse) #Required for analysis, visualization

## -- Attaching packages ------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   1.0.2
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ---------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(plotly) # Required for interactive plotting of graphs

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(ggsci) #Themes package for the plots
diab<-read_csv("diabetes.csv") # It reads the CSV file and assigns to diab object

## Parsed with column specification:
## cols(
##   Pregnancies = col_double(),
##   Glucose = col_double(),
##   BloodPressure = col_double(),
##   SkinThickness = col_double(),
##   Insulin = col_double(),
##   BMI = col_double(),
##   DiabetesPedigreeFunction = col_double(),
##   Age = col_double(),
##   Outcome = col_double()
## )

Including Plots

You can also embed plots, for example:

head(diab) # Elements of first few rows

## # A tibble: 6 x 9
##   Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##         <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
## 1           6     148            72            35       0  33.6
## 2           1      85            66            29       0  26.6
## 3           8     183            64             0       0  23.3
## 4           1      89            66            23      94  28.1
## 5           0     137            40            35     168  43.1
## 6           5     116            74             0       0  25.6
## # ... with 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>,
## #   Outcome <dbl>

tail(diab)

## # A tibble: 6 x 9
##   Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##         <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
## 1           9      89            62             0       0  22.5
## 2          10     101            76            48     180  32.9
## 3           2     122            70            27       0  36.8
## 4           5     121            72            23     112  26.2
## 5           1     126            60             0       0  30.1
## 6           1      93            70            31       0  30.4
## # ... with 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>,
## #   Outcome <dbl>

colnames(diab)

## [1] "Pregnancies"              "Glucose"                 
## [3] "BloodPressure"            "SkinThickness"           
## [5] "Insulin"                  "BMI"                     
## [7] "DiabetesPedigreeFunction" "Age"                     
## [9] "Outcome"

str(diab)

## tibble [768 x 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Pregnancies             : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : num [1:768] 1 0 1 0 1 0 1 0 1 1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Pregnancies = col_double(),
##   ..   Glucose = col_double(),
##   ..   BloodPressure = col_double(),
##   ..   SkinThickness = col_double(),
##   ..   Insulin = col_double(),
##   ..   BMI = col_double(),
##   ..   DiabetesPedigreeFunction = col_double(),
##   ..   Age = col_double(),
##   ..   Outcome = col_double()
##   .. )

summary(diab)

##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000

diab$Outcome<-factor(diab$Outcome)
class(diab$Outcome)

## [1] "factor"

p1<-ggplot(diab,aes(x=Age,y=Pregnancies,col=Outcome))+geom_point()+geom_smooth(method="loess", se=T)+facet_grid(.~Outcome)
ggplotly(p1)

## `geom_smooth()` using formula 'y ~ x'

p2<-ggplot(diab,aes(x=Age,y=Pregnancies))+geom_boxplot(aes(fill=Outcome))+facet_wrap(Outcome~.)
ggplotly(p2)

## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

p3<-ggplot(diab,aes(x=Pregnancies))+geom_density(aes(fill=Outcome),alpha=0.6)+
  geom_vline(aes(xintercept=mean(Pregnancies)),
            color="blue", linetype="dashed", size=1)+facet_grid(.~Outcome)+scale_fill_aaas()
ggplotly(p3)

p3<-ggplot(diab,aes(x=Age, y=Pregnancies, size=Glucose, fill=BloodPressure))+geom_point(alpha=0.2)+
  facet_grid(.~Outcome)+geom_jitter(width = 0.4)+scale_x_continuous(limits = c(18, 80))+scale_fill_material("red")
ggplotly(p3)

## Warning in L$marker$color[idx] <- aes2plotly(data, params, "fill")[idx]:
## number of items to replace is not a multiple of replacement length

## Warning in L$marker$color[idx] <- aes2plotly(data, params, "fill")[idx]:
## number of items to replace is not a multiple of replacement length

## Warning in L$marker$color[idx] <- aes2plotly(data, params, "fill")[idx]:
## number of items to replace is not a multiple of replacement length

## Warning in L$marker$color[idx] <- aes2plotly(data, params, "fill")[idx]:
## number of items to replace is not a multiple of replacement length

library(caret) # ML package for various methods

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

# Create the training and test datasets
set.seed(100)
hci<-diab

# Step 1: Get row numbers for the training data
trainRowNumbers <- createDataPartition(hci$Outcome, p=0.8, list=FALSE) # Data partition for dividing the dataset into training and testing data set. This is useful for cross validation

# Step 2: Create the training  dataset
trainData <- hci[trainRowNumbers,]

## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

# Step 3: Create the test dataset
testData <- hci[-trainRowNumbers,]

# Store X and Y for later use.
x = trainData[, 1:8]
y=trainData$Outcome

xt= testData[, 1:8]
yt=testData$Outcome
# # See the structure of the new dataset

preProcess_range_modeltr <- preProcess(trainData, method='range')
preProcess_range_modelts <- preProcess(testData, method='range')

trainData <- predict(preProcess_range_modeltr, newdata = trainData)
testData <- predict(preProcess_range_modelts, newdata = testData)

# Append the Y variable
trainData$Outcome <- y
testData$Outcome<-yt
levels(trainData$Outcome) <- c("Class0", "Class1") # Convert binary outcome into character for caret package
levels(testData$Outcome) <- c("Class0", "Class1")

#apply(trainData[, 1:8], 2, FUN=function(x){c('min'=min(x), 'max'=max(x))})
#str(trainData)

#fit control
fitControl <- trainControl(
  method = 'cv',                   # k-fold cross validation
  number = 5,                      # number of folds
  savePredictions = 'final',       # saves predictions for optimal tuning parameter
  classProbs = T,                  # should class probabilities be returned
  summaryFunction=twoClassSummary  # results summary function
)

# Step 1: Tune hyper parameters by setting tuneLength
set.seed(100)
model1 = train(Outcome ~ ., data=trainData, method='lda', tuneLength = 5, metric='ROC', trControl = fitControl)

model2 = train(Outcome ~ ., data=trainData, method='knn', tuneLength=2, trControl = fitControl) #KNN Model

## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.

model3 = train(Outcome ~ ., data=trainData, method='svmRadial', tuneLength=2, trControl = fitControl) #SVM

## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.

model4 = train(Outcome ~ ., data=trainData, method='rpart', tuneLength=2, trControl = fitControl) #RandomForest

## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.

model5 = train(Outcome ~ ., data=trainData, method='adaboost', tuneLength=2, trControl = fitControl) #Adaboost

## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.

# Compare model performances using resample()
models_compare <- resamples(list(LDA=model1,KNN=model2,SVM=model3,RF=model4, ADA=model5))

# Summary of the models performances
summary(models_compare)

## 
## Call:
## summary.resamples(object = models_compare)
## 
## Models: LDA, KNN, SVM, RF, ADA 
## Number of resamples: 5 
## 
## ROC 
##          Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## LDA 0.7886628 0.8267442 0.8284884 0.8309884 0.8441860 0.8668605    0
## KNN 0.7204942 0.7507267 0.7848837 0.7715116 0.7976744 0.8037791    0
## SVM 0.8305233 0.8328488 0.8372093 0.8363372 0.8377907 0.8433140    0
## RF  0.6478198 0.6702035 0.6879360 0.6915698 0.6879360 0.7639535    0
## ADA 0.7311047 0.7436047 0.7938953 0.7763372 0.8029070 0.8101744    0
## 
## Sens 
##       Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## LDA 0.8125  0.8750 0.8750 0.8725  0.8875 0.9125    0
## KNN 0.8000  0.8000 0.8125 0.8300  0.8375 0.9000    0
## SVM 0.8375  0.8375 0.8875 0.8725  0.8875 0.9125    0
## RF  0.7125  0.7375 0.8625 0.8175  0.8875 0.8875    0
## ADA 0.7750  0.7750 0.7750 0.7950  0.8125 0.8375    0
## 
## Spec 
##          Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## LDA 0.5581395 0.5813953 0.5813953 0.5860465 0.6046512 0.6046512    0
## KNN 0.3720930 0.3953488 0.5348837 0.5069767 0.5813953 0.6511628    0
## SVM 0.5348837 0.5348837 0.6046512 0.6046512 0.6511628 0.6976744    0
## RF  0.4883721 0.4883721 0.5581395 0.5534884 0.6046512 0.6279070    0
## ADA 0.5348837 0.5581395 0.6046512 0.5953488 0.6279070 0.6511628    0

# Draw box plots to compare models
scales <- list(x=list(relation="free"), y=list(relation="free"))
bwplot(models_compare, scales=scales)

# Step 2: Predict on testData and Compute the confusion matrix
# Using LDA Model
predicted <- predict(model1, testData[,1:8])
confusionMatrix(reference = testData$Outcome, data = predicted, mode='everything')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Class0 Class1
##     Class0     88     29
##     Class1     12     24
##                                           
##                Accuracy : 0.732           
##                  95% CI : (0.6545, 0.8003)
##     No Information Rate : 0.6536          
##     P-Value [Acc > NIR] : 0.02367         
##                                           
##                   Kappa : 0.36            
##                                           
##  Mcnemar's Test P-Value : 0.01246         
##                                           
##             Sensitivity : 0.8800          
##             Specificity : 0.4528          
##          Pos Pred Value : 0.7521          
##          Neg Pred Value : 0.6667          
##               Precision : 0.7521          
##                  Recall : 0.8800          
##                      F1 : 0.8111          
##              Prevalence : 0.6536          
##          Detection Rate : 0.5752          
##    Detection Prevalence : 0.7647          
##       Balanced Accuracy : 0.6664          
##                                           
##        'Positive' Class : Class0          
##

Indian Diabetes Data Analysis_v1

Sakib Faisal

1/30/2022

R Markdown

Including Plots