# Sri Ganapate namo namah
# EDA INSULIN
# Import the dataset and convert the class attribute to 0 and 1

# Read the diabetes data, keeping text columns as character vectors.
dataset <- read.csv("diab_1.csv", stringsAsFactors = FALSE)

# Recode the outcome labels to the strings "1" / "0", then store the column
# as numeric. Values matching neither label are passed to as.numeric() as-is.
df <- dataset$class
df <- replace(df, df == "tested_positive", "1")
df <- replace(df, df == "tested_negative", "0")
dataset$class <- as.numeric(as.character(df))

# Install plotting packages only when they are missing, so the script does
# not contact a CRAN mirror on every run. gridExtra is included because the
# plots below are laid out with gridExtra::grid.arrange().
for (pkg in c("ggplot2", "gridExtra")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = "https://cloud.r-project.org")
  }
}
library(ggplot2)

# EDA Insulin
# class is wrapped in factor() so ggplot2 treats it as a discrete grouping
# variable: one density curve and one box per class value, instead of a
# continuous colour scale and a single ungrouped box.
Densityplot_Insul <- ggplot(
  dataset,
  aes(x = insu, color = factor(class), fill = factor(class))
) +
  geom_density(alpha = 0.8) +
  theme(legend.position = "bottom") +
  labs(
    x = "insulin", y = "Density", title = "Density plot of Insulin",
    color = "class", fill = "class"
  )

# Box plot of insulin for each class value.
VarInsu <- ggplot(
  dataset,
  aes(x = factor(class), y = insu, fill = factor(class))
) +
  geom_boxplot() +
  theme(legend.position = "bottom") +
  labs(x = "class", fill = "class") +
  ggtitle(" Insu Vs Diabetes")

# Show the density plot and the box plot side by side.
gridExtra::grid.arrange(Densityplot_Insul, VarInsu, ncol = 2)

# EDA Preg
# class is wrapped in factor() so ggplot2 treats it as a discrete grouping
# variable: one density curve and one box per class value, instead of a
# continuous colour scale and a single ungrouped box.
Densityplot_preg <- ggplot(
  dataset,
  aes(x = preg, color = factor(class), fill = factor(class))
) +
  geom_density(alpha = 0.8) +
  theme(legend.position = "bottom") +
  labs(
    x = "preg", y = "Density", title = "Density plot of preg",
    color = "class", fill = "class"
  )

# Box plot of the preg column for each class value.
Varpreg <- ggplot(
  dataset,
  aes(x = factor(class), y = preg, fill = factor(class))
) +
  geom_boxplot() +
  theme(legend.position = "bottom") +
  labs(x = "class", fill = "class") +
  ggtitle(" preg Vs Diabetes")

# Show the density plot and the box plot side by side.
gridExtra::grid.arrange(Densityplot_preg, Varpreg, ncol = 2)

# Scatter plots coloured by class. factor(class) gives a discrete two-colour
# legend rather than a continuous gradient for the binary outcome.
plot1 <- ggplot(dataset, aes(x = age, y = preg)) +
  geom_point(aes(color = factor(class))) +
  theme(legend.position = "bottom") +
  labs(color = "class") +
  ggtitle("Relationship of Pregnancies with Age Vs Diabetes")

plot2 <- ggplot(dataset, aes(x = insu, y = pedi)) +
  geom_point(aes(color = factor(class))) +
  theme(legend.position = "bottom") +
  labs(color = "class") +
  ggtitle("Relationship of Insulin with pedi Vs Diabetes")

# Show both scatter plots side by side.
gridExtra::grid.arrange(plot1, plot2, ncol = 2)

# Scatter plot of skin against age, coloured by class. factor(class) gives a
# discrete legend; the legend is placed at the bottom like the other plots.
plot3 <- ggplot(dataset, aes(x = skin, y = age)) +
  geom_point(aes(color = factor(class))) +
  theme(legend.position = "bottom") +
  labs(color = "class") +
  ggtitle("Relationship of skin with Age Vs Diabetes")

# Assigning a ggplot object does not draw it; render it explicitly.
print(plot3)

# EDA: correlation between the numeric attributes
# Install corrplot only when it is missing, so the script does not contact a
# CRAN mirror on every run.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot", repos = "https://cloud.r-project.org")
}
library(corrplot)
## corrplot 0.84 loaded
library(caret)
## Loading required package: lattice
# Read the raw file again into a separate data frame; here class keeps its
# text labels (see the str() output below).
diadata <- read.csv("diab_1.csv", stringsAsFactors = FALSE)

# Preview the first rows.
head(diadata)
##   preg plas pres skin insu mass  pedi age           class
## 1    6  148   72   35    0 33.6 0.627  50 tested_positive
## 2    1   85   66   29    0 26.6 0.351  31 tested_negative
## 3    8  183   64    0    0 23.3 0.672  32 tested_positive
## 4    1   89   66   23   94 28.1 0.167  21 tested_negative
## 5    0  137   40   35  168 43.1 2.288  33 tested_positive
## 6    5  116   74    0    0 25.6 0.201  30 tested_negative

# Show the column types.
str(diadata)
## 'data.frame':    768 obs. of  9 variables:
##  $ preg : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ plas : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ pres : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ skin : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ insu : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedi : num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age  : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ class: chr  "tested_positive" "tested_negative" "tested_positive" "tested_negative" ...

# Count NA versus non-NA cells across the whole data frame.
table(is.na(diadata))
## 
## FALSE 
##  6912
# Lower-triangle correlation matrix shown as numbers. The class column is
# excluded by name rather than by position, so a change in column order
# cannot pass it to cor() by mistake.
corrplot(
  cor(diadata[, setdiff(names(diadata), "class")]),
  type = "lower",
  method = "number"
)