# Sri Ganapate namo namah
# EDA INSULIN
# Import the dataset and recode the `class` attribute as 0 and 1
# Load the raw CSV, keeping text columns as character vectors.
dataset <- read.csv("diab_1.csv", stringsAsFactors = FALSE)

# Recode the outcome labels as the strings "1"/"0", then convert the column
# to numeric. `df` holds the class vector (not a data frame) while recoding.
df <- dataset$class
df <- replace(df, df == "tested_positive", "1")
df <- replace(df, df == "tested_negative", "0")
dataset$class <- as.numeric(as.character(df))
# Install ggplot2 only when it is not already available, so re-running the
# script does not contact the repository (and emit warnings) on every run.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2", repos = "http://cran.us.r-project.org")
}
## Installing package into 'C:/Users/vipin.dwivedi.IN/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## Warning: unable to access index for repository http://cran.us.r-project.org/src/contrib:
## cannot open URL 'http://cran.us.r-project.org/src/contrib/PACKAGES'
## Warning: package 'ggplot2' is not available (for R version 3.6.2)
## Warning: unable to access index for repository http://cran.us.r-project.org/bin/windows/contrib/3.6:
## cannot open URL 'http://cran.us.r-project.org/bin/windows/contrib/3.6/PACKAGES'
library(ggplot2)
# EDA Insulin
# Insulin by diabetes outcome: density plot and boxplot, side by side.
# `class` is numeric 0/1, so it is wrapped in factor() to make ggplot2 treat
# it as a discrete group. Mapping it as a number draws a single density and
# triggers the boxplot's "Continuous x aesthetic" warning.
Densityplot_Insul <- ggplot(
  dataset,
  aes(x = insu, color = factor(class), fill = factor(class))
) +
  geom_density(alpha = 0.8) +
  theme(legend.position = "bottom") +
  labs(
    x = "insulin", y = "Density", title = "Density plot of Insulin",
    color = "class", fill = "class"
  )
VarInsu <- ggplot(
  dataset,
  aes(x = factor(class), y = insu, fill = factor(class))
) +
  geom_boxplot() +
  theme(legend.position = "bottom") +
  # Keep the axis/legend titled "class" rather than "factor(class)".
  labs(x = "class", fill = "class") +
  ggtitle(" Insu Vs Diabetes")
gridExtra::grid.arrange(Densityplot_Insul, VarInsu, ncol = 2)
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

# EDA Preg
# Pregnancies by diabetes outcome: density plot and boxplot, side by side.
# `class` is numeric 0/1, so it is wrapped in factor() to make ggplot2 treat
# it as a discrete group. Mapping it as a number draws a single density and
# triggers the boxplot's "Continuous x aesthetic" warning.
Densityplot_preg <- ggplot(
  dataset,
  aes(x = preg, color = factor(class), fill = factor(class))
) +
  geom_density(alpha = 0.8) +
  theme(legend.position = "bottom") +
  labs(
    x = "preg", y = "Density", title = "Density plot of preg",
    color = "class", fill = "class"
  )
Varpreg <- ggplot(
  dataset,
  aes(x = factor(class), y = preg, fill = factor(class))
) +
  geom_boxplot() +
  theme(legend.position = "bottom") +
  # Keep the axis/legend titled "class" rather than "factor(class)".
  labs(x = "class", fill = "class") +
  ggtitle(" preg Vs Diabetes")
gridExtra::grid.arrange(Densityplot_preg, Varpreg, ncol = 2)
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

# Scatter plots coloured by diabetes outcome, shown side by side.
# `class` is a binary 0/1 number; factor() gives two discrete colours with a
# two-entry legend instead of a continuous gradient and colourbar.
plot1 <- ggplot(dataset, aes(x = age, y = preg)) +
  geom_point(aes(color = factor(class))) +
  theme(legend.position = "bottom") +
  labs(color = "class") +
  ggtitle("Relationship of Pregnancies with Age Vs Diabetes")
plot2 <- ggplot(dataset, aes(x = insu, y = pedi)) +
  geom_point(aes(color = factor(class))) +
  theme(legend.position = "bottom") +
  labs(color = "class") +
  ggtitle("Relationship of Insulin with pedi Vs Diabetes")
gridExtra::grid.arrange(plot1, plot2, ncol = 2)

# Scatter of skin vs age, coloured by diabetes outcome. `class` is a binary
# 0/1 number, so factor() is used to get two discrete colours rather than a
# continuous gradient.
# NOTE(review): plot3 is only assigned here, not printed or arranged --
# confirm it is displayed elsewhere.
plot3 <- ggplot(dataset, aes(x = skin, y = age)) +
  geom_point(aes(color = factor(class))) +
  labs(color = "class") +
  ggtitle("Relationship of skin with Age Vs Diabetes")
# EDA
# Install corrplot only when it is not already available, so re-running the
# script does not contact the repository (and emit warnings) on every run.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot", repos = "http://cran.us.r-project.org")
}
## Installing package into 'C:/Users/vipin.dwivedi.IN/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## Warning: unable to access index for repository http://cran.us.r-project.org/src/contrib:
## cannot open URL 'http://cran.us.r-project.org/src/contrib/PACKAGES'
## Warning: package 'corrplot' is not available (for R version 3.6.2)
## Warning: unable to access index for repository http://cran.us.r-project.org/bin/windows/contrib/3.6:
## cannot open URL 'http://cran.us.r-project.org/bin/windows/contrib/3.6/PACKAGES'
library(corrplot)
## corrplot 0.84 loaded
library(caret)
## Loading required package: lattice
# Read the CSV again into `diadata`, keeping text columns as character.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
diadata <- read.csv("diab_1.csv", stringsAsFactors = FALSE)
# Preview the first rows, then show the column types.
head(diadata)
str(diadata)
## preg plas pres skin insu mass pedi age class
## 1 6 148 72 35 0 33.6 0.627 50 tested_positive
## 2 1 85 66 29 0 26.6 0.351 31 tested_negative
## 3 8 183 64 0 0 23.3 0.672 32 tested_positive
## 4 1 89 66 23 94 28.1 0.167 21 tested_negative
## 5 0 137 40 35 168 43.1 2.288 33 tested_positive
## 6 5 116 74 0 0 25.6 0.201 30 tested_negative
## 'data.frame': 768 obs. of 9 variables:
## $ preg : int 6 1 8 1 0 5 3 10 2 8 ...
## $ plas : int 148 85 183 89 137 116 78 115 197 125 ...
## $ pres : int 72 66 64 66 40 74 50 0 70 96 ...
## $ skin : int 35 29 0 23 35 0 32 0 45 0 ...
## $ insu : int 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedi : num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ class: chr "tested_positive" "tested_negative" "tested_positive" "tested_negative" ...
# Count missing vs non-missing cells across the whole data frame.
# deparse.level = 0 keeps the table unlabeled, as when the expression is
# passed inline.
na_flags <- is.na(diadata)
table(na_flags, deparse.level = 0)
##
## FALSE
## 6912
# Correlation matrix of the predictors, drawn as numbers in the lower
# triangle. The `class` column is dropped by name rather than by position,
# so the call still excludes it if columns are reordered or added.
predictor_cols <- names(diadata) != "class"
corrplot(
  cor(diadata[, predictor_cols, drop = FALSE]),
  type = "lower",
  method = "number"
)
