# SRI GANESHYA NAMAH
# EDA Insulin
# Load the diabetes data, keeping text columns as plain character vectors.
dataset <- read.csv("diab_1.csv", stringsAsFactors = FALSE)

# Recode the outcome labels as the strings "1" (positive) and "0" (negative).
df <- dataset$class
df <- replace(df, df == "tested_positive", "1")
df <- replace(df, df == "tested_negative", "0")

#install.packages('ggplot2')
library(ggplot2)

# Store the outcome as a factor built from the numeric 0/1 codes so that
# ggplot2 treats it as a discrete grouping variable.
dataset$class <- factor(as.numeric(df))

# 1 Class
# Bar chart of how many observations fall in each outcome class.
ggplot(dataset, aes(x = class, fill = class)) +
  geom_bar() +
  labs(title = "Distribution of class variable")

# Independent variables
# 2. Pregnancies
# Box plot of the number of pregnancies for each outcome class.
Plot_1 <- ggplot(dataset, aes(x = class, y = preg, fill = class)) +
  geom_boxplot() +
  theme(legend.position = "bottom") +
  ggtitle("Number of pregnancies Vs Diabetes")

# Side-by-side bar chart of pregnancy counts, split by outcome class.
# No hard x-axis limits are set here: the previous
# `scale_x_continuous(limits = c(0, 16))` made ggplot2 discard data outside
# that range before counting (it warned about removed rows and removed bars),
# so the chart did not show every observation. Letting ggplot2 pick the range
# keeps all rows. `class` is already a factor, so it is mapped directly and
# the legend title matches the box plot's.
Plot_2 <- ggplot(dataset, aes(x = preg, fill = class)) +
  geom_bar(position = "dodge") +
  theme(legend.position = "bottom") +
  labs(title = "Pregnancies Vs class")

# Show both pregnancy plots next to each other.
gridExtra::grid.arrange(Plot_1, Plot_2, ncol = 2)

# Observation (a higher number of pregnancies is associated with a higher risk of being diabetic)
# 2. Plasma glucose plas
# Box plot of plasma glucose (`plas`) for each outcome class.
A_4 <- ggplot(dataset, aes(x = class, y = plas, fill = class)) +
  geom_boxplot() +
  labs(title = "Variation of plas in women Vs Diabetes") +
  theme(legend.position = "bottom")

# Overlaid, semi-transparent density curves of `plas`, one per outcome class.
A_2 <- ggplot(dataset, aes(x = plas, color = class, fill = class)) +
  geom_density(alpha = 0.8) +
  xlab("plas") +
  ylab("Density") +
  ggtitle("Density plot of plas") +
  theme(legend.position = "bottom")

# Box plot on the left, density plot on the right.
gridExtra::grid.arrange(A_4, A_2, ncol = 2)

# Observation
# From the figures below, there's a clear difference in the amount of Glucose present in the women who have been diagnosed ...
# ...with Diabetes and those who haven't. While the density plot indicates an overlap in the levels of glucose in both categories of women...
# ...the below plots show that Glucose could be a good indicator of the response.
# 3. Blood Pressure pres
# Box plot of blood pressure (`pres`) for each outcome class.
a4 <- ggplot(dataset, aes(x = class, y = pres, fill = class)) +
  geom_boxplot() +
  labs(title = "Variation of blood pressure in women Vs Diabetes") +
  theme(legend.position = "bottom")

# Overlaid, semi-transparent density curves of `pres`, one per outcome class.
a2 <- ggplot(dataset, aes(x = pres, color = class, fill = class)) +
  geom_density(alpha = 0.8) +
  xlab("Blood pressure") +
  ylab("Density") +
  ggtitle("Density plot of Blood pressure") +
  theme(legend.position = "bottom")

# Box plot on the left, density plot on the right.
gridExtra::grid.arrange(a4, a2, ncol = 2)

# Observations(From the below plots, no clear difference is seen in the two categories of women who have and don't have Diabetes...
#...This shows that Blood Pressure might not be a good predictor of the response variable)
# 4. Skin Thickness
# Box plot of skin thickness (`skin`) for each outcome class.
c3 <- ggplot(dataset, aes(x = class, y = skin, fill = class)) +
  geom_boxplot() +
  labs(title = "Variation of skin thickness Vs Diabetes") +
  theme(legend.position = "bottom")

# Overlaid, semi-transparent density curves of `skin`, one per outcome class.
c2 <- ggplot(dataset, aes(x = skin, color = class, fill = class)) +
  geom_density(alpha = 0.8) +
  xlab("Skin thickness") +
  ylab("Density") +
  ggtitle("Density plot of skin thickness") +
  theme(legend.position = "bottom")

# Box plot on the left, density plot on the right.
gridExtra::grid.arrange(c3, c2, ncol = 2)

# Observation (no clear difference can be seen in the two categories of women who have and don't have Diabetes)
# BMI
# Histogram of BMI (`mass`), filled by outcome class. The bin width is set
# explicitly (5, the same width the Age histogram uses) instead of relying on
# the `bins = 30` default that ggplot2 asks you to replace.
point1.1 <- ggplot(dataset, aes(x = mass, fill = class)) +
  geom_histogram(binwidth = 5) +
  theme(legend.position = "bottom") +
  ggtitle("Variation of BMI of women Vs Diabetes")

# Box plot of BMI for each outcome class. `binwidth` is a histogram setting
# that ggplot2 ignores (with a warning) on a box plot, so it is not passed here.
point1.2 <- ggplot(dataset, aes(x = class, y = mass, fill = class)) +
  geom_boxplot() +
  theme(legend.position = "bottom") +
  ggtitle("Variation of BMI of women Vs Diabetes")

# Box plot on the left, histogram on the right.
gridExtra::grid.arrange(point1.2, point1.1, ncol = 2)

# Women who had Diabetes had a BMI greater than 25, which is above the normal levels...
# ...On the other hand, women who did not have Diabetes had a BMI ranging from 18 to 60.
# Age
# Box plot of age for each outcome class.
point2.2 <- ggplot(dataset, aes(x = class, y = age, fill = class)) +
  geom_boxplot() +
  labs(title = "Variation of Age of women Vs Diabetes") +
  theme(legend.position = "bottom")

# Histogram of age in 5-unit bins, filled by outcome class.
point2.1 <- ggplot(dataset, aes(x = age, fill = class)) +
  geom_histogram(binwidth = 5) +
  labs(title = "Variation of Age of women Vs Diabetes") +
  theme(legend.position = "bottom")

# Box plot on the left, histogram on the right.
gridExtra::grid.arrange(point2.2, point2.1, ncol = 2)

# Bivariables
# Scatter plot of pregnancies against age, points coloured by outcome class.
po1 <- ggplot(dataset, aes(x = age, y = preg)) +
  geom_point(aes(colour = class)) +
  labs(title = "Relationship of Pregnancies with Age Vs Diabetes") +
  theme(legend.position = "bottom")

# Scatter plot of plasma glucose against insulin, coloured by outcome class.
p02 <- ggplot(dataset, aes(x = insu, y = plas)) +
  geom_point(aes(colour = class)) +
  labs(title = "Relationship of Insulin with Glucose Vs Diabetes") +
  theme(legend.position = "bottom")

# Show the two scatter plots side by side.
gridExtra::grid.arrange(po1, p02, ncol = 2)

# Women who have Diabetes can be differentiated from those who don't have based on BMI and BP values
# Women with low values of BMI and Skin Thickness did not have Diabetes
# Scatter plot of blood pressure against BMI, points coloured by outcome class.
poi1 <- ggplot(dataset, aes(x = mass, y = pres)) +
  geom_point(aes(colour = class)) +
  labs(title = "Relationship of BMI with BP Vs Diabetes") +
  theme(legend.position = "bottom")

# Scatter plot of skin thickness against BMI, coloured by outcome class.
poi2 <- ggplot(dataset, aes(x = mass, y = skin)) +
  geom_point(aes(colour = class)) +
  labs(title = "Relationship of BMI with Skin Thickness Vs Diabetes") +
  theme(legend.position = "bottom")

# Show the two scatter plots side by side.
gridExtra::grid.arrange(poi1, poi2, ncol = 2)

# EDA
#install.packages('corrplot')
library(corrplot)
## corrplot 0.84 loaded
library(caret)
## Loading required package: lattice
# Reload the raw CSV into a separate data frame, leaving `class` as the
# original text labels. `FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
diadata <- read.csv("diab_1.csv", stringsAsFactors = FALSE)
# Preview the first rows, then the structure (column types) of the data.
head(diadata)
str(diadata)
## preg plas pres skin insu mass pedi age class
## 1 6 148 72 35 0 33.6 0.627 50 tested_positive
## 2 1 85 66 29 0 26.6 0.351 31 tested_negative
## 3 8 183 64 0 0 23.3 0.672 32 tested_positive
## 4 1 89 66 23 94 28.1 0.167 21 tested_negative
## 5 0 137 40 35 168 43.1 2.288 33 tested_positive
## 6 5 116 74 0 0 25.6 0.201 30 tested_negative
## 'data.frame': 768 obs. of 9 variables:
## $ preg : int 6 1 8 1 0 5 3 10 2 8 ...
## $ plas : int 148 85 183 89 137 116 78 115 197 125 ...
## $ pres : int 72 66 64 66 40 74 50 0 70 96 ...
## $ skin : int 35 29 0 23 35 0 32 0 45 0 ...
## $ insu : int 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedi : num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ class: chr "tested_positive" "tested_negative" "tested_positive" "tested_negative" ...
# Count NA versus non-NA cells across the whole data frame.
# NOTE(review): the structure listing shows 0 values in pres, skin, insu and
# mass; these may encode missing measurements that is.na() cannot detect.
# Confirm against the data source.
table(is.na(diadata))
##
## FALSE
## 6912
# Correlation matrix of the numeric predictors, drawn as numbers in the lower
# triangle. Columns are chosen by type rather than by position (previously
# `-9`), so the call keeps working if columns are reordered or added; on the
# data shown above this excludes only the character `class` column.
numeric_cols <- vapply(diadata, is.numeric, logical(1))
corrplot(cor(diadata[, numeric_cols, drop = FALSE]), type = "lower", method = "number")
