# SRI GANESHYA NAMAH
# EDA Insulin

# Load the diabetes data and recode the text outcome label as a numeric flag:
# "tested_positive" becomes 1 and "tested_negative" becomes 0.
dataset <- read.csv(file = "diab_1.csv", stringsAsFactors = FALSE)
outcome <- dataset$class
outcome <- replace(outcome, outcome == "tested_positive", "1")
outcome <- replace(outcome, outcome == "tested_negative", "0")
# Any label other than the two above cannot be parsed and ends up as NA.
dataset$class <- as.numeric(outcome)

#install.packages('ggplot2')
library(ggplot2)

# Store the outcome as a factor so the plots below treat it as categorical.
dataset$class <- factor(x = dataset$class)

# 1. Class

# Bar chart of the number of observations in each outcome class.
ggplot(data = dataset, mapping = aes(x = class, fill = class)) +
  geom_bar() +
  labs(title = "Distribution of class variable")

# Independent variables

# 2. Pregnancies

# Box plot of the number of pregnancies for each outcome class.
Plot_1 <- ggplot(dataset, aes(x = class, y = preg, fill = class)) +
  geom_boxplot() +
  theme(legend.position = "bottom") +
  ggtitle("Number of pregnancies Vs Diabetes")

# Side-by-side bar chart of pregnancy counts per outcome class.
# No x-axis limits are set here: scale limits make ggplot2 discard
# out-of-range data, which is what triggered the "Removed 1 rows" warnings
# from stat_count and geom_bar when limits = c(0, 16) was in place.
Plot_2 <- ggplot(dataset, aes(x = preg, fill = factor(class))) +
  geom_bar(position = "dodge") +
  theme(legend.position = "bottom") +
  labs(title = "Pregnancies Vs class")

# Show the box plot and the bar chart next to each other.
gridExtra::grid.arrange(Plot_1, Plot_2, ncol = 2)

# Observation: a higher number of pregnancies is associated with a higher risk of being diabetic.


# 2. Plasma glucose (plas)

# Box plot of plas for each outcome class.
A_4 <- ggplot(
  data = dataset,
  mapping = aes(x = class, y = plas, fill = class)
) +
  geom_boxplot() +
  labs(title = "Variation of plas in women Vs Diabetes") +
  theme(legend.position = "bottom")

# Semi-transparent density curves of plas, one per outcome class.
A_2 <- ggplot(
  data = dataset,
  mapping = aes(x = plas, colour = class, fill = class)
) +
  geom_density(alpha = 0.8) +
  xlab("plas") +
  ylab("Density") +
  ggtitle("Density plot of plas") +
  theme(legend.position = "bottom")

# Box plot on the left, density plot on the right.
gridExtra::grid.arrange(A_4, A_2, ncol = 2)

# Observation
# From the figures above, there is a clear difference in the amount of glucose present in the women who have been diagnosed
# with Diabetes and those who haven't. While the density plot shows some overlap in the glucose levels of the two groups,
# the plots indicate that glucose could be a good indicator of the response.


# 3. Blood pressure (pres)

# Box plot of blood pressure for each outcome class.
a4 <- ggplot(
  data = dataset,
  mapping = aes(x = class, y = pres, fill = class)
) +
  geom_boxplot() +
  labs(title = "Variation of blood pressure in women Vs Diabetes") +
  theme(legend.position = "bottom")

# Semi-transparent density curves of blood pressure, one per outcome class.
a2 <- ggplot(
  data = dataset,
  mapping = aes(x = pres, colour = class, fill = class)
) +
  geom_density(alpha = 0.8) +
  xlab("Blood pressure") +
  ylab("Density") +
  ggtitle("Density plot of Blood pressure") +
  theme(legend.position = "bottom")

# Box plot on the left, density plot on the right.
gridExtra::grid.arrange(a4, a2, ncol = 2)

# Observation: in the plots above, no clear difference is seen between the women who have Diabetes and those who don't.
# This suggests that blood pressure might not be a good predictor of the response variable.

# 4. Skin thickness (skin)

# Box plot of skin thickness for each outcome class.
c3 <- ggplot(
  data = dataset,
  mapping = aes(x = class, y = skin, fill = class)
) +
  geom_boxplot() +
  labs(title = "Variation of skin thickness Vs Diabetes") +
  theme(legend.position = "bottom")

# Semi-transparent density curves of skin thickness, one per outcome class.
c2 <- ggplot(
  data = dataset,
  mapping = aes(x = skin, colour = class, fill = class)
) +
  geom_density(alpha = 0.8) +
  xlab("Skin thickness") +
  ylab("Density") +
  ggtitle("Density plot of skin thickness") +
  theme(legend.position = "bottom")

# Box plot on the left, density plot on the right.
gridExtra::grid.arrange(c3, c2, ncol = 2)

# Observation (no clear difference can be seen in the two categories of women who have and don't have Diabetes)

# BMI (mass)

# Histogram of BMI, filled by outcome class.
# binwidth belongs on the histogram: geom_boxplot() does not accept it and
# only warned that the parameter was ignored, while the histogram silently
# fell back to the default of 30 bins. A width of 5 matches the Age histogram.
point1.1 <- ggplot(dataset, aes(mass, fill = class)) +
  geom_histogram(binwidth = 5) +
  theme(legend.position = "bottom") +
  ggtitle("Variation of BMI of women Vs Diabetes")

# Box plot of BMI for each outcome class.
point1.2 <- ggplot(dataset, aes(x = class, y = mass, fill = class)) +
  geom_boxplot() +
  theme(legend.position = "bottom") +
  ggtitle("Variation of BMI of women Vs Diabetes")

# Box plot on the left, histogram on the right.
gridExtra::grid.arrange(point1.2, point1.1, ncol = 2)

# Women who had Diabetes had a BMI greater than 25, which is above the normal levels...
# ...On the other hand, women who did not have Diabetes had a BMI ranging from 18 to 60.


# Age

# Box plot of age for each outcome class.
point2.2 <- ggplot(
  data = dataset,
  mapping = aes(x = class, y = age, fill = class)
) +
  geom_boxplot() +
  labs(title = "Variation of Age of women Vs Diabetes") +
  theme(legend.position = "bottom")

# Histogram of age in 5-year bins, filled by outcome class.
point2.1 <- ggplot(data = dataset, mapping = aes(x = age, fill = class)) +
  geom_histogram(binwidth = 5) +
  labs(title = "Variation of Age of women Vs Diabetes") +
  theme(legend.position = "bottom")

# Box plot on the left, histogram on the right.
gridExtra::grid.arrange(point2.2, point2.1, ncol = 2)

# Bivariate plots

# Scatter plot of pregnancies against age, points coloured by outcome class.
po1 <- ggplot(data = dataset, mapping = aes(x = age, y = preg)) +
  geom_point(mapping = aes(colour = class)) +
  labs(title = "Relationship of Pregnancies with Age Vs Diabetes") +
  theme(legend.position = "bottom")

# Scatter plot of glucose (plas) against insulin (insu), coloured by class.
p02 <- ggplot(data = dataset, mapping = aes(x = insu, y = plas)) +
  geom_point(mapping = aes(colour = class)) +
  labs(title = "Relationship of Insulin with Glucose Vs Diabetes") +
  theme(legend.position = "bottom")

# Show the two scatter plots side by side.
gridExtra::grid.arrange(po1, p02, ncol = 2)

# Women who have Diabetes can be differentiated from those who don't have based on BMI and BP values
# Women with low values of BMI and Skin Thickness did not have Diabetes


# Scatter plot of blood pressure (pres) against BMI (mass), coloured by class.
poi1 <- ggplot(data = dataset, mapping = aes(x = mass, y = pres)) +
  geom_point(mapping = aes(colour = class)) +
  labs(title = "Relationship of BMI with BP Vs Diabetes") +
  theme(legend.position = "bottom")

# Scatter plot of skin thickness (skin) against BMI (mass), coloured by class.
poi2 <- ggplot(data = dataset, mapping = aes(x = mass, y = skin)) +
  geom_point(mapping = aes(colour = class)) +
  labs(title = "Relationship of BMI with Skin Thickness Vs Diabetes") +
  theme(legend.position = "bottom")

# Show the two scatter plots side by side.
gridExtra::grid.arrange(poi1, poi2, ncol = 2)

# EDA
#install.packages('corrplot')
library(corrplot)
## corrplot 0.84 loaded
library(caret)
## Loading required package: lattice
# Read the CSV again into a separate data frame; unlike `dataset`, its `class`
# column keeps the original text labels.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
diadata <- read.csv("diab_1.csv", stringsAsFactors = FALSE)
# Preview the first rows, then print the structure (column types).
head(diadata)
str(diadata)
##   preg plas pres skin insu mass  pedi age           class
## 1    6  148   72   35    0 33.6 0.627  50 tested_positive
## 2    1   85   66   29    0 26.6 0.351  31 tested_negative
## 3    8  183   64    0    0 23.3 0.672  32 tested_positive
## 4    1   89   66   23   94 28.1 0.167  21 tested_negative
## 5    0  137   40   35  168 43.1 2.288  33 tested_positive
## 6    5  116   74    0    0 25.6 0.201  30 tested_negative
## 'data.frame':    768 obs. of  9 variables:
##  $ preg : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ plas : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ pres : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ skin : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ insu : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedi : num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age  : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ class: chr  "tested_positive" "tested_negative" "tested_positive" "tested_negative" ...
# Tabulate missing (TRUE) versus present (FALSE) cells over the whole frame.
table(is.na(diadata))
## 
## FALSE 
##  6912
# Correlation matrix of every column except the 9th (the text `class` label),
# drawn as numbers in the lower triangle.
corrplot(
  cor(diadata[-9]),
  method = "number",
  type = "lower"
)