Due Date: August 15, 2022
setwd("C:/Users/MButr/OneDrive/AI4OPT/Data Processing")
library(mlbench)
library(caret)
library(lattice)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(corrplot)
library(gridExtra)
library(DMwR2)
Let us start by reading the dataset
# Load the raw CSV. The file has no header row, so read.csv() assigns the
# default column names V1..V9.
pid <- read.csv(
  file = "C:/Users/MButr/OneDrive/AI4OPT/Data Processing/pima-indians-diabetes.csv",
  header = FALSE
)
head(pid)
## V1 V2 V3 V4 V5 V6 V7 V8 V9
## 1 6 148 72 35 0 33.6 0.627 50 1
## 2 1 85 66 29 0 26.6 0.351 31 0
## 3 8 183 64 0 0 23.3 0.672 32 1
## 4 1 89 66 23 94 28.1 0.167 21 0
## 5 0 137 40 35 168 43.1 2.288 33 1
## 6 5 116 74 0 0 25.6 0.201 30 0
Let us find out the dimensions of the dataset and whether there are any missing values (NA)
dim(pid)
## [1] 768 9
str(pid)
## 'data.frame': 768 obs. of 9 variables:
## $ V1: int 6 1 8 1 0 5 3 10 2 8 ...
## $ V2: int 148 85 183 89 137 116 78 115 197 125 ...
## $ V3: int 72 66 64 66 40 74 50 0 70 96 ...
## $ V4: int 35 29 0 23 35 0 32 0 45 0 ...
## $ V5: int 0 0 0 94 168 0 88 0 543 0 ...
## $ V6: num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ V7: num 0.627 0.351 0.672 0.167 2.288 ...
## $ V8: int 50 31 32 21 33 30 26 29 53 54 ...
## $ V9: int 1 0 1 0 1 0 1 0 1 1 ...
# Names of the columns that contain at least one NA (empty if there are none).
names(which(colSums(is.na(pid)) > 0))
## character(0)
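From the above we notice that the dataset contains 768 observations of 9 variables, all of them numeric, and that none of the columns contains missing (NA) values.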
Let us now obtain the summary of the numerical-valued variables in the dataset. The last variable (the outcome) will be converted to a factor further below.
summary(pid[1:3])
## V1 V2 V3
## Min. : 0.000 Min. : 0.0 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00
## Median : 3.000 Median :117.0 Median : 72.00
## Mean : 3.845 Mean :120.9 Mean : 69.11
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00
## Max. :17.000 Max. :199.0 Max. :122.00
summary(pid[4:6])
## V4 V5 V6
## Min. : 0.00 Min. : 0.0 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.:27.30
## Median :23.00 Median : 30.5 Median :32.00
## Mean :20.54 Mean : 79.8 Mean :31.99
## 3rd Qu.:32.00 3rd Qu.:127.2 3rd Qu.:36.60
## Max. :99.00 Max. :846.0 Max. :67.10
summary(pid[7:8])
## V7 V8
## Min. :0.0780 Min. :21.00
## 1st Qu.:0.2437 1st Qu.:24.00
## Median :0.3725 Median :29.00
## Mean :0.4719 Mean :33.24
## 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :2.4200 Max. :81.00
We also note that the 9 variables in the dataset are not really descriptive. So we will adjust the names of the variables as follows:
# Give the nine columns descriptive names in a single assignment
# (positions 1-9 correspond to V1..V9).
names(pid)[1:9] <- c(
  "Pregnancy",
  "Glucose",
  "Blood_Pressure",
  "Skin_Thickness",
  "Insulin",
  "Body_Mass_Index",
  "Diabetes_Pedigree_Function",
  "Age",
  "Outcome"
)
head(pid)
## Pregnancy Glucose Blood_Pressure Skin_Thickness Insulin Body_Mass_Index
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## Diabetes_Pedigree_Function Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
Although in this dataset none of the columns contain missing values, some of the measurements (Glucose, Blood Pressure, Skin Thickness, Insulin and BMI) have values of 0, which is not possible for a living human organism.
# Work on every column except Outcome and Pregnancy; in the remaining columns a
# value <= 0 is treated as a missing measurement.
missing_data <- pid[, !(names(pid) %in% c("Outcome", "Pregnancy"))]
# Number of non-positive entries per column.
features_miss_num <- vapply(
  missing_data,
  function(col) sum(col <= 0),
  integer(1)
)
# Columns that contain at least one non-positive entry.
features_miss <- names(missing_data)[features_miss_num > 0]
# Per-row flag: TRUE when the row has at least one non-positive entry.
rows_miss <- rowSums(as.matrix(missing_data) <= 0) >= 1
sum(rows_miss)
## [1] 376
# Recode every non-positive measurement as NA, column by column.
missing_data[] <- lapply(
  missing_data,
  function(col) replace(col, col <= 0, NA)
)
# Write the recoded columns back into pid (matched by column name).
pid[names(missing_data)] <- missing_data
# Keep a copy of the data as it is at this point, with the NA values in place.
orig_data <- pid
# NA count per column.
colSums(is.na(pid))
## Pregnancy Glucose
## 0 5
## Blood_Pressure Skin_Thickness
## 35 227
## Insulin Body_Mass_Index
## 374 11
## Diabetes_Pedigree_Function Age
## 0 0
## Outcome
## 0
# Fill in the NA values with knnImputation() using k = 5; columns 8 and 9 are
# left out of the imputation.
pid[, -c(8, 9)] <- knnImputation(pid[, -c(8, 9)], k = 5)
Let us look at a summary of the variables in the data set after replacing the impossible zero values with NA and imputing them
summary(pid[1:3])
## Pregnancy Glucose Blood_Pressure
## Min. : 0.000 Min. : 44.0 Min. : 24.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00
## Median : 3.000 Median :117.0 Median : 72.00
## Mean : 3.845 Mean :121.7 Mean : 72.32
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00
## Max. :17.000 Max. :199.0 Max. :122.00
summary(pid[4:6])
## Skin_Thickness Insulin Body_Mass_Index
## Min. : 7.00 Min. : 14.00 Min. :18.20
## 1st Qu.:22.00 1st Qu.: 89.21 1st Qu.:27.50
## Median :29.00 Median :130.00 Median :32.30
## Mean :29.02 Mean :152.14 Mean :32.45
## 3rd Qu.:35.00 3rd Qu.:187.28 3rd Qu.:36.60
## Max. :99.00 Max. :846.00 Max. :67.10
summary(pid[7:8])
## Diabetes_Pedigree_Function Age
## Min. :0.0780 Min. :21.00
## 1st Qu.:0.2437 1st Qu.:24.00
## Median :0.3725 Median :29.00
## Mean :0.4719 Mean :33.24
## 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :2.4200 Max. :81.00
Let us get the distribution of the outcome variable, which indicates whether or not someone in the dataset was diagnosed with diabetes
# Treat the outcome as categorical, then draw one bar per outcome class.
pid$Outcome <- factor(pid$Outcome)
ggplot(data = pid, mapping = aes(x = Outcome, fill = Outcome)) +
  geom_bar() +
  labs(title = "Distribution of Outcome variable")
The outcome distribution shows that there are more observations that resulted in not being diagnosed with diabetes. However, the number of observations that resulted in a diabetes diagnosis is not to be ignored, as it is roughly one third of the entire number of observations.
Let us relabel the levels of the Outcome factor as Negative or Positive for diabetes
# Relabel the outcome classes. The mapping is spelled out explicitly
# ("0" -> Negative, "1" -> Positive) instead of relying on the positional order
# of the existing levels; this works whether Outcome is still 0/1 or already a
# factor with levels "0" and "1".
pid$Outcome <- factor(
  pid$Outcome,
  levels = c("0", "1"),
  labels = c("Negative", "Positive")
)
Let us find the number of Negative and Positive cases in the dataset:
# Tally the observations in each outcome class.
pid %>%
  group_by(Outcome) %>%
  summarise(No_of_Observation = n())
## # A tibble: 2 × 2
## Outcome No_of_Observation
## <fct> <int>
## 1 Negative 500
## 2 Positive 268
# Bar chart of the outcome classes, one colour per class.
ggplot(data = pid, mapping = aes(x = Outcome, fill = Outcome)) +
  geom_bar()
Let us look at the potential predictor variables (the first 8 variables in the dataset)
# Pregnancy count by outcome: boxplot (left) and side-by-side bar chart (right).
p1 <- ggplot(pid, aes(x = Outcome, y = Pregnancy, fill = Outcome)) +
  geom_boxplot() +
  theme(legend.position = "bottom") +
  ggtitle("Pregnancies vs Diabetes")
# The x range is set with coord_cartesian() rather than scale limits. Scale
# limits censor out-of-range data, which removed the observation with 17
# pregnancies and also dropped the bars at 0 (their left edge lies below the
# lower limit). Outcome is already a factor, so it is mapped to fill directly.
p2 <- ggplot(pid, aes(x = Pregnancy, fill = Outcome)) +
  geom_bar(position = "dodge") +
  coord_cartesian(xlim = range(pid$Pregnancy) + c(-0.5, 0.5)) +
  theme(legend.position = "bottom") +
  labs(title = "Pregnancies vs Diabetes")
grid.arrange(p1, p2, ncol = 2)
Let us take a closer look at the Pregnancy variable
# Frequency of each pregnancy count. The bars use a constant fill, so no fill
# aesthetic is mapped (the former fill = Pregnancy mapping was overridden by
# the constant and had no effect).
ggplot(data = pid, aes(x = Pregnancy)) +
  geom_bar(fill = "red", alpha = 0.6)
table(pid$Pregnancy)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 17
## 111 135 103 75 68 57 50 45 38 28 24 11 9 10 2 1 1
From the graphs above, we notice that the women who were diagnosed with diabetes tended to have had more pregnancies than the women who were not diagnosed. However, we cannot say that there is a clear relation between the number of pregnancies and being diagnosed with diabetes.
# Glucose by outcome: boxplot on the left, per-outcome density curves on the
# right.
p1 <- ggplot(
  data = pid,
  mapping = aes(x = Outcome, y = Glucose, fill = Outcome)
) +
  geom_boxplot() +
  labs(title = "Glucose vs Diabetes") +
  theme(legend.position = "bottom")
p2 <- ggplot(
  data = pid,
  mapping = aes(x = Glucose, colour = Outcome, fill = Outcome)
) +
  geom_density(alpha = 0.8) +
  labs(x = "Glucose", y = "Density", title = "Density plot of Glucose") +
  theme(legend.position = "bottom")
grid.arrange(p1, p2, ncol = 2)
From the plots above, we see that there is a difference in the glucose level between women who were diagnosed with diabetes and those who have not been diagnosed with diabetes.
# Blood pressure by outcome: boxplot on the left, per-outcome density curves on
# the right.
p1 <- ggplot(
  data = pid,
  mapping = aes(x = Outcome, y = Blood_Pressure, fill = Outcome)
) +
  geom_boxplot() +
  labs(title = "Blood Pressure vs Diabetes") +
  theme(legend.position = "bottom")
p2 <- ggplot(
  data = pid,
  mapping = aes(x = Blood_Pressure, colour = Outcome, fill = Outcome)
) +
  geom_density(alpha = 0.8) +
  labs(
    x = "Blood pressure",
    y = "Density",
    title = "Density plot of Blood Pressure"
  ) +
  theme(legend.position = "bottom")
grid.arrange(p1, p2, ncol = 2)
From the plots above, we see that there is no clear difference in blood pressure between women who were diagnosed with diabetes and those who were not.
# Skin thickness by outcome: boxplot on the left, per-outcome density curves on
# the right.
p1 <- ggplot(
  data = pid,
  mapping = aes(x = Outcome, y = Skin_Thickness, fill = Outcome)
) +
  geom_boxplot() +
  labs(title = "Skin Thickness vs Diabetes") +
  theme(legend.position = "bottom")
p2 <- ggplot(
  data = pid,
  mapping = aes(x = Skin_Thickness, colour = Outcome, fill = Outcome)
) +
  geom_density(alpha = 0.8) +
  labs(
    x = "Skin Thickness",
    y = "Density",
    title = "Density plot of Skin Thickness"
  ) +
  theme(legend.position = "bottom")
grid.arrange(p1, p2, ncol = 2)
The graphs show no real relation between skin thickness and being diagnosed with diabetes.
# Insulin by outcome: boxplot on the left, histogram (bins 10 units wide,
# coloured by outcome) on the right.
p1 <- ggplot(
  data = pid,
  mapping = aes(x = Outcome, y = Insulin, fill = Outcome)
) +
  geom_boxplot() +
  labs(title = "Insulin vs Diabetes") +
  theme(legend.position = "bottom")
p2 <- ggplot(data = pid, mapping = aes(x = Insulin, fill = Outcome)) +
  geom_histogram(binwidth = 10) +
  labs(title = "Variation of Insulin vs Diabetes") +
  theme(legend.position = "bottom")
grid.arrange(p1, p2, ncol = 2)
The graphs above do not show a clear difference in insulin level between women who were diagnosed with diabetes and those who were not.
# BMI by outcome: boxplot on the left, histogram coloured by outcome on the
# right.
# geom_boxplot() has no binwidth parameter (it was ignored with a warning), so
# it is no longer passed.
p1 <- ggplot(pid, aes(x = Outcome, y = Body_Mass_Index, fill = Outcome)) +
  geom_boxplot() +
  theme(legend.position = "bottom") +
  ggtitle("BMI vs Diabetes")
# The bin count is stated explicitly (30 is the value geom_histogram() falls
# back to) so the plot does not depend on an implicit default.
p2 <- ggplot(pid, aes(Body_Mass_Index, fill = Outcome)) +
  geom_histogram(bins = 30) +
  theme(legend.position = "bottom") +
  ggtitle("Variation of BMI vs Diabetes")
grid.arrange(p1, p2, ncol = 2)
The graphs show that the BMI of those diagnosed with diabetes had a range between 25 and 70, while those who were not diagnosed with diabetes had a BMI between 18 and 60.
# Diabetes pedigree function by outcome: boxplot on the left, histogram
# coloured by outcome on the right.
p1 <- ggplot(
  data = pid,
  mapping = aes(x = Outcome, y = Diabetes_Pedigree_Function, fill = Outcome)
) +
  geom_boxplot() +
  labs(title = "DPF vs Diabetes") +
  theme(legend.position = "bottom")
p2 <- ggplot(
  data = pid,
  mapping = aes(x = Diabetes_Pedigree_Function, fill = Outcome)
) +
  geom_histogram() +
  labs(title = "Variation of DPF vs Diabetes") +
  theme(legend.position = "bottom")
grid.arrange(p1, p2, ncol = 2)
The graphs show no real difference in the diabetes pedigree function between women who were diagnosed with diabetes and those who were not.
# Age by outcome: boxplot on the left, histogram (5-year bins, coloured by
# outcome) on the right.
# The titles follow the convention of the other predictor plots in this
# script: "<X> vs Diabetes" for the boxplot and "Variation of <X> vs Diabetes"
# for the histogram (they were previously swapped).
p1 <- ggplot(pid, aes(x = Outcome, y = Age, fill = Outcome)) +
  geom_boxplot() +
  theme(legend.position = "bottom") +
  ggtitle("Age vs Diabetes")
p2 <- ggplot(pid, aes(Age, fill = Outcome)) +
  geom_histogram(binwidth = 5) +
  theme(legend.position = "bottom") +
  ggtitle("Variation of Age vs Diabetes")
grid.arrange(p1, p2, ncol = 2)
Let us take a closer look at the Age variable
# Frequency of each age. The bars use a constant fill, so no fill aesthetic is
# mapped (the former fill = Age mapping was overridden by the constant and had
# no effect).
ggplot(data = pid, aes(x = Age)) +
  geom_bar(fill = "blue", alpha = 0.6)
table(pid$Age)
##
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## 63 72 38 46 48 33 32 35 29 21 24 16 17 14 10 16 19 16 12 13 22 18 13 8 15 13
## 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 72 81
## 6 5 5 8 8 8 5 6 4 3 5 7 3 5 2 4 4 1 3 4 3 1 2 1 1 1
Let us create an Age Range Category
# Bucket Age into ordered range categories.
# case_when() evaluates the conditions top to bottom and uses the first match,
# so each upper bound implicitly excludes the ranges above it. This replaces a
# seven-level nested ifelse() while yielding the same labels; NA ages stay NA.
pid$Age_Cat <- case_when(
  is.na(pid$Age) ~ NA_character_,
  pid$Age < 21 ~ "<21",
  pid$Age <= 25 ~ "21-25",
  pid$Age <= 30 ~ "25-30",
  pid$Age <= 35 ~ "30-35",
  pid$Age <= 40 ~ "35-40",
  pid$Age <= 50 ~ "40-50",
  pid$Age <= 60 ~ "50-60",
  TRUE ~ ">60"
)
# Fix the level order so tables and plots list the ranges from youngest to
# oldest rather than alphabetically.
pid$Age_Cat <- factor(
  pid$Age_Cat,
  levels = c("<21", "21-25", "25-30", "30-35", "35-40", "40-50", "50-60", ">60")
)
table(pid$Age_Cat)
##
## <21 21-25 25-30 30-35 35-40 40-50 50-60 >60
## 0 267 150 81 76 113 54 27
# Number of observations in each age range.
ggplot(data = pid, mapping = aes(x = Age_Cat)) +
  geom_bar(fill = "steelblue")
The graphs above do not show a clear distinction between the age of
those diagnosed with diabetes and the age of those not diagnosed with
diabetes.
Next we look at combinations of certain predictive variables.
# Scatter plot: number of pregnancies against age, points coloured by outcome.
p1 <- ggplot(data = pid, mapping = aes(x = Age, y = Pregnancy)) +
  geom_point(mapping = aes(colour = Outcome)) +
  labs(title = "Pregnancies and Age vs Diabetes") +
  theme(legend.position = "bottom")
# Scatter plot: glucose against insulin, points coloured by outcome.
p2 <- ggplot(data = pid, mapping = aes(x = Insulin, y = Glucose)) +
  geom_point(mapping = aes(colour = Outcome)) +
  labs(title = "Insulin and Glucose vs Diabetes") +
  theme(legend.position = "bottom")
# Show the two scatter plots side by side.
grid.arrange(p1, p2, ncol = 2)
The graphs above show that:
# Scatter plot: blood pressure against BMI, points coloured by outcome.
p1 <- ggplot(
  data = pid,
  mapping = aes(x = Body_Mass_Index, y = Blood_Pressure)
) +
  geom_point(mapping = aes(colour = Outcome)) +
  labs(title = "BMI and BP vs Diabetes") +
  theme(legend.position = "bottom")
# Scatter plot: skin thickness against BMI, points coloured by outcome.
p2 <- ggplot(
  data = pid,
  mapping = aes(x = Body_Mass_Index, y = Skin_Thickness)
) +
  geom_point(mapping = aes(colour = Outcome)) +
  labs(title = "BMI and Skin Thickness vs Diabetes") +
  theme(legend.position = "bottom")
# Show the two scatter plots side by side.
grid.arrange(p1, p2, ncol = 2)
The graphs above show that: