# Paul Brown
# Final Assignment:
## Load Libraries and Read Data from CSV
library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ dplyr 1.0.9
## ✔ tibble 3.1.7 ✔ stringr 1.4.0
## ✔ tidyr 1.2.0 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(knitr)
library(ggplot2)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(grid)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(corrplot)
## corrplot 0.92 loaded
# Read the diabetes data set into a data frame.
# NOTE(review): the path sits under AppData\Local\Temp inside a folder named
# after a zip archive -- presumably a transient extraction; confirm the file
# still exists there before relying on this script being re-runnable.
diabetes_csv <- "C:\\Users\\pdbro\\AppData\\Local\\Temp\\Temp2_archive (3).zip\\Prima Diabetes.csv"
Diabetes <- read.csv(file = diabetes_csv)
# Data Cleaning
# First six rows, to eyeball the columns and typical values.
head(Diabetes, n = 6L)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
# Column types and dimensions.
str(object = Diabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
# Confirm the object is a plain data.frame.
class(x = Diabetes)
## [1] "data.frame"
# Scan the measurement columns for non-positive (<= 0) readings, which this
# script treats as missing data.
# Outcome and Pregnancies are left out of the scan. The earlier version
# excluded 'Pregnancy', which matches no column name, so Pregnancies stayed in
# the scan and every zero-pregnancy row was counted as having missing data.
missing_data <- Diabetes[, setdiff(names(Diabetes), c("Outcome", "Pregnancies")),
                         drop = FALSE]
# Logical matrix flagging each non-positive cell; built once and reused below
# (replaces two apply() passes over the data frame).
is_nonpositive <- missing_data <= 0
# Number of non-positive values in each scanned feature.
features_miss_num <- colSums(is_nonpositive)
# Names of the features that contain at least one non-positive value.
features_miss <- names(missing_data)[features_miss_num > 0]
# TRUE for every row that has at least one non-positive value.
rows_miss <- rowSums(is_nonpositive) >= 1
# Number of rows affected.
sum(rows_miss)
## NOTE: the previously recorded output ([1] 432) was produced while
## Pregnancies was still included in the scan; re-run to refresh this value.
# Five-number summaries (plus mean), printed three columns at a time so the
# output fits the page width.
# Columns 1-3: Pregnancies, Glucose, BloodPressure.
summary(Diabetes[, 1:3])
## Pregnancies Glucose BloodPressure
## Min. : 0.000 Min. : 0.0 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00
## Median : 3.000 Median :117.0 Median : 72.00
## Mean : 3.845 Mean :120.9 Mean : 69.11
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00
## Max. :17.000 Max. :199.0 Max. :122.00
# Columns 4-6: SkinThickness, Insulin, BMI.
summary(Diabetes[, 4:6])
## SkinThickness Insulin BMI
## Min. : 0.00 Min. : 0.0 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.:27.30
## Median :23.00 Median : 30.5 Median :32.00
## Mean :20.54 Mean : 79.8 Mean :31.99
## 3rd Qu.:32.00 3rd Qu.:127.2 3rd Qu.:36.60
## Max. :99.00 Max. :846.0 Max. :67.10
# Columns 7-8: DiabetesPedigreeFunction, Age.
summary(Diabetes[, 7:8])
## DiabetesPedigreeFunction Age
## Min. :0.0780 Min. :21.00
## 1st Qu.:0.2437 1st Qu.:24.00
## Median :0.3725 Median :29.00
## Mean :0.4719 Mean :33.24
## 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :2.4200 Max. :81.00
# Convert the outcome column to a factor so it is handled as a category,
# then draw one bar per outcome level, coloured by level.
Diabetes$Outcome <- factor(Diabetes[["Outcome"]])
ggplot(data = Diabetes, mapping = aes(x = Outcome, fill = Outcome)) +
  geom_bar() +
  labs(title = "Distribution of Outcome variable")

# Exploratory Analysis
summary(Diabetes)

# Build a histogram layer whose bar heights are percentages of all plotted
# observations rather than raw counts.
# Uses after_stat() in place of the `..count..` notation, which ggplot2
# deprecated in 3.4.0; after_stat() is available from ggplot2 3.3.0 onwards.
#   binwidth: width of each bin, in units of the x variable.
#   fill:     fill colour of the bars.
# Returns a ggplot2 layer to be added to a plot with `+`.
percent_histogram <- function(binwidth, fill) {
  geom_histogram(
    aes(y = after_stat(100 * count / sum(count))),
    binwidth = binwidth, colour = "black", fill = fill
  )
}

p1 <- ggplot(Diabetes, aes(x = Pregnancies)) +
  ggtitle("Number of times pregnant") +
  percent_histogram(binwidth = 1, fill = "blue") +
  ylab("Percentage")
p2 <- ggplot(Diabetes, aes(x = BloodPressure)) +
  ggtitle("Blood Pressure") +
  percent_histogram(binwidth = 2, fill = "green") +
  ylab("Percentage")
p3 <- ggplot(Diabetes, aes(x = BMI)) +
  ggtitle("Body Mass Index") +
  percent_histogram(binwidth = 1, fill = "yellow") +
  ylab("Percentage")
p4 <- ggplot(Diabetes, aes(x = Age)) +
  ggtitle("Age") +
  percent_histogram(binwidth = 1, fill = "lightblue") +
  ylab("Percentage")

# Lay the four histograms out on a 2-column grid.
grid.arrange(p1, p2, p3, p4, ncol = 2)
# Draw a thin black frame around the whole arranged figure.
grid.rect(width = 1, height = 1, gp = gpar(lwd = 1, col = "black", fill = NA))

# Scatterplot matrix of every column against every other column.
pairs(x = Diabetes)

# Correlation Matrix
# Pairwise correlations of the first eight columns, rounded to one decimal
# place for a compact printout.
Diabetes_cor <- round(cor(Diabetes[, 1:8]), digits = 1)
Diabetes_cor
## Pregnancies Glucose BloodPressure SkinThickness
## Pregnancies 1.0 0.1 0.1 -0.1
## Glucose 0.1 1.0 0.2 0.1
## BloodPressure 0.1 0.2 1.0 0.2
## SkinThickness -0.1 0.1 0.2 1.0
## Insulin -0.1 0.3 0.1 0.4
## BMI 0.0 0.2 0.3 0.4
## DiabetesPedigreeFunction 0.0 0.1 0.0 0.2
## Age 0.5 0.3 0.2 -0.1
## Insulin BMI DiabetesPedigreeFunction Age
## Pregnancies -0.1 0.0 0.0 0.5
## Glucose 0.3 0.2 0.1 0.3
## BloodPressure 0.1 0.3 0.0 0.2
## SkinThickness 0.4 0.4 0.2 -0.1
## Insulin 1.0 0.2 0.2 0.0
## BMI 0.2 1.0 0.1 0.0
## DiabetesPedigreeFunction 0.2 0.1 1.0 0.0
## Age 0.0 0.0 0.0 1.0
# Flag the numeric columns and compute their unrounded correlation matrix
# for plotting.
numeric.var <- vapply(Diabetes, is.numeric, logical(1))
corr.matrix <- cor(Diabetes[, numeric.var])
# Correlation plot with variables ordered by hierarchical clustering.
corrplot(
  corr.matrix,
  main = "\n\nCorrelation Plot for Numerical Variables",
  order = "hclust",
  tl.col = "black",
  tl.srt = 45,
  tl.cex = 0.8,
  cl.cex = 0.8
)
# Solid frame around the outer edge of the figure.
box(which = "outer", lty = "solid")

# Boxplots of Diabetes Outcomes
# Each boxplot now takes its columns from `data = Diabetes` directly, so the
# plots no longer depend on the search path (attached columns are masked by any
# same-named object in the global environment).
# NOTE(review): attach() is kept only in case code further down the script
# refers to columns by bare name; remove it once nothing relies on it.
attach(Diabetes)
# 2 x 4 panel layout (unchanged); the four plots below fill the first row.
par(mfrow = c(2, 4))
boxplot(Pregnancies ~ Outcome, data = Diabetes,
        main = "No. of Pregnancies vs. Diabetes",
        xlab = "Outcome", ylab = "Pregnancies", col = "red")
boxplot(BloodPressure ~ Outcome, data = Diabetes,
        main = "Blood Pressure vs. Diabetes",
        xlab = "Outcome", ylab = "Blood Pressure", col = "green")
boxplot(BMI ~ Outcome, data = Diabetes,
        main = "BMI vs. Diabetes",
        xlab = "Outcome", ylab = "BMI", col = "purple")
boxplot(Age ~ Outcome, data = Diabetes,
        main = "Age vs. Diabetes",
        xlab = "Outcome", ylab = "Age", col = "lightblue")
# Solid frame around the outer edge of the figure.
box(which = "outer", lty = "solid")
