# Paul Brown
# Final Assignment: 
## Load Libraries and Read Data from CSV
library(readr) 
library(tidyverse) 
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ dplyr   1.0.9
## ✔ tibble  3.1.7     ✔ stringr 1.4.0
## ✔ tidyr   1.2.0     ✔ forcats 0.5.1
## ✔ purrr   0.3.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr) 
library(knitr)
library(ggplot2)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(grid)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(corrplot)
## corrplot 0.92 loaded
# Location of the diabetes CSV. The default is the original author's local
# path; set the DIABETES_CSV environment variable to run the script on another
# machine without editing the source.
diabetes_csv <- Sys.getenv(
  "DIABETES_CSV",
  unset = "C:\\Users\\pdbro\\AppData\\Local\\Temp\\Temp2_archive (3).zip\\Prima Diabetes.csv"
)

# Fail early with an actionable message instead of read.csv()'s generic
# "cannot open file" error.
if (!file.exists(diabetes_csv)) {
  stop(
    "Diabetes CSV not found at '", diabetes_csv,
    "'. Set the DIABETES_CSV environment variable to its location.",
    call. = FALSE
  )
}

# Read the data set into a base data.frame (one row per observation).
Diabetes <- read.csv(diabetes_csv)
# Data Cleaning

# Preview the first six rows of the freshly loaded data.
head(Diabetes, n = 6L)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0
# List every column with its storage type and first few values.
str(object = Diabetes)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : int  1 0 1 0 1 0 1 0 1 1 ...
# Confirm the loaded object is a plain data frame.
class(x = Diabetes)
## [1] "data.frame"
# Missing measurements in this data set are recorded as 0 rather than NA
# (see the zero minimums for Glucose, BloodPressure, SkinThickness, Insulin
# and BMI in the summaries). Check every column except the response (Outcome)
# and Pregnancies, where 0 is a legitimate count.
# The original exclusion list named 'Pregnancy', which matches no column, so
# Pregnancies was silently kept and its zeros were counted as missing.
missing_data <- Diabetes[
  , setdiff(names(Diabetes), c("Outcome", "Pregnancies")),
  drop = FALSE
]

# Logical matrix flagging every non-positive (i.e. missing) cell; computed once
# and reused for both the per-column and per-row tallies.
is_missing <- missing_data <= 0

# Number of missing values per feature, and the names of affected features.
features_miss_num <- colSums(is_missing)
features_miss <- names(missing_data)[features_miss_num > 0]

# TRUE for each row that has at least one missing measurement.
rows_miss <- rowSums(is_missing) >= 1
sum(rows_miss)
## NOTE(review): the previously recorded output here was `[1] 432`, produced
## while Pregnancies was still (unintentionally) included; re-run to refresh.
# Summary statistics for the eight predictors, printed a few columns at a
# time (presumably to keep each printed table narrow).
summary(Diabetes[, c("Pregnancies", "Glucose", "BloodPressure")])
##   Pregnancies        Glucose      BloodPressure   
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00  
##  Median : 3.000   Median :117.0   Median : 72.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00
summary(Diabetes[, c("SkinThickness", "Insulin", "BMI")])
##  SkinThickness      Insulin           BMI       
##  Min.   : 0.00   Min.   :  0.0   Min.   : 0.00  
##  1st Qu.: 0.00   1st Qu.:  0.0   1st Qu.:27.30  
##  Median :23.00   Median : 30.5   Median :32.00  
##  Mean   :20.54   Mean   : 79.8   Mean   :31.99  
##  3rd Qu.:32.00   3rd Qu.:127.2   3rd Qu.:36.60  
##  Max.   :99.00   Max.   :846.0   Max.   :67.10
summary(Diabetes[, c("DiabetesPedigreeFunction", "Age")])
##  DiabetesPedigreeFunction      Age       
##  Min.   :0.0780           Min.   :21.00  
##  1st Qu.:0.2437           1st Qu.:24.00  
##  Median :0.3725           Median :29.00  
##  Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :2.4200           Max.   :81.00
# Treat the 0/1 response as a categorical variable from here on.
Diabetes[["Outcome"]] <- factor(Diabetes[["Outcome"]])

# Bar chart of how many observations fall in each Outcome class.
ggplot(data = Diabetes, mapping = aes(x = Outcome, fill = Outcome)) +
  geom_bar() +
  labs(title = "Distribution of Outcome variable")

# Exploratory Analysis

# Summary statistics for every column of the data frame.
summary(Diabetes)

# Shared y aesthetic: each bin's share of all observations, in percent.
# after_stat() replaces the `..count..` notation, which is deprecated in newer
# ggplot2 releases; it is available in the ggplot2 version this script
# attaches (3.3.6).
percent_of_total <- aes(y = after_stat(100 * count / sum(count)))

# One percentage histogram per selected predictor.
p1 <- ggplot(Diabetes, aes(x = Pregnancies)) +
  ggtitle("Number of times pregnant") +
  geom_histogram(percent_of_total, binwidth = 1, colour = "black", fill = "blue") +
  ylab("Percentage")

p2 <- ggplot(Diabetes, aes(x = BloodPressure)) +
  ggtitle("Blood Pressure") +
  geom_histogram(percent_of_total, binwidth = 2, colour = "black", fill = "green") +
  ylab("Percentage")

p3 <- ggplot(Diabetes, aes(x = BMI)) +
  ggtitle("Body Mass Index") +
  geom_histogram(percent_of_total, binwidth = 1, colour = "black", fill = "yellow") +
  ylab("Percentage")

p4 <- ggplot(Diabetes, aes(x = Age)) +
  ggtitle("Age") +
  geom_histogram(percent_of_total, binwidth = 1, colour = "black", fill = "lightblue") +
  ylab("Percentage")

# Lay the four histograms out in a 2 x 2 grid, then outline the whole page
# with an unfilled black rectangle.
grid.arrange(p1, p2, p3, p4, ncol = 2)
grid.rect(width = 1, height = 1, gp = gpar(lwd = 1, col = "black", fill = NA))

# plotting all predictors
# Scatterplot matrix of every column against every other column.
pairs(x = Diabetes)

# Pairwise correlations between the first eight columns (the predictors),
# rounded to one decimal place for printing.
Diabetes_cor <- round(cor(Diabetes[, seq_len(8)]), digits = 1)
Diabetes_cor
##                          Pregnancies Glucose BloodPressure SkinThickness
## Pregnancies                      1.0     0.1           0.1          -0.1
## Glucose                          0.1     1.0           0.2           0.1
## BloodPressure                    0.1     0.2           1.0           0.2
## SkinThickness                   -0.1     0.1           0.2           1.0
## Insulin                         -0.1     0.3           0.1           0.4
## BMI                              0.0     0.2           0.3           0.4
## DiabetesPedigreeFunction         0.0     0.1           0.0           0.2
## Age                              0.5     0.3           0.2          -0.1
##                          Insulin BMI DiabetesPedigreeFunction  Age
## Pregnancies                 -0.1 0.0                      0.0  0.5
## Glucose                      0.3 0.2                      0.1  0.3
## BloodPressure                0.1 0.3                      0.0  0.2
## SkinThickness                0.4 0.4                      0.2 -0.1
## Insulin                      1.0 0.2                      0.2  0.0
## BMI                          0.2 1.0                      0.1  0.0
## DiabetesPedigreeFunction     0.2 0.1                      1.0  0.0
## Age                          0.0 0.0                      0.0  1.0
# Flag the numeric columns. vapply() guarantees a logical vector whatever the
# input looks like, whereas sapply()'s return type depends on its input.
numeric.var <- vapply(Diabetes, is.numeric, logical(1))

# Correlation matrix of the numeric columns; drop = FALSE keeps the subset a
# data frame even if only one column qualifies.
corr.matrix <- cor(Diabetes[, numeric.var, drop = FALSE])

# Correlation plot with variables reordered by hierarchical clustering, then a
# solid border around the outer edge of the figure.
corrplot(
  corr.matrix,
  main = "\n\nCorrelation Plot for Numerical Variables",
  order = "hclust",
  tl.col = "black",
  tl.srt = 45,
  tl.cex = 0.8,
  cl.cex = 0.8
)
box(which = "outer", lty = "solid")

# Boxplots of Predictors by Diabetes Outcome

# NOTE(review): attach() is kept only in case later parts of the script rely
# on bare column names; the boxplots below no longer depend on it because they
# pass `data = Diabetes` explicitly. Remove this call once that is confirmed.
attach(Diabetes)

# Plotting grid of 2 rows x 4 columns (room for eight panels; four are drawn
# in this section).
par(mfrow = c(2, 4))

# Distribution of each predictor split by diabetes Outcome.
boxplot(Pregnancies ~ Outcome, data = Diabetes,
        main = "No. of Pregnancies vs. Diabetes",
        xlab = "Outcome", ylab = "Pregnancies", col = "red")

boxplot(BloodPressure ~ Outcome, data = Diabetes,
        main = "Blood Pressure vs. Diabetes",
        xlab = "Outcome", ylab = "Blood Pressure", col = "green")

boxplot(BMI ~ Outcome, data = Diabetes,
        main = "BMI vs. Diabetes",
        xlab = "Outcome", ylab = "BMI", col = "purple")

boxplot(Age ~ Outcome, data = Diabetes,
        main = "Age vs. Diabetes",
        xlab = "Outcome", ylab = "Age", col = "lightblue")

# Solid border around the outer edge of the whole figure.
box(which = "outer", lty = "solid")