Paul Brown

Final Assignment:

Load Libraries and Read Data from CSV

library(readr)
library(tidyverse)
library(dplyr)
library(knitr)
library(ggplot2)
library(caret)
library(grid)
library(gridExtra)
library(corrplot)

# Read the diabetes CSV into the data frame `Diabetes`.
#
# The path is written with forward slashes and plain ASCII double quotes:
# inside an R string a bare backslash starts an escape sequence (sequences
# such as "\U" or "\p" are parse errors), and typographic quotes are not
# string delimiters.
#
# NOTE(review): the path points into what looks like a temporary folder
# created while browsing a zip archive; it may not exist on a later run.
# Consider extracting the CSV next to this script and using a relative path.
Diabetes <- read.csv(
  "C:/Users/pdbro/AppData/Local/Temp/Temp2_archive (3).zip/Prima Diabetes.csv"
)

Data Cleaning

# First rows, to confirm the columns were read as expected.
head(Diabetes)

# Column types and dimensions.
str(Diabetes)

# Number of NA values per column. Printing is.na(Diabetes) directly emits a
# logical matrix with one cell per value, which is too large to read; the
# per-column totals carry the same information in one line of output.
colSums(is.na(Diabetes))

# Recode zeros in Age as missing (NA) so they are excluded from later
# summaries and plots.
Diabetes$Age[Diabetes$Age == 0] <- NA

# Density histogram of Age, outlined by diabetes outcome.
# - The grouping column is `Outcome` (the column the boxplots further down
#   use); the data frame has no `test` column. It is wrapped in factor() so
#   ggplot2 treats it as discrete groups rather than a continuous scale.
# - after_stat(density) replaces the deprecated `..density..` notation
#   (requires ggplot2 >= 3.3.0).
# NOTE(review): binwidth = 30 yields only a few bars over the range of Age;
# confirm this width is intended.
ggplot(Diabetes, aes(x = Age, color = factor(Outcome))) +
  geom_histogram(
    aes(y = after_stat(density)),
    position = "dodge",
    binwidth = 30
  ) +
  labs(color = "Outcome")

# Recode zeros in BMI as missing (NA) so they are excluded from later
# summaries and plots.
Diabetes$BMI[Diabetes$BMI == 0] <- NA

# Density histogram of BMI, outlined by diabetes outcome.
# - The grouping column is `Outcome` (the column the boxplots further down
#   use); the data frame has no `test` column. It is wrapped in factor() so
#   ggplot2 treats it as discrete groups rather than a continuous scale.
# - after_stat(density) replaces the deprecated `..density..` notation
#   (requires ggplot2 >= 3.3.0).
# NOTE(review): binwidth = 30 yields only a few bars over the range of BMI;
# confirm this width is intended.
ggplot(Diabetes, aes(x = BMI, color = factor(Outcome))) +
  geom_histogram(
    aes(y = after_stat(density)),
    position = "dodge",
    binwidth = 30
  ) +
  labs(color = "Outcome")

# Recode zeros in BloodPressure as missing (NA).
# The previous version assigned NA to BMI on the rows where BloodPressure was
# zero, which discarded valid BMI values and left the zero blood-pressure
# readings untouched; the column being tested is the one recoded here.
Diabetes$BloodPressure[Diabetes$BloodPressure == 0] <- NA

# Density histogram of BloodPressure (the column just cleaned), outlined by
# diabetes outcome.
# - The grouping column is `Outcome` (the column the boxplots further down
#   use); the data frame has no `test` column. It is wrapped in factor() so
#   ggplot2 treats it as discrete groups rather than a continuous scale.
# - after_stat(density) replaces the deprecated `..density..` notation
#   (requires ggplot2 >= 3.3.0).
# NOTE(review): binwidth = 30 yields only a few bars; confirm this width is
# intended.
ggplot(Diabetes, aes(x = BloodPressure, color = factor(Outcome))) +
  geom_histogram(
    aes(y = after_stat(density)),
    position = "dodge",
    binwidth = 30
  ) +
  labs(color = "Outcome")

Exploratory Analysis

# Per-column summary statistics of the data frame.
summary(object = Diabetes)

## Histograms

# Four histograms whose y axis is the percentage of observations per bin
# (bin count divided by the total count, times 100).
# after_stat(...) replaces the deprecated `..count..` notation
# (requires ggplot2 >= 3.3.0).
p1 <- ggplot(Diabetes, aes(x = Pregnancies)) +
  ggtitle("Number of times pregnant") +
  geom_histogram(
    aes(y = after_stat(100 * count / sum(count))),
    binwidth = 1, colour = "black", fill = "blue"
  ) +
  ylab("Percentage")

p2 <- ggplot(Diabetes, aes(x = BloodPressure)) +
  ggtitle("Blood Pressure") +
  geom_histogram(
    aes(y = after_stat(100 * count / sum(count))),
    binwidth = 2, colour = "black", fill = "green"
  ) +
  ylab("Percentage")

p3 <- ggplot(Diabetes, aes(x = BMI)) +
  ggtitle("Body Mass Index") +
  geom_histogram(
    aes(y = after_stat(100 * count / sum(count))),
    binwidth = 1, colour = "black", fill = "yellow"
  ) +
  ylab("Percentage")

p4 <- ggplot(Diabetes, aes(x = Age)) +
  ggtitle("Age") +
  geom_histogram(
    aes(y = after_stat(100 * count / sum(count))),
    binwidth = 1, colour = "black", fill = "lightblue"
  ) +
  ylab("Percentage")

# Lay the four panels out in two columns, then draw a thin black frame
# around the whole page.
grid.arrange(p1, p2, p3, p4, ncol = 2)
grid.rect(width = 1, height = 1, gp = gpar(lwd = 1, col = "black", fill = NA))

# Correlation Matrix

# Pairwise correlations of the first eight columns, rounded to one decimal.
# use = "pairwise.complete.obs" computes each coefficient from the rows where
# both columns are present; with the default (use = "everything") any column
# containing an NA produces NA for every coefficient that involves it.
# NOTE(review): columns 1:8 are presumably the predictors with the outcome in
# a later column — confirm against str(Diabetes).
Diabetes_cor <- round(
  cor(Diabetes[1:8], use = "pairwise.complete.obs"),
  1
)
Diabetes_cor

Boxplots by Diabetes Outcome

# NOTE(review): attach() is kept only because code outside this section may
# refer to the columns by bare name. It exposes a copy of the columns as they
# are at this point, so later changes to `Diabetes` are not reflected. The
# boxplots below no longer depend on it; prefer removing it once nothing else
# does.
attach(Diabetes)

# Four boxplots in a 2 x 2 grid (the previous 2 x 4 grid left half of the
# page empty). par() returns the previous settings, which are restored at the
# end so later plots are not forced into this layout.
old_par <- par(mfrow = c(2, 2))

# Each plot compares one measurement between the levels of Outcome; columns
# are taken explicitly from `Diabetes` through the data argument.
boxplot(Pregnancies ~ Outcome, data = Diabetes,
        main = "No. of Pregnancies vs. Diabetes",
        xlab = "Outcome", ylab = "Pregnancies", col = "red")

boxplot(BloodPressure ~ Outcome, data = Diabetes,
        main = "Blood Pressure vs. Diabetes",
        xlab = "Outcome", ylab = "Blood Pressure", col = "green")

boxplot(BMI ~ Outcome, data = Diabetes,
        main = "BMI vs. Diabetes",
        xlab = "Outcome", ylab = "BMI", col = "purple")

boxplot(Age ~ Outcome, data = Diabetes,
        main = "Age vs. Diabetes",
        xlab = "Outcome", ylab = "Age", col = "lightblue")

# Solid frame around the whole figure.
box(which = "outer", lty = "solid")

par(old_par)