# Packages ----
# One library() call per line: several calls on a single line without a
# separator do not parse.
library(readr)
library(tidyverse)
library(dplyr)
library(knitr)
library(ggplot2)
library(caret)
library(grid)
library(gridExtra)
library(corrplot)

# Load data ----
# The path is assembled with file.path() so it uses forward slashes; a
# backslash inside a regular R string starts an escape sequence, so
# "C:\Users\..." is not a valid string literal.
# NOTE(review): this points into a temporary folder created for an opened zip
# archive on one machine -- consider keeping the CSV next to the script.
diabetes_path <- file.path(
  "C:/Users/pdbro/AppData/Local/Temp",
  "Temp2_archive (3).zip",
  "Prima Diabetes.csv"
)
Diabetes <- read.csv(diabetes_path)

# Inspect data ----
head(Diabetes)
str(Diabetes)

# Number of missing values per column. (Printing is.na(Diabetes) directly
# emits one logical per cell, which is unreadable for more than a few rows.)
colSums(is.na(Diabetes))

summary(Diabetes)
# Histograms ----

# Build a histogram of one column in which the y axis is the percentage of
# binned observations falling in each bin.
#
#   data      data frame holding the column
#   column    name of the column to plot, as a string
#   title     plot title
#   binwidth  bin width passed to geom_histogram()
#   fill      fill colour of the bars
#
# Returns a ggplot object.
percent_histogram <- function(data, column, title, binwidth, fill) {
  ggplot(data, aes(x = .data[[column]])) +
    ggtitle(title) +
    geom_histogram(
      # after_stat() replaces the deprecated ..count.. notation.
      aes(y = after_stat(100 * count / sum(count))),
      binwidth = binwidth,
      colour = "black",
      fill = fill
    ) +
    # Label the x axis with the column name, as a bare aes(x = <column>) would.
    xlab(column) +
    ylab("Percentage")
}

p1 <- percent_histogram(
  Diabetes, "Pregnancies", "Number of times pregnant",
  binwidth = 1, fill = "blue"
)
p2 <- percent_histogram(
  Diabetes, "BloodPressure", "Blood Pressure",
  binwidth = 2, fill = "green"
)
p3 <- percent_histogram(
  Diabetes, "BMI", "Body Mass Index",
  binwidth = 1, fill = "yellow"
)
p4 <- percent_histogram(
  Diabetes, "Age", "Age",
  binwidth = 1, fill = "lightblue"
)

# Lay the four plots out in a 2 x 2 grid, then draw a thin black frame over
# the full drawing area.
grid.arrange(p1, p2, p3, p4, ncol = 2)
grid.rect(width = 1, height = 1, gp = gpar(lwd = 1, col = "black", fill = NA))
# Correlation matrix ----
# Pairwise correlations (cor() defaults) between the first eight columns of
# Diabetes, rounded to one decimal place.
# NOTE(review): presumably columns 1-8 are the predictors and the remaining
# column is Outcome -- confirm against the CSV.
Diabetes_cor <- round(cor(Diabetes[1:8]), 1)

# Print the rounded matrix.
Diabetes_cor
# Box plots by outcome ----

# NOTE(review): the box plots below no longer need attach(); they take their
# columns from `data = Diabetes`. The call is kept only because code beyond
# this section may still refer to bare column names -- remove it once that is
# confirmed not to be the case.
attach(Diabetes)

# 2 x 4 plotting grid. The layout is left in effect afterwards, as before.
par(mfrow = c(2, 4))

boxplot(
  Pregnancies ~ Outcome,
  data = Diabetes,
  main = "No. of Pregnancies vs. Diabetes",
  xlab = "Outcome",
  ylab = "Pregnancies",
  col = "red"
)
boxplot(
  BloodPressure ~ Outcome,
  data = Diabetes,
  main = "Blood Pressure vs. Diabetes",
  xlab = "Outcome",
  ylab = "Blood Pressure",
  col = "green"
)
boxplot(
  BMI ~ Outcome,
  data = Diabetes,
  main = "BMI vs. Diabetes",
  xlab = "Outcome",
  ylab = "BMI",
  col = "purple"
)
boxplot(
  Age ~ Outcome,
  data = Diabetes,
  main = "Age vs. Diabetes",
  xlab = "Outcome",
  ylab = "Age",
  col = "lightblue"
)

# Solid border around the whole device region (outside all panels).
box(which = "outer", lty = "solid")