library(readxl)
## Warning: package 'readxl' was built under R version 4.1.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.2
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.2
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.1.2
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.1.2
library(InformationValue)
## Warning: package 'InformationValue' was built under R version 4.1.2
##
## Attaching package: 'InformationValue'
## The following object is masked from 'package:Metrics':
##
## precision
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.1.2
df = read_excel("labW9.xlsx")
head(df,5)
colSums(is.na(df))
## Pregnancies Glucose BloodPressure
## 0 0 0
## SkinThickness Insulin BMI
## 0 0 0
## DiabetesPedigreeFunction Age Outcome
## 0 0 0
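#### A quick extra check (not part of the original lab output): in this diabetes data set, zeros in several measurement columns typically stand in for missing readings, so counting them complements the NA check above.
# Count zero entries in columns where a value of 0 is not physiologically plausible
zero_cols <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI")
colSums(df[zero_cols] == 0)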
par(mfrow=c(2, 3))
boxplot(df$Pregnancies~df$Outcome, main="Pregnancies vs. Diabetes", xlab="Outcome", ylab="Pregnancies")
boxplot(df$Glucose~df$Outcome, main="Glucose vs. Diabetes", xlab="Outcome", ylab="Glucose")
boxplot(df$SkinThickness~df$Outcome, main="SkinThickness vs. Diabetes", xlab="Outcome", ylab="SkinThickness")
boxplot(df$BMI~df$Outcome, main="BMI vs. Diabetes", xlab="Outcome", ylab="BMI")
boxplot(df$DiabetesPedigreeFunction~df$Outcome, main="Diabetes Pedigree Function vs. Diabetes", xlab="Outcome", ylab="DiabetesPedigreeFunction")
boxplot(df$Age~df$Outcome, main="Age vs. Diabetes", xlab="Outcome", ylab="Age")
#### From the box plots, we can see that several outliers are present in this data set.
#### We can also see that people with a higher number of pregnancies, higher glucose, skin thickness, BMI and age are more prone to developing diabetes.
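#### ggplot2 is already loaded above, so the same comparison can also be drawn as a single faceted plot. The sketch below is not part of the original lab and assumes the tidyr package is installed for pivot_longer().
library(tidyr)
# Reshape to long format (one row per observation/variable pair), then facet the boxplots by variable
df %>%
  pivot_longer(-Outcome, names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = factor(Outcome), y = value)) +
  geom_boxplot() +
  facet_wrap(~ variable, scales = "free_y") +
  labs(x = "Outcome", y = NULL)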
par(mfrow=c(2, 3))
hist(df$Pregnancies, breaks = 10, main = "No. of Pregnancies", xlab = "Pregnancies")
hist(df$Glucose, breaks = 5, main = "Glucose", xlab = "Glucose")
hist(df$BloodPressure, breaks = 5, main = "Blood Pressure", xlab = "Blood Pressure")
hist(df$SkinThickness, breaks = 10, main = "Skin Thickness", xlab = "Skin Thickness")
hist(df$Insulin, breaks = 10, main = "Insulin", xlab = "Insulin")
hist(df$Age, breaks = 10, main = "Age", xlab = "Age")
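#### ggcorrplot is loaded above but not otherwise used; a correlation matrix of the predictors (a sketch, not part of the original output) is a quick way to check how the variables relate before fitting the model below.
# Pairwise correlations of the numeric columns, excluding the Outcome label
corr_mat <- round(cor(df[, setdiff(names(df), "Outcome")]), 2)
ggcorrplot(corr_mat, lab = TRUE)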
df$Outcome<-as.factor(df$Outcome)
require(caTools)
split_index <- sample.split(df$Outcome, SplitRatio = 0.7)
training_set <- subset(df,split_index == TRUE)
test_set <- subset(df, split_index == FALSE)
nrow(test_set)
## [1] 230
nrow(training_set)
## [1] 538
head(test_set,5)
head(training_set,5)
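#### sample.split() draws a random split, so the exact row counts above will vary between runs. A reproducible version (a sketch; the seed value is arbitrary) fixes the seed before splitting:
set.seed(123)   # arbitrary seed, for illustration only
split_index <- sample.split(df$Outcome, SplitRatio = 0.7)
training_set <- subset(df, split_index == TRUE)
test_set <- subset(df, split_index == FALSE)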
model <- glm(Outcome ~ . - SkinThickness - Insulin - Age, data = training_set, family = "binomial")
summary(model)
##
## Call:
## glm(formula = Outcome ~ . - SkinThickness - Insulin - Age, family = "binomial",
## data = training_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5770 -0.7440 -0.4085 0.7340 2.8120
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.625060 0.796868 -9.569 < 2e-16 ***
## Pregnancies 0.172893 0.033311 5.190 2.10e-07 ***
## Glucose 0.034501 0.004060 8.498 < 2e-16 ***
## BloodPressure -0.015273 0.005963 -2.561 0.0104 *
## BMI 0.083164 0.016841 4.938 7.88e-07 ***
## DiabetesPedigreeFunction 0.617220 0.336629 1.834 0.0667 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 696.28 on 537 degrees of freedom
## Residual deviance: 508.88 on 532 degrees of freedom
## AIC: 520.88
##
## Number of Fisher Scoring iterations: 5
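#### The coefficients above are on the log-odds scale; exponentiating them gives odds ratios, which are easier to interpret (a sketch, not part of the original output). For example, exp(0.1729) is roughly 1.19, so each additional pregnancy is associated with about 19% higher odds of diabetes, holding the other predictors fixed.
# Odds ratio per one-unit increase in each predictor, with 95% confidence intervals
exp(cbind(OddsRatio = coef(model), confint(model)))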
train <- cbind(training_set, Prob = predict(model, type = "response"))
head(train)
train_table <- table(Predicted = train$Prob>0.5, Actual = train$Outcome)
train_table
## Actual
## Predicted 0 1
## FALSE 306 80
## TRUE 44 108
accuracy.train <- round(sum(diag(train_table))/sum(train_table),2)
sprintf("Accuracy-%s",accuracy.train)
## [1] "Accuracy-0.77"
precision.train <- round(train_table[2,2]/sum(train_table[2,]),2)
recall.train <- round(train_table[2,2]/sum(train_table[,2]),2)
sprintf("Precision-%s",precision.train)
## [1] "Precision-0.71"
sprintf("Recall-%s",recall.train)
## [1] "Recall-0.57"