Ahmad Faiz Aslam bin Mohd Abbas

library(readxl)
## Warning: package 'readxl' was built under R version 4.1.2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.2
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.2
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.1.2
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.1.2
library(InformationValue)
## Warning: package 'InformationValue' was built under R version 4.1.2
## 
## Attaching package: 'InformationValue'
## The following object is masked from 'package:Metrics':
## 
##     precision
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.1.2

1. Load the dataset

# Read the workbook into df and preview its first five rows.
df <- read_excel(path = "labW9.xlsx")
head(df, n = 5)

EDA and Cleaning of df

Finding any missing values

# Count the NA cells in every column of df.
df %>%
  is.na() %>%
  colSums()
##              Pregnancies                  Glucose            BloodPressure 
##                        0                        0                        0 
##            SkinThickness                  Insulin                      BMI 
##                        0                        0                        0 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                        0                        0                        0

Creating box plots to see if there are any outliers in the df

# Box plots of six predictors grouped by Outcome, drawn on a 2 x 3 grid.
# Each y-axis label names the variable actually plotted; the original
# copy-pasted "Glucose" / "DiabetesPedigreeFunction" onto unrelated panels.
par(mfrow = c(2, 3))
boxplot(Pregnancies ~ Outcome, data = df,
        main = "Pregnancies vs. Diabetes",
        xlab = "Outcome", ylab = "Pregnancies")
boxplot(Glucose ~ Outcome, data = df,
        main = "Glucose vs. Diabetes",
        xlab = "Outcome", ylab = "Glucose")
boxplot(SkinThickness ~ Outcome, data = df,
        main = "SkinThickness vs. Diabetes",
        xlab = "Outcome", ylab = "SkinThickness")
boxplot(BMI ~ Outcome, data = df,
        main = "BMI vs. Diabetes",
        xlab = "Outcome", ylab = "BMI")
boxplot(DiabetesPedigreeFunction ~ Outcome, data = df,
        main = "Diabetes Pedigree Function vs. Diabetes",
        xlab = "Outcome", ylab = "DiabetesPedigreeFunction")
boxplot(Age ~ Outcome, data = df,
        main = "Age vs. Diabetes",
        xlab = "Outcome", ylab = "Age")

#### From the box plots, we can see that several outliers are present in this dataset.
#### We can also see that people with a higher number of pregnancies, glucose, skin thickness, BMI and age are more prone to getting diabetes.

# Histograms of six predictors on a 2 x 3 grid. Each column is paired with
# its own break count and a display label that serves as title and x-axis
# label (Pregnancies is the one panel whose title differs from its label).
par(mfrow = c(2, 3))
invisible(Map(
  function(column, n_breaks, title, label) {
    hist(df[[column]], breaks = n_breaks, main = title, xlab = label)
  },
  c("Pregnancies", "Glucose", "BloodPressure",
    "SkinThickness", "Insulin", "Age"),
  c(10, 5, 5, 10, 10, 10),
  c("No. of Pregnancies", "Glucose", "Blood Pressure",
    "Skin Thickness", "Insulin", "Age"),
  c("Pregnancies", "Glucose", "Blood Pressure",
    "Skin Thickness", "Insulin", "Age")
))

# Store Outcome as a factor so it is treated as a categorical variable.
df[["Outcome"]] <- as.factor(df[["Outcome"]])

3. Partition data

# Split df into training and test sets using a random logical index built
# from the Outcome labels (TRUE = training row, SplitRatio = 0.7).
# caTools is already attached by library(caTools) at the top of the file,
# so the redundant require() call is dropped.
# Fix the RNG seed so the partition, and everything fitted on it, can be
# reproduced on a re-run; without it every run yields a different split.
set.seed(123)
split_index <- sample.split(df$Outcome, SplitRatio = 0.7)
training_set <- subset(df, split_index)
test_set <- subset(df, !split_index)

Check the training and test subsets

# Row counts of the two partitions: test set first, then training set.
dim(test_set)[1]
## [1] 230
dim(training_set)[1]
## [1] 538
# Preview the first five rows of each partition.
head(test_set, n = 5)
head(training_set, n = 5)

Training the model using a generalized linear model (logistic regression)

# Fit a binomial GLM of Outcome on every column of training_set except
# SkinThickness, Insulin and Age, then print the fit summary.
model <- glm(
  formula = Outcome ~ . - SkinThickness - Insulin - Age,
  family = "binomial",
  data = training_set
)
summary(model)
## 
## Call:
## glm(formula = Outcome ~ . - SkinThickness - Insulin - Age, family = "binomial", 
##     data = training_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5770  -0.7440  -0.4085   0.7340   2.8120  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -7.625060   0.796868  -9.569  < 2e-16 ***
## Pregnancies               0.172893   0.033311   5.190 2.10e-07 ***
## Glucose                   0.034501   0.004060   8.498  < 2e-16 ***
## BloodPressure            -0.015273   0.005963  -2.561   0.0104 *  
## BMI                       0.083164   0.016841   4.938 7.88e-07 ***
## DiabetesPedigreeFunction  0.617220   0.336629   1.834   0.0667 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 696.28  on 537  degrees of freedom
## Residual deviance: 508.88  on 532  degrees of freedom
## AIC: 520.88
## 
## Number of Fisher Scoring iterations: 5
# Append the model's response-scale predictions (predict() called without
# newdata) to the training rows as column Prob, then preview the result.
train <- cbind(
  training_set,
  Prob = predict(object = model, type = "response")
)
head(train, n = 6L)

Evaluate the outcome using any suitable method

# Confusion matrix on the training data: rows flag whether Prob exceeds
# the 0.5 cut-off, columns hold the actual Outcome.
train_table <- with(
  train,
  table(Predicted = Prob > 0.5, Actual = Outcome)
)
train_table
##          Actual
## Predicted   0   1
##     FALSE 306  80
##     TRUE   44 108

Calculating the accuracy, precision and recall

# Accuracy: share of observations on the diagonal of the confusion matrix.
accuracy.train <- round(sum(diag(train_table)) / sum(train_table), 2)
sprintf("Accuracy-%s", accuracy.train)
## [1] "Accuracy-0.77"
# Precision = true positives / all predicted positives (row "TRUE").
# Recall    = true positives / all actual positives (column "1").
# Cells are addressed by name so the intent does not depend on row order.
precision.train <- train_table["TRUE", "1"] / sum(train_table["TRUE", ])
recall.train <- train_table["TRUE", "1"] / sum(train_table[, "1"])
# Report each metric's own value; the original printed accuracy.train on
# all three lines. The stored values stay unrounded, rounding is only
# applied for display. The outputs below follow from the confusion matrix
# recorded above (108 / 152 and 108 / 188).
sprintf("Precision-%s", round(precision.train, 2))
## [1] "Precision-0.71"
sprintf("Recall-%s", round(recall.train, 2))
## [1] "Recall-0.57"