library(tidyverse)
library("xlsx")
library(ggplot2)
library(reshape2)
library(data.table)
library(caret)
library(e1071)
library(corrplot)
library(kernlab)
Load the Data
df <- read.xlsx2("labW9.xlsx", 1, header=TRUE)
View Data
df %>% head(2)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
df %>% glimpse()
## Rows: 768
## Columns: 9
## $ Pregnancies <chr> "6", "1", "8", "1", "0", "5", "3", "10", "2",~
## $ Glucose <chr> "148", "85", "183", "89", "137", "116", "78",~
## $ BloodPressure <chr> "72", "66", "64", "66", "40", "74", "50", "0"~
## $ SkinThickness <chr> "35", "29", "0", "23", "35", "0", "32", "0", ~
## $ Insulin <chr> "0", "0", "0", "94", "168", "0", "88", "0", "~
## $ BMI <chr> "33.6", "26.6", "23.3", "28.1", "43.1", "25.6~
## $ DiabetesPedigreeFunction <chr> "0.627", "0.351", "0.672", "0.167", "2.288", ~
## $ Age <chr> "50", "31", "32", "21", "33", "30", "26", "29~
## $ Outcome <chr> "1", "0", "1", "0", "1", "0", "1", "0", "1", ~
Check for Missing Values
colSums(is.na(df))
## Pregnancies Glucose BloodPressure
## 0 0 0
## SkinThickness Insulin BMI
## 0 0 0
## DiabetesPedigreeFunction Age Outcome
## 0 0 0
Remarks: No missing values in the dataset.
Check for Duplicates
sum(duplicated(df))
## [1] 0
Remarks: No duplicates in the dataset.
Check Variable Data Types
sapply(df, class)
## Pregnancies Glucose BloodPressure
## "character" "character" "character"
## SkinThickness Insulin BMI
## "character" "character" "character"
## DiabetesPedigreeFunction Age Outcome
## "character" "character" "character"
Remarks: All variables are character type, we need to change it to numeric!
Correct Data Types
df %>% select(BMI, DiabetesPedigreeFunction) %>% sapply(as.numeric) -> df[, c("BMI", "DiabetesPedigreeFunction")]
df %>% select(1:5, 8, 9) %>% sapply(as.integer) -> df[, c(1:5, 8, 9)]
df2 <- copy(df) #make a backup
Check for Outliers
df_long <- melt(df %>% select(-Insulin))
ggplot(df_long, aes(x = variable, y = value)) + geom_boxplot()
df_long <- melt(df %>% select(Insulin))
ggplot(df_long, aes(x = variable, y = value)) + geom_boxplot()
summary(df)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
Remarks: Outliers detected in SkinThickness, Glucose, BloodPressure, BMI, Insulin.
Fixing Outliers
SkinThickness
max(df$SkinThickness)
## [1] 99
df <- df %>% filter(SkinThickness < 99)
Remarks: We can see that the mean & median for SkinThickness is around 20, but there is a single value of 99. We will get rid of that.
Unusual “zero” Values
df_zero <- df %>% select(SkinThickness, Glucose, BloodPressure, BMI, Insulin)
colSums(df_zero==0) %>% as.data.frame() %>% mutate("Missing %" = (./nrow(df)*100)) %>% rename("Count of Zero Values" = ".")
## Count of Zero Values Missing %
## SkinThickness 227 29.5958279
## Glucose 5 0.6518905
## BloodPressure 35 4.5632334
## BMI 11 1.4341591
## Insulin 373 48.6310300
Remarks: It is illogical for these variables to be zero, must be a mistake. We cannot drop these rows as the observations in our dataset is little.
# replace "zero" values with NA - easier to fix
df[, c("SkinThickness", "Glucose", "BloodPressure", "BMI", "Insulin")][df[, c("SkinThickness", "Glucose", "BloodPressure", "BMI", "Insulin")] == 0] <- NA
Fix “zero” Values
df %>% group_by(Outcome) %>% summarise(SkinThicknessMedian = median(SkinThickness, na.rm=TRUE),
GlucoseMedian = median(Glucose, na.rm=TRUE),
BloodPressureMedian = median(BloodPressure, na.rm=TRUE),
BMIMedian = median(BMI, na.rm=TRUE),
InsulinMedian = median(Insulin, na.rm=TRUE))
## # A tibble: 2 x 6
## Outcome SkinThicknessMedian GlucoseMedian BloodPressureMedian BMIMedian
## <int> <int> <int> <int> <dbl>
## 1 0 27 107 70 30.1
## 2 1 32 140 75 34.3
## # ... with 1 more variable: InsulinMedian <dbl>
Remarks: We can see when the outcome is 1, the values are higher. So, we impute using group median.
#Impute by group median
df <- df %>%
group_by(Outcome) %>%
mutate(SkinThickness = ifelse(is.na(SkinThickness),
median(SkinThickness, na.rm = TRUE),
SkinThickness),
Glucose = ifelse(is.na(Glucose),
median(Glucose, na.rm = TRUE),
Glucose),
BloodPressure = ifelse(is.na(BloodPressure),
median(BloodPressure, na.rm = TRUE),
BloodPressure),
BMI = ifelse(is.na(BMI),
median(BMI, na.rm = TRUE),
BMI),
Insulin = ifelse(is.na(Insulin),
median(Insulin, na.rm = TRUE),
Insulin)
)
df %>% ungroup() -> df
Verifying Cleaning
#Checking for Missing Values
colSums(is.na(df))
## Pregnancies Glucose BloodPressure
## 0 0 0
## SkinThickness Insulin BMI
## 0 0 0
## DiabetesPedigreeFunction Age Outcome
## 0 0 0
#Checking for zero values
df_zero <- df %>% select(-Outcome, -Pregnancies)
colSums(df_zero==0)
## Glucose BloodPressure SkinThickness
## 0 0 0
## Insulin BMI DiabetesPedigreeFunction
## 0 0 0
## Age
## 0
Dataset Variable Distribution: before cleaning
df2 %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_histogram(bins = 15)
Dataset Variable Distribution: after cleaning
df %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_histogram(bins = 15)
Linear Correlation
cor.table = cor(df)
corrplot(cor.table)
Distribution of Target Variable: Outcome
df_outcome <- as.data.frame(table(df$Outcome)) %>% rename("Outcome" = Var1)
ggplot(df_outcome, aes(x="", y=Freq, fill=Outcome)) +
geom_bar(stat="identity", width=1, color="white") +
coord_polar("y", start=0) +
theme_void() +
scale_fill_brewer(palette="Set1")
Glucose vs Age
dfplt <- copy(df)
dfplt$Outcome <- as.factor(df$Outcome)
ggplot(dfplt, aes(x = Age, y = Glucose)) + geom_point(aes(color = Outcome)) +
labs(title = "Glucose Levels by Age", caption = "Remarks: When the outcome is 0, the distribution is \nconcentrated below 150 glucose level mark")
Glucose vs Insulin
ggplot(dfplt, aes(x = Glucose, y = Insulin)) + geom_point(aes(color = Outcome)) + geom_smooth(method = "lm") +
labs(title = "Glucose vs Insulin", caption = "Remarks: There is a clear positive relationship between glucose and insulin.\nThe higher the glucose levels, the higher is insulin\nAlso, those with outcome of 0 have lower glucose levels")
## `geom_smooth()` using formula 'y ~ x'
Glucose vs BMI
ggplot(dfplt, aes(x = BMI, y = Glucose)) + geom_point(aes(color = Outcome))+ geom_smooth() +
labs(title = "BMI vs Glucose", caption = "Remarks: There isn't really a strong linear relationship between Glucose and BMI")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
BMI Distribution
df_bmi <- copy(df)
df_bmi$BMI <- ifelse(df$BMI<19,"Underweight", ifelse(df$BMI>=19 & df$BMI<=25, "Normal", ifelse(df$BMI>=25 & df$BMI<=30, "Overweight","Obese"))) %>% factor(levels=c("Underweight","Normal", "Overweight","Obese"))
bmi <- as.data.frame(table(df_bmi$BMI)) %>% rename("BMI" = "Var1")
ggplot(bmi, aes(x = "", y = Freq, fill=BMI)) +
geom_bar(stat="identity", width=1, color="white") +
coord_polar("y", start=0) +
theme_void() +
scale_fill_brewer(palette="Set1") +
labs(title = "BMI Distribution", caption = "Remarks: A large proportion of observations are obese!")
Split Data
split = 0.8
trainIndex = createDataPartition(df$Outcome, p = split, list = F)
train_set <- df[trainIndex,]
test_set <- df[-trainIndex,]
train_set[["Outcome"]] = factor(train_set[["Outcome"]])
test_set[["Outcome"]] = factor(test_set[["Outcome"]])
Remarks: We split the data by 80:20.
View Training Data
summary(train_set)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.00 Min. : 44.0 Min. : 30.00 Min. : 7.00
## 1st Qu.: 1.00 1st Qu.:100.0 1st Qu.: 64.00 1st Qu.:25.00
## Median : 3.00 Median :118.0 Median : 72.00 Median :28.00
## Mean : 3.92 Mean :122.1 Mean : 72.85 Mean :28.87
## 3rd Qu.: 6.00 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.00 Max. :199.0 Max. :122.00 Max. :54.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 15.0 Min. :18.20 Min. :0.0850 Min. :21.00
## 1st Qu.:102.5 1st Qu.:27.50 1st Qu.:0.2442 1st Qu.:24.00
## Median :105.0 Median :32.00 Median :0.3725 Median :29.00
## Mean :142.9 Mean :32.33 Mean :0.4749 Mean :33.25
## 3rd Qu.:169.5 3rd Qu.:36.67 3rd Qu.:0.6458 3rd Qu.:41.00
## Max. :744.0 Max. :67.10 Max. :2.3290 Max. :81.00
## Outcome
## 0:400
## 1:214
##
##
##
##
dim(train_set)
## [1] 614 9
View Training Data
summary(test_set)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 65.0 Min. : 24.0 Min. : 8.00
## 1st Qu.: 1.000 1st Qu.: 98.0 1st Qu.: 62.0 1st Qu.:25.00
## Median : 2.000 Median :111.0 Median : 70.0 Median :27.00
## Mean : 3.556 Mean :119.3 Mean : 70.6 Mean :29.52
## 3rd Qu.: 6.000 3rd Qu.:137.0 3rd Qu.: 78.0 3rd Qu.:32.00
## Max. :15.000 Max. :198.0 Max. :108.0 Max. :63.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 14.0 Min. :19.90 Min. :0.0780 Min. :21.00
## 1st Qu.:102.5 1st Qu.:27.80 1st Qu.:0.2400 1st Qu.:25.00
## Median :102.5 Median :32.40 Median :0.3680 Median :29.00
## Mean :136.8 Mean :32.85 Mean :0.4591 Mean :33.03
## 3rd Qu.:169.5 3rd Qu.:36.10 3rd Qu.:0.5910 3rd Qu.:38.00
## Max. :846.0 Max. :59.40 Max. :2.4200 Max. :68.00
## Outcome
## 0:100
## 1: 53
##
##
##
##
dim(test_set)
## [1] 153 9
Train Control
set.seed(333) #reproducibility
tc <- trainControl(method = "repeatedcv",
number = 10, repeats = 3)
Remarks: We make use of Repeated K-fold for Cross Validation
1. Support Vector Machines (SVM)
1.1 Linear SVM
svm_Lin <- train(Outcome ~., data = train_set, method = "svmLinear",
trControl=tc,
preProcess = c("center", "scale"),
tuneLength = 10)
1.2 Linear SVM w/ Cost Values
svm_LinWCost <- train(Outcome ~.,
data = train_set,
method = "svmLinear",
trControl = tc,
preProcess = c("center","scale"),
tuneGrid = expand.grid(C = seq(0, 2, length = 20)))
plot(svm_LinWCost)
1.3 SVM Radial (Non-Linear Kernel)
svm_Rad <- train(Outcome ~., data = train_set, method = "svmRadial", trControl = tc, preProcess = c("center","scale"), tuneLength = 10)
plot(svm_Rad)
1.4 SVM Polynomial (Non-Linear Kernel)
svm_Poly <- train(Outcome ~., data = train_set, method = "svmPoly", trControl = tc, preProcess = c("center","scale"), tuneLength = 4)
plot(svm_Poly)
Comparing SVM Performances
svm_perf = tibble(
Models = c("Linear", "Linear w/ Cost Values", "Radial", "Polynomial"),
Accuracy = c(
svm_Lin$result[2][[1]],
svm_LinWCost$results[which.min(svm_LinWCost$results[,2]),]$Accuracy,
svm_Rad$results[which.min(svm_Rad$results[,2]),]$Accuracy,
svm_Poly$results[which.min(svm_Poly$results[,2]),]$Accuracy))
svm_perf %>% arrange(Accuracy)
## # A tibble: 4 x 2
## Models Accuracy
## <chr> <dbl>
## 1 Polynomial 0.652
## 2 Linear 0.770
## 3 Linear w/ Cost Values 0.773
## 4 Radial 0.812
2. Logistic Regression
log_reg <- glm(Outcome ~.,family=binomial(link='logit'),data=train_set)
3. Naive Bayes
naive <- train(Outcome ~., data = train_set, trControl = tc, method = "nb")
plot(naive)
4. KNN
knn <- train(Outcome ~., data = train_set, trControl = tc, method = "knn", tuneGrid = data.frame(k = seq(1, 25)))
plot(knn)
5. Random Forest
rf <- train(Outcome ~., data = train_set, trControl = tc, method = "rf")
plot(rf)
predictSVMLin <- predict(svm_Lin, newdata = test_set)
predictSVMLinWCost <- predict(svm_LinWCost, newdata = test_set)
predictSVMRad <- predict(svm_Rad, newdata = test_set)
predictSVMPoly <- predict(svm_Poly, newdata = test_set)
predictNaive <- predict(naive, newdata = test_set)
predictLogReg <- predict(log_reg, newdata = test_set, type="response")
predictKNN <- predict(knn, newdata = test_set)
predictRF <- predict(rf, newdata = test_set)
1. Support Vector Machines (SVM)
1.1 Linear SVM
confusionMatrix(predictSVMLin, test_set$Outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 87 19
## 1 13 34
##
## Accuracy : 0.7908
## 95% CI : (0.7178, 0.8523)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : 0.0001499
##
## Kappa : 0.5255
##
## Mcnemar's Test P-Value : 0.3767591
##
## Sensitivity : 0.8700
## Specificity : 0.6415
## Pos Pred Value : 0.8208
## Neg Pred Value : 0.7234
## Prevalence : 0.6536
## Detection Rate : 0.5686
## Detection Prevalence : 0.6928
## Balanced Accuracy : 0.7558
##
## 'Positive' Class : 0
##
1.2 Linear SVM w/ Cost Values
confusionMatrix(predictSVMLinWCost, test_set$Outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 87 19
## 1 13 34
##
## Accuracy : 0.7908
## 95% CI : (0.7178, 0.8523)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : 0.0001499
##
## Kappa : 0.5255
##
## Mcnemar's Test P-Value : 0.3767591
##
## Sensitivity : 0.8700
## Specificity : 0.6415
## Pos Pred Value : 0.8208
## Neg Pred Value : 0.7234
## Prevalence : 0.6536
## Detection Rate : 0.5686
## Detection Prevalence : 0.6928
## Balanced Accuracy : 0.7558
##
## 'Positive' Class : 0
##
1.3 SVM Radial
confusionMatrix(predictSVMRad, test_set$Outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 83 14
## 1 17 39
##
## Accuracy : 0.7974
## 95% CI : (0.7249, 0.858)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : 7.169e-05
##
## Kappa : 0.5584
##
## Mcnemar's Test P-Value : 0.7194
##
## Sensitivity : 0.8300
## Specificity : 0.7358
## Pos Pred Value : 0.8557
## Neg Pred Value : 0.6964
## Prevalence : 0.6536
## Detection Rate : 0.5425
## Detection Prevalence : 0.6340
## Balanced Accuracy : 0.7829
##
## 'Positive' Class : 0
##
1.3 SVM Polynomial
confusionMatrix(predictSVMPoly, test_set$Outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 88 12
## 1 12 41
##
## Accuracy : 0.8431
## 95% CI : (0.7757, 0.8968)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : 1.313e-07
##
## Kappa : 0.6536
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8800
## Specificity : 0.7736
## Pos Pred Value : 0.8800
## Neg Pred Value : 0.7736
## Prevalence : 0.6536
## Detection Rate : 0.5752
## Detection Prevalence : 0.6536
## Balanced Accuracy : 0.8268
##
## 'Positive' Class : 0
##
2. Naive Bayes
confusionMatrix(predictNaive, test_set$Outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 87 11
## 1 13 42
##
## Accuracy : 0.8431
## 95% CI : (0.7757, 0.8968)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : 1.313e-07
##
## Kappa : 0.6566
##
## Mcnemar's Test P-Value : 0.8383
##
## Sensitivity : 0.8700
## Specificity : 0.7925
## Pos Pred Value : 0.8878
## Neg Pred Value : 0.7636
## Prevalence : 0.6536
## Detection Rate : 0.5686
## Detection Prevalence : 0.6405
## Balanced Accuracy : 0.8312
##
## 'Positive' Class : 0
##
3. Logistic Regression
confusionMatrix(data = factor(as.integer(predictLogReg>0.5)), reference = test_set$Outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 89 19
## 1 11 34
##
## Accuracy : 0.8039
## 95% CI : (0.7321, 0.8636)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : 3.3e-05
##
## Kappa : 0.5511
##
## Mcnemar's Test P-Value : 0.2012
##
## Sensitivity : 0.8900
## Specificity : 0.6415
## Pos Pred Value : 0.8241
## Neg Pred Value : 0.7556
## Prevalence : 0.6536
## Detection Rate : 0.5817
## Detection Prevalence : 0.7059
## Balanced Accuracy : 0.7658
##
## 'Positive' Class : 0
##
4. KNN
confusionMatrix(predictKNN, test_set$Outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 91 12
## 1 9 41
##
## Accuracy : 0.8627
## 95% CI : (0.7979, 0.913)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : 4.438e-09
##
## Kappa : 0.6928
##
## Mcnemar's Test P-Value : 0.6625
##
## Sensitivity : 0.9100
## Specificity : 0.7736
## Pos Pred Value : 0.8835
## Neg Pred Value : 0.8200
## Prevalence : 0.6536
## Detection Rate : 0.5948
## Detection Prevalence : 0.6732
## Balanced Accuracy : 0.8418
##
## 'Positive' Class : 0
##
5. Random Forest
confusionMatrix(predictRF, test_set$Outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 94 9
## 1 6 44
##
## Accuracy : 0.902
## 95% CI : (0.8435, 0.9441)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : 1.12e-12
##
## Kappa : 0.7806
##
## Mcnemar's Test P-Value : 0.6056
##
## Sensitivity : 0.9400
## Specificity : 0.8302
## Pos Pred Value : 0.9126
## Neg Pred Value : 0.8800
## Prevalence : 0.6536
## Detection Rate : 0.6144
## Detection Prevalence : 0.6732
## Balanced Accuracy : 0.8851
##
## 'Positive' Class : 0
##
Comparison between Models
resamps <- resamples(list(SVM_Linear = svm_LinWCost,
SVM_Poly = svm_Poly,
SVM_Radial = svm_Rad,
KNN = knn,
NaiveBayes = naive,
RandomForest = rf))
bwplot(resamps, metric = "Accuracy")
Remarks: We achieved approx. 90% accuracy with KNN & RandomForest.
Remarks: We will select the RandomForest model for predicting the Outcome, as it provides approx. 90% accuracy.
cm <- confusionMatrix(predictRF, test_set$Outcome)
draw_confusion_matrix(cm)
Evaluate Outcome using Own Data
df_predict <- data.frame(
list(Pregnancies = c(4, 0, 10),
Glucose = c(100, 120, 200),
BloodPressure = c(69, 99, 100),
SkinThickness = c(20, 30, 50),
Insulin = c(120, 100, 60),
BMI = c(40.5, 38.5, 50),
DiabetesPedigreeFunction = c(0.69, 0.5, 0.9),
Age = c(21, 40, 60))
)
Predict Outcome using own Data
predict(rf, newdata = df_predict, method = "prediction")
## [1] 0 0 1
## Levels: 0 1
Remarks: Our model predicted patient 1 and 2 have no diabetes and patient 3 with diabetes.