Myriam Aicha Mbongo 10/16/2023
| Student ID Numbers and Names of Group Members | GitHub Classroom Group Name |
|---|---|
| 2. 134141 - C - Aicha Mbongo |
| Course Code | Course Name | Program | Semester Duration |
|---|---|---|---|
| BBT4206 | Business Intelligence II | Bachelor of Business Information Technology | 21st August 2023 to 28th November 2023 |
# STEP 1. Install and Load the Required Packages ----
# The following packages should be installed and loaded before proceeding to the
# subsequent steps.
## readr ----
if (require("readr")) {
require("readr")
} else {
install.packages("readr", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}## Loading required package: readr
## caret ----
if (require("caret")) {
require("caret")
} else {
install.packages("caret", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}## Loading required package: caret
## Loading required package: ggplot2
## Loading required package: lattice
## e1071 ----
if (require("e1071")) {
require("e1071")
} else {
install.packages("e1071", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}## Loading required package: e1071
## factoextra ----
if (require("factoextra")) {
require("factoextra")
} else {
install.packages("factoextra", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}## Loading required package: factoextra
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## FactoMineR ----
if (require("FactoMineR")) {
require("FactoMineR")
} else {
install.packages("FactoMineR", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}## Loading required package: FactoMineR
if (!is.element("NHANES", installed.packages()[, 1])) {
install.packages("NHANES", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}
require("NHANES")## Loading required package: NHANES
## dplyr ----
if (!is.element("dplyr", installed.packages()[, 1])) {
install.packages("dplyr", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}
require("dplyr")## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## naniar ----
# Documentation:
# https://cran.r-project.org/package=naniar or
# https://www.rdocumentation.org/packages/naniar/versions/1.0.0
if (!is.element("naniar", installed.packages()[, 1])) {
install.packages("naniar", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}
require("naniar")## Loading required package: naniar
## ggplot2 ----
# We require the "ggplot2" package to create more appealing visualizations
if (!is.element("ggplot2", installed.packages()[, 1])) {
install.packages("ggplot2", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}
require("ggplot2")
## MICE ----
# We use the MICE package to perform data imputation
if (!is.element("mice", installed.packages()[, 1])) {
install.packages("mice", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}
require("mice")## Loading required package: mice
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
## Amelia ----
if (!is.element("Amelia", installed.packages()[, 1])) {
install.packages("Amelia", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}
require("Amelia")## Loading required package: Amelia
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(readr)
heart <- read_csv(
"../data/heart.csv",
col_types = cols(
age = col_double(),
sex = col_factor(levels = c("0", "1")),
cp = col_factor(levels = c("0", "1", "2", "3")),
trestbps = col_double(),
chol = col_double(),
fbs = col_factor(levels = c("0", "1")),
restecg = col_factor(levels = c("0", "1", "2")),
thalach = col_double(),
exang = col_factor(levels = c("0", "1")),
oldpeak = col_double(),
slope = col_factor(levels = c("0", "1", "2")),
ca = col_double(),
thal = col_factor(levels = c("0", "1", "2", "3")),
target = col_factor(levels = c("neg", "pos"))
)
)
#View(heart)### STEP 3a. Pre#View the Loaded Datasets, Identify the Data Types ----
# Dimensions refer to the number of observations (rows) and the number of
# attributes/variables/features (columns).
#Understanding data types is key for effective analysis.It helps choose suitable visualizations and algorithms,
#and highlights the need for conversions between categorical and numerical data when necessary.
dim(heart)## [1] 1025 14
## age sex cp trestbps chol fbs restecg thalach
## "numeric" "factor" "factor" "numeric" "numeric" "factor" "factor" "numeric"
## exang oldpeak slope ca thal target
## "factor" "numeric" "factor" "numeric" "factor" "factor"
# It is more sensible to count categorical variables (factors or dimensions)
# than numeric variables, e.g., counting the number of male and female
# participants instead of counting the frequency of each participant’s height.
heart_freq <- heart$target
cbind(frequency = table(heart_freq),
percentage = prop.table(table(heart_freq)) * 100)## frequency percentage
## neg 499 48.68293
## pos 526 51.31707
# We, therefore, must manually create a function that can calculate the mode.
heart_target_mode <- names(table(heart$target))[
which(table(heart$target) == max(table(heart$target)))
]
print(heart_target_mode)## [1] "pos"
## age sex cp trestbps chol fbs restecg
## Min. :29.00 0:312 0:497 Min. : 94.0 Min. :126 0:872 0:497
## 1st Qu.:48.00 1:713 1:167 1st Qu.:120.0 1st Qu.:211 1:153 1:513
## Median :56.00 2:284 Median :130.0 Median :240 2: 15
## Mean :54.43 3: 77 Mean :131.6 Mean :246
## 3rd Qu.:61.00 3rd Qu.:140.0 3rd Qu.:275
## Max. :77.00 Max. :200.0 Max. :564
## thalach exang oldpeak slope ca thal
## Min. : 71.0 0:680 Min. :0.000 0: 74 Min. :0.0000 0: 7
## 1st Qu.:132.0 1:345 1st Qu.:0.000 1:482 1st Qu.:0.0000 1: 64
## Median :152.0 Median :0.800 2:469 Median :0.0000 2:544
## Mean :149.1 Mean :1.072 Mean :0.7541 3:410
## 3rd Qu.:166.0 3rd Qu.:1.800 3rd Qu.:1.0000
## Max. :202.0 Max. :6.200 Max. :4.0000
## target
## neg:499
## pos:526
##
##
##
##
# calculate the standard deviation of only columns that are numeric, thus
# leaving out the columns termed as “factors” (categorical) or those that have
# a string data type.
sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)], sd)## age trestbps chol thalach oldpeak ca
## 9.072290 17.516718 51.592510 23.005724 1.175053 1.030798
## age trestbps chol thalach oldpeak ca
## 9.072290 17.516718 51.592510 23.005724 1.175053 1.030798
# The Kurtosis informs you of how often outliers occur in the results.
# There are different formulas for calculating kurtosis.
# Specifying “type = 2” allows us to use the 2nd formula which is the same
# kurtosis formula used in SPSS and SAS.
# In “type = 2” (used in SPSS and SAS):
# 1. Kurtosis < 3 implies a low number of outliers
# 2. Kurtosis = 3 implies a medium number of outliers
# 3. Kurtosis > 3 implies a high number of outliers
if (!is.element("e1071", installed.packages()[, 1])) {
install.packages("e1071", dependencies = TRUE)
}
require("e1071")
sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)], kurtosis, type = 2)## age trestbps chol thalach oldpeak ca
## -0.52561781 0.99122074 3.99680305 -0.08882249 1.31447089 0.70112287
# The skewness informs you of the asymmetry of the distribution of results.
# Using “type = 2” can be interpreted as:
# 1. Skewness between -0.4 and 0.4 (inclusive) implies that there is no skew
# in the distribution of results; the distribution of results is symmetrical;
# it is a normal distribution.
# 2. Skewness above 0.4 implies a positive skew; a right-skewed distribution.
# 3. Skewness below -0.4 implies a negative skew; a left-skewed distribution.
sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)], skewness, type = 2)## age trestbps chol thalach oldpeak ca
## -0.2488659 0.7397682 1.0740728 -0.5137772 1.2108994 1.2611886
# Note that the covariance and the correlation are computed for numeric values
# only, not categorical values.
heart_cov <- cov(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)])
#View(heart_cov)# One-Way ANOVA can be used to test the effect of the 3 types of fertilizer on
# crop yield whereas,
# Two-Way ANOVA can be used to test the effect of the 3 types of fertilizer and
# the 2 types of planting density on crop yield.
heart_one_way_anova <- aov(trestbps ~ age, data = heart)
summary(heart_one_way_anova)## Df Sum Sq Mean Sq F value Pr(>F)
## age 1 23096 23096 81.16 <2e-16 ***
## Residuals 1023 291104 285
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The ANOVA rejects the null hypothesis,The ANOVA indicates a significant difference in resting blood pressure among age groups
#(F(1, 1023) = 81.16, p < 2e-16), highlighting age as a key factor
#in determining blood pressure.
#This aligns with cardiovascular knowledge, correlating increased age with a higher risk of cardiovascular disease.
heart_two_way_anova <- aov(trestbps ~ exang + ca, # nolint
data = heart)
summary(heart_two_way_anova)## Df Sum Sq Mean Sq F value Pr(>F)
## exang 1 1177 1176.7 3.88 0.04914 *
## ca 1 3050 3050.2 10.06 0.00156 **
## Residuals 1022 309973 303.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#both variables, exercise-induced angina and the number of major vessels, are associated with statistically significant differences in
#resting blood pressure.# Histograms help in determining whether an attribute has a Gaussian
# distribution. They can also be used to identify the presence of outliers.
par(mfrow = c(1, 3))
for (i in c(1, 4, 5)) {
heart_variable <- as.numeric(unlist(heart[, i]))
hist(heart_variable, main = names(heart)[i])
}heart_health_variable <- as.numeric( unlist(heart[, 8]))
hist(heart_health_variable , main = names(heart)[8])
heart_health_variable <- as.numeric( unlist(heart [, 10]))
hist(heart_health_variable , main = names(heart)[10])
heart_health_variable <- as.numeric( unlist(heart [, 12]))
hist(heart_health_variable , main = names(heart)[12])# Box and whisker plots are useful in understanding the distribution of data.
par(mfrow = c(1, 3))
for (i in c(1, 4, 5)) {
boxplot(heart[, i], main = names(heart)[i])
}boxplot(heart[, 8], main = names(heart)[8])
boxplot(heart[, 10], main = names(heart)[10])
boxplot(heart[, 12], main = names(heart)[12])# Categorical attributes (factors) can also be visualized. This is done using a
# bar chart to give an idea of the proportion of instances that belong to each
# category.
barplot(table(heart[, 2]), main = names(heart)[2])







# Execute the following to create a map to identify the missing data in each
# dataset:
if (!is.element("Amelia", installed.packages()[, 1])) {
install.packages("Amelia", dependencies = TRUE)
}
require("Amelia")
#comment
missmap(heart, col = c("red", "grey"), legend = TRUE)
# Correlation plots can be used to get an idea of which attributes change
# together. The function “corrplot()” found in the package “corrplot” is
# required to perform this. The larger the dot in the correlation plot, the
# larger the correlation. Blue represents a positive correlation whereas red
# represents a negative correlation.
if (!is.element("corrplot", installed.packages()[, 1])) {
install.packages("corrplot", dependencies = TRUE)
}
require("corrplot")## Loading required package: corrplot
## corrplot 0.92 loaded
#heart <- heart[, -which(names(heart) == "target_numeric")]
# Alternatively, the 'ggcorrplot::ggcorrplot()' function can be used to plot a
# more visually appealing plot.
# The code below shows how to install a package in R:
if (!is.element("ggcorrplot", installed.packages()[, 1])) {
install.packages("ggcorrplot", dependencies = TRUE)
}
require("ggcorrplot")## Loading required package: ggcorrplot

# Alternatively, the ggcorrplot package can be used to make the plots more
# appealing:
ggplot(heart,
aes(x = age, y = sex, shape = target, color = target)) +
geom_point() +
geom_smooth(method = lm)## `geom_smooth()` using formula = 'y ~ x'

### Subset of rows ----
# We then select 500 random observations to be included in the dataset
rand_ind <- sample(seq_len(nrow(heart)), 500)
heart <- heart[rand_ind, ]## [1] FALSE
## [1] 0
## [1] 0
## age sex cp trestbps chol fbs restecg thalach
## 0 0 0 0 0 0 0 0
## exang oldpeak slope ca thal target
## 0 0 0 0 0 0
# What is the number and percentage of missing values grouped by
# each variable?
miss_var_summary(heart)## # A tibble: 14 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 age 0 0
## 2 sex 0 0
## 3 cp 0 0
## 4 trestbps 0 0
## 5 chol 0 0
## 6 fbs 0 0
## 7 restecg 0 0
## 8 thalach 0 0
## 9 exang 0 0
## 10 oldpeak 0 0
## 11 slope 0 0
## 12 ca 0 0
## 13 thal 0 0
## 14 target 0 0
# What is the number and percentage of missing values grouped by
# each observation?
miss_case_summary(heart)## # A tibble: 500 × 3
## case n_miss pct_miss
## <int> <int> <dbl>
## 1 1 0 0
## 2 2 0 0
## 3 3 0 0
## 4 4 0 0
## 5 5 0 0
## 6 6 0 0
## 7 7 0 0
## 8 8 0 0
## 9 9 0 0
## 10 10 0 0
## # ℹ 490 more rows
# Which variables contain the most missing values?
#gg_miss_var(heart)
# Where are missing values located (the shaded regions in the plot)?
#vis_miss(heart) + theme(axis.text.x = element_text(angle = 80))
# Which combinations of variables are missing together?
#gg_miss_upset(heart)
# Create a heatmap of "missingness" broken down by "target"
# First, confirm that the "target" variable is a categorical variable
is.factor(heart$target)## [1] TRUE
# Second, create the visualization
#gg_miss_fct(heart, fct = target)
# We can also create a heatmap of "missingness" broken down by "exang"
# First, confirm that the "exang" variable is a categorical variable
is.factor(heart$exang)## [1] TRUE
# Scaling Data Transformation Purpose:
# The scaling data transformation is performed to standardize or normalize
# the numeric features in a dataset. It ensures that different features
# with varying scales are brought to a common scale, preventing one feature
# from dominating the others during model training.
#
# Scaling is essential for algorithms that are sensitive to the magnitude
# of input features, such as distance-based methods (e.g., k-Nearest Neighbors)
# or gradient descent-based optimization algorithms (e.g., Support Vector Machines).
#
# By scaling the data, we make the features comparable and contribute equally
# to the model's learning process, improving the stability and performance of
# machine learning models.
# Summary of each variable
summary(heart)## age sex cp trestbps chol fbs
## Min. :29.00 0:158 0:242 Min. : 94.0 Min. :126.0 0:421
## 1st Qu.:47.00 1:342 1: 80 1st Qu.:120.0 1st Qu.:212.0 1: 79
## Median :54.00 2:139 Median :130.0 Median :240.0
## Mean :53.85 3: 39 Mean :131.4 Mean :247.9
## 3rd Qu.:60.00 3rd Qu.:140.0 3rd Qu.:282.0
## Max. :77.00 Max. :200.0 Max. :564.0
## restecg thalach exang oldpeak slope ca thal
## 0:247 Min. : 71.0 0:331 Min. :0.000 0: 37 Min. :0.00 0: 5
## 1:244 1st Qu.:132.0 1:169 1st Qu.:0.000 1:232 1st Qu.:0.00 1: 34
## 2: 9 Median :152.0 Median :0.800 2:231 Median :0.00 2:265
## Mean :148.7 Mean :1.064 Mean :0.74 3:196
## 3rd Qu.:166.0 3rd Qu.:1.600 3rd Qu.:1.00
## Max. :202.0 Max. :6.200 Max. :4.00
## target
## neg:230
## pos:270
##
##
##
##
# BEFORE
heart_health_variable <- as.numeric( unlist(heart [, 1]))
hist(heart_health_variable , main = names(heart_health_variable )[1])heart_health_variable <- as.numeric( unlist(heart [, 4]))
hist(heart_health_variable , main = names(heart)[4])heart_health_variable <- as.numeric( unlist(heart [, 5]))
hist(heart_health_variable , main = names(heart)[5])heart_health_variable <- as.numeric( unlist(heart [, 8]))
hist(heart_health_variable , main = names(heart)[8])heart_health_variable <- as.numeric( unlist(heart [, 10]))
hist(heart_health_variable , main = names(heart)[10])heart_health_variable <- as.numeric( unlist(heart [, 12]))
hist(heart_health_variable , main = names(heart)[12])## Created from 500 samples and 14 variables
##
## Pre-processing:
## - ignored (8)
## - scaled (6)
heart_scale_transform <- predict(model_of_the_transform,
heart)
# AFTER
#1, 4, 5, 8, 10, 12
heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 1]))
hist(heart_health_variable , main = names(heart_scale_transform)[1])heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 4]))
hist(heart_health_variable , main = names(heart_scale_transform)[4])heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 5]))
hist(heart_health_variable , main = names(heart_scale_transform)[5])heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 8]))
hist(heart_health_variable , main = names(heart_scale_transform)[8])heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 10]))
hist(heart_health_variable , main = names(heart_scale_transform)[10])heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 12]))
hist(heart_health_variable , main = names(heart_scale_transform)[12])# Centering Data Transformation Purpose:
# Shifts numeric features by subtracting their mean,
# ensuring a centered distribution around zero.
# Useful for algorithms sensitive to variable means,
# promoting better model interpretability.
# Summary of each variable
summary(heart)## age sex cp trestbps chol fbs
## Min. :29.00 0:158 0:242 Min. : 94.0 Min. :126.0 0:421
## 1st Qu.:47.00 1:342 1: 80 1st Qu.:120.0 1st Qu.:212.0 1: 79
## Median :54.00 2:139 Median :130.0 Median :240.0
## Mean :53.85 3: 39 Mean :131.4 Mean :247.9
## 3rd Qu.:60.00 3rd Qu.:140.0 3rd Qu.:282.0
## Max. :77.00 Max. :200.0 Max. :564.0
## restecg thalach exang oldpeak slope ca thal
## 0:247 Min. : 71.0 0:331 Min. :0.000 0: 37 Min. :0.00 0: 5
## 1:244 1st Qu.:132.0 1:169 1st Qu.:0.000 1:232 1st Qu.:0.00 1: 34
## 2: 9 Median :152.0 Median :0.800 2:231 Median :0.00 2:265
## Mean :148.7 Mean :1.064 Mean :0.74 3:196
## 3rd Qu.:166.0 3rd Qu.:1.600 3rd Qu.:1.00
## Max. :202.0 Max. :6.200 Max. :4.00
## target
## neg:230
## pos:270
##
##
##
##
# BEFORE
heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 1]))
boxplot(heart_health_variable , main = names(heart_scale_transform )[1])heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 4]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[4])heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 5]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[5])heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 8]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[8])heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 10]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[10])heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 12]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[12])model_of_the_transform <- preProcess(heart_scale_transform, method = c("center"))
print(model_of_the_transform)## Created from 500 samples and 14 variables
##
## Pre-processing:
## - centered (6)
## - ignored (8)
heart_center_transform <- predict(model_of_the_transform,
heart_scale_transform)
# AFTER
#1, 4, 5, 8, 10, 12
heart_scale_transform <- as.numeric( unlist(heart_center_transform [, 1]))
boxplot(heart_health_variable , main = names(heart_center_transform)[1])heart_scale_transform <- as.numeric( unlist(heart_center_transform [, 4]))
boxplot(heart_health_variable , main = names(heart_center_transform)[4])heart_scale_transform <- as.numeric( unlist(heart_center_transform [, 5]))
boxplot(heart_health_variable , main = names(heart_center_transform)[5])heart_scale_transform <- as.numeric( unlist(heart_center_transform [, 8]))
boxplot(heart_health_variable , main = names(heart_center_transform)[8])heart_scale_transform <- as.numeric( unlist(heart_center_transform [, 10]))
boxplot(heart_health_variable , main = names(heart_center_transform)[10])heart_scale_transform <- as.numeric( unlist(heart_center_transform [, 12]))
boxplot(heart_health_variable , main = names(heart_center_transform)[12])# BEFORE
#If you've already scaled and centered your data, you have made it comparable and adjusted for differences in means.
#Standardization is typically done to bring variables to a standard normal distribution (mean of 0 and standard deviation
#of 1).
summary(heart_center_transform)## age sex cp trestbps chol fbs
## Min. :-2.75717 0:158 0:242 Min. :-2.1279 Min. :-2.4058 0:421
## 1st Qu.:-0.76019 1:342 1: 80 1st Qu.:-0.6498 1st Qu.:-0.7091 1: 79
## Median : 0.01642 2:139 Median :-0.0813 Median :-0.1566
## Mean : 0.00000 3: 39 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.68208 3rd Qu.: 0.4872 3rd Qu.: 0.6720
## Max. : 2.56813 Max. : 3.8982 Max. : 6.2356
## restecg thalach exang oldpeak slope ca
## 0:247 Min. :-3.3196 0:331 Min. :-0.8961 0: 37 Min. :-0.7009
## 1:244 1st Qu.:-0.7121 1:169 1st Qu.:-0.8961 1:232 1st Qu.:-0.7009
## 2: 9 Median : 0.1428 Median :-0.2223 2:231 Median :-0.7009
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7412 3rd Qu.: 0.4514 3rd Qu.: 0.2463
## Max. : 2.2801 Max. : 4.3255 Max. : 3.0878
## thal target
## 0: 5 neg:230
## 1: 34 pos:270
## 2:265
## 3:196
##
##
## age trestbps chol thalach oldpeak ca
## 1 1 1 1 1 1
model_of_the_transform <- preProcess(heart_center_transform,
method = c("scale", "center"))
print(model_of_the_transform)## Created from 500 samples and 14 variables
##
## Pre-processing:
## - centered (6)
## - ignored (8)
## - scaled (6)
heart_standardize_transform <- predict(model_of_the_transform, # nolint
heart_center_transform)
# AFTER
summary(heart_standardize_transform)## age sex cp trestbps chol fbs
## Min. :-2.75717 0:158 0:242 Min. :-2.1279 Min. :-2.4058 0:421
## 1st Qu.:-0.76019 1:342 1: 80 1st Qu.:-0.6498 1st Qu.:-0.7091 1: 79
## Median : 0.01642 2:139 Median :-0.0813 Median :-0.1566
## Mean : 0.00000 3: 39 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.68208 3rd Qu.: 0.4872 3rd Qu.: 0.6720
## Max. : 2.56813 Max. : 3.8982 Max. : 6.2356
## restecg thalach exang oldpeak slope ca
## 0:247 Min. :-3.3196 0:331 Min. :-0.8961 0: 37 Min. :-0.7009
## 1:244 1st Qu.:-0.7121 1:169 1st Qu.:-0.8961 1:232 1st Qu.:-0.7009
## 2: 9 Median : 0.1428 Median :-0.2223 2:231 Median :-0.7009
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7412 3rd Qu.: 0.4514 3rd Qu.: 0.2463
## Max. : 2.2801 Max. : 4.3255 Max. : 3.0878
## thal target
## 0: 5 neg:230
## 1: 34 pos:270
## 2:265
## 3:196
##
##
## age trestbps chol thalach oldpeak ca
## 1 1 1 1 1 1
## age sex cp trestbps chol fbs
## Min. :-2.75717 0:158 0:242 Min. :-2.1279 Min. :-2.4058 0:421
## 1st Qu.:-0.76019 1:342 1: 80 1st Qu.:-0.6498 1st Qu.:-0.7091 1: 79
## Median : 0.01642 2:139 Median :-0.0813 Median :-0.1566
## Mean : 0.00000 3: 39 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.68208 3rd Qu.: 0.4872 3rd Qu.: 0.6720
## Max. : 2.56813 Max. : 3.8982 Max. : 6.2356
## restecg thalach exang oldpeak slope ca
## 0:247 Min. :-3.3196 0:331 Min. :-0.8961 0: 37 Min. :-0.7009
## 1:244 1st Qu.:-0.7121 1:169 1st Qu.:-0.8961 1:232 1st Qu.:-0.7009
## 2: 9 Median : 0.1428 Median :-0.2223 2:231 Median :-0.7009
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7412 3rd Qu.: 0.4514 3rd Qu.: 0.2463
## Max. : 2.2801 Max. : 4.3255 Max. : 3.0878
## thal target
## 0: 5 neg:230
## 1: 34 pos:270
## 2:265
## 3:196
##
##
#Calculate the skewness before the Box-Cox transform
sapply(heart_standardize_transform[, -c(2, 3, 6, 7, 9, 11, 13, 14)], skewness, type = 2)## age trestbps chol thalach oldpeak ca
## -0.1718278 0.7100820 0.8807213 -0.5100814 1.2866519 1.3141907
# **************************************************************************************************************
# RESULTS
# Age: The skewness is relatively small (-0.27). You might not need a Box-Cox transformation for age,
# especially since the skewness is not highly pronounced.
# trestbps (resting blood pressure): The skewness is moderate (0.78). Considering the moderate skewness,
# you could experiment with a Box-Cox transformation to see if it improves the distribution.
# chol (serum cholesterol): The skewness is high (1.49). A Box-Cox transformation is often beneficial for
# skewed variables, so you might consider applying it to cholesterol.
# thalach (maximum heart rate achieved): The skewness is relatively small (-0.56). A Box-Cox transformation
# might not be necessary for thalach, especially if the skewness is not a significant concern.
# oldpeak (ST depression induced by exercise): The skewness is high (1.21). Given the high skewness, a Box-Cox
# transformation could be beneficial for oldpeak.
# ca (number of major vessels colored by fluoroscopy): The skewness is high (1.35). Similar to cholesterol and
# oldpeak, a Box-Cox transformation might be considered for ca.
# **************************************************************************************************************
#Plot a histogram to view the skewness before the Box-Cox transform
heart_health_variable <- as.numeric( unlist(heart_standardize_transform [, 1]))
hist(heart_health_variable , main = names(heart_standardize_transform)[1])
heart_health_variable <- as.numeric( unlist(heart_standardize_transform [, 4]))
hist(heart_health_variable , main = names(heart_standardize_transform)[4])
heart_health_variable <- as.numeric( unlist(heart_standardize_transform [, 5]))
hist(heart_health_variable , main = names(heart_standardize_transform)[5])
heart_health_variable <- as.numeric( unlist(heart_standardize_transform [, 8]))
hist(heart_health_variable , main = names(heart_standardize_transform)[8])
heart_health_variable <- as.numeric( unlist(heart_standardize_transform [, 10]))
hist(heart_health_variable , main = names(heart_standardize_transform)[10])
heart_health_variable <- as.numeric( unlist(heart_standardize_transform [, 12]))
hist(heart_health_variable , main = names(heart_standardize_transform)[12])
# Select the columns of interest
columns_of_interest <- c(12, 10, 5, 4)
# Apply Box-Cox transformation only to selected columns
model_of_the_transform <- preProcess(heart_standardize_transform[, columns_of_interest], method = c("BoxCox"))
# Print the transformation details
print(model_of_the_transform)## Created from 500 samples and 0 variables
##
## Pre-processing:
## - ignored (0)
# Apply the transformation to the selected columns
heart_box_cox_transform <- predict(model_of_the_transform, heart_standardize_transform[, columns_of_interest])
# AFTER
#1, 4, 5, 8, 10, 12
sapply(heart_box_cox_transform, skewness, type = 2)## ca oldpeak chol trestbps
## 1.3141907 1.2866519 0.8807213 0.7100820
# **************************************************************************************************************
#RESULT
#ca: Skewness is approximately 1.35. A positive skewness indicates that the distribution
#has a longer right tail.
#oldpeak: Skewness is approximately 1.21. Similar to "ca," the positive skewness
#suggests a longer right tail in the distribution.
#chol: Skewness is approximately 1.49. Again, positive skewness indicates a longer
#right tail.
#trestbps: Skewness is approximately 0.78. The positive skewness suggests a longer
#right tail, but it's less pronounced compared to the other variables.
# **************************************************************************************************************
heart_health_variable <- as.numeric( unlist(heart_box_cox_transform[, 1]))
hist(heart_health_variable , main = names(heart_box_cox_transform)[1])
heart_health_variable <- as.numeric( unlist(heart_box_cox_transform [, 2]))
hist(heart_health_variable , main = names(heart_box_cox_transform)[2])
heart_health_variable <- as.numeric( unlist(heart_box_cox_transform[, 3]))
hist(heart_health_variable , main = names(heart_box_cox_transform)[3])
heart_health_variable <- as.numeric( unlist(heart_box_cox_transform [, 4]))
hist(heart_health_variable , main = names(heart_box_cox_transform)[4])
### PCA for Dimensionality Reduction on the heart Dataset ----
# Combine original and transformed columns
heart_combined <- cbind(heart_standardize_transform[, c("age", "thalach")], heart_box_cox_transform)
summary(heart_combined )## age thalach ca oldpeak
## Min. :-2.75717 Min. :-3.3196 Min. :-0.7009 Min. :-0.8961
## 1st Qu.:-0.76019 1st Qu.:-0.7121 1st Qu.:-0.7009 1st Qu.:-0.8961
## Median : 0.01642 Median : 0.1428 Median :-0.7009 Median :-0.2223
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.68208 3rd Qu.: 0.7412 3rd Qu.: 0.2463 3rd Qu.: 0.4514
## Max. : 2.56813 Max. : 2.2801 Max. : 3.0878 Max. : 4.3255
## chol trestbps
## Min. :-2.4058 Min. :-2.1279
## 1st Qu.:-0.7091 1st Qu.:-0.6498
## Median :-0.1566 Median :-0.0813
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6720 3rd Qu.: 0.4872
## Max. : 6.2356 Max. : 3.8982
model_of_the_transform <- preProcess(heart_combined, method =
c("scale", "center", "pca"))
print(model_of_the_transform)## Created from 500 samples and 6 variables
##
## Pre-processing:
## - centered (6)
## - ignored (0)
## - principal component signal extraction (6)
## - scaled (6)
##
## PCA needed 6 components to capture 95 percent of the variance
## PC1 PC2 PC3 PC4
## Min. :-3.41728 Min. :-2.77244 Min. :-4.40309 Min. :-2.480478
## 1st Qu.:-1.04054 1st Qu.:-0.70782 1st Qu.:-0.71122 1st Qu.:-0.550784
## Median :-0.08947 Median :-0.05989 Median :-0.03299 Median :-0.004927
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.99553 3rd Qu.: 0.63505 3rd Qu.: 0.59697 3rd Qu.: 0.522978
## Max. : 3.45820 Max. : 4.13484 Max. : 3.00870 Max. : 2.765638
## PC5 PC6
## Min. :-3.25100 Min. :-1.48242
## 1st Qu.:-0.45213 1st Qu.:-0.48877
## Median :-0.05723 Median :-0.07673
## Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.42679 3rd Qu.: 0.47682
## Max. : 2.84971 Max. : 1.73697
### PCA for Feature Extraction ----
heart_pca_fe <- princomp(cor(heart_combined))
summary(heart_pca_fe)## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 0.7109310 0.3920044 0.3569546 0.3097962 0.22333985
## Proportion of Variance 0.5420891 0.1648153 0.1366601 0.1029361 0.05349931
## Cumulative Proportion 0.5420891 0.7069045 0.8435646 0.9465007 1.00000000
## Comp.6
## Standard deviation 5.882502e-09
## Proportion of Variance 3.711419e-17
## Cumulative Proportion 1.000000e+00
# **************************************************************************************************************
#RESULTS
#the PCA results suggest that the first five principal components retain the majority of the information in your dataset.
#The subsequent components (Comp.6 ) contribute very little or no additional information. I may consider using
#the first five components for further analysis, as they represent the most important features in terms of explaining the
#variance in the data. The negligible contribution of Comp.6 might indicate that these components are not providing
#meaningful information for the analysis.
# **************************************************************************************************************
#### Scree Plot ----
# The cumulative proportion reaches 100% after Comp.5, indicating that the first five components capture
#all the variance in the original data.
factoextra::fviz_eig(heart_pca_fe, addlabels = TRUE)## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## age 0.48945115 0.3072417 0.08517206 0.42252444 0.5639681
## thalach -0.66520939 -0.0904431 0.02928681 0.05066650 0.0582243
## ca 0.35863816 -0.4924230 -0.31097901 0.42411265 -0.5191964
## oldpeak 0.40379185 -0.2934719 0.01815823 -0.77322182 0.1206458
## chol 0.08206685 0.7338168 -0.43306383 -0.20130090 -0.4178283
## trestbps 0.13974934 0.1741534 0.84101490 0.02530656 -0.4688808
## [1] "age" "thalach" "ca" "oldpeak" "chol" "trestbps"
factoextra::fviz_pca_var(heart_pca_fe, col.var = "cos2",
gradient.cols = c("red", "orange", "green"),
repel = TRUE)# Convert relevant columns to factors
categorical_cols <- c("sex", "cp", "fbs", "restecg", "exang", "slope", "thal", "target")
heart_categorical <- heart
heart_categorical[categorical_cols] <- lapply(heart_categorical[categorical_cols], as.factor)
# Perform Multiple Correspondence Analysis (MCA)
heart_mca <- MCA(heart_categorical[, c(2, 3, 6, 7, 9, 11, 13, 14)])##
## Call:
## MCA(X = heart_categorical[, c(2, 3, 6, 7, 9, 11, 13, 14)])
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 0.349 0.165 0.155 0.148 0.133 0.128 0.115
## % of var. 19.940 9.440 8.863 8.454 7.596 7.308 6.551
## Cumulative % of var. 19.940 29.379 38.243 46.697 54.293 61.601 68.153
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 0.111 0.097 0.090 0.084 0.074 0.058 0.044
## % of var. 6.370 5.537 5.123 4.774 4.207 3.302 2.535
## Cumulative % of var. 74.523 80.059 85.182 89.956 94.163 97.465 100.000
##
## Individuals (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3 ctr cos2
## 1 | 0.026 0.000 0.001 | -0.137 0.023 0.020 | -0.066 0.006 0.005
## 2 | 0.607 0.211 0.152 | -0.396 0.190 0.065 | 0.460 0.273 0.088
## 3 | -0.266 0.040 0.092 | -0.042 0.002 0.002 | 0.059 0.005 0.005
## 4 | 0.176 0.018 0.029 | -0.331 0.133 0.101 | -0.069 0.006 0.004
## 5 | -0.378 0.082 0.071 | -0.200 0.048 0.020 | 0.824 0.876 0.336
## 6 | 0.841 0.405 0.273 | 0.357 0.154 0.049 | 0.013 0.000 0.000
## 7 | 0.541 0.168 0.272 | -0.527 0.336 0.258 | 0.115 0.017 0.012
## 8 | 0.176 0.018 0.029 | -0.331 0.133 0.101 | -0.069 0.006 0.004
## 9 | 0.308 0.054 0.117 | -0.035 0.001 0.002 | -0.223 0.064 0.062
## 10 | -0.563 0.182 0.330 | -0.006 0.000 0.000 | 0.000 0.000 0.000
##
## 1 |
## 2 |
## 3 |
## 4 |
## 5 |
## 6 |
## 7 |
## 8 |
## 9 |
## 10 |
##
## Categories (the 10 first)
## Dim.1 ctr cos2 v.test Dim.2 ctr cos2 v.test
## sex_0 | -0.649 4.773 0.195 -9.859 | 0.393 3.687 0.071 5.962 |
## sex_1 | 0.300 2.205 0.195 9.859 | -0.181 1.704 0.071 -5.962 |
## cp_0 | 0.720 8.980 0.486 15.570 | -0.056 0.115 0.003 -1.211 |
## cp_1 | -0.967 5.365 0.178 -9.432 | -0.012 0.002 0.000 -0.116 |
## cp_2 | -0.688 4.708 0.182 -9.531 | 0.060 0.076 0.001 0.833 |
## cp_3 | -0.030 0.003 0.000 -0.198 | 0.158 0.147 0.002 1.024 |
## fbs_0 | -0.059 0.105 0.019 -3.044 | -0.083 0.435 0.036 -4.260 |
## fbs_1 | 0.315 0.560 0.019 3.044 | 0.440 2.317 0.036 4.260 |
## restecg_0 | 0.278 1.365 0.075 6.130 | 0.120 0.534 0.014 2.638 |
## restecg_1 | -0.317 1.752 0.096 -6.904 | -0.304 3.407 0.088 -6.624 |
## Dim.3 ctr cos2 v.test
## sex_0 -0.844 18.127 0.329 -12.810 |
## sex_1 0.390 8.374 0.329 12.810 |
## cp_0 -0.266 2.762 0.066 -5.757 |
## cp_1 0.804 8.325 0.123 7.834 |
## cp_2 -0.452 4.567 0.079 -6.259 |
## cp_3 1.612 16.337 0.220 10.475 |
## fbs_0 -0.106 0.758 0.060 -5.451 |
## fbs_1 0.563 4.040 0.060 5.451 |
## restecg_0 -0.129 0.662 0.016 -2.846 |
## restecg_1 0.162 1.031 0.025 3.531 |
##
## Categorical variables (eta2)
## Dim.1 Dim.2 Dim.3
## sex | 0.195 0.071 0.329 |
## cp | 0.532 0.004 0.397 |
## fbs | 0.019 0.036 0.060 |
## restecg | 0.104 0.494 0.034 |
## exang | 0.425 0.021 0.035 |
## slope | 0.331 0.257 0.256 |
## thal | 0.512 0.429 0.128 |
## target | 0.674 0.009 0.003 |
heart_numerical_combined_with_factors <- cbind( heart_combined, heart[, c(2, 3, 6, 7, 9, 11, 13, 14)])
summary(heart_numerical_combined_with_factors)## age thalach ca oldpeak
## Min. :-2.75717 Min. :-3.3196 Min. :-0.7009 Min. :-0.8961
## 1st Qu.:-0.76019 1st Qu.:-0.7121 1st Qu.:-0.7009 1st Qu.:-0.8961
## Median : 0.01642 Median : 0.1428 Median :-0.7009 Median :-0.2223
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.68208 3rd Qu.: 0.7412 3rd Qu.: 0.2463 3rd Qu.: 0.4514
## Max. : 2.56813 Max. : 2.2801 Max. : 3.0878 Max. : 4.3255
## chol trestbps sex cp fbs restecg exang
## Min. :-2.4058 Min. :-2.1279 0:158 0:242 0:421 0:247 0:331
## 1st Qu.:-0.7091 1st Qu.:-0.6498 1:342 1: 80 1: 79 1:244 1:169
## Median :-0.1566 Median :-0.0813 2:139 2: 9
## Mean : 0.0000 Mean : 0.0000 3: 39
## 3rd Qu.: 0.6720 3rd Qu.: 0.4872
## Max. : 6.2356 Max. : 3.8982
## slope thal target
## 0: 37 0: 5 neg:230
## 1:232 1: 34 pos:270
## 2:231 2:265
## 3:196
##
##
## age thalach ca oldpeak
## Min. :-2.75717 Min. :-3.3196 Min. :-0.7009 Min. :-0.8961
## 1st Qu.:-0.76019 1st Qu.:-0.7121 1st Qu.:-0.7009 1st Qu.:-0.8961
## Median : 0.01642 Median : 0.1428 Median :-0.7009 Median :-0.2223
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.68208 3rd Qu.: 0.7412 3rd Qu.: 0.2463 3rd Qu.: 0.4514
## Max. : 2.56813 Max. : 2.2801 Max. : 3.0878 Max. : 4.3255
## chol trestbps sex cp fbs restecg exang
## Min. :-2.4058 Min. :-2.1279 0:158 0:242 0:421 0:247 0:331
## 1st Qu.:-0.7091 1st Qu.:-0.6498 1:342 1: 80 1: 79 1:244 1:169
## Median :-0.1566 Median :-0.0813 2:139 2: 9
## Mean : 0.0000 Mean : 0.0000 3: 39
## 3rd Qu.: 0.6720 3rd Qu.: 0.4872
## Max. : 6.2356 Max. : 3.8982
## slope thal target
## 0: 37 0: 5 neg:230
## 1:232 1: 34 pos:270
## 2:231 2:265
## 3:196
##
##
model_of_the_transform <- preProcess(heart_numerical_combined_with_factors ,
method = c("scale", "center", "ica"),
n.comp = 6)
print(model_of_the_transform)## Created from 500 samples and 14 variables
##
## Pre-processing:
## - centered (6)
## - independent component signal extraction (6)
## - ignored (8)
## - scaled (6)
##
## ICA used 6 components
heart_numerical_combined_with_factors_ica_dr <- predict(model_of_the_transform, heart_numerical_combined_with_factors )
summary(heart_numerical_combined_with_factors_ica_dr)## sex cp fbs restecg exang slope thal target
## 0:158 0:242 0:421 0:247 0:331 0: 37 0: 5 neg:230
## 1:342 1: 80 1: 79 1:244 1:169 1:232 1: 34 pos:270
## 2:139 2: 9 2:231 2:265
## 3: 39 3:196
##
##
## ICA1 ICA2 ICA3 ICA4
## Min. :-6.37988 Min. :-2.4012 Min. :-4.20026 Min. :-2.2059
## 1st Qu.:-0.59951 1st Qu.:-0.7661 1st Qu.:-0.62554 1st Qu.:-0.5649
## Median : 0.08156 Median : 0.0411 Median : 0.06527 Median :-0.1122
## Mean : 0.00000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.63948 3rd Qu.: 0.7562 3rd Qu.: 0.59384 3rd Qu.: 0.4460
## Max. : 2.80818 Max. : 3.0843 Max. : 2.38632 Max. : 3.7516
## ICA5 ICA6
## Min. :-4.1307 Min. :-2.0626
## 1st Qu.:-0.4323 1st Qu.:-0.7113
## Median : 0.1016 Median :-0.2247
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5648 3rd Qu.: 0.7633
## Max. : 2.7263 Max. : 2.5939
# ************************************************************************************************************************************
#RESULTS
# Summary of the Transformed Dataset after Independent Component Analysis (ICA)
# Categorical Variables:
# - sex, cp, fbs, restecg, exang, slope, thal, target are categorical variables with counts and percentages for each level.
# Independent Component Analysis (ICA) Components:
# - ICA1 to ICA6 represent the Independent Component Analysis components.
# - For each component, descriptive statistics are provided:
# - Minimum, 1st Quartile, Median, Mean, 3rd Quartile, Maximum.
# Example for ICA1:
# ICA1: Min. -4.3196, 1st Qu. -0.3524, Median 0.1066, Mean 0.0000, 3rd Qu. 0.5823, Max. 2.4614.
# These statistics offer insights into the distribution and characteristics of the transformed variables after applying ICA.
# ************************************************************************************************************************************## 'data.frame': 500 obs. of 14 variables:
## $ age : num -0.316 -1.759 0.571 -0.538 -0.871 ...
## $ thalach : num 1.596 1.425 1.425 -0.413 0.314 ...
## $ ca : num -0.701 -0.701 -0.701 2.141 -0.701 ...
## $ oldpeak : num -0.896 2.304 -0.896 0.788 -0.896 ...
## $ chol : num 0.258 -0.334 0.455 -1.183 -1.005 ...
## $ trestbps: num 0.487 -0.65 0.374 -0.65 -1.73 ...
## $ sex : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ cp : Factor w/ 4 levels "0","1","2","3": 1 4 1 3 2 1 1 3 1 3 ...
## $ fbs : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
## $ restecg : Factor w/ 3 levels "0","1","2": 1 2 1 2 2 2 2 2 1 1 ...
## $ exang : Factor w/ 2 levels "0","1": 2 2 1 1 1 2 2 1 1 1 ...
## $ slope : Factor w/ 3 levels "0","1","2": 3 2 3 2 3 2 3 2 2 3 ...
## $ thal : Factor w/ 4 levels "0","1","2","3": 3 4 3 4 4 2 4 4 3 3 ...
## $ target : Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 1 1 1 2 ...
# Define a 75:25 train:test data split of the dataset.
# That is, 75% of the original data will be used to train the model and
# 25% of the original data will be used to test the model.
train_index <- createDataPartition(heart_preprocessed$target,
p = 0.75,
list = FALSE)
heart_preprocessed_train <- heart_preprocessed[train_index, ]
heart_preprocessed_test <- heart_preprocessed[-train_index, ]# Train a Naive Bayes model using the caret package.
heart_model_nb_caret <- # nolint
caret::train(target ~ ., data =
heart_preprocessed_train[, c("age", "sex", "cp", "trestbps" , "chol" , "fbs" ,
"restecg" , "thalach" , "exang",
"oldpeak", "slope", "ca", "thal", "target")],
method = "naive_bayes")### 3.b. Test the trained caret Naive Bayes model using the testing dataset ----
predictions_nb_caret <-
predict(heart_model_nb_caret,
heart_preprocessed_test[, c("age", "sex", "cp", "trestbps" , "chol" , "fbs" ,
"restecg" , "thalach" , "exang",
"oldpeak", "slope", "ca", "thal")])heart_freq <- heart_preprocessed_train$target
cbind(frequency =
table(heart_freq),
percentage = prop.table(table(heart_freq)) * 100)## frequency percentage
## neg 173 46.01064
## pos 203 53.98936
# The glm function in this code is used to train a Generalized Linear Model
# for predicting the target variable based on the provided features in the training dataset.
# The subsequent printing of model details and performance metrics allows for an assessment of its effectiveness.
# Set up train control with classProbs = TRUE
train_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3, classProbs = TRUE)
set.seed(7)
heart_model_glm <-
train(target ~ ., data = heart_preprocessed_train, method = "glm",
metric = "Accuracy", trControl = train_control)
print(heart_model_glm)## Generalized Linear Model
##
## 376 samples
## 13 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 339, 337, 338, 338, 338, 339, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8354263 0.6670537
# Set the seed for reproducibility
set.seed(7)
# Train a Linear Discriminant Analysis (LDA) model using the specified training control
heart_model_lda <- train(target ~ ., data = heart_preprocessed_train,
method = "lda", trControl = train_control)# Set the seed for reproducibility
set.seed(7)
# Train a Classification and Regression Trees (CART) model using the specified training control
heart_model_cart <- train(target ~ ., data = heart_preprocessed_train,
method = "rpart", trControl = train_control)# Set the seed for reproducibility
set.seed(7)
# Train a k-Nearest Neighbors (knn) model using the specified training control
heart_model_knn <- train(target ~ ., data = heart_preprocessed_train,
method = "knn", trControl = train_control)# Set the seed for reproducibility
set.seed(7)
# Train a Random Forest (rf) model using the specified training control
heart_model_rf <- train(target ~ ., data = heart_preprocessed_train,
method = "rf", trControl = train_control)# We then create a list of the model results and pass the list as an argument
# to the `resamples` function.
results <- resamples(list(LDA = heart_model_lda, CART = heart_model_cart,
KNN = heart_model_knn, GLM = heart_model_glm,
RF = heart_model_rf))## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Create an empty plot
plot(0, 0, type = "n", main = "ROC Curve", xlab = "False Positive Rate", ylab = "True Positive Rate")predictions_lda <- predict(heart_model_lda, heart_preprocessed_test[, 1:13],type = "prob")
predictions_cart <- predict(heart_model_cart, heart_preprocessed_test[, 1:13],type = "prob")
predictions_knn <- predict(heart_model_knn, heart_preprocessed_test[, 1:13],type = "prob")
predictions_glm <- predict(heart_model_glm, heart_preprocessed_test[, 1:13],type = "prob")
predictions_rf <- predict(heart_model_rf, heart_preprocessed_test[, 1:13],type = "prob")
# Add ROC curves for each model to the plot
roc_curve_lda <- roc(heart_preprocessed_test$target, predictions_lda$neg)## Setting levels: control = neg, case = pos
## Setting direction: controls > cases
## Setting levels: control = neg, case = pos
## Setting direction: controls > cases
## Setting levels: control = neg, case = pos
## Setting direction: controls > cases
## Setting levels: control = neg, case = pos
## Setting direction: controls > cases
## Setting levels: control = neg, case = pos
## Setting direction: controls > cases
# Plot each ROC curve with a different color
plot(roc_curve_lda, main = "ROC curve for LDA", print.auc = TRUE,
print.auc.x = 0.6, print.auc.y = 0.6, col = "red", lwd = 2.5)plot(roc_curve_cart, main = "ROC curve for CART", print.auc = TRUE,
print.auc.x = 0.6, print.auc.y = 0.6, col = "blue", lwd = 2.5)plot(roc_curve_knn, main = "ROC curve for KNN", print.auc = TRUE,
print.auc.x = 0.6, print.auc.y = 0.6, col = "green", lwd = 2.5)plot(roc_curve_glm,main = "ROC curve for GLM", print.auc = TRUE,
print.auc.x = 0.6, print.auc.y = 0.6, col = "purple")plot(roc_curve_rf, main = "ROC curve for RF", print.auc = TRUE,
print.auc.x = 0.6, print.auc.y = 0.6, col = "orange", lwd = 2.5)## 1. Table Summary ----
# This is the simplest comparison. It creates a table with one model per row
# and its corresponding evaluation metrics displayed per column.
summary(results)##
## Call:
## summary.resamples(object = results)
##
## Models: LDA, CART, KNN, GLM, RF
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LDA 0.7027027 0.7683144 0.8399716 0.8388676 0.9189189 0.9729730 0
## CART 0.6486486 0.7046586 0.7368421 0.7439764 0.7837838 0.8684211 0
## KNN 0.6756757 0.7567568 0.7894737 0.8078291 0.8860242 0.9729730 0
## GLM 0.7027027 0.7631579 0.8421053 0.8354263 0.8940256 0.9729730 0
## RF 0.7631579 0.8918919 0.9220648 0.9212137 0.9729730 1.0000000 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LDA 0.3879699 0.5319792 0.6705086 0.6721683 0.8345753 0.9458272 0
## CART 0.2701062 0.4168520 0.4765681 0.4883384 0.5731249 0.7382920 0
## KNN 0.3412463 0.5124451 0.5765885 0.6137426 0.7691556 0.9453471 0
## GLM 0.3934426 0.5236769 0.6788630 0.6670537 0.7886056 0.9458272 0
## RF 0.5289256 0.7822645 0.8441422 0.8414560 0.9453471 1.0000000 0
## 2. Box and Whisker Plot ----
# This is useful for visually observing the spread of the estimated accuracies
# for different algorithms and how they relate.
scales <- list(x = list(relation = "free"), y = list(relation = "free"))
bwplot(results, scales = scales)## 3. Dot Plots ----
# They show both the mean estimated accuracy as well as the 95% confidence
# interval (e.g. the range in which 95% of observed scores fell).
scales <- list(x = list(relation = "free"), y = list(relation = "free"))
dotplot(results, scales = scales)## 4. Scatter Plot Matrix ----
# This is useful when considering whether the predictions from two
# different algorithms are correlated. If weakly correlated, then they are good
# candidates for being combined in an ensemble prediction.
splom(results)## 5. Pairwise xyPlots ----
# You can zoom in on one pairwise comparison of the accuracy of trial-folds for
# two models using an xyplot.
# xyplot plots to compare models
xyplot(results, models = c("LDA", "RF"))# Explanation:
# - Upper diagonal: Estimates of the difference in accuracy and kappa values between models.
# - Lower diagonal: P-values for testing the hypothesis that the difference is zero.
# Accuracy:
# - Positive values indicate higher accuracy in the row model compared to the column model.
# - Negative values indicate lower accuracy.
# Kappa:
# - Positive values indicate higher kappa values in the row model compared to the column model.
# - Negative values indicate lower kappa values.
# P-values:
# - Assess the statistical significance of the observed differences.
# - Values below the significance level (e.g., 0.05) suggest a significant difference.
# - Bonferroni adjustment is applied for multiple comparisons.
diffs <- diff(results)
summary(diffs)##
## Call:
## summary.diff.resamples(object = diffs)
##
## p-value adjustment: bonferroni
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
##
## Accuracy
## LDA CART KNN GLM RF
## LDA 0.094891 0.031039 0.003441 -0.082346
## CART 4.763e-07 -0.063853 -0.091450 -0.177237
## KNN 0.03721 5.723e-05 -0.027597 -0.113385
## GLM 1.00000 7.732e-07 0.12566 -0.085787
## RF 1.874e-06 9.436e-16 5.211e-09 3.641e-06
##
## Kappa
## LDA CART KNN GLM RF
## LDA 0.183830 0.058426 0.005115 -0.169288
## CART 1.473e-06 -0.125404 -0.178715 -0.353118
## KNN 0.06184 7.865e-05 -0.053311 -0.227713
## GLM 1.00000 1.744e-06 0.16237 -0.174402
## RF 1.726e-06 1.033e-15 5.161e-09 3.456e-06
# Define train control settings for a random search
train_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3, search = "random")
# Set a seed for reproducibility
set.seed(7)
# Perform a random search for Random Forest hyperparameters
heart_model_random_search_rf <- train(target ~ ., data = heart_preprocessed_train, method = "rf",
metric = "Accuracy",
tuneLength = 12, # Search 12 options for the value of mtry
trControl = train_control)
# Display the details of the random search model
print(heart_model_random_search_rf)## Random Forest
##
## 376 samples
## 13 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 339, 337, 338, 338, 338, 339, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9186521 0.8362982
## 3 0.9185347 0.8361989
## 7 0.9203365 0.8396493
## 8 0.9194818 0.8381586
## 12 0.9229918 0.8450439
## 15 0.9212374 0.8414960
## 18 0.9159032 0.8308722
## 19 0.9168041 0.8327416
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 12.
## 2.a. Bagged CART ----
set.seed(7)
heart_model_bagged_cart <- train(target ~ ., data = heart_preprocessed_train, method = "treebag",
metric = "Accuracy",
trControl = train_control)
## 2.b. Random Forest ----
set.seed(7)
heart_model_rf <- train(target ~ ., data = heart_preprocessed_train, method = "rf",
metric = "Accuracy", trControl = train_control)
# Summarize results
bagging_results <-
resamples(list("Bagged Decision Tree" = heart_model_bagged_cart,
"Random Forest" = heart_model_rf))
summary(bagging_results)##
## Call:
## summary.resamples(object = bagging_results)
##
## Models: Bagged Decision Tree, Random Forest
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Bagged Decision Tree 0.7894737 0.8918919 0.9199858 0.9124394 0.9402287 1
## Random Forest 0.7631579 0.8918919 0.9220648 0.9177274 0.9473684 1
## NA's
## Bagged Decision Tree 0
## Random Forest 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Bagged Decision Tree 0.5789474 0.7828288 0.8388970 0.8242421 0.8801471 1
## Random Forest 0.5263158 0.7822645 0.8441422 0.8345238 0.8947368 1
## NA's
## Bagged Decision Tree 0
## Random Forest 0

##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 6.91%
## Confusion matrix:
## neg pos class.error
## neg 159 14 0.08092486
## pos 12 191 0.05911330
# We can test the model
set.seed(9)
predictions <- predict(heart_model_rf, newdata = heart_preprocessed_test)
confusionMatrix(predictions, heart_preprocessed_test$target)## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 53 2
## pos 4 65
##
## Accuracy : 0.9516
## 95% CI : (0.8977, 0.982)
## No Information Rate : 0.5403
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9023
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9298
## Specificity : 0.9701
## Pos Pred Value : 0.9636
## Neg Pred Value : 0.9420
## Prevalence : 0.4597
## Detection Rate : 0.4274
## Detection Prevalence : 0.4435
## Balanced Accuracy : 0.9500
##
## 'Positive' Class : neg
##
# Saving a model into a file allows you to load it later and use it to make
# predictions. Saved models can be loaded by calling the `readRDS()` function
saveRDS(heart_model_rf, "../models/saved_heart_model_rf.rds")
# The saved model can then be loaded later as follows:
loaded_heart_model_rf <- readRDS("../models/saved_heart_model_rf.rds")
print(loaded_heart_model_rf)## Random Forest
##
## 376 samples
## 13 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 339, 337, 338, 338, 338, 339, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9168278 0.8324670
## 7 0.9177274 0.8345238
## 15 0.9158794 0.8309357
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.
predictions_with_loaded_model <-
predict(loaded_heart_model_rf, newdata = heart_preprocessed_test)
confusionMatrix(predictions_with_loaded_model, heart_preprocessed_test$target)## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 53 2
## pos 4 65
##
## Accuracy : 0.9516
## 95% CI : (0.8977, 0.982)
## No Information Rate : 0.5403
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9023
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9298
## Specificity : 0.9701
## Pos Pred Value : 0.9636
## Neg Pred Value : 0.9420
## Prevalence : 0.4597
## Detection Rate : 0.4274
## Detection Prevalence : 0.4435
## Balanced Accuracy : 0.9500
##
## 'Positive' Class : neg
##
# Plumber requires functions, an example of the syntax for creating a function
# in R is:
name_of_function <- function(arg) {
# Do something with the argument called `arg`
}# We can also create and use our own data frame as follows:
# Convert categorical variables to factors
to_be_predicted <-
data.frame(age=70, sex=1, cp=0, trestbps=145, chol=174,
fbs=0, restecg=1, thalach=125, exang=1, oldpeak=2.6,
slope=0,
ca=0, thal=3)
to_be_predicted$sex <- as.factor(to_be_predicted$sex)
to_be_predicted$cp <- as.factor(to_be_predicted$cp)
to_be_predicted$fbs <- as.factor(to_be_predicted$fbs)
to_be_predicted$restecg <- as.factor(to_be_predicted$restecg)
to_be_predicted$exang <- as.factor(to_be_predicted$exang)
to_be_predicted$slope <- as.factor(to_be_predicted$slope)
to_be_predicted$thal <- as.factor(to_be_predicted$thal)
# We then use the data frame to make predictions
predict(loaded_heart_model_rf, newdata = to_be_predicted)## [1] neg
## Levels: neg pos
# An alternative is to create a function and then use the function to make
# predictions
predict_target <- function(arg_age, arg_sex, arg_cp, arg_trestbps, arg_chol,
arg_fbs, arg_restecg, arg_thalach, arg_exang, arg_oldpeak, arg_slope,
arg_ca, arg_thal) {
# Convert categorical variables to factors
arg_sex <- as.factor(arg_sex)
arg_cp <- as.factor(arg_cp)
arg_fbs <- as.factor(arg_fbs)
arg_restecg <- as.factor(arg_restecg)
arg_exang <- as.factor(arg_exang)
arg_slope <- as.factor(arg_slope)
arg_thal <- as.factor(arg_thal)
# Create a data frame using the arguments
to_be_predicted <- data.frame(age = arg_age, sex = arg_sex, cp = arg_cp, trestbps = arg_trestbps,
chol = arg_chol, fbs = arg_fbs, restecg = arg_restecg,
thalach = arg_thalach, exang = arg_exang, oldpeak = arg_oldpeak,
slope = arg_slope, ca = arg_ca, thal = arg_thal)
# Make a prediction based on the data frame
predict(loaded_heart_model_rf, newdata = to_be_predicted)
}# STEP 1. Install and Load the Required Packages ----
## plumber ----
if (require("plumber")) {
require("plumber")
} else {
install.packages("plumber", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}## Loading required package: plumber
## caret ----
if (require("caret")) {
require("caret")
} else {
install.packages("caret", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}
###LAB 12
# This requires the "plumber" package that was installed and loaded earlier in
# STEP 1. The commenting below makes R recognize the code as the definition of
# an API, i.e., #* comments.
loaded_heart_model_rf <- readRDS("../models/saved_heart_model_rf.rds")
#* @apiTitle CVDetect Prediction Model API
#* @apiDescription Used to predict whether a patient has heart disease or not.
#* @param arg_age Age of the patient
#* @param arg_sex Gender of the patient (0: Female, 1: Male)
#* @param arg_cp Chest pain type (0-3)
#* @param arg_trestbps Resting blood pressure (mm Hg)
#* @param arg_chol Serum cholesterol (mg/dl)
#* @param arg_fbs Fasting blood sugar (> 120 mg/dl, 0: No, 1: Yes)
#* @param arg_restecg Resting electrocardiographic results (0-2)
#* @param arg_thalach Maximum heart rate achieved
#* @param arg_exang Exercise-induced angina (0: No, 1: Yes)
#* @param arg_oldpeak ST depression induced by exercise relative to rest
#* @param arg_slope Slope of the peak exercise ST segment (0-2)
#* @param arg_ca Number of major vessels colored by fluoroscopy (0-3)
#* @param arg_thal Thallium stress test result (3: Normal, 6: Fixed defect, 7: Reversible defect)
#* @get /target
predict_diabetes <-
function(arg_age, arg_sex, arg_cp, arg_trestbps, arg_chol,
arg_fbs, arg_restecg, arg_thalach, arg_exang, arg_oldpeak, arg_slope,
arg_ca, arg_thal) {
# Create a data frame using the arguments
to_be_predicted <- data.frame(
age = as.numeric(arg_age),
sex = as.factor(arg_sex),
cp = as.factor(arg_cp),
trestbps = as.numeric(arg_trestbps),
chol = as.numeric(arg_chol),
fbs = as.factor(arg_fbs),
restecg = as.factor(arg_restecg),
thalach = as.numeric(arg_thalach),
exang = as.factor(arg_exang),
oldpeak = as.numeric(arg_oldpeak),
slope = as.factor(arg_slope),
ca = as.numeric(arg_ca),
thal = as.factor(arg_thal)
)
# Make a prediction based on the data frame
predict(loaded_heart_model_rf, newdata = to_be_predicted)
}#Run Plumber
# STEP 1. Install and load the required packages ----
## plumber ----
if (require("plumber")) {
require("plumber")
} else {
install.packages("plumber", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}
# STEP 2. Process a Plumber API ----
# This allows us to process a plumber API
#api <- plumber::plumb("lab12.R")
# STEP 3. Run the API on a specific port ----
# Specify a constant localhost port to use
#api$run(host = "127.0.0.1", port = 5022)