Heart Disease Prediction Report

Myriam Aicha Mbongo 10/16/2023

CARDIOVASCULAR DISEASE PREDICTION PROJECT

Student Details

Student ID Numbers and Names of Group Members GitHub Classroom Group Name
2. 134141 - C - Aicha Mbongo
Course Code Course Name Program Semester Duration
BBT4206 Business Intelligence II Bachelor of Business Information Technology 21st August 2023 to 28th November 2023

STEP 1. Install and Load the Required Packages —-

# STEP 1. Install and Load the Required Packages ----
# The following packages should be installed and loaded before proceeding to the
# subsequent steps.

## readr ----
if (require("readr")) {
  require("readr")
} else {
  install.packages("readr", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: readr
## caret ----
if (require("caret")) {
  require("caret")
} else {
  install.packages("caret", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: caret

## Loading required package: ggplot2

## Loading required package: lattice
## e1071 ----
if (require("e1071")) {
  require("e1071")
} else {
  install.packages("e1071", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: e1071
## factoextra ----
if (require("factoextra")) {
  require("factoextra")
} else {
  install.packages("factoextra", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: factoextra

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## FactoMineR ----
if (require("FactoMineR")) {
  require("FactoMineR")
} else {
  install.packages("FactoMineR", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: FactoMineR
if (!is.element("NHANES", installed.packages()[, 1])) {
  install.packages("NHANES", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("NHANES")
## Loading required package: NHANES
## dplyr ----
if (!is.element("dplyr", installed.packages()[, 1])) {
  install.packages("dplyr", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("dplyr")
## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## naniar ----
# Documentation:
#   https://cran.r-project.org/package=naniar or
#   https://www.rdocumentation.org/packages/naniar/versions/1.0.0
if (!is.element("naniar", installed.packages()[, 1])) {
  install.packages("naniar", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("naniar")
## Loading required package: naniar
## ggplot2 ----
# We require the "ggplot2" package to create more appealing visualizations
if (!is.element("ggplot2", installed.packages()[, 1])) {
  install.packages("ggplot2", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("ggplot2")

## MICE ----
# We use the MICE package to perform data imputation
if (!is.element("mice", installed.packages()[, 1])) {
  install.packages("mice", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("mice")
## Loading required package: mice

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind
## Amelia ----
if (!is.element("Amelia", installed.packages()[, 1])) {
  install.packages("Amelia", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("Amelia")
## Loading required package: Amelia

## Loading required package: Rcpp

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

Milestone 1 : Exploratory Data Analysis (Lab 1 and 2)

STEP 3. Load the downloaded sample datasets

library(readr)
heart <- read_csv(
  "../data/heart.csv",
  col_types = cols(
    age = col_double(),
    sex = col_factor(levels = c("0", "1")),
    cp = col_factor(levels = c("0", "1", "2", "3")),
    trestbps = col_double(),
    chol = col_double(),
    fbs = col_factor(levels = c("0", "1")),
    restecg = col_factor(levels = c("0", "1", "2")),
    thalach = col_double(),
    exang = col_factor(levels = c("0", "1")),
    oldpeak = col_double(),
    slope = col_factor(levels = c("0", "1", "2")),
    ca = col_double(),
    thal = col_factor(levels = c("0", "1", "2", "3")),
    target = col_factor(levels = c("neg", "pos"))
  )
)

#View(heart)

STEP 3a. Preview the Loaded Datasets, Identify the Data Types —-

### STEP 3a. Pre#View the Loaded Datasets, Identify the Data Types  ----
# Dimensions refer to the number of observations (rows) and the number of
# attributes/variables/features (columns).
#Understanding data types is key for effective analysis.It helps choose suitable visualizations and algorithms,  
#and highlights the need for conversions between categorical and numerical data when necessary.


dim(heart)
## [1] 1025   14
sapply(heart, class)
##       age       sex        cp  trestbps      chol       fbs   restecg   thalach 
## "numeric"  "factor"  "factor" "numeric" "numeric"  "factor"  "factor" "numeric" 
##     exang   oldpeak     slope        ca      thal    target 
##  "factor" "numeric"  "factor" "numeric"  "factor"  "factor"

STEP 3b. Identify the number of instances that belong to each class. —-

# It is more sensible to count categorical variables (factors or dimensions)
# than numeric variables, e.g., counting the number of male and female
# participants instead of counting the frequency of each participant’s height.

heart_freq <- heart$target
cbind(frequency = table(heart_freq),
      percentage = prop.table(table(heart_freq)) * 100)
##     frequency percentage
## neg       499   48.68293
## pos       526   51.31707

STEP 3c. Measures of Central Tendency(Calculate the mode ) —-

# We, therefore, must manually create a function that can calculate the mode.

heart_target_mode <- names(table(heart$target))[
  which(table(heart$target) == max(table(heart$target)))
]
print(heart_target_mode)
## [1] "pos"

STEP 3d. Measure the distribution of the data for each variable —-

summary(heart)
##       age        sex     cp         trestbps          chol     fbs     restecg
##  Min.   :29.00   0:312   0:497   Min.   : 94.0   Min.   :126   0:872   0:497  
##  1st Qu.:48.00   1:713   1:167   1st Qu.:120.0   1st Qu.:211   1:153   1:513  
##  Median :56.00           2:284   Median :130.0   Median :240           2: 15  
##  Mean   :54.43           3: 77   Mean   :131.6   Mean   :246                  
##  3rd Qu.:61.00                   3rd Qu.:140.0   3rd Qu.:275                  
##  Max.   :77.00                   Max.   :200.0   Max.   :564                  
##     thalach      exang      oldpeak      slope         ca         thal   
##  Min.   : 71.0   0:680   Min.   :0.000   0: 74   Min.   :0.0000   0:  7  
##  1st Qu.:132.0   1:345   1st Qu.:0.000   1:482   1st Qu.:0.0000   1: 64  
##  Median :152.0           Median :0.800   2:469   Median :0.0000   2:544  
##  Mean   :149.1           Mean   :1.072           Mean   :0.7541   3:410  
##  3rd Qu.:166.0           3rd Qu.:1.800           3rd Qu.:1.0000          
##  Max.   :202.0           Max.   :6.200           Max.   :4.0000          
##  target   
##  neg:499  
##  pos:526  
##           
##           
##           
## 

STEP 3e. Measure the standard deviation of each variable —-

# calculate the standard deviation of only columns that are numeric, thus
# leaving out the columns termed as “factors” (categorical) or those that have
# a string data type.

sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)], sd)
##       age  trestbps      chol   thalach   oldpeak        ca 
##  9.072290 17.516718 51.592510 23.005724  1.175053  1.030798
#or
sapply(heart[, c(1, 4, 5, 8, 10, 12)], sd)
##       age  trestbps      chol   thalach   oldpeak        ca 
##  9.072290 17.516718 51.592510 23.005724  1.175053  1.030798

STEP 3f. Measure the kurtosis of each variable —-

# The Kurtosis informs you of how often outliers occur in the results.
# There are different formulas for calculating kurtosis.
# Specifying “type = 2” allows us to use the 2nd formula which is the same
# kurtosis formula used in SPSS and SAS.

# In “type = 2” (used in SPSS and SAS):
# 1.    Kurtosis < 3 implies a low number of outliers
# 2.    Kurtosis = 3 implies a medium number of outliers
# 3.    Kurtosis > 3 implies a high number of outliers

if (!is.element("e1071", installed.packages()[, 1])) {
  install.packages("e1071", dependencies = TRUE)
}
require("e1071")

sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  kurtosis, type = 2)
##         age    trestbps        chol     thalach     oldpeak          ca 
## -0.52561781  0.99122074  3.99680305 -0.08882249  1.31447089  0.70112287

STEP 3g. Measure the skewness of each variable—-

# The skewness informs you of the asymmetry of the distribution of results.
# Using “type = 2” can be interpreted as:

# 1.    Skewness between -0.4 and 0.4 (inclusive) implies that there is no skew
# in the distribution of results; the distribution of results is symmetrical;
# it is a normal distribution.
# 2.    Skewness above 0.4 implies a positive skew; a right-skewed distribution.
# 3.    Skewness below -0.4 implies a negative skew; a left-skewed distribution.

sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  skewness, type = 2)
##        age   trestbps       chol    thalach    oldpeak         ca 
## -0.2488659  0.7397682  1.0740728 -0.5137772  1.2108994  1.2611886

STEP 3h. Measure the skewness of each variable—-

# Note that the covariance and the correlation are computed for numeric values
# only, not categorical values.

heart_cov <- cov(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)])
#View(heart_cov)

STEP 3i. Measure the correlation between variables —-

heart_cor <- cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)])
#View(heart_cor)

STEP 3j. Inferential Statistics —-

# One-Way ANOVA can be used to test the effect of the 3 types of fertilizer on
# crop yield whereas,
# Two-Way ANOVA can be used to test the effect of the 3 types of fertilizer and
# the 2 types of planting density on crop yield.
heart_one_way_anova <- aov(trestbps ~ age, data = heart)
summary(heart_one_way_anova)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## age            1  23096   23096   81.16 <2e-16 ***
## Residuals   1023 291104     285                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The ANOVA rejects the null hypothesis,The ANOVA indicates a significant difference in resting blood pressure among age groups 
#(F(1, 1023) = 81.16, p < 2e-16), highlighting age as a key factor 
#in determining blood pressure. 
#This aligns with cardiovascular knowledge, correlating increased age with a higher risk of cardiovascular disease.

heart_two_way_anova <- aov(trestbps ~ exang + ca, # nolint
                                           data = heart)
summary(heart_two_way_anova)
##               Df Sum Sq Mean Sq F value  Pr(>F)   
## exang          1   1177  1176.7    3.88 0.04914 * 
## ca             1   3050  3050.2   10.06 0.00156 **
## Residuals   1022 309973   303.3                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#both variables, exercise-induced angina and the number of major vessels, are associated with statistically significant differences in 
#resting blood pressure.

Univariate Plots —-

STEP 3k. Create Histograms for Each Numeric Attribute —-

# Histograms help in determining whether an attribute has a Gaussian
# distribution. They can also be used to identify the presence of outliers.



par(mfrow = c(1, 3))

for (i in c(1, 4, 5)) {
  heart_variable <- as.numeric(unlist(heart[, i]))
  hist(heart_variable, main = names(heart)[i])
}

heart_health_variable  <- as.numeric( unlist(heart[, 8]))
hist(heart_health_variable , main = names(heart)[8])

heart_health_variable  <- as.numeric( unlist(heart [, 10]))
hist(heart_health_variable , main = names(heart)[10])

heart_health_variable  <- as.numeric( unlist(heart [, 12]))
hist(heart_health_variable , main = names(heart)[12])

STEP 3l. Create Box and Whisker Plots for Each Numeric Attribute —-

# Box and whisker plots are useful in understanding the distribution of data.

par(mfrow = c(1, 3))
for (i in c(1, 4, 5)) {
  boxplot(heart[, i], main = names(heart)[i])
}

boxplot(heart[, 8], main = names(heart)[8])
boxplot(heart[, 10], main = names(heart)[10])
boxplot(heart[, 12], main = names(heart)[12])

STEP 3m. Create Bar Plots for Each Categorical Attribute —-

# Categorical attributes (factors) can also be visualized. This is done using a
# bar chart to give an idea of the proportion of instances that belong to each
# category.

barplot(table(heart[, 2]), main = names(heart)[2])

barplot(table(heart[, 3]), main = names(heart)[3])

barplot(table(heart[, 6]), main = names(heart)[6])

barplot(table(heart[, 7]), main = names(heart)[7])

barplot(table(heart[, 9]), main = names(heart)[9])

barplot(table(heart[, 11]), main = names(heart)[11])

barplot(table(heart[, 13]), main = names(heart)[13])

barplot(table(heart[, 14]), main = names(heart)[14])

STEP 3n. Create a Missingness Map to Identify Missing Data —-

# Execute the following to create a map to identify the missing data in each
# dataset:
if (!is.element("Amelia", installed.packages()[, 1])) {
  install.packages("Amelia", dependencies = TRUE)
}
require("Amelia")
#comment
missmap(heart, col = c("red", "grey"), legend = TRUE)

Multivariate Plots —-

STEP 3o. Create a Correlation Plot —-

# Correlation plots can be used to get an idea of which attributes change
# together. The function “corrplot()” found in the package “corrplot” is
# required to perform this. The larger the dot in the correlation plot, the
# larger the correlation. Blue represents a positive correlation whereas red
# represents a negative correlation.

if (!is.element("corrplot", installed.packages()[, 1])) {
  install.packages("corrplot", dependencies = TRUE)
}
require("corrplot")
## Loading required package: corrplot

## corrplot 0.92 loaded
corrplot(cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)]), method = "circle")

#heart <- heart[, -which(names(heart) == "target_numeric")]

# Alternatively, the 'ggcorrplot::ggcorrplot()' function can be used to plot a
# more visually appealing plot.
# The code below shows how to install a package in R:
if (!is.element("ggcorrplot", installed.packages()[, 1])) {
  install.packages("ggcorrplot", dependencies = TRUE)
}
require("ggcorrplot")
## Loading required package: ggcorrplot
ggcorrplot(cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)]))

STEP 3p. Create a Scatter Plot —-

pairs(heart)

# Alternatively, the ggcorrplot package can be used to make the plots more
# appealing:
ggplot(heart,
       aes(x = age, y = sex, shape = target, color = target)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula = 'y ~ x'

Milestone 2: Preprocessing and Data Transformation (Lab 3 and 4) —-

STEP 4. Create a subset of the variables/features —-

### Subset of rows ----
# We then select 500 random observations to be included in the dataset
rand_ind <- sample(seq_len(nrow(heart)), 500)
heart <- heart[rand_ind, ]

STEP 4a. Confirm the “missingness” in the Dataset before Imputation —-

# Are there missing values in the dataset?
any_na(heart)
## [1] FALSE
# How many?
n_miss(heart)
## [1] 0
# What is the percentage of missing data in the entire dataset?
prop_miss(heart)
## [1] 0
# How many missing values does each variable have?
heart %>% is.na() %>% colSums()
##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        0        0        0        0        0        0        0        0 
##    exang  oldpeak    slope       ca     thal   target 
##        0        0        0        0        0        0
# What is the number and percentage of missing values grouped by
# each variable?
miss_var_summary(heart)
## # A tibble: 14 × 3
##    variable n_miss pct_miss
##    <chr>     <int>    <dbl>
##  1 age           0        0
##  2 sex           0        0
##  3 cp            0        0
##  4 trestbps      0        0
##  5 chol          0        0
##  6 fbs           0        0
##  7 restecg       0        0
##  8 thalach       0        0
##  9 exang         0        0
## 10 oldpeak       0        0
## 11 slope         0        0
## 12 ca            0        0
## 13 thal          0        0
## 14 target        0        0
# What is the number and percentage of missing values grouped by
# each observation?
miss_case_summary(heart)
## # A tibble: 500 × 3
##     case n_miss pct_miss
##    <int>  <int>    <dbl>
##  1     1      0        0
##  2     2      0        0
##  3     3      0        0
##  4     4      0        0
##  5     5      0        0
##  6     6      0        0
##  7     7      0        0
##  8     8      0        0
##  9     9      0        0
## 10    10      0        0
## # ℹ 490 more rows
# Which variables contain the most missing values?
#gg_miss_var(heart)

# Where are missing values located (the shaded regions in the plot)?
#vis_miss(heart) + theme(axis.text.x = element_text(angle = 80))

# Which combinations of variables are missing together?
#gg_miss_upset(heart)

# Create a heatmap of "missingness" broken down by "target"
# First, confirm that the "target" variable is a categorical variable
is.factor(heart$target)
## [1] TRUE
# Second, create the visualization
#gg_miss_fct(heart, fct = target)

# We can also create a heatmap of "missingness" broken down by "exang"
# First, confirm that the "exang" variable is a categorical variable

is.factor(heart$exang)
## [1] TRUE
# Second, create the visualization
#gg_miss_fct(heart, fct = exang)

STEP 4b. Apply a Scale Data Transform —-

# Scaling Data Transformation Purpose:
# The scaling data transformation is performed to standardize or normalize
# the numeric features in a dataset. It ensures that different features
# with varying scales are brought to a common scale, preventing one feature
# from dominating the others during model training.
#
# Scaling is essential for algorithms that are sensitive to the magnitude
# of input features, such as distance-based methods (e.g., k-Nearest Neighbors)
# or gradient descent-based optimization algorithms (e.g., Support Vector Machines).
#
# By scaling the data, we make the features comparable and contribute equally
# to the model's learning process, improving the stability and performance of
# machine learning models.


# Summary of each variable
summary(heart)
##       age        sex     cp         trestbps          chol       fbs    
##  Min.   :29.00   0:158   0:242   Min.   : 94.0   Min.   :126.0   0:421  
##  1st Qu.:47.00   1:342   1: 80   1st Qu.:120.0   1st Qu.:212.0   1: 79  
##  Median :54.00           2:139   Median :130.0   Median :240.0          
##  Mean   :53.85           3: 39   Mean   :131.4   Mean   :247.9          
##  3rd Qu.:60.00                   3rd Qu.:140.0   3rd Qu.:282.0          
##  Max.   :77.00                   Max.   :200.0   Max.   :564.0          
##  restecg    thalach      exang      oldpeak      slope         ca       thal   
##  0:247   Min.   : 71.0   0:331   Min.   :0.000   0: 37   Min.   :0.00   0:  5  
##  1:244   1st Qu.:132.0   1:169   1st Qu.:0.000   1:232   1st Qu.:0.00   1: 34  
##  2:  9   Median :152.0           Median :0.800   2:231   Median :0.00   2:265  
##          Mean   :148.7           Mean   :1.064           Mean   :0.74   3:196  
##          3rd Qu.:166.0           3rd Qu.:1.600           3rd Qu.:1.00          
##          Max.   :202.0           Max.   :6.200           Max.   :4.00          
##  target   
##  neg:230  
##  pos:270  
##           
##           
##           
## 
# BEFORE

heart_health_variable <- as.numeric( unlist(heart [, 1]))
hist(heart_health_variable , main = names(heart_health_variable )[1])

heart_health_variable  <- as.numeric( unlist(heart [, 4]))
hist(heart_health_variable , main = names(heart)[4])

heart_health_variable  <- as.numeric( unlist(heart [, 5]))
hist(heart_health_variable , main = names(heart)[5])

heart_health_variable  <- as.numeric( unlist(heart [, 8]))
hist(heart_health_variable , main = names(heart)[8])

heart_health_variable <- as.numeric( unlist(heart [, 10]))
hist(heart_health_variable , main = names(heart)[10])

heart_health_variable  <- as.numeric( unlist(heart [, 12]))
hist(heart_health_variable , main = names(heart)[12])

model_of_the_transform <- preProcess(heart, method = c("scale"))
print(model_of_the_transform)
## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - ignored (8)
##   - scaled (6)
heart_scale_transform <- predict(model_of_the_transform,
                                               heart)


# AFTER 
#1, 4, 5, 8, 10, 12

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 1]))
hist(heart_health_variable , main = names(heart_scale_transform)[1])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 4]))
hist(heart_health_variable , main = names(heart_scale_transform)[4])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 5]))
hist(heart_health_variable , main = names(heart_scale_transform)[5])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 8]))
hist(heart_health_variable , main = names(heart_scale_transform)[8])

heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 10]))
hist(heart_health_variable , main = names(heart_scale_transform)[10])

heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 12]))
hist(heart_health_variable , main = names(heart_scale_transform)[12])

View(heart_scale_transform)

STEP 4c. Apply a Center Data Transform —-

# Centering Data Transformation Purpose:
# Shifts numeric features by subtracting their mean,
# ensuring a centered distribution around zero.
# Useful for algorithms sensitive to variable means,
# promoting better model interpretability.



# Summary of each variable
summary(heart)
##       age        sex     cp         trestbps          chol       fbs    
##  Min.   :29.00   0:158   0:242   Min.   : 94.0   Min.   :126.0   0:421  
##  1st Qu.:47.00   1:342   1: 80   1st Qu.:120.0   1st Qu.:212.0   1: 79  
##  Median :54.00           2:139   Median :130.0   Median :240.0          
##  Mean   :53.85           3: 39   Mean   :131.4   Mean   :247.9          
##  3rd Qu.:60.00                   3rd Qu.:140.0   3rd Qu.:282.0          
##  Max.   :77.00                   Max.   :200.0   Max.   :564.0          
##  restecg    thalach      exang      oldpeak      slope         ca       thal   
##  0:247   Min.   : 71.0   0:331   Min.   :0.000   0: 37   Min.   :0.00   0:  5  
##  1:244   1st Qu.:132.0   1:169   1st Qu.:0.000   1:232   1st Qu.:0.00   1: 34  
##  2:  9   Median :152.0           Median :0.800   2:231   Median :0.00   2:265  
##          Mean   :148.7           Mean   :1.064           Mean   :0.74   3:196  
##          3rd Qu.:166.0           3rd Qu.:1.600           3rd Qu.:1.00          
##          Max.   :202.0           Max.   :6.200           Max.   :4.00          
##  target   
##  neg:230  
##  pos:270  
##           
##           
##           
## 
# BEFORE

heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 1]))
boxplot(heart_health_variable , main = names(heart_scale_transform )[1])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 4]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[4])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 5]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[5])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 8]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[8])

heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 10]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[10])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 12]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[12])

model_of_the_transform <- preProcess(heart_scale_transform, method = c("center"))
print(model_of_the_transform)
## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - centered (6)
##   - ignored (8)
heart_center_transform <- predict(model_of_the_transform,
                                  heart_scale_transform)


# AFTER 
#1, 4, 5, 8, 10, 12

heart_scale_transform  <- as.numeric( unlist(heart_center_transform [, 1]))
boxplot(heart_health_variable , main = names(heart_center_transform)[1])

heart_scale_transform <- as.numeric( unlist(heart_center_transform [, 4]))
boxplot(heart_health_variable , main = names(heart_center_transform)[4])

heart_scale_transform  <- as.numeric( unlist(heart_center_transform [, 5]))
boxplot(heart_health_variable , main = names(heart_center_transform)[5])

heart_scale_transform  <- as.numeric( unlist(heart_center_transform [, 8]))
boxplot(heart_health_variable , main = names(heart_center_transform)[8])

heart_scale_transform <- as.numeric( unlist(heart_center_transform [, 10]))
boxplot(heart_health_variable , main = names(heart_center_transform)[10])

heart_scale_transform <- as.numeric( unlist(heart_center_transform [, 12]))
boxplot(heart_health_variable , main = names(heart_center_transform)[12])

View(heart_center_transform)

The Standardize Basic Transform on the heart Dataset —-

# BEFORE
#If you've already scaled and centered your data, you have made it comparable and adjusted for differences in means.
#Standardization is typically done to bring variables to a standard normal distribution (mean of 0 and standard deviation 
#of 1).
summary(heart_center_transform)
##       age           sex     cp         trestbps            chol         fbs    
##  Min.   :-2.75717   0:158   0:242   Min.   :-2.1279   Min.   :-2.4058   0:421  
##  1st Qu.:-0.76019   1:342   1: 80   1st Qu.:-0.6498   1st Qu.:-0.7091   1: 79  
##  Median : 0.01642           2:139   Median :-0.0813   Median :-0.1566          
##  Mean   : 0.00000           3: 39   Mean   : 0.0000   Mean   : 0.0000          
##  3rd Qu.: 0.68208                   3rd Qu.: 0.4872   3rd Qu.: 0.6720          
##  Max.   : 2.56813                   Max.   : 3.8982   Max.   : 6.2356          
##  restecg    thalach        exang      oldpeak        slope         ca         
##  0:247   Min.   :-3.3196   0:331   Min.   :-0.8961   0: 37   Min.   :-0.7009  
##  1:244   1st Qu.:-0.7121   1:169   1st Qu.:-0.8961   1:232   1st Qu.:-0.7009  
##  2:  9   Median : 0.1428           Median :-0.2223   2:231   Median :-0.7009  
##          Mean   : 0.0000           Mean   : 0.0000           Mean   : 0.0000  
##          3rd Qu.: 0.7412           3rd Qu.: 0.4514           3rd Qu.: 0.2463  
##          Max.   : 2.2801           Max.   : 4.3255           Max.   : 3.0878  
##  thal    target   
##  0:  5   neg:230  
##  1: 34   pos:270  
##  2:265            
##  3:196            
##                   
## 
sapply(heart_center_transform[, -c(2, 3, 6, 7, 9, 11, 13, 14)], sd)
##      age trestbps     chol  thalach  oldpeak       ca 
##        1        1        1        1        1        1
model_of_the_transform <- preProcess(heart_center_transform,
                                     method = c("scale", "center"))
print(model_of_the_transform)
## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - centered (6)
##   - ignored (8)
##   - scaled (6)
heart_standardize_transform <- predict(model_of_the_transform, # nolint
                                       heart_center_transform)

# AFTER
summary(heart_standardize_transform)
##       age           sex     cp         trestbps            chol         fbs    
##  Min.   :-2.75717   0:158   0:242   Min.   :-2.1279   Min.   :-2.4058   0:421  
##  1st Qu.:-0.76019   1:342   1: 80   1st Qu.:-0.6498   1st Qu.:-0.7091   1: 79  
##  Median : 0.01642           2:139   Median :-0.0813   Median :-0.1566          
##  Mean   : 0.00000           3: 39   Mean   : 0.0000   Mean   : 0.0000          
##  3rd Qu.: 0.68208                   3rd Qu.: 0.4872   3rd Qu.: 0.6720          
##  Max.   : 2.56813                   Max.   : 3.8982   Max.   : 6.2356          
##  restecg    thalach        exang      oldpeak        slope         ca         
##  0:247   Min.   :-3.3196   0:331   Min.   :-0.8961   0: 37   Min.   :-0.7009  
##  1:244   1st Qu.:-0.7121   1:169   1st Qu.:-0.8961   1:232   1st Qu.:-0.7009  
##  2:  9   Median : 0.1428           Median :-0.2223   2:231   Median :-0.7009  
##          Mean   : 0.0000           Mean   : 0.0000           Mean   : 0.0000  
##          3rd Qu.: 0.7412           3rd Qu.: 0.4514           3rd Qu.: 0.2463  
##          Max.   : 2.2801           Max.   : 4.3255           Max.   : 3.0878  
##  thal    target   
##  0:  5   neg:230  
##  1: 34   pos:270  
##  2:265            
##  3:196            
##                   
## 
sapply(heart_standardize_transform[, -c(2, 3, 6, 7, 9, 11, 13, 14)], sd)
##      age trestbps     chol  thalach  oldpeak       ca 
##        1        1        1        1        1        1

Box-Cox Transform—-

# BEFORE
summary(heart_standardize_transform)
##       age           sex     cp         trestbps            chol         fbs    
##  Min.   :-2.75717   0:158   0:242   Min.   :-2.1279   Min.   :-2.4058   0:421  
##  1st Qu.:-0.76019   1:342   1: 80   1st Qu.:-0.6498   1st Qu.:-0.7091   1: 79  
##  Median : 0.01642           2:139   Median :-0.0813   Median :-0.1566          
##  Mean   : 0.00000           3: 39   Mean   : 0.0000   Mean   : 0.0000          
##  3rd Qu.: 0.68208                   3rd Qu.: 0.4872   3rd Qu.: 0.6720          
##  Max.   : 2.56813                   Max.   : 3.8982   Max.   : 6.2356          
##  restecg    thalach        exang      oldpeak        slope         ca         
##  0:247   Min.   :-3.3196   0:331   Min.   :-0.8961   0: 37   Min.   :-0.7009  
##  1:244   1st Qu.:-0.7121   1:169   1st Qu.:-0.8961   1:232   1st Qu.:-0.7009  
##  2:  9   Median : 0.1428           Median :-0.2223   2:231   Median :-0.7009  
##          Mean   : 0.0000           Mean   : 0.0000           Mean   : 0.0000  
##          3rd Qu.: 0.7412           3rd Qu.: 0.4514           3rd Qu.: 0.2463  
##          Max.   : 2.2801           Max.   : 4.3255           Max.   : 3.0878  
##  thal    target   
##  0:  5   neg:230  
##  1: 34   pos:270  
##  2:265            
##  3:196            
##                   
## 
#Calculate the skewness before the Box-Cox transform
sapply(heart_standardize_transform[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  skewness, type = 2)
##        age   trestbps       chol    thalach    oldpeak         ca 
## -0.1718278  0.7100820  0.8807213 -0.5100814  1.2866519  1.3141907
# **************************************************************************************************************
#  RESULTS
#  Age: The skewness is relatively small (-0.27). You might not need a Box-Cox transformation for age,
#  especially since the skewness is not highly pronounced.
  
#  trestbps (resting blood pressure): The skewness is moderate (0.78). Considering the moderate skewness,
#  you could experiment with a Box-Cox transformation to see if it improves the distribution.
  
#  chol (serum cholesterol): The skewness is high (1.49). A Box-Cox transformation is often beneficial for 
#  skewed variables, so you might consider applying it to cholesterol.
  
#  thalach (maximum heart rate achieved): The skewness is relatively small (-0.56). A Box-Cox transformation
#  might not be necessary for thalach, especially if the skewness is not a significant concern.
  
#  oldpeak (ST depression induced by exercise): The skewness is high (1.21). Given the high skewness, a Box-Cox 
#  transformation could be beneficial for oldpeak.
  
#  ca (number of major vessels colored by fluoroscopy): The skewness is high (1.35). Similar to cholesterol and 
#  oldpeak, a Box-Cox transformation might be considered for ca.
# **************************************************************************************************************

  
#Plot a histogram to view the skewness before the Box-Cox transform

heart_health_variable  <- as.numeric( unlist(heart_standardize_transform [, 1]))
hist(heart_health_variable , main = names(heart_standardize_transform)[1])

heart_health_variable  <- as.numeric( unlist(heart_standardize_transform [, 4]))
hist(heart_health_variable , main = names(heart_standardize_transform)[4])

heart_health_variable  <- as.numeric( unlist(heart_standardize_transform [, 5]))
hist(heart_health_variable , main = names(heart_standardize_transform)[5])

heart_health_variable  <- as.numeric( unlist(heart_standardize_transform [, 8]))
hist(heart_health_variable , main = names(heart_standardize_transform)[8])

heart_health_variable <- as.numeric( unlist(heart_standardize_transform [, 10]))
hist(heart_health_variable , main = names(heart_standardize_transform)[10])

heart_health_variable  <- as.numeric( unlist(heart_standardize_transform [, 12]))
hist(heart_health_variable , main = names(heart_standardize_transform)[12])



# Select the columns of interest
columns_of_interest <- c(12, 10, 5, 4)

# Apply Box-Cox transformation only to selected columns
model_of_the_transform <- preProcess(heart_standardize_transform[, columns_of_interest], method = c("BoxCox"))

# Print the transformation details
print(model_of_the_transform)
## Created from 500 samples and 0 variables
## 
## Pre-processing:
##   - ignored (0)
# Apply the transformation to the selected columns
heart_box_cox_transform <- predict(model_of_the_transform, heart_standardize_transform[, columns_of_interest])




# AFTER 
#1, 4, 5, 8, 10, 12

sapply(heart_box_cox_transform, skewness, type = 2)
##        ca   oldpeak      chol  trestbps 
## 1.3141907 1.2866519 0.8807213 0.7100820
# **************************************************************************************************************

#RESULT
#ca: Skewness is approximately 1.35. A positive skewness indicates that the distribution
#has a longer right tail.

#oldpeak: Skewness is approximately 1.21. Similar to "ca," the positive skewness
#suggests a longer right tail in the distribution.

#chol: Skewness is approximately 1.49. Again, positive skewness indicates a longer 
#right tail.

#trestbps: Skewness is approximately 0.78. The positive skewness suggests a longer 
#right tail, but it's less pronounced compared to the other variables.
# **************************************************************************************************************

heart_health_variable  <- as.numeric( unlist(heart_box_cox_transform[, 1]))
hist(heart_health_variable , main = names(heart_box_cox_transform)[1])

heart_health_variable  <- as.numeric( unlist(heart_box_cox_transform [, 2]))
hist(heart_health_variable , main = names(heart_box_cox_transform)[2])

heart_health_variable <- as.numeric( unlist(heart_box_cox_transform[, 3]))
hist(heart_health_variable , main = names(heart_box_cox_transform)[3])

heart_health_variable <- as.numeric( unlist(heart_box_cox_transform [, 4]))
hist(heart_health_variable , main = names(heart_box_cox_transform)[4])

PCA for Feature Extraction on the Boston Housing Dataset —-

### PCA for Dimensionality Reduction on the heart Dataset ----
# Combine original and transformed columns
heart_combined <- cbind(heart_standardize_transform[, c("age", "thalach")], heart_box_cox_transform)
summary(heart_combined )
##       age              thalach              ca             oldpeak       
##  Min.   :-2.75717   Min.   :-3.3196   Min.   :-0.7009   Min.   :-0.8961  
##  1st Qu.:-0.76019   1st Qu.:-0.7121   1st Qu.:-0.7009   1st Qu.:-0.8961  
##  Median : 0.01642   Median : 0.1428   Median :-0.7009   Median :-0.2223  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.68208   3rd Qu.: 0.7412   3rd Qu.: 0.2463   3rd Qu.: 0.4514  
##  Max.   : 2.56813   Max.   : 2.2801   Max.   : 3.0878   Max.   : 4.3255  
##       chol            trestbps      
##  Min.   :-2.4058   Min.   :-2.1279  
##  1st Qu.:-0.7091   1st Qu.:-0.6498  
##  Median :-0.1566   Median :-0.0813  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6720   3rd Qu.: 0.4872  
##  Max.   : 6.2356   Max.   : 3.8982
model_of_the_transform <- preProcess(heart_combined, method =
                                       c("scale", "center", "pca"))

print(model_of_the_transform)
## Created from 500 samples and 6 variables
## 
## Pre-processing:
##   - centered (6)
##   - ignored (0)
##   - principal component signal extraction (6)
##   - scaled (6)
## 
## PCA needed 6 components to capture 95 percent of the variance
heart_pca_dr <- predict(model_of_the_transform, heart_combined)

summary(heart_pca_dr)
##       PC1                PC2                PC3                PC4           
##  Min.   :-3.41728   Min.   :-2.77244   Min.   :-4.40309   Min.   :-2.480478  
##  1st Qu.:-1.04054   1st Qu.:-0.70782   1st Qu.:-0.71122   1st Qu.:-0.550784  
##  Median :-0.08947   Median :-0.05989   Median :-0.03299   Median :-0.004927  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.99553   3rd Qu.: 0.63505   3rd Qu.: 0.59697   3rd Qu.: 0.522978  
##  Max.   : 3.45820   Max.   : 4.13484   Max.   : 3.00870   Max.   : 2.765638  
##       PC5                PC6          
##  Min.   :-3.25100   Min.   :-1.48242  
##  1st Qu.:-0.45213   1st Qu.:-0.48877  
##  Median :-0.05723   Median :-0.07673  
##  Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.42679   3rd Qu.: 0.47682  
##  Max.   : 2.84971   Max.   : 1.73697
### PCA for Feature Extraction  ----


heart_pca_fe <- princomp(cor(heart_combined))
summary(heart_pca_fe)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4     Comp.5
## Standard deviation     0.7109310 0.3920044 0.3569546 0.3097962 0.22333985
## Proportion of Variance 0.5420891 0.1648153 0.1366601 0.1029361 0.05349931
## Cumulative Proportion  0.5420891 0.7069045 0.8435646 0.9465007 1.00000000
##                              Comp.6
## Standard deviation     5.882502e-09
## Proportion of Variance 3.711419e-17
## Cumulative Proportion  1.000000e+00
# **************************************************************************************************************
#RESULTS
#the PCA results suggest that the first five principal components retain the majority of the information in your dataset. 
#The subsequent components (Comp.6 ) contribute very little or no additional information. I may consider using 
#the first five components for further analysis, as they represent the most important features in terms of explaining the 
#variance in the data. The negligible contribution of Comp.6  might indicate that these components are not providing 
#meaningful information for the analysis.
# **************************************************************************************************************



#### Scree Plot ----
# The cumulative proportion reaches 100% after Comp.5, indicating that the first five components capture 
#all the variance in the original data.
factoextra::fviz_eig(heart_pca_fe, addlabels = TRUE)

heart_pca_fe$loadings[, 1:5]
##               Comp.1     Comp.2      Comp.3      Comp.4     Comp.5
## age       0.48945115  0.3072417  0.08517206  0.42252444  0.5639681
## thalach  -0.66520939 -0.0904431  0.02928681  0.05066650  0.0582243
## ca        0.35863816 -0.4924230 -0.31097901  0.42411265 -0.5191964
## oldpeak   0.40379185 -0.2934719  0.01815823 -0.77322182  0.1206458
## chol      0.08206685  0.7338168 -0.43306383 -0.20130090 -0.4178283
## trestbps  0.13974934  0.1741534  0.84101490  0.02530656 -0.4688808
unique(rownames(heart_pca_fe$loadings[, 1:5]))
## [1] "age"      "thalach"  "ca"       "oldpeak"  "chol"     "trestbps"
factoextra::fviz_cos2(heart_pca_fe, choice = "var", axes = 1:5)

factoextra::fviz_pca_var(heart_pca_fe, col.var = "cos2",
                         gradient.cols = c("red", "orange", "green"),
                         repel = TRUE)

MCA for Feature Extraction —-

# Convert relevant columns to factors
categorical_cols <- c("sex", "cp", "fbs", "restecg", "exang", "slope", "thal", "target")
heart_categorical <- heart
heart_categorical[categorical_cols] <- lapply(heart_categorical[categorical_cols], as.factor)

# Perform Multiple Correspondence Analysis (MCA)
heart_mca <- MCA(heart_categorical[, c(2, 3, 6, 7, 9, 11, 13, 14)])

# Visualize MCA results for individuals
# Summary of the MCA results
summary(heart_mca)
## 
## Call:
## MCA(X = heart_categorical[, c(2, 3, 6, 7, 9, 11, 13, 14)]) 
## 
## 
## Eigenvalues
##                        Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6   Dim.7
## Variance               0.349   0.165   0.155   0.148   0.133   0.128   0.115
## % of var.             19.940   9.440   8.863   8.454   7.596   7.308   6.551
## Cumulative % of var.  19.940  29.379  38.243  46.697  54.293  61.601  68.153
##                        Dim.8   Dim.9  Dim.10  Dim.11  Dim.12  Dim.13  Dim.14
## Variance               0.111   0.097   0.090   0.084   0.074   0.058   0.044
## % of var.              6.370   5.537   5.123   4.774   4.207   3.302   2.535
## Cumulative % of var.  74.523  80.059  85.182  89.956  94.163  97.465 100.000
## 
## Individuals (the 10 first)
##              Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3    ctr   cos2
## 1         |  0.026  0.000  0.001 | -0.137  0.023  0.020 | -0.066  0.006  0.005
## 2         |  0.607  0.211  0.152 | -0.396  0.190  0.065 |  0.460  0.273  0.088
## 3         | -0.266  0.040  0.092 | -0.042  0.002  0.002 |  0.059  0.005  0.005
## 4         |  0.176  0.018  0.029 | -0.331  0.133  0.101 | -0.069  0.006  0.004
## 5         | -0.378  0.082  0.071 | -0.200  0.048  0.020 |  0.824  0.876  0.336
## 6         |  0.841  0.405  0.273 |  0.357  0.154  0.049 |  0.013  0.000  0.000
## 7         |  0.541  0.168  0.272 | -0.527  0.336  0.258 |  0.115  0.017  0.012
## 8         |  0.176  0.018  0.029 | -0.331  0.133  0.101 | -0.069  0.006  0.004
## 9         |  0.308  0.054  0.117 | -0.035  0.001  0.002 | -0.223  0.064  0.062
## 10        | -0.563  0.182  0.330 | -0.006  0.000  0.000 |  0.000  0.000  0.000
##            
## 1         |
## 2         |
## 3         |
## 4         |
## 5         |
## 6         |
## 7         |
## 8         |
## 9         |
## 10        |
## 
## Categories (the 10 first)
##               Dim.1     ctr    cos2  v.test     Dim.2     ctr    cos2  v.test  
## sex_0     |  -0.649   4.773   0.195  -9.859 |   0.393   3.687   0.071   5.962 |
## sex_1     |   0.300   2.205   0.195   9.859 |  -0.181   1.704   0.071  -5.962 |
## cp_0      |   0.720   8.980   0.486  15.570 |  -0.056   0.115   0.003  -1.211 |
## cp_1      |  -0.967   5.365   0.178  -9.432 |  -0.012   0.002   0.000  -0.116 |
## cp_2      |  -0.688   4.708   0.182  -9.531 |   0.060   0.076   0.001   0.833 |
## cp_3      |  -0.030   0.003   0.000  -0.198 |   0.158   0.147   0.002   1.024 |
## fbs_0     |  -0.059   0.105   0.019  -3.044 |  -0.083   0.435   0.036  -4.260 |
## fbs_1     |   0.315   0.560   0.019   3.044 |   0.440   2.317   0.036   4.260 |
## restecg_0 |   0.278   1.365   0.075   6.130 |   0.120   0.534   0.014   2.638 |
## restecg_1 |  -0.317   1.752   0.096  -6.904 |  -0.304   3.407   0.088  -6.624 |
##             Dim.3     ctr    cos2  v.test  
## sex_0      -0.844  18.127   0.329 -12.810 |
## sex_1       0.390   8.374   0.329  12.810 |
## cp_0       -0.266   2.762   0.066  -5.757 |
## cp_1        0.804   8.325   0.123   7.834 |
## cp_2       -0.452   4.567   0.079  -6.259 |
## cp_3        1.612  16.337   0.220  10.475 |
## fbs_0      -0.106   0.758   0.060  -5.451 |
## fbs_1       0.563   4.040   0.060   5.451 |
## restecg_0  -0.129   0.662   0.016  -2.846 |
## restecg_1   0.162   1.031   0.025   3.531 |
## 
## Categorical variables (eta2)
##             Dim.1 Dim.2 Dim.3  
## sex       | 0.195 0.071 0.329 |
## cp        | 0.532 0.004 0.397 |
## fbs       | 0.019 0.036 0.060 |
## restecg   | 0.104 0.494 0.034 |
## exang     | 0.425 0.021 0.035 |
## slope     | 0.331 0.257 0.256 |
## thal      | 0.512 0.429 0.128 |
## target    | 0.674 0.009 0.003 |
heart_numerical_combined_with_factors <- cbind( heart_combined, heart[, c(2, 3, 6, 7, 9, 11, 13, 14)])
summary(heart_numerical_combined_with_factors)
##       age              thalach              ca             oldpeak       
##  Min.   :-2.75717   Min.   :-3.3196   Min.   :-0.7009   Min.   :-0.8961  
##  1st Qu.:-0.76019   1st Qu.:-0.7121   1st Qu.:-0.7009   1st Qu.:-0.8961  
##  Median : 0.01642   Median : 0.1428   Median :-0.7009   Median :-0.2223  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.68208   3rd Qu.: 0.7412   3rd Qu.: 0.2463   3rd Qu.: 0.4514  
##  Max.   : 2.56813   Max.   : 2.2801   Max.   : 3.0878   Max.   : 4.3255  
##       chol            trestbps       sex     cp      fbs     restecg exang  
##  Min.   :-2.4058   Min.   :-2.1279   0:158   0:242   0:421   0:247   0:331  
##  1st Qu.:-0.7091   1st Qu.:-0.6498   1:342   1: 80   1: 79   1:244   1:169  
##  Median :-0.1566   Median :-0.0813           2:139           2:  9          
##  Mean   : 0.0000   Mean   : 0.0000           3: 39                          
##  3rd Qu.: 0.6720   3rd Qu.: 0.4872                                          
##  Max.   : 6.2356   Max.   : 3.8982                                          
##  slope   thal    target   
##  0: 37   0:  5   neg:230  
##  1:232   1: 34   pos:270  
##  2:231   2:265            
##          3:196            
##                           
## 

ICA for Dimensionality Reduction on the Boston Housing Dataset —-

summary(heart_numerical_combined_with_factors )
##       age              thalach              ca             oldpeak       
##  Min.   :-2.75717   Min.   :-3.3196   Min.   :-0.7009   Min.   :-0.8961  
##  1st Qu.:-0.76019   1st Qu.:-0.7121   1st Qu.:-0.7009   1st Qu.:-0.8961  
##  Median : 0.01642   Median : 0.1428   Median :-0.7009   Median :-0.2223  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.68208   3rd Qu.: 0.7412   3rd Qu.: 0.2463   3rd Qu.: 0.4514  
##  Max.   : 2.56813   Max.   : 2.2801   Max.   : 3.0878   Max.   : 4.3255  
##       chol            trestbps       sex     cp      fbs     restecg exang  
##  Min.   :-2.4058   Min.   :-2.1279   0:158   0:242   0:421   0:247   0:331  
##  1st Qu.:-0.7091   1st Qu.:-0.6498   1:342   1: 80   1: 79   1:244   1:169  
##  Median :-0.1566   Median :-0.0813           2:139           2:  9          
##  Mean   : 0.0000   Mean   : 0.0000           3: 39                          
##  3rd Qu.: 0.6720   3rd Qu.: 0.4872                                          
##  Max.   : 6.2356   Max.   : 3.8982                                          
##  slope   thal    target   
##  0: 37   0:  5   neg:230  
##  1:232   1: 34   pos:270  
##  2:231   2:265            
##          3:196            
##                           
## 
model_of_the_transform <- preProcess(heart_numerical_combined_with_factors ,
                                     method = c("scale", "center", "ica"),
                                     n.comp = 6)
print(model_of_the_transform)
## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - centered (6)
##   - independent component signal extraction (6)
##   - ignored (8)
##   - scaled (6)
## 
## ICA used 6 components
heart_numerical_combined_with_factors_ica_dr <- predict(model_of_the_transform, heart_numerical_combined_with_factors )

summary(heart_numerical_combined_with_factors_ica_dr)
##  sex     cp      fbs     restecg exang   slope   thal    target   
##  0:158   0:242   0:421   0:247   0:331   0: 37   0:  5   neg:230  
##  1:342   1: 80   1: 79   1:244   1:169   1:232   1: 34   pos:270  
##          2:139           2:  9           2:231   2:265            
##          3: 39                                   3:196            
##                                                                   
##                                                                   
##       ICA1               ICA2              ICA3               ICA4        
##  Min.   :-6.37988   Min.   :-2.4012   Min.   :-4.20026   Min.   :-2.2059  
##  1st Qu.:-0.59951   1st Qu.:-0.7661   1st Qu.:-0.62554   1st Qu.:-0.5649  
##  Median : 0.08156   Median : 0.0411   Median : 0.06527   Median :-0.1122  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.63948   3rd Qu.: 0.7562   3rd Qu.: 0.59384   3rd Qu.: 0.4460  
##  Max.   : 2.80818   Max.   : 3.0843   Max.   : 2.38632   Max.   : 3.7516  
##       ICA5              ICA6        
##  Min.   :-4.1307   Min.   :-2.0626  
##  1st Qu.:-0.4323   1st Qu.:-0.7113  
##  Median : 0.1016   Median :-0.2247  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.5648   3rd Qu.: 0.7633  
##  Max.   : 2.7263   Max.   : 2.5939
# ************************************************************************************************************************************

#RESULTS
# Summary of the Transformed Dataset after Independent Component Analysis (ICA)

# Categorical Variables:
# - sex, cp, fbs, restecg, exang, slope, thal, target are categorical variables with counts and percentages for each level.

# Independent Component Analysis (ICA) Components:
# - ICA1 to ICA6 represent the Independent Component Analysis components.
# - For each component, descriptive statistics are provided:
#   - Minimum, 1st Quartile, Median, Mean, 3rd Quartile, Maximum.

# Example for ICA1:
#   ICA1: Min. -4.3196, 1st Qu. -0.3524, Median 0.1066, Mean 0.0000, 3rd Qu. 0.5823, Max. 2.4614.

# These statistics offer insights into the distribution and characteristics of the transformed variables after applying ICA.
# ************************************************************************************************************************************

Milestone 3: Training the Model (Lab 5-8) —-

1. Split the dataset ====

heart_preprocessed <- heart_numerical_combined_with_factors

str(heart_preprocessed)
## 'data.frame':    500 obs. of  14 variables:
##  $ age     : num  -0.316 -1.759 0.571 -0.538 -0.871 ...
##  $ thalach : num  1.596 1.425 1.425 -0.413 0.314 ...
##  $ ca      : num  -0.701 -0.701 -0.701 2.141 -0.701 ...
##  $ oldpeak : num  -0.896 2.304 -0.896 0.788 -0.896 ...
##  $ chol    : num  0.258 -0.334 0.455 -1.183 -1.005 ...
##  $ trestbps: num  0.487 -0.65 0.374 -0.65 -1.73 ...
##  $ sex     : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ cp      : Factor w/ 4 levels "0","1","2","3": 1 4 1 3 2 1 1 3 1 3 ...
##  $ fbs     : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
##  $ restecg : Factor w/ 3 levels "0","1","2": 1 2 1 2 2 2 2 2 1 1 ...
##  $ exang   : Factor w/ 2 levels "0","1": 2 2 1 1 1 2 2 1 1 1 ...
##  $ slope   : Factor w/ 3 levels "0","1","2": 3 2 3 2 3 2 3 2 2 3 ...
##  $ thal    : Factor w/ 4 levels "0","1","2","3": 3 4 3 4 4 2 4 4 3 3 ...
##  $ target  : Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 1 1 1 2 ...
# Define a 75:25 train:test data split of the dataset.
# That is, 75% of the original data will be used to train the model and
# 25% of the original data will be used to test the model.
train_index <- createDataPartition(heart_preprocessed$target,
                                   p = 0.75,
                                   list = FALSE)
heart_preprocessed_train <- heart_preprocessed[train_index, ]
heart_preprocessed_test <- heart_preprocessed[-train_index, ]

2. Train a Naive Bayes classifier using the training dataset —-

# Train a Naive Bayes model using the caret package.
heart_model_nb_caret <- # nolint
  caret::train(target ~ ., data =
                 heart_preprocessed_train[, c("age", "sex", "cp", "trestbps" , "chol" , "fbs" ,
                           "restecg" , "thalach" , "exang",
                           "oldpeak", "slope", "ca",  "thal", "target")],
               method = "naive_bayes")

3. Test the trained model using the testing dataset —-

### 3.b. Test the trained caret Naive Bayes model using the testing dataset ----
predictions_nb_caret <-
  predict(heart_model_nb_caret,
          heart_preprocessed_test[, c("age", "sex", "cp", "trestbps" , "chol" , "fbs" ,
                         "restecg" , "thalach" , "exang",
                         "oldpeak", "slope", "ca",  "thal")])

Determine the Baseline Accuracy —-

heart_freq <- heart_preprocessed_train$target
cbind(frequency =
        table(heart_freq),
      percentage = prop.table(table(heart_freq)) * 100)
##     frequency percentage
## neg       173   46.01064
## pos       203   53.98936

Model Training and Evaluation using Generalized Linear Model (GLM)

# The glm function in this code is used to train a Generalized Linear Model
# for predicting the target variable based on the provided features in the training dataset.
# The subsequent printing of model details and performance metrics allows for an assessment of its effectiveness.
# Set up train control with classProbs = TRUE
train_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3, classProbs = TRUE)


set.seed(7)
heart_model_glm <-
  train(target ~ ., data = heart_preprocessed_train, method = "glm",
        metric = "Accuracy", trControl = train_control)


print(heart_model_glm)
## Generalized Linear Model 
## 
## 376 samples
##  13 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 339, 337, 338, 338, 338, 339, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8354263  0.6670537

LDA —-

# Set the seed for reproducibility
set.seed(7)

# Train a Linear Discriminant Analysis (LDA) model using the specified training control
heart_model_lda <- train(target ~ ., data = heart_preprocessed_train,
                         method = "lda", trControl = train_control)

CART —-

# Set the seed for reproducibility
set.seed(7)

# Train a Classification and Regression Trees (CART) model using the specified training control
heart_model_cart <- train(target ~ ., data = heart_preprocessed_train,
                          method = "rpart", trControl = train_control)

KNN —-

# Set the seed for reproducibility
set.seed(7)

# Train a k-Nearest Neighbors (knn) model using the specified training control
heart_model_knn <- train(target ~ ., data = heart_preprocessed_train,
                         method = "knn", trControl = train_control)

Random Forest —-

# Set the seed for reproducibility
set.seed(7)

# Train a Random Forest (rf) model using the specified training control
heart_model_rf <- train(target ~ ., data = heart_preprocessed_train,
                        method = "rf", trControl = train_control)

Call the resamples Function —-

# We then create a list of the model results and pass the list as an argument
# to the `resamples` function.

results <- resamples(list(LDA = heart_model_lda, CART = heart_model_cart,
                          KNN = heart_model_knn, GLM = heart_model_glm,
                          RF = heart_model_rf))

Add ROC curves for each model to the plot

# Install and load the pROC package 
# install.packages("pROC")
library(pROC)
## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Create an empty plot
plot(0, 0, type = "n", main = "ROC Curve", xlab = "False Positive Rate", ylab = "True Positive Rate")

predictions_lda <- predict(heart_model_lda, heart_preprocessed_test[, 1:13],type = "prob")
predictions_cart <- predict(heart_model_cart, heart_preprocessed_test[, 1:13],type = "prob")
predictions_knn <- predict(heart_model_knn, heart_preprocessed_test[, 1:13],type = "prob")
predictions_glm <- predict(heart_model_glm, heart_preprocessed_test[, 1:13],type = "prob")
predictions_rf <- predict(heart_model_rf, heart_preprocessed_test[, 1:13],type = "prob")



# Add ROC curves for each model to the plot
roc_curve_lda <- roc(heart_preprocessed_test$target, predictions_lda$neg)
## Setting levels: control = neg, case = pos

## Setting direction: controls > cases
roc_curve_cart <- roc(heart_preprocessed_test$target, predictions_cart$neg)
## Setting levels: control = neg, case = pos
## Setting direction: controls > cases
roc_curve_knn <- roc(heart_preprocessed_test$target, predictions_knn$neg)
## Setting levels: control = neg, case = pos
## Setting direction: controls > cases
roc_curve_glm <- roc(heart_preprocessed_test$target, predictions_glm$neg)
## Setting levels: control = neg, case = pos
## Setting direction: controls > cases
roc_curve_rf <- roc(heart_preprocessed_test$target, predictions_rf$neg)
## Setting levels: control = neg, case = pos
## Setting direction: controls > cases
# Plot each ROC curve with a different color
plot(roc_curve_lda, main = "ROC curve for LDA", print.auc = TRUE,
      print.auc.x = 0.6, print.auc.y = 0.6, col = "red", lwd = 2.5)

plot(roc_curve_cart, main = "ROC curve for CART", print.auc = TRUE,
      print.auc.x = 0.6, print.auc.y = 0.6, col = "blue", lwd = 2.5)

plot(roc_curve_knn, main = "ROC curve for KNN", print.auc = TRUE,
      print.auc.x = 0.6, print.auc.y = 0.6, col = "green", lwd = 2.5)

plot(roc_curve_glm,main = "ROC curve for GLM", print.auc = TRUE,
     print.auc.x = 0.6, print.auc.y = 0.6, col = "purple")

plot(roc_curve_rf, main = "ROC curve for RF", print.auc = TRUE,
      print.auc.x = 0.6, print.auc.y = 0.6, col = "orange", lwd = 2.5)

Display the Results —-

## 1. Table Summary ----
# This is the simplest comparison. It creates a table with one model per row
# and its corresponding evaluation metrics displayed per column.

summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: LDA, CART, KNN, GLM, RF 
## Number of resamples: 30 
## 
## Accuracy 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## LDA  0.7027027 0.7683144 0.8399716 0.8388676 0.9189189 0.9729730    0
## CART 0.6486486 0.7046586 0.7368421 0.7439764 0.7837838 0.8684211    0
## KNN  0.6756757 0.7567568 0.7894737 0.8078291 0.8860242 0.9729730    0
## GLM  0.7027027 0.7631579 0.8421053 0.8354263 0.8940256 0.9729730    0
## RF   0.7631579 0.8918919 0.9220648 0.9212137 0.9729730 1.0000000    0
## 
## Kappa 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## LDA  0.3879699 0.5319792 0.6705086 0.6721683 0.8345753 0.9458272    0
## CART 0.2701062 0.4168520 0.4765681 0.4883384 0.5731249 0.7382920    0
## KNN  0.3412463 0.5124451 0.5765885 0.6137426 0.7691556 0.9453471    0
## GLM  0.3934426 0.5236769 0.6788630 0.6670537 0.7886056 0.9458272    0
## RF   0.5289256 0.7822645 0.8441422 0.8414560 0.9453471 1.0000000    0
## 2. Box and Whisker Plot ----
# This is useful for visually observing the spread of the estimated accuracies
# for different algorithms and how they relate.

scales <- list(x = list(relation = "free"), y = list(relation = "free"))
bwplot(results, scales = scales)

## 3. Dot Plots ----
# They show both the mean estimated accuracy as well as the 95% confidence
# interval (e.g. the range in which 95% of observed scores fell).

scales <- list(x = list(relation = "free"), y = list(relation = "free"))
dotplot(results, scales = scales)

## 4. Scatter Plot Matrix ----
# This is useful when considering whether the predictions from two
# different algorithms are correlated. If weakly correlated, then they are good
# candidates for being combined in an ensemble prediction.

splom(results)

## 5. Pairwise xyPlots ----
# You can zoom in on one pairwise comparison of the accuracy of trial-folds for
# two models using an xyplot.

# xyplot plots to compare models
xyplot(results, models = c("LDA", "RF"))

# or
# xyplot plots to compare models
xyplot(results, models = c("GLM", "CART"))

Statistical Significance Tests —-

# Explanation:
# - Upper diagonal: Estimates of the difference in accuracy and kappa values between models.
# - Lower diagonal: P-values for testing the hypothesis that the difference is zero.

# Accuracy:
# - Positive values indicate higher accuracy in the row model compared to the column model.
# - Negative values indicate lower accuracy.

# Kappa:
# - Positive values indicate higher kappa values in the row model compared to the column model.
# - Negative values indicate lower kappa values.

# P-values:
# - Assess the statistical significance of the observed differences.
# - Values below the significance level (e.g., 0.05) suggest a significant difference.
# - Bonferroni adjustment is applied for multiple comparisons.
diffs <- diff(results)

summary(diffs)
## 
## Call:
## summary.diff.resamples(object = diffs)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## Accuracy 
##      LDA       CART      KNN       GLM       RF       
## LDA             0.094891  0.031039  0.003441 -0.082346
## CART 4.763e-07           -0.063853 -0.091450 -0.177237
## KNN  0.03721   5.723e-05           -0.027597 -0.113385
## GLM  1.00000   7.732e-07 0.12566             -0.085787
## RF   1.874e-06 9.436e-16 5.211e-09 3.641e-06          
## 
## Kappa 
##      LDA       CART      KNN       GLM       RF       
## LDA             0.183830  0.058426  0.005115 -0.169288
## CART 1.473e-06           -0.125404 -0.178715 -0.353118
## KNN  0.06184   7.865e-05           -0.053311 -0.227713
## GLM  1.00000   1.744e-06 0.16237             -0.174402
## RF   1.726e-06 1.033e-15 5.161e-09 3.456e-06

Apply a “Random Search” to identify the best parameter value —-

# Define train control settings for a random search
train_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3, search = "random")

# Set a seed for reproducibility
set.seed(7)

# Perform a random search for Random Forest hyperparameters
heart_model_random_search_rf <- train(target ~ ., data = heart_preprocessed_train, method = "rf",
                                      metric = "Accuracy",
                                      tuneLength = 12,  # Search 12 options for the value of mtry
                                      trControl = train_control)

# Display the details of the random search model
print(heart_model_random_search_rf)
## Random Forest 
## 
## 376 samples
##  13 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 339, 337, 338, 338, 338, 339, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9186521  0.8362982
##    3    0.9185347  0.8361989
##    7    0.9203365  0.8396493
##    8    0.9194818  0.8381586
##   12    0.9229918  0.8450439
##   15    0.9212374  0.8414960
##   18    0.9159032  0.8308722
##   19    0.9168041  0.8327416
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 12.
# Plot the results of the random search
plot(heart_model_random_search_rf)

Bagged CART —-

## 2.a. Bagged CART ----
set.seed(7)
heart_model_bagged_cart <- train(target ~ ., data = heart_preprocessed_train, method = "treebag",
                               metric = "Accuracy",
                               trControl = train_control)

## 2.b. Random Forest ----
set.seed(7)
heart_model_rf <- train(target ~ ., data = heart_preprocessed_train, method = "rf",
                        metric = "Accuracy", trControl = train_control)


# Summarize results
bagging_results <-
  resamples(list("Bagged Decision Tree" = heart_model_bagged_cart,
                 "Random Forest" = heart_model_rf))

summary(bagging_results)
## 
## Call:
## summary.resamples(object = bagging_results)
## 
## Models: Bagged Decision Tree, Random Forest 
## Number of resamples: 30 
## 
## Accuracy 
##                           Min.   1st Qu.    Median      Mean   3rd Qu. Max.
## Bagged Decision Tree 0.7894737 0.8918919 0.9199858 0.9124394 0.9402287    1
## Random Forest        0.7631579 0.8918919 0.9220648 0.9177274 0.9473684    1
##                      NA's
## Bagged Decision Tree    0
## Random Forest           0
## 
## Kappa 
##                           Min.   1st Qu.    Median      Mean   3rd Qu. Max.
## Bagged Decision Tree 0.5789474 0.7828288 0.8388970 0.8242421 0.8801471    1
## Random Forest        0.5263158 0.7822645 0.8441422 0.8345238 0.8947368    1
##                      NA's
## Bagged Decision Tree    0
## Random Forest           0
dotplot(bagging_results)

# We then print the details of the model that has been created
print(heart_model_rf$finalModel)
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 7
## 
##         OOB estimate of  error rate: 6.91%
## Confusion matrix:
##     neg pos class.error
## neg 159  14  0.08092486
## pos  12 191  0.05911330

Test the Model —-

# We can test the model
set.seed(9)
predictions <- predict(heart_model_rf, newdata = heart_preprocessed_test)
confusionMatrix(predictions, heart_preprocessed_test$target)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg  53   2
##        pos   4  65
##                                          
##                Accuracy : 0.9516         
##                  95% CI : (0.8977, 0.982)
##     No Information Rate : 0.5403         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9023         
##                                          
##  Mcnemar's Test P-Value : 0.6831         
##                                          
##             Sensitivity : 0.9298         
##             Specificity : 0.9701         
##          Pos Pred Value : 0.9636         
##          Neg Pred Value : 0.9420         
##              Prevalence : 0.4597         
##          Detection Rate : 0.4274         
##    Detection Prevalence : 0.4435         
##       Balanced Accuracy : 0.9500         
##                                          
##        'Positive' Class : neg            
## 

Save and Load your Model —-

# Saving a model into a file allows you to load it later and use it to make
# predictions. Saved models can be loaded by calling the `readRDS()` function

saveRDS(heart_model_rf, "../models/saved_heart_model_rf.rds")
# The saved model can then be loaded later as follows:
loaded_heart_model_rf <- readRDS("../models/saved_heart_model_rf.rds")
print(loaded_heart_model_rf)
## Random Forest 
## 
## 376 samples
##  13 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 339, 337, 338, 338, 338, 339, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9168278  0.8324670
##    7    0.9177274  0.8345238
##   15    0.9158794  0.8309357
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.
predictions_with_loaded_model <-
  predict(loaded_heart_model_rf, newdata = heart_preprocessed_test)
confusionMatrix(predictions_with_loaded_model, heart_preprocessed_test$target)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg  53   2
##        pos   4  65
##                                          
##                Accuracy : 0.9516         
##                  95% CI : (0.8977, 0.982)
##     No Information Rate : 0.5403         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9023         
##                                          
##  Mcnemar's Test P-Value : 0.6831         
##                                          
##             Sensitivity : 0.9298         
##             Specificity : 0.9701         
##          Pos Pred Value : 0.9636         
##          Neg Pred Value : 0.9420         
##              Prevalence : 0.4597         
##          Detection Rate : 0.4274         
##    Detection Prevalence : 0.4435         
##       Balanced Accuracy : 0.9500         
##                                          
##        'Positive' Class : neg            
## 

Creating Functions in R —-

# Plumber requires functions, an example of the syntax for creating a function
# in R is:

name_of_function <- function(arg) {
  # Do something with the argument called `arg`
}

Make Predictions on New Data using the Saved Model —-

# We can also create and use our own data frame as follows:
# Convert categorical variables to factors


to_be_predicted <-
  data.frame(age=70, sex=1, cp=0, trestbps=145, chol=174,
             fbs=0, restecg=1, thalach=125, exang=1, oldpeak=2.6,
             slope=0,
             ca=0, thal=3)


to_be_predicted$sex <- as.factor(to_be_predicted$sex)
to_be_predicted$cp <- as.factor(to_be_predicted$cp)
to_be_predicted$fbs <- as.factor(to_be_predicted$fbs)
to_be_predicted$restecg <- as.factor(to_be_predicted$restecg)
to_be_predicted$exang <- as.factor(to_be_predicted$exang)
to_be_predicted$slope <- as.factor(to_be_predicted$slope)
to_be_predicted$thal <- as.factor(to_be_predicted$thal)
# We then use the data frame to make predictions
predict(loaded_heart_model_rf, newdata = to_be_predicted)
## [1] neg
## Levels: neg pos

STEP 8. Make predictions using the model through a function —-

# An alternative is to create a function and then use the function to make
# predictions

predict_target <- function(arg_age, arg_sex, arg_cp, arg_trestbps, arg_chol,
                           arg_fbs, arg_restecg, arg_thalach, arg_exang, arg_oldpeak, arg_slope,
                           arg_ca, arg_thal) {
  # Convert categorical variables to factors
  arg_sex <- as.factor(arg_sex)
  arg_cp <- as.factor(arg_cp)
  arg_fbs <- as.factor(arg_fbs)
  arg_restecg <- as.factor(arg_restecg)
  arg_exang <- as.factor(arg_exang)
  arg_slope <- as.factor(arg_slope)
  arg_thal <- as.factor(arg_thal)
  
  # Create a data frame using the arguments
  to_be_predicted <- data.frame(age = arg_age, sex = arg_sex, cp = arg_cp, trestbps = arg_trestbps,
                                chol = arg_chol, fbs = arg_fbs, restecg = arg_restecg,
                                thalach = arg_thalach, exang = arg_exang, oldpeak = arg_oldpeak,
                                slope = arg_slope, ca = arg_ca, thal = arg_thal)
  
  # Make a prediction based on the data frame
  predict(loaded_heart_model_rf, newdata = to_be_predicted)
}

Plumber API

# STEP 1. Install and Load the Required Packages ----
## plumber ----
if (require("plumber")) {
  require("plumber")
} else {
  install.packages("plumber", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: plumber
## caret ----
if (require("caret")) {
  require("caret")
} else {
  install.packages("caret", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}


###LAB 12
# This requires the "plumber" package that was installed and loaded earlier in
# STEP 1. The commenting below makes R recognize the code as the definition of
# an API, i.e., #* comments.

loaded_heart_model_rf <- readRDS("../models/saved_heart_model_rf.rds")

#* @apiTitle CVDetect Prediction Model API

#* @apiDescription Used to predict whether a patient has heart disease or not.

#* @param arg_age Age of the patient
#* @param arg_sex Gender of the patient (0: Female, 1: Male)
#* @param arg_cp Chest pain type (0-3)
#* @param arg_trestbps Resting blood pressure (mm Hg)
#* @param arg_chol Serum cholesterol (mg/dl)
#* @param arg_fbs Fasting blood sugar (> 120 mg/dl, 0: No, 1: Yes)
#* @param arg_restecg Resting electrocardiographic results (0-2)
#* @param arg_thalach Maximum heart rate achieved
#* @param arg_exang Exercise-induced angina (0: No, 1: Yes)
#* @param arg_oldpeak ST depression induced by exercise relative to rest
#* @param arg_slope Slope of the peak exercise ST segment (0-2)
#* @param arg_ca Number of major vessels colored by fluoroscopy (0-3)
#* @param arg_thal Thallium stress test result (3: Normal, 6: Fixed defect, 7: Reversible defect)


#* @get /target

predict_diabetes <-
  function(arg_age, arg_sex, arg_cp, arg_trestbps, arg_chol,
           arg_fbs, arg_restecg, arg_thalach, arg_exang, arg_oldpeak, arg_slope,
           arg_ca, arg_thal) {
    # Create a data frame using the arguments
    to_be_predicted <- data.frame(
      age = as.numeric(arg_age),
      sex = as.factor(arg_sex),
      cp = as.factor(arg_cp),
      trestbps = as.numeric(arg_trestbps),
      chol = as.numeric(arg_chol),
      fbs = as.factor(arg_fbs),
      restecg = as.factor(arg_restecg),
      thalach = as.numeric(arg_thalach),
      exang = as.factor(arg_exang),
      oldpeak = as.numeric(arg_oldpeak),
      slope = as.factor(arg_slope),
      ca = as.numeric(arg_ca),
      thal = as.factor(arg_thal)
    )
    
    # Make a prediction based on the data frame
    predict(loaded_heart_model_rf, newdata = to_be_predicted)
  }

#Run Plumber

# STEP 1. Install and load the required packages ----
## plumber ----
if (require("plumber")) {
  require("plumber")
} else {
  install.packages("plumber", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}

# STEP 2. Process a Plumber API ----
# This allows us to process a plumber API
#api <- plumber::plumb("lab12.R")

# STEP 3. Run the API on a specific port ----
# Specify a constant localhost port to use
#api$run(host = "127.0.0.1", port = 5022)

PHP output

# Heart Disease Prediction Web Form


cat('<iframe src="http://localhost/lab15/Lab15-submission-ConsumePlumberAPIOutput.php" width="800" height="600"></iframe>')