Heart Disease Prediction Report

Myriam Aicha Mbongo 10/16/2023

CARDIOVASCULAR DISEASE PREDICTION PROJECT

Student Details

 134141 - Aicha Mbongo | |

Course Code Course Name Program Semester Duration
BBT4206 Business Intelligence II Bachelor of Business Information Technology 21st August 2023 to 28th November 2023

STEP 1. Install and Load the Required Packages —-

# STEP 1. Install and Load the Required Packages ----
# The following packages should be installed and loaded before proceeding to the
# subsequent steps.

## readr ----
if (require("readr")) {
  require("readr")
} else {
  install.packages("readr", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: readr
## caret ----
if (require("caret")) {
  require("caret")
} else {
  install.packages("caret", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: caret

## Loading required package: ggplot2

## Loading required package: lattice
## e1071 ----
if (require("e1071")) {
  require("e1071")
} else {
  install.packages("e1071", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: e1071
## factoextra ----
if (require("factoextra")) {
  require("factoextra")
} else {
  install.packages("factoextra", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: factoextra

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## FactoMineR ----
if (require("FactoMineR")) {
  require("FactoMineR")
} else {
  install.packages("FactoMineR", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: FactoMineR
if (!is.element("NHANES", installed.packages()[, 1])) {
  install.packages("NHANES", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("NHANES")
## Loading required package: NHANES
## dplyr ----
if (!is.element("dplyr", installed.packages()[, 1])) {
  install.packages("dplyr", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("dplyr")
## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## naniar ----
# Documentation:
#   https://cran.r-project.org/package=naniar or
#   https://www.rdocumentation.org/packages/naniar/versions/1.0.0
if (!is.element("naniar", installed.packages()[, 1])) {
  install.packages("naniar", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("naniar")
## Loading required package: naniar
## ggplot2 ----
# We require the "ggplot2" package to create more appealing visualizations
if (!is.element("ggplot2", installed.packages()[, 1])) {
  install.packages("ggplot2", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("ggplot2")

## MICE ----
# We use the MICE package to perform data imputation
if (!is.element("mice", installed.packages()[, 1])) {
  install.packages("mice", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("mice")
## Loading required package: mice

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind
## Amelia ----
if (!is.element("Amelia", installed.packages()[, 1])) {
  install.packages("Amelia", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("Amelia")
## Loading required package: Amelia

## Loading required package: Rcpp

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

Milestone 1 : Exploratory Data Analysis (Lab 1 and 2)

STEP 3. Load the downloaded sample datasets

library(readr)
heart <- read_csv(
  "../data/heart.csv",
  col_types = cols(
    age = col_double(),
    sex = col_factor(levels = c("0", "1")),
    cp = col_factor(levels = c("0", "1", "2", "3")),
    trestbps = col_double(),
    chol = col_double(),
    fbs = col_factor(levels = c("0", "1")),
    restecg = col_factor(levels = c("0", "1", "2")),
    thalach = col_double(),
    exang = col_factor(levels = c("0", "1")),
    oldpeak = col_double(),
    slope = col_factor(levels = c("0", "1", "2")),
    ca = col_double(),
    thal = col_factor(levels = c("0", "1", "2", "3")),
    target = col_factor(levels = c("neg", "pos"))
  )
)

#View(heart)

STEP 3a. Preview the Loaded Datasets, Identify the Data Types —-

### STEP 3a. Pre#View the Loaded Datasets, Identify the Data Types  ----
# Dimensions refer to the number of observations (rows) and the number of
# attributes/variables/features (columns).
#Understanding data types is key for effective analysis.It helps choose suitable visualizations and algorithms,  
#and highlights the need for conversions between categorical and numerical data when necessary.


dim(heart)
## [1] 1025   14
sapply(heart, class)
##       age       sex        cp  trestbps      chol       fbs   restecg   thalach 
## "numeric"  "factor"  "factor" "numeric" "numeric"  "factor"  "factor" "numeric" 
##     exang   oldpeak     slope        ca      thal    target 
##  "factor" "numeric"  "factor" "numeric"  "factor"  "factor"

STEP 3b. Identify the number of instances that belong to each class. —-

# It is more sensible to count categorical variables (factors or dimensions)
# than numeric variables, e.g., counting the number of male and female
# participants instead of counting the frequency of each participant’s height.

heart_freq <- heart$target
cbind(frequency = table(heart_freq),
      percentage = prop.table(table(heart_freq)) * 100)
##     frequency percentage
## neg       499   48.68293
## pos       526   51.31707

STEP 3c. Measures of Central Tendency(Calculate the mode ) —-

# We, therefore, must manually create a function that can calculate the mode.

heart_target_mode <- names(table(heart$target))[
  which(table(heart$target) == max(table(heart$target)))
]
print(heart_target_mode)
## [1] "pos"

STEP 3d. Measure the distribution of the data for each variable —-

summary(heart)
##       age        sex     cp         trestbps          chol     fbs     restecg
##  Min.   :29.00   0:312   0:497   Min.   : 94.0   Min.   :126   0:872   0:497  
##  1st Qu.:48.00   1:713   1:167   1st Qu.:120.0   1st Qu.:211   1:153   1:513  
##  Median :56.00           2:284   Median :130.0   Median :240           2: 15  
##  Mean   :54.43           3: 77   Mean   :131.6   Mean   :246                  
##  3rd Qu.:61.00                   3rd Qu.:140.0   3rd Qu.:275                  
##  Max.   :77.00                   Max.   :200.0   Max.   :564                  
##     thalach      exang      oldpeak      slope         ca         thal   
##  Min.   : 71.0   0:680   Min.   :0.000   0: 74   Min.   :0.0000   0:  7  
##  1st Qu.:132.0   1:345   1st Qu.:0.000   1:482   1st Qu.:0.0000   1: 64  
##  Median :152.0           Median :0.800   2:469   Median :0.0000   2:544  
##  Mean   :149.1           Mean   :1.072           Mean   :0.7541   3:410  
##  3rd Qu.:166.0           3rd Qu.:1.800           3rd Qu.:1.0000          
##  Max.   :202.0           Max.   :6.200           Max.   :4.0000          
##  target   
##  neg:499  
##  pos:526  
##           
##           
##           
## 

STEP 3e. Measure the standard deviation of each variable —-

# calculate the standard deviation of only columns that are numeric, thus
# leaving out the columns termed as “factors” (categorical) or those that have
# a string data type.

sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)], sd)
##       age  trestbps      chol   thalach   oldpeak        ca 
##  9.072290 17.516718 51.592510 23.005724  1.175053  1.030798
#or
sapply(heart[, c(1, 4, 5, 8, 10, 12)], sd)
##       age  trestbps      chol   thalach   oldpeak        ca 
##  9.072290 17.516718 51.592510 23.005724  1.175053  1.030798

STEP 3f. Measure the kurtosis of each variable —-

# The Kurtosis informs you of how often outliers occur in the results.
# There are different formulas for calculating kurtosis.
# Specifying “type = 2” allows us to use the 2nd formula which is the same
# kurtosis formula used in SPSS and SAS.

# In “type = 2” (used in SPSS and SAS):
# 1.    Kurtosis < 3 implies a low number of outliers
# 2.    Kurtosis = 3 implies a medium number of outliers
# 3.    Kurtosis > 3 implies a high number of outliers

if (!is.element("e1071", installed.packages()[, 1])) {
  install.packages("e1071", dependencies = TRUE)
}
require("e1071")

sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  kurtosis, type = 2)
##         age    trestbps        chol     thalach     oldpeak          ca 
## -0.52561781  0.99122074  3.99680305 -0.08882249  1.31447089  0.70112287

STEP 3g. Measure the skewness of each variable—-

# The skewness informs you of the asymmetry of the distribution of results.
# Using “type = 2” can be interpreted as:

# 1.    Skewness between -0.4 and 0.4 (inclusive) implies that there is no skew
# in the distribution of results; the distribution of results is symmetrical;
# it is a normal distribution.
# 2.    Skewness above 0.4 implies a positive skew; a right-skewed distribution.
# 3.    Skewness below -0.4 implies a negative skew; a left-skewed distribution.

sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  skewness, type = 2)
##        age   trestbps       chol    thalach    oldpeak         ca 
## -0.2488659  0.7397682  1.0740728 -0.5137772  1.2108994  1.2611886

STEP 3h. Measure the skewness of each variable—-

# Note that the covariance and the correlation are computed for numeric values
# only, not categorical values.

heart_cov <- cov(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)])
#View(heart_cov)

STEP 3i. Measure the correlation between variables —-

heart_cor <- cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)])
#View(heart_cor)

STEP 3j. Inferential Statistics —-

# One-Way ANOVA can be used to test the effect of the 3 types of fertilizer on
# crop yield whereas,
# Two-Way ANOVA can be used to test the effect of the 3 types of fertilizer and
# the 2 types of planting density on crop yield.
heart_one_way_anova <- aov(trestbps ~ age, data = heart)
summary(heart_one_way_anova)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## age            1  23096   23096   81.16 <2e-16 ***
## Residuals   1023 291104     285                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The ANOVA rejects the null hypothesis,The ANOVA indicates a significant difference in resting blood pressure among age groups 
#(F(1, 1023) = 81.16, p < 2e-16), highlighting age as a key factor 
#in determining blood pressure. 
#This aligns with cardiovascular knowledge, correlating increased age with a higher risk of cardiovascular disease.

heart_two_way_anova <- aov(trestbps ~ exang + ca, # nolint
                                           data = heart)
summary(heart_two_way_anova)
##               Df Sum Sq Mean Sq F value  Pr(>F)   
## exang          1   1177  1176.7    3.88 0.04914 * 
## ca             1   3050  3050.2   10.06 0.00156 **
## Residuals   1022 309973   303.3                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#both variables, exercise-induced angina and the number of major vessels, are associated with statistically significant differences in 
#resting blood pressure.

Univariate Plots —-

STEP 3k. Create Histograms for Each Numeric Attribute —-

# Histograms help in determining whether an attribute has a Gaussian
# distribution. They can also be used to identify the presence of outliers.



par(mfrow = c(1, 3))

for (i in c(1, 4, 5)) {
  heart_variable <- as.numeric(unlist(heart[, i]))
  hist(heart_variable, main = names(heart)[i])
}

heart_health_variable  <- as.numeric( unlist(heart[, 8]))
hist(heart_health_variable , main = names(heart)[8])

heart_health_variable  <- as.numeric( unlist(heart [, 10]))
hist(heart_health_variable , main = names(heart)[10])

heart_health_variable  <- as.numeric( unlist(heart [, 12]))
hist(heart_health_variable , main = names(heart)[12])

STEP 3l. Create Box and Whisker Plots for Each Numeric Attribute —-

# Box and whisker plots are useful in understanding the distribution of data.

par(mfrow = c(1, 3))
for (i in c(1, 4, 5)) {
  boxplot(heart[, i], main = names(heart)[i])
}

boxplot(heart[, 8], main = names(heart)[8])
boxplot(heart[, 10], main = names(heart)[10])
boxplot(heart[, 12], main = names(heart)[12])

STEP 3m. Create Bar Plots for Each Categorical Attribute —-

# Categorical attributes (factors) can also be visualized. This is done using a
# bar chart to give an idea of the proportion of instances that belong to each
# category.

barplot(table(heart[, 2]), main = names(heart)[2])

barplot(table(heart[, 3]), main = names(heart)[3])

barplot(table(heart[, 6]), main = names(heart)[6])

barplot(table(heart[, 7]), main = names(heart)[7])

barplot(table(heart[, 9]), main = names(heart)[9])

barplot(table(heart[, 11]), main = names(heart)[11])

barplot(table(heart[, 13]), main = names(heart)[13])

barplot(table(heart[, 14]), main = names(heart)[14])

STEP 3n. Create a Missingness Map to Identify Missing Data —-

# Execute the following to create a map to identify the missing data in each
# dataset:
if (!is.element("Amelia", installed.packages()[, 1])) {
  install.packages("Amelia", dependencies = TRUE)
}
require("Amelia")
#comment
missmap(heart, col = c("red", "grey"), legend = TRUE)

Multivariate Plots —-

STEP 3o. Create a Correlation Plot —-

# Correlation plots can be used to get an idea of which attributes change
# together. The function “corrplot()” found in the package “corrplot” is
# required to perform this. The larger the dot in the correlation plot, the
# larger the correlation. Blue represents a positive correlation whereas red
# represents a negative correlation.

if (!is.element("corrplot", installed.packages()[, 1])) {
  install.packages("corrplot", dependencies = TRUE)
}
require("corrplot")
## Loading required package: corrplot

## corrplot 0.92 loaded
corrplot(cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)]), method = "circle")

#heart <- heart[, -which(names(heart) == "target_numeric")]

# Alternatively, the 'ggcorrplot::ggcorrplot()' function can be used to plot a
# more visually appealing plot.
# The code below shows how to install a package in R:
if (!is.element("ggcorrplot", installed.packages()[, 1])) {
  install.packages("ggcorrplot", dependencies = TRUE)
}
require("ggcorrplot")
## Loading required package: ggcorrplot
ggcorrplot(cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)]))

STEP 3p. Create a Scatter Plot —-

pairs(heart)

# Alternatively, the ggcorrplot package can be used to make the plots more
# appealing:
ggplot(heart,
       aes(x = age, y = sex, shape = target, color = target)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula = 'y ~ x'

Milestone 2: Preprocessing and Data Transformation (Lab 3 and 4) —-

STEP 4. Create a subset of the variables/features —-

### Subset of rows ----
# We then select 500 random observations to be included in the dataset
rand_ind <- sample(seq_len(nrow(heart)), 500)
heart <- heart[rand_ind, ]

STEP 4a. Confirm the “missingness” in the Dataset before Imputation —-

# Are there missing values in the dataset?
any_na(heart)
## [1] FALSE
# How many?
n_miss(heart)
## [1] 0
# What is the percentage of missing data in the entire dataset?
prop_miss(heart)
## [1] 0
# How many missing values does each variable have?
heart %>% is.na() %>% colSums()
##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        0        0        0        0        0        0        0        0 
##    exang  oldpeak    slope       ca     thal   target 
##        0        0        0        0        0        0
# What is the number and percentage of missing values grouped by
# each variable?
miss_var_summary(heart)
## # A tibble: 14 × 3
##    variable n_miss pct_miss
##    <chr>     <int>    <dbl>
##  1 age           0        0
##  2 sex           0        0
##  3 cp            0        0
##  4 trestbps      0        0
##  5 chol          0        0
##  6 fbs           0        0
##  7 restecg       0        0
##  8 thalach       0        0
##  9 exang         0        0
## 10 oldpeak       0        0
## 11 slope         0        0
## 12 ca            0        0
## 13 thal          0        0
## 14 target        0        0
# What is the number and percentage of missing values grouped by
# each observation?
miss_case_summary(heart)
## # A tibble: 500 × 3
##     case n_miss pct_miss
##    <int>  <int>    <dbl>
##  1     1      0        0
##  2     2      0        0
##  3     3      0        0
##  4     4      0        0
##  5     5      0        0
##  6     6      0        0
##  7     7      0        0
##  8     8      0        0
##  9     9      0        0
## 10    10      0        0
## # ℹ 490 more rows
# Which variables contain the most missing values?
#gg_miss_var(heart)

# Where are missing values located (the shaded regions in the plot)?
#vis_miss(heart) + theme(axis.text.x = element_text(angle = 80))

# Which combinations of variables are missing together?
#gg_miss_upset(heart)

# Create a heatmap of "missingness" broken down by "target"
# First, confirm that the "target" variable is a categorical variable
is.factor(heart$target)
## [1] TRUE
# Second, create the visualization
#gg_miss_fct(heart, fct = target)

# We can also create a heatmap of "missingness" broken down by "exang"
# First, confirm that the "exang" variable is a categorical variable

is.factor(heart$exang)
## [1] TRUE
# Second, create the visualization
#gg_miss_fct(heart, fct = exang)

STEP 4b. Apply a Scale Data Transform —-

# Scaling Data Transformation Purpose:
# The scaling data transformation is performed to standardize or normalize
# the numeric features in a dataset. It ensures that different features
# with varying scales are brought to a common scale, preventing one feature
# from dominating the others during model training.
#
# Scaling is essential for algorithms that are sensitive to the magnitude
# of input features, such as distance-based methods (e.g., k-Nearest Neighbors)
# or gradient descent-based optimization algorithms (e.g., Support Vector Machines).
#
# By scaling the data, we make the features comparable and contribute equally
# to the model's learning process, improving the stability and performance of
# machine learning models.


# Summary of each variable
summary(heart)
##       age        sex     cp         trestbps          chol       fbs    
##  Min.   :29.00   0:147   0:232   Min.   : 94.0   Min.   :126.0   0:418  
##  1st Qu.:48.00   1:353   1: 87   1st Qu.:120.0   1st Qu.:211.0   1: 82  
##  Median :55.00           2:138   Median :130.0   Median :240.0          
##  Mean   :54.05           3: 43   Mean   :131.9   Mean   :245.4          
##  3rd Qu.:60.00                   3rd Qu.:140.0   3rd Qu.:275.0          
##  Max.   :77.00                   Max.   :200.0   Max.   :564.0          
##  restecg    thalach      exang      oldpeak      slope         ca       
##  0:251   Min.   : 71.0   0:319   Min.   :0.000   0: 43   Min.   :0.000  
##  1:238   1st Qu.:132.0   1:181   1st Qu.:0.000   1:239   1st Qu.:0.000  
##  2: 11   Median :152.0           Median :0.800   2:218   Median :0.000  
##          Mean   :148.9           Mean   :1.078           Mean   :0.768  
##          3rd Qu.:165.0           3rd Qu.:1.800           3rd Qu.:1.000  
##          Max.   :202.0           Max.   :6.200           Max.   :4.000  
##  thal    target   
##  0:  4   neg:247  
##  1: 35   pos:253  
##  2:260            
##  3:201            
##                   
## 
# BEFORE

heart_health_variable <- as.numeric( unlist(heart [, 1]))
hist(heart_health_variable , main = names(heart_health_variable )[1])

heart_health_variable  <- as.numeric( unlist(heart [, 4]))
hist(heart_health_variable , main = names(heart)[4])

heart_health_variable  <- as.numeric( unlist(heart [, 5]))
hist(heart_health_variable , main = names(heart)[5])

heart_health_variable  <- as.numeric( unlist(heart [, 8]))
hist(heart_health_variable , main = names(heart)[8])

heart_health_variable <- as.numeric( unlist(heart [, 10]))
hist(heart_health_variable , main = names(heart)[10])

heart_health_variable  <- as.numeric( unlist(heart [, 12]))
hist(heart_health_variable , main = names(heart)[12])

model_of_the_transform <- preProcess(heart, method = c("scale"))
print(model_of_the_transform)
## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - ignored (8)
##   - scaled (6)
heart_scale_transform <- predict(model_of_the_transform,
                                               heart)


# AFTER 
#1, 4, 5, 8, 10, 12

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 1]))
hist(heart_health_variable , main = names(heart_scale_transform)[1])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 4]))
hist(heart_health_variable , main = names(heart_scale_transform)[4])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 5]))
hist(heart_health_variable , main = names(heart_scale_transform)[5])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 8]))
hist(heart_health_variable , main = names(heart_scale_transform)[8])

heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 10]))
hist(heart_health_variable , main = names(heart_scale_transform)[10])

heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 12]))
hist(heart_health_variable , main = names(heart_scale_transform)[12])

View(heart_scale_transform)

STEP 4c. Apply a Center Data Transform —-

# Centering Data Transformation Purpose:
# Shifts numeric features by subtracting their mean,
# ensuring a centered distribution around zero.
# Useful for algorithms sensitive to variable means,
# promoting better model interpretability.



# Summary of each variable
summary(heart)
##       age        sex     cp         trestbps          chol       fbs    
##  Min.   :29.00   0:147   0:232   Min.   : 94.0   Min.   :126.0   0:418  
##  1st Qu.:48.00   1:353   1: 87   1st Qu.:120.0   1st Qu.:211.0   1: 82  
##  Median :55.00           2:138   Median :130.0   Median :240.0          
##  Mean   :54.05           3: 43   Mean   :131.9   Mean   :245.4          
##  3rd Qu.:60.00                   3rd Qu.:140.0   3rd Qu.:275.0          
##  Max.   :77.00                   Max.   :200.0   Max.   :564.0          
##  restecg    thalach      exang      oldpeak      slope         ca       
##  0:251   Min.   : 71.0   0:319   Min.   :0.000   0: 43   Min.   :0.000  
##  1:238   1st Qu.:132.0   1:181   1st Qu.:0.000   1:239   1st Qu.:0.000  
##  2: 11   Median :152.0           Median :0.800   2:218   Median :0.000  
##          Mean   :148.9           Mean   :1.078           Mean   :0.768  
##          3rd Qu.:165.0           3rd Qu.:1.800           3rd Qu.:1.000  
##          Max.   :202.0           Max.   :6.200           Max.   :4.000  
##  thal    target   
##  0:  4   neg:247  
##  1: 35   pos:253  
##  2:260            
##  3:201            
##                   
## 
# BEFORE

heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 1]))
boxplot(heart_health_variable , main = names(heart_scale_transform )[1])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 4]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[4])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 5]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[5])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 8]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[8])

heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 10]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[10])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 12]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[12])

model_of_the_transform <- preProcess(heart_scale_transform, method = c("center"))
print(model_of_the_transform)
## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - centered (6)
##   - ignored (8)
heart_center_transform <- predict(model_of_the_transform,
                                  heart_scale_transform)


# AFTER 
#1, 4, 5, 8, 10, 12

heart_scale_transform  <- as.numeric( unlist(heart_center_transform [, 1]))
boxplot(heart_health_variable , main = names(heart_center_transform)[1])

heart_scale_transform <- as.numeric( unlist(heart_center_transform [, 4]))
boxplot(heart_health_variable , main = names(heart_center_transform)[4])

heart_scale_transform  <- as.numeric( unlist(heart_center_transform [, 5]))
boxplot(heart_health_variable , main = names(heart_center_transform)[5])

heart_scale_transform  <- as.numeric( unlist(heart_center_transform [, 8]))
boxplot(heart_health_variable , main = names(heart_center_transform)[8])

heart_scale_transform <- as.numeric( unlist(heart_center_transform [, 10]))
boxplot(heart_health_variable , main = names(heart_center_transform)[10])

heart_scale_transform <- as.numeric( unlist(heart_center_transform [, 12]))
boxplot(heart_health_variable , main = names(heart_center_transform)[12])

View(heart_center_transform)

The Standardize Basic Transform on the heart Dataset —-

# BEFORE
#If you've already scaled and centered your data, you have made it comparable and adjusted for differences in means.
#Standardization is typically done to bring variables to a standard normal distribution (mean of 0 and standard deviation 
#of 1).
summary(heart_center_transform)
##       age          sex     cp         trestbps            chol         fbs    
##  Min.   :-2.9053   0:147   0:232   Min.   :-2.1539   Min.   :-2.4217   0:418  
##  1st Qu.:-0.7020   1:353   1: 87   1st Qu.:-0.6772   1st Qu.:-0.6983   1: 82  
##  Median : 0.1097           2:138   Median :-0.1093   Median :-0.1104          
##  Mean   : 0.0000           3: 43   Mean   : 0.0000   Mean   : 0.0000          
##  3rd Qu.: 0.6895                   3rd Qu.: 0.4587   3rd Qu.: 0.5992          
##  Max.   : 2.6609                   Max.   : 3.8663   Max.   : 6.4586          
##  restecg    thalach        exang      oldpeak        slope         ca         
##  0:251   Min.   :-3.4495   0:319   Min.   :-0.9168   0: 43   Min.   :-0.7282  
##  1:238   1st Qu.:-0.7497   1:181   1st Qu.:-0.9168   1:239   1st Qu.:-0.7282  
##  2: 11   Median : 0.1354           Median :-0.2367   2:218   Median :-0.7282  
##          Mean   : 0.0000           Mean   : 0.0000           Mean   : 0.0000  
##          3rd Qu.: 0.7108           3rd Qu.: 0.6135           3rd Qu.: 0.2200  
##          Max.   : 2.3484           Max.   : 4.3540           Max.   : 3.0644  
##  thal    target   
##  0:  4   neg:247  
##  1: 35   pos:253  
##  2:260            
##  3:201            
##                   
## 
sapply(heart_center_transform[, -c(2, 3, 6, 7, 9, 11, 13, 14)], sd)
##      age trestbps     chol  thalach  oldpeak       ca 
##        1        1        1        1        1        1
model_of_the_transform <- preProcess(heart_center_transform,
                                     method = c("scale", "center"))
print(model_of_the_transform)
## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - centered (6)
##   - ignored (8)
##   - scaled (6)
heart_standardize_transform <- predict(model_of_the_transform, # nolint
                                       heart_center_transform)

# AFTER
summary(heart_standardize_transform)
##       age          sex     cp         trestbps            chol         fbs    
##  Min.   :-2.9053   0:147   0:232   Min.   :-2.1539   Min.   :-2.4217   0:418  
##  1st Qu.:-0.7020   1:353   1: 87   1st Qu.:-0.6772   1st Qu.:-0.6983   1: 82  
##  Median : 0.1097           2:138   Median :-0.1093   Median :-0.1104          
##  Mean   : 0.0000           3: 43   Mean   : 0.0000   Mean   : 0.0000          
##  3rd Qu.: 0.6895                   3rd Qu.: 0.4587   3rd Qu.: 0.5992          
##  Max.   : 2.6609                   Max.   : 3.8663   Max.   : 6.4586          
##  restecg    thalach        exang      oldpeak        slope         ca         
##  0:251   Min.   :-3.4495   0:319   Min.   :-0.9168   0: 43   Min.   :-0.7282  
##  1:238   1st Qu.:-0.7497   1:181   1st Qu.:-0.9168   1:239   1st Qu.:-0.7282  
##  2: 11   Median : 0.1354           Median :-0.2367   2:218   Median :-0.7282  
##          Mean   : 0.0000           Mean   : 0.0000           Mean   : 0.0000  
##          3rd Qu.: 0.7108           3rd Qu.: 0.6135           3rd Qu.: 0.2200  
##          Max.   : 2.3484           Max.   : 4.3540           Max.   : 3.0644  
##  thal    target   
##  0:  4   neg:247  
##  1: 35   pos:253  
##  2:260            
##  3:201            
##                   
## 
sapply(heart_standardize_transform[, -c(2, 3, 6, 7, 9, 11, 13, 14)], sd)
##      age trestbps     chol  thalach  oldpeak       ca 
##        1        1        1        1        1        1

Box-Cox Transform—-

# BEFORE
summary(heart_standardize_transform)
##       age          sex     cp         trestbps            chol         fbs    
##  Min.   :-2.9053   0:147   0:232   Min.   :-2.1539   Min.   :-2.4217   0:418  
##  1st Qu.:-0.7020   1:353   1: 87   1st Qu.:-0.6772   1st Qu.:-0.6983   1: 82  
##  Median : 0.1097           2:138   Median :-0.1093   Median :-0.1104          
##  Mean   : 0.0000           3: 43   Mean   : 0.0000   Mean   : 0.0000          
##  3rd Qu.: 0.6895                   3rd Qu.: 0.4587   3rd Qu.: 0.5992          
##  Max.   : 2.6609                   Max.   : 3.8663   Max.   : 6.4586          
##  restecg    thalach        exang      oldpeak        slope         ca         
##  0:251   Min.   :-3.4495   0:319   Min.   :-0.9168   0: 43   Min.   :-0.7282  
##  1:238   1st Qu.:-0.7497   1:181   1st Qu.:-0.9168   1:239   1st Qu.:-0.7282  
##  2: 11   Median : 0.1354           Median :-0.2367   2:218   Median :-0.7282  
##          Mean   : 0.0000           Mean   : 0.0000           Mean   : 0.0000  
##          3rd Qu.: 0.7108           3rd Qu.: 0.6135           3rd Qu.: 0.2200  
##          Max.   : 2.3484           Max.   : 4.3540           Max.   : 3.0644  
##  thal    target   
##  0:  4   neg:247  
##  1: 35   pos:253  
##  2:260            
##  3:201            
##                   
## 
#Calculate the skewness before the Box-Cox transform
sapply(heart_standardize_transform[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  skewness, type = 2)
##        age   trestbps       chol    thalach    oldpeak         ca 
## -0.1661626  0.6821373  0.8206755 -0.5659672  1.1861899  1.2872057
# **************************************************************************************************************
#  RESULTS
#  Age: The skewness is relatively small (-0.27). You might not need a Box-Cox transformation for age,
#  especially since the skewness is not highly pronounced.
  
#  trestbps (resting blood pressure): The skewness is moderate (0.78). Considering the moderate skewness,
#  you could experiment with a Box-Cox transformation to see if it improves the distribution.
  
#  chol (serum cholesterol): The skewness is high (1.49). A Box-Cox transformation is often beneficial for 
#  skewed variables, so you might consider applying it to cholesterol.
  
#  thalach (maximum heart rate achieved): The skewness is relatively small (-0.56). A Box-Cox transformation
#  might not be necessary for thalach, especially if the skewness is not a significant concern.
  
#  oldpeak (ST depression induced by exercise): The skewness is high (1.21). Given the high skewness, a Box-Cox 
#  transformation could be beneficial for oldpeak.
  
#  ca (number of major vessels colored by fluoroscopy): The skewness is high (1.35). Similar to cholesterol and 
#  oldpeak, a Box-Cox transformation might be considered for ca.
# **************************************************************************************************************

  
#Plot a histogram to view the skewness before the Box-Cox transform

heart_health_variable  <- as.numeric( unlist(heart_standardize_transform [, 1]))
hist(heart_health_variable , main = names(heart_standardize_transform)[1])

heart_health_variable  <- as.numeric( unlist(heart_standardize_transform [, 4]))
hist(heart_health_variable , main = names(heart_standardize_transform)[4])

heart_health_variable  <- as.numeric( unlist(heart_standardize_transform [, 5]))
hist(heart_health_variable , main = names(heart_standardize_transform)[5])

heart_health_variable  <- as.numeric( unlist(heart_standardize_transform [, 8]))
hist(heart_health_variable , main = names(heart_standardize_transform)[8])

heart_health_variable <- as.numeric( unlist(heart_standardize_transform [, 10]))
hist(heart_health_variable , main = names(heart_standardize_transform)[10])

heart_health_variable  <- as.numeric( unlist(heart_standardize_transform [, 12]))
hist(heart_health_variable , main = names(heart_standardize_transform)[12])



# Select the columns of interest
columns_of_interest <- c(12, 10, 5, 4)

# Apply Box-Cox transformation only to selected columns
model_of_the_transform <- preProcess(heart_standardize_transform[, columns_of_interest], method = c("BoxCox"))

# Print the transformation details
print(model_of_the_transform)
## Created from 500 samples and 0 variables
## 
## Pre-processing:
##   - ignored (0)
# Apply the transformation to the selected columns
heart_box_cox_transform <- predict(model_of_the_transform, heart_standardize_transform[, columns_of_interest])




# AFTER 
#1, 4, 5, 8, 10, 12

sapply(heart_box_cox_transform, skewness, type = 2)
##        ca   oldpeak      chol  trestbps 
## 1.2872057 1.1861899 0.8206755 0.6821373
# **************************************************************************************************************

#RESULT
#ca: Skewness is approximately 1.35. A positive skewness indicates that the distribution
#has a longer right tail.

#oldpeak: Skewness is approximately 1.21. Similar to "ca," the positive skewness
#suggests a longer right tail in the distribution.

#chol: Skewness is approximately 1.49. Again, positive skewness indicates a longer 
#right tail.

#trestbps: Skewness is approximately 0.78. The positive skewness suggests a longer 
#right tail, but it's less pronounced compared to the other variables.
# **************************************************************************************************************

heart_health_variable  <- as.numeric( unlist(heart_box_cox_transform[, 1]))
hist(heart_health_variable , main = names(heart_box_cox_transform)[1])

heart_health_variable  <- as.numeric( unlist(heart_box_cox_transform [, 2]))
hist(heart_health_variable , main = names(heart_box_cox_transform)[2])

heart_health_variable <- as.numeric( unlist(heart_box_cox_transform[, 3]))
hist(heart_health_variable , main = names(heart_box_cox_transform)[3])

heart_health_variable <- as.numeric( unlist(heart_box_cox_transform [, 4]))
hist(heart_health_variable , main = names(heart_box_cox_transform)[4])

PCA for Feature Extraction on the Boston Housing Dataset —-

### PCA for Dimensionality Reduction on the heart Dataset ----
# Combine original and transformed columns
heart_combined <- cbind(heart_standardize_transform[, c("age", "thalach")], heart_box_cox_transform)
summary(heart_combined )
##       age             thalach              ca             oldpeak       
##  Min.   :-2.9053   Min.   :-3.4495   Min.   :-0.7282   Min.   :-0.9168  
##  1st Qu.:-0.7020   1st Qu.:-0.7497   1st Qu.:-0.7282   1st Qu.:-0.9168  
##  Median : 0.1097   Median : 0.1354   Median :-0.7282   Median :-0.2367  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6895   3rd Qu.: 0.7108   3rd Qu.: 0.2200   3rd Qu.: 0.6135  
##  Max.   : 2.6609   Max.   : 2.3484   Max.   : 3.0644   Max.   : 4.3540  
##       chol            trestbps      
##  Min.   :-2.4217   Min.   :-2.1539  
##  1st Qu.:-0.6983   1st Qu.:-0.6772  
##  Median :-0.1104   Median :-0.1093  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.5992   3rd Qu.: 0.4587  
##  Max.   : 6.4586   Max.   : 3.8663
model_of_the_transform <- preProcess(heart_combined, method =
                                       c("scale", "center", "pca"))

print(model_of_the_transform)
## Created from 500 samples and 6 variables
## 
## Pre-processing:
##   - centered (6)
##   - ignored (0)
##   - principal component signal extraction (6)
##   - scaled (6)
## 
## PCA needed 6 components to capture 95 percent of the variance
heart_pca_dr <- predict(model_of_the_transform, heart_combined)

summary(heart_pca_dr)
##       PC1               PC2                PC3                PC4         
##  Min.   :-3.5729   Min.   :-4.61491   Min.   :-2.68626   Min.   :-3.2790  
##  1st Qu.:-0.9763   1st Qu.:-0.70822   1st Qu.:-0.59001   1st Qu.:-0.6209  
##  Median : 0.1276   Median : 0.04232   Median :-0.01535   Median :-0.1118  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.9957   3rd Qu.: 0.69754   3rd Qu.: 0.65186   3rd Qu.: 0.5076  
##  Max.   : 3.5470   Max.   : 2.65747   Max.   : 2.77273   Max.   : 3.3689  
##       PC5                 PC6         
##  Min.   :-2.914974   Min.   :-1.6775  
##  1st Qu.:-0.486171   1st Qu.:-0.4703  
##  Median : 0.002314   Median :-0.0576  
##  Mean   : 0.000000   Mean   : 0.0000  
##  3rd Qu.: 0.509235   3rd Qu.: 0.4884  
##  Max.   : 2.305950   Max.   : 1.8128
### PCA for Feature Extraction  ----


heart_pca_fe <- princomp(cor(heart_combined))
summary(heart_pca_fe)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4     Comp.5
## Standard deviation     0.6784939 0.4086560 0.3626970 0.3285725 0.22894556
## Proportion of Variance 0.5007774 0.1816638 0.1431004 0.1174398 0.05701868
## Cumulative Proportion  0.5007774 0.6824412 0.8255415 0.9429813 1.00000000
##                        Comp.6
## Standard deviation          0
## Proportion of Variance      0
## Cumulative Proportion       1
# **************************************************************************************************************
#RESULTS
#the PCA results suggest that the first five principal components retain the majority of the information in your dataset. 
#The subsequent components (Comp.6 ) contribute very little or no additional information. I may consider using 
#the first five components for further analysis, as they represent the most important features in terms of explaining the 
#variance in the data. The negligible contribution of Comp.6  might indicate that these components are not providing 
#meaningful information for the analysis.
# **************************************************************************************************************



#### Scree Plot ----
# The cumulative proportion reaches 100% after Comp.5, indicating that the first five components capture 
#all the variance in the original data.
factoextra::fviz_eig(heart_pca_fe, addlabels = TRUE)

heart_pca_fe$loadings[, 1:5]
##              Comp.1     Comp.2     Comp.3     Comp.4      Comp.5
## age       0.4888932  0.3079925  0.1373123  0.3313284  0.63564647
## thalach  -0.6719959 -0.1153564 -0.1146072  0.1299428  0.12801749
## ca        0.2988011 -0.5127746  0.5418997  0.2716502 -0.42301131
## oldpeak   0.4300805 -0.2211574 -0.4284532 -0.6263484 -0.05336626
## chol     -0.0495485  0.6985520  0.4246252 -0.3540574 -0.36237634
## trestbps  0.1808277  0.3033320 -0.5572042  0.5309177 -0.51619952
unique(rownames(heart_pca_fe$loadings[, 1:5]))
## [1] "age"      "thalach"  "ca"       "oldpeak"  "chol"     "trestbps"
factoextra::fviz_cos2(heart_pca_fe, choice = "var", axes = 1:5)

factoextra::fviz_pca_var(heart_pca_fe, col.var = "cos2",
                         gradient.cols = c("red", "orange", "green"),
                         repel = TRUE)

MCA for Feature Extraction —-

# Convert relevant columns to factors
categorical_cols <- c("sex", "cp", "fbs", "restecg", "exang", "slope", "thal", "target")
heart_categorical <- heart
heart_categorical[categorical_cols] <- lapply(heart_categorical[categorical_cols], as.factor)

# Perform Multiple Correspondence Analysis (MCA)
heart_mca <- MCA(heart_categorical[, c(2, 3, 6, 7, 9, 11, 13, 14)])

# Visualize MCA results for individuals
# Summary of the MCA results
summary(heart_mca)
## 
## Call:
## MCA(X = heart_categorical[, c(2, 3, 6, 7, 9, 11, 13, 14)]) 
## 
## 
## Eigenvalues
##                        Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6   Dim.7
## Variance               0.344   0.170   0.157   0.140   0.133   0.126   0.117
## % of var.             19.684   9.724   8.966   7.973   7.581   7.186   6.669
## Cumulative % of var.  19.684  29.408  38.374  46.347  53.927  61.113  67.782
##                        Dim.8   Dim.9  Dim.10  Dim.11  Dim.12  Dim.13  Dim.14
## Variance               0.115   0.100   0.094   0.090   0.065   0.055   0.045
## % of var.              6.564   5.739   5.367   5.129   3.703   3.146   2.569
## Cumulative % of var.  74.347  80.085  85.453  90.582  94.285  97.431 100.000
## 
## Individuals (the 10 first)
##              Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3    ctr   cos2
## 1         | -0.139  0.011  0.011 |  0.196  0.045  0.022 |  0.407  0.211  0.095
## 2         | -0.221  0.028  0.047 | -0.483  0.274  0.224 |  0.283  0.102  0.077
## 3         | -0.649  0.245  0.342 | -0.415  0.202  0.139 |  0.224  0.064  0.040
## 4         |  0.583  0.197  0.360 | -0.231  0.063  0.056 | -0.047  0.003  0.002
## 5         |  0.083  0.004  0.006 | -0.627  0.463  0.332 |  0.247  0.078  0.051
## 6         |  0.583  0.197  0.360 | -0.231  0.063  0.056 | -0.047  0.003  0.002
## 7         | -0.745  0.322  0.548 |  0.105  0.013  0.011 | -0.160  0.033  0.025
## 8         | -0.533  0.165  0.232 | -0.372  0.163  0.114 |  0.484  0.299  0.192
## 9         | -0.063  0.002  0.005 | -0.061  0.004  0.005 |  0.029  0.001  0.001
## 10        | -0.794  0.366  0.493 |  0.115  0.016  0.010 | -0.516  0.340  0.209
##            
## 1         |
## 2         |
## 3         |
## 4         |
## 5         |
## 6         |
## 7         |
## 8         |
## 9         |
## 10        |
## 
## Categories (the 10 first)
##               Dim.1     ctr    cos2  v.test     Dim.2     ctr    cos2  v.test  
## sex_0     |  -0.522   2.912   0.114  -7.531 |  -0.983  20.866   0.402 -14.169 |
## sex_1     |   0.218   1.213   0.114   7.531 |   0.409   8.689   0.402  14.169 |
## cp_0      |   0.766   9.872   0.508  15.915 |  -0.343   4.017   0.102  -7.135 |
## cp_1      |  -0.930   5.459   0.182  -9.533 |   0.055   0.039   0.001   0.565 |
## cp_2      |  -0.700   4.909   0.187  -9.656 |   0.021   0.009   0.000   0.286 |
## cp_3      |  -0.003   0.000   0.000  -0.022 |   1.674  17.704   0.264  11.471 |
## fbs_0     |  -0.038   0.044   0.007  -1.926 |  -0.180   2.001   0.166  -9.103 |
## fbs_1     |   0.195   0.226   0.007   1.926 |   0.920  10.199   0.166   9.103 |
## restecg_0 |   0.246   1.099   0.061   5.508 |   0.150   0.830   0.023   3.364 |
## restecg_1 |  -0.302   1.575   0.083  -6.429 |   0.011   0.004   0.000   0.226 |
##             Dim.3     ctr    cos2  v.test  
## sex_0       0.570   7.603   0.135   8.213 |
## sex_1      -0.237   3.166   0.135  -8.213 |
## cp_0       -0.144   0.766   0.018  -2.993 |
## cp_1       -0.637   5.623   0.085  -6.530 |
## cp_2        0.493   5.347   0.093   6.802 |
## cp_3        0.483   1.598   0.022   3.309 |
## fbs_0      -0.247   4.052   0.310 -12.441 |
## fbs_1       1.257  20.657   0.310  12.441 |
## restecg_0   0.361   5.216   0.131   8.099 |
## restecg_1  -0.465   8.203   0.196  -9.902 |
## 
## Categorical variables (eta2)
##             Dim.1 Dim.2 Dim.3  
## sex       | 0.114 0.402 0.135 |
## cp        | 0.558 0.296 0.167 |
## fbs       | 0.007 0.166 0.310 |
## restecg   | 0.093 0.305 0.242 |
## exang     | 0.469 0.053 0.003 |
## slope     | 0.372 0.055 0.111 |
## thal      | 0.496 0.083 0.283 |
## target    | 0.647 0.002 0.004 |
heart_numerical_combined_with_factors <- cbind( heart_combined, heart[, c(2, 3, 6, 7, 9, 11, 13, 14)])
summary(heart_numerical_combined_with_factors)
##       age             thalach              ca             oldpeak       
##  Min.   :-2.9053   Min.   :-3.4495   Min.   :-0.7282   Min.   :-0.9168  
##  1st Qu.:-0.7020   1st Qu.:-0.7497   1st Qu.:-0.7282   1st Qu.:-0.9168  
##  Median : 0.1097   Median : 0.1354   Median :-0.7282   Median :-0.2367  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6895   3rd Qu.: 0.7108   3rd Qu.: 0.2200   3rd Qu.: 0.6135  
##  Max.   : 2.6609   Max.   : 2.3484   Max.   : 3.0644   Max.   : 4.3540  
##       chol            trestbps       sex     cp      fbs     restecg exang  
##  Min.   :-2.4217   Min.   :-2.1539   0:147   0:232   0:418   0:251   0:319  
##  1st Qu.:-0.6983   1st Qu.:-0.6772   1:353   1: 87   1: 82   1:238   1:181  
##  Median :-0.1104   Median :-0.1093           2:138           2: 11          
##  Mean   : 0.0000   Mean   : 0.0000           3: 43                          
##  3rd Qu.: 0.5992   3rd Qu.: 0.4587                                          
##  Max.   : 6.4586   Max.   : 3.8663                                          
##  slope   thal    target   
##  0: 43   0:  4   neg:247  
##  1:239   1: 35   pos:253  
##  2:218   2:260            
##          3:201            
##                           
## 

ICA for Dimensionality Reduction on the Boston Housing Dataset —-

summary(heart_numerical_combined_with_factors )
##       age             thalach              ca             oldpeak       
##  Min.   :-2.9053   Min.   :-3.4495   Min.   :-0.7282   Min.   :-0.9168  
##  1st Qu.:-0.7020   1st Qu.:-0.7497   1st Qu.:-0.7282   1st Qu.:-0.9168  
##  Median : 0.1097   Median : 0.1354   Median :-0.7282   Median :-0.2367  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6895   3rd Qu.: 0.7108   3rd Qu.: 0.2200   3rd Qu.: 0.6135  
##  Max.   : 2.6609   Max.   : 2.3484   Max.   : 3.0644   Max.   : 4.3540  
##       chol            trestbps       sex     cp      fbs     restecg exang  
##  Min.   :-2.4217   Min.   :-2.1539   0:147   0:232   0:418   0:251   0:319  
##  1st Qu.:-0.6983   1st Qu.:-0.6772   1:353   1: 87   1: 82   1:238   1:181  
##  Median :-0.1104   Median :-0.1093           2:138           2: 11          
##  Mean   : 0.0000   Mean   : 0.0000           3: 43                          
##  3rd Qu.: 0.5992   3rd Qu.: 0.4587                                          
##  Max.   : 6.4586   Max.   : 3.8663                                          
##  slope   thal    target   
##  0: 43   0:  4   neg:247  
##  1:239   1: 35   pos:253  
##  2:218   2:260            
##          3:201            
##                           
## 
model_of_the_transform <- preProcess(heart_numerical_combined_with_factors ,
                                     method = c("scale", "center", "ica"),
                                     n.comp = 6)
print(model_of_the_transform)
## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - centered (6)
##   - independent component signal extraction (6)
##   - ignored (8)
##   - scaled (6)
## 
## ICA used 6 components
heart_numerical_combined_with_factors_ica_dr <- predict(model_of_the_transform, heart_numerical_combined_with_factors )

summary(heart_numerical_combined_with_factors_ica_dr)
##  sex     cp      fbs     restecg exang   slope   thal    target   
##  0:147   0:232   0:418   0:251   0:319   0: 43   0:  4   neg:247  
##  1:353   1: 87   1: 82   1:238   1:181   1:239   1: 35   pos:253  
##          2:138           2: 11           2:218   2:260            
##          3: 43                                   3:201            
##                                                                   
##                                                                   
##       ICA1              ICA2              ICA3               ICA4         
##  Min.   :-3.9822   Min.   :-2.6611   Min.   :-3.99645   Min.   :-2.35354  
##  1st Qu.:-0.4033   1st Qu.:-0.8592   1st Qu.:-0.52773   1st Qu.:-0.79879  
##  Median : 0.1058   Median : 0.2131   Median : 0.06454   Median :-0.02952  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.5811   3rd Qu.: 0.7607   3rd Qu.: 0.58032   3rd Qu.: 0.67525  
##  Max.   : 2.5152   Max.   : 2.5457   Max.   : 2.62976   Max.   : 2.60427  
##       ICA5              ICA6         
##  Min.   :-4.1765   Min.   :-2.13754  
##  1st Qu.:-0.5230   1st Qu.:-0.63119  
##  Median : 0.1354   Median :-0.03106  
##  Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.5958   3rd Qu.: 0.55989  
##  Max.   : 2.2694   Max.   : 6.63154
# ************************************************************************************************************************************

#RESULTS
# Summary of the Transformed Dataset after Independent Component Analysis (ICA)

# Categorical Variables:
# - sex, cp, fbs, restecg, exang, slope, thal, target are categorical variables with counts and percentages for each level.

# Independent Component Analysis (ICA) Components:
# - ICA1 to ICA6 represent the Independent Component Analysis components.
# - For each component, descriptive statistics are provided:
#   - Minimum, 1st Quartile, Median, Mean, 3rd Quartile, Maximum.

# Example for ICA1:
#   ICA1: Min. -4.3196, 1st Qu. -0.3524, Median 0.1066, Mean 0.0000, 3rd Qu. 0.5823, Max. 2.4614.

# These statistics offer insights into the distribution and characteristics of the transformed variables after applying ICA.
# ************************************************************************************************************************************

Milestone 3: Training the Model (Lab 5-8) —-

1. Split the dataset ====

heart_preprocessed <- heart_numerical_combined_with_factors

str(heart_preprocessed)
## 'data.frame':    500 obs. of  14 variables:
##  $ age     : num  0.805 -0.122 -1.746 0.805 -0.934 ...
##  $ thalach : num  -0.528 -0.263 0.135 -1.06 0.135 ...
##  $ ca      : num  -0.728 -0.728 -0.728 0.22 -0.728 ...
##  $ oldpeak : num  -0.0667 -0.5767 -0.9168 2.1437 -0.9168 ...
##  $ chol    : num  -0.0496 0.3762 -0.5159 -1.6107 -0.0496 ...
##  $ trestbps: num  1.027 -0.109 0.345 0.345 0.345 ...
##  $ sex     : Factor w/ 2 levels "0","1": 2 1 1 2 1 2 2 1 2 2 ...
##  $ cp      : Factor w/ 4 levels "0","1","2","3": 3 1 3 1 1 1 3 3 1 2 ...
##  $ fbs     : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
##  $ restecg : Factor w/ 3 levels "0","1","2": 2 1 2 1 1 1 2 1 1 2 ...
##  $ exang   : Factor w/ 2 levels "0","1": 2 1 1 2 2 2 1 1 1 1 ...
##  $ slope   : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 3 2 2 3 ...
##  $ thal    : Factor w/ 4 levels "0","1","2","3": 3 3 3 3 3 3 3 3 3 3 ...
##  $ target  : Factor w/ 2 levels "neg","pos": 2 2 2 1 2 1 2 2 2 2 ...
# Define a 75:25 train:test data split of the dataset.
# That is, 75% of the original data will be used to train the model and
# 25% of the original data will be used to test the model.
train_index <- createDataPartition(heart_preprocessed$target,
                                   p = 0.75,
                                   list = FALSE)
heart_preprocessed_train <- heart_preprocessed[train_index, ]
heart_preprocessed_test <- heart_preprocessed[-train_index, ]

2. Train a Naive Bayes classifier using the training dataset —-

# Train a Naive Bayes model using the caret package.
heart_model_nb_caret <- # nolint
  caret::train(target ~ ., data =
                 heart_preprocessed_train[, c("age", "sex", "cp", "trestbps" , "chol" , "fbs" ,
                           "restecg" , "thalach" , "exang",
                           "oldpeak", "slope", "ca",  "thal", "target")],
               method = "naive_bayes")

3. Test the trained model using the testing dataset —-

### 3.b. Test the trained caret Naive Bayes model using the testing dataset ----
predictions_nb_caret <-
  predict(heart_model_nb_caret,
          heart_preprocessed_test[, c("age", "sex", "cp", "trestbps" , "chol" , "fbs" ,
                         "restecg" , "thalach" , "exang",
                         "oldpeak", "slope", "ca",  "thal")])

Determine the Baseline Accuracy —-

heart_freq <- heart_preprocessed_train$target
cbind(frequency =
        table(heart_freq),
      percentage = prop.table(table(heart_freq)) * 100)
##     frequency percentage
## neg       186   49.46809
## pos       190   50.53191

Model Training and Evaluation using Generalized Linear Model (GLM)

# The glm function in this code is used to train a Generalized Linear Model
# for predicting the target variable based on the provided features in the training dataset.
# The subsequent printing of model details and performance metrics allows for an assessment of its effectiveness.
# Set up train control with classProbs = TRUE
train_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3, classProbs = TRUE)


set.seed(7)
heart_model_glm <-
  train(target ~ ., data = heart_preprocessed_train, method = "glm",
        metric = "Accuracy", trControl = train_control)


print(heart_model_glm)
## Generalized Linear Model 
## 
## 376 samples
##  13 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 338, 338, 338, 338, 338, 339, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8403272  0.6802454

LDA —-

# Set the seed for reproducibility
set.seed(7)

# Train a Linear Discriminant Analysis (LDA) model using the specified training control
heart_model_lda <- train(target ~ ., data = heart_preprocessed_train,
                         method = "lda", trControl = train_control)

CART —-

# Set the seed for reproducibility
set.seed(7)

# Train a Classification and Regression Trees (CART) model using the specified training control
heart_model_cart <- train(target ~ ., data = heart_preprocessed_train,
                          method = "rpart", trControl = train_control)

KNN —-

# Set the seed for reproducibility
set.seed(7)

# Train a k-Nearest Neighbors (knn) model using the specified training control
heart_model_knn <- train(target ~ ., data = heart_preprocessed_train,
                         method = "knn", trControl = train_control)

Random Forest —-

# Set the seed for reproducibility
set.seed(7)

# Train a Random Forest (rf) model using the specified training control
heart_model_rf <- train(target ~ ., data = heart_preprocessed_train,
                        method = "rf", trControl = train_control)

Call the resamples Function —-

# We then create a list of the model results and pass the list as an argument
# to the `resamples` function.

results <- resamples(list(LDA = heart_model_lda, CART = heart_model_cart,
                          KNN = heart_model_knn, GLM = heart_model_glm,
                          RF = heart_model_rf))

Add ROC curves for each model to the plot

# Install and load the pROC package 
# install.packages("pROC")
library(pROC)
## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Create an empty plot
plot(0, 0, type = "n", main = "ROC Curve", xlab = "False Positive Rate", ylab = "True Positive Rate")

predictions_lda <- predict(heart_model_lda, heart_preprocessed_test[, 1:13],type = "prob")
predictions_cart <- predict(heart_model_cart, heart_preprocessed_test[, 1:13],type = "prob")
predictions_knn <- predict(heart_model_knn, heart_preprocessed_test[, 1:13],type = "prob")
predictions_glm <- predict(heart_model_glm, heart_preprocessed_test[, 1:13],type = "prob")
predictions_rf <- predict(heart_model_rf, heart_preprocessed_test[, 1:13],type = "prob")



# Add ROC curves for each model to the plot
roc_curve_lda <- roc(heart_preprocessed_test$target, predictions_lda$neg)
## Setting levels: control = neg, case = pos

## Setting direction: controls > cases
roc_curve_cart <- roc(heart_preprocessed_test$target, predictions_cart$neg)
## Setting levels: control = neg, case = pos
## Setting direction: controls > cases
roc_curve_knn <- roc(heart_preprocessed_test$target, predictions_knn$neg)
## Setting levels: control = neg, case = pos
## Setting direction: controls > cases
roc_curve_glm <- roc(heart_preprocessed_test$target, predictions_glm$neg)
## Setting levels: control = neg, case = pos
## Setting direction: controls > cases
roc_curve_rf <- roc(heart_preprocessed_test$target, predictions_rf$neg)
## Setting levels: control = neg, case = pos
## Setting direction: controls > cases
# Plot each ROC curve with a different color
plot(roc_curve_lda, main = "ROC curve for LDA", print.auc = TRUE,
      print.auc.x = 0.6, print.auc.y = 0.6, col = "red", lwd = 2.5)

plot(roc_curve_cart, main = "ROC curve for CART", print.auc = TRUE,
      print.auc.x = 0.6, print.auc.y = 0.6, col = "blue", lwd = 2.5)

plot(roc_curve_knn, main = "ROC curve for KNN", print.auc = TRUE,
      print.auc.x = 0.6, print.auc.y = 0.6, col = "green", lwd = 2.5)

plot(roc_curve_glm,main = "ROC curve for GLM", print.auc = TRUE,
     print.auc.x = 0.6, print.auc.y = 0.6, col = "purple")

plot(roc_curve_rf, main = "ROC curve for RF", print.auc = TRUE,
      print.auc.x = 0.6, print.auc.y = 0.6, col = "orange", lwd = 2.5)

Display the Results —-

## 1. Table Summary ----
# This is the simplest comparison. It creates a table with one model per row
# and its corresponding evaluation metrics displayed per column.

summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: LDA, CART, KNN, GLM, RF 
## Number of resamples: 30 
## 
## Accuracy 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## LDA  0.7297297 0.7894737 0.8421053 0.8394974 0.8940256 0.9736842    0
## CART 0.6315789 0.7153272 0.7631579 0.7606449 0.8054765 0.8947368    0
## KNN  0.7567568 0.8157895 0.8421053 0.8536510 0.8918919 0.9736842    0
## GLM  0.6756757 0.7948080 0.8421053 0.8403272 0.8918919 0.9736842    0
## RF   0.7894737 0.8926031 0.9459459 0.9271693 0.9736842 1.0000000    0
## 
## Kappa 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## LDA  0.4574780 0.5789474 0.6842105 0.6785991 0.7880117 0.9473684    0
## CART 0.2631579 0.4305556 0.5263158 0.5209092 0.6086544 0.7894737    0
## KNN  0.5138686 0.6315789 0.6842105 0.7072748 0.7833075 0.9473684    0
## GLM  0.3489736 0.5888497 0.6842105 0.6802454 0.7834671 0.9473684    0
## RF   0.5789474 0.7850877 0.8919706 0.8542510 0.9473684 1.0000000    0
## 2. Box and Whisker Plot ----
# This is useful for visually observing the spread of the estimated accuracies
# for different algorithms and how they relate.

scales <- list(x = list(relation = "free"), y = list(relation = "free"))
bwplot(results, scales = scales)

## 3. Dot Plots ----
# They show both the mean estimated accuracy as well as the 95% confidence
# interval (e.g. the range in which 95% of observed scores fell).

scales <- list(x = list(relation = "free"), y = list(relation = "free"))
dotplot(results, scales = scales)

## 4. Scatter Plot Matrix ----
# This is useful when considering whether the predictions from two
# different algorithms are correlated. If weakly correlated, then they are good
# candidates for being combined in an ensemble prediction.

splom(results)

## 5. Pairwise xyPlots ----
# You can zoom in on one pairwise comparison of the accuracy of trial-folds for
# two models using an xyplot.

# xyplot plots to compare models
xyplot(results, models = c("LDA", "RF"))

# or
# xyplot plots to compare models
xyplot(results, models = c("GLM", "CART"))

Statistical Significance Tests —-

# Explanation:
# - Upper diagonal: Estimates of the difference in accuracy and kappa values between models.
# - Lower diagonal: P-values for testing the hypothesis that the difference is zero.

# Accuracy:
# - Positive values indicate higher accuracy in the row model compared to the column model.
# - Negative values indicate lower accuracy.

# Kappa:
# - Positive values indicate higher kappa values in the row model compared to the column model.
# - Negative values indicate lower kappa values.

# P-values:
# - Assess the statistical significance of the observed differences.
# - Values below the significance level (e.g., 0.05) suggest a significant difference.
# - Bonferroni adjustment is applied for multiple comparisons.
diffs <- diff(results)

summary(diffs)
## 
## Call:
## summary.diff.resamples(object = diffs)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## Accuracy 
##      LDA       CART       KNN        GLM        RF        
## LDA             0.0788525 -0.0141536 -0.0008298 -0.0876719
## CART 3.424e-06            -0.0930062 -0.0796823 -0.1665244
## KNN  1         3.648e-08              0.0133239 -0.0735183
## GLM  1         5.249e-06  1                     -0.0868421
## RF   6.539e-09 1.971e-14  3.946e-08  2.258e-08            
## 
## Kappa 
##      LDA       CART      KNN       GLM       RF       
## LDA             0.157690 -0.028676 -0.001646 -0.175652
## CART 3.483e-06           -0.186366 -0.159336 -0.333342
## KNN  1         3.509e-08            0.027029 -0.146976
## GLM  1         5.309e-06 1                   -0.174006
## RF   6.239e-09 1.998e-14 4.005e-08 2.193e-08

Milestone 4: Hyper-Parameter Tuning and Ensemble (lab 9-11)

Apply a “Random Search” to identify the best parameter value —-

# Define train control settings for a random search
train_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3, search = "random")

# Set a seed for reproducibility
set.seed(7)

# Perform a random search for Random Forest hyperparameters
heart_model_random_search_rf <- train(target ~ ., data = heart_preprocessed_train, method = "rf",
                                      metric = "Accuracy",
                                      tuneLength = 12,  # Search 12 options for the value of mtry
                                      trControl = train_control)

# Display the details of the random search model
print(heart_model_random_search_rf)
## Random Forest 
## 
## 376 samples
##  13 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 338, 338, 338, 338, 338, 339, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9228070  0.8455357
##    3    0.9244903  0.8489114
##    7    0.9156472  0.8313085
##    8    0.9174016  0.8347541
##   12    0.9076814  0.8152699
##   15    0.9120910  0.8240880
##   18    0.9085586  0.8170138
##   19    0.9076814  0.8152805
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
# Plot the results of the random search
plot(heart_model_random_search_rf)

Bagged CART —-

## 2.a. Bagged CART ----
set.seed(7)
heart_model_bagged_cart <- train(target ~ ., data = heart_preprocessed_train, method = "treebag",
                               metric = "Accuracy",
                               trControl = train_control)

## 2.b. Random Forest ----
set.seed(7)
heart_model_rf <- train(target ~ ., data = heart_preprocessed_train, method = "rf",
                        metric = "Accuracy", trControl = train_control)


# Summarize results
bagging_results <-
  resamples(list("Bagged Decision Tree" = heart_model_bagged_cart,
                 "Random Forest" = heart_model_rf))

summary(bagging_results)
## 
## Call:
## summary.resamples(object = bagging_results)
## 
## Models: Bagged Decision Tree, Random Forest 
## Number of resamples: 30 
## 
## Accuracy 
##                           Min.   1st Qu.    Median      Mean   3rd Qu. Max.
## Bagged Decision Tree 0.7837838 0.8657539 0.8947368 0.8978663 0.9473684    1
## Random Forest        0.7631579 0.8750000 0.9210526 0.9174727 0.9665718    1
##                      NA's
## Bagged Decision Tree    0
## Random Forest           0
## 
## Kappa 
##                           Min.   1st Qu.    Median      Mean   3rd Qu. Max.
## Bagged Decision Tree 0.5685131 0.7322455 0.7894737 0.7956850 0.8947368    1
## Random Forest        0.5263158 0.7500000 0.8421053 0.8348447 0.9330546    1
##                      NA's
## Bagged Decision Tree    0
## Random Forest           0
dotplot(bagging_results)

# We then print the details of the model that has been created
print(heart_model_rf$finalModel)
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 7.18%
## Confusion matrix:
##     neg pos class.error
## neg 174  12  0.06451613
## pos  15 175  0.07894737

Test the Model —-

# We can test the model
set.seed(9)
predictions <- predict(heart_model_rf, newdata = heart_preprocessed_test)
confusionMatrix(predictions, heart_preprocessed_test$target)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg  54   4
##        pos   7  59
##                                           
##                Accuracy : 0.9113          
##                  95% CI : (0.8468, 0.9549)
##     No Information Rate : 0.5081          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8224          
##                                           
##  Mcnemar's Test P-Value : 0.5465          
##                                           
##             Sensitivity : 0.8852          
##             Specificity : 0.9365          
##          Pos Pred Value : 0.9310          
##          Neg Pred Value : 0.8939          
##              Prevalence : 0.4919          
##          Detection Rate : 0.4355          
##    Detection Prevalence : 0.4677          
##       Balanced Accuracy : 0.9109          
##                                           
##        'Positive' Class : neg             
## 

Save and Load your Model —-

# Saving a model into a file allows you to load it later and use it to make
# predictions. Saved models can be loaded by calling the `readRDS()` function

saveRDS(heart_model_rf, "../models/saved_heart_model_rf.rds")
# The saved model can then be loaded later as follows:
loaded_heart_model_rf <- readRDS("../models/saved_heart_model_rf.rds")
print(loaded_heart_model_rf)
## Random Forest 
## 
## 376 samples
##  13 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 338, 338, 338, 338, 338, 339, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9174727  0.8348447
##    7    0.9156709  0.8313072
##   15    0.9103604  0.8206412
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
predictions_with_loaded_model <-
  predict(loaded_heart_model_rf, newdata = heart_preprocessed_test)
confusionMatrix(predictions_with_loaded_model, heart_preprocessed_test$target)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg  54   4
##        pos   7  59
##                                           
##                Accuracy : 0.9113          
##                  95% CI : (0.8468, 0.9549)
##     No Information Rate : 0.5081          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8224          
##                                           
##  Mcnemar's Test P-Value : 0.5465          
##                                           
##             Sensitivity : 0.8852          
##             Specificity : 0.9365          
##          Pos Pred Value : 0.9310          
##          Neg Pred Value : 0.8939          
##              Prevalence : 0.4919          
##          Detection Rate : 0.4355          
##    Detection Prevalence : 0.4677          
##       Balanced Accuracy : 0.9109          
##                                           
##        'Positive' Class : neg             
## 

Creating Functions in R —-

# Plumber requires functions, an example of the syntax for creating a function
# in R is:

name_of_function <- function(arg) {
  # Do something with the argument called `arg`
}

Make Predictions on New Data using the Saved Model —-

# We can also create and use our own data frame as follows:
# Convert categorical variables to factors


to_be_predicted <-
  data.frame(age=70, sex=1, cp=0, trestbps=145, chol=174,
             fbs=0, restecg=1, thalach=125, exang=1, oldpeak=2.6,
             slope=0,
             ca=0, thal=3)


to_be_predicted$sex <- as.factor(to_be_predicted$sex)
to_be_predicted$cp <- as.factor(to_be_predicted$cp)
to_be_predicted$fbs <- as.factor(to_be_predicted$fbs)
to_be_predicted$restecg <- as.factor(to_be_predicted$restecg)
to_be_predicted$exang <- as.factor(to_be_predicted$exang)
to_be_predicted$slope <- as.factor(to_be_predicted$slope)
to_be_predicted$thal <- as.factor(to_be_predicted$thal)
# We then use the data frame to make predictions
predict(loaded_heart_model_rf, newdata = to_be_predicted)
## [1] neg
## Levels: neg pos

STEP 8. Make predictions using the model through a function —-

# An alternative is to create a function and then use the function to make
# predictions

predict_target <- function(arg_age, arg_sex, arg_cp, arg_trestbps, arg_chol,
                           arg_fbs, arg_restecg, arg_thalach, arg_exang, arg_oldpeak, arg_slope,
                           arg_ca, arg_thal) {
  # Convert categorical variables to factors
  arg_sex <- as.factor(arg_sex)
  arg_cp <- as.factor(arg_cp)
  arg_fbs <- as.factor(arg_fbs)
  arg_restecg <- as.factor(arg_restecg)
  arg_exang <- as.factor(arg_exang)
  arg_slope <- as.factor(arg_slope)
  arg_thal <- as.factor(arg_thal)
  
  # Create a data frame using the arguments
  to_be_predicted <- data.frame(age = arg_age, sex = arg_sex, cp = arg_cp, trestbps = arg_trestbps,
                                chol = arg_chol, fbs = arg_fbs, restecg = arg_restecg,
                                thalach = arg_thalach, exang = arg_exang, oldpeak = arg_oldpeak,
                                slope = arg_slope, ca = arg_ca, thal = arg_thal)
  
  # Make a prediction based on the data frame
  predict(loaded_heart_model_rf, newdata = to_be_predicted)
}

Milestone 5: Consolidation

Plumber API

# STEP 1. Install and Load the Required Packages ----
## plumber ----
if (require("plumber")) {
  require("plumber")
} else {
  install.packages("plumber", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: plumber
## caret ----
if (require("caret")) {
  require("caret")
} else {
  install.packages("caret", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}


###LAB 12
# This requires the "plumber" package that was installed and loaded earlier in
# STEP 1. The commenting below makes R recognize the code as the definition of
# an API, i.e., #* comments.

loaded_heart_model_rf <- readRDS("../models/saved_heart_model_rf.rds")

#* @apiTitle CVDetect Prediction Model API

#* @apiDescription Used to predict whether a patient has heart disease or not.

#* @param arg_age Age of the patient
#* @param arg_sex Gender of the patient (0: Female, 1: Male)
#* @param arg_cp Chest pain type (0-3)
#* @param arg_trestbps Resting blood pressure (mm Hg)
#* @param arg_chol Serum cholesterol (mg/dl)
#* @param arg_fbs Fasting blood sugar (> 120 mg/dl, 0: No, 1: Yes)
#* @param arg_restecg Resting electrocardiographic results (0-2)
#* @param arg_thalach Maximum heart rate achieved
#* @param arg_exang Exercise-induced angina (0: No, 1: Yes)
#* @param arg_oldpeak ST depression induced by exercise relative to rest
#* @param arg_slope Slope of the peak exercise ST segment (0-2)
#* @param arg_ca Number of major vessels colored by fluoroscopy (0-3)
#* @param arg_thal Thallium stress test result (3: Normal, 6: Fixed defect, 7: Reversible defect)


#* @get /target

predict_diabetes <-
  function(arg_age, arg_sex, arg_cp, arg_trestbps, arg_chol,
           arg_fbs, arg_restecg, arg_thalach, arg_exang, arg_oldpeak, arg_slope,
           arg_ca, arg_thal) {
    # Create a data frame using the arguments
    to_be_predicted <- data.frame(
      age = as.numeric(arg_age),
      sex = as.factor(arg_sex),
      cp = as.factor(arg_cp),
      trestbps = as.numeric(arg_trestbps),
      chol = as.numeric(arg_chol),
      fbs = as.factor(arg_fbs),
      restecg = as.factor(arg_restecg),
      thalach = as.numeric(arg_thalach),
      exang = as.factor(arg_exang),
      oldpeak = as.numeric(arg_oldpeak),
      slope = as.factor(arg_slope),
      ca = as.numeric(arg_ca),
      thal = as.factor(arg_thal)
    )
    
    # Make a prediction based on the data frame
    predict(loaded_heart_model_rf, newdata = to_be_predicted)
  }

##Run Plumber

# STEP 1. Install and load the required packages ----
## plumber ----
if (require("plumber")) {
  require("plumber")
} else {
  install.packages("plumber", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}

# STEP 2. Process a Plumber API ----
# This allows us to process a plumber API
#api <- plumber::plumb("lab12.R")

# STEP 3. Run the API on a specific port ----
# Specify a constant localhost port to use
#api$run(host = "127.0.0.1", port = 5022)

PHP output

# Heart Disease Prediction Web Form


cat('<iframe src="http://localhost/lab15/Lab15-submission-ConsumePlumberAPIOutput.php" width="800" height="600"></iframe>')