Heart Disease Prediction Report

Myriam Aicha Mbongo 10/16/2023

CARDIOVASCULAR DISEASE PREDICTION PROJECT

Student Details

 134141 - Aicha Mbongo | |

Course Code Course Name Program Semester Duration
BBT4206 Business Intelligence II Bachelor of Business Information Technology 21st August 2023 to 28th November 2023

STEP 1. Install and Load the Required Packages —-

# STEP 1. Install and Load the Required Packages ----
# The following packages should be installed and loaded before proceeding to the
# subsequent steps.

## readr ----
if (require("readr")) {
  require("readr")
} else {
  install.packages("readr", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: readr
## caret ----
if (require("caret")) {
  require("caret")
} else {
  install.packages("caret", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: caret

## Loading required package: ggplot2

## Loading required package: lattice
## e1071 ----
if (require("e1071")) {
  require("e1071")
} else {
  install.packages("e1071", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: e1071
## factoextra ----
if (require("factoextra")) {
  require("factoextra")
} else {
  install.packages("factoextra", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: factoextra

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## FactoMineR ----
if (require("FactoMineR")) {
  require("FactoMineR")
} else {
  install.packages("FactoMineR", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: FactoMineR
if (!is.element("NHANES", installed.packages()[, 1])) {
  install.packages("NHANES", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("NHANES")
## Loading required package: NHANES
## dplyr ----
if (!is.element("dplyr", installed.packages()[, 1])) {
  install.packages("dplyr", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("dplyr")
## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## naniar ----
# Documentation:
#   https://cran.r-project.org/package=naniar or
#   https://www.rdocumentation.org/packages/naniar/versions/1.0.0
if (!is.element("naniar", installed.packages()[, 1])) {
  install.packages("naniar", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("naniar")
## Loading required package: naniar
## ggplot2 ----
# We require the "ggplot2" package to create more appealing visualizations
if (!is.element("ggplot2", installed.packages()[, 1])) {
  install.packages("ggplot2", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("ggplot2")

## MICE ----
# We use the MICE package to perform data imputation
if (!is.element("mice", installed.packages()[, 1])) {
  install.packages("mice", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("mice")
## Loading required package: mice

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind
## Amelia ----
if (!is.element("Amelia", installed.packages()[, 1])) {
  install.packages("Amelia", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("Amelia")
## Loading required package: Amelia

## Loading required package: Rcpp

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

Milestone 1 : Exploratory Data Analysis (Lab 1 and 2)

STEP 3. Load the downloaded sample datasets

library(readr)
heart <- read_csv(
  "../data/heart.csv",
  col_types = cols(
    age = col_double(),
    sex = col_factor(levels = c("0", "1")),
    cp = col_factor(levels = c("0", "1", "2", "3")),
    trestbps = col_double(),
    chol = col_double(),
    fbs = col_factor(levels = c("0", "1")),
    restecg = col_factor(levels = c("0", "1", "2")),
    thalach = col_double(),
    exang = col_factor(levels = c("0", "1")),
    oldpeak = col_double(),
    slope = col_factor(levels = c("0", "1", "2")),
    ca = col_double(),
    thal = col_factor(levels = c("0", "1", "2", "3")),
    target = col_factor(levels = c("neg", "pos"))
  )
)

#View(heart)

STEP 3a. Preview the Loaded Datasets, Identify the Data Types —-

### STEP 3a. Pre#View the Loaded Datasets, Identify the Data Types  ----
# Dimensions refer to the number of observations (rows) and the number of
# attributes/variables/features (columns).
#Understanding data types is key for effective analysis.It helps choose suitable visualizations and algorithms,  
#and highlights the need for conversions between categorical and numerical data when necessary.


dim(heart)
## [1] 1025   14
sapply(heart, class)
##       age       sex        cp  trestbps      chol       fbs   restecg   thalach 
## "numeric"  "factor"  "factor" "numeric" "numeric"  "factor"  "factor" "numeric" 
##     exang   oldpeak     slope        ca      thal    target 
##  "factor" "numeric"  "factor" "numeric"  "factor"  "factor"

STEP 3b. Identify the number of instances that belong to each class. —-

# It is more sensible to count categorical variables (factors or dimensions)
# than numeric variables, e.g., counting the number of male and female
# participants instead of counting the frequency of each participant’s height.

heart_freq <- heart$target
cbind(frequency = table(heart_freq),
      percentage = prop.table(table(heart_freq)) * 100)
##     frequency percentage
## neg       499   48.68293
## pos       526   51.31707

STEP 3c. Measures of Central Tendency(Calculate the mode ) —-

# We, therefore, must manually create a function that can calculate the mode.

heart_target_mode <- names(table(heart$target))[
  which(table(heart$target) == max(table(heart$target)))
]
print(heart_target_mode)
## [1] "pos"

STEP 3d. Measure the distribution of the data for each variable —-

summary(heart)
##       age        sex     cp         trestbps          chol     fbs     restecg
##  Min.   :29.00   0:312   0:497   Min.   : 94.0   Min.   :126   0:872   0:497  
##  1st Qu.:48.00   1:713   1:167   1st Qu.:120.0   1st Qu.:211   1:153   1:513  
##  Median :56.00           2:284   Median :130.0   Median :240           2: 15  
##  Mean   :54.43           3: 77   Mean   :131.6   Mean   :246                  
##  3rd Qu.:61.00                   3rd Qu.:140.0   3rd Qu.:275                  
##  Max.   :77.00                   Max.   :200.0   Max.   :564                  
##     thalach      exang      oldpeak      slope         ca         thal   
##  Min.   : 71.0   0:680   Min.   :0.000   0: 74   Min.   :0.0000   0:  7  
##  1st Qu.:132.0   1:345   1st Qu.:0.000   1:482   1st Qu.:0.0000   1: 64  
##  Median :152.0           Median :0.800   2:469   Median :0.0000   2:544  
##  Mean   :149.1           Mean   :1.072           Mean   :0.7541   3:410  
##  3rd Qu.:166.0           3rd Qu.:1.800           3rd Qu.:1.0000          
##  Max.   :202.0           Max.   :6.200           Max.   :4.0000          
##  target   
##  neg:499  
##  pos:526  
##           
##           
##           
## 

STEP 3e. Measure the standard deviation of each variable —-

# calculate the standard deviation of only columns that are numeric, thus
# leaving out the columns termed as “factors” (categorical) or those that have
# a string data type.

sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)], sd)
##       age  trestbps      chol   thalach   oldpeak        ca 
##  9.072290 17.516718 51.592510 23.005724  1.175053  1.030798
#or
sapply(heart[, c(1, 4, 5, 8, 10, 12)], sd)
##       age  trestbps      chol   thalach   oldpeak        ca 
##  9.072290 17.516718 51.592510 23.005724  1.175053  1.030798

STEP 3f. Measure the kurtosis of each variable —-

# The Kurtosis informs you of how often outliers occur in the results.
# There are different formulas for calculating kurtosis.
# Specifying “type = 2” allows us to use the 2nd formula which is the same
# kurtosis formula used in SPSS and SAS.

# In “type = 2” (used in SPSS and SAS):
# 1.    Kurtosis < 3 implies a low number of outliers
# 2.    Kurtosis = 3 implies a medium number of outliers
# 3.    Kurtosis > 3 implies a high number of outliers

if (!is.element("e1071", installed.packages()[, 1])) {
  install.packages("e1071", dependencies = TRUE)
}
require("e1071")

sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  kurtosis, type = 2)
##         age    trestbps        chol     thalach     oldpeak          ca 
## -0.52561781  0.99122074  3.99680305 -0.08882249  1.31447089  0.70112287

STEP 3g. Measure the skewness of each variable—-

# The skewness informs you of the asymmetry of the distribution of results.
# Using “type = 2” can be interpreted as:

# 1.    Skewness between -0.4 and 0.4 (inclusive) implies that there is no skew
# in the distribution of results; the distribution of results is symmetrical;
# it is a normal distribution.
# 2.    Skewness above 0.4 implies a positive skew; a right-skewed distribution.
# 3.    Skewness below -0.4 implies a negative skew; a left-skewed distribution.

sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  skewness, type = 2)
##        age   trestbps       chol    thalach    oldpeak         ca 
## -0.2488659  0.7397682  1.0740728 -0.5137772  1.2108994  1.2611886

STEP 3h. Measure the skewness of each variable—-

# Note that the covariance and the correlation are computed for numeric values
# only, not categorical values.

heart_cov <- cov(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)])
#View(heart_cov)

STEP 3i. Measure the correlation between variables —-

heart_cor <- cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)])
#View(heart_cor)

STEP 3j. Inferential Statistics —-

# One-Way ANOVA can be used to test the effect of the 3 types of fertilizer on
# crop yield whereas,
# Two-Way ANOVA can be used to test the effect of the 3 types of fertilizer and
# the 2 types of planting density on crop yield.
heart_one_way_anova <- aov(trestbps ~ age, data = heart)
summary(heart_one_way_anova)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## age            1  23096   23096   81.16 <2e-16 ***
## Residuals   1023 291104     285                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The ANOVA rejects the null hypothesis,The ANOVA indicates a significant difference in resting blood pressure among age groups 
#(F(1, 1023) = 81.16, p < 2e-16), highlighting age as a key factor 
#in determining blood pressure. 
#This aligns with cardiovascular knowledge, correlating increased age with a higher risk of cardiovascular disease.

heart_two_way_anova <- aov(trestbps ~ exang + ca, # nolint
                                           data = heart)
summary(heart_two_way_anova)
##               Df Sum Sq Mean Sq F value  Pr(>F)   
## exang          1   1177  1176.7    3.88 0.04914 * 
## ca             1   3050  3050.2   10.06 0.00156 **
## Residuals   1022 309973   303.3                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#both variables, exercise-induced angina and the number of major vessels, are associated with statistically significant differences in 
#resting blood pressure.

Univariate Plots —-

STEP 3k. Create Histograms for Each Numeric Attribute —-

# Histograms help in determining whether an attribute has a Gaussian
# distribution. They can also be used to identify the presence of outliers.



par(mfrow = c(1, 3))

for (i in c(1, 4, 5)) {
  heart_variable <- as.numeric(unlist(heart[, i]))
  hist(heart_variable, main = names(heart)[i])
}

heart_health_variable  <- as.numeric( unlist(heart[, 8]))
hist(heart_health_variable , main = names(heart)[8])

heart_health_variable  <- as.numeric( unlist(heart [, 10]))
hist(heart_health_variable , main = names(heart)[10])

heart_health_variable  <- as.numeric( unlist(heart [, 12]))
hist(heart_health_variable , main = names(heart)[12])

STEP 3l. Create Box and Whisker Plots for Each Numeric Attribute —-

# Box and whisker plots are useful in understanding the distribution of data.

par(mfrow = c(1, 3))
for (i in c(1, 4, 5)) {
  boxplot(heart[, i], main = names(heart)[i])
}

boxplot(heart[, 8], main = names(heart)[8])
boxplot(heart[, 10], main = names(heart)[10])
boxplot(heart[, 12], main = names(heart)[12])

STEP 3m. Create Bar Plots for Each Categorical Attribute —-

# Categorical attributes (factors) can also be visualized. This is done using a
# bar chart to give an idea of the proportion of instances that belong to each
# category.

barplot(table(heart[, 2]), main = names(heart)[2])

barplot(table(heart[, 3]), main = names(heart)[3])

barplot(table(heart[, 6]), main = names(heart)[6])

barplot(table(heart[, 7]), main = names(heart)[7])

barplot(table(heart[, 9]), main = names(heart)[9])

barplot(table(heart[, 11]), main = names(heart)[11])

barplot(table(heart[, 13]), main = names(heart)[13])

barplot(table(heart[, 14]), main = names(heart)[14])

STEP 3n. Create a Missingness Map to Identify Missing Data —-

# Execute the following to create a map to identify the missing data in each
# dataset:
if (!is.element("Amelia", installed.packages()[, 1])) {
  install.packages("Amelia", dependencies = TRUE)
}
require("Amelia")
#comment
missmap(heart, col = c("red", "grey"), legend = TRUE)

Multivariate Plots —-

STEP 3o. Create a Correlation Plot —-

# Correlation plots can be used to get an idea of which attributes change
# together. The function “corrplot()” found in the package “corrplot” is
# required to perform this. The larger the dot in the correlation plot, the
# larger the correlation. Blue represents a positive correlation whereas red
# represents a negative correlation.

if (!is.element("corrplot", installed.packages()[, 1])) {
  install.packages("corrplot", dependencies = TRUE)
}
require("corrplot")
## Loading required package: corrplot

## corrplot 0.92 loaded
corrplot(cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)]), method = "circle")

#heart <- heart[, -which(names(heart) == "target_numeric")]

# Alternatively, the 'ggcorrplot::ggcorrplot()' function can be used to plot a
# more visually appealing plot.
# The code below shows how to install a package in R:
if (!is.element("ggcorrplot", installed.packages()[, 1])) {
  install.packages("ggcorrplot", dependencies = TRUE)
}
require("ggcorrplot")
## Loading required package: ggcorrplot
ggcorrplot(cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)]))

STEP 3p. Create a Scatter Plot —-

pairs(heart)

# Alternatively, the ggcorrplot package can be used to make the plots more
# appealing:
ggplot(heart,
       aes(x = age, y = sex, shape = target, color = target)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula = 'y ~ x'

Milestone 2: Preprocessing and Data Transformation (Lab 3 and 4) —-

STEP 4. Create a subset of the variables/features —-

### Subset of rows ----
# We then select 500 random observations to be included in the dataset
rand_ind <- sample(seq_len(nrow(heart)), 500)
heart <- heart[rand_ind, ]

STEP 4a. Confirm the “missingness” in the Dataset before Imputation —-

# Are there missing values in the dataset?
any_na(heart)
## [1] FALSE
# How many?
n_miss(heart)
## [1] 0
# What is the percentage of missing data in the entire dataset?
prop_miss(heart)
## [1] 0
# How many missing values does each variable have?
heart %>% is.na() %>% colSums()
##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        0        0        0        0        0        0        0        0 
##    exang  oldpeak    slope       ca     thal   target 
##        0        0        0        0        0        0
# What is the number and percentage of missing values grouped by
# each variable?
miss_var_summary(heart)
## # A tibble: 14 × 3
##    variable n_miss pct_miss
##    <chr>     <int>    <dbl>
##  1 age           0        0
##  2 sex           0        0
##  3 cp            0        0
##  4 trestbps      0        0
##  5 chol          0        0
##  6 fbs           0        0
##  7 restecg       0        0
##  8 thalach       0        0
##  9 exang         0        0
## 10 oldpeak       0        0
## 11 slope         0        0
## 12 ca            0        0
## 13 thal          0        0
## 14 target        0        0
# What is the number and percentage of missing values grouped by
# each observation?
miss_case_summary(heart)
## # A tibble: 500 × 3
##     case n_miss pct_miss
##    <int>  <int>    <dbl>
##  1     1      0        0
##  2     2      0        0
##  3     3      0        0
##  4     4      0        0
##  5     5      0        0
##  6     6      0        0
##  7     7      0        0
##  8     8      0        0
##  9     9      0        0
## 10    10      0        0
## # ℹ 490 more rows
# Which variables contain the most missing values?
#gg_miss_var(heart)

# Where are missing values located (the shaded regions in the plot)?
#vis_miss(heart) + theme(axis.text.x = element_text(angle = 80))

# Which combinations of variables are missing together?
#gg_miss_upset(heart)

# Create a heatmap of "missingness" broken down by "target"
# First, confirm that the "target" variable is a categorical variable
is.factor(heart$target)
## [1] TRUE
# Second, create the visualization
#gg_miss_fct(heart, fct = target)

# We can also create a heatmap of "missingness" broken down by "exang"
# First, confirm that the "exang" variable is a categorical variable

is.factor(heart$exang)
## [1] TRUE
# Second, create the visualization
#gg_miss_fct(heart, fct = exang)

STEP 4b. Apply a Scale Data Transform —-

# Scaling Data Transformation Purpose:
# The scaling data transformation is performed to standardize or normalize
# the numeric features in a dataset. It ensures that different features
# with varying scales are brought to a common scale, preventing one feature
# from dominating the others during model training.
#
# Scaling is essential for algorithms that are sensitive to the magnitude
# of input features, such as distance-based methods (e.g., k-Nearest Neighbors)
# or gradient descent-based optimization algorithms (e.g., Support Vector Machines).
#
# By scaling the data, we make the features comparable and contribute equally
# to the model's learning process, improving the stability and performance of
# machine learning models.


# Summary of each variable
summary(heart)
##       age        sex     cp         trestbps          chol       fbs    
##  Min.   :29.00   0:151   0:241   Min.   : 94.0   Min.   :131.0   0:432  
##  1st Qu.:47.00   1:349   1: 85   1st Qu.:120.0   1st Qu.:212.0   1: 68  
##  Median :56.00           2:147   Median :130.0   Median :243.0          
##  Mean   :54.49           3: 27   Mean   :131.5   Mean   :247.4          
##  3rd Qu.:61.00                   3rd Qu.:140.0   3rd Qu.:278.0          
##  Max.   :77.00                   Max.   :200.0   Max.   :564.0          
##  restecg    thalach      exang      oldpeak      slope         ca       
##  0:244   Min.   : 71.0   0:337   Min.   :0.000   0: 29   Min.   :0.000  
##  1:251   1st Qu.:132.0   1:163   1st Qu.:0.000   1:241   1st Qu.:0.000  
##  2:  5   Median :151.5           Median :0.800   2:230   Median :0.000  
##          Mean   :148.4           Mean   :1.059           Mean   :0.748  
##          3rd Qu.:165.2           3rd Qu.:1.800           3rd Qu.:1.000  
##          Max.   :202.0           Max.   :6.200           Max.   :4.000  
##  thal    target   
##  0:  4   neg:243  
##  1: 25   pos:257  
##  2:265            
##  3:206            
##                   
## 
# BEFORE

heart_health_variable <- as.numeric( unlist(heart [, 1]))
hist(heart_health_variable , main = names(heart_health_variable )[1])

heart_health_variable  <- as.numeric( unlist(heart [, 4]))
hist(heart_health_variable , main = names(heart)[4])

heart_health_variable  <- as.numeric( unlist(heart [, 5]))
hist(heart_health_variable , main = names(heart)[5])

heart_health_variable  <- as.numeric( unlist(heart [, 8]))
hist(heart_health_variable , main = names(heart)[8])

heart_health_variable <- as.numeric( unlist(heart [, 10]))
hist(heart_health_variable , main = names(heart)[10])

heart_health_variable  <- as.numeric( unlist(heart [, 12]))
hist(heart_health_variable , main = names(heart)[12])

model_of_the_transform <- preProcess(heart, method = c("scale"))
print(model_of_the_transform)
## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - ignored (8)
##   - scaled (6)
heart_scale_transform <- predict(model_of_the_transform,
                                               heart)


# AFTER 
#1, 4, 5, 8, 10, 12

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 1]))
hist(heart_health_variable , main = names(heart_scale_transform)[1])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 4]))
hist(heart_health_variable , main = names(heart_scale_transform)[4])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 5]))
hist(heart_health_variable , main = names(heart_scale_transform)[5])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 8]))
hist(heart_health_variable , main = names(heart_scale_transform)[8])

heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 10]))
hist(heart_health_variable , main = names(heart_scale_transform)[10])

heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 12]))
hist(heart_health_variable , main = names(heart_scale_transform)[12])

View(heart_scale_transform)

STEP 4c. Apply a Center Data Transform —-

# Centering Data Transformation Purpose:
# Shifts numeric features by subtracting their mean,
# ensuring a centered distribution around zero.
# Useful for algorithms sensitive to variable means,
# promoting better model interpretability.



# Summary of each variable
summary(heart)
##       age        sex     cp         trestbps          chol       fbs    
##  Min.   :29.00   0:151   0:241   Min.   : 94.0   Min.   :131.0   0:432  
##  1st Qu.:47.00   1:349   1: 85   1st Qu.:120.0   1st Qu.:212.0   1: 68  
##  Median :56.00           2:147   Median :130.0   Median :243.0          
##  Mean   :54.49           3: 27   Mean   :131.5   Mean   :247.4          
##  3rd Qu.:61.00                   3rd Qu.:140.0   3rd Qu.:278.0          
##  Max.   :77.00                   Max.   :200.0   Max.   :564.0          
##  restecg    thalach      exang      oldpeak      slope         ca       
##  0:244   Min.   : 71.0   0:337   Min.   :0.000   0: 29   Min.   :0.000  
##  1:251   1st Qu.:132.0   1:163   1st Qu.:0.000   1:241   1st Qu.:0.000  
##  2:  5   Median :151.5           Median :0.800   2:230   Median :0.000  
##          Mean   :148.4           Mean   :1.059           Mean   :0.748  
##          3rd Qu.:165.2           3rd Qu.:1.800           3rd Qu.:1.000  
##          Max.   :202.0           Max.   :6.200           Max.   :4.000  
##  thal    target   
##  0:  4   neg:243  
##  1: 25   pos:257  
##  2:265            
##  3:206            
##                   
## 
# BEFORE

heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 1]))
boxplot(heart_health_variable , main = names(heart_scale_transform )[1])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 4]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[4])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 5]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[5])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 8]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[8])

heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 10]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[10])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 12]))
boxplot(heart_health_variable , main = names(heart_scale_transform)[12])

model_of_the_transform <- preProcess(heart_scale_transform, method = c("center"))
print(model_of_the_transform)
## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - centered (6)
##   - ignored (8)
heart_center_transform <- predict(model_of_the_transform,
                                  heart_scale_transform)


# AFTER 
#1, 4, 5, 8, 10, 12

heart_scale_transform  <- as.numeric( unlist(heart_center_transform [, 1]))
boxplot(heart_health_variable , main = names(heart_center_transform)[1])

heart_scale_transform <- as.numeric( unlist(heart_center_transform [, 4]))
boxplot(heart_health_variable , main = names(heart_center_transform)[4])

heart_scale_transform  <- as.numeric( unlist(heart_center_transform [, 5]))
boxplot(heart_health_variable , main = names(heart_center_transform)[5])

heart_scale_transform  <- as.numeric( unlist(heart_center_transform [, 8]))
boxplot(heart_health_variable , main = names(heart_center_transform)[8])

heart_scale_transform <- as.numeric( unlist(heart_center_transform [, 10]))
boxplot(heart_health_variable , main = names(heart_center_transform)[10])

heart_scale_transform <- as.numeric( unlist(heart_center_transform [, 12]))
boxplot(heart_health_variable , main = names(heart_center_transform)[12])

View(heart_center_transform)

The Standardize Basic Transform on the heart Dataset —-

# BEFORE
#If you've already scaled and centered your data, you have made it comparable and adjusted for differences in means.
#Standardization is typically done to bring variables to a standard normal distribution (mean of 0 and standard deviation 
#of 1).
summary(heart_center_transform)
##       age          sex     cp         trestbps             chol         
##  Min.   :-2.8214   0:151   0:241   Min.   :-2.29240   Min.   :-2.21697  
##  1st Qu.:-0.8292   1:349   1: 85   1st Qu.:-0.70224   1st Qu.:-0.67450  
##  Median : 0.1669           2:147   Median :-0.09064   Median :-0.08417  
##  Mean   : 0.0000           3: 27   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.7203                   3rd Qu.: 0.52096   3rd Qu.: 0.58233  
##  Max.   : 2.4912                   Max.   : 4.19056   Max.   : 6.02859  
##  fbs     restecg    thalach        exang      oldpeak        slope  
##  0:432   0:244   Min.   :-3.3565   0:337   Min.   :-0.9202   0: 29  
##  1: 68   1:251   1st Qu.:-0.7097   1:163   1st Qu.:-0.9202   1:241  
##          2:  5   Median : 0.1364           Median :-0.2252   2:230  
##                  Mean   : 0.0000           Mean   : 0.0000          
##                  3rd Qu.: 0.7330           3rd Qu.: 0.6435          
##                  Max.   : 2.3276           Max.   : 4.4659          
##        ca          thal    target   
##  Min.   :-0.7397   0:  4   neg:243  
##  1st Qu.:-0.7397   1: 25   pos:257  
##  Median :-0.7397   2:265            
##  Mean   : 0.0000   3:206            
##  3rd Qu.: 0.2492                    
##  Max.   : 3.2160
sapply(heart_center_transform[, -c(2, 3, 6, 7, 9, 11, 13, 14)], sd)
##      age trestbps     chol  thalach  oldpeak       ca 
##        1        1        1        1        1        1
model_of_the_transform <- preProcess(heart_center_transform,
                                     method = c("scale", "center"))
print(model_of_the_transform)
## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - centered (6)
##   - ignored (8)
##   - scaled (6)
heart_standardize_transform <- predict(model_of_the_transform, # nolint
                                       heart_center_transform)

# AFTER
summary(heart_standardize_transform)
##       age          sex     cp         trestbps             chol         
##  Min.   :-2.8214   0:151   0:241   Min.   :-2.29240   Min.   :-2.21697  
##  1st Qu.:-0.8292   1:349   1: 85   1st Qu.:-0.70224   1st Qu.:-0.67450  
##  Median : 0.1669           2:147   Median :-0.09064   Median :-0.08417  
##  Mean   : 0.0000           3: 27   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.7203                   3rd Qu.: 0.52096   3rd Qu.: 0.58233  
##  Max.   : 2.4912                   Max.   : 4.19056   Max.   : 6.02859  
##  fbs     restecg    thalach        exang      oldpeak        slope  
##  0:432   0:244   Min.   :-3.3565   0:337   Min.   :-0.9202   0: 29  
##  1: 68   1:251   1st Qu.:-0.7097   1:163   1st Qu.:-0.9202   1:241  
##          2:  5   Median : 0.1364           Median :-0.2252   2:230  
##                  Mean   : 0.0000           Mean   : 0.0000          
##                  3rd Qu.: 0.7330           3rd Qu.: 0.6435          
##                  Max.   : 2.3276           Max.   : 4.4659          
##        ca          thal    target   
##  Min.   :-0.7397   0:  4   neg:243  
##  1st Qu.:-0.7397   1: 25   pos:257  
##  Median :-0.7397   2:265            
##  Mean   : 0.0000   3:206            
##  3rd Qu.: 0.2492                    
##  Max.   : 3.2160
sapply(heart_standardize_transform[, -c(2, 3, 6, 7, 9, 11, 13, 14)], sd)
##      age trestbps     chol  thalach  oldpeak       ca 
##        1        1        1        1        1        1

Box-Cox Transform—-

# BEFORE
summary(heart_standardize_transform)
##       age          sex     cp         trestbps             chol         
##  Min.   :-2.8214   0:151   0:241   Min.   :-2.29240   Min.   :-2.21697  
##  1st Qu.:-0.8292   1:349   1: 85   1st Qu.:-0.70224   1st Qu.:-0.67450  
##  Median : 0.1669           2:147   Median :-0.09064   Median :-0.08417  
##  Mean   : 0.0000           3: 27   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.7203                   3rd Qu.: 0.52096   3rd Qu.: 0.58233  
##  Max.   : 2.4912                   Max.   : 4.19056   Max.   : 6.02859  
##  fbs     restecg    thalach        exang      oldpeak        slope  
##  0:432   0:244   Min.   :-3.3565   0:337   Min.   :-0.9202   0: 29  
##  1: 68   1:251   1st Qu.:-0.7097   1:163   1st Qu.:-0.9202   1:241  
##          2:  5   Median : 0.1364           Median :-0.2252   2:230  
##                  Mean   : 0.0000           Mean   : 0.0000          
##                  3rd Qu.: 0.7330           3rd Qu.: 0.6435          
##                  Max.   : 2.3276           Max.   : 4.4659          
##        ca          thal    target   
##  Min.   :-0.7397   0:  4   neg:243  
##  1st Qu.:-0.7397   1: 25   pos:257  
##  Median :-0.7397   2:265            
##  Mean   : 0.0000   3:206            
##  3rd Qu.: 0.2492                    
##  Max.   : 3.2160
#Calculate the skewness before the Box-Cox transform
sapply(heart_standardize_transform[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  skewness, type = 2)
##        age   trestbps       chol    thalach    oldpeak         ca 
## -0.1484517  0.7437671  1.1608734 -0.4864246  1.1292266  1.3032599
# **************************************************************************************************************
#  RESULTS
#  Age: The skewness is relatively small (-0.27). You might not need a Box-Cox transformation for age,
#  especially since the skewness is not highly pronounced.
  
#  trestbps (resting blood pressure): The skewness is moderate (0.78). Considering the moderate skewness,
#  you could experiment with a Box-Cox transformation to see if it improves the distribution.
  
#  chol (serum cholesterol): The skewness is high (1.49). A Box-Cox transformation is often beneficial for 
#  skewed variables, so you might consider applying it to cholesterol.
  
#  thalach (maximum heart rate achieved): The skewness is relatively small (-0.56). A Box-Cox transformation
#  might not be necessary for thalach, especially if the skewness is not a significant concern.
  
#  oldpeak (ST depression induced by exercise): The skewness is high (1.21). Given the high skewness, a Box-Cox 
#  transformation could be beneficial for oldpeak.
  
#  ca (number of major vessels colored by fluoroscopy): The skewness is high (1.35). Similar to cholesterol and 
#  oldpeak, a Box-Cox transformation might be considered for ca.
# **************************************************************************************************************

  
#Plot a histogram to view the skewness before the Box-Cox transform

heart_health_variable  <- as.numeric( unlist(heart_standardize_transform [, 1]))
hist(heart_health_variable , main = names(heart_standardize_transform)[1])

heart_health_variable  <- as.numeric( unlist(heart_standardize_transform [, 4]))
hist(heart_health_variable , main = names(heart_standardize_transform)[4])

heart_health_variable  <- as.numeric( unlist(heart_standardize_transform [, 5]))
hist(heart_health_variable , main = names(heart_standardize_transform)[5])

heart_health_variable  <- as.numeric( unlist(heart_standardize_transform [, 8]))
hist(heart_health_variable , main = names(heart_standardize_transform)[8])

heart_health_variable <- as.numeric( unlist(heart_standardize_transform [, 10]))
hist(heart_health_variable , main = names(heart_standardize_transform)[10])

heart_health_variable  <- as.numeric( unlist(heart_standardize_transform [, 12]))
hist(heart_health_variable , main = names(heart_standardize_transform)[12])



# Select the columns of interest
columns_of_interest <- c(12, 10, 5, 4)

# Apply Box-Cox transformation only to selected columns
model_of_the_transform <- preProcess(heart_standardize_transform[, columns_of_interest], method = c("BoxCox"))

# Print the transformation details
print(model_of_the_transform)
## Created from 500 samples and 0 variables
## 
## Pre-processing:
##   - ignored (0)
# Apply the transformation to the selected columns
heart_box_cox_transform <- predict(model_of_the_transform, heart_standardize_transform[, columns_of_interest])




# AFTER 
#1, 4, 5, 8, 10, 12

sapply(heart_box_cox_transform, skewness, type = 2)
##        ca   oldpeak      chol  trestbps 
## 1.3032599 1.1292266 1.1608734 0.7437671
# **************************************************************************************************************

#RESULT
#ca: Skewness is approximately 1.35. A positive skewness indicates that the distribution
#has a longer right tail.

#oldpeak: Skewness is approximately 1.21. Similar to "ca," the positive skewness
#suggests a longer right tail in the distribution.

#chol: Skewness is approximately 1.49. Again, positive skewness indicates a longer 
#right tail.

#trestbps: Skewness is approximately 0.78. The positive skewness suggests a longer 
#right tail, but it's less pronounced compared to the other variables.
# **************************************************************************************************************

heart_health_variable  <- as.numeric( unlist(heart_box_cox_transform[, 1]))
hist(heart_health_variable , main = names(heart_box_cox_transform)[1])

heart_health_variable  <- as.numeric( unlist(heart_box_cox_transform [, 2]))
hist(heart_health_variable , main = names(heart_box_cox_transform)[2])

heart_health_variable <- as.numeric( unlist(heart_box_cox_transform[, 3]))
hist(heart_health_variable , main = names(heart_box_cox_transform)[3])

heart_health_variable <- as.numeric( unlist(heart_box_cox_transform [, 4]))
hist(heart_health_variable , main = names(heart_box_cox_transform)[4])

PCA for Feature Extraction on the Boston Housing Dataset —-

### PCA for Dimensionality Reduction on the heart Dataset ----
# Combine original and transformed columns
heart_combined <- cbind(heart_standardize_transform[, c("age", "thalach")], heart_box_cox_transform)
summary(heart_combined )
##       age             thalach              ca             oldpeak       
##  Min.   :-2.8214   Min.   :-3.3565   Min.   :-0.7397   Min.   :-0.9202  
##  1st Qu.:-0.8292   1st Qu.:-0.7097   1st Qu.:-0.7397   1st Qu.:-0.9202  
##  Median : 0.1669   Median : 0.1364   Median :-0.7397   Median :-0.2252  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.7203   3rd Qu.: 0.7330   3rd Qu.: 0.2492   3rd Qu.: 0.6435  
##  Max.   : 2.4912   Max.   : 2.3276   Max.   : 3.2160   Max.   : 4.4659  
##       chol             trestbps       
##  Min.   :-2.21697   Min.   :-2.29240  
##  1st Qu.:-0.67450   1st Qu.:-0.70224  
##  Median :-0.08417   Median :-0.09064  
##  Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.58233   3rd Qu.: 0.52096  
##  Max.   : 6.02859   Max.   : 4.19056
model_of_the_transform <- preProcess(heart_combined, method =
                                       c("scale", "center", "pca"))

print(model_of_the_transform)
## Created from 500 samples and 6 variables
## 
## Pre-processing:
##   - centered (6)
##   - ignored (0)
##   - principal component signal extraction (6)
##   - scaled (6)
## 
## PCA needed 6 components to capture 95 percent of the variance
heart_pca_dr <- predict(model_of_the_transform, heart_combined)

summary(heart_pca_dr)
##       PC1               PC2                PC3                PC4           
##  Min.   :-3.7333   Min.   :-3.97952   Min.   :-4.07410   Min.   :-2.779847  
##  1st Qu.:-0.9680   1st Qu.:-0.62115   1st Qu.:-0.63361   1st Qu.:-0.514060  
##  Median : 0.1271   Median :-0.03934   Median :-0.02038   Median : 0.006459  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 1.0441   3rd Qu.: 0.70034   3rd Qu.: 0.71136   3rd Qu.: 0.613969  
##  Max.   : 3.4242   Max.   : 2.77896   Max.   : 2.75013   Max.   : 2.365494  
##       PC5                PC6          
##  Min.   :-2.53758   Min.   :-1.66411  
##  1st Qu.:-0.52814   1st Qu.:-0.49148  
##  Median :-0.03814   Median :-0.03181  
##  Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.44151   3rd Qu.: 0.46836  
##  Max.   : 2.82131   Max.   : 1.92655
### PCA for Feature Extraction  ----


heart_pca_fe <- princomp(cor(heart_combined))
summary(heart_pca_fe)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4     Comp.5
## Standard deviation     0.6908175 0.3771363 0.3501721 0.3149774 0.24813832
## Proportion of Variance 0.5285719 0.1575339 0.1358128 0.1098844 0.06819698
## Cumulative Proportion  0.5285719 0.6861058 0.8219186 0.9318030 1.00000000
##                              Comp.6
## Standard deviation     9.982210e-09
## Proportion of Variance 1.103649e-16
## Cumulative Proportion  1.000000e+00
# **************************************************************************************************************
#RESULTS
#the PCA results suggest that the first five principal components retain the majority of the information in your dataset. 
#The subsequent components (Comp.6 ) contribute very little or no additional information. I may consider using 
#the first five components for further analysis, as they represent the most important features in terms of explaining the 
#variance in the data. The negligible contribution of Comp.6  might indicate that these components are not providing 
#meaningful information for the analysis.
# **************************************************************************************************************



#### Scree Plot ----
# The cumulative proportion reaches 100% after Comp.5, indicating that the first five components capture 
#all the variance in the original data.
factoextra::fviz_eig(heart_pca_fe, addlabels = TRUE)

heart_pca_fe$loadings[, 1:5]
##               Comp.1     Comp.2      Comp.3      Comp.4      Comp.5
## age       0.46825649  0.3051962  0.36642993  0.26770623  0.56375323
## thalach  -0.66852339 -0.1425097  0.01517027  0.11591317  0.05155236
## ca        0.35193325 -0.0389311 -0.44391246  0.62495126 -0.46742011
## oldpeak   0.43171959 -0.3832674 -0.28349057 -0.63191842  0.02317217
## chol     -0.02603638  0.8224225 -0.05536025 -0.35249457 -0.38123700
## trestbps  0.15131301 -0.2485116  0.76485883 -0.02757039 -0.56139512
unique(rownames(heart_pca_fe$loadings[, 1:5]))
## [1] "age"      "thalach"  "ca"       "oldpeak"  "chol"     "trestbps"
factoextra::fviz_cos2(heart_pca_fe, choice = "var", axes = 1:5)

factoextra::fviz_pca_var(heart_pca_fe, col.var = "cos2",
                         gradient.cols = c("red", "orange", "green"),
                         repel = TRUE)

MCA for Feature Extraction —-

# Convert relevant columns to factors
categorical_cols <- c("sex", "cp", "fbs", "restecg", "exang", "slope", "thal", "target")
heart_categorical <- heart
heart_categorical[categorical_cols] <- lapply(heart_categorical[categorical_cols], as.factor)

# Perform Multiple Correspondence Analysis (MCA)
heart_mca <- MCA(heart_categorical[, c(2, 3, 6, 7, 9, 11, 13, 14)])

# Visualize MCA results for individuals
# Summary of the MCA results
summary(heart_mca)
## 
## Call:
## MCA(X = heart_categorical[, c(2, 3, 6, 7, 9, 11, 13, 14)]) 
## 
## 
## Eigenvalues
##                        Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6   Dim.7
## Variance               0.360   0.161   0.147   0.137   0.128   0.125   0.121
## % of var.             20.598   9.189   8.392   7.827   7.287   7.158   6.939
## Cumulative % of var.  20.598  29.787  38.179  46.006  53.294  60.451  67.391
##                        Dim.8   Dim.9  Dim.10  Dim.11  Dim.12  Dim.13  Dim.14
## Variance               0.113   0.105   0.098   0.087   0.064   0.057   0.046
## % of var.              6.445   6.022   5.609   4.967   3.675   3.264   2.627
## Cumulative % of var.  73.835  79.857  85.467  90.433  94.109  97.373 100.000
## 
## Individuals (the 10 first)
##              Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3    ctr   cos2
## 1         |  0.943  0.493  0.852 | -0.116  0.017  0.013 | -0.187  0.047  0.033
## 2         |  0.382  0.081  0.148 | -0.109  0.015  0.012 | -0.090  0.011  0.008
## 3         | -0.760  0.320  0.296 |  0.846  0.889  0.367 |  0.180  0.044  0.017
## 4         | -0.729  0.295  0.427 | -0.618  0.475  0.307 |  0.225  0.069  0.041
## 5         | -0.729  0.295  0.427 | -0.618  0.475  0.307 |  0.225  0.069  0.041
## 6         |  0.628  0.219  0.128 | -0.201  0.050  0.013 |  0.493  0.330  0.078
## 7         |  0.196  0.021  0.050 | -0.256  0.081  0.085 | -0.148  0.030  0.028
## 8         | -0.421  0.098  0.085 | -0.242  0.073  0.028 |  0.777  0.821  0.289
## 9         | -0.880  0.430  0.399 |  0.608  0.460  0.190 |  0.205  0.057  0.022
## 10        | -0.470  0.123  0.219 |  0.170  0.036  0.029 | -0.339  0.157  0.114
##            
## 1         |
## 2         |
## 3         |
## 4         |
## 5         |
## 6         |
## 7         |
## 8         |
## 9         |
## 10        |
## 
## Categories (the 10 first)
##               Dim.1     ctr    cos2  v.test     Dim.2     ctr    cos2  v.test  
## sex_0     |  -0.626   4.103   0.170  -9.198 |   0.585   8.041   0.148   8.600 |
## sex_1     |   0.271   1.775   0.170   9.198 |  -0.253   3.479   0.148  -8.600 |
## cp_0      |   0.727   8.837   0.492  15.668 |  -0.035   0.046   0.001  -0.756 |
## cp_1      |  -0.835   4.111   0.143  -8.443 |  -0.961  12.192   0.189  -9.711 |
## cp_2      |  -0.672   4.598   0.188  -9.681 |   0.529   6.407   0.117   7.632 |
## cp_3      |  -0.205   0.078   0.002  -1.092 |   0.454   0.866   0.012   2.424 |
## fbs_0     |  -0.001   0.000   0.000  -0.056 |  -0.218   3.193   0.302 -12.276 |
## fbs_1     |   0.006   0.000   0.000   0.056 |   1.385  20.285   0.302  12.276 |
## restecg_0 |   0.289   1.412   0.080   6.299 |   0.356   4.819   0.121   7.773 |
## restecg_1 |  -0.289   1.456   0.084  -6.486 |  -0.406   6.443   0.166  -9.113 |
##             Dim.3     ctr    cos2  v.test  
## sex_0      -0.667  11.444   0.193  -9.804 |
## sex_1       0.289   4.951   0.193   9.804 |
## cp_0       -0.247   2.498   0.057  -5.318 |
## cp_1        0.450   2.926   0.041   4.546 |
## cp_2       -0.179   0.798   0.013  -2.574 |
## cp_3        1.759  14.223   0.177   9.388 |
## fbs_0      -0.207   3.152   0.272 -11.656 |
## fbs_1       1.315  20.023   0.272  11.656 |
## restecg_0  -0.001   0.000   0.000  -0.030 |
## restecg_1   0.075   0.242   0.006   1.686 |
## 
## Categorical variables (eta2)
##             Dim.1 Dim.2 Dim.3  
## sex       | 0.170 0.148 0.193 |
## cp        | 0.508 0.251 0.240 |
## fbs       | 0.000 0.302 0.272 |
## restecg   | 0.084 0.235 0.140 |
## exang     | 0.518 0.001 0.010 |
## slope     | 0.366 0.082 0.190 |
## thal      | 0.546 0.262 0.130 |
## target    | 0.691 0.005 0.000 |
heart_numerical_combined_with_factors <- cbind( heart_combined, heart[, c(2, 3, 6, 7, 9, 11, 13, 14)])
summary(heart_numerical_combined_with_factors)
##       age             thalach              ca             oldpeak       
##  Min.   :-2.8214   Min.   :-3.3565   Min.   :-0.7397   Min.   :-0.9202  
##  1st Qu.:-0.8292   1st Qu.:-0.7097   1st Qu.:-0.7397   1st Qu.:-0.9202  
##  Median : 0.1669   Median : 0.1364   Median :-0.7397   Median :-0.2252  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.7203   3rd Qu.: 0.7330   3rd Qu.: 0.2492   3rd Qu.: 0.6435  
##  Max.   : 2.4912   Max.   : 2.3276   Max.   : 3.2160   Max.   : 4.4659  
##       chol             trestbps        sex     cp      fbs     restecg exang  
##  Min.   :-2.21697   Min.   :-2.29240   0:151   0:241   0:432   0:244   0:337  
##  1st Qu.:-0.67450   1st Qu.:-0.70224   1:349   1: 85   1: 68   1:251   1:163  
##  Median :-0.08417   Median :-0.09064           2:147           2:  5          
##  Mean   : 0.00000   Mean   : 0.00000           3: 27                          
##  3rd Qu.: 0.58233   3rd Qu.: 0.52096                                          
##  Max.   : 6.02859   Max.   : 4.19056                                          
##  slope   thal    target   
##  0: 29   0:  4   neg:243  
##  1:241   1: 25   pos:257  
##  2:230   2:265            
##          3:206            
##                           
## 

ICA for Dimensionality Reduction on the Boston Housing Dataset —-

summary(heart_numerical_combined_with_factors )
##       age             thalach              ca             oldpeak       
##  Min.   :-2.8214   Min.   :-3.3565   Min.   :-0.7397   Min.   :-0.9202  
##  1st Qu.:-0.8292   1st Qu.:-0.7097   1st Qu.:-0.7397   1st Qu.:-0.9202  
##  Median : 0.1669   Median : 0.1364   Median :-0.7397   Median :-0.2252  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.7203   3rd Qu.: 0.7330   3rd Qu.: 0.2492   3rd Qu.: 0.6435  
##  Max.   : 2.4912   Max.   : 2.3276   Max.   : 3.2160   Max.   : 4.4659  
##       chol             trestbps        sex     cp      fbs     restecg exang  
##  Min.   :-2.21697   Min.   :-2.29240   0:151   0:241   0:432   0:244   0:337  
##  1st Qu.:-0.67450   1st Qu.:-0.70224   1:349   1: 85   1: 68   1:251   1:163  
##  Median :-0.08417   Median :-0.09064           2:147           2:  5          
##  Mean   : 0.00000   Mean   : 0.00000           3: 27                          
##  3rd Qu.: 0.58233   3rd Qu.: 0.52096                                          
##  Max.   : 6.02859   Max.   : 4.19056                                          
##  slope   thal    target   
##  0: 29   0:  4   neg:243  
##  1:241   1: 25   pos:257  
##  2:230   2:265            
##          3:206            
##                           
## 
model_of_the_transform <- preProcess(heart_numerical_combined_with_factors ,
                                     method = c("scale", "center", "ica"),
                                     n.comp = 6)
print(model_of_the_transform)
## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - centered (6)
##   - independent component signal extraction (6)
##   - ignored (8)
##   - scaled (6)
## 
## ICA used 6 components
heart_numerical_combined_with_factors_ica_dr <- predict(model_of_the_transform, heart_numerical_combined_with_factors )

summary(heart_numerical_combined_with_factors_ica_dr)
##  sex     cp      fbs     restecg exang   slope   thal    target   
##  0:151   0:241   0:432   0:244   0:337   0: 29   0:  4   neg:243  
##  1:349   1: 85   1: 68   1:251   1:163   1:241   1: 25   pos:257  
##          2:147           2:  5           2:230   2:265            
##          3: 27                                   3:206            
##                                                                   
##                                                                   
##       ICA1               ICA2              ICA3              ICA4         
##  Min.   :-2.67241   Min.   :-2.4054   Min.   :-6.3914   Min.   :-2.29693  
##  1st Qu.:-0.60643   1st Qu.:-0.8283   1st Qu.:-0.5044   1st Qu.:-0.73013  
##  Median :-0.09574   Median : 0.1220   Median : 0.0128   Median : 0.02446  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.61759   3rd Qu.: 0.8156   3rd Qu.: 0.6539   3rd Qu.: 0.77303  
##  Max.   : 4.27845   Max.   : 2.7676   Max.   : 2.2726   Max.   : 2.55836  
##       ICA5               ICA6        
##  Min.   :-4.46631   Min.   :-2.0360  
##  1st Qu.:-0.58744   1st Qu.:-0.6493  
##  Median : 0.05814   Median :-0.1574  
##  Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.59151   3rd Qu.: 0.4214  
##  Max.   : 2.51198   Max.   : 4.2504
# ************************************************************************************************************************************

#RESULTS
# Summary of the Transformed Dataset after Independent Component Analysis (ICA)

# Categorical Variables:
# - sex, cp, fbs, restecg, exang, slope, thal, target are categorical variables with counts and percentages for each level.

# Independent Component Analysis (ICA) Components:
# - ICA1 to ICA6 represent the Independent Component Analysis components.
# - For each component, descriptive statistics are provided:
#   - Minimum, 1st Quartile, Median, Mean, 3rd Quartile, Maximum.

# Example for ICA1:
#   ICA1: Min. -4.3196, 1st Qu. -0.3524, Median 0.1066, Mean 0.0000, 3rd Qu. 0.5823, Max. 2.4614.

# These statistics offer insights into the distribution and characteristics of the transformed variables after applying ICA.
# ************************************************************************************************************************************

Milestone 3: Training the Model (Lab 5-8) —-

1. Split the dataset ====

heart_preprocessed <- heart_numerical_combined_with_factors

str(heart_preprocessed)
## 'data.frame':    500 obs. of  14 variables:
##  $ age     : num  0.61 1.274 1.163 -0.608 -1.493 ...
##  $ thalach : num  -0.276 -0.71 0.375 0.983 1.46 ...
##  $ ca      : num  1.238 0.249 0.249 -0.74 -0.74 ...
##  $ oldpeak : num  1.512 -0.833 -0.225 -0.399 -0.92 ...
##  $ chol    : num  0.659 -0.674 3.229 0.354 -1.722 ...
##  $ trestbps: num  0.8268 -1.1915 0.521 -0.0906 -0.7022 ...
##  $ sex     : Factor w/ 2 levels "0","1": 2 2 1 2 2 2 2 2 1 1 ...
##  $ cp      : Factor w/ 4 levels "0","1","2","3": 1 1 3 2 2 4 1 2 3 1 ...
##  $ fbs     : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 2 2 1 ...
##  $ restecg : Factor w/ 3 levels "0","1","2": 1 1 1 2 2 2 2 2 2 1 ...
##  $ exang   : Factor w/ 2 levels "0","1": 2 2 1 1 1 2 1 1 1 1 ...
##  $ slope   : Factor w/ 3 levels "0","1","2": 2 3 3 3 3 2 2 3 3 3 ...
##  $ thal    : Factor w/ 4 levels "0","1","2","3": 4 3 3 3 3 4 3 4 3 3 ...
##  $ target  : Factor w/ 2 levels "neg","pos": 1 1 2 2 2 1 1 2 2 2 ...
# Define a 75:25 train:test data split of the dataset.
# That is, 75% of the original data will be used to train the model and
# 25% of the original data will be used to test the model.
train_index <- createDataPartition(heart_preprocessed$target,
                                   p = 0.75,
                                   list = FALSE)
heart_preprocessed_train <- heart_preprocessed[train_index, ]
heart_preprocessed_test <- heart_preprocessed[-train_index, ]

2. Train a Naive Bayes classifier using the training dataset —-

# Train a Naive Bayes model using the caret package.
heart_model_nb_caret <- # nolint
  caret::train(target ~ ., data =
                 heart_preprocessed_train[, c("age", "sex", "cp", "trestbps" , "chol" , "fbs" ,
                           "restecg" , "thalach" , "exang",
                           "oldpeak", "slope", "ca",  "thal", "target")],
               method = "naive_bayes")

3. Test the trained model using the testing dataset —-

### 3.b. Test the trained caret Naive Bayes model using the testing dataset ----
predictions_nb_caret <-
  predict(heart_model_nb_caret,
          heart_preprocessed_test[, c("age", "sex", "cp", "trestbps" , "chol" , "fbs" ,
                         "restecg" , "thalach" , "exang",
                         "oldpeak", "slope", "ca",  "thal")])

Determine the Baseline Accuracy —-

heart_freq <- heart_preprocessed_train$target
cbind(frequency =
        table(heart_freq),
      percentage = prop.table(table(heart_freq)) * 100)
##     frequency percentage
## neg       183   48.67021
## pos       193   51.32979

Model Training and Evaluation using Generalized Linear Model (GLM)

# The glm function in this code is used to train a Generalized Linear Model
# for predicting the target variable based on the provided features in the training dataset.
# The subsequent printing of model details and performance metrics allows for an assessment of its effectiveness.
# Set up train control with classProbs = TRUE
train_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3, classProbs = TRUE)


set.seed(7)
heart_model_glm <-
  train(target ~ ., data = heart_preprocessed_train, method = "glm",
        metric = "Accuracy", trControl = train_control)


print(heart_model_glm)
## Generalized Linear Model 
## 
## 376 samples
##  13 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 338, 338, 338, 339, 338, 339, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8334561  0.6661962

LDA —-

# Set the seed for reproducibility
set.seed(7)

# Train a Linear Discriminant Analysis (LDA) model using the specified training control
heart_model_lda <- train(target ~ ., data = heart_preprocessed_train,
                         method = "lda", trControl = train_control)

CART —-

# Set the seed for reproducibility
set.seed(7)

# Train a Classification and Regression Trees (CART) model using the specified training control
heart_model_cart <- train(target ~ ., data = heart_preprocessed_train,
                          method = "rpart", trControl = train_control)

KNN —-

# Set the seed for reproducibility
set.seed(7)

# Train a k-Nearest Neighbors (knn) model using the specified training control
heart_model_knn <- train(target ~ ., data = heart_preprocessed_train,
                         method = "knn", trControl = train_control)

Random Forest —-

# Set the seed for reproducibility
set.seed(7)

# Train a Random Forest (rf) model using the specified training control
heart_model_rf <- train(target ~ ., data = heart_preprocessed_train,
                        method = "rf", trControl = train_control)

Call the resamples Function —-

# We then create a list of the model results and pass the list as an argument
# to the `resamples` function.

results <- resamples(list(LDA = heart_model_lda, CART = heart_model_cart,
                          KNN = heart_model_knn, GLM = heart_model_glm,
                          RF = heart_model_rf))

Add ROC curves for each model to the plot

# Install and load the pROC package 
# install.packages("pROC")
library(pROC)
## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Create an empty plot
plot(0, 0, type = "n", main = "ROC Curve", xlab = "False Positive Rate", ylab = "True Positive Rate")

predictions_lda <- predict(heart_model_lda, heart_preprocessed_test[, 1:13],type = "prob")
predictions_cart <- predict(heart_model_cart, heart_preprocessed_test[, 1:13],type = "prob")
predictions_knn <- predict(heart_model_knn, heart_preprocessed_test[, 1:13],type = "prob")
predictions_glm <- predict(heart_model_glm, heart_preprocessed_test[, 1:13],type = "prob")
predictions_rf <- predict(heart_model_rf, heart_preprocessed_test[, 1:13],type = "prob")



# Add ROC curves for each model to the plot
roc_curve_lda <- roc(heart_preprocessed_test$target, predictions_lda$neg)
## Setting levels: control = neg, case = pos

## Setting direction: controls > cases
roc_curve_cart <- roc(heart_preprocessed_test$target, predictions_cart$neg)
## Setting levels: control = neg, case = pos
## Setting direction: controls > cases
roc_curve_knn <- roc(heart_preprocessed_test$target, predictions_knn$neg)
## Setting levels: control = neg, case = pos
## Setting direction: controls > cases
roc_curve_glm <- roc(heart_preprocessed_test$target, predictions_glm$neg)
## Setting levels: control = neg, case = pos
## Setting direction: controls > cases
roc_curve_rf <- roc(heart_preprocessed_test$target, predictions_rf$neg)
## Setting levels: control = neg, case = pos
## Setting direction: controls > cases
# Plot each ROC curve with a different color
plot(roc_curve_lda, main = "ROC curve for LDA", print.auc = TRUE,
      print.auc.x = 0.6, print.auc.y = 0.6, col = "red", lwd = 2.5)

plot(roc_curve_cart, main = "ROC curve for CART", print.auc = TRUE,
      print.auc.x = 0.6, print.auc.y = 0.6, col = "blue", lwd = 2.5)

plot(roc_curve_knn, main = "ROC curve for KNN", print.auc = TRUE,
      print.auc.x = 0.6, print.auc.y = 0.6, col = "green", lwd = 2.5)

plot(roc_curve_glm,main = "ROC curve for GLM", print.auc = TRUE,
     print.auc.x = 0.6, print.auc.y = 0.6, col = "purple")

plot(roc_curve_rf, main = "ROC curve for RF", print.auc = TRUE,
      print.auc.x = 0.6, print.auc.y = 0.6, col = "orange", lwd = 2.5)

Display the Results —-

## 1. Table Summary ----
# This is the simplest comparison. It creates a table with one model per row
# and its corresponding evaluation metrics displayed per column.

summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: LDA, CART, KNN, GLM, RF 
## Number of resamples: 30 
## 
## Accuracy 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## LDA  0.7027027 0.7948080 0.8291753 0.8414457 0.8918919 0.9729730    0
## CART 0.5945946 0.7385290 0.7734708 0.7758884 0.8054765 0.9210526    0
## KNN  0.7297297 0.8108108 0.8513514 0.8398062 0.8860242 0.9473684    0
## GLM  0.7297297 0.7894737 0.8291753 0.8334561 0.8860242 0.9473684    0
## RF   0.8108108 0.9189189 0.9466572 0.9326409 0.9729730 1.0000000    0
## 
## Kappa 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## LDA  0.4023495 0.5888497 0.6583505 0.6818160 0.7836257 0.9458272    0
## CART 0.1921397 0.4766352 0.5461491 0.5522648 0.6120278 0.8421053    0
## KNN  0.4590643 0.6196769 0.7022874 0.6789178 0.7714539 0.8947368    0
## GLM  0.4574780 0.5760087 0.6583505 0.6661962 0.7710874 0.8947368    0
## RF   0.6196769 0.8371237 0.8928338 0.8649911 0.9459854 1.0000000    0
## 2. Box and Whisker Plot ----
# This is useful for visually observing the spread of the estimated accuracies
# for different algorithms and how they relate.

scales <- list(x = list(relation = "free"), y = list(relation = "free"))
bwplot(results, scales = scales)

## 3. Dot Plots ----
# They show both the mean estimated accuracy as well as the 95% confidence
# interval (e.g. the range in which 95% of observed scores fell).

scales <- list(x = list(relation = "free"), y = list(relation = "free"))
dotplot(results, scales = scales)

## 4. Scatter Plot Matrix ----
# This is useful when considering whether the predictions from two
# different algorithms are correlated. If weakly correlated, then they are good
# candidates for being combined in an ensemble prediction.

splom(results)

## 5. Pairwise xyPlots ----
# You can zoom in on one pairwise comparison of the accuracy of trial-folds for
# two models using an xyplot.

# xyplot plots to compare models
xyplot(results, models = c("LDA", "RF"))

# or
# xyplot plots to compare models
xyplot(results, models = c("GLM", "CART"))

Statistical Significance Tests —-

# Explanation:
# - Upper diagonal: Estimates of the difference in accuracy and kappa values between models.
# - Lower diagonal: P-values for testing the hypothesis that the difference is zero.

# Accuracy:
# - Positive values indicate higher accuracy in the row model compared to the column model.
# - Negative values indicate lower accuracy.

# Kappa:
# - Positive values indicate higher kappa values in the row model compared to the column model.
# - Negative values indicate lower kappa values.

# P-values:
# - Assess the statistical significance of the observed differences.
# - Values below the significance level (e.g., 0.05) suggest a significant difference.
# - Bonferroni adjustment is applied for multiple comparisons.
diffs <- diff(results)

summary(diffs)
## 
## Call:
## summary.diff.resamples(object = diffs)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## Accuracy 
##      LDA       CART      KNN       GLM       RF       
## LDA             0.065557  0.001639  0.007990 -0.091195
## CART 1.665e-06           -0.063918 -0.057568 -0.156753
## KNN  1.0000000 0.0001824            0.006350 -0.092835
## GLM  1.0000000 2.288e-05 1.0000000           -0.099185
## RF   8.559e-08 2.674e-13 8.765e-09 9.298e-08          
## 
## Kappa 
##      LDA       CART      KNN       GLM       RF       
## LDA             0.129551  0.002898  0.015620 -0.183175
## CART 2.155e-06           -0.126653 -0.113931 -0.312726
## KNN  1.0000000 0.0002056            0.012722 -0.186073
## GLM  1.0000000 2.658e-05 1.0000000           -0.198795
## RF   8.817e-08 2.849e-13 9.122e-09 9.665e-08

Milestone 4: Hyper-Parameter Tuning and Ensemble (lab 9-11)

Apply a “Random Search” to identify the best parameter value —-

# Define train control settings for a random search
train_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3, search = "random")

# Set a seed for reproducibility
set.seed(7)

# Perform a random search for Random Forest hyperparameters
heart_model_random_search_rf <- train(target ~ ., data = heart_preprocessed_train, method = "rf",
                                      metric = "Accuracy",
                                      tuneLength = 12,  # Search 12 options for the value of mtry
                                      trControl = train_control)

# Display the details of the random search model
print(heart_model_random_search_rf)
## Random Forest 
## 
## 376 samples
##  13 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 338, 338, 338, 339, 338, 339, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9078139  0.8153870
##    3    0.9185074  0.8366975
##    7    0.9273529  0.8544132
##    8    0.9273516  0.8544443
##   12    0.9308616  0.8614461
##   15    0.9334932  0.8667054
##   18    0.9334920  0.8667287
##   19    0.9352950  0.8703242
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 19.
# Plot the results of the random search
plot(heart_model_random_search_rf)

Bagged CART —-

## 2.a. Bagged CART ----
set.seed(7)
heart_model_bagged_cart <- train(target ~ ., data = heart_preprocessed_train, method = "treebag",
                               metric = "Accuracy",
                               trControl = train_control)

## 2.b. Random Forest ----
set.seed(7)
heart_model_rf <- train(target ~ ., data = heart_preprocessed_train, method = "rf",
                        metric = "Accuracy", trControl = train_control)


# Summarize results
bagging_results <-
  resamples(list("Bagged Decision Tree" = heart_model_bagged_cart,
                 "Random Forest" = heart_model_rf))

summary(bagging_results)
## 
## Call:
## summary.resamples(object = bagging_results)
## 
## Models: Bagged Decision Tree, Random Forest 
## Number of resamples: 30 
## 
## Accuracy 
##                           Min.   1st Qu.    Median      Mean   3rd Qu. Max.
## Bagged Decision Tree 0.8108108 0.8954116 0.9210526 0.9246015 0.9669092    1
## Random Forest        0.7837838 0.9189189 0.9473684 0.9326622 0.9729730    1
##                      NA's
## Bagged Decision Tree    0
## Random Forest           0
## 
## Kappa 
##                           Min.   1st Qu.    Median      Mean   3rd Qu. Max.
## Bagged Decision Tree 0.6196769 0.7903509 0.8407809 0.8489086 0.9337125    1
## Random Forest        0.5659824 0.8371237 0.8941496 0.8650381 0.9458272    1
##                      NA's
## Bagged Decision Tree    0
## Random Forest           0
dotplot(bagging_results)

# We then print the details of the model that has been created
print(heart_model_rf$finalModel)
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 15
## 
##         OOB estimate of  error rate: 5.85%
## Confusion matrix:
##     neg pos class.error
## neg 171  12  0.06557377
## pos  10 183  0.05181347

Test the Model —-

# We can test the model
set.seed(9)
predictions <- predict(heart_model_rf, newdata = heart_preprocessed_test)
confusionMatrix(predictions, heart_preprocessed_test$target)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg  54   3
##        pos   6  61
##                                           
##                Accuracy : 0.9274          
##                  95% CI : (0.8667, 0.9663)
##     No Information Rate : 0.5161          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8545          
##                                           
##  Mcnemar's Test P-Value : 0.505           
##                                           
##             Sensitivity : 0.9000          
##             Specificity : 0.9531          
##          Pos Pred Value : 0.9474          
##          Neg Pred Value : 0.9104          
##              Prevalence : 0.4839          
##          Detection Rate : 0.4355          
##    Detection Prevalence : 0.4597          
##       Balanced Accuracy : 0.9266          
##                                           
##        'Positive' Class : neg             
## 

Save and Load your Model —-

# Saving a model into a file allows you to load it later and use it to make
# predictions. Saved models can be loaded by calling the `readRDS()` function

saveRDS(heart_model_rf, "../models/saved_heart_model_rf.rds")
# The saved model can then be loaded later as follows:
loaded_heart_model_rf <- readRDS("../models/saved_heart_model_rf.rds")
print(loaded_heart_model_rf)
## Random Forest 
## 
## 376 samples
##  13 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 338, 338, 338, 339, 338, 339, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9104467  0.8206259
##    7    0.9237505  0.8472566
##   15    0.9326622  0.8650381
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 15.
predictions_with_loaded_model <-
  predict(loaded_heart_model_rf, newdata = heart_preprocessed_test)
confusionMatrix(predictions_with_loaded_model, heart_preprocessed_test$target)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg  54   3
##        pos   6  61
##                                           
##                Accuracy : 0.9274          
##                  95% CI : (0.8667, 0.9663)
##     No Information Rate : 0.5161          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8545          
##                                           
##  Mcnemar's Test P-Value : 0.505           
##                                           
##             Sensitivity : 0.9000          
##             Specificity : 0.9531          
##          Pos Pred Value : 0.9474          
##          Neg Pred Value : 0.9104          
##              Prevalence : 0.4839          
##          Detection Rate : 0.4355          
##    Detection Prevalence : 0.4597          
##       Balanced Accuracy : 0.9266          
##                                           
##        'Positive' Class : neg             
## 

Creating Functions in R —-

# Plumber requires functions, an example of the syntax for creating a function
# in R is:

name_of_function <- function(arg) {
  # Do something with the argument called `arg`
}

Make Predictions on New Data using the Saved Model —-

# We can also create and use our own data frame as follows:
# Convert categorical variables to factors


to_be_predicted <-
  data.frame(age=70, sex=1, cp=0, trestbps=145, chol=174,
             fbs=0, restecg=1, thalach=125, exang=1, oldpeak=2.6,
             slope=0,
             ca=0, thal=3)


to_be_predicted$sex <- as.factor(to_be_predicted$sex)
to_be_predicted$cp <- as.factor(to_be_predicted$cp)
to_be_predicted$fbs <- as.factor(to_be_predicted$fbs)
to_be_predicted$restecg <- as.factor(to_be_predicted$restecg)
to_be_predicted$exang <- as.factor(to_be_predicted$exang)
to_be_predicted$slope <- as.factor(to_be_predicted$slope)
to_be_predicted$thal <- as.factor(to_be_predicted$thal)
# We then use the data frame to make predictions
predict(loaded_heart_model_rf, newdata = to_be_predicted)
## [1] neg
## Levels: neg pos

STEP 8. Make predictions using the model through a function —-

# An alternative is to create a function and then use the function to make
# predictions

predict_target <- function(arg_age, arg_sex, arg_cp, arg_trestbps, arg_chol,
                           arg_fbs, arg_restecg, arg_thalach, arg_exang, arg_oldpeak, arg_slope,
                           arg_ca, arg_thal) {
  # Convert categorical variables to factors
  arg_sex <- as.factor(arg_sex)
  arg_cp <- as.factor(arg_cp)
  arg_fbs <- as.factor(arg_fbs)
  arg_restecg <- as.factor(arg_restecg)
  arg_exang <- as.factor(arg_exang)
  arg_slope <- as.factor(arg_slope)
  arg_thal <- as.factor(arg_thal)
  
  # Create a data frame using the arguments
  to_be_predicted <- data.frame(age = arg_age, sex = arg_sex, cp = arg_cp, trestbps = arg_trestbps,
                                chol = arg_chol, fbs = arg_fbs, restecg = arg_restecg,
                                thalach = arg_thalach, exang = arg_exang, oldpeak = arg_oldpeak,
                                slope = arg_slope, ca = arg_ca, thal = arg_thal)
  
  # Make a prediction based on the data frame
  predict(loaded_heart_model_rf, newdata = to_be_predicted)
}

Milestone 5: Consolidation

Plumber API

# STEP 1. Install and Load the Required Packages ----
## plumber ----
if (require("plumber")) {
  require("plumber")
} else {
  install.packages("plumber", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: plumber
## caret ----
if (require("caret")) {
  require("caret")
} else {
  install.packages("caret", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}


###LAB 12
# This requires the "plumber" package that was installed and loaded earlier in
# STEP 1. The commenting below makes R recognize the code as the definition of
# an API, i.e., #* comments.

loaded_heart_model_rf <- readRDS("../models/saved_heart_model_rf.rds")

#* @apiTitle CVDetect Prediction Model API

#* @apiDescription Used to predict whether a patient has heart disease or not.

#* @param arg_age Age of the patient
#* @param arg_sex Gender of the patient (0: Female, 1: Male)
#* @param arg_cp Chest pain type (0-3)
#* @param arg_trestbps Resting blood pressure (mm Hg)
#* @param arg_chol Serum cholesterol (mg/dl)
#* @param arg_fbs Fasting blood sugar (> 120 mg/dl, 0: No, 1: Yes)
#* @param arg_restecg Resting electrocardiographic results (0-2)
#* @param arg_thalach Maximum heart rate achieved
#* @param arg_exang Exercise-induced angina (0: No, 1: Yes)
#* @param arg_oldpeak ST depression induced by exercise relative to rest
#* @param arg_slope Slope of the peak exercise ST segment (0-2)
#* @param arg_ca Number of major vessels colored by fluoroscopy (0-3)
#* @param arg_thal Thallium stress test result (3: Normal, 6: Fixed defect, 7: Reversible defect)


#* @get /target

predict_diabetes <-
  function(arg_age, arg_sex, arg_cp, arg_trestbps, arg_chol,
           arg_fbs, arg_restecg, arg_thalach, arg_exang, arg_oldpeak, arg_slope,
           arg_ca, arg_thal) {
    # Create a data frame using the arguments
    to_be_predicted <- data.frame(
      age = as.numeric(arg_age),
      sex = as.factor(arg_sex),
      cp = as.factor(arg_cp),
      trestbps = as.numeric(arg_trestbps),
      chol = as.numeric(arg_chol),
      fbs = as.factor(arg_fbs),
      restecg = as.factor(arg_restecg),
      thalach = as.numeric(arg_thalach),
      exang = as.factor(arg_exang),
      oldpeak = as.numeric(arg_oldpeak),
      slope = as.factor(arg_slope),
      ca = as.numeric(arg_ca),
      thal = as.factor(arg_thal)
    )
    
    # Make a prediction based on the data frame
    predict(loaded_heart_model_rf, newdata = to_be_predicted)
  }

##Run Plumber

# STEP 1. Install and load the required packages ----
## plumber ----
if (require("plumber")) {
  require("plumber")
} else {
  install.packages("plumber", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}

# STEP 2. Process a Plumber API ----
# This allows us to process a plumber API
#api <- plumber::plumb("lab12.R")

# STEP 3. Run the API on a specific port ----
# Specify a constant localhost port to use
#api$run(host = "127.0.0.1", port = 5022)

PHP output

# Heart Disease Prediction Web Form


cat('<iframe src="http://localhost/lab15/Lab15-submission-ConsumePlumberAPIOutput.php" width="1000" height="600"></iframe>')
cat('<iframe src="http://localhost/customer-facing-dasboard/BI_dashboard/index.php" width="1000" height="600"></iframe>')