Heart Disease Prediction Report

Myriam Aicha Mbongo 10/16/2023

CARDIOVASCULAR DISEASE PREDICTION PROJECT
- Student Details
STEP 1. Install and Load the Required Packages —-
Milestone 1 : Exploratory Data Analysis (Lab 1 and 2)
Milestone 2: Preprocessing and Data Transformation (Lab 3 and 4) —-
- STEP 4. Create a subset of the variables/features —-
Milestone 3: Training the Model (Lab 5-8) —-
Model Training and Evaluation using Generalized Linear Model (GLM)
Add ROC curves for each model to the plot
Display the Results —-
- Statistical Significance Tests —-
Apply a “Random Search” to identify the best parameter value —-
- Bagged CART —-
Test the Model —-
Save and Load your Model —-
Creating Functions in R —-
Make Predictions on New Data using the Saved Model —-
STEP 8. Make predictions using the model through a function —-
Plumber API
PHP output

CARDIOVASCULAR DISEASE PREDICTION PROJECT

Student Details

Student ID Numbers and Names of Group Members	GitHub Classroom Group Name
2. 134141 - C - Aicha Mbongo

Course Code	Course Name	Program	Semester Duration
BBT4206	Business Intelligence II	Bachelor of Business Information Technology	21^st August 2023 to 28^th November 2023

STEP 1. Install and Load the Required Packages —-

# STEP 1. Install and Load the Required Packages ----
# The following packages should be installed and loaded before proceeding to the
# subsequent steps.

## readr ----
# Install "readr" only when it is missing. Loading happens outside the `if`
# so that a freshly installed package is attached as well; an if/else that
# installs in the `else` branch leaves the package unloaded on the first run.
if (!is.element("readr", installed.packages()[, 1])) {
  install.packages("readr", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("readr")

## Loading required package: readr

## caret ----
# Install "caret" only when it is missing. Loading happens outside the `if`
# so that a freshly installed package is attached as well; an if/else that
# installs in the `else` branch leaves the package unloaded on the first run.
if (!is.element("caret", installed.packages()[, 1])) {
  install.packages("caret", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("caret")

## Loading required package: caret

## Loading required package: ggplot2

## Loading required package: lattice

## e1071 ----
# Install "e1071" only when it is missing. Loading happens outside the `if`
# so that a freshly installed package is attached as well; an if/else that
# installs in the `else` branch leaves the package unloaded on the first run.
if (!is.element("e1071", installed.packages()[, 1])) {
  install.packages("e1071", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("e1071")

## Loading required package: e1071

## factoextra ----
# Install "factoextra" only when it is missing. Loading happens outside the
# `if` so that a freshly installed package is attached as well; an if/else
# that installs in the `else` branch leaves the package unloaded on the first
# run.
if (!is.element("factoextra", installed.packages()[, 1])) {
  install.packages("factoextra", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("factoextra")

## Loading required package: factoextra

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

## FactoMineR ----
# Install "FactoMineR" only when it is missing. Loading happens outside the
# `if` so that a freshly installed package is attached as well; an if/else
# that installs in the `else` branch leaves the package unloaded on the first
# run.
if (!is.element("FactoMineR", installed.packages()[, 1])) {
  install.packages("FactoMineR", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("FactoMineR")

## Loading required package: FactoMineR

if (!is.element("NHANES", installed.packages()[, 1])) {
  install.packages("NHANES", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("NHANES")

## Loading required package: NHANES

## dplyr ----
if (!is.element("dplyr", installed.packages()[, 1])) {
  install.packages("dplyr", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("dplyr")

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## naniar ----
# Documentation:
#   https://cran.r-project.org/package=naniar or
#   https://www.rdocumentation.org/packages/naniar/versions/1.0.0
if (!is.element("naniar", installed.packages()[, 1])) {
  install.packages("naniar", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("naniar")

## Loading required package: naniar

## ggplot2 ----
# We require the "ggplot2" package to create more appealing visualizations
if (!is.element("ggplot2", installed.packages()[, 1])) {
  install.packages("ggplot2", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("ggplot2")

## MICE ----
# We use the MICE package to perform data imputation
if (!is.element("mice", installed.packages()[, 1])) {
  install.packages("mice", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("mice")

## Loading required package: mice

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

## Amelia ----
if (!is.element("Amelia", installed.packages()[, 1])) {
  install.packages("Amelia", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("Amelia")

## Loading required package: Amelia

## Loading required package: Rcpp

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

Milestone 1 : Exploratory Data Analysis (Lab 1 and 2)

STEP 3. Load the downloaded sample datasets

library(readr)
heart <- read_csv(
  "../data/heart.csv",
  col_types = cols(
    age = col_double(),
    sex = col_factor(levels = c("0", "1")),
    cp = col_factor(levels = c("0", "1", "2", "3")),
    trestbps = col_double(),
    chol = col_double(),
    fbs = col_factor(levels = c("0", "1")),
    restecg = col_factor(levels = c("0", "1", "2")),
    thalach = col_double(),
    exang = col_factor(levels = c("0", "1")),
    oldpeak = col_double(),
    slope = col_factor(levels = c("0", "1", "2")),
    ca = col_double(),
    thal = col_factor(levels = c("0", "1", "2", "3")),
    target = col_factor(levels = c("neg", "pos"))
  )
)

#View(heart)

STEP 3a. Preview the Loaded Datasets, Identify the Data Types —-

### STEP 3a. Preview the Loaded Datasets, Identify the Data Types  ----
# Dimensions refer to the number of observations (rows) and the number of
# attributes/variables/features (columns).
# Understanding data types is key for effective analysis. It helps choose
# suitable visualizations and algorithms, and highlights the need for
# conversions between categorical and numerical data when necessary.


dim(heart)

## [1] 1025   14

sapply(heart, class)

##       age       sex        cp  trestbps      chol       fbs   restecg   thalach 
## "numeric"  "factor"  "factor" "numeric" "numeric"  "factor"  "factor" "numeric" 
##     exang   oldpeak     slope        ca      thal    target 
##  "factor" "numeric"  "factor" "numeric"  "factor"  "factor"

STEP 3b. Identify the number of instances that belong to each class. —-

# It is more sensible to count categorical variables (factors or dimensions)
# than numeric variables, e.g., counting the number of male and female
# participants instead of counting the frequency of each participant’s height.

heart_freq <- heart$target
cbind(frequency = table(heart_freq),
      percentage = prop.table(table(heart_freq)) * 100)

##     frequency percentage
## neg       499   48.68293
## pos       526   51.31707

STEP 3c. Measures of Central Tendency(Calculate the mode ) —-

# R has no built-in function for the statistical mode, so we compute it
# manually from the frequency table of the target variable.

# Tabulate the class labels once, inside a local scope so no helper variable
# is left behind, and keep every label whose count equals the maximum (a tie
# therefore yields more than one label).
heart_target_mode <- local({
  class_counts <- table(heart$target)
  names(class_counts)[which(class_counts == max(class_counts))]
})
print(heart_target_mode)

## [1] "pos"

STEP 3d. Measure the distribution of the data for each variable —-

summary(heart)

##       age        sex     cp         trestbps          chol     fbs     restecg
##  Min.   :29.00   0:312   0:497   Min.   : 94.0   Min.   :126   0:872   0:497  
##  1st Qu.:48.00   1:713   1:167   1st Qu.:120.0   1st Qu.:211   1:153   1:513  
##  Median :56.00           2:284   Median :130.0   Median :240           2: 15  
##  Mean   :54.43           3: 77   Mean   :131.6   Mean   :246                  
##  3rd Qu.:61.00                   3rd Qu.:140.0   3rd Qu.:275                  
##  Max.   :77.00                   Max.   :200.0   Max.   :564                  
##     thalach      exang      oldpeak      slope         ca         thal   
##  Min.   : 71.0   0:680   Min.   :0.000   0: 74   Min.   :0.0000   0:  7  
##  1st Qu.:132.0   1:345   1st Qu.:0.000   1:482   1st Qu.:0.0000   1: 64  
##  Median :152.0           Median :0.800   2:469   Median :0.0000   2:544  
##  Mean   :149.1           Mean   :1.072           Mean   :0.7541   3:410  
##  3rd Qu.:166.0           3rd Qu.:1.800           3rd Qu.:1.0000          
##  Max.   :202.0           Max.   :6.200           Max.   :4.0000          
##  target   
##  neg:499  
##  pos:526  
##           
##           
##           
##

STEP 3e. Measure the standard deviation of each variable —-

# calculate the standard deviation of only columns that are numeric, thus
# leaving out the columns termed as “factors” (categorical) or those that have
# a string data type.

sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)], sd)

##       age  trestbps      chol   thalach   oldpeak        ca 
##  9.072290 17.516718 51.592510 23.005724  1.175053  1.030798

#or
sapply(heart[, c(1, 4, 5, 8, 10, 12)], sd)

##       age  trestbps      chol   thalach   oldpeak        ca 
##  9.072290 17.516718 51.592510 23.005724  1.175053  1.030798

STEP 3f. Measure the kurtosis of each variable —-

# The Kurtosis informs you of how often outliers occur in the results.
# There are different formulas for calculating kurtosis.
# Specifying “type = 2” allows us to use the 2nd formula which is the same
# kurtosis formula used in SPSS and SAS.

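# In “type = 2” (used in SPSS and SAS), the value returned is the *excess*
# kurtosis, i.e., a normal distribution scores approximately 0 rather than 3:
# 1.    Kurtosis < 0 implies a low number of outliers
# 2.    Kurtosis ≈ 0 implies a medium number of outliers
# 3.    Kurtosis > 0 implies a high number of outliers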

# Make sure "e1071" (which provides kurtosis() and skewness()) is available.
# The CRAN mirror is named explicitly, as in STEP 1, because a non-interactive
# session with no mirror configured cannot prompt for one.
if (!is.element("e1071", installed.packages()[, 1])) {
  install.packages("e1071", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("e1071")

sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  kurtosis, type = 2)

##         age    trestbps        chol     thalach     oldpeak          ca 
## -0.52561781  0.99122074  3.99680305 -0.08882249  1.31447089  0.70112287

STEP 3g. Measure the skewness of each variable—-

# The skewness informs you of the asymmetry of the distribution of results.
# Using “type = 2” can be interpreted as:

# 1.    Skewness between -0.4 and 0.4 (inclusive) implies that there is no skew
# in the distribution of results; the distribution of results is symmetrical;
# it is a normal distribution.
# 2.    Skewness above 0.4 implies a positive skew; a right-skewed distribution.
# 3.    Skewness below -0.4 implies a negative skew; a left-skewed distribution.

sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  skewness, type = 2)

##        age   trestbps       chol    thalach    oldpeak         ca 
## -0.2488659  0.7397682  1.0740728 -0.5137772  1.2108994  1.2611886

STEP 3h. Measure the covariance between variables —-

# Note that the covariance and the correlation are computed for numeric values
# only, not categorical values.

heart_cov <- cov(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)])
#View(heart_cov)

STEP 3i. Measure the correlation between variables —-

heart_cor <- cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)])
#View(heart_cor)

STEP 3j. Inferential Statistics —-

# One-Way ANOVA can be used to test the effect of the 3 types of fertilizer on
# crop yield whereas,
# Two-Way ANOVA can be used to test the effect of the 3 types of fertilizer and
# the 2 types of planting density on crop yield.
heart_one_way_anova <- aov(trestbps ~ age, data = heart)
summary(heart_one_way_anova)

##               Df Sum Sq Mean Sq F value Pr(>F)    
## age            1  23096   23096   81.16 <2e-16 ***
## Residuals   1023 291104     285                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# The test rejects the null hypothesis of no association between age and
# resting blood pressure (F(1, 1023) = 81.16, p < 2e-16), highlighting age as
# a key factor in determining blood pressure. Because age is a numeric
# predictor, the single degree of freedom reflects a linear trend in blood
# pressure with age rather than a comparison among discrete age groups.
# This aligns with cardiovascular knowledge, correlating increased age with a
# higher risk of cardiovascular disease.

heart_two_way_anova <- aov(trestbps ~ exang + ca, # nolint
                                           data = heart)
summary(heart_two_way_anova)

##               Df Sum Sq Mean Sq F value  Pr(>F)   
## exang          1   1177  1176.7    3.88 0.04914 * 
## ca             1   3050  3050.2   10.06 0.00156 **
## Residuals   1022 309973   303.3                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Both variables, exercise-induced angina (exang) and the number of major
# vessels (ca), are associated with statistically significant differences in
# resting blood pressure.

Univariate Plots —-

STEP 3k. Create Histograms for Each Numeric Attribute —-

# Histograms help in determining whether an attribute has a Gaussian
# distribution. They can also be used to identify the presence of outliers.



# Lay the plots out three per row.
par(mfrow = c(1, 3))

# First row: age, trestbps, chol. The vector keeps the name `heart_variable`
# because hist() derives its x-axis label from the argument expression.
for (i in c(1, 4, 5)) {
  heart_variable <- as.numeric(heart[[i]])
  hist(heart_variable, main = names(heart)[i])
}

# Second row: thalach, oldpeak, ca. The x-axis label comes from
# `heart_health_variable` for the same reason.
for (col_index in c(8, 10, 12)) {
  heart_health_variable <- as.numeric(heart[[col_index]])
  hist(heart_health_variable, main = names(heart)[col_index])
}

STEP 3l. Create Box and Whisker Plots for Each Numeric Attribute —-

# Box and whisker plots are useful in understanding the distribution of data.

# Three plots per row; the six numeric columns fill two rows.
par(mfrow = c(1, 3))

# Each column is passed as a one-column data frame (not a bare vector) so
# that boxplot() labels the box with the column name.
for (i in c(1, 4, 5, 8, 10, 12)) {
  boxplot(heart[i], main = names(heart)[i])
}

STEP 3m. Create Bar Plots for Each Categorical Attribute —-

# Categorical attributes (factors) can also be visualized. This is done using a
# bar chart to give an idea of the proportion of instances that belong to each
# category.

# One bar chart per categorical column: sex, cp, fbs, restecg, exang, slope,
# thal and target. Each chart is titled with the column name.
for (factor_col in c(2, 3, 6, 7, 9, 11, 13, 14)) {
  level_counts <- table(heart[, factor_col])
  barplot(level_counts, main = names(heart)[factor_col])
}

STEP 3n. Create a Missingness Map to Identify Missing Data —-

# Execute the following to create a map to identify the missing data in each
# dataset:
# Make sure "Amelia" (which provides missmap()) is available. The CRAN mirror
# is named explicitly, as in STEP 1, because a non-interactive session with no
# mirror configured cannot prompt for one.
if (!is.element("Amelia", installed.packages()[, 1])) {
  install.packages("Amelia", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("Amelia")
#comment
missmap(heart, col = c("red", "grey"), legend = TRUE)

Multivariate Plots —-

STEP 3o. Create a Correlation Plot —-

# Correlation plots can be used to get an idea of which attributes change
# together. The function “corrplot()” found in the package “corrplot” is
# required to perform this. The larger the dot in the correlation plot, the
# larger the correlation. Blue represents a positive correlation whereas red
# represents a negative correlation.

# Install "corrplot" when it is missing, then load it. The CRAN mirror is
# named explicitly, as in STEP 1, because a non-interactive session with no
# mirror configured cannot prompt for one.
if (!is.element("corrplot", installed.packages()[, 1])) {
  install.packages("corrplot", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("corrplot")

## Loading required package: corrplot

## corrplot 0.92 loaded

corrplot(cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)]), method = "circle")

#heart <- heart[, -which(names(heart) == "target_numeric")]

# Alternatively, the 'ggcorrplot::ggcorrplot()' function can be used to plot a
# more visually appealing plot.
# The code below shows how to install a package in R:
# Install "ggcorrplot" when it is missing, then load it. The CRAN mirror is
# named explicitly, as in STEP 1, because a non-interactive session with no
# mirror configured cannot prompt for one.
if (!is.element("ggcorrplot", installed.packages()[, 1])) {
  install.packages("ggcorrplot", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("ggcorrplot")

## Loading required package: ggcorrplot

ggcorrplot(cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)]))

STEP 3p. Create a Scatter Plot —-

pairs(heart)

# Alternatively, the ggplot2 package can be used to make the plots more
# appealing:
ggplot(heart,
       aes(x = age, y = sex, shape = target, color = target)) +
  geom_point() +
  geom_smooth(method = lm)

## `geom_smooth()` using formula = 'y ~ x'

Milestone 2: Preprocessing and Data Transformation (Lab 3 and 4) —-

STEP 4. Create a subset of the variables/features —-

### Subset of rows ----
# We then select 500 random observations to be included in the dataset
# Fix the RNG seed so the same 500 rows are drawn on every run. Without it,
# each knit works on a different subset and every statistic reported from
# here on changes between runs.
set.seed(7)
rand_ind <- sample(seq_len(nrow(heart)), 500)
# NOTE: `heart` is overwritten, so all later steps see only the 500-row subset.
heart <- heart[rand_ind, ]

STEP 4a. Confirm the “missingness” in the Dataset before Imputation —-

# Are there missing values in the dataset?
any_na(heart)

## [1] FALSE

# How many?
n_miss(heart)

## [1] 0

# What is the percentage of missing data in the entire dataset?
prop_miss(heart)

## [1] 0

# How many missing values does each variable have?
heart %>% is.na() %>% colSums()

##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        0        0        0        0        0        0        0        0 
##    exang  oldpeak    slope       ca     thal   target 
##        0        0        0        0        0        0

# What is the number and percentage of missing values grouped by
# each variable?
miss_var_summary(heart)

## # A tibble: 14 × 3
##    variable n_miss pct_miss
##    <chr>     <int>    <dbl>
##  1 age           0        0
##  2 sex           0        0
##  3 cp            0        0
##  4 trestbps      0        0
##  5 chol          0        0
##  6 fbs           0        0
##  7 restecg       0        0
##  8 thalach       0        0
##  9 exang         0        0
## 10 oldpeak       0        0
## 11 slope         0        0
## 12 ca            0        0
## 13 thal          0        0
## 14 target        0        0

# What is the number and percentage of missing values grouped by
# each observation?
miss_case_summary(heart)

## # A tibble: 500 × 3
##     case n_miss pct_miss
##    <int>  <int>    <dbl>
##  1     1      0        0
##  2     2      0        0
##  3     3      0        0
##  4     4      0        0
##  5     5      0        0
##  6     6      0        0
##  7     7      0        0
##  8     8      0        0
##  9     9      0        0
## 10    10      0        0
## # ℹ 490 more rows

# Which variables contain the most missing values?
#gg_miss_var(heart)

# Where are missing values located (the shaded regions in the plot)?
#vis_miss(heart) + theme(axis.text.x = element_text(angle = 80))

# Which combinations of variables are missing together?
#gg_miss_upset(heart)

# Create a heatmap of "missingness" broken down by "target"
# First, confirm that the "target" variable is a categorical variable
is.factor(heart$target)

## [1] TRUE

# Second, create the visualization
#gg_miss_fct(heart, fct = target)

# We can also create a heatmap of "missingness" broken down by "exang"
# First, confirm that the "exang" variable is a categorical variable

is.factor(heart$exang)

## [1] TRUE

# Second, create the visualization
#gg_miss_fct(heart, fct = exang)

STEP 4b. Apply a Scale Data Transform —-

# Scaling Data Transformation Purpose:
# The scaling data transformation is performed to standardize or normalize
# the numeric features in a dataset. It ensures that different features
# with varying scales are brought to a common scale, preventing one feature
# from dominating the others during model training.
#
# Scaling is essential for algorithms that are sensitive to the magnitude
# of input features, such as distance-based methods (e.g., k-Nearest Neighbors)
# or gradient descent-based optimization algorithms (e.g., Support Vector Machines).
#
# By scaling the data, we make the features comparable and contribute equally
# to the model's learning process, improving the stability and performance of
# machine learning models.


# Summary of each variable
summary(heart)

##       age        sex     cp         trestbps          chol       fbs    
##  Min.   :29.00   0:158   0:242   Min.   : 94.0   Min.   :126.0   0:421  
##  1st Qu.:47.00   1:342   1: 80   1st Qu.:120.0   1st Qu.:212.0   1: 79  
##  Median :54.00           2:139   Median :130.0   Median :240.0          
##  Mean   :53.85           3: 39   Mean   :131.4   Mean   :247.9          
##  3rd Qu.:60.00                   3rd Qu.:140.0   3rd Qu.:282.0          
##  Max.   :77.00                   Max.   :200.0   Max.   :564.0          
##  restecg    thalach      exang      oldpeak      slope         ca       thal   
##  0:247   Min.   : 71.0   0:331   Min.   :0.000   0: 37   Min.   :0.00   0:  5  
##  1:244   1st Qu.:132.0   1:169   1st Qu.:0.000   1:232   1st Qu.:0.00   1: 34  
##  2:  9   Median :152.0           Median :0.800   2:231   Median :0.00   2:265  
##          Mean   :148.7           Mean   :1.064           Mean   :0.74   3:196  
##          3rd Qu.:166.0           3rd Qu.:1.600           3rd Qu.:1.00          
##          Max.   :202.0           Max.   :6.200           Max.   :4.00          
##  target   
##  neg:230  
##  pos:270  
##           
##           
##           
##

# BEFORE

# Histogram of every numeric column (age, trestbps, chol, thalach, oldpeak,
# ca) before scaling. Each plot is titled with the column name taken from
# `heart`; the extracted vector itself carries no names, so names() on it
# would return NULL and leave the plot untitled.
# The vector keeps the name `heart_health_variable` because hist() derives
# its x-axis label from the argument expression.
for (numeric_col in c(1, 4, 5, 8, 10, 12)) {
  heart_health_variable <- as.numeric(unlist(heart[, numeric_col]))
  hist(heart_health_variable, main = names(heart)[numeric_col])
}

model_of_the_transform <- preProcess(heart, method = c("scale"))
print(model_of_the_transform)

## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - ignored (8)
##   - scaled (6)

heart_scale_transform <- predict(model_of_the_transform,
                                               heart)


# AFTER 
#1, 4, 5, 8, 10, 12

# Histogram of every numeric column after the scale transform, titled with
# the column name. The vector keeps the name `heart_health_variable` because
# hist() derives its x-axis label from the argument expression.
for (numeric_col in c(1, 4, 5, 8, 10, 12)) {
  heart_health_variable <-
    as.numeric(unlist(heart_scale_transform[, numeric_col]))
  hist(heart_health_variable,
       main = names(heart_scale_transform)[numeric_col])
}

# View() opens a data viewer window, so only call it in an interactive
# session; this keeps non-interactive runs (e.g., knitting) from stalling or
# failing here.
if (interactive()) {
  View(heart_scale_transform)
}

STEP 4c. Apply a Center Data Transform —-

# Centering Data Transformation Purpose:
# Shifts numeric features by subtracting their mean,
# ensuring a centered distribution around zero.
# Useful for algorithms sensitive to variable means,
# promoting better model interpretability.



# Summary of each variable
summary(heart)

##       age        sex     cp         trestbps          chol       fbs    
##  Min.   :29.00   0:158   0:242   Min.   : 94.0   Min.   :126.0   0:421  
##  1st Qu.:47.00   1:342   1: 80   1st Qu.:120.0   1st Qu.:212.0   1: 79  
##  Median :54.00           2:139   Median :130.0   Median :240.0          
##  Mean   :53.85           3: 39   Mean   :131.4   Mean   :247.9          
##  3rd Qu.:60.00                   3rd Qu.:140.0   3rd Qu.:282.0          
##  Max.   :77.00                   Max.   :200.0   Max.   :564.0          
##  restecg    thalach      exang      oldpeak      slope         ca       thal   
##  0:247   Min.   : 71.0   0:331   Min.   :0.000   0: 37   Min.   :0.00   0:  5  
##  1:244   1st Qu.:132.0   1:169   1st Qu.:0.000   1:232   1st Qu.:0.00   1: 34  
##  2:  9   Median :152.0           Median :0.800   2:231   Median :0.00   2:265  
##          Mean   :148.7           Mean   :1.064           Mean   :0.74   3:196  
##          3rd Qu.:166.0           3rd Qu.:1.600           3rd Qu.:1.00          
##          Max.   :202.0           Max.   :6.200           Max.   :4.00          
##  target   
##  neg:230  
##  pos:270  
##           
##           
##           
##

# BEFORE

# Box plot of every numeric column of the scaled data, i.e., the state of the
# data before the center transform is applied. Each plot is titled with the
# column name. After the loop `heart_health_variable` holds column 12 (ca).
for (numeric_col in c(1, 4, 5, 8, 10, 12)) {
  heart_health_variable <-
    as.numeric(unlist(heart_scale_transform[, numeric_col]))
  boxplot(heart_health_variable,
          main = names(heart_scale_transform)[numeric_col])
}

model_of_the_transform <- preProcess(heart_scale_transform, method = c("center"))
print(model_of_the_transform)

## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - centered (6)
##   - ignored (8)

heart_center_transform <- predict(model_of_the_transform,
                                  heart_scale_transform)


# AFTER 
#1, 4, 5, 8, 10, 12

# Box plot of every numeric column after the center transform, titled with
# the column name. Each column is extracted into `heart_health_variable`,
# the same object that is plotted. `heart_scale_transform` must not be used as
# the temporary here: that would replace the scaled data frame with a bare
# vector and leave boxplot() drawing a stale, pre-centering column.
for (numeric_col in c(1, 4, 5, 8, 10, 12)) {
  heart_health_variable <-
    as.numeric(unlist(heart_center_transform[, numeric_col]))
  boxplot(heart_health_variable,
          main = names(heart_center_transform)[numeric_col])
}

# View() opens a data viewer window, so only call it in an interactive
# session; this keeps non-interactive runs (e.g., knitting) from stalling or
# failing here.
if (interactive()) {
  View(heart_center_transform)
}

The Standardize Basic Transform on the heart Dataset —-

# BEFORE
#If you've already scaled and centered your data, you have made it comparable and adjusted for differences in means.
#Standardization is typically done to bring variables to a standard normal distribution (mean of 0 and standard deviation 
#of 1).
summary(heart_center_transform)

##       age           sex     cp         trestbps            chol         fbs    
##  Min.   :-2.75717   0:158   0:242   Min.   :-2.1279   Min.   :-2.4058   0:421  
##  1st Qu.:-0.76019   1:342   1: 80   1st Qu.:-0.6498   1st Qu.:-0.7091   1: 79  
##  Median : 0.01642           2:139   Median :-0.0813   Median :-0.1566          
##  Mean   : 0.00000           3: 39   Mean   : 0.0000   Mean   : 0.0000          
##  3rd Qu.: 0.68208                   3rd Qu.: 0.4872   3rd Qu.: 0.6720          
##  Max.   : 2.56813                   Max.   : 3.8982   Max.   : 6.2356          
##  restecg    thalach        exang      oldpeak        slope         ca         
##  0:247   Min.   :-3.3196   0:331   Min.   :-0.8961   0: 37   Min.   :-0.7009  
##  1:244   1st Qu.:-0.7121   1:169   1st Qu.:-0.8961   1:232   1st Qu.:-0.7009  
##  2:  9   Median : 0.1428           Median :-0.2223   2:231   Median :-0.7009  
##          Mean   : 0.0000           Mean   : 0.0000           Mean   : 0.0000  
##          3rd Qu.: 0.7412           3rd Qu.: 0.4514           3rd Qu.: 0.2463  
##          Max.   : 2.2801           Max.   : 4.3255           Max.   : 3.0878  
##  thal    target   
##  0:  5   neg:230  
##  1: 34   pos:270  
##  2:265            
##  3:196            
##                   
##

sapply(heart_center_transform[, -c(2, 3, 6, 7, 9, 11, 13, 14)], sd)

##      age trestbps     chol  thalach  oldpeak       ca 
##        1        1        1        1        1        1

model_of_the_transform <- preProcess(heart_center_transform,
                                     method = c("scale", "center"))
print(model_of_the_transform)

## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - centered (6)
##   - ignored (8)
##   - scaled (6)

heart_standardize_transform <- predict(model_of_the_transform, # nolint
                                       heart_center_transform)

# AFTER
summary(heart_standardize_transform)

##       age           sex     cp         trestbps            chol         fbs    
##  Min.   :-2.75717   0:158   0:242   Min.   :-2.1279   Min.   :-2.4058   0:421  
##  1st Qu.:-0.76019   1:342   1: 80   1st Qu.:-0.6498   1st Qu.:-0.7091   1: 79  
##  Median : 0.01642           2:139   Median :-0.0813   Median :-0.1566          
##  Mean   : 0.00000           3: 39   Mean   : 0.0000   Mean   : 0.0000          
##  3rd Qu.: 0.68208                   3rd Qu.: 0.4872   3rd Qu.: 0.6720          
##  Max.   : 2.56813                   Max.   : 3.8982   Max.   : 6.2356          
##  restecg    thalach        exang      oldpeak        slope         ca         
##  0:247   Min.   :-3.3196   0:331   Min.   :-0.8961   0: 37   Min.   :-0.7009  
##  1:244   1st Qu.:-0.7121   1:169   1st Qu.:-0.8961   1:232   1st Qu.:-0.7009  
##  2:  9   Median : 0.1428           Median :-0.2223   2:231   Median :-0.7009  
##          Mean   : 0.0000           Mean   : 0.0000           Mean   : 0.0000  
##          3rd Qu.: 0.7412           3rd Qu.: 0.4514           3rd Qu.: 0.2463  
##          Max.   : 2.2801           Max.   : 4.3255           Max.   : 3.0878  
##  thal    target   
##  0:  5   neg:230  
##  1: 34   pos:270  
##  2:265            
##  3:196            
##                   
##

sapply(heart_standardize_transform[, -c(2, 3, 6, 7, 9, 11, 13, 14)], sd)

##      age trestbps     chol  thalach  oldpeak       ca 
##        1        1        1        1        1        1

Box-Cox Transform—-

# BEFORE
summary(heart_standardize_transform)

##       age           sex     cp         trestbps            chol         fbs    
##  Min.   :-2.75717   0:158   0:242   Min.   :-2.1279   Min.   :-2.4058   0:421  
##  1st Qu.:-0.76019   1:342   1: 80   1st Qu.:-0.6498   1st Qu.:-0.7091   1: 79  
##  Median : 0.01642           2:139   Median :-0.0813   Median :-0.1566          
##  Mean   : 0.00000           3: 39   Mean   : 0.0000   Mean   : 0.0000          
##  3rd Qu.: 0.68208                   3rd Qu.: 0.4872   3rd Qu.: 0.6720          
##  Max.   : 2.56813                   Max.   : 3.8982   Max.   : 6.2356          
##  restecg    thalach        exang      oldpeak        slope         ca         
##  0:247   Min.   :-3.3196   0:331   Min.   :-0.8961   0: 37   Min.   :-0.7009  
##  1:244   1st Qu.:-0.7121   1:169   1st Qu.:-0.8961   1:232   1st Qu.:-0.7009  
##  2:  9   Median : 0.1428           Median :-0.2223   2:231   Median :-0.7009  
##          Mean   : 0.0000           Mean   : 0.0000           Mean   : 0.0000  
##          3rd Qu.: 0.7412           3rd Qu.: 0.4514           3rd Qu.: 0.2463  
##          Max.   : 2.2801           Max.   : 4.3255           Max.   : 3.0878  
##  thal    target   
##  0:  5   neg:230  
##  1: 34   pos:270  
##  2:265            
##  3:196            
##                   
##

# Skewness of every numeric column before the Box-Cox step
# (type = 2 is forwarded to skewness()).
vapply(heart_standardize_transform[, -c(2, 3, 6, 7, 9, 11, 13, 14)],
       skewness, numeric(1), type = 2)

##        age   trestbps       chol    thalach    oldpeak         ca 
## -0.1718278  0.7100820  0.8807213 -0.5100814  1.2866519  1.3141907

# **************************************************************************************************************
#  RESULTS
#  Age: The skewness is relatively small (-0.17). You might not need a Box-Cox transformation for age,
#  especially since the skewness is not highly pronounced.
  
#  trestbps (resting blood pressure): The skewness is moderate (0.71). Considering the moderate skewness,
#  you could experiment with a Box-Cox transformation to see if it improves the distribution.
  
#  chol (serum cholesterol): The skewness is moderate to high (0.88). A Box-Cox transformation is often beneficial for 
#  skewed variables, so you might consider applying it to cholesterol.
  
#  thalach (maximum heart rate achieved): The skewness is relatively small (-0.51). A Box-Cox transformation
#  might not be necessary for thalach, especially if the skewness is not a significant concern.
  
#  oldpeak (ST depression induced by exercise): The skewness is high (1.29). Given the high skewness, a Box-Cox 
#  transformation could be beneficial for oldpeak.
  
#  ca (number of major vessels colored by fluoroscopy): The skewness is high (1.31). Similar to cholesterol and 
#  oldpeak, a Box-Cox transformation might be considered for ca.
# **************************************************************************************************************

  
# Plot a histogram of each numeric column to view its skewness before the
# Box-Cox transform. A single loop replaces six copy-pasted stanzas; the column
# positions are age (1), trestbps (4), chol (5), thalach (8), oldpeak (10)
# and ca (12).
for (numeric_column in c(1, 4, 5, 8, 10, 12)) {
  # Keep the name `heart_health_variable`: hist() derives its default x-axis
  # label from the expression it is given.
  heart_health_variable <-
    as.numeric(unlist(heart_standardize_transform[, numeric_column]))
  hist(heart_health_variable,
       main = names(heart_standardize_transform)[numeric_column])
}



# Select the columns of interest
# (positions 12, 10, 5 and 4 are ca, oldpeak, chol and trestbps).
columns_of_interest <- c(12, 10, 5, 4)

# Apply Box-Cox transformation only to selected columns
# NOTE(review): Box-Cox is only defined for strictly positive values, but these
# columns were standardized beforehand and contain negative values, so caret
# appears to skip all of them (the printed model reports 0 variables and the
# skewness is unchanged afterwards). Consider fitting on the unstandardized
# columns or using method = "YeoJohnson" -- confirm before relying on this step.
model_of_the_transform <- preProcess(heart_standardize_transform[, columns_of_interest], method = c("BoxCox"))

# Print the transformation details
print(model_of_the_transform)

## Created from 500 samples and 0 variables
## 
## Pre-processing:
##   - ignored (0)

# Apply the transformation to the selected columns
heart_box_cox_transform <- predict(model_of_the_transform, heart_standardize_transform[, columns_of_interest])




# AFTER
# Skewness of the four columns passed through the Box-Cox step
# (ca, oldpeak, chol, trestbps).

vapply(heart_box_cox_transform, skewness, numeric(1), type = 2)

##        ca   oldpeak      chol  trestbps 
## 1.3141907 1.2866519 0.8807213 0.7100820

# **************************************************************************************************************

#RESULT
#ca: Skewness is approximately 1.31. A positive skewness indicates that the distribution
#has a longer right tail.

#oldpeak: Skewness is approximately 1.29. Similar to "ca," the positive skewness
#suggests a longer right tail in the distribution.

#chol: Skewness is approximately 0.88. Again, positive skewness indicates a longer 
#right tail.

#trestbps: Skewness is approximately 0.71. The positive skewness suggests a longer 
#right tail, but it's less pronounced compared to the other variables.

#NOTE: These values are identical to the skewness measured before the transform, so the
#Box-Cox step did not change any of the four columns.
# **************************************************************************************************************

# Histogram of every column returned by the Box-Cox step. Looping over
# seq_along() replaces four copy-pasted stanzas and keeps working if the set of
# transformed columns changes.
for (transformed_column in seq_along(heart_box_cox_transform)) {
  # Keep the name `heart_health_variable`: hist() derives its default x-axis
  # label from the expression it is given.
  heart_health_variable <-
    as.numeric(unlist(heart_box_cox_transform[, transformed_column]))
  hist(heart_health_variable,
       main = names(heart_box_cox_transform)[transformed_column])
}

PCA for Feature Extraction on the Heart Disease Dataset —-

### PCA for Dimensionality Reduction on the heart Dataset ----
# Combine original and transformed columns
heart_combined <- cbind(heart_standardize_transform[, c("age", "thalach")], heart_box_cox_transform)
summary(heart_combined )

##       age              thalach              ca             oldpeak       
##  Min.   :-2.75717   Min.   :-3.3196   Min.   :-0.7009   Min.   :-0.8961  
##  1st Qu.:-0.76019   1st Qu.:-0.7121   1st Qu.:-0.7009   1st Qu.:-0.8961  
##  Median : 0.01642   Median : 0.1428   Median :-0.7009   Median :-0.2223  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.68208   3rd Qu.: 0.7412   3rd Qu.: 0.2463   3rd Qu.: 0.4514  
##  Max.   : 2.56813   Max.   : 2.2801   Max.   : 3.0878   Max.   : 4.3255  
##       chol            trestbps      
##  Min.   :-2.4058   Min.   :-2.1279  
##  1st Qu.:-0.7091   1st Qu.:-0.6498  
##  Median :-0.1566   Median :-0.0813  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6720   3rd Qu.: 0.4872  
##  Max.   : 6.2356   Max.   : 3.8982

model_of_the_transform <- preProcess(heart_combined, method =
                                       c("scale", "center", "pca"))

print(model_of_the_transform)

## Created from 500 samples and 6 variables
## 
## Pre-processing:
##   - centered (6)
##   - ignored (0)
##   - principal component signal extraction (6)
##   - scaled (6)
## 
## PCA needed 6 components to capture 95 percent of the variance

heart_pca_dr <- predict(model_of_the_transform, heart_combined)

summary(heart_pca_dr)

##       PC1                PC2                PC3                PC4           
##  Min.   :-3.41728   Min.   :-2.77244   Min.   :-4.40309   Min.   :-2.480478  
##  1st Qu.:-1.04054   1st Qu.:-0.70782   1st Qu.:-0.71122   1st Qu.:-0.550784  
##  Median :-0.08947   Median :-0.05989   Median :-0.03299   Median :-0.004927  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.99553   3rd Qu.: 0.63505   3rd Qu.: 0.59697   3rd Qu.: 0.522978  
##  Max.   : 3.45820   Max.   : 4.13484   Max.   : 3.00870   Max.   : 2.765638  
##       PC5                PC6          
##  Min.   :-3.25100   Min.   :-1.48242  
##  1st Qu.:-0.45213   1st Qu.:-0.48877  
##  Median :-0.05723   Median :-0.07673  
##  Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.42679   3rd Qu.: 0.47682  
##  Max.   : 2.84971   Max.   : 1.73697

### PCA for Feature Extraction  ----


# NOTE(review): princomp() treats the rows of its input as observations. Passing
# cor(heart_combined) therefore fits the PCA to the 6 x 6 correlation matrix
# (6 "observations") rather than to the 500 patients, which would explain the
# near-zero standard deviation reported for Comp.6. If a correlation-based PCA
# of the data is intended, princomp(heart_combined, cor = TRUE) is the usual
# form -- confirm before interpreting the components.
heart_pca_fe <- princomp(cor(heart_combined))
summary(heart_pca_fe)

## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4     Comp.5
## Standard deviation     0.7109310 0.3920044 0.3569546 0.3097962 0.22333985
## Proportion of Variance 0.5420891 0.1648153 0.1366601 0.1029361 0.05349931
## Cumulative Proportion  0.5420891 0.7069045 0.8435646 0.9465007 1.00000000
##                              Comp.6
## Standard deviation     5.882502e-09
## Proportion of Variance 3.711419e-17
## Cumulative Proportion  1.000000e+00

# **************************************************************************************************************
#RESULTS
#The PCA results suggest that the first five principal components retain essentially all of the information in the dataset.
#The remaining component (Comp.6) contributes virtually no additional information. I may consider using
#the first five components for further analysis, as they represent the most important features in terms of explaining the
#variance in the data. The negligible contribution of Comp.6 might indicate that this component is not providing
#meaningful information for the analysis.
#Caution: the caret PCA above needed all 6 components to capture 95 percent of the variance, so this
#conclusion should be checked before dropping Comp.6.
# **************************************************************************************************************



#### Scree Plot ----
# The cumulative proportion reaches 100% after Comp.5, indicating that the first five components capture 
#all the variance in the original data.
factoextra::fviz_eig(heart_pca_fe, addlabels = TRUE)

heart_pca_fe$loadings[, 1:5]

##               Comp.1     Comp.2      Comp.3      Comp.4     Comp.5
## age       0.48945115  0.3072417  0.08517206  0.42252444  0.5639681
## thalach  -0.66520939 -0.0904431  0.02928681  0.05066650  0.0582243
## ca        0.35863816 -0.4924230 -0.31097901  0.42411265 -0.5191964
## oldpeak   0.40379185 -0.2934719  0.01815823 -0.77322182  0.1206458
## chol      0.08206685  0.7338168 -0.43306383 -0.20130090 -0.4178283
## trestbps  0.13974934  0.1741534  0.84101490  0.02530656 -0.4688808

unique(rownames(heart_pca_fe$loadings[, 1:5]))

## [1] "age"      "thalach"  "ca"       "oldpeak"  "chol"     "trestbps"

factoextra::fviz_cos2(heart_pca_fe, choice = "var", axes = 1:5)

factoextra::fviz_pca_var(heart_pca_fe, col.var = "cos2",
                         gradient.cols = c("red", "orange", "green"),
                         repel = TRUE)

MCA for Feature Extraction —-

# Names of the columns that hold categorical codes rather than measurements
categorical_cols <- c("sex", "cp", "fbs", "restecg",
                      "exang", "slope", "thal", "target")

# Work on a copy of the data in which those columns are coerced to factors
heart_categorical <- heart
heart_categorical[categorical_cols] <-
  lapply(heart_categorical[categorical_cols], as.factor)

# Run Multiple Correspondence Analysis (MCA) on the eight categorical columns
heart_mca <- MCA(heart_categorical[, c(2, 3, 6, 7, 9, 11, 13, 14)])

# Print the eigenvalues and the individual/category/variable tables
summary(heart_mca)

## 
## Call:
## MCA(X = heart_categorical[, c(2, 3, 6, 7, 9, 11, 13, 14)]) 
## 
## 
## Eigenvalues
##                        Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6   Dim.7
## Variance               0.349   0.165   0.155   0.148   0.133   0.128   0.115
## % of var.             19.940   9.440   8.863   8.454   7.596   7.308   6.551
## Cumulative % of var.  19.940  29.379  38.243  46.697  54.293  61.601  68.153
##                        Dim.8   Dim.9  Dim.10  Dim.11  Dim.12  Dim.13  Dim.14
## Variance               0.111   0.097   0.090   0.084   0.074   0.058   0.044
## % of var.              6.370   5.537   5.123   4.774   4.207   3.302   2.535
## Cumulative % of var.  74.523  80.059  85.182  89.956  94.163  97.465 100.000
## 
## Individuals (the 10 first)
##              Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3    ctr   cos2
## 1         |  0.026  0.000  0.001 | -0.137  0.023  0.020 | -0.066  0.006  0.005
## 2         |  0.607  0.211  0.152 | -0.396  0.190  0.065 |  0.460  0.273  0.088
## 3         | -0.266  0.040  0.092 | -0.042  0.002  0.002 |  0.059  0.005  0.005
## 4         |  0.176  0.018  0.029 | -0.331  0.133  0.101 | -0.069  0.006  0.004
## 5         | -0.378  0.082  0.071 | -0.200  0.048  0.020 |  0.824  0.876  0.336
## 6         |  0.841  0.405  0.273 |  0.357  0.154  0.049 |  0.013  0.000  0.000
## 7         |  0.541  0.168  0.272 | -0.527  0.336  0.258 |  0.115  0.017  0.012
## 8         |  0.176  0.018  0.029 | -0.331  0.133  0.101 | -0.069  0.006  0.004
## 9         |  0.308  0.054  0.117 | -0.035  0.001  0.002 | -0.223  0.064  0.062
## 10        | -0.563  0.182  0.330 | -0.006  0.000  0.000 |  0.000  0.000  0.000
##            
## 1         |
## 2         |
## 3         |
## 4         |
## 5         |
## 6         |
## 7         |
## 8         |
## 9         |
## 10        |
## 
## Categories (the 10 first)
##               Dim.1     ctr    cos2  v.test     Dim.2     ctr    cos2  v.test  
## sex_0     |  -0.649   4.773   0.195  -9.859 |   0.393   3.687   0.071   5.962 |
## sex_1     |   0.300   2.205   0.195   9.859 |  -0.181   1.704   0.071  -5.962 |
## cp_0      |   0.720   8.980   0.486  15.570 |  -0.056   0.115   0.003  -1.211 |
## cp_1      |  -0.967   5.365   0.178  -9.432 |  -0.012   0.002   0.000  -0.116 |
## cp_2      |  -0.688   4.708   0.182  -9.531 |   0.060   0.076   0.001   0.833 |
## cp_3      |  -0.030   0.003   0.000  -0.198 |   0.158   0.147   0.002   1.024 |
## fbs_0     |  -0.059   0.105   0.019  -3.044 |  -0.083   0.435   0.036  -4.260 |
## fbs_1     |   0.315   0.560   0.019   3.044 |   0.440   2.317   0.036   4.260 |
## restecg_0 |   0.278   1.365   0.075   6.130 |   0.120   0.534   0.014   2.638 |
## restecg_1 |  -0.317   1.752   0.096  -6.904 |  -0.304   3.407   0.088  -6.624 |
##             Dim.3     ctr    cos2  v.test  
## sex_0      -0.844  18.127   0.329 -12.810 |
## sex_1       0.390   8.374   0.329  12.810 |
## cp_0       -0.266   2.762   0.066  -5.757 |
## cp_1        0.804   8.325   0.123   7.834 |
## cp_2       -0.452   4.567   0.079  -6.259 |
## cp_3        1.612  16.337   0.220  10.475 |
## fbs_0      -0.106   0.758   0.060  -5.451 |
## fbs_1       0.563   4.040   0.060   5.451 |
## restecg_0  -0.129   0.662   0.016  -2.846 |
## restecg_1   0.162   1.031   0.025   3.531 |
## 
## Categorical variables (eta2)
##             Dim.1 Dim.2 Dim.3  
## sex       | 0.195 0.071 0.329 |
## cp        | 0.532 0.004 0.397 |
## fbs       | 0.019 0.036 0.060 |
## restecg   | 0.104 0.494 0.034 |
## exang     | 0.425 0.021 0.035 |
## slope     | 0.331 0.257 0.256 |
## thal      | 0.512 0.429 0.128 |
## target    | 0.674 0.009 0.003 |

heart_numerical_combined_with_factors <- cbind( heart_combined, heart[, c(2, 3, 6, 7, 9, 11, 13, 14)])
summary(heart_numerical_combined_with_factors)

##       age              thalach              ca             oldpeak       
##  Min.   :-2.75717   Min.   :-3.3196   Min.   :-0.7009   Min.   :-0.8961  
##  1st Qu.:-0.76019   1st Qu.:-0.7121   1st Qu.:-0.7009   1st Qu.:-0.8961  
##  Median : 0.01642   Median : 0.1428   Median :-0.7009   Median :-0.2223  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.68208   3rd Qu.: 0.7412   3rd Qu.: 0.2463   3rd Qu.: 0.4514  
##  Max.   : 2.56813   Max.   : 2.2801   Max.   : 3.0878   Max.   : 4.3255  
##       chol            trestbps       sex     cp      fbs     restecg exang  
##  Min.   :-2.4058   Min.   :-2.1279   0:158   0:242   0:421   0:247   0:331  
##  1st Qu.:-0.7091   1st Qu.:-0.6498   1:342   1: 80   1: 79   1:244   1:169  
##  Median :-0.1566   Median :-0.0813           2:139           2:  9          
##  Mean   : 0.0000   Mean   : 0.0000           3: 39                          
##  3rd Qu.: 0.6720   3rd Qu.: 0.4872                                          
##  Max.   : 6.2356   Max.   : 3.8982                                          
##  slope   thal    target   
##  0: 37   0:  5   neg:230  
##  1:232   1: 34   pos:270  
##  2:231   2:265            
##          3:196            
##                           
##

ICA for Dimensionality Reduction on the Heart Disease Dataset —-

summary(heart_numerical_combined_with_factors )

##       age              thalach              ca             oldpeak       
##  Min.   :-2.75717   Min.   :-3.3196   Min.   :-0.7009   Min.   :-0.8961  
##  1st Qu.:-0.76019   1st Qu.:-0.7121   1st Qu.:-0.7009   1st Qu.:-0.8961  
##  Median : 0.01642   Median : 0.1428   Median :-0.7009   Median :-0.2223  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.68208   3rd Qu.: 0.7412   3rd Qu.: 0.2463   3rd Qu.: 0.4514  
##  Max.   : 2.56813   Max.   : 2.2801   Max.   : 3.0878   Max.   : 4.3255  
##       chol            trestbps       sex     cp      fbs     restecg exang  
##  Min.   :-2.4058   Min.   :-2.1279   0:158   0:242   0:421   0:247   0:331  
##  1st Qu.:-0.7091   1st Qu.:-0.6498   1:342   1: 80   1: 79   1:244   1:169  
##  Median :-0.1566   Median :-0.0813           2:139           2:  9          
##  Mean   : 0.0000   Mean   : 0.0000           3: 39                          
##  3rd Qu.: 0.6720   3rd Qu.: 0.4872                                          
##  Max.   : 6.2356   Max.   : 3.8982                                          
##  slope   thal    target   
##  0: 37   0:  5   neg:230  
##  1:232   1: 34   pos:270  
##  2:231   2:265            
##          3:196            
##                           
##

model_of_the_transform <- preProcess(heart_numerical_combined_with_factors ,
                                     method = c("scale", "center", "ica"),
                                     n.comp = 6)
print(model_of_the_transform)

## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - centered (6)
##   - independent component signal extraction (6)
##   - ignored (8)
##   - scaled (6)
## 
## ICA used 6 components

heart_numerical_combined_with_factors_ica_dr <- predict(model_of_the_transform, heart_numerical_combined_with_factors )

summary(heart_numerical_combined_with_factors_ica_dr)

##  sex     cp      fbs     restecg exang   slope   thal    target   
##  0:158   0:242   0:421   0:247   0:331   0: 37   0:  5   neg:230  
##  1:342   1: 80   1: 79   1:244   1:169   1:232   1: 34   pos:270  
##          2:139           2:  9           2:231   2:265            
##          3: 39                                   3:196            
##                                                                   
##                                                                   
##       ICA1               ICA2              ICA3               ICA4        
##  Min.   :-6.37988   Min.   :-2.4012   Min.   :-4.20026   Min.   :-2.2059  
##  1st Qu.:-0.59951   1st Qu.:-0.7661   1st Qu.:-0.62554   1st Qu.:-0.5649  
##  Median : 0.08156   Median : 0.0411   Median : 0.06527   Median :-0.1122  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.63948   3rd Qu.: 0.7562   3rd Qu.: 0.59384   3rd Qu.: 0.4460  
##  Max.   : 2.80818   Max.   : 3.0843   Max.   : 2.38632   Max.   : 3.7516  
##       ICA5              ICA6        
##  Min.   :-4.1307   Min.   :-2.0626  
##  1st Qu.:-0.4323   1st Qu.:-0.7113  
##  Median : 0.1016   Median :-0.2247  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.5648   3rd Qu.: 0.7633  
##  Max.   : 2.7263   Max.   : 2.5939

# ************************************************************************************************************************************

#RESULTS
# Summary of the Transformed Dataset after Independent Component Analysis (ICA)

# Categorical Variables:
# - sex, cp, fbs, restecg, exang, slope, thal, target are categorical variables, shown with the count for each level.

# Independent Component Analysis (ICA) Components:
# - ICA1 to ICA6 represent the Independent Component Analysis components.
# - For each component, descriptive statistics are provided:
#   - Minimum, 1st Quartile, Median, Mean, 3rd Quartile, Maximum.

# Example for ICA1:
#   ICA1: Min. -6.3799, 1st Qu. -0.5995, Median 0.0816, Mean 0.0000, 3rd Qu. 0.6395, Max. 2.8082.

# These statistics offer insights into the distribution and characteristics of the transformed variables after applying ICA.
# ************************************************************************************************************************************

Milestone 3: Training the Model (Lab 5-8) —-

1. Split the dataset ====

heart_preprocessed <- heart_numerical_combined_with_factors

str(heart_preprocessed)

## 'data.frame':    500 obs. of  14 variables:
##  $ age     : num  -0.316 -1.759 0.571 -0.538 -0.871 ...
##  $ thalach : num  1.596 1.425 1.425 -0.413 0.314 ...
##  $ ca      : num  -0.701 -0.701 -0.701 2.141 -0.701 ...
##  $ oldpeak : num  -0.896 2.304 -0.896 0.788 -0.896 ...
##  $ chol    : num  0.258 -0.334 0.455 -1.183 -1.005 ...
##  $ trestbps: num  0.487 -0.65 0.374 -0.65 -1.73 ...
##  $ sex     : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ cp      : Factor w/ 4 levels "0","1","2","3": 1 4 1 3 2 1 1 3 1 3 ...
##  $ fbs     : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
##  $ restecg : Factor w/ 3 levels "0","1","2": 1 2 1 2 2 2 2 2 1 1 ...
##  $ exang   : Factor w/ 2 levels "0","1": 2 2 1 1 1 2 2 1 1 1 ...
##  $ slope   : Factor w/ 3 levels "0","1","2": 3 2 3 2 3 2 3 2 2 3 ...
##  $ thal    : Factor w/ 4 levels "0","1","2","3": 3 4 3 4 4 2 4 4 3 3 ...
##  $ target  : Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 1 1 1 2 ...

# Define a 75:25 train:test data split of the dataset.
# That is, 75% of the original data will be used to train the model and
# 25% of the original data will be used to test the model.
# createDataPartition() samples at random, so the seed is fixed first;
# without it every run yields a different split and different downstream
# metrics. NOTE: figures printed in this report came from an unseeded run and
# will differ once the report is re-knitted.
set.seed(7)
train_index <- createDataPartition(heart_preprocessed$target,
                                   p = 0.75,
                                   list = FALSE)
# Rows picked by the partition form the training set; the rest are held out.
heart_preprocessed_train <- heart_preprocessed[train_index, ]
heart_preprocessed_test <- heart_preprocessed[-train_index, ]

2. Train a Naive Bayes classifier using the training dataset —-

# Fit a Naive Bayes classifier through caret on the training partition,
# modelling `target` from the 13 predictor columns.
heart_model_nb_caret <- caret::train( # nolint
  target ~ .,
  data = heart_preprocessed_train[, c(
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"
  )],
  method = "naive_bayes"
)

3. Test the trained model using the testing dataset —-

### 3.b. Test the trained caret Naive Bayes model using the testing dataset ----
# Predict a class for every test row from the 13 predictor columns
# (`target` itself is left out of the input).
predictions_nb_caret <- predict(
  heart_model_nb_caret,
  heart_preprocessed_test[, c(
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal"
  )]
)

Determine the Baseline Accuracy —-

heart_freq <- heart_preprocessed_train$target
cbind(frequency =
        table(heart_freq),
      percentage = prop.table(table(heart_freq)) * 100)

##     frequency percentage
## neg       173   46.01064
## pos       203   53.98936

Model Training and Evaluation using Generalized Linear Model (GLM)

# The glm function in this code is used to train a Generalized Linear Model
# for predicting the target variable based on the provided features in the training dataset.
# The subsequent printing of model details and performance metrics allows for an assessment of its effectiveness.
# Set up train control with classProbs = TRUE
train_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3, classProbs = TRUE)


set.seed(7)
heart_model_glm <-
  train(target ~ ., data = heart_preprocessed_train, method = "glm",
        metric = "Accuracy", trControl = train_control)


print(heart_model_glm)

## Generalized Linear Model 
## 
## 376 samples
##  13 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 339, 337, 338, 338, 338, 339, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8354263  0.6670537

LDA —-

# Set the seed for reproducibility
set.seed(7)

# Train a Linear Discriminant Analysis (LDA) model using the specified training control
heart_model_lda <- train(target ~ ., data = heart_preprocessed_train,
                         method = "lda", trControl = train_control)

CART —-

# Set the seed for reproducibility
set.seed(7)

# Train a Classification and Regression Trees (CART) model using the specified training control
heart_model_cart <- train(target ~ ., data = heart_preprocessed_train,
                          method = "rpart", trControl = train_control)

KNN —-

# Set the seed for reproducibility
set.seed(7)

# Train a k-Nearest Neighbors (knn) model using the specified training control
heart_model_knn <- train(target ~ ., data = heart_preprocessed_train,
                         method = "knn", trControl = train_control)

Random Forest —-

# Set the seed for reproducibility
set.seed(7)

# Train a Random Forest (rf) model using the specified training control
heart_model_rf <- train(target ~ ., data = heart_preprocessed_train,
                        method = "rf", trControl = train_control)

Call the resamples Function —-

# We then create a list of the model results and pass the list as an argument
# to the `resamples` function.

results <- resamples(list(LDA = heart_model_lda, CART = heart_model_cart,
                          KNN = heart_model_knn, GLM = heart_model_glm,
                          RF = heart_model_rf))

Add ROC curves for each model to the plot

# Install and load the pROC package 
# install.packages("pROC")
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# Predicted class probabilities on the held-out test set, one data frame per
# model. Columns 1:13 are the predictors; column 14 (`target`) is excluded.
# The blank plot(0, 0, type = "n") canvas that used to open this chunk was
# dropped: plot() on a roc object starts a new figure unless add = TRUE is
# given, so the canvas only produced an empty plot.
predictions_lda <- predict(heart_model_lda, heart_preprocessed_test[, 1:13], type = "prob")
predictions_cart <- predict(heart_model_cart, heart_preprocessed_test[, 1:13], type = "prob")
predictions_knn <- predict(heart_model_knn, heart_preprocessed_test[, 1:13], type = "prob")
predictions_glm <- predict(heart_model_glm, heart_preprocessed_test[, 1:13], type = "prob")
predictions_rf <- predict(heart_model_rf, heart_preprocessed_test[, 1:13], type = "prob")



# Build a ROC object for each model from its predicted class probabilities
roc_curve_lda <- roc(heart_preprocessed_test$target, predictions_lda$neg)

## Setting levels: control = neg, case = pos

## Setting direction: controls > cases

roc_curve_cart <- roc(heart_preprocessed_test$target, predictions_cart$neg)

## Setting levels: control = neg, case = pos
## Setting direction: controls > cases

roc_curve_knn <- roc(heart_preprocessed_test$target, predictions_knn$neg)

## Setting levels: control = neg, case = pos
## Setting direction: controls > cases

roc_curve_glm <- roc(heart_preprocessed_test$target, predictions_glm$neg)

## Setting levels: control = neg, case = pos
## Setting direction: controls > cases

roc_curve_rf <- roc(heart_preprocessed_test$target, predictions_rf$neg)

## Setting levels: control = neg, case = pos
## Setting direction: controls > cases

# Plot each ROC curve in its own figure, with a different colour per model.
# All five calls use the same AUC label position and the same line width
# (the GLM curve previously omitted lwd and was drawn thinner than the rest).
plot(roc_curve_lda, main = "ROC curve for LDA", print.auc = TRUE,
     print.auc.x = 0.6, print.auc.y = 0.6, col = "red", lwd = 2.5)

plot(roc_curve_cart, main = "ROC curve for CART", print.auc = TRUE,
     print.auc.x = 0.6, print.auc.y = 0.6, col = "blue", lwd = 2.5)

plot(roc_curve_knn, main = "ROC curve for KNN", print.auc = TRUE,
     print.auc.x = 0.6, print.auc.y = 0.6, col = "green", lwd = 2.5)

plot(roc_curve_glm, main = "ROC curve for GLM", print.auc = TRUE,
     print.auc.x = 0.6, print.auc.y = 0.6, col = "purple", lwd = 2.5)

plot(roc_curve_rf, main = "ROC curve for RF", print.auc = TRUE,
     print.auc.x = 0.6, print.auc.y = 0.6, col = "orange", lwd = 2.5)

Display the Results —-

## 1. Table Summary ----
# This is the simplest comparison. It creates a table with one model per row
# and its corresponding evaluation metrics displayed per column.

summary(results)

## 
## Call:
## summary.resamples(object = results)
## 
## Models: LDA, CART, KNN, GLM, RF 
## Number of resamples: 30 
## 
## Accuracy 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## LDA  0.7027027 0.7683144 0.8399716 0.8388676 0.9189189 0.9729730    0
## CART 0.6486486 0.7046586 0.7368421 0.7439764 0.7837838 0.8684211    0
## KNN  0.6756757 0.7567568 0.7894737 0.8078291 0.8860242 0.9729730    0
## GLM  0.7027027 0.7631579 0.8421053 0.8354263 0.8940256 0.9729730    0
## RF   0.7631579 0.8918919 0.9220648 0.9212137 0.9729730 1.0000000    0
## 
## Kappa 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## LDA  0.3879699 0.5319792 0.6705086 0.6721683 0.8345753 0.9458272    0
## CART 0.2701062 0.4168520 0.4765681 0.4883384 0.5731249 0.7382920    0
## KNN  0.3412463 0.5124451 0.5765885 0.6137426 0.7691556 0.9453471    0
## GLM  0.3934426 0.5236769 0.6788630 0.6670537 0.7886056 0.9458272    0
## RF   0.5289256 0.7822645 0.8441422 0.8414560 0.9453471 1.0000000    0

## 2. Box and Whisker Plot ----
# This is useful for visually observing the spread of the estimated accuracies
# for different algorithms and how they relate.

scales <- list(x = list(relation = "free"), y = list(relation = "free"))
bwplot(results, scales = scales)

## 3. Dot Plots ----
# They show both the mean estimated accuracy as well as the 95% confidence
# interval (e.g. the range in which 95% of observed scores fell).

scales <- list(x = list(relation = "free"), y = list(relation = "free"))
dotplot(results, scales = scales)

## 4. Scatter Plot Matrix ----
# This is useful when considering whether the predictions from two
# different algorithms are correlated. If weakly correlated, then they are good
# candidates for being combined in an ensemble prediction.

splom(results)

## 5. Pairwise xyPlots ----
# You can zoom in on one pairwise comparison of the accuracy of trial-folds for
# two models using an xyplot.

# xyplot plots to compare models
xyplot(results, models = c("LDA", "RF"))

# or
# xyplot plots to compare models
xyplot(results, models = c("GLM", "CART"))

Statistical Significance Tests —-

# Explanation:
# - Upper diagonal: Estimates of the difference in accuracy and kappa values between models.
# - Lower diagonal: P-values for testing the hypothesis that the difference is zero.

# Accuracy:
# - Positive values indicate higher accuracy in the row model compared to the column model.
# - Negative values indicate lower accuracy.

# Kappa:
# - Positive values indicate higher kappa values in the row model compared to the column model.
# - Negative values indicate lower kappa values.

# P-values:
# - Assess the statistical significance of the observed differences.
# - Values below the significance level (e.g., 0.05) suggest a significant difference.
# - Bonferroni adjustment is applied for multiple comparisons.
diffs <- diff(results)

summary(diffs)

## 
## Call:
## summary.diff.resamples(object = diffs)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## Accuracy 
##      LDA       CART      KNN       GLM       RF       
## LDA             0.094891  0.031039  0.003441 -0.082346
## CART 4.763e-07           -0.063853 -0.091450 -0.177237
## KNN  0.03721   5.723e-05           -0.027597 -0.113385
## GLM  1.00000   7.732e-07 0.12566             -0.085787
## RF   1.874e-06 9.436e-16 5.211e-09 3.641e-06          
## 
## Kappa 
##      LDA       CART      KNN       GLM       RF       
## LDA             0.183830  0.058426  0.005115 -0.169288
## CART 1.473e-06           -0.125404 -0.178715 -0.353118
## KNN  0.06184   7.865e-05           -0.053311 -0.227713
## GLM  1.00000   1.744e-06 0.16237             -0.174402
## RF   1.726e-06 1.033e-15 5.161e-09 3.456e-06

Apply a “Random Search” to identify the best parameter value —-

# Define train control settings for a random search
train_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3, search = "random")

# Set a seed for reproducibility
set.seed(7)

# Perform a random search for Random Forest hyperparameters
heart_model_random_search_rf <- train(target ~ ., data = heart_preprocessed_train, method = "rf",
                                      metric = "Accuracy",
                                      tuneLength = 12,  # Search 12 options for the value of mtry
                                      trControl = train_control)

# Display the details of the random search model
print(heart_model_random_search_rf)

## Random Forest 
## 
## 376 samples
##  13 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 339, 337, 338, 338, 338, 339, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9186521  0.8362982
##    3    0.9185347  0.8361989
##    7    0.9203365  0.8396493
##    8    0.9194818  0.8381586
##   12    0.9229918  0.8450439
##   15    0.9212374  0.8414960
##   18    0.9159032  0.8308722
##   19    0.9168041  0.8327416
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 12.

# Plot the results of the random search, i.e. the tuned model object printed
# above
plot(heart_model_random_search_rf)

Bagged CART —-

## 2.a. Bagged CART ----
# Bagged decision trees ("treebag"), resampled with the `train_control`
# settings that are already in scope
set.seed(7)
heart_model_bagged_cart <- train(
  target ~ .,
  data = heart_preprocessed_train,
  method = "treebag",
  metric = "Accuracy",
  trControl = train_control
)

## 2.b. Random Forest ----
# Reset the seed so this fit starts from the same RNG state as the bagged model
set.seed(7)
heart_model_rf <- train(
  target ~ .,
  data = heart_preprocessed_train,
  method = "rf",
  metric = "Accuracy",
  trControl = train_control
)


# Summarize results: gather the resampling results of both ensembles under
# readable labels so they can be compared side by side
bagging_results <- resamples(
  list(
    "Bagged Decision Tree" = heart_model_bagged_cart,
    "Random Forest" = heart_model_rf
  )
)

summary(bagging_results)

## 
## Call:
## summary.resamples(object = bagging_results)
## 
## Models: Bagged Decision Tree, Random Forest 
## Number of resamples: 30 
## 
## Accuracy 
##                           Min.   1st Qu.    Median      Mean   3rd Qu. Max.
## Bagged Decision Tree 0.7894737 0.8918919 0.9199858 0.9124394 0.9402287    1
## Random Forest        0.7631579 0.8918919 0.9220648 0.9177274 0.9473684    1
##                      NA's
## Bagged Decision Tree    0
## Random Forest           0
## 
## Kappa 
##                           Min.   1st Qu.    Median      Mean   3rd Qu. Max.
## Bagged Decision Tree 0.5789474 0.7828288 0.8388970 0.8242421 0.8801471    1
## Random Forest        0.5263158 0.7822645 0.8441422 0.8345238 0.8947368    1
##                      NA's
## Bagged Decision Tree    0
## Random Forest           0

# Dot plot of the resampled metrics collected in `bagging_results`
dotplot(bagging_results)

# We then print the details of the model that has been created, i.e. the
# fitted random forest stored in the `finalModel` element of the caret model
print(heart_model_rf$finalModel)

## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 7
## 
##         OOB estimate of  error rate: 6.91%
## Confusion matrix:
##     neg pos class.error
## neg 159  14  0.08092486
## pos  12 191  0.05911330

Test the Model —-

# Test the model: predict the class of every row in the test data
set.seed(9)
predictions <- predict(object = heart_model_rf,
                       newdata = heart_preprocessed_test)

# Cross-tabulate the predicted classes against the actual `target` values.
# NOTE(review): the printed output reports 'neg' as the 'Positive' class, so
# Sensitivity/Specificity below are relative to 'neg' -- confirm whether
# `positive = "pos"` was intended.
confusionMatrix(data = predictions,
                reference = heart_preprocessed_test$target)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg  53   2
##        pos   4  65
##                                          
##                Accuracy : 0.9516         
##                  95% CI : (0.8977, 0.982)
##     No Information Rate : 0.5403         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9023         
##                                          
##  Mcnemar's Test P-Value : 0.6831         
##                                          
##             Sensitivity : 0.9298         
##             Specificity : 0.9701         
##          Pos Pred Value : 0.9636         
##          Neg Pred Value : 0.9420         
##              Prevalence : 0.4597         
##          Detection Rate : 0.4274         
##    Detection Prevalence : 0.4435         
##       Balanced Accuracy : 0.9500         
##                                          
##        'Positive' Class : neg            
##

Save and Load your Model —-

# Saving a model to a file allows you to load it later and use it to make
# predictions. `saveRDS()` writes the model object to disk and `readRDS()`
# reads it back.
# NOTE(review): the relative path assumes a `models` directory one level above
# the working directory -- confirm it exists before running.

saveRDS(object = heart_model_rf,
        file = "../models/saved_heart_model_rf.rds")

# The saved model can then be loaded later as follows:
loaded_heart_model_rf <- readRDS(file = "../models/saved_heart_model_rf.rds")

# Printing the loaded object shows the same tuning summary as the saved model
print(loaded_heart_model_rf)

## Random Forest 
## 
## 376 samples
##  13 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 339, 337, 338, 338, 338, 339, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9168278  0.8324670
##    7    0.9177274  0.8345238
##   15    0.9158794  0.8309357
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.

# Repeat the test-set evaluation with the model that was read back from disk
predictions_with_loaded_model <- predict(
  object = loaded_heart_model_rf,
  newdata = heart_preprocessed_test
)
confusionMatrix(
  data = predictions_with_loaded_model,
  reference = heart_preprocessed_test$target
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg  53   2
##        pos   4  65
##                                          
##                Accuracy : 0.9516         
##                  95% CI : (0.8977, 0.982)
##     No Information Rate : 0.5403         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9023         
##                                          
##  Mcnemar's Test P-Value : 0.6831         
##                                          
##             Sensitivity : 0.9298         
##             Specificity : 0.9701         
##          Pos Pred Value : 0.9636         
##          Neg Pred Value : 0.9420         
##              Prevalence : 0.4597         
##          Detection Rate : 0.4274         
##    Detection Prevalence : 0.4435         
##       Balanced Accuracy : 0.9500         
##                                          
##        'Positive' Class : neg            
##

Creating Functions in R —-

# Plumber requires functions. The general syntax for creating a function in R
# is illustrated by this placeholder:

name_of_function <- function(arg) {
  # Do something with the argument called `arg`; this placeholder does nothing
  # with it and returns NULL
  NULL
}

Make Predictions on New Data using the Saved Model —-

# We can also create and use our own data frame as follows. The categorical
# variables (sex, cp, fbs, restecg, exang, slope, thal) are converted to
# factors as the data frame is built; the remaining variables stay numeric.
to_be_predicted <- data.frame(
  age = 70,
  sex = as.factor(1),
  cp = as.factor(0),
  trestbps = 145,
  chol = 174,
  fbs = as.factor(0),
  restecg = as.factor(1),
  thalach = 125,
  exang = as.factor(1),
  oldpeak = 2.6,
  slope = as.factor(0),
  ca = 0,
  thal = as.factor(3)
)

# We then use the data frame to make predictions
predict(object = loaded_heart_model_rf, newdata = to_be_predicted)

## [1] neg
## Levels: neg pos

STEP 8. Make predictions using the model through a function —-

# An alternative is to wrap the steps above in a function and call that
# function to make predictions.
#
# Each `arg_*` parameter supplies the value of the predictor with the same
# name (e.g. `arg_age` -> `age`). The function returns whatever `predict()`
# returns for the loaded model `loaded_heart_model_rf`, which is read from the
# enclosing environment.

predict_target <- function(arg_age, arg_sex, arg_cp, arg_trestbps, arg_chol,
                           arg_fbs, arg_restecg, arg_thalach, arg_exang, arg_oldpeak, arg_slope,
                           arg_ca, arg_thal) {
  # Build the data frame from the arguments, converting the categorical
  # variables to factors on the way in
  new_patient <- data.frame(
    age = arg_age,
    sex = as.factor(arg_sex),
    cp = as.factor(arg_cp),
    trestbps = arg_trestbps,
    chol = arg_chol,
    fbs = as.factor(arg_fbs),
    restecg = as.factor(arg_restecg),
    thalach = arg_thalach,
    exang = as.factor(arg_exang),
    oldpeak = arg_oldpeak,
    slope = as.factor(arg_slope),
    ca = arg_ca,
    thal = as.factor(arg_thal)
  )

  # Make a prediction based on the data frame
  predict(loaded_heart_model_rf, newdata = new_patient)
}

Plumber API

# STEP 1. Install and Load the Required Packages ----
## plumber ----
# `require()` attaches the package and returns FALSE (rather than raising an
# error) when the package is not installed. In that case install it and then
# attach it, so that plumber is available in this session either way; the
# previous version installed the package but never loaded it.
if (!require("plumber")) {
  install.packages("plumber", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
  library("plumber")
}

## Loading required package: plumber

## caret ----
# `require()` attaches the package and returns FALSE (rather than raising an
# error) when the package is not installed. In that case install it and then
# attach it, so that caret is available in this session either way; the
# previous version installed the package but never loaded it.
if (!require("caret")) {
  install.packages("caret", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
  library("caret")
}


### LAB 12
# This requires the "plumber" package that was installed and loaded earlier in
# STEP 1. The `#*` comments further below are the annotations that make
# plumber recognize the code as the definition of an API.

# Read the saved model from disk; the API function defined below uses it to
# make its predictions.
loaded_heart_model_rf <- readRDS("../models/saved_heart_model_rf.rds")

#* @apiTitle CVDetect Prediction Model API

#* @apiDescription Used to predict whether a patient has heart disease or not.

#* @param arg_age Age of the patient
#* @param arg_sex Gender of the patient (0: Female, 1: Male)
#* @param arg_cp Chest pain type (0-3)
#* @param arg_trestbps Resting blood pressure (mm Hg)
#* @param arg_chol Serum cholesterol (mg/dl)
#* @param arg_fbs Fasting blood sugar (> 120 mg/dl, 0: No, 1: Yes)
#* @param arg_restecg Resting electrocardiographic results (0-2)
#* @param arg_thalach Maximum heart rate achieved
#* @param arg_exang Exercise-induced angina (0: No, 1: Yes)
#* @param arg_oldpeak ST depression induced by exercise relative to rest
#* @param arg_slope Slope of the peak exercise ST segment (0-2)
#* @param arg_ca Number of major vessels colored by fluoroscopy (0-3)
#* @param arg_thal Thallium stress test result (3: Normal, 6: Fixed defect, 7: Reversible defect)


#* @get /target

predict_diabetes <-
  function(arg_age, arg_sex, arg_cp, arg_trestbps, arg_chol,
           arg_fbs, arg_restecg, arg_thalach, arg_exang, arg_oldpeak, arg_slope,
           arg_ca, arg_thal) {
    # Handler for the route declared in the `#*` annotations preceding this
    # definition. Each argument is coerced to the type the data frame needs
    # (numeric or factor) before prediction.
    # NOTE(review): the API description speaks of heart disease, so the
    # `predict_diabetes` name looks like a leftover -- confirm before renaming.

    # Continuous / count-like inputs are coerced to numeric
    numeric_inputs <- lapply(
      list(age = arg_age, trestbps = arg_trestbps, chol = arg_chol,
           thalach = arg_thalach, oldpeak = arg_oldpeak, ca = arg_ca),
      as.numeric
    )

    # Categorical inputs are coerced to factors
    factor_inputs <- lapply(
      list(sex = arg_sex, cp = arg_cp, fbs = arg_fbs, restecg = arg_restecg,
           exang = arg_exang, slope = arg_slope, thal = arg_thal),
      as.factor
    )

    # Assemble the data frame with its columns in the fixed predictor order
    feature_order <- c("age", "sex", "cp", "trestbps", "chol", "fbs",
                       "restecg", "thalach", "exang", "oldpeak", "slope",
                       "ca", "thal")
    new_patient <- do.call(
      data.frame,
      c(numeric_inputs, factor_inputs)[feature_order]
    )

    # Make a prediction based on the data frame
    predict(loaded_heart_model_rf, newdata = new_patient)
  }

#Run Plumber

# STEP 1. Install and load the required packages ----
## plumber ----
# `require()` attaches the package and returns FALSE (rather than raising an
# error) when the package is not installed. In that case install it and then
# attach it, so that plumber is available in this session either way; the
# previous version installed the package but never loaded it.
if (!require("plumber")) {
  install.packages("plumber", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
  library("plumber")
}

# STEP 2. Process a Plumber API ----
# This allows us to process a plumber API
#api <- plumber::plumb("lab12.R")

# STEP 3. Run the API on a specific port ----
# Specify a constant localhost port to use
#api$run(host = "127.0.0.1", port = 5022)

PHP output

# Heart Disease Prediction Web Form
# Write an HTML <iframe> tag (800 x 600) to the output; its `src` is the PHP
# page served from localhost which, going by its file name, consumes the
# Plumber API output -- presumably this requires the local web server to be
# running; verify.


cat('<iframe src="http://localhost/lab15/Lab15-submission-ConsumePlumberAPIOutput.php" width="800" height="600"></iframe>')