Business Intelligence Project

Myriam Aicha Mbongo 10/16/2023

CARDIOVASCULAR DISEASE PREDICTION PROJECT
- Student Details
STEP 1. Install and Load the Required Packages —-
Milestone 1 : Exploratory Data Analysis (Lab 1 and 2)
Milestone 2: Preprocessing and Data Transformation (Lab 3 and 4) —-
- STEP 4. Create a subset of the variables/features —-

CARDIOVASCULAR DISEASE PREDICTION PROJECT

Student Details

Student ID Numbers and Names of Group Members	GitHub Classroom Group Name
2. 134141 - C - Aicha Mbongo

Course Code	Course Name	Program	Semester Duration
BBT4206	Business Intelligence II	Bachelor of Business Information Technology	21^st August 2023 to 28^th November 2023

STEP 1. Install and Load the Required Packages —-

# STEP 1. Install and Load the Required Packages ----
# The following packages should be installed and loaded before proceeding to the
# subsequent steps.

## readr ----
if (require("readr")) {
  require("readr")
} else {
  install.packages("readr", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}

## Loading required package: readr

## caret ----
if (require("caret")) {
  require("caret")
} else {
  install.packages("caret", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}

## Loading required package: caret

## Loading required package: ggplot2

## Loading required package: lattice

## e1071 ----
if (require("e1071")) {
  require("e1071")
} else {
  install.packages("e1071", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}

## Loading required package: e1071

## factoextra ----
if (require("factoextra")) {
  require("factoextra")
} else {
  install.packages("factoextra", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}

## Loading required package: factoextra

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

## FactoMineR ----
if (require("FactoMineR")) {
  require("FactoMineR")
} else {
  install.packages("FactoMineR", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}

## Loading required package: FactoMineR

if (!is.element("NHANES", installed.packages()[, 1])) {
  install.packages("NHANES", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("NHANES")

## Loading required package: NHANES

## dplyr ----
if (!is.element("dplyr", installed.packages()[, 1])) {
  install.packages("dplyr", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("dplyr")

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## naniar ----
# Documentation:
#   https://cran.r-project.org/package=naniar or
#   https://www.rdocumentation.org/packages/naniar/versions/1.0.0
if (!is.element("naniar", installed.packages()[, 1])) {
  install.packages("naniar", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("naniar")

## Loading required package: naniar

## ggplot2 ----
# We require the "ggplot2" package to create more appealing visualizations
if (!is.element("ggplot2", installed.packages()[, 1])) {
  install.packages("ggplot2", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("ggplot2")

## MICE ----
# We use the MICE package to perform data imputation
if (!is.element("mice", installed.packages()[, 1])) {
  install.packages("mice", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("mice")

## Loading required package: mice

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

## Amelia ----
if (!is.element("Amelia", installed.packages()[, 1])) {
  install.packages("Amelia", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("Amelia")

## Loading required package: Amelia

## Loading required package: Rcpp

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

Milestone 1 : Exploratory Data Analysis (Lab 1 and 2)

STEP 3. Load the downloaded sample datasets

library(readr)
heart <- read_csv(
  "../data/heart.csv",
  col_types = cols(
    age = col_double(),
    sex = col_factor(levels = c("0", "1")),
    cp = col_factor(levels = c("0", "1", "2", "3")),
    trestbps = col_double(),
    chol = col_double(),
    fbs = col_factor(levels = c("0", "1")),
    restecg = col_factor(levels = c("0", "1", "2")),
    thalach = col_double(),
    exang = col_factor(levels = c("0", "1")),
    oldpeak = col_double(),
    slope = col_factor(levels = c("0", "1", "2")),
    ca = col_double(),
    thal = col_factor(levels = c("0", "1", "2", "3")),
    target = col_factor(levels = c("neg", "pos"))
  )
)

#View(heart)

STEP 3a. Preview the Loaded Datasets, Identify the Data Types —-

### STEP 3a. Pre#View the Loaded Datasets, Identify the Data Types  ----
# Dimensions refer to the number of observations (rows) and the number of
# attributes/variables/features (columns).
#Understanding data types is key for effective analysis.It helps choose suitable visualizations and algorithms,  
#and highlights the need for conversions between categorical and numerical data when necessary.


dim(heart)

## [1] 1025   14

sapply(heart, class)

##       age       sex        cp  trestbps      chol       fbs   restecg   thalach 
## "numeric"  "factor"  "factor" "numeric" "numeric"  "factor"  "factor" "numeric" 
##     exang   oldpeak     slope        ca      thal    target 
##  "factor" "numeric"  "factor" "numeric"  "factor"  "factor"

STEP 3b. Identify the number of instances that belong to each class. —-

# It is more sensible to count categorical variables (factors or dimensions)
# than numeric variables, e.g., counting the number of male and female
# participants instead of counting the frequency of each participant’s height.

heart_freq <- heart$target
cbind(frequency = table(heart_freq),
      percentage = prop.table(table(heart_freq)) * 100)

##     frequency percentage
## neg       499   48.68293
## pos       526   51.31707

STEP 3c. Measures of Central Tendency(Calculate the mode ) —-

# We, therefore, must manually create a function that can calculate the mode.

heart_target_mode <- names(table(heart$target))[
  which(table(heart$target) == max(table(heart$target)))
]
print(heart_target_mode)

## [1] "pos"

STEP 3d. Measure the distribution of the data for each variable —-

summary(heart)

##       age        sex     cp         trestbps          chol     fbs     restecg
##  Min.   :29.00   0:312   0:497   Min.   : 94.0   Min.   :126   0:872   0:497  
##  1st Qu.:48.00   1:713   1:167   1st Qu.:120.0   1st Qu.:211   1:153   1:513  
##  Median :56.00           2:284   Median :130.0   Median :240           2: 15  
##  Mean   :54.43           3: 77   Mean   :131.6   Mean   :246                  
##  3rd Qu.:61.00                   3rd Qu.:140.0   3rd Qu.:275                  
##  Max.   :77.00                   Max.   :200.0   Max.   :564                  
##     thalach      exang      oldpeak      slope         ca         thal   
##  Min.   : 71.0   0:680   Min.   :0.000   0: 74   Min.   :0.0000   0:  7  
##  1st Qu.:132.0   1:345   1st Qu.:0.000   1:482   1st Qu.:0.0000   1: 64  
##  Median :152.0           Median :0.800   2:469   Median :0.0000   2:544  
##  Mean   :149.1           Mean   :1.072           Mean   :0.7541   3:410  
##  3rd Qu.:166.0           3rd Qu.:1.800           3rd Qu.:1.0000          
##  Max.   :202.0           Max.   :6.200           Max.   :4.0000          
##  target   
##  neg:499  
##  pos:526  
##           
##           
##           
##

STEP 3e. Measure the standard deviation of each variable —-

# calculate the standard deviation of only columns that are numeric, thus
# leaving out the columns termed as “factors” (categorical) or those that have
# a string data type.

sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)], sd)

##       age  trestbps      chol   thalach   oldpeak        ca 
##  9.072290 17.516718 51.592510 23.005724  1.175053  1.030798

#or
sapply(heart[, c(1, 4, 5, 8, 10, 12)], sd)

##       age  trestbps      chol   thalach   oldpeak        ca 
##  9.072290 17.516718 51.592510 23.005724  1.175053  1.030798

STEP 3f. Measure the kurtosis of each variable —-

# The Kurtosis informs you of how often outliers occur in the results.
# There are different formulas for calculating kurtosis.
# Specifying “type = 2” allows us to use the 2nd formula which is the same
# kurtosis formula used in SPSS and SAS.

# In “type = 2” (used in SPSS and SAS):
# 1.    Kurtosis < 3 implies a low number of outliers
# 2.    Kurtosis = 3 implies a medium number of outliers
# 3.    Kurtosis > 3 implies a high number of outliers

if (!is.element("e1071", installed.packages()[, 1])) {
  install.packages("e1071", dependencies = TRUE)
}
require("e1071")

sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  kurtosis, type = 2)

##         age    trestbps        chol     thalach     oldpeak          ca 
## -0.52561781  0.99122074  3.99680305 -0.08882249  1.31447089  0.70112287

STEP 3g. Measure the skewness of each variable—-

# The skewness informs you of the asymmetry of the distribution of results.
# Using “type = 2” can be interpreted as:

# 1.    Skewness between -0.4 and 0.4 (inclusive) implies that there is no skew
# in the distribution of results; the distribution of results is symmetrical;
# it is a normal distribution.
# 2.    Skewness above 0.4 implies a positive skew; a right-skewed distribution.
# 3.    Skewness below -0.4 implies a negative skew; a left-skewed distribution.

sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  skewness, type = 2)

##        age   trestbps       chol    thalach    oldpeak         ca 
## -0.2488659  0.7397682  1.0740728 -0.5137772  1.2108994  1.2611886

STEP 3h. Measure the skewness of each variable—-

# Note that the covariance and the correlation are computed for numeric values
# only, not categorical values.

heart_cov <- cov(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)])
#View(heart_cov)

STEP 3i. Measure the correlation between variables —-

heart_cor <- cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)])
#View(heart_cor)

STEP 3j. Inferential Statistics —-

# One-Way ANOVA can be used to test the effect of the 3 types of fertilizer on
# crop yield whereas,
# Two-Way ANOVA can be used to test the effect of the 3 types of fertilizer and
# the 2 types of planting density on crop yield.
heart_one_way_anova <- aov(trestbps ~ age, data = heart)
summary(heart_one_way_anova)

##               Df Sum Sq Mean Sq F value Pr(>F)    
## age            1  23096   23096   81.16 <2e-16 ***
## Residuals   1023 291104     285                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#The ANOVA rejects the null hypothesis,The ANOVA indicates a significant difference in resting blood pressure among age groups 
#(F(1, 1023) = 81.16, p < 2e-16), highlighting age as a key factor 
#in determining blood pressure. 
#This aligns with cardiovascular knowledge, correlating increased age with a higher risk of cardiovascular disease.

heart_two_way_anova <- aov(trestbps ~ exang + ca, # nolint
                                           data = heart)
summary(heart_two_way_anova)

##               Df Sum Sq Mean Sq F value  Pr(>F)   
## exang          1   1177  1176.7    3.88 0.04914 * 
## ca             1   3050  3050.2   10.06 0.00156 **
## Residuals   1022 309973   303.3                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#both variables, exercise-induced angina and the number of major vessels, are associated with statistically significant differences in 
#resting blood pressure.

Univariate Plots —-

STEP 3k. Create Histograms for Each Numeric Attribute —-

# Histograms help in determining whether an attribute has a Gaussian
# distribution. They can also be used to identify the presence of outliers.



par(mfrow = c(1, 3))

for (i in c(1, 4, 5)) {
  heart_variable <- as.numeric(unlist(heart[, i]))
  hist(heart_variable, main = names(heart)[i])
}

heart_health_variable  <- as.numeric( unlist(heart[, 8]))
hist(heart_health_variable , main = names(heart)[8])

heart_health_variable  <- as.numeric( unlist(heart [, 10]))
hist(heart_health_variable , main = names(heart)[10])

heart_health_variable  <- as.numeric( unlist(heart [, 12]))
hist(heart_health_variable , main = names(heart)[12])

STEP 3l. Create Box and Whisker Plots for Each Numeric Attribute —-

# Box and whisker plots are useful in understanding the distribution of data.

par(mfrow = c(1, 3))
for (i in c(1, 4, 5)) {
  boxplot(heart[, i], main = names(heart)[i])
}

boxplot(heart[, 8], main = names(heart)[8])
boxplot(heart[, 10], main = names(heart)[10])
boxplot(heart[, 12], main = names(heart)[12])

STEP 3m. Create Bar Plots for Each Categorical Attribute —-

# Categorical attributes (factors) can also be visualized. This is done using a
# bar chart to give an idea of the proportion of instances that belong to each
# category.

barplot(table(heart[, 2]), main = names(heart)[2])

barplot(table(heart[, 3]), main = names(heart)[3])

barplot(table(heart[, 6]), main = names(heart)[6])

barplot(table(heart[, 7]), main = names(heart)[7])

barplot(table(heart[, 9]), main = names(heart)[9])

barplot(table(heart[, 11]), main = names(heart)[11])

barplot(table(heart[, 13]), main = names(heart)[13])

barplot(table(heart[, 14]), main = names(heart)[14])

STEP 3n. Create a Missingness Map to Identify Missing Data —-

# Execute the following to create a map to identify the missing data in each
# dataset:
if (!is.element("Amelia", installed.packages()[, 1])) {
  install.packages("Amelia", dependencies = TRUE)
}
require("Amelia")
#comment
missmap(heart, col = c("red", "grey"), legend = TRUE)

Multivariate Plots —-

STEP 3o. Create a Correlation Plot —-

# Correlation plots can be used to get an idea of which attributes change
# together. The function “corrplot()” found in the package “corrplot” is
# required to perform this. The larger the dot in the correlation plot, the
# larger the correlation. Blue represents a positive correlation whereas red
# represents a negative correlation.

if (!is.element("corrplot", installed.packages()[, 1])) {
  install.packages("corrplot", dependencies = TRUE)
}
require("corrplot")

## Loading required package: corrplot

## corrplot 0.92 loaded

corrplot(cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)]), method = "circle")

#heart <- heart[, -which(names(heart) == "target_numeric")]

# Alternatively, the 'ggcorrplot::ggcorrplot()' function can be used to plot a
# more visually appealing plot.
# The code below shows how to install a package in R:
if (!is.element("ggcorrplot", installed.packages()[, 1])) {
  install.packages("ggcorrplot", dependencies = TRUE)
}
require("ggcorrplot")

## Loading required package: ggcorrplot

ggcorrplot(cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)]))

STEP 3p. Create a Scatter Plot —-

pairs(heart)

# Alternatively, the ggcorrplot package can be used to make the plots more
# appealing:
ggplot(heart,
       aes(x = age, y = sex, shape = target, color = target)) +
  geom_point() +
  geom_smooth(method = lm)

## `geom_smooth()` using formula = 'y ~ x'

Milestone 2: Preprocessing and Data Transformation (Lab 3 and 4) —-

STEP 4. Create a subset of the variables/features —-

### Subset of rows ----
# We then select 500 random observations to be included in the dataset
rand_ind <- sample(seq_len(nrow(heart)), 500)
heart <- heart[rand_ind, ]

STEP 4a. Confirm the “missingness” in the Dataset before Imputation —-

# Are there missing values in the dataset?
any_na(heart)

## [1] FALSE

# How many?
n_miss(heart)

## [1] 0

# What is the percentage of missing data in the entire dataset?
prop_miss(heart)

## [1] 0

# How many missing values does each variable have?
heart %>% is.na() %>% colSums()

##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        0        0        0        0        0        0        0        0 
##    exang  oldpeak    slope       ca     thal   target 
##        0        0        0        0        0        0

# What is the number and percentage of missing values grouped by
# each variable?
miss_var_summary(heart)

## # A tibble: 14 × 3
##    variable n_miss pct_miss
##    <chr>     <int>    <dbl>
##  1 age           0        0
##  2 sex           0        0
##  3 cp            0        0
##  4 trestbps      0        0
##  5 chol          0        0
##  6 fbs           0        0
##  7 restecg       0        0
##  8 thalach       0        0
##  9 exang         0        0
## 10 oldpeak       0        0
## 11 slope         0        0
## 12 ca            0        0
## 13 thal          0        0
## 14 target        0        0

# What is the number and percentage of missing values grouped by
# each observation?
miss_case_summary(heart)

## # A tibble: 500 × 3
##     case n_miss pct_miss
##    <int>  <int>    <dbl>
##  1     1      0        0
##  2     2      0        0
##  3     3      0        0
##  4     4      0        0
##  5     5      0        0
##  6     6      0        0
##  7     7      0        0
##  8     8      0        0
##  9     9      0        0
## 10    10      0        0
## # ℹ 490 more rows

# Which variables contain the most missing values?
#gg_miss_var(heart)

# Where are missing values located (the shaded regions in the plot)?
#vis_miss(heart) + theme(axis.text.x = element_text(angle = 80))

# Which combinations of variables are missing together?
#gg_miss_upset(heart)

# Create a heatmap of "missingness" broken down by "target"
# First, confirm that the "target" variable is a categorical variable
is.factor(heart$target)

## [1] TRUE

# Second, create the visualization
#gg_miss_fct(heart, fct = target)

# We can also create a heatmap of "missingness" broken down by "exang"
# First, confirm that the "exang" variable is a categorical variable

is.factor(heart$exang)

## [1] TRUE

# Second, create the visualization
#gg_miss_fct(heart, fct = exang)

STEP 4b. Apply a Scale Data Transform —-

# Summary of each variable
summary(heart)

##       age        sex     cp         trestbps        chol       fbs     restecg
##  Min.   :29.00   0:147   0:241   Min.   : 94   Min.   :126.0   0:426   0:246  
##  1st Qu.:48.00   1:353   1: 86   1st Qu.:120   1st Qu.:211.8   1: 74   1:244  
##  Median :56.00           2:134   Median :130   Median :239.0           2: 10  
##  Mean   :54.68           3: 39   Mean   :131   Mean   :246.3                  
##  3rd Qu.:61.00                   3rd Qu.:140   3rd Qu.:273.2                  
##  Max.   :77.00                   Max.   :200   Max.   :564.0                  
##     thalach    exang      oldpeak      slope         ca        thal   
##  Min.   : 71   0:339   Min.   :0.000   0: 29   Min.   :0.000   0:  3  
##  1st Qu.:136   1:161   1st Qu.:0.000   1:226   1st Qu.:0.000   1: 33  
##  Median :154           Median :0.800   2:245   Median :0.000   2:264  
##  Mean   :150           Mean   :1.041           Mean   :0.716   3:200  
##  3rd Qu.:166           3rd Qu.:1.600           3rd Qu.:1.000          
##  Max.   :202           Max.   :5.600           Max.   :4.000          
##  target   
##  neg:244  
##  pos:256  
##           
##           
##           
##

# BEFORE

heart_health_variable <- as.numeric( unlist(heart [, 1]))
hist(heart_health_variable , main = names(heart_health_variable )[1])

heart_health_variable  <- as.numeric( unlist(heart [, 4]))
hist(heart_health_variable , main = names(heart)[4])

heart_health_variable  <- as.numeric( unlist(heart [, 5]))
hist(heart_health_variable , main = names(heart)[5])

heart_health_variable  <- as.numeric( unlist(heart [, 8]))
hist(heart_health_variable , main = names(heart)[8])

heart_health_variable <- as.numeric( unlist(heart [, 10]))
hist(heart_health_variable , main = names(heart)[10])

heart_health_variable  <- as.numeric( unlist(heart [, 12]))
hist(heart_health_variable , main = names(heart)[12])

model_of_the_transform <- preProcess(heart, method = c("scale"))
print(model_of_the_transform)

## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - ignored (8)
##   - scaled (6)

heart_scale_transform <- predict(model_of_the_transform,
                                               heart)


# AFTER 
#1, 4, 5, 8, 10, 12

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 1]))
hist(heart_health_variable , main = names(heart_scale_transform)[1])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 4]))
hist(heart_health_variable , main = names(heart_scale_transform)[4])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 5]))
hist(heart_health_variable , main = names(heart_scale_transform)[5])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 8]))
hist(heart_health_variable , main = names(heart_scale_transform)[8])

heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 10]))
hist(heart_health_variable , main = names(heart_scale_transform)[10])

heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 12]))
hist(heart_health_variable , main = names(heart_scale_transform)[12])

View(heart_scale_transform)

STEP 4c. Apply a Center Data Transform —-

# Summary of each variable
summary(heart)

##       age        sex     cp         trestbps        chol       fbs     restecg
##  Min.   :29.00   0:147   0:241   Min.   : 94   Min.   :126.0   0:426   0:246  
##  1st Qu.:48.00   1:353   1: 86   1st Qu.:120   1st Qu.:211.8   1: 74   1:244  
##  Median :56.00           2:134   Median :130   Median :239.0           2: 10  
##  Mean   :54.68           3: 39   Mean   :131   Mean   :246.3                  
##  3rd Qu.:61.00                   3rd Qu.:140   3rd Qu.:273.2                  
##  Max.   :77.00                   Max.   :200   Max.   :564.0                  
##     thalach    exang      oldpeak      slope         ca        thal   
##  Min.   : 71   0:339   Min.   :0.000   0: 29   Min.   :0.000   0:  3  
##  1st Qu.:136   1:161   1st Qu.:0.000   1:226   1st Qu.:0.000   1: 33  
##  Median :154           Median :0.800   2:245   Median :0.000   2:264  
##  Mean   :150           Mean   :1.041           Mean   :0.716   3:200  
##  3rd Qu.:166           3rd Qu.:1.600           3rd Qu.:1.000          
##  Max.   :202           Max.   :5.600           Max.   :4.000          
##  target   
##  neg:244  
##  pos:256  
##           
##           
##           
##

# BEFORE

heart_health_variable <- as.numeric( unlist(heart [, 1]))
boxplot(heart_health_variable , main = names(heart_health_variable )[1])

heart_health_variable  <- as.numeric( unlist(heart [, 4]))
boxplot(heart_health_variable , main = names(heart)[4])

heart_health_variable  <- as.numeric( unlist(heart [, 5]))
boxplot(heart_health_variable , main = names(heart)[5])

heart_health_variable  <- as.numeric( unlist(heart [, 8]))
boxplot(heart_health_variable , main = names(heart)[8])

heart_health_variable <- as.numeric( unlist(heart [, 10]))
boxplot(heart_health_variable , main = names(heart)[10])

heart_health_variable  <- as.numeric( unlist(heart [, 12]))
boxplot(heart_health_variable , main = names(heart)[12])

model_of_the_transform <- preProcess(heart, method = c("center"))
print(model_of_the_transform)

## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - centered (6)
##   - ignored (8)

heart_center_transform <- predict(model_of_the_transform,
                                 heart)


# AFTER 
#1, 4, 5, 8, 10, 12

heart_health_variable  <- as.numeric( unlist(heart_center_transform [, 1]))
boxplot(heart_health_variable , main = names(heart_center_transform)[1])

heart_health_variable  <- as.numeric( unlist(heart_center_transform [, 4]))
boxplot(heart_health_variable , main = names(heart_center_transform)[4])

heart_health_variable  <- as.numeric( unlist(heart_center_transform [, 5]))
boxplot(heart_health_variable , main = names(heart_center_transform)[5])

heart_health_variable  <- as.numeric( unlist(heart_center_transform [, 8]))
boxplot(heart_health_variable , main = names(heart_center_transform)[8])

heart_health_variable <- as.numeric( unlist(heart_center_transform [, 10]))
boxplot(heart_health_variable , main = names(heart_center_transform)[10])

heart_health_variable <- as.numeric( unlist(heart_center_transform [, 12]))
boxplot(heart_health_variable , main = names(heart_center_transform)[12])

Yeo-Johnson Power Transform on the Boston Housing Dataset —-

# BEFORE
summary(heart)

##       age        sex     cp         trestbps        chol       fbs     restecg
##  Min.   :29.00   0:147   0:241   Min.   : 94   Min.   :126.0   0:426   0:246  
##  1st Qu.:48.00   1:353   1: 86   1st Qu.:120   1st Qu.:211.8   1: 74   1:244  
##  Median :56.00           2:134   Median :130   Median :239.0           2: 10  
##  Mean   :54.68           3: 39   Mean   :131   Mean   :246.3                  
##  3rd Qu.:61.00                   3rd Qu.:140   3rd Qu.:273.2                  
##  Max.   :77.00                   Max.   :200   Max.   :564.0                  
##     thalach    exang      oldpeak      slope         ca        thal   
##  Min.   : 71   0:339   Min.   :0.000   0: 29   Min.   :0.000   0:  3  
##  1st Qu.:136   1:161   1st Qu.:0.000   1:226   1st Qu.:0.000   1: 33  
##  Median :154           Median :0.800   2:245   Median :0.000   2:264  
##  Mean   :150           Mean   :1.041           Mean   :0.716   3:200  
##  3rd Qu.:166           3rd Qu.:1.600           3rd Qu.:1.000          
##  Max.   :202           Max.   :5.600           Max.   :4.000          
##  target   
##  neg:244  
##  pos:256  
##           
##           
##           
##

#Calculate the skewness before the Box-Cox transform
sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  skewness, type = 2)

##        age   trestbps       chol    thalach    oldpeak         ca 
## -0.1969603  0.7392460  1.3148294 -0.6468942  1.1584133  1.2639445

#Plot a histogram to view the skewness before the Box-Cox transform
heart_health_variable <- as.numeric( unlist(heart [, 1]))
hist(heart_health_variable , main = names(heart_health_variable )[1])

heart_health_variable  <- as.numeric( unlist(heart [, 4]))
hist(heart_health_variable , main = names(heart)[4])

heart_health_variable  <- as.numeric( unlist(heart [, 5]))
hist(heart_health_variable , main = names(heart)[5])

heart_health_variable  <- as.numeric( unlist(heart [, 8]))
hist(heart_health_variable , main = names(heart)[8])

heart_health_variable <- as.numeric( unlist(heart [, 10]))
hist(heart_health_variable , main = names(heart)[10])

heart_health_variable  <- as.numeric( unlist(heart [, 12]))
hist(heart_health_variable , main = names(heart)[12])

model_of_the_transform <- preProcess(heart, method = c("YeoJohnson"))
print(model_of_the_transform)

## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - ignored (8)
##   - Yeo-Johnson transformation (6)
## 
## Lambda estimates for Yeo-Johnson transformation:
## 1.41, -0.87, -0.29, 2.36, -0.51, -1.33

heart_yeo_johnson_transform <- predict(model_of_the_transform,
                                 heart)



# AFTER 
#1, 4, 5, 8, 10, 12

sapply(heart_yeo_johnson_transform[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  skewness, type = 2)

##           age      trestbps          chol       thalach       oldpeak 
## -0.0480406918  0.0004529974 -0.0142411533 -0.0604063559  0.1282174464 
##            ca 
##  0.4161702231

heart_health_variable  <- as.numeric( unlist(heart_yeo_johnson_transform [, 1]))
hist(heart_health_variable , main = names(heart_yeo_johnson_transform)[1])

heart_health_variable  <- as.numeric( unlist(heart_yeo_johnson_transform [, 4]))
hist(heart_health_variable , main = names(heart_yeo_johnson_transform)[4])

heart_health_variable  <- as.numeric( unlist(heart_yeo_johnson_transform [, 5]))
hist(heart_health_variable , main = names(heart_yeo_johnson_transform)[5])

heart_health_variable  <- as.numeric( unlist(heart_yeo_johnson_transform [, 8]))
hist(heart_health_variable , main = names(heart_yeo_johnson_transform)[8])

heart_health_variable <- as.numeric( unlist(heart_yeo_johnson_transform [, 10]))
hist(heart_health_variable , main = names(heart_yeo_johnson_transform)[10])

heart_health_variable <- as.numeric( unlist(heart_yeo_johnson_transform [, 12]))
hist(heart_health_variable , main = names(heart_yeo_johnson_transform)[12])

PCA for Feature Extraction on the Boston Housing Dataset —-

heart_pca_fe <- princomp(cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)]))
summary(heart_pca_fe)

## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4     Comp.5
## Standard deviation     0.7012142 0.3796381 0.3680016 0.3170306 0.23180681
## Proportion of Variance 0.5312850 0.1557277 0.1463274 0.1085997 0.05806021
## Cumulative Proportion  0.5312850 0.6870127 0.8333401 0.9419398 1.00000000
##                              Comp.6
## Standard deviation     9.132458e-09
## Proportion of Variance 9.011594e-17
## Cumulative Proportion  1.000000e+00

#### Scree Plot ----
# The Scree Plot shows that the 1st 2 principal components can cumulatively
# explain 92.8% of the variance, i.e., 87.7% + 5.1% = 92.8%.
factoextra::fviz_eig(heart_pca_fe, addlabels = TRUE)

heart_pca_fe$loadings[, 1:2]

##               Comp.1     Comp.2
## age       0.50238175  0.2863519
## trestbps  0.11895391  0.5074929
## chol      0.06973052  0.4976462
## thalach  -0.67417832 -0.0746078
## oldpeak   0.40879449 -0.2318280
## ca        0.32706399 -0.5945513

factoextra::fviz_cos2(heart_pca_fe, choice = "var", axes = 1:2)

factoextra::fviz_pca_var(heart_pca_fe, col.var = "cos2",
                         gradient.cols = c("red", "orange", "green"),
                         repel = TRUE)

# Independent Component Analysis (ICA) Linear Algebra Transform ----
### STEP 4d. ICA Linear Algebra Transform for Dimensionality Reduction ----

# Independent Component Analysis (ICA) transforms the data to return only the
# independent components. The n.comp argument is required to specify the
# desired number of independent components. This also results in a list of
# attributes that are uncorrelated.

if (!is.element("fastICA", installed.packages()[, 1])) {
  install.packages("fastICA", dependencies = TRUE)
}
require("fastICA")

## Loading required package: fastICA

ICA for Dimensionality Reduction on the Boston Housing Dataset —-

summary(heart)

##       age        sex     cp         trestbps        chol       fbs     restecg
##  Min.   :29.00   0:147   0:241   Min.   : 94   Min.   :126.0   0:426   0:246  
##  1st Qu.:48.00   1:353   1: 86   1st Qu.:120   1st Qu.:211.8   1: 74   1:244  
##  Median :56.00           2:134   Median :130   Median :239.0           2: 10  
##  Mean   :54.68           3: 39   Mean   :131   Mean   :246.3                  
##  3rd Qu.:61.00                   3rd Qu.:140   3rd Qu.:273.2                  
##  Max.   :77.00                   Max.   :200   Max.   :564.0                  
##     thalach    exang      oldpeak      slope         ca        thal   
##  Min.   : 71   0:339   Min.   :0.000   0: 29   Min.   :0.000   0:  3  
##  1st Qu.:136   1:161   1st Qu.:0.000   1:226   1st Qu.:0.000   1: 33  
##  Median :154           Median :0.800   2:245   Median :0.000   2:264  
##  Mean   :150           Mean   :1.041           Mean   :0.716   3:200  
##  3rd Qu.:166           3rd Qu.:1.600           3rd Qu.:1.000          
##  Max.   :202           Max.   :5.600           Max.   :4.000          
##  target   
##  neg:244  
##  pos:256  
##           
##           
##           
##

model_of_the_transform <- preProcess(heart,
                                     method = c("scale", "center", "ica"),
                                     n.comp = 5)
print(model_of_the_transform)

## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - centered (6)
##   - independent component signal extraction (6)
##   - ignored (8)
##   - scaled (6)
## 
## ICA used 5 components

heart_ica_dr <- predict(model_of_the_transform, heart)

summary(heart_ica_dr)

##  sex     cp      fbs     restecg exang   slope   thal    target   
##  0:147   0:241   0:426   0:246   0:339   0: 29   0:  3   neg:244  
##  1:353   1: 86   1: 74   1:244   1:161   1:226   1: 33   pos:256  
##          2:134           2: 10           2:245   2:264            
##          3: 39                                   3:200            
##                                                                   
##                                                                   
##       ICA1               ICA2                ICA3               ICA4         
##  Min.   :-2.41852   Min.   :-3.591591   Min.   :-6.34953   Min.   :-3.71925  
##  1st Qu.:-0.76748   1st Qu.:-0.523542   1st Qu.:-0.48595   1st Qu.:-0.48108  
##  Median : 0.02731   Median : 0.008073   Median : 0.09773   Median : 0.08747  
##  Mean   : 0.00000   Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.80934   3rd Qu.: 0.504764   3rd Qu.: 0.69382   3rd Qu.: 0.61364  
##  Max.   : 2.38878   Max.   : 3.372602   Max.   : 2.47824   Max.   : 2.47281  
##       ICA5         
##  Min.   :-3.18425  
##  1st Qu.:-0.61222  
##  Median : 0.02158  
##  Mean   : 0.00000  
##  3rd Qu.: 0.59903  
##  Max.   : 2.82145