Business Intelligence Project

Myriam Aicha Mbongo 10/16/2023

CARDIOVASCULAR DISEASE PREDICTION PROJECT

Student Details

Student ID Numbers and Names of Group Members GitHub Classroom Group Name
2. 134141 - C - Aicha Mbongo
Course Code Course Name Program Semester Duration
BBT4206 Business Intelligence II Bachelor of Business Information Technology 21st August 2023 to 28th November 2023

STEP 1. Install and Load the Required Packages —-

# STEP 1. Install and Load the Required Packages ----
# The following packages should be installed and loaded before proceeding to the
# subsequent steps.

## readr ----
if (require("readr")) {
  require("readr")
} else {
  install.packages("readr", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: readr
## caret ----
if (require("caret")) {
  require("caret")
} else {
  install.packages("caret", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: caret

## Loading required package: ggplot2

## Loading required package: lattice
## e1071 ----
if (require("e1071")) {
  require("e1071")
} else {
  install.packages("e1071", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: e1071
## factoextra ----
if (require("factoextra")) {
  require("factoextra")
} else {
  install.packages("factoextra", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: factoextra

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## FactoMineR ----
if (require("FactoMineR")) {
  require("FactoMineR")
} else {
  install.packages("FactoMineR", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
## Loading required package: FactoMineR
if (!is.element("NHANES", installed.packages()[, 1])) {
  install.packages("NHANES", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("NHANES")
## Loading required package: NHANES
## dplyr ----
if (!is.element("dplyr", installed.packages()[, 1])) {
  install.packages("dplyr", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("dplyr")
## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## naniar ----
# Documentation:
#   https://cran.r-project.org/package=naniar or
#   https://www.rdocumentation.org/packages/naniar/versions/1.0.0
if (!is.element("naniar", installed.packages()[, 1])) {
  install.packages("naniar", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("naniar")
## Loading required package: naniar
## ggplot2 ----
# We require the "ggplot2" package to create more appealing visualizations
if (!is.element("ggplot2", installed.packages()[, 1])) {
  install.packages("ggplot2", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("ggplot2")

## MICE ----
# We use the MICE package to perform data imputation
if (!is.element("mice", installed.packages()[, 1])) {
  install.packages("mice", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("mice")
## Loading required package: mice

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind
## Amelia ----
if (!is.element("Amelia", installed.packages()[, 1])) {
  install.packages("Amelia", dependencies = TRUE,
                   repos = "https://cloud.r-project.org")
}
require("Amelia")
## Loading required package: Amelia

## Loading required package: Rcpp

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

Milestone 1 : Exploratory Data Analysis (Lab 1 and 2)

STEP 3. Load the downloaded sample datasets

library(readr)
heart <- read_csv(
  "../data/heart.csv",
  col_types = cols(
    age = col_double(),
    sex = col_factor(levels = c("0", "1")),
    cp = col_factor(levels = c("0", "1", "2", "3")),
    trestbps = col_double(),
    chol = col_double(),
    fbs = col_factor(levels = c("0", "1")),
    restecg = col_factor(levels = c("0", "1", "2")),
    thalach = col_double(),
    exang = col_factor(levels = c("0", "1")),
    oldpeak = col_double(),
    slope = col_factor(levels = c("0", "1", "2")),
    ca = col_double(),
    thal = col_factor(levels = c("0", "1", "2", "3")),
    target = col_factor(levels = c("neg", "pos"))
  )
)

#View(heart)

STEP 3a. Preview the Loaded Datasets, Identify the Data Types —-

### STEP 3a. Pre#View the Loaded Datasets, Identify the Data Types  ----
# Dimensions refer to the number of observations (rows) and the number of
# attributes/variables/features (columns).
#Understanding data types is key for effective analysis.It helps choose suitable visualizations and algorithms,  
#and highlights the need for conversions between categorical and numerical data when necessary.


dim(heart)
## [1] 1025   14
sapply(heart, class)
##       age       sex        cp  trestbps      chol       fbs   restecg   thalach 
## "numeric"  "factor"  "factor" "numeric" "numeric"  "factor"  "factor" "numeric" 
##     exang   oldpeak     slope        ca      thal    target 
##  "factor" "numeric"  "factor" "numeric"  "factor"  "factor"

STEP 3b. Identify the number of instances that belong to each class. —-

# It is more sensible to count categorical variables (factors or dimensions)
# than numeric variables, e.g., counting the number of male and female
# participants instead of counting the frequency of each participant’s height.

heart_freq <- heart$target
cbind(frequency = table(heart_freq),
      percentage = prop.table(table(heart_freq)) * 100)
##     frequency percentage
## neg       499   48.68293
## pos       526   51.31707

STEP 3c. Measures of Central Tendency(Calculate the mode ) —-

# We, therefore, must manually create a function that can calculate the mode.

heart_target_mode <- names(table(heart$target))[
  which(table(heart$target) == max(table(heart$target)))
]
print(heart_target_mode)
## [1] "pos"

STEP 3d. Measure the distribution of the data for each variable —-

summary(heart)
##       age        sex     cp         trestbps          chol     fbs     restecg
##  Min.   :29.00   0:312   0:497   Min.   : 94.0   Min.   :126   0:872   0:497  
##  1st Qu.:48.00   1:713   1:167   1st Qu.:120.0   1st Qu.:211   1:153   1:513  
##  Median :56.00           2:284   Median :130.0   Median :240           2: 15  
##  Mean   :54.43           3: 77   Mean   :131.6   Mean   :246                  
##  3rd Qu.:61.00                   3rd Qu.:140.0   3rd Qu.:275                  
##  Max.   :77.00                   Max.   :200.0   Max.   :564                  
##     thalach      exang      oldpeak      slope         ca         thal   
##  Min.   : 71.0   0:680   Min.   :0.000   0: 74   Min.   :0.0000   0:  7  
##  1st Qu.:132.0   1:345   1st Qu.:0.000   1:482   1st Qu.:0.0000   1: 64  
##  Median :152.0           Median :0.800   2:469   Median :0.0000   2:544  
##  Mean   :149.1           Mean   :1.072           Mean   :0.7541   3:410  
##  3rd Qu.:166.0           3rd Qu.:1.800           3rd Qu.:1.0000          
##  Max.   :202.0           Max.   :6.200           Max.   :4.0000          
##  target   
##  neg:499  
##  pos:526  
##           
##           
##           
## 

STEP 3e. Measure the standard deviation of each variable —-

# calculate the standard deviation of only columns that are numeric, thus
# leaving out the columns termed as “factors” (categorical) or those that have
# a string data type.

sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)], sd)
##       age  trestbps      chol   thalach   oldpeak        ca 
##  9.072290 17.516718 51.592510 23.005724  1.175053  1.030798
#or
sapply(heart[, c(1, 4, 5, 8, 10, 12)], sd)
##       age  trestbps      chol   thalach   oldpeak        ca 
##  9.072290 17.516718 51.592510 23.005724  1.175053  1.030798

STEP 3f. Measure the kurtosis of each variable —-

# The Kurtosis informs you of how often outliers occur in the results.
# There are different formulas for calculating kurtosis.
# Specifying “type = 2” allows us to use the 2nd formula which is the same
# kurtosis formula used in SPSS and SAS.

# In “type = 2” (used in SPSS and SAS):
# 1.    Kurtosis < 3 implies a low number of outliers
# 2.    Kurtosis = 3 implies a medium number of outliers
# 3.    Kurtosis > 3 implies a high number of outliers

if (!is.element("e1071", installed.packages()[, 1])) {
  install.packages("e1071", dependencies = TRUE)
}
require("e1071")

sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  kurtosis, type = 2)
##         age    trestbps        chol     thalach     oldpeak          ca 
## -0.52561781  0.99122074  3.99680305 -0.08882249  1.31447089  0.70112287

STEP 3g. Measure the skewness of each variable—-

# The skewness informs you of the asymmetry of the distribution of results.
# Using “type = 2” can be interpreted as:

# 1.    Skewness between -0.4 and 0.4 (inclusive) implies that there is no skew
# in the distribution of results; the distribution of results is symmetrical;
# it is a normal distribution.
# 2.    Skewness above 0.4 implies a positive skew; a right-skewed distribution.
# 3.    Skewness below -0.4 implies a negative skew; a left-skewed distribution.

sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  skewness, type = 2)
##        age   trestbps       chol    thalach    oldpeak         ca 
## -0.2488659  0.7397682  1.0740728 -0.5137772  1.2108994  1.2611886

STEP 3h. Measure the skewness of each variable—-

# Note that the covariance and the correlation are computed for numeric values
# only, not categorical values.

heart_cov <- cov(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)])
#View(heart_cov)

STEP 3i. Measure the correlation between variables —-

heart_cor <- cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)])
#View(heart_cor)

STEP 3j. Inferential Statistics —-

# One-Way ANOVA can be used to test the effect of the 3 types of fertilizer on
# crop yield whereas,
# Two-Way ANOVA can be used to test the effect of the 3 types of fertilizer and
# the 2 types of planting density on crop yield.
heart_one_way_anova <- aov(trestbps ~ age, data = heart)
summary(heart_one_way_anova)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## age            1  23096   23096   81.16 <2e-16 ***
## Residuals   1023 291104     285                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The ANOVA rejects the null hypothesis,The ANOVA indicates a significant difference in resting blood pressure among age groups 
#(F(1, 1023) = 81.16, p < 2e-16), highlighting age as a key factor 
#in determining blood pressure. 
#This aligns with cardiovascular knowledge, correlating increased age with a higher risk of cardiovascular disease.

heart_two_way_anova <- aov(trestbps ~ exang + ca, # nolint
                                           data = heart)
summary(heart_two_way_anova)
##               Df Sum Sq Mean Sq F value  Pr(>F)   
## exang          1   1177  1176.7    3.88 0.04914 * 
## ca             1   3050  3050.2   10.06 0.00156 **
## Residuals   1022 309973   303.3                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#both variables, exercise-induced angina and the number of major vessels, are associated with statistically significant differences in 
#resting blood pressure.

Univariate Plots —-

STEP 3k. Create Histograms for Each Numeric Attribute —-

# Histograms help in determining whether an attribute has a Gaussian
# distribution. They can also be used to identify the presence of outliers.



par(mfrow = c(1, 3))

for (i in c(1, 4, 5)) {
  heart_variable <- as.numeric(unlist(heart[, i]))
  hist(heart_variable, main = names(heart)[i])
}

heart_health_variable  <- as.numeric( unlist(heart[, 8]))
hist(heart_health_variable , main = names(heart)[8])

heart_health_variable  <- as.numeric( unlist(heart [, 10]))
hist(heart_health_variable , main = names(heart)[10])

heart_health_variable  <- as.numeric( unlist(heart [, 12]))
hist(heart_health_variable , main = names(heart)[12])

STEP 3l. Create Box and Whisker Plots for Each Numeric Attribute —-

# Box and whisker plots are useful in understanding the distribution of data.

par(mfrow = c(1, 3))
for (i in c(1, 4, 5)) {
  boxplot(heart[, i], main = names(heart)[i])
}

boxplot(heart[, 8], main = names(heart)[8])
boxplot(heart[, 10], main = names(heart)[10])
boxplot(heart[, 12], main = names(heart)[12])

STEP 3m. Create Bar Plots for Each Categorical Attribute —-

# Categorical attributes (factors) can also be visualized. This is done using a
# bar chart to give an idea of the proportion of instances that belong to each
# category.

barplot(table(heart[, 2]), main = names(heart)[2])

barplot(table(heart[, 3]), main = names(heart)[3])

barplot(table(heart[, 6]), main = names(heart)[6])

barplot(table(heart[, 7]), main = names(heart)[7])

barplot(table(heart[, 9]), main = names(heart)[9])

barplot(table(heart[, 11]), main = names(heart)[11])

barplot(table(heart[, 13]), main = names(heart)[13])

barplot(table(heart[, 14]), main = names(heart)[14])

STEP 3n. Create a Missingness Map to Identify Missing Data —-

# Execute the following to create a map to identify the missing data in each
# dataset:
if (!is.element("Amelia", installed.packages()[, 1])) {
  install.packages("Amelia", dependencies = TRUE)
}
require("Amelia")
#comment
missmap(heart, col = c("red", "grey"), legend = TRUE)

Multivariate Plots —-

STEP 3o. Create a Correlation Plot —-

# Correlation plots can be used to get an idea of which attributes change
# together. The function “corrplot()” found in the package “corrplot” is
# required to perform this. The larger the dot in the correlation plot, the
# larger the correlation. Blue represents a positive correlation whereas red
# represents a negative correlation.

if (!is.element("corrplot", installed.packages()[, 1])) {
  install.packages("corrplot", dependencies = TRUE)
}
require("corrplot")
## Loading required package: corrplot

## corrplot 0.92 loaded
corrplot(cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)]), method = "circle")

#heart <- heart[, -which(names(heart) == "target_numeric")]

# Alternatively, the 'ggcorrplot::ggcorrplot()' function can be used to plot a
# more visually appealing plot.
# The code below shows how to install a package in R:
if (!is.element("ggcorrplot", installed.packages()[, 1])) {
  install.packages("ggcorrplot", dependencies = TRUE)
}
require("ggcorrplot")
## Loading required package: ggcorrplot
ggcorrplot(cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)]))

STEP 3p. Create a Scatter Plot —-

pairs(heart)

# Alternatively, the ggcorrplot package can be used to make the plots more
# appealing:
ggplot(heart,
       aes(x = age, y = sex, shape = target, color = target)) +
  geom_point() +
  geom_smooth(method = lm)
## `geom_smooth()` using formula = 'y ~ x'

Milestone 2: Preprocessing and Data Transformation (Lab 3 and 4) —-

STEP 4. Create a subset of the variables/features —-

### Subset of rows ----
# We then select 500 random observations to be included in the dataset
rand_ind <- sample(seq_len(nrow(heart)), 500)
heart <- heart[rand_ind, ]

STEP 4a. Confirm the “missingness” in the Dataset before Imputation —-

# Are there missing values in the dataset?
any_na(heart)
## [1] FALSE
# How many?
n_miss(heart)
## [1] 0
# What is the percentage of missing data in the entire dataset?
prop_miss(heart)
## [1] 0
# How many missing values does each variable have?
heart %>% is.na() %>% colSums()
##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        0        0        0        0        0        0        0        0 
##    exang  oldpeak    slope       ca     thal   target 
##        0        0        0        0        0        0
# What is the number and percentage of missing values grouped by
# each variable?
miss_var_summary(heart)
## # A tibble: 14 × 3
##    variable n_miss pct_miss
##    <chr>     <int>    <dbl>
##  1 age           0        0
##  2 sex           0        0
##  3 cp            0        0
##  4 trestbps      0        0
##  5 chol          0        0
##  6 fbs           0        0
##  7 restecg       0        0
##  8 thalach       0        0
##  9 exang         0        0
## 10 oldpeak       0        0
## 11 slope         0        0
## 12 ca            0        0
## 13 thal          0        0
## 14 target        0        0
# What is the number and percentage of missing values grouped by
# each observation?
miss_case_summary(heart)
## # A tibble: 500 × 3
##     case n_miss pct_miss
##    <int>  <int>    <dbl>
##  1     1      0        0
##  2     2      0        0
##  3     3      0        0
##  4     4      0        0
##  5     5      0        0
##  6     6      0        0
##  7     7      0        0
##  8     8      0        0
##  9     9      0        0
## 10    10      0        0
## # ℹ 490 more rows
# Which variables contain the most missing values?
#gg_miss_var(heart)

# Where are missing values located (the shaded regions in the plot)?
#vis_miss(heart) + theme(axis.text.x = element_text(angle = 80))

# Which combinations of variables are missing together?
#gg_miss_upset(heart)

# Create a heatmap of "missingness" broken down by "target"
# First, confirm that the "target" variable is a categorical variable
is.factor(heart$target)
## [1] TRUE
# Second, create the visualization
#gg_miss_fct(heart, fct = target)

# We can also create a heatmap of "missingness" broken down by "exang"
# First, confirm that the "exang" variable is a categorical variable

is.factor(heart$exang)
## [1] TRUE
# Second, create the visualization
#gg_miss_fct(heart, fct = exang)

STEP 4b. Apply a Scale Data Transform —-

# Summary of each variable
summary(heart)
##       age        sex     cp         trestbps        chol       fbs     restecg
##  Min.   :29.00   0:147   0:241   Min.   : 94   Min.   :126.0   0:426   0:246  
##  1st Qu.:48.00   1:353   1: 86   1st Qu.:120   1st Qu.:211.8   1: 74   1:244  
##  Median :56.00           2:134   Median :130   Median :239.0           2: 10  
##  Mean   :54.68           3: 39   Mean   :131   Mean   :246.3                  
##  3rd Qu.:61.00                   3rd Qu.:140   3rd Qu.:273.2                  
##  Max.   :77.00                   Max.   :200   Max.   :564.0                  
##     thalach    exang      oldpeak      slope         ca        thal   
##  Min.   : 71   0:339   Min.   :0.000   0: 29   Min.   :0.000   0:  3  
##  1st Qu.:136   1:161   1st Qu.:0.000   1:226   1st Qu.:0.000   1: 33  
##  Median :154           Median :0.800   2:245   Median :0.000   2:264  
##  Mean   :150           Mean   :1.041           Mean   :0.716   3:200  
##  3rd Qu.:166           3rd Qu.:1.600           3rd Qu.:1.000          
##  Max.   :202           Max.   :5.600           Max.   :4.000          
##  target   
##  neg:244  
##  pos:256  
##           
##           
##           
## 
# BEFORE

heart_health_variable <- as.numeric( unlist(heart [, 1]))
hist(heart_health_variable , main = names(heart_health_variable )[1])

heart_health_variable  <- as.numeric( unlist(heart [, 4]))
hist(heart_health_variable , main = names(heart)[4])

heart_health_variable  <- as.numeric( unlist(heart [, 5]))
hist(heart_health_variable , main = names(heart)[5])

heart_health_variable  <- as.numeric( unlist(heart [, 8]))
hist(heart_health_variable , main = names(heart)[8])

heart_health_variable <- as.numeric( unlist(heart [, 10]))
hist(heart_health_variable , main = names(heart)[10])

heart_health_variable  <- as.numeric( unlist(heart [, 12]))
hist(heart_health_variable , main = names(heart)[12])

model_of_the_transform <- preProcess(heart, method = c("scale"))
print(model_of_the_transform)
## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - ignored (8)
##   - scaled (6)
heart_scale_transform <- predict(model_of_the_transform,
                                               heart)


# AFTER 
#1, 4, 5, 8, 10, 12

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 1]))
hist(heart_health_variable , main = names(heart_scale_transform)[1])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 4]))
hist(heart_health_variable , main = names(heart_scale_transform)[4])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 5]))
hist(heart_health_variable , main = names(heart_scale_transform)[5])

heart_health_variable  <- as.numeric( unlist(heart_scale_transform [, 8]))
hist(heart_health_variable , main = names(heart_scale_transform)[8])

heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 10]))
hist(heart_health_variable , main = names(heart_scale_transform)[10])

heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 12]))
hist(heart_health_variable , main = names(heart_scale_transform)[12])

View(heart_scale_transform)

STEP 4c. Apply a Center Data Transform —-

# Summary of each variable
summary(heart)
##       age        sex     cp         trestbps        chol       fbs     restecg
##  Min.   :29.00   0:147   0:241   Min.   : 94   Min.   :126.0   0:426   0:246  
##  1st Qu.:48.00   1:353   1: 86   1st Qu.:120   1st Qu.:211.8   1: 74   1:244  
##  Median :56.00           2:134   Median :130   Median :239.0           2: 10  
##  Mean   :54.68           3: 39   Mean   :131   Mean   :246.3                  
##  3rd Qu.:61.00                   3rd Qu.:140   3rd Qu.:273.2                  
##  Max.   :77.00                   Max.   :200   Max.   :564.0                  
##     thalach    exang      oldpeak      slope         ca        thal   
##  Min.   : 71   0:339   Min.   :0.000   0: 29   Min.   :0.000   0:  3  
##  1st Qu.:136   1:161   1st Qu.:0.000   1:226   1st Qu.:0.000   1: 33  
##  Median :154           Median :0.800   2:245   Median :0.000   2:264  
##  Mean   :150           Mean   :1.041           Mean   :0.716   3:200  
##  3rd Qu.:166           3rd Qu.:1.600           3rd Qu.:1.000          
##  Max.   :202           Max.   :5.600           Max.   :4.000          
##  target   
##  neg:244  
##  pos:256  
##           
##           
##           
## 
# BEFORE

heart_health_variable <- as.numeric( unlist(heart [, 1]))
boxplot(heart_health_variable , main = names(heart_health_variable )[1])

heart_health_variable  <- as.numeric( unlist(heart [, 4]))
boxplot(heart_health_variable , main = names(heart)[4])

heart_health_variable  <- as.numeric( unlist(heart [, 5]))
boxplot(heart_health_variable , main = names(heart)[5])

heart_health_variable  <- as.numeric( unlist(heart [, 8]))
boxplot(heart_health_variable , main = names(heart)[8])

heart_health_variable <- as.numeric( unlist(heart [, 10]))
boxplot(heart_health_variable , main = names(heart)[10])

heart_health_variable  <- as.numeric( unlist(heart [, 12]))
boxplot(heart_health_variable , main = names(heart)[12])

model_of_the_transform <- preProcess(heart, method = c("center"))
print(model_of_the_transform)
## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - centered (6)
##   - ignored (8)
heart_center_transform <- predict(model_of_the_transform,
                                 heart)


# AFTER 
#1, 4, 5, 8, 10, 12

heart_health_variable  <- as.numeric( unlist(heart_center_transform [, 1]))
boxplot(heart_health_variable , main = names(heart_center_transform)[1])

heart_health_variable  <- as.numeric( unlist(heart_center_transform [, 4]))
boxplot(heart_health_variable , main = names(heart_center_transform)[4])

heart_health_variable  <- as.numeric( unlist(heart_center_transform [, 5]))
boxplot(heart_health_variable , main = names(heart_center_transform)[5])

heart_health_variable  <- as.numeric( unlist(heart_center_transform [, 8]))
boxplot(heart_health_variable , main = names(heart_center_transform)[8])

heart_health_variable <- as.numeric( unlist(heart_center_transform [, 10]))
boxplot(heart_health_variable , main = names(heart_center_transform)[10])

heart_health_variable <- as.numeric( unlist(heart_center_transform [, 12]))
boxplot(heart_health_variable , main = names(heart_center_transform)[12])

Yeo-Johnson Power Transform on the Boston Housing Dataset —-

# BEFORE
summary(heart)
##       age        sex     cp         trestbps        chol       fbs     restecg
##  Min.   :29.00   0:147   0:241   Min.   : 94   Min.   :126.0   0:426   0:246  
##  1st Qu.:48.00   1:353   1: 86   1st Qu.:120   1st Qu.:211.8   1: 74   1:244  
##  Median :56.00           2:134   Median :130   Median :239.0           2: 10  
##  Mean   :54.68           3: 39   Mean   :131   Mean   :246.3                  
##  3rd Qu.:61.00                   3rd Qu.:140   3rd Qu.:273.2                  
##  Max.   :77.00                   Max.   :200   Max.   :564.0                  
##     thalach    exang      oldpeak      slope         ca        thal   
##  Min.   : 71   0:339   Min.   :0.000   0: 29   Min.   :0.000   0:  3  
##  1st Qu.:136   1:161   1st Qu.:0.000   1:226   1st Qu.:0.000   1: 33  
##  Median :154           Median :0.800   2:245   Median :0.000   2:264  
##  Mean   :150           Mean   :1.041           Mean   :0.716   3:200  
##  3rd Qu.:166           3rd Qu.:1.600           3rd Qu.:1.000          
##  Max.   :202           Max.   :5.600           Max.   :4.000          
##  target   
##  neg:244  
##  pos:256  
##           
##           
##           
## 
#Calculate the skewness before the Box-Cox transform
sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  skewness, type = 2)
##        age   trestbps       chol    thalach    oldpeak         ca 
## -0.1969603  0.7392460  1.3148294 -0.6468942  1.1584133  1.2639445
#Plot a histogram to view the skewness before the Box-Cox transform
heart_health_variable <- as.numeric( unlist(heart [, 1]))
hist(heart_health_variable , main = names(heart_health_variable )[1])

heart_health_variable  <- as.numeric( unlist(heart [, 4]))
hist(heart_health_variable , main = names(heart)[4])

heart_health_variable  <- as.numeric( unlist(heart [, 5]))
hist(heart_health_variable , main = names(heart)[5])

heart_health_variable  <- as.numeric( unlist(heart [, 8]))
hist(heart_health_variable , main = names(heart)[8])

heart_health_variable <- as.numeric( unlist(heart [, 10]))
hist(heart_health_variable , main = names(heart)[10])

heart_health_variable  <- as.numeric( unlist(heart [, 12]))
hist(heart_health_variable , main = names(heart)[12])

model_of_the_transform <- preProcess(heart, method = c("YeoJohnson"))
print(model_of_the_transform)
## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - ignored (8)
##   - Yeo-Johnson transformation (6)
## 
## Lambda estimates for Yeo-Johnson transformation:
## 1.41, -0.87, -0.29, 2.36, -0.51, -1.33
heart_yeo_johnson_transform <- predict(model_of_the_transform,
                                 heart)



# AFTER 
#1, 4, 5, 8, 10, 12

sapply(heart_yeo_johnson_transform[, -c(2, 3, 6, 7, 9, 11, 13, 14)],  skewness, type = 2)
##           age      trestbps          chol       thalach       oldpeak 
## -0.0480406918  0.0004529974 -0.0142411533 -0.0604063559  0.1282174464 
##            ca 
##  0.4161702231
heart_health_variable  <- as.numeric( unlist(heart_yeo_johnson_transform [, 1]))
hist(heart_health_variable , main = names(heart_yeo_johnson_transform)[1])

heart_health_variable  <- as.numeric( unlist(heart_yeo_johnson_transform [, 4]))
hist(heart_health_variable , main = names(heart_yeo_johnson_transform)[4])

heart_health_variable  <- as.numeric( unlist(heart_yeo_johnson_transform [, 5]))
hist(heart_health_variable , main = names(heart_yeo_johnson_transform)[5])

heart_health_variable  <- as.numeric( unlist(heart_yeo_johnson_transform [, 8]))
hist(heart_health_variable , main = names(heart_yeo_johnson_transform)[8])

heart_health_variable <- as.numeric( unlist(heart_yeo_johnson_transform [, 10]))
hist(heart_health_variable , main = names(heart_yeo_johnson_transform)[10])

heart_health_variable <- as.numeric( unlist(heart_yeo_johnson_transform [, 12]))
hist(heart_health_variable , main = names(heart_yeo_johnson_transform)[12])

PCA for Feature Extraction on the Boston Housing Dataset —-

heart_pca_fe <- princomp(cor(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)]))
summary(heart_pca_fe)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4     Comp.5
## Standard deviation     0.7012142 0.3796381 0.3680016 0.3170306 0.23180681
## Proportion of Variance 0.5312850 0.1557277 0.1463274 0.1085997 0.05806021
## Cumulative Proportion  0.5312850 0.6870127 0.8333401 0.9419398 1.00000000
##                              Comp.6
## Standard deviation     9.132458e-09
## Proportion of Variance 9.011594e-17
## Cumulative Proportion  1.000000e+00
#### Scree Plot ----
# The Scree Plot shows that the 1st 2 principal components can cumulatively
# explain 92.8% of the variance, i.e., 87.7% + 5.1% = 92.8%.
factoextra::fviz_eig(heart_pca_fe, addlabels = TRUE)

heart_pca_fe$loadings[, 1:2]
##               Comp.1     Comp.2
## age       0.50238175  0.2863519
## trestbps  0.11895391  0.5074929
## chol      0.06973052  0.4976462
## thalach  -0.67417832 -0.0746078
## oldpeak   0.40879449 -0.2318280
## ca        0.32706399 -0.5945513
factoextra::fviz_cos2(heart_pca_fe, choice = "var", axes = 1:2)

factoextra::fviz_pca_var(heart_pca_fe, col.var = "cos2",
                         gradient.cols = c("red", "orange", "green"),
                         repel = TRUE)

# Independent Component Analysis (ICA) Linear Algebra Transform ----
### STEP 4d. ICA Linear Algebra Transform for Dimensionality Reduction ----

# Independent Component Analysis (ICA) transforms the data to return only the
# independent components. The n.comp argument is required to specify the
# desired number of independent components. This also results in a list of
# attributes that are uncorrelated.

if (!is.element("fastICA", installed.packages()[, 1])) {
  install.packages("fastICA", dependencies = TRUE)
}
require("fastICA")
## Loading required package: fastICA

ICA for Dimensionality Reduction on the Boston Housing Dataset —-

summary(heart)
##       age        sex     cp         trestbps        chol       fbs     restecg
##  Min.   :29.00   0:147   0:241   Min.   : 94   Min.   :126.0   0:426   0:246  
##  1st Qu.:48.00   1:353   1: 86   1st Qu.:120   1st Qu.:211.8   1: 74   1:244  
##  Median :56.00           2:134   Median :130   Median :239.0           2: 10  
##  Mean   :54.68           3: 39   Mean   :131   Mean   :246.3                  
##  3rd Qu.:61.00                   3rd Qu.:140   3rd Qu.:273.2                  
##  Max.   :77.00                   Max.   :200   Max.   :564.0                  
##     thalach    exang      oldpeak      slope         ca        thal   
##  Min.   : 71   0:339   Min.   :0.000   0: 29   Min.   :0.000   0:  3  
##  1st Qu.:136   1:161   1st Qu.:0.000   1:226   1st Qu.:0.000   1: 33  
##  Median :154           Median :0.800   2:245   Median :0.000   2:264  
##  Mean   :150           Mean   :1.041           Mean   :0.716   3:200  
##  3rd Qu.:166           3rd Qu.:1.600           3rd Qu.:1.000          
##  Max.   :202           Max.   :5.600           Max.   :4.000          
##  target   
##  neg:244  
##  pos:256  
##           
##           
##           
## 
model_of_the_transform <- preProcess(heart,
                                     method = c("scale", "center", "ica"),
                                     n.comp = 5)
print(model_of_the_transform)
## Created from 500 samples and 14 variables
## 
## Pre-processing:
##   - centered (6)
##   - independent component signal extraction (6)
##   - ignored (8)
##   - scaled (6)
## 
## ICA used 5 components
heart_ica_dr <- predict(model_of_the_transform, heart)

summary(heart_ica_dr)
##  sex     cp      fbs     restecg exang   slope   thal    target   
##  0:147   0:241   0:426   0:246   0:339   0: 29   0:  3   neg:244  
##  1:353   1: 86   1: 74   1:244   1:161   1:226   1: 33   pos:256  
##          2:134           2: 10           2:245   2:264            
##          3: 39                                   3:200            
##                                                                   
##                                                                   
##       ICA1               ICA2                ICA3               ICA4         
##  Min.   :-2.41852   Min.   :-3.591591   Min.   :-6.34953   Min.   :-3.71925  
##  1st Qu.:-0.76748   1st Qu.:-0.523542   1st Qu.:-0.48595   1st Qu.:-0.48108  
##  Median : 0.02731   Median : 0.008073   Median : 0.09773   Median : 0.08747  
##  Mean   : 0.00000   Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.80934   3rd Qu.: 0.504764   3rd Qu.: 0.69382   3rd Qu.: 0.61364  
##  Max.   : 2.38878   Max.   : 3.372602   Max.   : 2.47824   Max.   : 2.47281  
##       ICA5         
##  Min.   :-3.18425  
##  1st Qu.:-0.61222  
##  Median : 0.02158  
##  Mean   : 0.00000  
##  3rd Qu.: 0.59903  
##  Max.   : 2.82145