Myriam Aicha Mbongo 10/16/2023
| Student ID Numbers and Names of Group Members | GitHub Classroom Group Name |
|---|---|
| 2. 134141 - C - Aicha Mbongo |
| Course Code | Course Name | Program | Semester Duration |
|---|---|---|---|
| BBT4206 | Business Intelligence II | Bachelor of Business Information Technology | 21st August 2023 to 28th November 2023 |
# STEP 1. Install and Load the Required Packages ----
# The following packages should be installed and loaded before proceeding to the
# subsequent steps.
## readr ----
if (require("readr")) {
require("readr")
} else {
install.packages("readr", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}## Loading required package: readr
## caret ----
if (require("caret")) {
require("caret")
} else {
install.packages("caret", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}## Loading required package: caret
## Loading required package: ggplot2
## Loading required package: lattice
## e1071 ----
if (require("e1071")) {
require("e1071")
} else {
install.packages("e1071", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}## Loading required package: e1071
## factoextra ----
if (require("factoextra")) {
require("factoextra")
} else {
install.packages("factoextra", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}## Loading required package: factoextra
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## FactoMineR ----
if (require("FactoMineR")) {
require("FactoMineR")
} else {
install.packages("FactoMineR", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}## Loading required package: FactoMineR
if (!is.element("NHANES", installed.packages()[, 1])) {
install.packages("NHANES", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}
require("NHANES")## Loading required package: NHANES
## dplyr ----
if (!is.element("dplyr", installed.packages()[, 1])) {
install.packages("dplyr", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}
require("dplyr")## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## naniar ----
# Documentation:
# https://cran.r-project.org/package=naniar or
# https://www.rdocumentation.org/packages/naniar/versions/1.0.0
if (!is.element("naniar", installed.packages()[, 1])) {
install.packages("naniar", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}
require("naniar")## Loading required package: naniar
## ggplot2 ----
# We require the "ggplot2" package to create more appealing visualizations
if (!is.element("ggplot2", installed.packages()[, 1])) {
install.packages("ggplot2", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}
require("ggplot2")
## MICE ----
# We use the MICE package to perform data imputation
if (!is.element("mice", installed.packages()[, 1])) {
install.packages("mice", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}
require("mice")## Loading required package: mice
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
## Amelia ----
if (!is.element("Amelia", installed.packages()[, 1])) {
install.packages("Amelia", dependencies = TRUE,
repos = "https://cloud.r-project.org")
}
require("Amelia")## Loading required package: Amelia
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(readr)
heart <- read_csv(
"../data/heart.csv",
col_types = cols(
age = col_double(),
sex = col_factor(levels = c("0", "1")),
cp = col_factor(levels = c("0", "1", "2", "3")),
trestbps = col_double(),
chol = col_double(),
fbs = col_factor(levels = c("0", "1")),
restecg = col_factor(levels = c("0", "1", "2")),
thalach = col_double(),
exang = col_factor(levels = c("0", "1")),
oldpeak = col_double(),
slope = col_factor(levels = c("0", "1", "2")),
ca = col_double(),
thal = col_factor(levels = c("0", "1", "2", "3")),
target = col_factor(levels = c("neg", "pos"))
)
)
#View(heart)### STEP 3a. Pre#View the Loaded Datasets, Identify the Data Types ----
# Dimensions refer to the number of observations (rows) and the number of
# attributes/variables/features (columns).
#Understanding data types is key for effective analysis.It helps choose suitable visualizations and algorithms,
#and highlights the need for conversions between categorical and numerical data when necessary.
dim(heart)## [1] 1025 14
## age sex cp trestbps chol fbs restecg thalach
## "numeric" "factor" "factor" "numeric" "numeric" "factor" "factor" "numeric"
## exang oldpeak slope ca thal target
## "factor" "numeric" "factor" "numeric" "factor" "factor"
# It is more sensible to count categorical variables (factors or dimensions)
# than numeric variables, e.g., counting the number of male and female
# participants instead of counting the frequency of each participant’s height.
heart_freq <- heart$target
cbind(frequency = table(heart_freq),
percentage = prop.table(table(heart_freq)) * 100)## frequency percentage
## neg 499 48.68293
## pos 526 51.31707
# We, therefore, must manually create a function that can calculate the mode.
heart_target_mode <- names(table(heart$target))[
which(table(heart$target) == max(table(heart$target)))
]
print(heart_target_mode)## [1] "pos"
## age sex cp trestbps chol fbs restecg
## Min. :29.00 0:312 0:497 Min. : 94.0 Min. :126 0:872 0:497
## 1st Qu.:48.00 1:713 1:167 1st Qu.:120.0 1st Qu.:211 1:153 1:513
## Median :56.00 2:284 Median :130.0 Median :240 2: 15
## Mean :54.43 3: 77 Mean :131.6 Mean :246
## 3rd Qu.:61.00 3rd Qu.:140.0 3rd Qu.:275
## Max. :77.00 Max. :200.0 Max. :564
## thalach exang oldpeak slope ca thal
## Min. : 71.0 0:680 Min. :0.000 0: 74 Min. :0.0000 0: 7
## 1st Qu.:132.0 1:345 1st Qu.:0.000 1:482 1st Qu.:0.0000 1: 64
## Median :152.0 Median :0.800 2:469 Median :0.0000 2:544
## Mean :149.1 Mean :1.072 Mean :0.7541 3:410
## 3rd Qu.:166.0 3rd Qu.:1.800 3rd Qu.:1.0000
## Max. :202.0 Max. :6.200 Max. :4.0000
## target
## neg:499
## pos:526
##
##
##
##
# calculate the standard deviation of only columns that are numeric, thus
# leaving out the columns termed as “factors” (categorical) or those that have
# a string data type.
sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)], sd)## age trestbps chol thalach oldpeak ca
## 9.072290 17.516718 51.592510 23.005724 1.175053 1.030798
## age trestbps chol thalach oldpeak ca
## 9.072290 17.516718 51.592510 23.005724 1.175053 1.030798
# The Kurtosis informs you of how often outliers occur in the results.
# There are different formulas for calculating kurtosis.
# Specifying “type = 2” allows us to use the 2nd formula which is the same
# kurtosis formula used in SPSS and SAS.
# In “type = 2” (used in SPSS and SAS):
# 1. Kurtosis < 3 implies a low number of outliers
# 2. Kurtosis = 3 implies a medium number of outliers
# 3. Kurtosis > 3 implies a high number of outliers
if (!is.element("e1071", installed.packages()[, 1])) {
install.packages("e1071", dependencies = TRUE)
}
require("e1071")
sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)], kurtosis, type = 2)## age trestbps chol thalach oldpeak ca
## -0.52561781 0.99122074 3.99680305 -0.08882249 1.31447089 0.70112287
# The skewness informs you of the asymmetry of the distribution of results.
# Using “type = 2” can be interpreted as:
# 1. Skewness between -0.4 and 0.4 (inclusive) implies that there is no skew
# in the distribution of results; the distribution of results is symmetrical;
# it is a normal distribution.
# 2. Skewness above 0.4 implies a positive skew; a right-skewed distribution.
# 3. Skewness below -0.4 implies a negative skew; a left-skewed distribution.
sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)], skewness, type = 2)## age trestbps chol thalach oldpeak ca
## -0.2488659 0.7397682 1.0740728 -0.5137772 1.2108994 1.2611886
# Note that the covariance and the correlation are computed for numeric values
# only, not categorical values.
heart_cov <- cov(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)])
#View(heart_cov)# One-Way ANOVA can be used to test the effect of the 3 types of fertilizer on
# crop yield whereas,
# Two-Way ANOVA can be used to test the effect of the 3 types of fertilizer and
# the 2 types of planting density on crop yield.
heart_one_way_anova <- aov(trestbps ~ age, data = heart)
summary(heart_one_way_anova)## Df Sum Sq Mean Sq F value Pr(>F)
## age 1 23096 23096 81.16 <2e-16 ***
## Residuals 1023 291104 285
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The ANOVA rejects the null hypothesis,The ANOVA indicates a significant difference in resting blood pressure among age groups
#(F(1, 1023) = 81.16, p < 2e-16), highlighting age as a key factor
#in determining blood pressure.
#This aligns with cardiovascular knowledge, correlating increased age with a higher risk of cardiovascular disease.
heart_two_way_anova <- aov(trestbps ~ exang + ca, # nolint
data = heart)
summary(heart_two_way_anova)## Df Sum Sq Mean Sq F value Pr(>F)
## exang 1 1177 1176.7 3.88 0.04914 *
## ca 1 3050 3050.2 10.06 0.00156 **
## Residuals 1022 309973 303.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#both variables, exercise-induced angina and the number of major vessels, are associated with statistically significant differences in
#resting blood pressure.# Histograms help in determining whether an attribute has a Gaussian
# distribution. They can also be used to identify the presence of outliers.
par(mfrow = c(1, 3))
for (i in c(1, 4, 5)) {
heart_variable <- as.numeric(unlist(heart[, i]))
hist(heart_variable, main = names(heart)[i])
}heart_health_variable <- as.numeric( unlist(heart[, 8]))
hist(heart_health_variable , main = names(heart)[8])
heart_health_variable <- as.numeric( unlist(heart [, 10]))
hist(heart_health_variable , main = names(heart)[10])
heart_health_variable <- as.numeric( unlist(heart [, 12]))
hist(heart_health_variable , main = names(heart)[12])# Box and whisker plots are useful in understanding the distribution of data.
par(mfrow = c(1, 3))
for (i in c(1, 4, 5)) {
boxplot(heart[, i], main = names(heart)[i])
}boxplot(heart[, 8], main = names(heart)[8])
boxplot(heart[, 10], main = names(heart)[10])
boxplot(heart[, 12], main = names(heart)[12])# Categorical attributes (factors) can also be visualized. This is done using a
# bar chart to give an idea of the proportion of instances that belong to each
# category.
barplot(table(heart[, 2]), main = names(heart)[2])







# Execute the following to create a map to identify the missing data in each
# dataset:
if (!is.element("Amelia", installed.packages()[, 1])) {
install.packages("Amelia", dependencies = TRUE)
}
require("Amelia")
#comment
missmap(heart, col = c("red", "grey"), legend = TRUE)
# Correlation plots can be used to get an idea of which attributes change
# together. The function “corrplot()” found in the package “corrplot” is
# required to perform this. The larger the dot in the correlation plot, the
# larger the correlation. Blue represents a positive correlation whereas red
# represents a negative correlation.
if (!is.element("corrplot", installed.packages()[, 1])) {
install.packages("corrplot", dependencies = TRUE)
}
require("corrplot")## Loading required package: corrplot
## corrplot 0.92 loaded
#heart <- heart[, -which(names(heart) == "target_numeric")]
# Alternatively, the 'ggcorrplot::ggcorrplot()' function can be used to plot a
# more visually appealing plot.
# The code below shows how to install a package in R:
if (!is.element("ggcorrplot", installed.packages()[, 1])) {
install.packages("ggcorrplot", dependencies = TRUE)
}
require("ggcorrplot")## Loading required package: ggcorrplot

# Alternatively, the ggcorrplot package can be used to make the plots more
# appealing:
ggplot(heart,
aes(x = age, y = sex, shape = target, color = target)) +
geom_point() +
geom_smooth(method = lm)## `geom_smooth()` using formula = 'y ~ x'

### Subset of rows ----
# We then select 500 random observations to be included in the dataset
rand_ind <- sample(seq_len(nrow(heart)), 500)
heart <- heart[rand_ind, ]## [1] FALSE
## [1] 0
## [1] 0
## age sex cp trestbps chol fbs restecg thalach
## 0 0 0 0 0 0 0 0
## exang oldpeak slope ca thal target
## 0 0 0 0 0 0
# What is the number and percentage of missing values grouped by
# each variable?
miss_var_summary(heart)## # A tibble: 14 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 age 0 0
## 2 sex 0 0
## 3 cp 0 0
## 4 trestbps 0 0
## 5 chol 0 0
## 6 fbs 0 0
## 7 restecg 0 0
## 8 thalach 0 0
## 9 exang 0 0
## 10 oldpeak 0 0
## 11 slope 0 0
## 12 ca 0 0
## 13 thal 0 0
## 14 target 0 0
# What is the number and percentage of missing values grouped by
# each observation?
miss_case_summary(heart)## # A tibble: 500 × 3
## case n_miss pct_miss
## <int> <int> <dbl>
## 1 1 0 0
## 2 2 0 0
## 3 3 0 0
## 4 4 0 0
## 5 5 0 0
## 6 6 0 0
## 7 7 0 0
## 8 8 0 0
## 9 9 0 0
## 10 10 0 0
## # ℹ 490 more rows
# Which variables contain the most missing values?
#gg_miss_var(heart)
# Where are missing values located (the shaded regions in the plot)?
#vis_miss(heart) + theme(axis.text.x = element_text(angle = 80))
# Which combinations of variables are missing together?
#gg_miss_upset(heart)
# Create a heatmap of "missingness" broken down by "target"
# First, confirm that the "target" variable is a categorical variable
is.factor(heart$target)## [1] TRUE
# Second, create the visualization
#gg_miss_fct(heart, fct = target)
# We can also create a heatmap of "missingness" broken down by "exang"
# First, confirm that the "exang" variable is a categorical variable
is.factor(heart$exang)## [1] TRUE
## age sex cp trestbps chol fbs restecg
## Min. :29.00 0:147 0:241 Min. : 94 Min. :126.0 0:426 0:246
## 1st Qu.:48.00 1:353 1: 86 1st Qu.:120 1st Qu.:211.8 1: 74 1:244
## Median :56.00 2:134 Median :130 Median :239.0 2: 10
## Mean :54.68 3: 39 Mean :131 Mean :246.3
## 3rd Qu.:61.00 3rd Qu.:140 3rd Qu.:273.2
## Max. :77.00 Max. :200 Max. :564.0
## thalach exang oldpeak slope ca thal
## Min. : 71 0:339 Min. :0.000 0: 29 Min. :0.000 0: 3
## 1st Qu.:136 1:161 1st Qu.:0.000 1:226 1st Qu.:0.000 1: 33
## Median :154 Median :0.800 2:245 Median :0.000 2:264
## Mean :150 Mean :1.041 Mean :0.716 3:200
## 3rd Qu.:166 3rd Qu.:1.600 3rd Qu.:1.000
## Max. :202 Max. :5.600 Max. :4.000
## target
## neg:244
## pos:256
##
##
##
##
# BEFORE
heart_health_variable <- as.numeric( unlist(heart [, 1]))
hist(heart_health_variable , main = names(heart_health_variable )[1])heart_health_variable <- as.numeric( unlist(heart [, 4]))
hist(heart_health_variable , main = names(heart)[4])heart_health_variable <- as.numeric( unlist(heart [, 5]))
hist(heart_health_variable , main = names(heart)[5])heart_health_variable <- as.numeric( unlist(heart [, 8]))
hist(heart_health_variable , main = names(heart)[8])heart_health_variable <- as.numeric( unlist(heart [, 10]))
hist(heart_health_variable , main = names(heart)[10])heart_health_variable <- as.numeric( unlist(heart [, 12]))
hist(heart_health_variable , main = names(heart)[12])## Created from 500 samples and 14 variables
##
## Pre-processing:
## - ignored (8)
## - scaled (6)
heart_scale_transform <- predict(model_of_the_transform,
heart)
# AFTER
#1, 4, 5, 8, 10, 12
heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 1]))
hist(heart_health_variable , main = names(heart_scale_transform)[1])heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 4]))
hist(heart_health_variable , main = names(heart_scale_transform)[4])heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 5]))
hist(heart_health_variable , main = names(heart_scale_transform)[5])heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 8]))
hist(heart_health_variable , main = names(heart_scale_transform)[8])heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 10]))
hist(heart_health_variable , main = names(heart_scale_transform)[10])heart_health_variable <- as.numeric( unlist(heart_scale_transform [, 12]))
hist(heart_health_variable , main = names(heart_scale_transform)[12])## age sex cp trestbps chol fbs restecg
## Min. :29.00 0:147 0:241 Min. : 94 Min. :126.0 0:426 0:246
## 1st Qu.:48.00 1:353 1: 86 1st Qu.:120 1st Qu.:211.8 1: 74 1:244
## Median :56.00 2:134 Median :130 Median :239.0 2: 10
## Mean :54.68 3: 39 Mean :131 Mean :246.3
## 3rd Qu.:61.00 3rd Qu.:140 3rd Qu.:273.2
## Max. :77.00 Max. :200 Max. :564.0
## thalach exang oldpeak slope ca thal
## Min. : 71 0:339 Min. :0.000 0: 29 Min. :0.000 0: 3
## 1st Qu.:136 1:161 1st Qu.:0.000 1:226 1st Qu.:0.000 1: 33
## Median :154 Median :0.800 2:245 Median :0.000 2:264
## Mean :150 Mean :1.041 Mean :0.716 3:200
## 3rd Qu.:166 3rd Qu.:1.600 3rd Qu.:1.000
## Max. :202 Max. :5.600 Max. :4.000
## target
## neg:244
## pos:256
##
##
##
##
# BEFORE
heart_health_variable <- as.numeric( unlist(heart [, 1]))
boxplot(heart_health_variable , main = names(heart_health_variable )[1])heart_health_variable <- as.numeric( unlist(heart [, 4]))
boxplot(heart_health_variable , main = names(heart)[4])heart_health_variable <- as.numeric( unlist(heart [, 5]))
boxplot(heart_health_variable , main = names(heart)[5])heart_health_variable <- as.numeric( unlist(heart [, 8]))
boxplot(heart_health_variable , main = names(heart)[8])heart_health_variable <- as.numeric( unlist(heart [, 10]))
boxplot(heart_health_variable , main = names(heart)[10])heart_health_variable <- as.numeric( unlist(heart [, 12]))
boxplot(heart_health_variable , main = names(heart)[12])## Created from 500 samples and 14 variables
##
## Pre-processing:
## - centered (6)
## - ignored (8)
heart_center_transform <- predict(model_of_the_transform,
heart)
# AFTER
#1, 4, 5, 8, 10, 12
heart_health_variable <- as.numeric( unlist(heart_center_transform [, 1]))
boxplot(heart_health_variable , main = names(heart_center_transform)[1])heart_health_variable <- as.numeric( unlist(heart_center_transform [, 4]))
boxplot(heart_health_variable , main = names(heart_center_transform)[4])heart_health_variable <- as.numeric( unlist(heart_center_transform [, 5]))
boxplot(heart_health_variable , main = names(heart_center_transform)[5])heart_health_variable <- as.numeric( unlist(heart_center_transform [, 8]))
boxplot(heart_health_variable , main = names(heart_center_transform)[8])heart_health_variable <- as.numeric( unlist(heart_center_transform [, 10]))
boxplot(heart_health_variable , main = names(heart_center_transform)[10])heart_health_variable <- as.numeric( unlist(heart_center_transform [, 12]))
boxplot(heart_health_variable , main = names(heart_center_transform)[12])## age sex cp trestbps chol fbs restecg
## Min. :29.00 0:147 0:241 Min. : 94 Min. :126.0 0:426 0:246
## 1st Qu.:48.00 1:353 1: 86 1st Qu.:120 1st Qu.:211.8 1: 74 1:244
## Median :56.00 2:134 Median :130 Median :239.0 2: 10
## Mean :54.68 3: 39 Mean :131 Mean :246.3
## 3rd Qu.:61.00 3rd Qu.:140 3rd Qu.:273.2
## Max. :77.00 Max. :200 Max. :564.0
## thalach exang oldpeak slope ca thal
## Min. : 71 0:339 Min. :0.000 0: 29 Min. :0.000 0: 3
## 1st Qu.:136 1:161 1st Qu.:0.000 1:226 1st Qu.:0.000 1: 33
## Median :154 Median :0.800 2:245 Median :0.000 2:264
## Mean :150 Mean :1.041 Mean :0.716 3:200
## 3rd Qu.:166 3rd Qu.:1.600 3rd Qu.:1.000
## Max. :202 Max. :5.600 Max. :4.000
## target
## neg:244
## pos:256
##
##
##
##
#Calculate the skewness before the Box-Cox transform
sapply(heart[, -c(2, 3, 6, 7, 9, 11, 13, 14)], skewness, type = 2)## age trestbps chol thalach oldpeak ca
## -0.1969603 0.7392460 1.3148294 -0.6468942 1.1584133 1.2639445
#Plot a histogram to view the skewness before the Box-Cox transform
heart_health_variable <- as.numeric( unlist(heart [, 1]))
hist(heart_health_variable , main = names(heart_health_variable )[1])
heart_health_variable <- as.numeric( unlist(heart [, 4]))
hist(heart_health_variable , main = names(heart)[4])
heart_health_variable <- as.numeric( unlist(heart [, 5]))
hist(heart_health_variable , main = names(heart)[5])
heart_health_variable <- as.numeric( unlist(heart [, 8]))
hist(heart_health_variable , main = names(heart)[8])
heart_health_variable <- as.numeric( unlist(heart [, 10]))
hist(heart_health_variable , main = names(heart)[10])
heart_health_variable <- as.numeric( unlist(heart [, 12]))
hist(heart_health_variable , main = names(heart)[12])
## Created from 500 samples and 14 variables
##
## Pre-processing:
## - ignored (8)
## - Yeo-Johnson transformation (6)
##
## Lambda estimates for Yeo-Johnson transformation:
## 1.41, -0.87, -0.29, 2.36, -0.51, -1.33
heart_yeo_johnson_transform <- predict(model_of_the_transform,
heart)
# AFTER
#1, 4, 5, 8, 10, 12
sapply(heart_yeo_johnson_transform[, -c(2, 3, 6, 7, 9, 11, 13, 14)], skewness, type = 2)## age trestbps chol thalach oldpeak
## -0.0480406918 0.0004529974 -0.0142411533 -0.0604063559 0.1282174464
## ca
## 0.4161702231
heart_health_variable <- as.numeric( unlist(heart_yeo_johnson_transform [, 1]))
hist(heart_health_variable , main = names(heart_yeo_johnson_transform)[1])
heart_health_variable <- as.numeric( unlist(heart_yeo_johnson_transform [, 4]))
hist(heart_health_variable , main = names(heart_yeo_johnson_transform)[4])
heart_health_variable <- as.numeric( unlist(heart_yeo_johnson_transform [, 5]))
hist(heart_health_variable , main = names(heart_yeo_johnson_transform)[5])
heart_health_variable <- as.numeric( unlist(heart_yeo_johnson_transform [, 8]))
hist(heart_health_variable , main = names(heart_yeo_johnson_transform)[8])
heart_health_variable <- as.numeric( unlist(heart_yeo_johnson_transform [, 10]))
hist(heart_health_variable , main = names(heart_yeo_johnson_transform)[10])
heart_health_variable <- as.numeric( unlist(heart_yeo_johnson_transform [, 12]))
hist(heart_health_variable , main = names(heart_yeo_johnson_transform)[12])
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 0.7012142 0.3796381 0.3680016 0.3170306 0.23180681
## Proportion of Variance 0.5312850 0.1557277 0.1463274 0.1085997 0.05806021
## Cumulative Proportion 0.5312850 0.6870127 0.8333401 0.9419398 1.00000000
## Comp.6
## Standard deviation 9.132458e-09
## Proportion of Variance 9.011594e-17
## Cumulative Proportion 1.000000e+00
#### Scree Plot ----
# The Scree Plot shows that the 1st 2 principal components can cumulatively
# explain 92.8% of the variance, i.e., 87.7% + 5.1% = 92.8%.
factoextra::fviz_eig(heart_pca_fe, addlabels = TRUE)## Comp.1 Comp.2
## age 0.50238175 0.2863519
## trestbps 0.11895391 0.5074929
## chol 0.06973052 0.4976462
## thalach -0.67417832 -0.0746078
## oldpeak 0.40879449 -0.2318280
## ca 0.32706399 -0.5945513
factoextra::fviz_pca_var(heart_pca_fe, col.var = "cos2",
gradient.cols = c("red", "orange", "green"),
repel = TRUE)# Independent Component Analysis (ICA) Linear Algebra Transform ----
### STEP 4d. ICA Linear Algebra Transform for Dimensionality Reduction ----
# Independent Component Analysis (ICA) transforms the data to return only the
# independent components. The n.comp argument is required to specify the
# desired number of independent components. This also results in a list of
# attributes that are uncorrelated.
if (!is.element("fastICA", installed.packages()[, 1])) {
install.packages("fastICA", dependencies = TRUE)
}
require("fastICA")## Loading required package: fastICA
## age sex cp trestbps chol fbs restecg
## Min. :29.00 0:147 0:241 Min. : 94 Min. :126.0 0:426 0:246
## 1st Qu.:48.00 1:353 1: 86 1st Qu.:120 1st Qu.:211.8 1: 74 1:244
## Median :56.00 2:134 Median :130 Median :239.0 2: 10
## Mean :54.68 3: 39 Mean :131 Mean :246.3
## 3rd Qu.:61.00 3rd Qu.:140 3rd Qu.:273.2
## Max. :77.00 Max. :200 Max. :564.0
## thalach exang oldpeak slope ca thal
## Min. : 71 0:339 Min. :0.000 0: 29 Min. :0.000 0: 3
## 1st Qu.:136 1:161 1st Qu.:0.000 1:226 1st Qu.:0.000 1: 33
## Median :154 Median :0.800 2:245 Median :0.000 2:264
## Mean :150 Mean :1.041 Mean :0.716 3:200
## 3rd Qu.:166 3rd Qu.:1.600 3rd Qu.:1.000
## Max. :202 Max. :5.600 Max. :4.000
## target
## neg:244
## pos:256
##
##
##
##
model_of_the_transform <- preProcess(heart,
method = c("scale", "center", "ica"),
n.comp = 5)
print(model_of_the_transform)## Created from 500 samples and 14 variables
##
## Pre-processing:
## - centered (6)
## - independent component signal extraction (6)
## - ignored (8)
## - scaled (6)
##
## ICA used 5 components
## sex cp fbs restecg exang slope thal target
## 0:147 0:241 0:426 0:246 0:339 0: 29 0: 3 neg:244
## 1:353 1: 86 1: 74 1:244 1:161 1:226 1: 33 pos:256
## 2:134 2: 10 2:245 2:264
## 3: 39 3:200
##
##
## ICA1 ICA2 ICA3 ICA4
## Min. :-2.41852 Min. :-3.591591 Min. :-6.34953 Min. :-3.71925
## 1st Qu.:-0.76748 1st Qu.:-0.523542 1st Qu.:-0.48595 1st Qu.:-0.48108
## Median : 0.02731 Median : 0.008073 Median : 0.09773 Median : 0.08747
## Mean : 0.00000 Mean : 0.000000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.80934 3rd Qu.: 0.504764 3rd Qu.: 0.69382 3rd Qu.: 0.61364
## Max. : 2.38878 Max. : 3.372602 Max. : 2.47824 Max. : 2.47281
## ICA5
## Min. :-3.18425
## 1st Qu.:-0.61222
## Median : 0.02158
## Mean : 0.00000
## 3rd Qu.: 0.59903
## Max. : 2.82145