library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the heart-disease data set.
# Blank cells must be read as NA: with the default `na.strings = "NA"` an empty
# categorical cell becomes the string "", which is then counted as a valid
# category (it shows up as an extra baseline level in model output) and is
# never picked up by the NA checks or the imputation steps.
# NOTE(review): the absolute Windows path ties the script to one machine;
# consider a project-relative path.
data <- read.csv(
  "C:/Users/ACER ASPIRE 5/Documents/heart_disease.csv",
  na.strings = c("", "NA")
)
# Summary of the data: column types and the first few values of every column.
str(data)
## 'data.frame': 10000 obs. of 21 variables:
## $ Age : num 56 69 46 32 60 25 78 38 56 75 ...
## $ Gender : chr "Male" "Female" "Male" "Female" ...
## $ Blood.Pressure : num 153 146 126 122 166 152 121 161 135 144 ...
## $ Cholesterol.Level : num 155 286 216 293 242 257 175 187 291 252 ...
## $ Exercise.Habits : chr "High" "High" "Low" "High" ...
## $ Smoking : chr "Yes" "No" "No" "Yes" ...
## $ Family.Heart.Disease: chr "Yes" "Yes" "No" "Yes" ...
## $ Diabetes : chr "No" "Yes" "No" "No" ...
## $ BMI : num 25 25.2 29.9 24.1 20.5 ...
## $ High.Blood.Pressure : chr "Yes" "No" "No" "Yes" ...
## $ Low.HDL.Cholesterol : chr "Yes" "Yes" "Yes" "No" ...
## $ High.LDL.Cholesterol: chr "No" "No" "Yes" "Yes" ...
## $ Alcohol.Consumption : chr "High" "Medium" "Low" "Low" ...
## $ Stress.Level : chr "Medium" "High" "Low" "High" ...
## $ Sleep.Hours : num 7.63 8.74 4.44 5.25 7.03 ...
## $ Sugar.Consumption : chr "Medium" "Medium" "Low" "High" ...
## $ Triglyceride.Level : num 342 133 393 293 263 126 107 228 317 199 ...
## $ Fasting.Blood.Sugar : num NA 157 92 94 154 91 85 111 103 96 ...
## $ CRP.Level : num 12.97 9.36 12.71 12.51 10.38 ...
## $ Homocysteine.Level : num 12.39 19.3 11.23 5.96 8.15 ...
## $ Heart.Disease.Status: chr "No" "No" "No" "No" ...
# Per-column summary statistics; numeric columns also report their NA count.
summary(data)
## Age Gender Blood.Pressure Cholesterol.Level
## Min. :18.0 Length:10000 Min. :120.0 Min. :150.0
## 1st Qu.:34.0 Class :character 1st Qu.:134.0 1st Qu.:187.0
## Median :49.0 Mode :character Median :150.0 Median :226.0
## Mean :49.3 Mean :149.8 Mean :225.4
## 3rd Qu.:65.0 3rd Qu.:165.0 3rd Qu.:263.0
## Max. :80.0 Max. :180.0 Max. :300.0
## NA's :29 NA's :19 NA's :30
## Exercise.Habits Smoking Family.Heart.Disease Diabetes
## Length:10000 Length:10000 Length:10000 Length:10000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BMI High.Blood.Pressure Low.HDL.Cholesterol High.LDL.Cholesterol
## Min. :18.00 Length:10000 Length:10000 Length:10000
## 1st Qu.:23.66 Class :character Class :character Class :character
## Median :29.08 Mode :character Mode :character Mode :character
## Mean :29.08
## 3rd Qu.:34.52
## Max. :40.00
## NA's :22
## Alcohol.Consumption Stress.Level Sleep.Hours Sugar.Consumption
## Length:10000 Length:10000 Min. : 4.001 Length:10000
## Class :character Class :character 1st Qu.: 5.450 Class :character
## Mode :character Mode :character Median : 7.003 Mode :character
## Mean : 6.991
## 3rd Qu.: 8.532
## Max. :10.000
## NA's :25
## Triglyceride.Level Fasting.Blood.Sugar CRP.Level Homocysteine.Level
## Min. :100.0 Min. : 80.0 Min. : 0.003647 Min. : 5.000
## 1st Qu.:176.0 1st Qu.: 99.0 1st Qu.: 3.674126 1st Qu.: 8.723
## Median :250.0 Median :120.0 Median : 7.472164 Median :12.409
## Mean :250.7 Mean :120.1 Mean : 7.472201 Mean :12.456
## 3rd Qu.:326.0 3rd Qu.:141.0 3rd Qu.:11.255592 3rd Qu.:16.141
## Max. :400.0 Max. :160.0 Max. :14.997087 Max. :19.999
## NA's :26 NA's :22 NA's :26 NA's :20
## Heart.Disease.Status
## Length:10000
## Class :character
## Mode :character
##
##
##
##
library(ggplot2)
library(tidyr)
# Histograms of the numeric variables, one facet per variable.
numeric_columns <- c(
  "Age", "Blood.Pressure", "Cholesterol.Level",
  "BMI", "CRP.Level", "Homocysteine.Level",
  "Triglyceride.Level", "Fasting.Blood.Sugar", "Sleep.Hours"
)
cleaned_numeric_data <- data[, numeric_columns]
# Long format: one row per (variable name, value) pair, as facet_wrap() needs.
long_data <- cleaned_numeric_data %>%
  pivot_longer(cols = everything(), names_to = "Variabel", values_to = "Nilai")
long_data %>%
  ggplot(aes(x = Nilai)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  facet_wrap(~ Variabel, ncol = 3, scales = "free") +
  labs(title = "Distribusi Histogram Variabel Numerik", x = "Nilai", y = "Frekuensi") +
  theme_minimal()
## Warning: Removed 219 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Compact column-wise preview of the data (dplyr::glimpse).
glimpse(data)
## Rows: 10,000
## Columns: 21
## $ Age <dbl> 56, 69, 46, 32, 60, 25, 78, 38, 56, 75, 36, 40, 2…
## $ Gender <chr> "Male", "Female", "Male", "Female", "Male", "Male…
## $ Blood.Pressure <dbl> 153, 146, 126, 122, 166, 152, 121, 161, 135, 144,…
## $ Cholesterol.Level <dbl> 155, 286, 216, 293, 242, 257, 175, 187, 291, 252,…
## $ Exercise.Habits <chr> "High", "High", "Low", "High", "Low", "Low", "Hig…
## $ Smoking <chr> "Yes", "No", "No", "Yes", "Yes", "Yes", "Yes", "Y…
## $ Family.Heart.Disease <chr> "Yes", "Yes", "No", "Yes", "Yes", "No", "Yes", "Y…
## $ Diabetes <chr> "No", "Yes", "No", "No", "Yes", "No", "Yes", "Yes…
## $ BMI <dbl> 24.99159, 25.22180, 29.85545, 24.13048, 20.48629,…
## $ High.Blood.Pressure <chr> "Yes", "No", "No", "Yes", "Yes", "No", "No", "No"…
## $ Low.HDL.Cholesterol <chr> "Yes", "Yes", "Yes", "No", "No", "No", "Yes", "No…
## $ High.LDL.Cholesterol <chr> "No", "No", "Yes", "Yes", "No", "No", "No", "No",…
## $ Alcohol.Consumption <chr> "High", "Medium", "Low", "Low", "Low", "Low", "Me…
## $ Stress.Level <chr> "Medium", "High", "Low", "High", "High", "Medium"…
## $ Sleep.Hours <dbl> 7.633228, 8.744034, 4.440440, 5.249405, 7.030971,…
## $ Sugar.Consumption <chr> "Medium", "Medium", "Low", "High", "High", "Low",…
## $ Triglyceride.Level <dbl> 342, 133, 393, 293, 263, 126, 107, 228, 317, 199,…
## $ Fasting.Blood.Sugar <dbl> NA, 157, 92, 94, 154, 91, 85, 111, 103, 96, NA, 1…
## $ CRP.Level <dbl> 12.96924569, 9.35538940, 12.70987253, 12.50904619…
## $ Homocysteine.Level <dbl> 12.387250, 19.298875, 11.230926, 5.961958, 8.1538…
## $ Heart.Disease.Status <chr> "No", "No", "No", "No", "No", "No", "No", "No", "…
# Check missing values: number of NA entries in every column.
cat("Jumlah missing values per kolom:\n")
## Jumlah missing values per kolom:
# vapply() pins the result to one integer per column, whereas sapply() changes
# its return type depending on the input (e.g. a list for zero columns).
vapply(data, function(x) sum(is.na(x)), integer(1))
## Age Gender Blood.Pressure
## 29 0 19
## Cholesterol.Level Exercise.Habits Smoking
## 30 0 0
## Family.Heart.Disease Diabetes BMI
## 0 0 22
## High.Blood.Pressure Low.HDL.Cholesterol High.LDL.Cholesterol
## 0 0 0
## Alcohol.Consumption Stress.Level Sleep.Hours
## 0 0 25
## Sugar.Consumption Triglyceride.Level Fasting.Blood.Sugar
## 0 26 22
## CRP.Level Homocysteine.Level Heart.Disease.Status
## 26 20 0
# Target-variable analysis: class counts, class proportions, a bar chart and a
# simple imbalance check (minority count divided by majority count).
target_col <- "Heart.Disease.Status"
if (target_col %in% names(data)) {
  target_data <- data[[target_col]]
  target_data <- target_data[!is.na(target_data)] # drop NA
  cat("Distribusi variabel target:\n")
  print(table(target_data))
  print(prop.table(table(target_data)))
  # Bar chart of the target distribution.
  # A ggplot object is only drawn when it is printed. Inside `if { }` only the
  # value of the last expression is auto-printed, so without print() this
  # chart is built and silently discarded.
  print(
    ggplot(data, aes(x = factor(.data[[target_col]]))) +
      geom_bar(fill = "steelblue") +
      labs(title = "Distribusi Variabel Target",
           x = target_col,
           y = "Jumlah") +
      theme_minimal()
  )
  if (length(unique(target_data)) > 1) {
    target_counts <- table(target_data)
    # Ratio in (0, 1]; 1 means perfectly balanced classes.
    imbalance_ratio <- min(target_counts) / max(target_counts)
    cat("Rasio Imbalance Target:", round(imbalance_ratio, 2), "\n")
    # 0.4 is the script's own cut-off for calling the target imbalanced.
    if (imbalance_ratio < 0.4) {
      cat("PERINGATAN: Variabel target terindikasi imbalance.\n")
    } else {
      cat("Variabel target dalam kondisi seimbang.\n")
    }
  } else {
    cat("Variabel target hanya memiliki satu kelas.\n")
  }
} else {
  cat("Kolom target tidak ditemukan dalam data.\n")
}
## Distribusi variabel target:
## target_data
## No Yes
## 8000 2000
## target_data
## No Yes
## 0.8 0.2
## Rasio Imbalance Target: 0.25
## PERINGATAN: Variabel target terindikasi imbalance.
# Load library
library(ggplot2)
library(dplyr)
library(scales)
# Frequency table of the target: count, proportion and formatted percentage.
target_sym <- sym(target_col)
target_df <- data %>%
  filter(!is.na(!!target_sym)) %>%
  group_by(!!target_sym) %>%
  summarise(Jumlah = n()) %>%
  mutate(
    Proporsi = Jumlah / sum(Jumlah),
    Persentase = percent(Proporsi)
  )
# Bar chart of the class counts, with the count printed above each bar.
ggplot(target_df, aes(x = !!target_sym, y = Jumlah, fill = !!target_sym)) +
  geom_col(width = 0.6) +
  geom_text(aes(label = Jumlah), vjust = -0.5) +
  scale_fill_manual(values = c("#4CAF50", "#F44336")) +
  labs(
    title = "Distribusi Frekuensi Variabel Target",
    x = "Status Penyakit Jantung",
    y = "Jumlah"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
# Pie chart of the class proportions (a stacked column in polar coordinates).
ggplot(target_df, aes(x = "", y = Proporsi, fill = !!target_sym)) +
  geom_col(width = 1) +
  geom_text(aes(label = Persentase), position = position_stack(vjust = 0.5)) +
  coord_polar(theta = "y") +
  scale_fill_manual(values = c("#4CAF50", "#F44336")) +
  labs(title = "Distribusi Proporsi Variabel Target") +
  theme_void()
# Convert the target to a factor so glm() treats it as a two-class response.
data$Heart.Disease.Status <- as.factor(data$Heart.Disease.Status)
# Logistic regression on a hand-picked subset of predictors. It is fitted on
# the data as it stands at this point, i.e. before the cleaning steps below.
model_full <- glm(Heart.Disease.Status ~ Age + Blood.Pressure + Cholesterol.Level + BMI +
CRP.Level + Homocysteine.Level + Gender + Diabetes + Smoking,
data = data, family = binomial)
# Model summary, used for the partial (per-coefficient) tests
summary(model_full)
##
## Call:
## glm(formula = Heart.Disease.Status ~ Age + Blood.Pressure + Cholesterol.Level +
## BMI + CRP.Level + Homocysteine.Level + Gender + Diabetes +
## Smoking, family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.648e+00 1.252e+00 -2.115 0.0344 *
## Age -1.339e-03 1.384e-03 -0.968 0.3332
## Blood.Pressure -1.973e-03 1.435e-03 -1.375 0.1691
## Cholesterol.Level 6.136e-05 5.782e-04 0.106 0.9155
## BMI 8.026e-03 4.002e-03 2.005 0.0449 *
## CRP.Level -3.452e-03 5.803e-03 -0.595 0.5519
## Homocysteine.Level 4.566e-03 5.824e-03 0.784 0.4330
## GenderFemale 1.572e+00 1.028e+00 1.529 0.1263
## GenderMale 1.492e+00 1.028e+00 1.451 0.1467
## DiabetesNo 1.446e-02 4.583e-01 0.032 0.9748
## DiabetesYes 1.557e-03 4.583e-01 0.003 0.9973
## SmokingNo -2.030e-01 4.702e-01 -0.432 0.6659
## SmokingYes -1.831e-01 4.701e-01 -0.390 0.6969
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9873.6 on 9853 degrees of freedom
## Residual deviance: 9859.3 on 9841 degrees of freedom
## (146 observations deleted due to missingness)
## AIC: 9885.3
##
## Number of Fisher Scoring iterations: 5
# Data cleaning
# Handling missing values
cat("\n>> Handling Missing Values...\n")
##
## >> Handling Missing Values...
# Flag numeric and categorical (character) columns; both are logical vectors
# with one entry per column of `data`.
num_cols <- sapply(data, is.numeric)
cat_cols <- sapply(data, is.character)
# Replace NA in every numeric column with that column's median.
data[num_cols] <- lapply(data[num_cols], function(column) {
  replace(column, is.na(column), median(column, na.rm = TRUE))
})
# Most frequent non-NA value of a vector (statistical mode).
# Ties are resolved in favour of the value that appears first in `v`.
# Returns NA when `v` has no non-NA values.
get_mode <- function(v) {
  candidates <- unique(v[!is.na(v)])
  # Position of each element among the candidates; NA elements map to NA and
  # are ignored by tabulate().
  positions <- match(v, candidates)
  frequencies <- tabulate(positions)
  candidates[which.max(frequencies)]
}
# Replace NA in every categorical (character) column with the column's mode.
data[cat_cols] <- lapply(data[cat_cols], function(column) {
  replace(column, is.na(column), get_mode(column))
})
cat("Jumlah missing values setelah penanganan:\n")
## Jumlah missing values setelah penanganan:
# Re-count the NA entries per column to confirm the imputation.
print(sapply(data, function(column) sum(is.na(column))))
## Age Gender Blood.Pressure
## 0 0 0
## Cholesterol.Level Exercise.Habits Smoking
## 0 0 0
## Family.Heart.Disease Diabetes BMI
## 0 0 0
## High.Blood.Pressure Low.HDL.Cholesterol High.LDL.Cholesterol
## 0 0 0
## Alcohol.Consumption Stress.Level Sleep.Hours
## 0 0 0
## Sugar.Consumption Triglyceride.Level Fasting.Blood.Sugar
## 0 0 0
## CRP.Level Homocysteine.Level Heart.Disease.Status
## 0 0 0
# Remove duplicated rows and report how many were dropped.
cat("\n>> Menghapus duplikasi...\n")
##
## >> Menghapus duplikasi...
before_rows <- nrow(data)
data <- data %>% distinct()
after_rows <- nrow(data)
cat("Jumlah baris duplikat yang dihapus:", before_rows - after_rows, "\n")
## Jumlah baris duplikat yang dihapus: 0
# Handling Outlier
# Progress banner for the IQR-based outlier handling step.
cat("\n>> Penanganan Outlier dengan IQR...\n")
##
## >> Penanganan Outlier dengan IQR...
#' Remove rows that fall outside the IQR fences of the given columns
#'
#' Columns are processed one after another: for each column the quartiles are
#' computed on the rows that are still present, and rows outside
#' `[Q1 - k * IQR, Q3 + k * IQR]` are dropped. Rows whose value in the column
#' is NA are dropped as well. The number of rows removed per column is printed.
#'
#' @param df A data frame.
#' @param cols Character vector with the names of numeric columns to check.
#' @param k Fence multiplier applied to the interquartile range (default 1.5).
#' @return `df` without the rows flagged as outliers; row names are renumbered
#'   whenever a column has been processed.
remove_outliers <- function(df, cols, k = 1.5) {
  stopifnot(is.data.frame(df), all(cols %in% names(df)))
  for (col in cols) {
    values <- df[[col]]
    quartiles <- quantile(values, c(0.25, 0.75), na.rm = TRUE, names = FALSE)
    # Named `spread`, not `IQR`, so stats::IQR() is not shadowed.
    spread <- quartiles[2] - quartiles[1]
    lower <- quartiles[1] - k * spread
    upper <- quartiles[2] + k * spread
    in_range <- values >= lower & values <= upper
    # NA comparisons count as "out of range"; indexing with NA would otherwise
    # insert all-NA rows.
    keep <- !is.na(in_range) & in_range
    before <- nrow(df)
    df <- df[keep, , drop = FALSE]
    rownames(df) <- NULL
    after <- nrow(df)
    cat("Kolom:", col, "- Data dihapus:", before - after, "\n")
  }
  df
}
# Apply the IQR filter to every column flagged as numeric in `num_cols`.
data <- remove_outliers(data, names(data)[num_cols])
## Kolom: Age - Data dihapus: 0
## Kolom: Blood.Pressure - Data dihapus: 0
## Kolom: Cholesterol.Level - Data dihapus: 0
## Kolom: BMI - Data dihapus: 0
## Kolom: Sleep.Hours - Data dihapus: 0
## Kolom: Triglyceride.Level - Data dihapus: 0
## Kolom: Fasting.Blood.Sugar - Data dihapus: 0
## Kolom: CRP.Level - Data dihapus: 0
## Kolom: Homocysteine.Level - Data dihapus: 0
# Report the shape of the data after cleaning and print per-column summaries.
cat("\n>> Ringkasan Data Setelah Cleaning:\n")
##
## >> Ringkasan Data Setelah Cleaning:
cat("Jumlah baris akhir:", nrow(data), "\n")
## Jumlah baris akhir: 10000
cat("Jumlah kolom:", ncol(data), "\n")
## Jumlah kolom: 21
summary(data)
## Age Gender Blood.Pressure Cholesterol.Level
## Min. :18.0 Length:10000 Min. :120.0 Min. :150.0
## 1st Qu.:34.0 Class :character 1st Qu.:134.0 1st Qu.:187.0
## Median :49.0 Mode :character Median :150.0 Median :226.0
## Mean :49.3 Mean :149.8 Mean :225.4
## 3rd Qu.:65.0 3rd Qu.:165.0 3rd Qu.:263.0
## Max. :80.0 Max. :180.0 Max. :300.0
## Exercise.Habits Smoking Family.Heart.Disease Diabetes
## Length:10000 Length:10000 Length:10000 Length:10000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## BMI High.Blood.Pressure Low.HDL.Cholesterol High.LDL.Cholesterol
## Min. :18.00 Length:10000 Length:10000 Length:10000
## 1st Qu.:23.67 Class :character Class :character Class :character
## Median :29.08 Mode :character Mode :character Mode :character
## Mean :29.08
## 3rd Qu.:34.51
## Max. :40.00
## Alcohol.Consumption Stress.Level Sleep.Hours Sugar.Consumption
## Length:10000 Length:10000 Min. : 4.001 Length:10000
## Class :character Class :character 1st Qu.: 5.455 Class :character
## Mode :character Mode :character Median : 7.003 Mode :character
## Mean : 6.991
## 3rd Qu.: 8.528
## Max. :10.000
## Triglyceride.Level Fasting.Blood.Sugar CRP.Level Homocysteine.Level
## Min. :100.0 Min. : 80.0 Min. : 0.003647 Min. : 5.00
## 1st Qu.:176.0 1st Qu.: 99.0 1st Qu.: 3.681800 1st Qu.: 8.73
## Median :250.0 Median :120.0 Median : 7.472164 Median :12.41
## Mean :250.7 Mean :120.1 Mean : 7.472200 Mean :12.46
## 3rd Qu.:326.0 3rd Qu.:141.0 3rd Qu.:11.244879 3rd Qu.:16.13
## Max. :400.0 Max. :160.0 Max. :14.997087 Max. :20.00
## Heart.Disease.Status
## No :8000
## Yes:2000
##
##
##
##
# Data transformation
cat("\n>> Data Transformation: Normalisasi dan Standarisasi\n")
##
## >> Data Transformation: Normalisasi dan Standarisasi
# Select the numeric columns
numeric_data <- data[, num_cols]
# Min-max normalisation: rescale every column to the [0, 1] interval.
normalized_data <- as.data.frame(lapply(numeric_data, function(column) {
  limits <- range(column)
  (column - limits[1]) / (limits[2] - limits[1])
}))
# Standardisation (z-scores): centre each column on 0 with unit variance.
standardized_data <- as.data.frame(scale(numeric_data))
# Summary of the min-max normalised columns.
cat("\nRingkasan data setelah normalisasi:\n")
##
## Ringkasan data setelah normalisasi:
print(summary(normalized_data))
## Age Blood.Pressure Cholesterol.Level BMI
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2581 1st Qu.:0.2333 1st Qu.:0.2467 1st Qu.:0.2576
## Median :0.5000 Median :0.5000 Median :0.5067 Median :0.5036
## Mean :0.5048 Mean :0.4960 Mean :0.5028 Mean :0.5035
## 3rd Qu.:0.7581 3rd Qu.:0.7500 3rd Qu.:0.7533 3rd Qu.:0.7505
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Sleep.Hours Triglyceride.Level Fasting.Blood.Sugar CRP.Level
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2425 1st Qu.:0.2533 1st Qu.:0.2375 1st Qu.:0.2453
## Median :0.5005 Median :0.5000 Median :0.5000 Median :0.4981
## Mean :0.4985 Mean :0.5024 Mean :0.5018 Mean :0.4981
## 3rd Qu.:0.7546 3rd Qu.:0.7533 3rd Qu.:0.7625 3rd Qu.:0.7497
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Homocysteine.Level
## Min. :0.0000
## 1st Qu.:0.2487
## Median :0.4940
## Mean :0.4971
## 3rd Qu.:0.7421
## Max. :1.0000
# Summary of the standardised (z-score) columns.
cat("\nRingkasan data setelah standarisasi:\n")
##
## Ringkasan data setelah standarisasi:
print(summary(standardized_data))
## Age Blood.Pressure Cholesterol.Level BMI
## Min. :-1.72260 Min. :-1.69502 Min. :-1.73355 Min. :-1.757804
## 1st Qu.:-0.84191 1st Qu.:-0.89758 1st Qu.:-0.88318 1st Qu.:-0.858453
## Median :-0.01626 Median : 0.01377 Median : 0.01316 Median : 0.000352
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.86443 3rd Qu.: 0.86817 3rd Qu.: 0.86353 3rd Qu.: 0.862159
## Max. : 1.69008 Max. : 1.72256 Max. : 1.71391 Max. : 1.733240
## Sleep.Hours Triglyceride.Level Fasting.Blood.Sugar
## Min. :-1.708024 Min. :-1.733475 Min. :-1.703957
## 1st Qu.:-0.877252 1st Qu.:-0.859449 1st Qu.:-0.897438
## Median : 0.006792 Median :-0.008424 Median :-0.006023
## Mean : 0.000000 Mean : 0.000000 Mean : 0.000000
## 3rd Qu.: 0.877543 3rd Qu.: 0.865602 3rd Qu.: 0.885391
## Max. : 1.718212 Max. : 1.716627 Max. : 1.691910
## CRP.Level Homocysteine.Level
## Min. :-1.7230084 Min. :-1.72627
## 1st Qu.:-0.8744519 1st Qu.:-0.86277
## Median :-0.0000083 Median :-0.01083
## Mean : 0.0000000 Mean : 0.00000
## 3rd Qu.: 0.8703635 3rd Qu.: 0.85082
## Max. : 1.7360044 Max. : 1.74640
# Data reduction - PCA
cat("\n>> PCA - Principal Component Analysis\n")
##
## >> PCA - Principal Component Analysis
# PCA on the standardised numeric columns. `standardized_data` has already
# been through scale(), so centering and scaling again here is redundant.
pca_result <- prcomp(standardized_data, center = TRUE, scale. = TRUE)
# Standard deviation and (cumulative) proportion of variance per component.
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## Standard deviation 1.0222 1.0198 1.0148 1.0009 0.9953 0.9940 0.9885 0.9852
## Proportion of Variance 0.1161 0.1156 0.1144 0.1113 0.1101 0.1098 0.1086 0.1078
## Cumulative Proportion 0.1161 0.2317 0.3461 0.4574 0.5675 0.6772 0.7858 0.8936
## PC9
## Standard deviation 0.9784
## Proportion of Variance 0.1064
## Cumulative Proportion 1.0000
# Scree plot visualisation (to see which components matter)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_eig(pca_result, addlabels = TRUE, ylim = c(0, 50))
# Loadings: contribution of every original variable to each component.
print(pca_result$rotation)
## PC1 PC2 PC3 PC4 PC5
## Age -0.36848805 -0.04212263 0.49486012 0.11104379 0.5479900
## Blood.Pressure 0.32564120 -0.13273467 -0.49862244 0.34123989 0.4106359
## Cholesterol.Level -0.42564014 -0.45725026 0.07551057 0.10243494 -0.2703604
## BMI -0.24256154 -0.52460491 -0.03388758 -0.23603483 0.3427929
## Sleep.Hours -0.39499448 0.08754526 -0.27206899 0.54699901 -0.2224816
## Triglyceride.Level -0.04960765 -0.10847759 -0.46554120 -0.39431939 0.3041300
## Fasting.Blood.Sugar -0.38301274 0.24437332 -0.27643371 -0.54243643 -0.2303277
## CRP.Level -0.09142637 0.59155766 0.16332648 -0.07351989 0.3172713
## Homocysteine.Level 0.45236008 -0.25928506 0.32526549 -0.22512463 -0.2116088
## PC6 PC7 PC8 PC9
## Age 0.14768000 0.02501341 -0.35387527 0.39657177
## Blood.Pressure -0.30740257 0.01635034 0.13295960 0.47979840
## Cholesterol.Level 0.12149845 0.18792062 0.61679938 0.29990939
## BMI -0.45224200 -0.31928486 0.03228907 -0.42904267
## Sleep.Hours 0.12406416 -0.60198509 -0.18544699 -0.03767127
## Triglyceride.Level 0.71259703 -0.10794862 0.03876290 -0.01049958
## Fasting.Blood.Sugar -0.36048071 -0.03814620 -0.19520375 0.44763068
## CRP.Level -0.06899174 -0.31917313 0.63345082 -0.03267511
## Homocysteine.Level 0.08072754 -0.62006810 0.01877203 0.36814510
# Choose the smallest number of leading principal components whose cumulative
# share of the variance reaches 90%.
explained_var <- cumsum(pca_result$sdev^2 / sum(pca_result$sdev^2))
num_components <- which(explained_var >= 0.9)[1]
# drop = FALSE keeps a matrix (and the "PC1" column name) even when a single
# component is enough; `[, 1:1]` would collapse the scores to a bare vector.
pca_data <- as.data.frame(
  pca_result$x[, seq_len(num_components), drop = FALSE]
)
cat("Jumlah komponen PCA yang dipilih (>=90% variansi):", num_components, "\n")
## Jumlah komponen PCA yang dipilih (>=90% variansi): 9
# Side-by-side boxplots of the selected numeric columns.
data %>%
  dplyr::select(all_of(c(
    "Age", "Blood.Pressure", "BMI", "Cholesterol.Level", "CRP.Level",
    "Fasting.Blood.Sugar", "Homocysteine.Level", "Sleep.Hours",
    "Triglyceride.Level"
  ))) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Boxplot untuk Kolom Numerik Tertentu", x = "Variabel", y = "Nilai") +
  theme_minimal() +
  # Rotate the variable names so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# Note: the `echo = FALSE` parameter was added to the code chunk to prevent
# printing of the R code that generated the plot.
#Data Reduction
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(dplyr)
# Correlation matrix of the numeric columns, drawn as a colour-coded
# upper-triangle plot.
numerik_data <- dplyr::select(
  data,
  Age, Blood.Pressure, BMI, Cholesterol.Level, CRP.Level,
  Fasting.Blood.Sugar, Homocysteine.Level, Sleep.Hours, Triglyceride.Level
)
cor_matrix <- cor(numerik_data, use = "complete.obs")
corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  col = colorRampPalette(c("red", "white", "blue"))(200),
  addCoef.col = "black", # print the correlation coefficients
  number.cex = 0.7,
  tl.col = "black",
  tl.cex = 0.8,
  title = "Korelasi Antar Variabel Numerik",
  mar = c(0, 0, 1, 0)
)
#Modelling
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.4.3
## Loaded ROSE 0.0-4
set.seed(123)
# Keep the leading components that together explain at least 90% of variance.
n_components <- which(cumsum(pca_result$sdev^2 / sum(pca_result$sdev^2)) >= 0.9)[1]
# drop = FALSE keeps a matrix (and the "PC1" column name) when only one
# component is selected; `[, 1:1]` would collapse the scores to a vector and
# break the `target ~ .` models below.
pca_data <- as.data.frame(pca_result$x[, seq_len(n_components), drop = FALSE])
pca_data$target <- as.factor(data$Heart.Disease.Status)
# NOTE(review): `pca_result` was fitted on all rows before this split, so the
# test rows influenced the components (information leakage).
# Stratified 80/20 train/test split on the target.
train_index <- createDataPartition(pca_data$target, p = 0.8, list = FALSE)
train_pca <- pca_data[train_index, ]
test_pca <- pca_data[-train_index, ]
# Balance the classes of the training set only, with ROSE.
train_pca_bal <- ROSE(target ~ ., data = train_pca, seed = 123)$data
# Align the factor levels of the test target with the balanced training data.
test_pca$target <- factor(test_pca$target, levels = levels(train_pca_bal$target))
# Linear discriminant analysis on the ROSE-balanced PCA scores.
cat("\n>> LDA dengan PCA + ROSE\n")
##
## >> LDA dengan PCA + ROSE
lda_model_pca <- lda(target ~ ., data = train_pca_bal)
lda_pred_pca <- predict(lda_model_pca, newdata = test_pca)$class
# confusionMatrix() treats the first factor level ("No") as the positive class
# by default, which makes Sensitivity and PPV describe the healthy patients.
# Report the metrics for the disease class instead.
conf_lda_pca <- confusionMatrix(lda_pred_pca, test_pca$target, positive = "Yes")
print(conf_lda_pca)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 936 248
## Yes 664 152
##
## Accuracy : 0.544
## 95% CI : (0.5219, 0.566)
## No Information Rate : 0.8
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0252
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5850
## Specificity : 0.3800
## Pos Pred Value : 0.7905
## Neg Pred Value : 0.1863
## Prevalence : 0.8000
## Detection Rate : 0.4680
## Detection Prevalence : 0.5920
## Balanced Accuracy : 0.4825
##
## 'Positive' Class : No
##
# Logistic regression on the ROSE-balanced PCA scores.
cat("\n>> Logistic Regression dengan PCA + ROSE\n")
##
## >> Logistic Regression dengan PCA + ROSE
log_model_pca <- glm(target ~ ., data = train_pca_bal, family = "binomial")
# type = "response" returns the predicted probability of the second level.
log_pred_pca <- predict(log_model_pca, newdata = test_pca, type = "response")
log_pred_class <- ifelse(log_pred_pca > 0.5, "Yes", "No")
log_pred_class <- factor(log_pred_class, levels = levels(test_pca$target))
# confusionMatrix() treats the first factor level ("No") as the positive class
# by default, which makes Sensitivity and PPV describe the healthy patients.
# Report the metrics for the disease class instead.
conf_log_pca <- confusionMatrix(log_pred_class, test_pca$target, positive = "Yes")
print(conf_log_pca)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 936 248
## Yes 664 152
##
## Accuracy : 0.544
## 95% CI : (0.5219, 0.566)
## No Information Rate : 0.8
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0252
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5850
## Specificity : 0.3800
## Pos Pred Value : 0.7905
## Neg Pred Value : 0.1863
## Prevalence : 0.8000
## Detection Rate : 0.4680
## Detection Prevalence : 0.5920
## Balanced Accuracy : 0.4825
##
## 'Positive' Class : No
##
library(caret)
library(ROSE)
# Leakage-free variant of the pipeline: split first, then learn scaling and
# PCA from the training rows only and apply them to the test rows.
set.seed(123)
train_index <- createDataPartition(data$Heart.Disease.Status, p = 0.8, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]
# vapply() guarantees a logical vector with one entry per column.
num_cols <- vapply(train_data, is.numeric, logical(1))
# drop = FALSE keeps a data frame even if only one numeric column exists.
train_num <- train_data[, num_cols, drop = FALSE]
test_num <- test_data[, num_cols, drop = FALSE]
train_num_scaled <- scale(train_num)
# Scale the test rows with the training means and standard deviations.
test_num_scaled <- scale(
  test_num,
  center = attr(train_num_scaled, "scaled:center"),
  scale = attr(train_num_scaled, "scaled:scale")
)
pca_res <- prcomp(train_num_scaled, center = TRUE, scale. = TRUE)
var_exp <- cumsum(pca_res$sdev^2) / sum(pca_res$sdev^2)
# Smallest number of components explaining at least 90% of the variance.
n_comp <- which(var_exp >= 0.9)[1]
# drop = FALSE keeps the matrix shape and the "PC1" column name when a single
# component is selected.
train_pca <- as.data.frame(pca_res$x[, seq_len(n_comp), drop = FALSE])
train_pca$Heart.Disease.Status <- train_data$Heart.Disease.Status
# Project the test rows onto the components learned from the training rows.
test_pca_mat <- predict(pca_res, newdata = test_num_scaled)
test_pca <- as.data.frame(test_pca_mat[, seq_len(n_comp), drop = FALSE])
test_pca$Heart.Disease.Status <- test_data$Heart.Disease.Status
# Balance the classes of the training set only, with ROSE.
train_pca_bal <- ROSE(Heart.Disease.Status ~ ., data = train_pca, seed = 123)$data
target_levels <- levels(train_pca_bal$Heart.Disease.Status)
test_pca$Heart.Disease.Status <- factor(test_pca$Heart.Disease.Status, levels = target_levels)
log_model <- glm(Heart.Disease.Status ~ ., data = train_pca_bal, family = binomial)
# Predicted probability of the second factor level.
pred_prob <- predict(log_model, newdata = test_pca, type = "response")
pred_class <- ifelse(pred_prob > 0.5, target_levels[2], target_levels[1])
pred_class <- factor(pred_class, levels = levels(test_pca$Heart.Disease.Status))
# Use the modelled (second) level as the positive class; confusionMatrix()
# would otherwise default to the first level and report Sensitivity and PPV
# for the other class.
conf_mat <- confusionMatrix(
  pred_class,
  test_pca$Heart.Disease.Status,
  positive = target_levels[2]
)
print(conf_mat)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 936 247
## Yes 664 153
##
## Accuracy : 0.5445
## 95% CI : (0.5224, 0.5665)
## No Information Rate : 0.8
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0234
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5850
## Specificity : 0.3825
## Pos Pred Value : 0.7912
## Neg Pred Value : 0.1873
## Prevalence : 0.8000
## Detection Rate : 0.4680
## Detection Prevalence : 0.5915
## Balanced Accuracy : 0.4838
##
## 'Positive' Class : No
##