Load data

# Read the raw data set. Empty cells are mapped to NA as well as the literal
# "NA": the model summary further down reports a coefficient for both levels
# of Gender, Diabetes and Smoking, which indicates a third (blank) category
# that would otherwise never be counted or imputed as missing.
data <- read.csv(
  "C:/Users/ACER ASPIRE 5/Documents/heart_disease.csv",
  na.strings = c("", "NA")
)

#Summary Data

str(data)

## 'data.frame':    10000 obs. of  21 variables:
##  $ Age                 : num  56 69 46 32 60 25 78 38 56 75 ...
##  $ Gender              : chr  "Male" "Female" "Male" "Female" ...
##  $ Blood.Pressure      : num  153 146 126 122 166 152 121 161 135 144 ...
##  $ Cholesterol.Level   : num  155 286 216 293 242 257 175 187 291 252 ...
##  $ Exercise.Habits     : chr  "High" "High" "Low" "High" ...
##  $ Smoking             : chr  "Yes" "No" "No" "Yes" ...
##  $ Family.Heart.Disease: chr  "Yes" "Yes" "No" "Yes" ...
##  $ Diabetes            : chr  "No" "Yes" "No" "No" ...
##  $ BMI                 : num  25 25.2 29.9 24.1 20.5 ...
##  $ High.Blood.Pressure : chr  "Yes" "No" "No" "Yes" ...
##  $ Low.HDL.Cholesterol : chr  "Yes" "Yes" "Yes" "No" ...
##  $ High.LDL.Cholesterol: chr  "No" "No" "Yes" "Yes" ...
##  $ Alcohol.Consumption : chr  "High" "Medium" "Low" "Low" ...
##  $ Stress.Level        : chr  "Medium" "High" "Low" "High" ...
##  $ Sleep.Hours         : num  7.63 8.74 4.44 5.25 7.03 ...
##  $ Sugar.Consumption   : chr  "Medium" "Medium" "Low" "High" ...
##  $ Triglyceride.Level  : num  342 133 393 293 263 126 107 228 317 199 ...
##  $ Fasting.Blood.Sugar : num  NA 157 92 94 154 91 85 111 103 96 ...
##  $ CRP.Level           : num  12.97 9.36 12.71 12.51 10.38 ...
##  $ Homocysteine.Level  : num  12.39 19.3 11.23 5.96 8.15 ...
##  $ Heart.Disease.Status: chr  "No" "No" "No" "No" ...

summary(data)

##       Age          Gender          Blood.Pressure  Cholesterol.Level
##  Min.   :18.0   Length:10000       Min.   :120.0   Min.   :150.0    
##  1st Qu.:34.0   Class :character   1st Qu.:134.0   1st Qu.:187.0    
##  Median :49.0   Mode  :character   Median :150.0   Median :226.0    
##  Mean   :49.3                      Mean   :149.8   Mean   :225.4    
##  3rd Qu.:65.0                      3rd Qu.:165.0   3rd Qu.:263.0    
##  Max.   :80.0                      Max.   :180.0   Max.   :300.0    
##  NA's   :29                        NA's   :19      NA's   :30       
##  Exercise.Habits      Smoking          Family.Heart.Disease   Diabetes        
##  Length:10000       Length:10000       Length:10000         Length:10000      
##  Class :character   Class :character   Class :character     Class :character  
##  Mode  :character   Mode  :character   Mode  :character     Mode  :character  
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##       BMI        High.Blood.Pressure Low.HDL.Cholesterol High.LDL.Cholesterol
##  Min.   :18.00   Length:10000        Length:10000        Length:10000        
##  1st Qu.:23.66   Class :character    Class :character    Class :character    
##  Median :29.08   Mode  :character    Mode  :character    Mode  :character    
##  Mean   :29.08                                                               
##  3rd Qu.:34.52                                                               
##  Max.   :40.00                                                               
##  NA's   :22                                                                  
##  Alcohol.Consumption Stress.Level        Sleep.Hours     Sugar.Consumption 
##  Length:10000        Length:10000       Min.   : 4.001   Length:10000      
##  Class :character    Class :character   1st Qu.: 5.450   Class :character  
##  Mode  :character    Mode  :character   Median : 7.003   Mode  :character  
##                                         Mean   : 6.991                     
##                                         3rd Qu.: 8.532                     
##                                         Max.   :10.000                     
##                                         NA's   :25                         
##  Triglyceride.Level Fasting.Blood.Sugar   CRP.Level         Homocysteine.Level
##  Min.   :100.0      Min.   : 80.0       Min.   : 0.003647   Min.   : 5.000    
##  1st Qu.:176.0      1st Qu.: 99.0       1st Qu.: 3.674126   1st Qu.: 8.723    
##  Median :250.0      Median :120.0       Median : 7.472164   Median :12.409    
##  Mean   :250.7      Mean   :120.1       Mean   : 7.472201   Mean   :12.456    
##  3rd Qu.:326.0      3rd Qu.:141.0       3rd Qu.:11.255592   3rd Qu.:16.141    
##  Max.   :400.0      Max.   :160.0       Max.   :14.997087   Max.   :19.999    
##  NA's   :26         NA's   :22          NA's   :26          NA's   :20        
##  Heart.Disease.Status
##  Length:10000        
##  Class :character    
##  Mode  :character    
##                      
##                      
##                      
##

library(ggplot2)
library(tidyr)

# Numeric variables whose distributions are inspected below.
numeric_columns <- c(
  "Age", "Blood.Pressure", "Cholesterol.Level", "BMI", "CRP.Level",
  "Homocysteine.Level", "Triglyceride.Level", "Fasting.Blood.Sugar",
  "Sleep.Hours"
)
cleaned_numeric_data <- data[, numeric_columns]

# Stack every column into (Variabel, Nilai) pairs so that one facet per
# variable can be drawn from a single data frame.
long_data <- pivot_longer(
  cleaned_numeric_data,
  cols = everything(),
  names_to = "Variabel",
  values_to = "Nilai"
)

# One 30-bin histogram per variable; each panel gets its own axis ranges.
ggplot(long_data, aes(x = Nilai)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  facet_wrap(~ Variabel, ncol = 3, scales = "free") +
  labs(
    title = "Distribusi Histogram Variabel Numerik",
    x = "Nilai",
    y = "Frekuensi"
  ) +
  theme_minimal()

## Warning: Removed 219 rows containing non-finite outside the scale range
## (`stat_bin()`).

glimpse(data)

## Rows: 10,000
## Columns: 21
## $ Age                  <dbl> 56, 69, 46, 32, 60, 25, 78, 38, 56, 75, 36, 40, 2…
## $ Gender               <chr> "Male", "Female", "Male", "Female", "Male", "Male…
## $ Blood.Pressure       <dbl> 153, 146, 126, 122, 166, 152, 121, 161, 135, 144,…
## $ Cholesterol.Level    <dbl> 155, 286, 216, 293, 242, 257, 175, 187, 291, 252,…
## $ Exercise.Habits      <chr> "High", "High", "Low", "High", "Low", "Low", "Hig…
## $ Smoking              <chr> "Yes", "No", "No", "Yes", "Yes", "Yes", "Yes", "Y…
## $ Family.Heart.Disease <chr> "Yes", "Yes", "No", "Yes", "Yes", "No", "Yes", "Y…
## $ Diabetes             <chr> "No", "Yes", "No", "No", "Yes", "No", "Yes", "Yes…
## $ BMI                  <dbl> 24.99159, 25.22180, 29.85545, 24.13048, 20.48629,…
## $ High.Blood.Pressure  <chr> "Yes", "No", "No", "Yes", "Yes", "No", "No", "No"…
## $ Low.HDL.Cholesterol  <chr> "Yes", "Yes", "Yes", "No", "No", "No", "Yes", "No…
## $ High.LDL.Cholesterol <chr> "No", "No", "Yes", "Yes", "No", "No", "No", "No",…
## $ Alcohol.Consumption  <chr> "High", "Medium", "Low", "Low", "Low", "Low", "Me…
## $ Stress.Level         <chr> "Medium", "High", "Low", "High", "High", "Medium"…
## $ Sleep.Hours          <dbl> 7.633228, 8.744034, 4.440440, 5.249405, 7.030971,…
## $ Sugar.Consumption    <chr> "Medium", "Medium", "Low", "High", "High", "Low",…
## $ Triglyceride.Level   <dbl> 342, 133, 393, 293, 263, 126, 107, 228, 317, 199,…
## $ Fasting.Blood.Sugar  <dbl> NA, 157, 92, 94, 154, 91, 85, 111, 103, 96, NA, 1…
## $ CRP.Level            <dbl> 12.96924569, 9.35538940, 12.70987253, 12.50904619…
## $ Homocysteine.Level   <dbl> 12.387250, 19.298875, 11.230926, 5.961958, 8.1538…
## $ Heart.Disease.Status <chr> "No", "No", "No", "No", "No", "No", "No", "No", "…

#Cek Missing Values

cat("Jumlah missing values per kolom:\n")

## Jumlah missing values per kolom:

sapply(data, function(x) sum(is.na(x)))

##                  Age               Gender       Blood.Pressure 
##                   29                    0                   19 
##    Cholesterol.Level      Exercise.Habits              Smoking 
##                   30                    0                    0 
## Family.Heart.Disease             Diabetes                  BMI 
##                    0                    0                   22 
##  High.Blood.Pressure  Low.HDL.Cholesterol High.LDL.Cholesterol 
##                    0                    0                    0 
##  Alcohol.Consumption         Stress.Level          Sleep.Hours 
##                    0                    0                   25 
##    Sugar.Consumption   Triglyceride.Level  Fasting.Blood.Sugar 
##                    0                   26                   22 
##            CRP.Level   Homocysteine.Level Heart.Disease.Status 
##                   26                   20                    0

#Analisis variabel target

# Name of the outcome column analysed in this section.
target_col <- "Heart.Disease.Status"

if (target_col %in% names(data)) {
  # Class labels with missing entries removed.
  target_data <- data[[target_col]]
  target_data <- target_data[!is.na(target_data)]
  target_counts <- table(target_data)

  cat("Distribusi variabel target:\n")
  print(target_counts)
  print(prop.table(target_counts))

  # Bar chart of the target distribution. A ggplot object is only drawn when
  # it is printed; inside this `if` body it is not the last expression, so it
  # must be printed explicitly or the plot is silently discarded.
  print(
    ggplot(data, aes(x = factor(.data[[target_col]]))) +
      geom_bar(fill = "steelblue") +
      labs(title = "Distribusi Variabel Target",
           x = target_col,
           y = "Jumlah") +
      theme_minimal()
  )

  if (length(unique(target_data)) > 1) {
    # Ratio of the rarest class to the most frequent class (1 = balanced).
    imbalance_ratio <- min(target_counts) / max(target_counts)

    cat("Rasio Imbalance Target:", round(imbalance_ratio, 2), "\n")
    if (imbalance_ratio < 0.4) {
      cat("PERINGATAN: Variabel target terindikasi imbalance.\n")
    } else {
      cat("Variabel target dalam kondisi seimbang.\n")
    }
  } else {
    cat("Variabel target hanya memiliki satu kelas.\n")
  }
} else {
  cat("Kolom target tidak ditemukan dalam data.\n")
}

## Distribusi variabel target:
## target_data
##   No  Yes 
## 8000 2000 
## target_data
##  No Yes 
## 0.8 0.2 
## Rasio Imbalance Target: 0.25 
## PERINGATAN: Variabel target terindikasi imbalance.

# Load library
library(ggplot2)
library(dplyr)
library(scales)

# Symbol for the target column, reused in the summary and in both plots.
target_sym <- sym(target_col)

# Per-class count, proportion and formatted percentage of the target column,
# computed on rows where the target is not missing.
target_df <- data %>%
  filter(!is.na(.data[[target_col]])) %>%
  group_by(!!target_sym) %>%
  summarise(Jumlah = n()) %>%
  mutate(Proporsi = Jumlah / sum(Jumlah)) %>%
  mutate(Persentase = percent(Proporsi))

# Fill colours, assigned to the classes in order.
status_colors <- c("#4CAF50", "#F44336")

# Bar chart of class frequencies with the count printed above each bar.
ggplot(target_df, aes(x = !!target_sym, y = Jumlah, fill = !!target_sym)) +
  geom_col(width = 0.6) +
  geom_text(aes(label = Jumlah), vjust = -0.5) +
  labs(
    title = "Distribusi Frekuensi Variabel Target",
    x = "Status Penyakit Jantung",
    y = "Jumlah"
  ) +
  scale_fill_manual(values = status_colors) +
  theme_minimal() +
  theme(legend.position = "none")

# Pie chart of class proportions, labelled with the percentage per slice.
ggplot(target_df, aes(x = "", y = Proporsi, fill = !!target_sym)) +
  geom_col(width = 1) +
  geom_text(aes(label = Persentase), position = position_stack(vjust = 0.5)) +
  coord_polar(theta = "y") +
  labs(title = "Distribusi Proporsi Variabel Target") +
  scale_fill_manual(values = status_colors) +
  theme_void()

# Encode the outcome as a factor so it can be used as a binomial response.
data[["Heart.Disease.Status"]] <- as.factor(data[["Heart.Disease.Status"]])

# Logistic regression of heart-disease status on six numeric and three
# categorical predictors.
model_full <- glm(
  Heart.Disease.Status ~
    Age + Blood.Pressure + Cholesterol.Level + BMI +
    CRP.Level + Homocysteine.Level +
    Gender + Diabetes + Smoking,
  family = binomial,
  data = data
)

# Model summary: per-coefficient Wald z-tests (partial tests).
summary(model_full)

## 
## Call:
## glm(formula = Heart.Disease.Status ~ Age + Blood.Pressure + Cholesterol.Level + 
##     BMI + CRP.Level + Homocysteine.Level + Gender + Diabetes + 
##     Smoking, family = binomial, data = data)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)  
## (Intercept)        -2.648e+00  1.252e+00  -2.115   0.0344 *
## Age                -1.339e-03  1.384e-03  -0.968   0.3332  
## Blood.Pressure     -1.973e-03  1.435e-03  -1.375   0.1691  
## Cholesterol.Level   6.136e-05  5.782e-04   0.106   0.9155  
## BMI                 8.026e-03  4.002e-03   2.005   0.0449 *
## CRP.Level          -3.452e-03  5.803e-03  -0.595   0.5519  
## Homocysteine.Level  4.566e-03  5.824e-03   0.784   0.4330  
## GenderFemale        1.572e+00  1.028e+00   1.529   0.1263  
## GenderMale          1.492e+00  1.028e+00   1.451   0.1467  
## DiabetesNo          1.446e-02  4.583e-01   0.032   0.9748  
## DiabetesYes         1.557e-03  4.583e-01   0.003   0.9973  
## SmokingNo          -2.030e-01  4.702e-01  -0.432   0.6659  
## SmokingYes         -1.831e-01  4.701e-01  -0.390   0.6969  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9873.6  on 9853  degrees of freedom
## Residual deviance: 9859.3  on 9841  degrees of freedom
##   (146 observations deleted due to missingness)
## AIC: 9885.3
## 
## Number of Fisher Scoring iterations: 5

#Data Cleaning

#Handling Missing Values

cat("\n>> Handling Missing Values...\n")

## 
## >> Handling Missing Values...

# Identifikasi kolom numerik dan kategorik
# Flag numeric and character columns. vapply() guarantees a named logical
# vector regardless of the input, unlike sapply().
num_cols <- vapply(data, is.numeric, logical(1))
cat_cols <- vapply(data, is.character, logical(1))

# Replace missing values in every numeric column with that column's median.
data[num_cols] <- lapply(data[num_cols], function(x) {
  x[is.na(x)] <- median(x, na.rm = TRUE)
  x
})

#
# Return the most frequent non-missing value of a vector.
#
# v: an atomic vector or factor; NA entries are ignored.
# Returns a length-one value of the same type as `v`. Ties are resolved in
# favour of the value that appears first. When `v` has no non-missing
# entries, an NA of the same type is returned so that callers doing
# `x[is.na(x)] <- get_mode(x)` do not fail on a zero-length replacement.
get_mode <- function(v) {
  observed <- v[!is.na(v)]
  if (length(observed) == 0L) {
    return(v[NA_integer_])
  }
  uniqv <- unique(observed)
  # Count occurrences of each distinct value, in order of first appearance.
  counts <- tabulate(match(observed, uniqv), nbins = length(uniqv))
  uniqv[which.max(counts)]
}

# Fill missing entries of every character column with that column's most
# frequent value.
data[cat_cols] <- lapply(data[cat_cols], function(column) {
  replace(column, is.na(column), get_mode(column))
})

cat("Jumlah missing values setelah penanganan:\n")

## Jumlah missing values setelah penanganan:

print(sapply(data, function(x) sum(is.na(x))))

##                  Age               Gender       Blood.Pressure 
##                    0                    0                    0 
##    Cholesterol.Level      Exercise.Habits              Smoking 
##                    0                    0                    0 
## Family.Heart.Disease             Diabetes                  BMI 
##                    0                    0                    0 
##  High.Blood.Pressure  Low.HDL.Cholesterol High.LDL.Cholesterol 
##                    0                    0                    0 
##  Alcohol.Consumption         Stress.Level          Sleep.Hours 
##                    0                    0                    0 
##    Sugar.Consumption   Triglyceride.Level  Fasting.Blood.Sugar 
##                    0                    0                    0 
##            CRP.Level   Homocysteine.Level Heart.Disease.Status 
##                    0                    0                    0

# hapus duplikasi data
cat("\n>> Menghapus duplikasi...\n")

## 
## >> Menghapus duplikasi...

before_rows <- nrow(data)
data <- distinct(data)
after_rows <- nrow(data)
cat("Jumlah baris duplikat yang dihapus:", before_rows - after_rows, "\n")

## Jumlah baris duplikat yang dihapus: 0

Handling Outlier

cat("\n>> Penanganan Outlier dengan IQR...\n")

## 
## >> Penanganan Outlier dengan IQR...

# Drop rows whose value in any of the given columns lies outside the fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
#
# Columns are processed in the order given; the fences of each column are
# computed on the rows that survived the previous columns. Rows with NA in a
# processed column are dropped too. One progress line per column is written
# with cat().
#
# df:   data frame to filter.
# cols: character vector naming the numeric columns to check.
# Returns the filtered data frame with all of its columns.
remove_outliers <- function(df, cols) {
  missing_cols <- setdiff(cols, names(df))
  if (length(missing_cols) > 0) {
    stop(
      "Unknown column(s): ", paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  for (col in cols) {
    values <- df[[col]]
    quartiles <- quantile(values, c(0.25, 0.75), na.rm = TRUE, names = FALSE)
    # Named `spread` rather than `IQR` to avoid shadowing stats::IQR().
    spread <- quartiles[2] - quartiles[1]
    lower <- quartiles[1] - 1.5 * spread
    upper <- quartiles[2] + 1.5 * spread

    # NA values never satisfy the fences, so they are excluded explicitly.
    keep <- !is.na(values) & values >= lower & values <= upper

    before <- nrow(df)
    # drop = FALSE keeps a data frame even when `df` has a single column.
    df <- df[keep, , drop = FALSE]
    after <- nrow(df)
    cat("Kolom:", col, "- Data dihapus:", before - after, "\n")
  }
  df
}

data <- remove_outliers(data, names(data)[num_cols])

## Kolom: Age - Data dihapus: 0 
## Kolom: Blood.Pressure - Data dihapus: 0 
## Kolom: Cholesterol.Level - Data dihapus: 0 
## Kolom: BMI - Data dihapus: 0 
## Kolom: Sleep.Hours - Data dihapus: 0 
## Kolom: Triglyceride.Level - Data dihapus: 0 
## Kolom: Fasting.Blood.Sugar - Data dihapus: 0 
## Kolom: CRP.Level - Data dihapus: 0 
## Kolom: Homocysteine.Level - Data dihapus: 0

cat("\n>> Ringkasan Data Setelah Cleaning:\n")

## 
## >> Ringkasan Data Setelah Cleaning:

cat("Jumlah baris akhir:", nrow(data), "\n")

## Jumlah baris akhir: 10000

cat("Jumlah kolom:", ncol(data), "\n")

## Jumlah kolom: 21

summary(data)

##       Age          Gender          Blood.Pressure  Cholesterol.Level
##  Min.   :18.0   Length:10000       Min.   :120.0   Min.   :150.0    
##  1st Qu.:34.0   Class :character   1st Qu.:134.0   1st Qu.:187.0    
##  Median :49.0   Mode  :character   Median :150.0   Median :226.0    
##  Mean   :49.3                      Mean   :149.8   Mean   :225.4    
##  3rd Qu.:65.0                      3rd Qu.:165.0   3rd Qu.:263.0    
##  Max.   :80.0                      Max.   :180.0   Max.   :300.0    
##  Exercise.Habits      Smoking          Family.Heart.Disease   Diabetes        
##  Length:10000       Length:10000       Length:10000         Length:10000      
##  Class :character   Class :character   Class :character     Class :character  
##  Mode  :character   Mode  :character   Mode  :character     Mode  :character  
##                                                                               
##                                                                               
##                                                                               
##       BMI        High.Blood.Pressure Low.HDL.Cholesterol High.LDL.Cholesterol
##  Min.   :18.00   Length:10000        Length:10000        Length:10000        
##  1st Qu.:23.67   Class :character    Class :character    Class :character    
##  Median :29.08   Mode  :character    Mode  :character    Mode  :character    
##  Mean   :29.08                                                               
##  3rd Qu.:34.51                                                               
##  Max.   :40.00                                                               
##  Alcohol.Consumption Stress.Level        Sleep.Hours     Sugar.Consumption 
##  Length:10000        Length:10000       Min.   : 4.001   Length:10000      
##  Class :character    Class :character   1st Qu.: 5.455   Class :character  
##  Mode  :character    Mode  :character   Median : 7.003   Mode  :character  
##                                         Mean   : 6.991                     
##                                         3rd Qu.: 8.528                     
##                                         Max.   :10.000                     
##  Triglyceride.Level Fasting.Blood.Sugar   CRP.Level         Homocysteine.Level
##  Min.   :100.0      Min.   : 80.0       Min.   : 0.003647   Min.   : 5.00     
##  1st Qu.:176.0      1st Qu.: 99.0       1st Qu.: 3.681800   1st Qu.: 8.73     
##  Median :250.0      Median :120.0       Median : 7.472164   Median :12.41     
##  Mean   :250.7      Mean   :120.1       Mean   : 7.472200   Mean   :12.46     
##  3rd Qu.:326.0      3rd Qu.:141.0       3rd Qu.:11.244879   3rd Qu.:16.13     
##  Max.   :400.0      Max.   :160.0       Max.   :14.997087   Max.   :20.00     
##  Heart.Disease.Status
##  No :8000            
##  Yes:2000            
##                      
##                      
##                      
##

#Data Transform

cat("\n>> Data Transformation: Normalisasi dan Standarisasi\n")

## 
## >> Data Transformation: Normalisasi dan Standarisasi

# Select the numeric columns.
numeric_data <- data[, num_cols]

# Min-max normalisation: rescale a vector linearly so that its minimum maps
# to 0 and its maximum maps to 1.
min_max_scale <- function(x) {
  lo <- min(x)
  hi <- max(x)
  (x - lo) / (hi - lo)
}
normalized_data <- as.data.frame(lapply(numeric_data, min_max_scale))

# Standardisation: centre every column and divide by its standard deviation.
standardized_data <- as.data.frame(scale(numeric_data))

cat("\nRingkasan data setelah normalisasi:\n")

## 
## Ringkasan data setelah normalisasi:

print(summary(normalized_data))

##       Age         Blood.Pressure   Cholesterol.Level      BMI        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000    Min.   :0.0000  
##  1st Qu.:0.2581   1st Qu.:0.2333   1st Qu.:0.2467    1st Qu.:0.2576  
##  Median :0.5000   Median :0.5000   Median :0.5067    Median :0.5036  
##  Mean   :0.5048   Mean   :0.4960   Mean   :0.5028    Mean   :0.5035  
##  3rd Qu.:0.7581   3rd Qu.:0.7500   3rd Qu.:0.7533    3rd Qu.:0.7505  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000    Max.   :1.0000  
##   Sleep.Hours     Triglyceride.Level Fasting.Blood.Sugar   CRP.Level     
##  Min.   :0.0000   Min.   :0.0000     Min.   :0.0000      Min.   :0.0000  
##  1st Qu.:0.2425   1st Qu.:0.2533     1st Qu.:0.2375      1st Qu.:0.2453  
##  Median :0.5005   Median :0.5000     Median :0.5000      Median :0.4981  
##  Mean   :0.4985   Mean   :0.5024     Mean   :0.5018      Mean   :0.4981  
##  3rd Qu.:0.7546   3rd Qu.:0.7533     3rd Qu.:0.7625      3rd Qu.:0.7497  
##  Max.   :1.0000   Max.   :1.0000     Max.   :1.0000      Max.   :1.0000  
##  Homocysteine.Level
##  Min.   :0.0000    
##  1st Qu.:0.2487    
##  Median :0.4940    
##  Mean   :0.4971    
##  3rd Qu.:0.7421    
##  Max.   :1.0000

cat("\nRingkasan data setelah standarisasi:\n")

## 
## Ringkasan data setelah standarisasi:

print(summary(standardized_data))

##       Age           Blood.Pressure     Cholesterol.Level       BMI           
##  Min.   :-1.72260   Min.   :-1.69502   Min.   :-1.73355   Min.   :-1.757804  
##  1st Qu.:-0.84191   1st Qu.:-0.89758   1st Qu.:-0.88318   1st Qu.:-0.858453  
##  Median :-0.01626   Median : 0.01377   Median : 0.01316   Median : 0.000352  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.86443   3rd Qu.: 0.86817   3rd Qu.: 0.86353   3rd Qu.: 0.862159  
##  Max.   : 1.69008   Max.   : 1.72256   Max.   : 1.71391   Max.   : 1.733240  
##   Sleep.Hours        Triglyceride.Level  Fasting.Blood.Sugar
##  Min.   :-1.708024   Min.   :-1.733475   Min.   :-1.703957  
##  1st Qu.:-0.877252   1st Qu.:-0.859449   1st Qu.:-0.897438  
##  Median : 0.006792   Median :-0.008424   Median :-0.006023  
##  Mean   : 0.000000   Mean   : 0.000000   Mean   : 0.000000  
##  3rd Qu.: 0.877543   3rd Qu.: 0.865602   3rd Qu.: 0.885391  
##  Max.   : 1.718212   Max.   : 1.716627   Max.   : 1.691910  
##    CRP.Level          Homocysteine.Level
##  Min.   :-1.7230084   Min.   :-1.72627  
##  1st Qu.:-0.8744519   1st Qu.:-0.86277  
##  Median :-0.0000083   Median :-0.01083  
##  Mean   : 0.0000000   Mean   : 0.00000  
##  3rd Qu.: 0.8703635   3rd Qu.: 0.85082  
##  Max.   : 1.7360044   Max.   : 1.74640

# data reduction - PCA


cat("\n>> PCA - Principal Component Analysis\n")

## 
## >> PCA - Principal Component Analysis

pca_result <- prcomp(standardized_data, center = TRUE, scale. = TRUE)

summary(pca_result)

## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5    PC6    PC7    PC8
## Standard deviation     1.0222 1.0198 1.0148 1.0009 0.9953 0.9940 0.9885 0.9852
## Proportion of Variance 0.1161 0.1156 0.1144 0.1113 0.1101 0.1098 0.1086 0.1078
## Cumulative Proportion  0.1161 0.2317 0.3461 0.4574 0.5675 0.6772 0.7858 0.8936
##                           PC9
## Standard deviation     0.9784
## Proportion of Variance 0.1064
## Cumulative Proportion  1.0000

# Visualisasi Scree Plot (untuk melihat komponen penting)
library(factoextra)

## Warning: package 'factoextra' was built under R version 4.4.3

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

fviz_eig(pca_result, addlabels = TRUE, ylim = c(0, 50))

print(pca_result$rotation)

##                             PC1         PC2         PC3         PC4        PC5
## Age                 -0.36848805 -0.04212263  0.49486012  0.11104379  0.5479900
## Blood.Pressure       0.32564120 -0.13273467 -0.49862244  0.34123989  0.4106359
## Cholesterol.Level   -0.42564014 -0.45725026  0.07551057  0.10243494 -0.2703604
## BMI                 -0.24256154 -0.52460491 -0.03388758 -0.23603483  0.3427929
## Sleep.Hours         -0.39499448  0.08754526 -0.27206899  0.54699901 -0.2224816
## Triglyceride.Level  -0.04960765 -0.10847759 -0.46554120 -0.39431939  0.3041300
## Fasting.Blood.Sugar -0.38301274  0.24437332 -0.27643371 -0.54243643 -0.2303277
## CRP.Level           -0.09142637  0.59155766  0.16332648 -0.07351989  0.3172713
## Homocysteine.Level   0.45236008 -0.25928506  0.32526549 -0.22512463 -0.2116088
##                             PC6         PC7         PC8         PC9
## Age                  0.14768000  0.02501341 -0.35387527  0.39657177
## Blood.Pressure      -0.30740257  0.01635034  0.13295960  0.47979840
## Cholesterol.Level    0.12149845  0.18792062  0.61679938  0.29990939
## BMI                 -0.45224200 -0.31928486  0.03228907 -0.42904267
## Sleep.Hours          0.12406416 -0.60198509 -0.18544699 -0.03767127
## Triglyceride.Level   0.71259703 -0.10794862  0.03876290 -0.01049958
## Fasting.Blood.Sugar -0.36048071 -0.03814620 -0.19520375  0.44763068
## CRP.Level           -0.06899174 -0.31917313  0.63345082 -0.03267511
## Homocysteine.Level   0.08072754 -0.62006810  0.01877203  0.36814510

# Cumulative share of the total variance explained by the leading components.
explained_var <- cumsum(pca_result$sdev^2 / sum(pca_result$sdev^2))
# Smallest number of components whose cumulative share reaches 90%.
num_components <- which(explained_var >= 0.9)[1]

# drop = FALSE keeps the scores as a matrix (and the "PC1" column name) even
# when a single component is selected; a plain `[, 1:n]` would collapse to a
# bare vector in that case.
pca_data <- as.data.frame(
  pca_result$x[, seq_len(num_components), drop = FALSE]
)
cat("Jumlah komponen PCA yang dipilih (>=90% variansi):", num_components, "\n")

## Jumlah komponen PCA yang dipilih (>=90% variansi): 9

data %>%
  dplyr::select(Age, Blood.Pressure, BMI, Cholesterol.Level, CRP.Level, 
                Fasting.Blood.Sugar, Homocysteine.Level, Sleep.Hours, Triglyceride.Level) %>%  
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot(fill = "skyblue") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(title = "Boxplot untuk Kolom Numerik Tertentu", x = "Variabel", y = "Nilai")

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

#Data Reduction

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.4.3

## corrplot 0.95 loaded

library(dplyr)
numerik_data <- data%>%
  dplyr::select(Age, Blood.Pressure, BMI, Cholesterol.Level, CRP.Level, 
                Fasting.Blood.Sugar, Homocysteine.Level, Sleep.Hours, Triglyceride.Level)

cor_matrix <- cor(numerik_data, use = "complete.obs")

corrplot(cor_matrix, method = "color", type = "upper", 
         tl.col = "black", tl.cex = 0.8, number.cex = 0.7, 
         addCoef.col = "black", # tampilkan angka korelasi
         col = colorRampPalette(c("red", "white", "blue"))(200),
         title = "Korelasi Antar Variabel Numerik", mar = c(0,0,1,0))

#Modelling

library(MASS)

## Warning: package 'MASS' was built under R version 4.4.3

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(caret)

## Warning: package 'caret' was built under R version 4.4.3

## Loading required package: lattice

library(ROSE)

## Warning: package 'ROSE' was built under R version 4.4.3

## Loaded ROSE 0.0-4

set.seed(123)

# Keep the leading principal components that together explain >= 90% of the
# variance. drop = FALSE keeps a matrix (and the PC column names) even when
# only one component is selected.
n_components <- which(cumsum(pca_result$sdev^2 / sum(pca_result$sdev^2)) >= 0.9)[1]
pca_data <- as.data.frame(pca_result$x[, seq_len(n_components), drop = FALSE])
pca_data$target <- as.factor(data$Heart.Disease.Status)

# NOTE(review): pca_result was fitted on all rows before this split, so the
# held-out rows contributed to the scaling and the component loadings.
# 80/20 split stratified on the target.
train_index <- createDataPartition(pca_data$target, p = 0.8, list = FALSE)
train_pca <- pca_data[train_index, ]
test_pca <- pca_data[-train_index, ]

# Rebalance the training set only; the test set keeps its original class mix.
train_pca_bal <- ROSE(target ~ ., data = train_pca, seed = 123)$data

# Give the test target the same level order as the balanced training target.
test_pca$target <- factor(test_pca$target, levels = levels(train_pca_bal$target))


cat("\n>> LDA dengan PCA + ROSE\n")

## 
## >> LDA dengan PCA + ROSE

lda_model_pca <- lda(target ~ ., data = train_pca_bal)
lda_pred_pca <- predict(lda_model_pca, newdata = test_pca)$class
conf_lda_pca <- confusionMatrix(lda_pred_pca, test_pca$target)
print(conf_lda_pca)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  936 248
##        Yes 664 152
##                                          
##                Accuracy : 0.544          
##                  95% CI : (0.5219, 0.566)
##     No Information Rate : 0.8            
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : -0.0252        
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.5850         
##             Specificity : 0.3800         
##          Pos Pred Value : 0.7905         
##          Neg Pred Value : 0.1863         
##              Prevalence : 0.8000         
##          Detection Rate : 0.4680         
##    Detection Prevalence : 0.5920         
##       Balanced Accuracy : 0.4825         
##                                          
##        'Positive' Class : No             
##

cat("\n>> Logistic Regression dengan PCA + ROSE\n")

## 
## >> Logistic Regression dengan PCA + ROSE

# Logistic regression on the balanced principal-component scores.
log_model_pca <- glm(target ~ ., data = train_pca_bal, family = "binomial")
# Predicted probability of the second factor level for each test row.
log_pred_pca <- predict(log_model_pca, newdata = test_pca, type = "response")

# Map probabilities to class labels taken from the training factor levels
# instead of hard-coded strings, so the labels cannot drift from the data.
target_levels <- levels(train_pca_bal$target)
log_pred_class <- ifelse(log_pred_pca > 0.5, target_levels[2], target_levels[1])
log_pred_class <- factor(log_pred_class, levels = levels(test_pca$target))

conf_log_pca <- confusionMatrix(log_pred_class, test_pca$target)
print(conf_log_pca)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  936 248
##        Yes 664 152
##                                          
##                Accuracy : 0.544          
##                  95% CI : (0.5219, 0.566)
##     No Information Rate : 0.8            
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : -0.0252        
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.5850         
##             Specificity : 0.3800         
##          Pos Pred Value : 0.7905         
##          Neg Pred Value : 0.1863         
##              Prevalence : 0.8000         
##          Detection Rate : 0.4680         
##    Detection Prevalence : 0.5920         
##       Balanced Accuracy : 0.4825         
##                                          
##        'Positive' Class : No             
##

library(caret)
library(ROSE)

set.seed(123)

# 80/20 split stratified on the target, done before scaling and PCA so that
# both are estimated from the training rows only.
# NOTE(review): the median imputation earlier in the file was computed on all
# rows, so that step still uses information from the test rows.
train_index <- createDataPartition(data$Heart.Disease.Status, p = 0.8, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]

# Numeric predictors of each partition.
num_cols <- vapply(train_data, is.numeric, logical(1))
train_num <- train_data[, num_cols]
test_num <- test_data[, num_cols]

# Standardise with the training means and standard deviations, then apply the
# same parameters to the test rows.
train_num_scaled <- scale(train_num)
test_num_scaled <- scale(test_num, center = attr(train_num_scaled, "scaled:center"),
                         scale = attr(train_num_scaled, "scaled:scale"))

# PCA fitted on the training rows; keep the components reaching >= 90% of the
# cumulative variance.
pca_res <- prcomp(train_num_scaled, center = TRUE, scale. = TRUE)
var_exp <- cumsum(pca_res$sdev^2) / sum(pca_res$sdev^2)
n_comp <- which(var_exp >= 0.9)[1]

# drop = FALSE keeps the "PC*" column names when a single component is
# selected; otherwise the train and test frames would end up with different,
# auto-generated column names and prediction would fail.
train_pca <- as.data.frame(pca_res$x[, seq_len(n_comp), drop = FALSE])
train_pca$Heart.Disease.Status <- train_data$Heart.Disease.Status

# Project the test rows onto the training components.
test_pca_mat <- predict(pca_res, newdata = test_num_scaled)
test_pca <- as.data.frame(test_pca_mat[, seq_len(n_comp), drop = FALSE])
test_pca$Heart.Disease.Status <- test_data$Heart.Disease.Status

# Rebalance the training set only.
train_pca_bal <- ROSE(Heart.Disease.Status ~ ., data = train_pca, seed = 123)$data

# Give the test target the same level order as the balanced training target.
test_pca$Heart.Disease.Status <- factor(test_pca$Heart.Disease.Status, levels = levels(train_pca_bal$Heart.Disease.Status))

log_model <- glm(Heart.Disease.Status ~ ., data = train_pca_bal, family = binomial)

# Probabilities above 0.5 map to the second factor level, the rest to the first.
pred_prob <- predict(log_model, newdata = test_pca, type = "response")
pred_class <- ifelse(pred_prob > 0.5, levels(train_pca_bal$Heart.Disease.Status)[2], levels(train_pca_bal$Heart.Disease.Status)[1])
pred_class <- factor(pred_class, levels = levels(test_pca$Heart.Disease.Status))

conf_mat <- confusionMatrix(pred_class, test_pca$Heart.Disease.Status)
print(conf_mat)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  936 247
##        Yes 664 153
##                                           
##                Accuracy : 0.5445          
##                  95% CI : (0.5224, 0.5665)
##     No Information Rate : 0.8             
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.0234         
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5850          
##             Specificity : 0.3825          
##          Pos Pred Value : 0.7912          
##          Neg Pred Value : 0.1873          
##              Prevalence : 0.8000          
##          Detection Rate : 0.4680          
##    Detection Prevalence : 0.5915          
##       Balanced Accuracy : 0.4838          
##                                           
##        'Positive' Class : No              
##

Klasifikasi Penyakit Jantung dengan Analisis Diskriminan dan Regresi Logistik

Kelompok 9 2023 F Farah Raina Febiana (23031554132), Reva Deshinta Isyana (23031554153), Salsa Rahma Aulia (23031554219)

2025-06-05

Load library

Load data