# Names assigned to the 14 columns of the input file, which is read without a
# header row (header = FALSE below).
col_names <- c(
  "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
  "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"
)

# Load the data set. Cells containing "?" are read as NA.
# (The original call carried a stray empty positional argument after the file
# name; it has been dropped so every argument is passed explicitly.)
data <- read.csv(
  "cleveland.data.csv",
  header = FALSE,
  col.names = col_names,
  na.strings = "?"
)

Variabel X (fitur) & Y (target)

# Split the loaded table into predictors (every column except `num`) and the
# response. Both objects are copies taken at this point, so changes made to
# `data` afterwards are not reflected in X or Y.
feature_names <- col_names[col_names != "num"]
X <- data[, feature_names]
Y <- data[["num"]]

Memeriksa struktur awal dan missing values

# Show the type and the first few values of every column of the loaded data.
str(data)
## 'data.frame':    303 obs. of  14 variables:
##  $ age     : num  63 67 67 37 41 56 62 57 63 53 ...
##  $ sex     : num  1 1 1 1 0 1 0 0 1 1 ...
##  $ cp      : num  1 4 4 3 2 2 4 4 4 4 ...
##  $ trestbps: num  145 160 120 130 130 120 140 120 130 140 ...
##  $ chol    : num  233 286 229 250 204 236 268 354 254 203 ...
##  $ fbs     : num  1 0 0 0 0 0 0 0 0 1 ...
##  $ restecg : num  2 2 2 0 2 0 2 0 2 2 ...
##  $ thalach : num  150 108 129 187 172 178 160 163 147 155 ...
##  $ exang   : num  0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak : num  2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
##  $ slope   : num  3 2 2 3 1 1 3 1 2 3 ...
##  $ ca      : num  0 3 2 0 0 0 2 0 1 0 ...
##  $ thal    : num  6 3 7 3 3 3 3 3 7 7 ...
##  $ num     : int  0 2 1 0 0 0 3 0 2 1 ...
# Per-column summary statistics; columns with missing values also report an
# NA count.
summary(data)
##       age             sex               cp           trestbps    
##  Min.   :29.00   Min.   :0.0000   Min.   :1.000   Min.   : 94.0  
##  1st Qu.:48.00   1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:120.0  
##  Median :56.00   Median :1.0000   Median :3.000   Median :130.0  
##  Mean   :54.44   Mean   :0.6799   Mean   :3.158   Mean   :131.7  
##  3rd Qu.:61.00   3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:140.0  
##  Max.   :77.00   Max.   :1.0000   Max.   :4.000   Max.   :200.0  
##                                                                  
##       chol            fbs            restecg          thalach     
##  Min.   :126.0   Min.   :0.0000   Min.   :0.0000   Min.   : 71.0  
##  1st Qu.:211.0   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:133.5  
##  Median :241.0   Median :0.0000   Median :1.0000   Median :153.0  
##  Mean   :246.7   Mean   :0.1485   Mean   :0.9901   Mean   :149.6  
##  3rd Qu.:275.0   3rd Qu.:0.0000   3rd Qu.:2.0000   3rd Qu.:166.0  
##  Max.   :564.0   Max.   :1.0000   Max.   :2.0000   Max.   :202.0  
##                                                                   
##      exang           oldpeak         slope             ca        
##  Min.   :0.0000   Min.   :0.00   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.80   Median :2.000   Median :0.0000  
##  Mean   :0.3267   Mean   :1.04   Mean   :1.601   Mean   :0.6722  
##  3rd Qu.:1.0000   3rd Qu.:1.60   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :6.20   Max.   :3.000   Max.   :3.0000  
##                                                  NA's   :4       
##       thal            num        
##  Min.   :3.000   Min.   :0.0000  
##  1st Qu.:3.000   1st Qu.:0.0000  
##  Median :3.000   Median :0.0000  
##  Mean   :4.734   Mean   :0.9373  
##  3rd Qu.:7.000   3rd Qu.:2.0000  
##  Max.   :7.000   Max.   :4.0000  
##  NA's   :2
# Number of missing values in each column.
colSums(is.na(data))
##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        0        0        0        0        0        0        0        0 
##    exang  oldpeak    slope       ca     thal      num 
##        0        0        0        4        2        0

Menangani missing values

Mengisi NA di kolom ca dan thal dengan nilai tipikal kolom (median/modus)

# Impute missing `ca` and `thal` entries with the most frequent observed value
# (mode), not the median. Both columns hold category codes (they are converted
# to factors further down), and a median of codes can fall between two codes
# when the two middle values differ, which would create a level that does not
# exist in the data. The mode is always one of the observed codes.
#
# values: an atomic vector that may contain NA.
# Returns `values` with every NA replaced by the most frequent non-NA value;
# ties go to the value that appears first. A vector with no non-NA values is
# returned unchanged.
impute_mode <- function(values) {
  observed <- values[!is.na(values)]
  if (length(observed) == 0L) {
    return(values)
  }
  candidates <- unique(observed)
  # tabulate(match(...)) counts occurrences without converting values to text.
  most_common <- candidates[which.max(tabulate(match(observed, candidates)))]
  values[is.na(values)] <- most_common
  values
}

data$ca <- impute_mode(data$ca)
data$thal <- impute_mode(data$thal)

Mengubah tipe data: Kategorikal -> Faktor

# Recode every categorical attribute, including the target `num`, as a factor
# so that it is handled as a set of levels rather than as a number.
categorical_cols <- c(
  "sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal", "num"
)
data[categorical_cols] <- lapply(data[categorical_cols], as.factor)

Hapus outlier

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Drop rows that fall outside the 1.5 * IQR fences of any continuous column.
#
# The columns are processed one after another and the quartiles are recomputed
# on the rows that survived the previous columns, so the outcome depends on the
# order of `outlier_cols`.
outlier_cols <- c("age", "trestbps", "chol", "thalach", "oldpeak")
for (col in outlier_cols) {
  Q1 <- quantile(data[[col]], 0.25, na.rm = TRUE)
  Q3 <- quantile(data[[col]], 0.75, na.rm = TRUE)
  # Named `iqr_width` so the variable does not shadow stats::IQR().
  iqr_width <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * iqr_width
  upper_bound <- Q3 + 1.5 * iqr_width
  # `.data[[col]]` looks the column up in the frame being filtered instead of
  # reaching back to the global `data` object from inside the pipeline.
  data <- data %>%
    filter(.data[[col]] >= lower_bound & .data[[col]] <= upper_bound)
}

Memeriksa data dan hapus duplikasi

library(dplyr)
# Remove exact duplicate rows, then keep only rows whose values lie in a
# logically valid range: age between 20 and 100, and strictly positive
# cholesterol and resting blood pressure readings.
data <- data %>%
  distinct() %>%
  filter(age >= 20, age <= 100, chol > 0, trestbps > 0)
# Re-inspect the structure after cleaning.
str(data)
## 'data.frame':    284 obs. of  14 variables:
##  $ age     : num  63 67 67 37 41 56 62 57 63 53 ...
##  $ sex     : Factor w/ 2 levels "0","1": 2 2 2 2 1 2 1 1 2 2 ...
##  $ cp      : Factor w/ 4 levels "1","2","3","4": 1 4 4 3 2 2 4 4 4 4 ...
##  $ trestbps: num  145 160 120 130 130 120 140 120 130 140 ...
##  $ chol    : num  233 286 229 250 204 236 268 354 254 203 ...
##  $ fbs     : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 2 ...
##  $ restecg : Factor w/ 3 levels "0","1","2": 3 3 3 1 3 1 3 1 3 3 ...
##  $ thalach : num  150 108 129 187 172 178 160 163 147 155 ...
##  $ exang   : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 2 1 2 ...
##  $ oldpeak : num  2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
##  $ slope   : Factor w/ 3 levels "1","2","3": 3 2 2 3 1 1 3 1 2 3 ...
##  $ ca      : Factor w/ 4 levels "0","1","2","3": 1 4 3 1 1 1 3 1 2 1 ...
##  $ thal    : Factor w/ 3 levels "3","6","7": 2 1 3 1 1 1 1 1 3 3 ...
##  $ num     : Factor w/ 5 levels "0","1","2","3",..: 1 3 2 1 1 1 4 1 3 2 ...
# Summary of the cleaned data: quartiles for numeric columns, level counts for
# factor columns.
summary(data)
##       age        sex     cp         trestbps        chol       fbs     restecg
##  Min.   :29.00   0: 86   1: 22   Min.   : 94   Min.   :126.0   0:244   0:144  
##  1st Qu.:47.00   1:198   2: 49   1st Qu.:120   1st Qu.:210.8   1: 40   1:  2  
##  Median :55.00           3: 82   Median :130   Median :239.5           2:138  
##  Mean   :54.07           4:131   Mean   :130   Mean   :242.5                  
##  3rd Qu.:60.00                   3rd Qu.:140   3rd Qu.:271.0                  
##  Max.   :77.00                   Max.   :170   Max.   :360.0                  
##     thalach      exang      oldpeak       slope   ca      thal    num    
##  Min.   : 88.0   0:194   Min.   :0.0000   1:138   0:170   3:162   0:158  
##  1st Qu.:133.8   1: 90   1st Qu.:0.0000   2:130   1: 63   6: 17   1: 53  
##  Median :153.0           Median :0.6000   3: 16   2: 35   7:105   2: 33  
##  Mean   :150.0           Mean   :0.9486           3: 16           3: 29  
##  3rd Qu.:168.0           3rd Qu.:1.6000                           4: 11  
##  Max.   :202.0           Max.   :4.0000
# Confirm that no missing values remain in any column.
colSums(is.na(data))
##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        0        0        0        0        0        0        0        0 
##    exang  oldpeak    slope       ca     thal      num 
##        0        0        0        0        0        0

Statistik Deskriptif

library(psych)
## Warning: package 'psych' was built under R version 4.5.2
# Descriptive statistics from psych::describe(). Rows whose name ends in `*`
# are factor columns summarised through their integer level codes (1, 2, ...),
# so their mean/sd/skew are not on the original coding scale.
describe(data)
##          vars   n   mean    sd median trimmed   mad min max range  skew
## age         1 284  54.07  9.12   55.0   54.18 10.38  29  77    48 -0.15
## sex*        2 284   1.70  0.46    2.0    1.75  0.00   1   2     1 -0.85
## cp*         3 284   3.13  0.96    3.0    3.26  1.48   1   4     3 -0.79
## trestbps    4 284 129.98 15.45  130.0  129.50 14.83  94 170    76  0.25
## chol        5 284 242.54 44.73  239.5  241.61 45.22 126 360   234  0.19
## fbs*        6 284   1.14  0.35    1.0    1.05  0.00   1   2     1  2.05
## restecg*    7 284   1.98  1.00    1.0    1.97  0.00   1   3     2  0.04
## thalach     8 284 150.00 22.66  153.0  151.31 22.98  88 202   114 -0.48
## exang*      9 284   1.32  0.47    1.0    1.27  0.00   1   2     1  0.78
## oldpeak    10 284   0.95  1.03    0.6    0.80  0.89   0   4     4  0.92
## slope*     11 284   1.57  0.60    2.0    1.52  1.48   1   3     2  0.51
## ca*        12 284   1.64  0.90    1.0    1.48  0.00   1   4     3  1.23
## thal*      13 284   1.80  0.95    1.0    1.75  0.00   1   3     2  0.41
## num*       14 284   1.88  1.19    1.0    1.68  0.00   1   5     4  1.14
##          kurtosis   se
## age         -0.58 0.54
## sex*        -1.28 0.03
## cp*         -0.51 0.06
## trestbps    -0.26 0.92
## chol        -0.31 2.65
## fbs*         2.23 0.02
## restecg*    -2.00 0.06
## thalach     -0.37 1.34
## exang*      -1.39 0.03
## oldpeak     -0.12 0.06
## slope*      -0.65 0.04
## ca*          0.41 0.05
## thal*       -1.77 0.06
## num*         0.10 0.07
# Same summary as printed after cleaning; `data` has not been modified since
# then, so the output is unchanged.
summary(data)
##       age        sex     cp         trestbps        chol       fbs     restecg
##  Min.   :29.00   0: 86   1: 22   Min.   : 94   Min.   :126.0   0:244   0:144  
##  1st Qu.:47.00   1:198   2: 49   1st Qu.:120   1st Qu.:210.8   1: 40   1:  2  
##  Median :55.00           3: 82   Median :130   Median :239.5           2:138  
##  Mean   :54.07           4:131   Mean   :130   Mean   :242.5                  
##  3rd Qu.:60.00                   3rd Qu.:140   3rd Qu.:271.0                  
##  Max.   :77.00                   Max.   :170   Max.   :360.0                  
##     thalach      exang      oldpeak       slope   ca      thal    num    
##  Min.   : 88.0   0:194   Min.   :0.0000   1:138   0:170   3:162   0:158  
##  1st Qu.:133.8   1: 90   1st Qu.:0.0000   2:130   1: 63   6: 17   1: 53  
##  Median :153.0           Median :0.6000   3: 16   2: 35   7:105   2: 33  
##  Mean   :150.0           Mean   :0.9486           3: 16           3: 29  
##  3rd Qu.:168.0           3rd Qu.:1.6000                           4: 11  
##  Max.   :202.0           Max.   :4.0000

Korelasi

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
# Pairwise correlations between the continuous measurements, drawn as the
# upper triangle of a circle-encoded correlation plot.
numeric_cols <- c("age", "trestbps", "chol", "thalach", "oldpeak")
numeric_data <- data[numeric_cols]
cor_matrix <- cor(numeric_data, use = "complete.obs")
corrplot(
  cor_matrix,
  method = "circle",
  type = "upper",
  tl.col = "blue"
)

# Histogram

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
# Age distribution, 5-year bins.
ggplot(data, aes(x = age)) +
  geom_histogram(binwidth = 5, fill = "yellow", alpha = 0.7) +
  labs(title = "Distribusi Usia")

# Cholesterol distribution, bins of width 20.
ggplot(data, aes(x = chol)) +
  geom_histogram(binwidth = 20, fill = "darkgrey", alpha = 0.7) +
  labs(title = "Distribusi Kolesterol")

# Maximum heart rate distribution, bins of width 10.
ggplot(data, aes(x = thalach)) +
  geom_histogram(binwidth = 10, fill = "pink", alpha = 0.7) +
  labs(title = "Distribusi Detak Jantung Maksimal")

# Number of rows per level of the target factor `num`.
ggplot(data, aes(x = num)) +
  geom_bar(fill = "purple", alpha = 0.7) +
  labs(title = "Distribusi Diagnosis Penyakit Jantung")