# Names assigned to the 14 columns of the data file, which is read with
# header = FALSE (i.e. its first line is treated as data, not as a header).
col_names <- c(
  "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
  "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"
)

# Read the raw data; the string "?" is interpreted as a missing value (NA).
# The previous call contained a stray empty positional argument
# (`read.csv(file, , header = ...)`) that silently bound to `sep`; it is
# removed here, which leaves the default separator in effect as before.
data <- read.csv(
  "cleveland.data.csv",
  header = FALSE,
  col.names = col_names,
  na.strings = "?"
)

# Split into predictors (every column except "num") and the target column.
# NOTE(review): X and Y are snapshots of the raw data. The imputation, factor
# conversion and row filtering applied to `data` further down are NOT
# reflected in them -- confirm this is intended before using X / Y.
X <- data[, setdiff(col_names, "num")]
Y <- data$num

# Inspect the column types of the freshly loaded data.
str(data)
## 'data.frame': 303 obs. of 14 variables:
## $ age : num 63 67 67 37 41 56 62 57 63 53 ...
## $ sex : num 1 1 1 1 0 1 0 0 1 1 ...
## $ cp : num 1 4 4 3 2 2 4 4 4 4 ...
## $ trestbps: num 145 160 120 130 130 120 140 120 130 140 ...
## $ chol : num 233 286 229 250 204 236 268 354 254 203 ...
## $ fbs : num 1 0 0 0 0 0 0 0 0 1 ...
## $ restecg : num 2 2 2 0 2 0 2 0 2 2 ...
## $ thalach : num 150 108 129 187 172 178 160 163 147 155 ...
## $ exang : num 0 1 1 0 0 0 0 1 0 1 ...
## $ oldpeak : num 2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
## $ slope : num 3 2 2 3 1 1 3 1 2 3 ...
## $ ca : num 0 3 2 0 0 0 2 0 1 0 ...
## $ thal : num 6 3 7 3 3 3 3 3 7 7 ...
## $ num : int 0 2 1 0 0 0 3 0 2 1 ...
# Print per-column summary statistics of the raw data; columns containing
# missing values also report their NA count.
summary(data)
## age sex cp trestbps
## Min. :29.00 Min. :0.0000 Min. :1.000 Min. : 94.0
## 1st Qu.:48.00 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:120.0
## Median :56.00 Median :1.0000 Median :3.000 Median :130.0
## Mean :54.44 Mean :0.6799 Mean :3.158 Mean :131.7
## 3rd Qu.:61.00 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:140.0
## Max. :77.00 Max. :1.0000 Max. :4.000 Max. :200.0
##
## chol fbs restecg thalach
## Min. :126.0 Min. :0.0000 Min. :0.0000 Min. : 71.0
## 1st Qu.:211.0 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:133.5
## Median :241.0 Median :0.0000 Median :1.0000 Median :153.0
## Mean :246.7 Mean :0.1485 Mean :0.9901 Mean :149.6
## 3rd Qu.:275.0 3rd Qu.:0.0000 3rd Qu.:2.0000 3rd Qu.:166.0
## Max. :564.0 Max. :1.0000 Max. :2.0000 Max. :202.0
##
## exang oldpeak slope ca
## Min. :0.0000 Min. :0.00 Min. :1.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.80 Median :2.000 Median :0.0000
## Mean :0.3267 Mean :1.04 Mean :1.601 Mean :0.6722
## 3rd Qu.:1.0000 3rd Qu.:1.60 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :6.20 Max. :3.000 Max. :3.0000
## NA's :4
## thal num
## Min. :3.000 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:0.0000
## Median :3.000 Median :0.0000
## Mean :4.734 Mean :0.9373
## 3rd Qu.:7.000 3rd Qu.:2.0000
## Max. :7.000 Max. :4.0000
## NA's :2
# Count the missing values (NA) in each column of the raw data.
colSums(is.na(data))
## age sex cp trestbps chol fbs restecg thalach
## 0 0 0 0 0 0 0 0
## exang oldpeak slope ca thal num
## 0 0 0 4 2 0
# Replace the NAs of a numeric vector with its median, restricted to values
# that actually occur in the vector.
#
# `median()` averages the two middle values when the number of non-missing
# observations is even, which for category codes can produce a value that is
# not a real category (e.g. 4.5 for codes 3 and 6). A type-1 quantile
# (inverse empirical CDF) always returns an observed value and is identical to
# `median()` whenever the median itself is an observed value.
#
# x: numeric vector, possibly containing NA.
# Returns: `x` with every NA replaced by the observed median (NA if `x` has no
#          non-missing values).
impute_observed_median <- function(x) {
  fill_value <- quantile(x, probs = 0.5, type = 1, na.rm = TRUE, names = FALSE)
  x[is.na(x)] <- fill_value
  x
}

# `ca` and `thal` are the only columns with missing values; both are turned
# into factors below, so the imputed value must be an existing code.
data$ca <- impute_observed_median(data$ca)
data$thal <- impute_observed_median(data$thal)

# Convert the coded columns (including the target `num`) to factors.
factor_cols <- c(
  "sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal", "num"
)
data[factor_cols] <- lapply(data[factor_cols], as.factor)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Drop rows that fall outside the 1.5 * IQR fences of each listed numeric
# column.
#
# The columns are processed one after another and `data` is overwritten on
# every iteration, so the quartiles of a column are computed on the rows that
# survived the previous columns; the result depends on the column order.
outlier_cols <- c("age", "trestbps", "chol", "thalach", "oldpeak")
for (col in outlier_cols) {
  Q1 <- quantile(data[[col]], 0.25, na.rm = TRUE)
  Q3 <- quantile(data[[col]], 0.75, na.rm = TRUE)
  # Named `iqr_width` rather than `IQR` so that stats::IQR() is not masked by
  # a global variable of the same name.
  iqr_width <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * iqr_width
  upper_bound <- Q3 + 1.5 * iqr_width
  # `.data[[col]]` addresses the column of the data frame being filtered by
  # its string name, instead of reaching back to the global `data` object.
  data <- data %>%
    filter(.data[[col]] >= lower_bound & .data[[col]] <= upper_bound)
}
library(dplyr)
# Remove exact duplicate rows, then validate logical ranges: keep only rows
# with age between 20 and 100 (inclusive) and strictly positive `chol` and
# `trestbps`.
data <- data %>%
  distinct() %>%
  filter(age >= 20, age <= 100, chol > 0, trestbps > 0)

# Show the structure of the cleaned data.
str(data)
## 'data.frame': 284 obs. of 14 variables:
## $ age : num 63 67 67 37 41 56 62 57 63 53 ...
## $ sex : Factor w/ 2 levels "0","1": 2 2 2 2 1 2 1 1 2 2 ...
## $ cp : Factor w/ 4 levels "1","2","3","4": 1 4 4 3 2 2 4 4 4 4 ...
## $ trestbps: num 145 160 120 130 130 120 140 120 130 140 ...
## $ chol : num 233 286 229 250 204 236 268 354 254 203 ...
## $ fbs : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 2 ...
## $ restecg : Factor w/ 3 levels "0","1","2": 3 3 3 1 3 1 3 1 3 3 ...
## $ thalach : num 150 108 129 187 172 178 160 163 147 155 ...
## $ exang : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 2 1 2 ...
## $ oldpeak : num 2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
## $ slope : Factor w/ 3 levels "1","2","3": 3 2 2 3 1 1 3 1 2 3 ...
## $ ca : Factor w/ 4 levels "0","1","2","3": 1 4 3 1 1 1 3 1 2 1 ...
## $ thal : Factor w/ 3 levels "3","6","7": 2 1 3 1 1 1 1 1 3 3 ...
## $ num : Factor w/ 5 levels "0","1","2","3",..: 1 3 2 1 1 1 4 1 3 2 ...
# Print per-column summaries of the cleaned data; factor columns are reported
# as counts per level.
summary(data)
## age sex cp trestbps chol fbs restecg
## Min. :29.00 0: 86 1: 22 Min. : 94 Min. :126.0 0:244 0:144
## 1st Qu.:47.00 1:198 2: 49 1st Qu.:120 1st Qu.:210.8 1: 40 1: 2
## Median :55.00 3: 82 Median :130 Median :239.5 2:138
## Mean :54.07 4:131 Mean :130 Mean :242.5
## 3rd Qu.:60.00 3rd Qu.:140 3rd Qu.:271.0
## Max. :77.00 Max. :170 Max. :360.0
## thalach exang oldpeak slope ca thal num
## Min. : 88.0 0:194 Min. :0.0000 1:138 0:170 3:162 0:158
## 1st Qu.:133.8 1: 90 1st Qu.:0.0000 2:130 1: 63 6: 17 1: 53
## Median :153.0 Median :0.6000 3: 16 2: 35 7:105 2: 33
## Mean :150.0 Mean :0.9486 3: 16 3: 29
## 3rd Qu.:168.0 3rd Qu.:1.6000 4: 11
## Max. :202.0 Max. :4.0000
# Re-count the missing values per column after imputation and filtering.
colSums(is.na(data))
## age sex cp trestbps chol fbs restecg thalach
## 0 0 0 0 0 0 0 0
## exang oldpeak slope ca thal num
## 0 0 0 0 0 0
library(psych)
## Warning: package 'psych' was built under R version 4.5.2
# Print descriptive statistics (mean, sd, median, skew, kurtosis, ...) for
# every column. In the output, factor columns are flagged with "*" and are
# summarised through their integer level codes.
describe(data)
## vars n mean sd median trimmed mad min max range skew
## age 1 284 54.07 9.12 55.0 54.18 10.38 29 77 48 -0.15
## sex* 2 284 1.70 0.46 2.0 1.75 0.00 1 2 1 -0.85
## cp* 3 284 3.13 0.96 3.0 3.26 1.48 1 4 3 -0.79
## trestbps 4 284 129.98 15.45 130.0 129.50 14.83 94 170 76 0.25
## chol 5 284 242.54 44.73 239.5 241.61 45.22 126 360 234 0.19
## fbs* 6 284 1.14 0.35 1.0 1.05 0.00 1 2 1 2.05
## restecg* 7 284 1.98 1.00 1.0 1.97 0.00 1 3 2 0.04
## thalach 8 284 150.00 22.66 153.0 151.31 22.98 88 202 114 -0.48
## exang* 9 284 1.32 0.47 1.0 1.27 0.00 1 2 1 0.78
## oldpeak 10 284 0.95 1.03 0.6 0.80 0.89 0 4 4 0.92
## slope* 11 284 1.57 0.60 2.0 1.52 1.48 1 3 2 0.51
## ca* 12 284 1.64 0.90 1.0 1.48 0.00 1 4 3 1.23
## thal* 13 284 1.80 0.95 1.0 1.75 0.00 1 3 2 0.41
## num* 14 284 1.88 1.19 1.0 1.68 0.00 1 5 4 1.14
## kurtosis se
## age -0.58 0.54
## sex* -1.28 0.03
## cp* -0.51 0.06
## trestbps -0.26 0.92
## chol -0.31 2.65
## fbs* 2.23 0.02
## restecg* -2.00 0.06
## thalach -0.37 1.34
## exang* -1.39 0.03
## oldpeak -0.12 0.06
## slope* -0.65 0.04
## ca* 0.41 0.05
## thal* -1.77 0.06
## num* 0.10 0.07
# Print the per-column summary of the cleaned data again.
# NOTE(review): `data` is not modified between the earlier summary() call and
# this one, so the output is identical -- this call looks redundant.
summary(data)
## age sex cp trestbps chol fbs restecg
## Min. :29.00 0: 86 1: 22 Min. : 94 Min. :126.0 0:244 0:144
## 1st Qu.:47.00 1:198 2: 49 1st Qu.:120 1st Qu.:210.8 1: 40 1: 2
## Median :55.00 3: 82 Median :130 Median :239.5 2:138
## Mean :54.07 4:131 Mean :130 Mean :242.5
## 3rd Qu.:60.00 3rd Qu.:140 3rd Qu.:271.0
## Max. :77.00 Max. :170 Max. :360.0
## thalach exang oldpeak slope ca thal num
## Min. : 88.0 0:194 Min. :0.0000 1:138 0:170 3:162 0:158
## 1st Qu.:133.8 1: 90 1st Qu.:0.0000 2:130 1: 63 6: 17 1: 53
## Median :153.0 Median :0.6000 3: 16 2: 35 7:105 2: 33
## Mean :150.0 Mean :0.9486 3: 16 3: 29
## 3rd Qu.:168.0 3rd Qu.:1.6000 4: 11
## Max. :202.0 Max. :4.0000
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
# Keep only the numeric (non-factor) columns for the correlation analysis.
numeric_data <- select(data, age, trestbps, chol, thalach, oldpeak)

# Pairwise correlations, computed on rows without missing values.
cor_matrix <- cor(
  numeric_data,
  use = "complete.obs"
)

# Draw the upper triangle of the correlation matrix as circles, with blue
# variable labels.
corrplot(
  cor_matrix,
  method = "circle",
  type = "upper",
  tl.col = "blue"
)
# Histogram
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# Histogram of `age` with a bin width of 5.
ggplot(data = data, mapping = aes(x = age)) +
  geom_histogram(binwidth = 5, fill = "yellow", alpha = 0.7) +
  labs(title = "Distribusi Usia")

# Histogram of `chol` with a bin width of 20.
ggplot(data = data, mapping = aes(x = chol)) +
  geom_histogram(binwidth = 20, fill = "darkgrey", alpha = 0.7) +
  labs(title = "Distribusi Kolesterol")

# Histogram of `thalach` with a bin width of 10.
ggplot(data = data, mapping = aes(x = thalach)) +
  geom_histogram(binwidth = 10, fill = "pink", alpha = 0.7) +
  labs(title = "Distribusi Detak Jantung Maksimal")

# Bar chart of the number of rows per level of the target factor `num`.
ggplot(data = data, mapping = aes(x = num)) +
  geom_bar(fill = "purple", alpha = 0.7) +
  labs(title = "Distribusi Diagnosis Penyakit Jantung")