# install package
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
| Variable | Type | Description |
|---|---|---|
| id | Identifier | Patient ID |
| age | Numeric | Age of the patient |
| sex | Categorical | Sex (0 = female, 1 = male) |
| chol | Numeric | Serum cholesterol level |
| trestbps | Numeric | Resting blood pressure |
| thalach | Numeric | Maximum heart rate achieved |
| target | Binary | Heart disease status (0 = no disease, 1 = disease) |
demographics = read.csv("demographics.csv")
head(demographics)
## id age sex
## 1 1 63 1
## 2 2 37 1
## 3 3 41 0
## 4 4 56 1
## 5 5 57 0
## 6 6 57 1
# check whether there is NA in all datasets
sum(is.na(demographics$id))
## [1] 0
any(duplicated(demographics$id))
## [1] FALSE
clinical = read.csv("clinical.csv")
head(clinical)
## id chol trestbps thalach
## 1 1 233 145 150
## 2 2 250 130 187
## 3 3 204 130 172
## 4 4 236 120 178
## 5 5 354 120 163
## 6 6 192 140 148
# check whether there is NA in all datasets
sum(is.na(clinical$id))
## [1] 0
any(duplicated(clinical$id))
## [1] FALSE
outcome = read.csv("outcome.csv")
head(outcome)
## id target
## 1 1 1
## 2 2 1
## 3 3 1
## 4 4 1
## 5 5 1
## 6 6 1
# check whether there is NA in all datasets
sum(is.na(outcome$id))
## [1] 0
any(duplicated(outcome$id))
## [1] FALSE
merged_df <- merge(demographics, clinical, by = "id")
merged_df <- merge(merged_df, outcome, by = "id")
summary(merged_df)
## id age sex chol
## Min. : 1.0 Min. :29.00 Min. :0.0000 Min. :126.0
## 1st Qu.: 76.5 1st Qu.:47.50 1st Qu.:0.0000 1st Qu.:211.0
## Median :152.0 Median :55.00 Median :1.0000 Median :240.0
## Mean :152.0 Mean :54.37 Mean :0.6832 Mean :246.3
## 3rd Qu.:227.5 3rd Qu.:61.00 3rd Qu.:1.0000 3rd Qu.:274.5
## Max. :303.0 Max. :77.00 Max. :1.0000 Max. :564.0
## trestbps thalach target
## Min. : 94.0 Min. : 71.0 Min. :0.0000
## 1st Qu.:120.0 1st Qu.:133.5 1st Qu.:0.0000
## Median :130.0 Median :153.0 Median :1.0000
## Mean :131.6 Mean :149.6 Mean :0.5446
## 3rd Qu.:140.0 3rd Qu.:166.0 3rd Qu.:1.0000
## Max. :200.0 Max. :202.0 Max. :1.0000
sum(is.na(merged_df))
## [1] 0
## change "sex" and "target" to factor
merged_df$sex <- factor(merged_df$sex,
levels = c(0, 1),
labels = c("Female", "Male"))
merged_df$target <- factor(merged_df$target,
levels = c(0, 1),
labels = c("No Disease", "Disease"))
quant_vars <- c("age", "chol", "trestbps", "thalach")
par(mfrow = c(2, 2))
for (col in quant_vars) {
hist(merged_df[[col]],
main = col,
xlab = col)
}
par(mfrow = c(2, 2))
for (col in quant_vars) {
boxplot(merged_df[[col]],
main = col,
xlab = col)
}
table(merged_df$sex)
##
## Female Male
## 96 207
table(merged_df$target)
##
## No Disease Disease
## 138 165
par(mfrow = c(2, 1))
ggplot(merged_df, aes(x = factor(target))) +
geom_bar(fill = "orange") +
labs(
title = "Distribution of target",
x = "target",
y = "Count"
)
ggplot(merged_df, aes(x = factor(sex))) +
geom_bar(fill = "steelblue") +
labs(
title = "Distribution of Gender",
x = "Gender",
y = "Count"
)
# First, do a visualization
ggplot(merged_df, aes(x = factor(sex), fill = factor(target))) +
geom_bar(position = "dodge") +
labs(
title = "Heart Disease by Gender",
x = "Gender",
y = "Count",
fill = "Heart Disease"
)
# We can also do a mosaic plot
mosaicplot(table(merged_df$sex, merged_df$target),
main = "Mosaic Plot of Sex vs Heart Disease",
xlab = "Sex",
ylab = "Heart Disease",
col = c("lightblue", "pink"))
model1 <- glm(
target ~ age + factor(sex) + chol + trestbps + thalach ,
data = merged_df,
family = binomial
)
summary(model1)
##
## Call:
## glm(formula = target ~ age + factor(sex) + chol + trestbps +
## thalach, family = binomial, data = merged_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.344548 1.832916 -0.188 0.8509
## age -0.014453 0.017584 -0.822 0.4111
## factor(sex)Male -1.860719 0.343297 -5.420 5.96e-08 ***
## chol -0.007105 0.002762 -2.572 0.0101 *
## trestbps -0.020107 0.008366 -2.404 0.0162 *
## thalach 0.046981 0.007700 6.101 1.05e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 417.64 on 302 degrees of freedom
## Residual deviance: 316.69 on 297 degrees of freedom
## AIC: 328.69
##
## Number of Fisher Scoring iterations: 4
model2 <- glm(
target ~ factor(sex) + chol + trestbps + thalach ,
data = merged_df,
family = binomial
)
summary(model2)
##
## Call:
## glm(formula = target ~ factor(sex) + chol + trestbps + thalach,
## family = binomial, data = merged_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.168485 1.538982 -0.759 0.44770
## factor(sex)Male -1.836105 0.341305 -5.380 7.46e-08 ***
## chol -0.007468 0.002731 -2.735 0.00624 **
## trestbps -0.021845 0.008134 -2.686 0.00724 **
## thalach 0.049189 0.007278 6.759 1.39e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 417.64 on 302 degrees of freedom
## Residual deviance: 317.37 on 298 degrees of freedom
## AIC: 327.37
##
## Number of Fisher Scoring iterations: 4
\[ \text{logit}(p) = \log\left(\frac{p}{1 - p}\right) = \beta_0 + \beta_1 \cdot \text{sex(1) male} + \beta_2 \cdot \text{chol} + \beta_3 \cdot \text{trestbps} + \beta_4 \cdot \text{thalach} \]
result_table <- data.frame(
Estimate = coef(model2),
OR = exp(coef(model2)),
Lower_CI = exp(confint(model2)[,1]),
Upper_CI = exp(confint(model2)[,2])
)
## Waiting for profiling to be done...
## Waiting for profiling to be done...
round(result_table, 3)
## Estimate OR Lower_CI Upper_CI
## (Intercept) -1.168 0.311 0.015 6.390
## factor(sex)Male -1.836 0.159 0.079 0.304
## chol -0.007 0.993 0.987 0.998
## trestbps -0.022 0.978 0.962 0.994
## thalach 0.049 1.050 1.036 1.066
pred_prob <- predict(model2, type = "response")
pred_class <- ifelse(pred_prob > 0.5, 1, 0)
mean(pred_class == merged_df$target)
## [1] 0
table(Predicted = pred_class, Actual = merged_df$target)
## Actual
## Predicted No Disease Disease
## 0 89 32
## 1 49 133