# install package
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3

R Markdown

Introduction

Background: In recent years, heart disease is one of the leading causes of death. In this project, we want to analyze patient data to identify factors associated with heart disease and build a predictive model.

Research question: Can we predict heart disease based on patient characteristics?

Data Description

Variable Type Description
id Identifier Patient ID
age Numeric Age of the patient
sex Categorical Sex (0 = female, 1 = male)
chol Numeric Serum cholesterol level
trestbps Numeric Resting blood pressure
thalach Numeric Maximum heart rate achieved
target Binary Heart disease status (0 = no disease, 1 = disease)

Step 0: read in 3 datasets

dataset 1: demographics (age, sex)

demographics = read.csv("demographics.csv")
head(demographics)
##   id age sex
## 1  1  63   1
## 2  2  37   1
## 3  3  41   0
## 4  4  56   1
## 5  5  57   0
## 6  6  57   1
# check whether there is NA in all datasets
sum(is.na(demographics$id))
## [1] 0
any(duplicated(demographics$id))
## [1] FALSE

dataset 2: clinical (chol, trestbps, thalach)

clinical = read.csv("clinical.csv")
head(clinical)
##   id chol trestbps thalach
## 1  1  233      145     150
## 2  2  250      130     187
## 3  3  204      130     172
## 4  4  236      120     178
## 5  5  354      120     163
## 6  6  192      140     148
# check whether there is NA in all datasets
sum(is.na(clinical$id))
## [1] 0
any(duplicated(clinical$id))
## [1] FALSE

dataset 3: outcome (target)

outcome = read.csv("outcome.csv")
head(outcome)
##   id target
## 1  1      1
## 2  2      1
## 3  3      1
## 4  4      1
## 5  5      1
## 6  6      1
# check whether there is NA in all datasets
sum(is.na(outcome$id))
## [1] 0
any(duplicated(outcome$id))
## [1] FALSE

Step 1: merge these 3 datasets using common key “id”

merged_df <- merge(demographics, clinical, by = "id")
merged_df <- merge(merged_df, outcome, by = "id")


summary(merged_df)
##        id             age             sex              chol      
##  Min.   :  1.0   Min.   :29.00   Min.   :0.0000   Min.   :126.0  
##  1st Qu.: 76.5   1st Qu.:47.50   1st Qu.:0.0000   1st Qu.:211.0  
##  Median :152.0   Median :55.00   Median :1.0000   Median :240.0  
##  Mean   :152.0   Mean   :54.37   Mean   :0.6832   Mean   :246.3  
##  3rd Qu.:227.5   3rd Qu.:61.00   3rd Qu.:1.0000   3rd Qu.:274.5  
##  Max.   :303.0   Max.   :77.00   Max.   :1.0000   Max.   :564.0  
##     trestbps        thalach          target      
##  Min.   : 94.0   Min.   : 71.0   Min.   :0.0000  
##  1st Qu.:120.0   1st Qu.:133.5   1st Qu.:0.0000  
##  Median :130.0   Median :153.0   Median :1.0000  
##  Mean   :131.6   Mean   :149.6   Mean   :0.5446  
##  3rd Qu.:140.0   3rd Qu.:166.0   3rd Qu.:1.0000  
##  Max.   :200.0   Max.   :202.0   Max.   :1.0000
sum(is.na(merged_df))
## [1] 0

Step 2: Exploratory data analysis

For quantative variable: we use histogram / boxplot

quant_vars <- c("age", "chol", "trestbps", "thalach")

par(mfrow = c(2, 2))  # 2x2 grid

Histogram

par(mfrow = c(2, 2)) 
for (col in quant_vars) {
  hist(merged_df[[col]],
       main = col,
       xlab = col)
}

Boxplot

par(mfrow = c(2, 2)) 
for (col in quant_vars) {
  boxplot(merged_df[[col]],
       main = col,
       xlab = col)
}

For categorical variables: we can use bar plots/tables to visualize

table

table(merged_df$sex)
## 
##   0   1 
##  96 207
table(merged_df$target)
## 
##   0   1 
## 138 165

bar plot

par(mfrow = c(2, 1))
ggplot(merged_df, aes(x = factor(target))) +
  geom_bar(fill = "orange") +
  labs(
    title = "Distribution of target",
    x = "target",
    y = "Count"
  )

ggplot(merged_df, aes(x = factor(sex))) +
  geom_bar(fill = "steelblue") +
  labs(
    title = "Distribution of Gender",
    x = "Gender",
    y = "Count"
  )

Question we are interested in: Is Heart Disease More Common in Males or Females?

# First, do a visualization 
ggplot(merged_df, aes(x = factor(sex), fill = factor(target))) +
  geom_bar(position = "dodge") +
  labs(
    title = "Heart Disease by Gender",
    x = "Gender",
    y = "Count",
    fill = "Heart Disease"
  )

Now use the chi2 test: we are interested in

null hypothesis: sex and heart disease are independent
alternative: sex and heart disease are associate

Summary: The small p-value suggests that sex and heart disease are not independent, indicating a significant association.

Step 3: Model using logistic regression

model <- glm(
  target ~ age + factor(sex) + chol + trestbps + thalach ,
  data = merged_df,
  family = binomial
)

summary(model)
## 
## Call:
## glm(formula = target ~ age + factor(sex) + chol + trestbps + 
##     thalach, family = binomial, data = merged_df)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.344548   1.832916  -0.188   0.8509    
## age          -0.014453   0.017584  -0.822   0.4111    
## factor(sex)1 -1.860719   0.343297  -5.420 5.96e-08 ***
## chol         -0.007105   0.002762  -2.572   0.0101 *  
## trestbps     -0.020107   0.008366  -2.404   0.0162 *  
## thalach       0.046981   0.007700   6.101 1.05e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.64  on 302  degrees of freedom
## Residual deviance: 316.69  on 297  degrees of freedom
## AIC: 328.69
## 
## Number of Fisher Scoring iterations: 4

Conclusion: Sex, Serum cholesterol level, Resting blood pressure, Maximum heart rate achieved are associated with heart disease.

Step 4: further analysis

exp(coef(model))
##  (Intercept)          age factor(sex)1         chol     trestbps      thalach 
##    0.7085402    0.9856509    0.1555607    0.9929202    0.9800935    1.0481020

Interpretation:

Sex: Holding other variables constant, males have significantly lower odds of heart disease compared to females. Specifically, the odds of heart disease for males are about 84% lower than for females.

Cholesterol: For each one-unit increase in cholesterol, the odds of heart disease decrease by about 0.7%. This indicates a small but statistically significant negative association.