Do Smoking Status and CPK Levels Increase the Probability of Dying from Heart Failure?

A Statistical Analysis

Michael Bojczuk s3284442

Last updated: 01 October, 2020

Introduction

Problem Statement

Data

Data Cont.

# Recode the two indicator columns as labelled factors. Level 1 is listed
# first, so "Smoker" and "Dead" become the first level of their factors.
heart$smoking <- factor(
  heart$smoking,
  levels = c(1, 0),
  labels = c("Smoker", "Non-Smoker")
)
heart$DEATH_EVENT <- factor(
  heart$DEATH_EVENT,
  levels = c(1, 0),
  labels = c("Dead", "Alive")
)

Heart Failure based on Smoking

# Cross-tabulate patient outcome (rows) against smoking status (columns) and
# keep the raw counts, so the conditional probability below is derived from
# the same table that is displayed.
smoking_counts <- table(heart$DEATH_EVENT, heart$smoking)
addmargins(smoking_counts)
##        
##         Smoker Non-Smoker Sum
##   Dead      30         66  96
##   Alive     66        137 203
##   Sum       96        203 299
# Probability of dying from heart failure given that the patient is a smoker:
#   P(Dead | Smoker) = n(Dead and Smoker) / n(Smoker)
# Computed from the table rather than from hand-typed counts, which did not
# match the figures printed above.
round(smoking_counts["Dead", "Smoker"] / sum(smoking_counts[, "Smoker"]), 2)
## [1] 0.31

Heart Failure based on Smoking Cont.

# Column-wise proportions: margin = 2 normalises each smoking-status column,
# giving the share of "Dead" vs "Alive" patients within each smoking group.
table_1 <- prop.table(table(heart$DEATH_EVENT, heart$smoking), margin = 2)
# Grouped bar plot of the cross-tabulated proportions.
# args.legend is supplied as a list so that horiz stays a logical value;
# building it with c() coerced every element to a character string.
barplot(
  table_1,
  main = "Patient After Heart Failure Status by Smoking Status",
  ylab = "Proportion within Smoking Status",
  xlab = "Smoking Status",
  ylim = c(0, 0.8),
  legend.text = rownames(table_1),
  beside = TRUE,
  args.legend = list(x = "top", horiz = TRUE, title = "Patient Status")
)
grid()

Dealing with Outliers for CPK Levels

# Box plot of CPK level split by patient outcome, used to look for outliers.
boxplot(
  creatinine_phosphokinase ~ DEATH_EVENT,
  data = heart,
  xlab = "Patient Status",
  ylab = "CPK Level per mcg/L"
)

# Based on the box plot, keep only rows whose CPK value is below 3800 mcg/L.
heart <- filter(heart, creatinine_phosphokinase < 3800)

Descriptive Statistics on CPK Levels

# Descriptive statistics of CPK level for each patient outcome.
# TRUE is written out in place of T, which is an ordinary variable that can
# be reassigned.
table1 <- heart %>%
  group_by(DEATH_EVENT) %>%
  summarise(
    Min = min(creatinine_phosphokinase, na.rm = TRUE),
    Q1 = quantile(creatinine_phosphokinase, probs = 0.25, na.rm = TRUE),
    Median = median(creatinine_phosphokinase, na.rm = TRUE),
    Mean = mean(creatinine_phosphokinase, na.rm = TRUE),
    Q3 = quantile(creatinine_phosphokinase, probs = 0.75, na.rm = TRUE),
    Max = max(creatinine_phosphokinase, na.rm = TRUE),
    SD = sd(creatinine_phosphokinase, na.rm = TRUE),
    # Number of rows in the group; rows with a missing CPK value are counted.
    Length = n(),
    Missing = sum(is.na(creatinine_phosphokinase))
  )
kable(table1)
DEATH_EVENT Min Q1 Median Mean Q3 Max SD Length Missing
Dead 23 127.25 249.5 423.1522 582 2442 467.5249 92 0
Alive 30 107.50 238.0 479.5800 582 2794 569.3690 200 0

Descriptive Statistics on CPK Levels Cont.

# Two panels side by side: one CPK histogram per patient outcome.
par(mfrow = c(1, 2))
# Split the data by outcome.
deceased <- heart %>% filter(DEATH_EVENT == "Dead")
alive <- heart %>% filter(DEATH_EVENT == "Alive")
# Deceased patients
hist(
  deceased$creatinine_phosphokinase,
  xlab = "Creatinine Phosphokinase Levels",
  breaks = 10,
  main = "Deceased Patients from Heart Failure"
)
# The line-width graphical parameter is spelled lwd; "lw" is not a
# graphical parameter name.
abline(v = mean(deceased$creatinine_phosphokinase), col = "red", lwd = 2)
# The annotation is built from the data so it cannot drift from what is
# plotted if the outlier filter or the data set changes.
text(
  1500, 25,
  sprintf(
    "Mean = %.2f\nSD = %.2f",
    mean(deceased$creatinine_phosphokinase),
    sd(deceased$creatinine_phosphokinase)
  ),
  col = "black", cex = 0.75
)
# Surviving patients
hist(
  alive$creatinine_phosphokinase,
  xlab = "Creatinine Phosphokinase Levels",
  breaks = 10,
  main = "Alive Patients from Heart Failure"
)
abline(v = mean(alive$creatinine_phosphokinase), col = "red", lwd = 2)
text(
  1600, 56,
  sprintf(
    "Mean = %.2f\nSD = %.2f",
    mean(alive$creatinine_phosphokinase),
    sd(alive$creatinine_phosphokinase)
  ),
  col = "black", cex = 0.75
)
# Legend for the red mean line (drawn on the most recent panel).
legend("topright", c("Mean"), fill = c("red"), cex = 0.75)

Descriptive Statistics on CPK Levels Cont.

# Side-by-side normal Q-Q plots of CPK level for each outcome group.
par(mfrow = c(1, 2))
# The distribution argument is written in full instead of relying on partial
# matching of `dist`.
# NOTE(review): assumes qqPlot() is car::qqPlot -- confirm against the
# library() calls in the setup chunk.
qqPlot(
  deceased$creatinine_phosphokinase,
  distribution = "norm",
  ylab = "CPK levels Deceased"
)
## [1] 74 73
qqPlot(
  alive$creatinine_phosphokinase,
  distribution = "norm",
  ylab = "CPK levels Alive"
)

## [1] 133 182

Hypothesis Testing

Hypothesis Testing Cont.

# Levene's test for equality of CPK variance between the two outcome groups.
leveneTest(creatinine_phosphokinase ~ DEATH_EVENT, data = heart)
# Two-sided, two-sample t-test of mean CPK by outcome with pooled variance
# (var.equal = TRUE). TRUE is written out in place of the reassignable T.
t.test(
  creatinine_phosphokinase ~ DEATH_EVENT,
  data = heart,
  var.equal = TRUE,
  alternative = "two.sided"
)
## 
##  Two Sample t-test
## 
## data:  creatinine_phosphokinase by DEATH_EVENT
## t = -0.83029, df = 290, p-value = 0.4071
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -190.18782   77.33216
## sample estimates:
##  mean in group Dead mean in group Alive 
##            423.1522            479.5800

Hypothesis Testing Conclusion

Discussion

References