library(readr)
mydata <- read_csv("heart.csv")
## Rows: 1025 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (14): age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpea...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(mydata)
## # A tibble: 6 × 14
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 52 1 0 125 212 0 1 168 0 1 2
## 2 53 1 0 140 203 1 0 155 1 3.1 0
## 3 70 1 0 145 174 0 1 125 1 2.6 0
## 4 61 1 0 148 203 0 1 161 0 0 2
## 5 62 0 0 138 294 1 1 106 0 1.9 1
## 6 58 0 0 100 248 0 0 122 0 1 1
## # ℹ 3 more variables: ca <dbl>, thal <dbl>, target <dbl>
colnames(mydata) [1] <- "Age"
colnames(mydata) [2] <- "Sex"
colnames(mydata) [3] <- "Chest pain"
colnames(mydata) [4] <- "Resting blood pressure"
colnames(mydata) [5] <- "Cholestoral"
colnames(mydata) [6] <- "Fasting blood sugar"
colnames(mydata) [7] <- "Resting electrocardiographic results"
colnames(mydata) [8] <- "Max heart rate"
colnames(mydata) [9] <- "Exercise induced angina"
colnames(mydata) [10] <- "Oldpeak"
colnames(mydata) [11] <- "Slope of the peak"
colnames(mydata) [12] <- "Number of major vessels"
colnames(mydata) [13] <- "Thal"
colnames(mydata) [14] <- "Target"
head(mydata)
## # A tibble: 6 × 14
## Age Sex `Chest pain` `Resting blood pressure` Cholestoral
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 52 1 0 125 212
## 2 53 1 0 140 203
## 3 70 1 0 145 174
## 4 61 1 0 148 203
## 5 62 0 0 138 294
## 6 58 0 0 100 248
## # ℹ 9 more variables: `Fasting blood sugar` <dbl>,
## # `Resting electrocardiographic results` <dbl>, `Max heart rate` <dbl>,
## # `Exercise induced angina` <dbl>, Oldpeak <dbl>, `Slope of the peak` <dbl>,
## # `Number of major vessels` <dbl>, Thal <dbl>, Target <dbl>
Unit of observation is one patient.
Sample size is 1025.
Explanation of variables:
The names and social security numbers of the patients were recently removed from the database, replaced with dummy values.
mydata$`Sex` <- factor(mydata$`Sex`, levels = c(0, 1), labels = c("Female", "Male"))
mydata$`Chest pain` <- factor(mydata$`Chest pain`, levels = c(0, 1, 2, 3), labels = c("Type 0", "Type 1", "Type 2", "Type 3"))
mydata$`Fasting blood sugar` <- factor(mydata$`Fasting blood sugar`, levels = c(0, 1), labels = c("False", "True"))
mydata$`Resting electrocardiographic results` <- factor(mydata$`Resting electrocardiographic results`,
levels = c(0, 1, 2), labels = c("Normal", "ST-T wave abnormality", "Probable/Definite LVH"))
mydata$`Exercise induced angina` <- factor(mydata$`Exercise induced angina`, levels = c(0, 1), labels = c("No", "Yes"))
mydata$`Slope of the peak` <- factor(mydata$`Slope of the peak`, levels = c(0, 1, 2), labels = c("Upsloping", "Flat", "Downsloping"))
mydata$`Number of major vessels` <- factor(mydata$`Number of major vessels`, levels = c(0, 1, 2, 3))
mydata$Thal <- factor(mydata$Thal, levels = c(1, 2, 3), labels = c("Normal", "Fixed Defect", "Reversible Defect"))
mydata$Target <- factor(mydata$Target, levels = c(0, 1), labels = c("No Disease", "Disease"))
head(mydata)
## # A tibble: 6 × 14
## Age Sex `Chest pain` `Resting blood pressure` Cholestoral
## <dbl> <fct> <fct> <dbl> <dbl>
## 1 52 Male Type 0 125 212
## 2 53 Male Type 0 140 203
## 3 70 Male Type 0 145 174
## 4 61 Male Type 0 148 203
## 5 62 Female Type 0 138 294
## 6 58 Female Type 0 100 248
## # ℹ 9 more variables: `Fasting blood sugar` <fct>,
## # `Resting electrocardiographic results` <fct>, `Max heart rate` <dbl>,
## # `Exercise induced angina` <fct>, Oldpeak <dbl>, `Slope of the peak` <fct>,
## # `Number of major vessels` <fct>, Thal <fct>, Target <fct>
psych::describe.by(mydata)
## Warning in psych::describe.by(mydata): describe.by is deprecated. Please use
## the describeBy function
## Warning in describeBy(x = x, group = group, mat = mat, type = type, ...): no
## grouping variable requested
## vars n mean sd median trimmed
## Age 1 1025 54.43 9.07 56.0 54.66
## Sex* 2 1025 1.70 0.46 2.0 1.74
## Chest pain* 3 1025 1.94 1.03 2.0 1.83
## Resting blood pressure 4 1025 131.61 17.52 130.0 130.39
## Cholestoral 5 1025 246.00 51.59 240.0 243.26
## Fasting blood sugar* 6 1025 1.15 0.36 1.0 1.06
## Resting electrocardiographic results* 7 1025 1.53 0.53 2.0 1.52
## Max heart rate 8 1025 149.11 23.01 152.0 150.40
## Exercise induced angina* 9 1025 1.34 0.47 1.0 1.30
## Oldpeak 10 1025 1.07 1.18 0.8 0.89
## Slope of the peak* 11 1025 2.39 0.62 2.0 2.45
## Number of major vessels* 12 1007 1.70 0.94 1.0 1.54
## Thal* 13 1018 2.34 0.59 2.0 2.38
## Target* 14 1025 1.51 0.50 2.0 1.52
## mad min max range skew kurtosis se
## Age 8.90 29 77.0 48.0 -0.25 -0.53 0.28
## Sex* 0.00 1 2.0 1.0 -0.85 -1.28 0.01
## Chest pain* 1.48 1 4.0 3.0 0.53 -1.15 0.03
## Resting blood pressure 14.83 94 200.0 106.0 0.74 0.97 0.55
## Cholestoral 48.93 126 564.0 438.0 1.07 3.96 1.61
## Fasting blood sugar* 0.00 1 2.0 1.0 1.97 1.87 0.01
## Resting electrocardiographic results* 0.00 1 3.0 2.0 0.18 -1.31 0.02
## Max heart rate 23.72 71 202.0 131.0 -0.51 -0.10 0.72
## Exercise induced angina* 0.00 1 2.0 1.0 0.69 -1.52 0.01
## Oldpeak 1.19 0 6.2 6.2 1.21 1.29 0.04
## Slope of the peak* 1.48 1 3.0 2.0 -0.48 -0.65 0.02
## Number of major vessels* 0.00 1 4.0 3.0 1.13 0.10 0.03
## Thal* 0.00 1 3.0 2.0 -0.27 -0.67 0.02
## Target* 0.00 1 2.0 1.0 -0.05 -2.00 0.02
summary(mydata [ , c(1,4,5)])
## Age Resting blood pressure Cholestoral
## Min. :29.00 Min. : 94.0 Min. :126
## 1st Qu.:48.00 1st Qu.:120.0 1st Qu.:211
## Median :56.00 Median :130.0 Median :240
## Mean :54.43 Mean :131.6 Mean :246
## 3rd Qu.:61.00 3rd Qu.:140.0 3rd Qu.:275
## Max. :77.00 Max. :200.0 Max. :564
The minimum age is 29.
The average resting blood pressure is 131,6.
Fifty percent of patients have less than 240 and fifty percent of patients have more than 240 cholestoral.
The average Cholestoral in general population is 200 and the median 195 (source: chatgpt). I am interested whether the average cholestoral is different in this data frame.
library(ggplot2)
ggplot(mydata, aes(x = Cholestoral)) +
geom_histogram(binwidth = 50, colour = "black", fill = "salmon") +
ylab("Frequency") +
xlab("Cholestoral in mg/dl")
Looking at the graph, the distribution does not look normal but asymetrical to the right, we will make Shapiro-Wilk test to be sure.
shapiro.test(mydata$Cholestoral)
##
## Shapiro-Wilk normality test
##
## data: mydata$Cholestoral
## W = 0.95022, p-value < 2.2e-16
We can reject H0 at p<0.001.
#Despite the normality requirement violation, we could still try the parametric test.
t.test(mydata$Cholestoral,
mu = 200,
alternative = "two.sided")
##
## One Sample t-test
##
## data: mydata$Cholestoral
## t = 28.545, df = 1024, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 200
## 95 percent confidence interval:
## 242.8378 249.1622
## sample estimates:
## mean of x
## 246
We can reject H0 at p<0.001.
#install.packages("effectsize")
library(effectsize)
effectsize::cohens_d(mydata$Cholestoral, mu = 200)
## Cohen's d | 95% CI
## ------------------------
## 0.89 | [0.82, 0.96]
##
## - Deviation from a difference of 200.
interpret_cohens_d(0.89, rules = "sawilowsky2009")
## [1] "large"
## (Rules: sawilowsky2009)
Based on the t-test we can conclude that the average cholestoral is different in general population than on our sample data (p<0.001). The effect size is large (r=0.89).
median(mydata$Cholestoral)
## [1] 240
The median looks different than average, I will check with a formal test.
wilcox.test(mydata$Cholestoral,
mu = 195,
correct = FALSE)
##
## Wilcoxon signed rank test
##
## data: mydata$Cholestoral
## V = 491519, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 195
We can reject H0 at p<0.001.
library(effectsize)
effectsize(wilcox.test(mydata$Cholestoral,
mu = 195,
correct = FALSE))
## r (rank biserial) | 95% CI
## --------------------------------
## 0.88 | [0.86, 0.90]
##
## - Deviation from a difference of 195.
interpret_rank_biserial(0.88, rules = "funder2019")
## [1] "very large"
## (Rules: funder2019)
We can conclude that the average median is different in the general population than on our sample (p<0.001). The effect is very large (r=0.88).
The assumptions of hypothesis about the arithmetic mean is that the variable is numeric, which it is. The other assumption is that the variable is normally distributed, which we proved with the Shapiro-Wilk test that it is not. For this reason the non-parametric test is better, than the parametric one.
Is there a correlation between age and maximum heart rate?
library(car)
scatterplotMatrix(mydata[, c("Age", "Max heart rate")], smooth=FALSE)
On the scatterplot we see negative correlation between variables Age and Max heart rate.
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(mydata, columns = c("Age","Max heart rate"))
library(Hmisc)
rcorr(as.matrix(mydata[, c("Age", "Max heart rate")]),
type ="pearson")
## Age Max heart rate
## Age 1.00 -0.39
## Max heart rate -0.39 1.00
##
## n= 1025
##
##
## P
## Age Max heart rate
## Age 0
## Max heart rate 0
cor(mydata$Age, mydata$`Max heart rate`,
method = "pearson",
use = "complete.obs")
## [1] -0.3902271
I can conclude that there is negative semi-strong correlation between variables Age and Max heart rate.
What is the association between Gender and Target?
Assumptions of chi square: 1. Independence of observations 2. Expected frequencies are greater than 5
chi_square <- chisq.test(mydata$Sex, mydata$Target,
correct = TRUE)
chi_square
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mydata$Sex and mydata$Target
## X-squared = 78.863, df = 1, p-value < 2.2e-16
addmargins(chi_square$observed)
## mydata$Target
## mydata$Sex No Disease Disease Sum
## Female 86 226 312
## Male 413 300 713
## Sum 499 526 1025
addmargins(round(chi_square$expected, 2))
## mydata$Target
## mydata$Sex No Disease Disease Sum
## Female 151.89 160.11 312
## Male 347.11 365.89 713
## Sum 499.00 526.00 1025
We reject H0 at p<0.001.
If there is no association, we would expect 151.89 female to not have a disease. But in reality 86 females do not have disease.
Here I used correct=TRUE, because of Yates´s continuity correction, because the table size is 2x2.
round(chi_square$res, 2)
## mydata$Target
## mydata$Sex No Disease Disease
## Female -5.35 5.21
## Male 3.54 -3.44
In the combination No Disease and Female, there is less than expected number of patient (α=0.001)
library(effectsize)
effectsize::cramers_v(mydata$Sex, mydata$Target)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.28 | [0.23, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
interpret_cramers_v(0.55)
## [1] "very large"
## (Rules: funder2019)
oddsratio(mydata$Target, mydata$Sex)
## Odds ratio | 95% CI
## -------------------------
## 0.28 | [0.21, 0.37]
Odds ratio = 1 / 0.28 = 3.57
interpret_oddsratio(3.57)
## [1] "medium"
## (Rules: chen2010)
CONCLUSION: We reject H0 (p<0.001), and assume the presence of association, with the very large effect size of 0.28 by Cramers statistic, and medium effect size of 3.57 by Odds ratio.