#Research Question 1: Is there an association between an individual's physical activity level and cardiovascular disease?

#Research Question 2: Is there a correlation between weight and systolic blood pressure (ap_hi)?

# Load the heart data set: comma-separated, with a header row and "." as the
# decimal mark.
mydata <- read.table(
  file = "~/NADJA ROGANOVIC/heart_data.csv",
  header = TRUE,
  sep = ",",
  dec = "."
)

# Drop the columns at positions 3-5 and 8-12, which are not needed for the
# two research questions.
mydata <- mydata[, -c(3:5, 8:12)]

# Preview the first rows of the reduced data frame.
head(mydata)
##   index id weight ap_hi active cardio
## 1     0  0     62   110      1      0
## 2     1  1     85   140      1      1
## 3     2  2     64   130      0      1
## 4     3  3     82   150      1      1
## 5     4  4     56   100      0      0
## 6     5  8     67   120      0      0

##Description

# Fix the RNG seed so the same random subset of 400 rows is drawn on each run.
set.seed(1)
mydata <- mydata[sample.int(nrow(mydata), size = 400), ]

# Add labelled factor copies of the 0/1 indicator columns (0 = "No", 1 = "Yes").
mydata[["activeF"]] <- factor(
  mydata[["active"]],
  levels = c(0, 1),
  labels = c("No", "Yes")
)
mydata[["cardioF"]] <- factor(
  mydata[["cardio"]],
  levels = c(0, 1),
  labels = c("No", "Yes")
)

# Preview the sampled rows together with the new factor columns.
head(mydata)
##       index    id weight ap_hi active cardio activeF cardioF
## 24388 24387 34842     70   140      1      1     Yes     Yes
## 59521 59520 84974     53    90      1      0     Yes      No
## 43307 43306 61873     86   160      1      1     Yes     Yes
## 69586 69585 99392     51   110      1      0     Yes      No
## 11571 11570 16539     93   120      1      0     Yes      No
## 25173 25172 35941     61   130      1      0     Yes      No
# Store the two measurement columns as numeric vectors.
mydata[c("weight", "ap_hi")] <-
  lapply(mydata[c("weight", "ap_hi")], as.numeric)

# Store the indicator columns and their labelled copies as factors.
mydata[c("active", "activeF", "cardio", "cardioF")] <-
  lapply(mydata[c("active", "activeF", "cardio", "cardioF")], as.factor)

# Descriptive statistics for every column of the sample.
summary(mydata)
##      index             id            weight           ap_hi       active 
##  Min.   :  132   Min.   :  177   Min.   : 44.00   Min.   : 11.0   0: 76  
##  1st Qu.:17323   1st Qu.:24750   1st Qu.: 65.00   1st Qu.:120.0   1:324  
##  Median :33152   Median :47374   Median : 74.00   Median :120.0          
##  Mean   :35427   Mean   :50584   Mean   : 75.65   Mean   :126.4          
##  3rd Qu.:55085   3rd Qu.:78579   3rd Qu.: 84.00   3rd Qu.:140.0          
##  Max.   :69983   Max.   :99974   Max.   :140.00   Max.   :200.0          
##  cardio  activeF   cardioF  
##  0:200   No : 76   No :200  
##  1:200   Yes:324   Yes:200  
##                             
##                             
##                             
## 
# Distribution of weight in the sample.
hist(
  x = mydata[["weight"]],
  main = "Histogram of Weight",
  xlab = "Weight",
  ylab = "Frequency",
  col = "lightblue"
)

# Distribution of systolic blood pressure (ap_hi) in the sample.
hist(
  x = mydata[["ap_hi"]],
  main = "Histogram of Systolic blood pressure",
  xlab = "Systolic blood pressure",
  ylab = "Frequency",
  col = "lightgreen"
)

##Research Question 1: Is there an association between an individual's physical activity level and cardiovascular disease?

#Chi - Square Test

##Assumptions

# Pearson chi-squared test of independence between physical activity and
# cardiovascular disease, with the Yates continuity correction requested
# explicitly. The result is kept so its tables can be inspected afterwards.
results <- chisq.test(
  x = mydata$activeF,
  y = mydata$cardioF,
  correct = TRUE
)
results
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mydata$activeF and mydata$cardioF
## X-squared = 1.9656, df = 1, p-value = 0.1609

##Hypothesis:

Based on the sample data, we cannot reject the null hypothesis (p = 0.161). There is not enough evidence to conclude that an individual's physical activity level and cardiovascular disease are associated.

# Observed counts with row and column totals.
addmargins(results$observed)
##               mydata$cardioF
## mydata$activeF  No Yes Sum
##            No   32  44  76
##            Yes 168 156 324
##            Sum 200 200 400
# Counts expected under independence.
round(results$expected, 2)
##               mydata$cardioF
## mydata$activeF  No Yes
##            No   38  38
##            Yes 162 162
# Pearson residuals. The component is spelled out in full instead of relying
# on `$` partial matching of `res`.
round(results$residuals, 2)
##               mydata$cardioF
## mydata$activeF    No   Yes
##            No  -0.97  0.97
##            Yes  0.47 -0.47
# Expected counts with row and column totals.
addmargins(round(results$expected, 2))
##               mydata$cardioF
## mydata$activeF  No Yes Sum
##            No   38  38  76
##            Yes 162 162 324
##            Sum 200 200 400
# Observed counts as proportions of the whole sample.
addmargins(round(prop.table(results$observed), 3))
##               mydata$cardioF
## mydata$activeF   No  Yes  Sum
##            No  0.08 0.11 0.19
##            Yes 0.42 0.39 0.81
##            Sum 0.50 0.50 1.00
# Row proportions: distribution of cardio within each activity level.
addmargins(round(prop.table(results$observed, 1), 3), 2)
##               mydata$cardioF
## mydata$activeF    No   Yes   Sum
##            No  0.421 0.579 1.000
##            Yes 0.519 0.481 1.000
# Column proportions: distribution of activity within each cardio group.
addmargins(round(prop.table(results$observed, 2), 3), 1)
##               mydata$cardioF
## mydata$activeF   No  Yes
##            No  0.16 0.22
##            Yes 0.84 0.78
##            Sum 1.00 1.00
library(effectsize)
## Warning: package 'effectsize' was built under R version 4.3.2
# Effect size for the activity/cardio association. The result is stored so the
# interpretation below uses the computed estimate, not a hand-copied number.
cramers_v_result <- effectsize::cramers_v(mydata$activeF, mydata$cardioF)
cramers_v_result
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.06              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
# The estimate is taken as the first column of the returned table, matching
# the printed layout above (NOTE(review): confirm for the installed
# effectsize version).
interpret_cramers_v(cramers_v_result[[1]])
## [1] "very small"
## (Rules: funder2019)

##Fisher's Exact Probability Test of Independence

* We use this exact, non-parametric test as an alternative in case any of the chi-square test's assumptions cannot be met.

# Fisher's exact test of independence between activity and cardio. The result
# is stored so the estimated odds ratio can be interpreted directly.
fisher_result <- fisher.test(mydata$activeF, mydata$cardioF)
fisher_result
## 
##  Fisher's Exact Test for Count Data
## 
## data:  mydata$activeF and mydata$cardioF
## p-value = 0.1606
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.3933268 1.1519814
## sample estimates:
## odds ratio 
##  0.6759953
# Interpret the estimated odds ratio itself instead of a truncated value
# copied from the printed output; unname() drops the "odds ratio" label.
interpret_oddsratio(unname(fisher_result$estimate))
## [1] "very small"
## (Rules: chen2010)

The effect size is very small, meaning that in this sample physical activity has only a very small association with cardiovascular disease.

#Research Question 2: Is there a correlation between weight and systolic blood pressure (ap_hi)?

Assumptions

library(car)
## Loading required package: carData
# Visual linearity check for the two variables of this research question.
# Columns are selected by name: positions 4 and 5 of mydata are ap_hi and
# active, so the positional index plotted the wrong pair of variables.
scatterplotMatrix(mydata[, c("weight", "ap_hi")], smooth = FALSE)

* For this homework, we will assume a linear relationship between the variables.

# Pearson correlation test between systolic blood pressure and weight.
# `exact` only affects the Kendall and Spearman methods, and `use` is not a
# cor.test() argument (it was silently absorbed by `...`), so both were
# dropped; the test result is unchanged.
# NOTE(review): summary(mydata) reports a minimum ap_hi of 11, which looks
# implausible for a systolic pressure -- confirm whether such rows should be
# excluded, since Pearson's r is sensitive to outliers.
cor.test(mydata$ap_hi, mydata$weight, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  mydata$ap_hi and mydata$weight
## t = 4.2938, df = 398, p-value = 2.209e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1147251 0.3022264
## sample estimates:
##       cor 
## 0.2104099

Based on the sample data, we reject the null hypothesis (p < 0.001): there is a statistically significant positive, though weak (r = 0.21), correlation between weight and systolic blood pressure.