# Load the dataset.
# Blank cells are read as NA (in addition to the literal "NA") so that missing
# categorical answers do not become a separate "" category in tables, tests
# and models. The results recorded further down in this file (chi-squared
# df = 2 for Smoking, df = 4 for Diabetes x Gender, and an lm() baseline level
# that sorts before "High") indicate such an extra level was present; those
# recorded outputs predate this change and will differ on re-run.
heart <- read.csv(
  "/cloud/project/dataset/heart_disease.csv",
  na.strings = c("", "NA")
)
# Preview the first six rows
head(heart)
## Age Gender Blood.Pressure Cholesterol.Level Exercise.Habits Smoking
## 1 56 Male 153 155 High Yes
## 2 69 Female 146 286 High No
## 3 46 Male 126 216 Low No
## 4 32 Female 122 293 High Yes
## 5 60 Male 166 242 Low Yes
## 6 25 Male 152 257 Low Yes
## Family.Heart.Disease Diabetes BMI High.Blood.Pressure
## 1 Yes No 24.99159 Yes
## 2 Yes Yes 25.22180 No
## 3 No No 29.85545 No
## 4 Yes No 24.13048 Yes
## 5 Yes Yes 20.48629 Yes
## 6 No No 28.14468 No
## Low.HDL.Cholesterol High.LDL.Cholesterol Alcohol.Consumption Stress.Level
## 1 Yes No High Medium
## 2 Yes No Medium High
## 3 Yes Yes Low Low
## 4 No Yes Low High
## 5 No No Low High
## 6 No No Low Medium
## Sleep.Hours Sugar.Consumption Triglyceride.Level Fasting.Blood.Sugar
## 1 7.633228 Medium 342 NA
## 2 8.744034 Medium 133 157
## 3 4.440440 Low 393 92
## 4 5.249405 High 293 94
## 5 7.030971 High 263 154
## 6 5.504876 Low 126 91
## CRP.Level Homocysteine.Level Heart.Disease.Status
## 1 12.969246 12.387250 No
## 2 9.355389 19.298875 No
## 3 12.709873 11.230926 No
## 4 12.509046 5.961958 No
## 5 10.381259 8.153887 No
## 6 4.297575 10.815983 No
# load R packages
# Install tidyverse only when it is missing, so the script does not
# re-download and reinstall packages every time it is run.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# dplyr and ggplot2 are installed and attached as part of the core tidyverse
# (see the startup message above), so they need no separate install. The
# explicit library() calls are kept; they are harmless no-ops at this point.
library(dplyr)
library(ggplot2)
# Analysis question 1: Summarize the central tendencies, dispersions, and distributions of individual variables.
# Step 1: conduct exploratory analysis
# summary() reports only Length/Class/Mode for character columns, which says
# nothing about how the categorical variables are distributed. Tabulate each
# character column first (missing values shown when present), then print the
# overall summary, which covers the numeric columns.
print(lapply(Filter(is.character, heart), table, useNA = "ifany"))
summary(heart)
## Age Gender Blood.Pressure Cholesterol.Level
## Min. :18.0 Length:10000 Min. :120.0 Min. :150.0
## 1st Qu.:34.0 Class :character 1st Qu.:134.0 1st Qu.:187.0
## Median :49.0 Mode :character Median :150.0 Median :226.0
## Mean :49.3 Mean :149.8 Mean :225.4
## 3rd Qu.:65.0 3rd Qu.:165.0 3rd Qu.:263.0
## Max. :80.0 Max. :180.0 Max. :300.0
## NA's :29 NA's :19 NA's :30
## Exercise.Habits Smoking Family.Heart.Disease Diabetes
## Length:10000 Length:10000 Length:10000 Length:10000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BMI High.Blood.Pressure Low.HDL.Cholesterol High.LDL.Cholesterol
## Min. :18.00 Length:10000 Length:10000 Length:10000
## 1st Qu.:23.66 Class :character Class :character Class :character
## Median :29.08 Mode :character Mode :character Mode :character
## Mean :29.08
## 3rd Qu.:34.52
## Max. :40.00
## NA's :22
## Alcohol.Consumption Stress.Level Sleep.Hours Sugar.Consumption
## Length:10000 Length:10000 Min. : 4.001 Length:10000
## Class :character Class :character 1st Qu.: 5.450 Class :character
## Mode :character Mode :character Median : 7.003 Mode :character
## Mean : 6.991
## 3rd Qu.: 8.532
## Max. :10.000
## NA's :25
## Triglyceride.Level Fasting.Blood.Sugar CRP.Level Homocysteine.Level
## Min. :100.0 Min. : 80.0 Min. : 0.003647 Min. : 5.000
## 1st Qu.:176.0 1st Qu.: 99.0 1st Qu.: 3.674126 1st Qu.: 8.723
## Median :250.0 Median :120.0 Median : 7.472164 Median :12.409
## Mean :250.7 Mean :120.1 Mean : 7.472201 Mean :12.456
## 3rd Qu.:326.0 3rd Qu.:141.0 3rd Qu.:11.255592 3rd Qu.:16.141
## Max. :400.0 Max. :160.0 Max. :14.997087 Max. :19.999
## NA's :26 NA's :22 NA's :26 NA's :20
## Heart.Disease.Status
## Length:10000
## Class :character
## Mode :character
##
##
##
##
# Step 2: show the frequency distribution of Age as a histogram with
# 5-year-wide bins. Rows with a missing Age are dropped by ggplot2 with a
# warning.
ggplot(heart, aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "lightblue", color = "black")
## Warning: Removed 29 rows containing non-finite outside the scale range
## (`stat_bin()`).

# What is the IQR for Blood Pressure, and does it flag any outliers?
# Quartiles and IQR, ignoring missing readings
q1_BP <- quantile(heart$Blood.Pressure, 0.25, na.rm = TRUE)
q3_BP <- quantile(heart$Blood.Pressure, 0.75, na.rm = TRUE)
BP_iqr <- IQR(heart$Blood.Pressure, na.rm = TRUE)
# Report the IQR itself, since that is the question being asked
print(BP_iqr)
# Lower and upper bounds: 1.5 * IQR beyond the quartiles
lower_bound <- q1_BP - 1.5 * BP_iqr
upper_bound <- q3_BP + 1.5 * BP_iqr
# Identify outliers. Comparing NA with the bounds yields NA, and subsetting
# with an NA mask returns NA, so missing readings used to be printed as if
# they were outliers. The is.na() guard makes the mask FALSE for those rows.
bp_outliers <- !is.na(heart$Blood.Pressure) &
  (heart$Blood.Pressure < lower_bound | heart$Blood.Pressure > upper_bound)
# Print actual outlier values
bp_outlier_values <- heart$Blood.Pressure[bp_outliers]
print(bp_outlier_values)
# Expected result: an empty vector. With the quartiles reported by summary()
# (134 and 165) the bounds are about 87.5 and 211.5, while the observed values
# range from 120 to 180. The 19 NA entries printed by the earlier version were
# the rows with a missing Blood.Pressure, not outliers.
# Are there outliers in Triglyceride Levels that could indicate measurement errors or unique cases?
# Step 1: quartiles and IQR, ignoring missing readings
q1_triglyceride <- quantile(heart$Triglyceride.Level, 0.25, na.rm = TRUE)
q3_triglyceride <- quantile(heart$Triglyceride.Level, 0.75, na.rm = TRUE)
triglyceride_iqr <- IQR(heart$Triglyceride.Level, na.rm = TRUE)
# Step 2: lower and upper bounds, 1.5 * IQR beyond the quartiles
lower_bounds <- q1_triglyceride - 1.5 * triglyceride_iqr
upper_bounds <- q3_triglyceride + 1.5 * triglyceride_iqr
# Step 3: identify outliers. Comparing NA with the bounds yields NA, and
# subsetting with an NA mask returns NA, so missing readings used to be
# printed as if they were outliers. The is.na() guard excludes those rows.
triglyceride_outliers <- !is.na(heart$Triglyceride.Level) &
  (heart$Triglyceride.Level < lower_bounds |
    heart$Triglyceride.Level > upper_bounds)
# Step 4: print outlier values
outlier_values <- heart$Triglyceride.Level[triglyceride_outliers]
print(outlier_values)
# Expected result: an empty vector. With the quartiles reported by summary()
# (176 and 326) the bounds are about -49 and 551, while the observed values
# range from 100 to 400. The 26 NA entries printed by the earlier version were
# the rows with a missing Triglyceride.Level, not outliers.
# Is there a significant correlation between Age and Blood Pressure?
# cor() returns only the coefficient, which cannot answer a question about
# significance. cor.test() adds the test statistic, p-value and confidence
# interval; it drops pairs with a missing value, matching use = "complete.obs".
age_bp_test <- cor.test(heart$Age, heart$Blood.Pressure)
print(age_bp_test)
# Keep the bare Pearson coefficient available under its original name
age_bp_corr <- cor(heart$Age, heart$Blood.Pressure, use = "complete.obs")
print(age_bp_corr)
## [1] -0.02078085
# How are BMI and Cholesterol Levels related?
# Pearson correlation, computed only over rows where both values are present
correlation_result <- with(
  heart,
  cor(BMI, Cholesterol.Level, use = "complete.obs")
)
print(correlation_result)
## [1] 0.02200238
# Is there an association between Smoking Status and Heart Disease Status?
# Pearson chi-squared test of independence on the two categorical columns
chi_results <- chisq.test(
  x = heart$Smoking,
  y = heart$Heart.Disease.Status
)
print(chi_results)
##
## Pearson's Chi-squared test
##
## data: heart$Smoking and heart$Heart.Disease.Status
## X-squared = 0.31118, df = 2, p-value = 0.8559
# NOTE(review): df = 2 means one of the two variables has three levels, not
# two. Check the columns for a blank or otherwise unexpected category.
# Visualize the counts: one bar per heart-disease status within each smoking group
ggplot(heart, aes(x = Smoking, fill = Heart.Disease.Status)) +
  geom_bar(position = "dodge")

# Does the prevalence of Diabetes differ by Gender?
# Pearson chi-squared test of independence on the two categorical columns
diabetes_gender <- chisq.test(
  x = heart$Diabetes,
  y = heart$Gender
)
## Warning in chisq.test(heart$Diabetes, heart$Gender): Chi-squared approximation
## may be incorrect
print(diabetes_gender)
##
## Pearson's Chi-squared test
##
## data: heart$Diabetes and heart$Gender
## X-squared = 1.5363, df = 4, p-value = 0.8202
# NOTE(review): df = 4 corresponds to a 3 x 3 table, so both columns carry a
# third level. A sparsely populated extra level would also explain the
# approximation warning above. Check for blank or unexpected categories.
# Visualize the counts: one bar per gender within each diabetes group
ggplot(heart, aes(x = Diabetes, fill = Gender)) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  labs(title = "Relationship between diabetes and gender")

# Is there a significant difference in average BMI between smokers and non-smokers?
# One-way ANOVA of BMI across the Smoking groups
anova_result <- aov(BMI ~ Smoking, data = heart)
# Printing the aov object shows only sums of squares and degrees of freedom.
# summary() adds the F statistic and p-value needed to judge significance.
print(summary(anova_result))
print(anova_result)
## Call:
## aov(formula = BMI ~ Smoking, data = heart)
##
## Terms:
## Smoking Residuals
## Sum of Squares 142.7 396737.3
## Deg. of Freedom 2 9975
##
## Residual standard error: 6.306596
## Estimated effects may be unbalanced
## 22 observations deleted due to missingness
# NOTE(review): Smoking has 2 degrees of freedom, i.e. three groups rather
# than the two the question implies. Check for a blank or unexpected category.
# Visualize results: BMI distribution per smoking group
ggplot(heart, aes(x = Smoking, y = BMI)) +
  geom_boxplot() +
  labs(title = "Difference in BMI between Smokers and non smokers")
## Warning: Removed 22 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# How do Age, BMI, and Exercise Habits predict Blood Pressure levels?
# Refer to the predictors by bare column name so they are resolved through
# `data = heart`. Writing heart$Age inside the formula bypasses `data`, which
# breaks predict(lm_model, newdata = ...) and clutters the coefficient labels.
# The estimates are unchanged; only the labels lose the "heart$" prefix, so
# the labels in any previously recorded output will differ.
lm_model <- lm(Blood.Pressure ~ Age + BMI + Exercise.Habits, data = heart)
print(lm_model)
##
## Call:
## lm(formula = Blood.Pressure ~ heart$Age + heart$BMI + heart$Exercise.Habits,
## data = heart)
##
## Coefficients:
## (Intercept) heart$Age
## 149.19481 -0.01957
## heart$BMI heart$Exercise.HabitsHigh
## 0.01685 1.10786
## heart$Exercise.HabitsLow heart$Exercise.HabitsMedium
## 0.85990 1.09049
# Visualize results: one point per patient plus the fitted linear trend.
# geom_col() stacks every Blood.Pressure value that shares an Age into a single
# bar, so bar heights were sums driven by how many patients have that age, not
# by blood pressure itself. Points are drawn semi-transparent to cope with
# overplotting. na.rm = TRUE drops the rows with a missing Age or
# Blood.Pressure without emitting a warning.
ggplot(heart, aes(x = Age, y = Blood.Pressure)) +
  geom_point(alpha = 0.1, na.rm = TRUE) +
  geom_smooth(method = "lm", formula = y ~ x, na.rm = TRUE)
