# load dataset
# Read blank cells as NA in addition to the literal "NA". Later results in
# this script (e.g. df = 2 for the Smoking x Heart.Disease.Status test)
# suggest that some categorical columns contain empty strings, which would
# otherwise be treated as a category of their own.
# TODO confirm with table(heart$Smoking, useNA = "ifany").
heart <- read.csv(
  "/cloud/project/dataset/heart_disease.csv",
  na.strings = c("", "NA")
)
head(heart)
##   Age Gender Blood.Pressure Cholesterol.Level Exercise.Habits Smoking
## 1  56   Male            153               155            High     Yes
## 2  69 Female            146               286            High      No
## 3  46   Male            126               216             Low      No
## 4  32 Female            122               293            High     Yes
## 5  60   Male            166               242             Low     Yes
## 6  25   Male            152               257             Low     Yes
##   Family.Heart.Disease Diabetes      BMI High.Blood.Pressure
## 1                  Yes       No 24.99159                 Yes
## 2                  Yes      Yes 25.22180                  No
## 3                   No       No 29.85545                  No
## 4                  Yes       No 24.13048                 Yes
## 5                  Yes      Yes 20.48629                 Yes
## 6                   No       No 28.14468                  No
##   Low.HDL.Cholesterol High.LDL.Cholesterol Alcohol.Consumption Stress.Level
## 1                 Yes                   No                High       Medium
## 2                 Yes                   No              Medium         High
## 3                 Yes                  Yes                 Low          Low
## 4                  No                  Yes                 Low         High
## 5                  No                   No                 Low         High
## 6                  No                   No                 Low       Medium
##   Sleep.Hours Sugar.Consumption Triglyceride.Level Fasting.Blood.Sugar
## 1    7.633228            Medium                342                  NA
## 2    8.744034            Medium                133                 157
## 3    4.440440               Low                393                  92
## 4    5.249405              High                293                  94
## 5    7.030971              High                263                 154
## 6    5.504876               Low                126                  91
##   CRP.Level Homocysteine.Level Heart.Disease.Status
## 1 12.969246          12.387250                   No
## 2  9.355389          19.298875                   No
## 3 12.709873          11.230926                   No
## 4 12.509046           5.961958                   No
## 5 10.381259           8.153887                   No
## 6  4.297575          10.815983                   No
# load R packages
# Install only what is missing. An unconditional install.packages() call
# re-downloads the package on every run, fails without network access, and
# tries to reinstall dplyr/ggplot2 while they are already attached.
required_packages <- c("tidyverse", "dplyr", "ggplot2")
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# dplyr and ggplot2 are already attached by tidyverse (see the banner above);
# the explicit calls are kept so the script's dependencies stay visible.
library(dplyr)
library(ggplot2)

# Analysis question 1: Summarize the central tendencies, dispersions, and distributions of individual variables.
# Step1: conduct exploratory analysis
# summary() reports only Length/Class/Mode for character columns, so tabulate
# those columns separately to see the distribution of each categorical
# variable (including any missing values).
categorical_columns <- names(heart)[vapply(heart, is.character, logical(1))]
print(lapply(heart[categorical_columns], table, useNA = "ifany"))
# Numeric columns: min, quartiles, mean, max and NA count.
summary(heart)
##       Age          Gender          Blood.Pressure  Cholesterol.Level
##  Min.   :18.0   Length:10000       Min.   :120.0   Min.   :150.0    
##  1st Qu.:34.0   Class :character   1st Qu.:134.0   1st Qu.:187.0    
##  Median :49.0   Mode  :character   Median :150.0   Median :226.0    
##  Mean   :49.3                      Mean   :149.8   Mean   :225.4    
##  3rd Qu.:65.0                      3rd Qu.:165.0   3rd Qu.:263.0    
##  Max.   :80.0                      Max.   :180.0   Max.   :300.0    
##  NA's   :29                        NA's   :19      NA's   :30       
##  Exercise.Habits      Smoking          Family.Heart.Disease   Diabetes        
##  Length:10000       Length:10000       Length:10000         Length:10000      
##  Class :character   Class :character   Class :character     Class :character  
##  Mode  :character   Mode  :character   Mode  :character     Mode  :character  
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##       BMI        High.Blood.Pressure Low.HDL.Cholesterol High.LDL.Cholesterol
##  Min.   :18.00   Length:10000        Length:10000        Length:10000        
##  1st Qu.:23.66   Class :character    Class :character    Class :character    
##  Median :29.08   Mode  :character    Mode  :character    Mode  :character    
##  Mean   :29.08                                                               
##  3rd Qu.:34.52                                                               
##  Max.   :40.00                                                               
##  NA's   :22                                                                  
##  Alcohol.Consumption Stress.Level        Sleep.Hours     Sugar.Consumption 
##  Length:10000        Length:10000       Min.   : 4.001   Length:10000      
##  Class :character    Class :character   1st Qu.: 5.450   Class :character  
##  Mode  :character    Mode  :character   Median : 7.003   Mode  :character  
##                                         Mean   : 6.991                     
##                                         3rd Qu.: 8.532                     
##                                         Max.   :10.000                     
##                                         NA's   :25                         
##  Triglyceride.Level Fasting.Blood.Sugar   CRP.Level         Homocysteine.Level
##  Min.   :100.0      Min.   : 80.0       Min.   : 0.003647   Min.   : 5.000    
##  1st Qu.:176.0      1st Qu.: 99.0       1st Qu.: 3.674126   1st Qu.: 8.723    
##  Median :250.0      Median :120.0       Median : 7.472164   Median :12.409    
##  Mean   :250.7      Mean   :120.1       Mean   : 7.472201   Mean   :12.456    
##  3rd Qu.:326.0      3rd Qu.:141.0       3rd Qu.:11.255592   3rd Qu.:16.141    
##  Max.   :400.0      Max.   :160.0       Max.   :14.997087   Max.   :19.999    
##  NA's   :26         NA's   :22          NA's   :26          NA's   :20        
##  Heart.Disease.Status
##  Length:10000        
##  Class :character    
##  Mode  :character    
##                      
##                      
##                      
## 
# Step2: plot the frequency distribution of Age as a histogram with
# 5-year-wide bins
ggplot(heart, aes(x = Age)) +
  geom_histogram(
    binwidth = 5,
    fill = "lightblue",
    color = "black"
  )
## Warning: Removed 29 rows containing non-finite outside the scale range
## (`stat_bin()`).

# What is the IQR for Blood Pressure, and are there any outliers?

# establish BP IQR and quartiles (missing readings are ignored)
BP_iqr <- IQR(heart$Blood.Pressure, na.rm = TRUE)
q1_BP <- quantile(heart$Blood.Pressure, 0.25, na.rm = TRUE)
q3_BP <- quantile(heart$Blood.Pressure, 0.75, na.rm = TRUE)

# establish lower and upper bounds using the 1.5 * IQR rule
lower_bound <- q1_BP - 1.5 * BP_iqr
upper_bound <- q3_BP + 1.5 * BP_iqr

# identify outliers
# Comparing NA with a bound yields NA, and indexing with NA returns NA, so
# missing readings are excluded explicitly; otherwise every missing value is
# printed as if it were an outlier.
bp_outliers <- !is.na(heart$Blood.Pressure) &
  (heart$Blood.Pressure < lower_bound | heart$Blood.Pressure > upper_bound)

# print actual outlier values (an empty vector means no outliers were found)
bp_outlier_values <- heart$Blood.Pressure[bp_outliers]
print(bp_outlier_values)
# NOTE: the output previously recorded here was 19 NA values, i.e. the 19
# missing Blood.Pressure readings reported by summary(heart), not outliers.
# Re-knit to refresh the output.
# Are there outliers in Triglyceride Levels that could indicate measurement errors or unique cases?

# step1: identify triglyceride IQR and quartiles (missing values are ignored)
triglyceride_iqr <- IQR(heart$Triglyceride.Level, na.rm = TRUE)
q1_triglyceride <- quantile(heart$Triglyceride.Level, 0.25, na.rm = TRUE)
q3_triglyceride <- quantile(heart$Triglyceride.Level, 0.75, na.rm = TRUE)

# step2: identify upper and lower bounds using the 1.5 * IQR rule
lower_bounds <- q1_triglyceride - 1.5 * triglyceride_iqr
upper_bounds <- q3_triglyceride + 1.5 * triglyceride_iqr

# step3: identify outliers in triglyceride values
# Missing values are excluded explicitly: an NA in the logical index would be
# returned as an NA "outlier" in step4.
triglyceride_outliers <- !is.na(heart$Triglyceride.Level) &
  (heart$Triglyceride.Level < lower_bounds |
    heart$Triglyceride.Level > upper_bounds)

# step4: print outlier values (an empty vector means no outliers were found)
outlier_values <- heart$Triglyceride.Level[triglyceride_outliers]
print(outlier_values)
# NOTE: the output previously recorded here was 26 NA values, i.e. the 26
# missing Triglyceride.Level readings reported by summary(heart), not
# outliers. Re-knit to refresh the output.
# Is there a significant correlation between Age and Blood Pressure?
# Pearson coefficient over the rows where both values are present
age_bp_corr <- cor(heart$Age, heart$Blood.Pressure, use = "complete.obs")
print(age_bp_corr)
## [1] -0.02078085
# cor() returns only the coefficient. cor.test() runs the accompanying
# significance test (test statistic, p-value, confidence interval) on the
# same complete pairs, which is what the question actually asks for.
age_bp_test <- cor.test(heart$Age, heart$Blood.Pressure)
print(age_bp_test)
# How are BMI and Cholesterol Levels related?
# Correlation coefficient computed over the rows where both measurements
# are present
correlation_result <- with(
  heart,
  cor(BMI, Cholesterol.Level, use = "complete.obs")
)
print(correlation_result)
## [1] 0.02200238
# Is there an association between Smoking Status and Heart Disease Status?
# The result previously recorded for this test had df = 2, which means one of
# the two variables had a third category; presumably blank strings standing in
# for missing answers (TODO confirm with table(heart$Smoking)). Blanks are
# converted to NA and incomplete rows dropped so that only real categories
# enter the test and the plot.
smoking_disease <- heart |>
  mutate(across(c(Smoking, Heart.Disease.Status), \(x) na_if(x, ""))) |>
  drop_na(Smoking, Heart.Disease.Status)

chi_results <- chisq.test(
  smoking_disease$Smoking,
  smoking_disease$Heart.Disease.Status
)
print(chi_results)
# NOTE: the earlier recorded output (X-squared = 0.31118, df = 2,
# p-value = 0.8559) included the extra category; re-knit to refresh.

# visualize output
ggplot(smoking_disease, aes(x = Smoking, fill = Heart.Disease.Status)) +
  geom_bar(position = "dodge")

# Does the prevalence of Diabetes differ by Gender?
# The result previously recorded for this test had df = 4 (a 3 x 3 table) and
# a "Chi-squared approximation may be incorrect" warning, which points to an
# extra, sparsely populated category in both variables; presumably blank
# strings standing in for missing answers (TODO confirm with
# table(heart$Diabetes, heart$Gender)). Blanks are converted to NA and
# incomplete rows dropped before testing and plotting.
diabetes_by_gender <- heart |>
  mutate(across(c(Diabetes, Gender), \(x) na_if(x, ""))) |>
  drop_na(Diabetes, Gender)

diabetes_gender <- chisq.test(
  diabetes_by_gender$Diabetes,
  diabetes_by_gender$Gender
)
print(diabetes_gender)
# NOTE: the earlier recorded output (X-squared = 1.5363, df = 4,
# p-value = 0.8202) included the extra categories; re-knit to refresh.

# visualize results
ggplot(diabetes_by_gender, aes(x = Diabetes, fill = Gender)) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  labs(title = "Relationship between diabetes and gender")

# Is there a significant difference in average BMI between smokers and non-smokers?

# The fit previously recorded here had 2 degrees of freedom for Smoking, i.e.
# three groups rather than smokers vs non-smokers; presumably blank strings
# form the third group (TODO confirm with table(heart$Smoking)). Blanks are
# converted to NA and rows without a Smoking or BMI value are dropped.
smoking_bmi <- heart |>
  mutate(Smoking = na_if(Smoking, "")) |>
  drop_na(Smoking, BMI)

anova_result <- aov(BMI ~ Smoking, data = smoking_bmi)
# print() on an aov object lists sums of squares only; summary() adds the
# F statistic and p-value needed to judge significance.
print(summary(anova_result))
# NOTE: the earlier recorded output (Smoking: Sum of Squares 142.7, Df 2)
# included the extra group and showed no p-value; re-knit to refresh.

# visualize results
ggplot(smoking_bmi, aes(x = Smoking, y = BMI)) +
  geom_boxplot() +
  labs(title = "Difference in BMI between Smokers and non smokers")

# How do Age, BMI, and Exercise Habits predict Blood Pressure levels?
# The fit previously recorded here had High, Low and Medium coefficients for
# Exercise.Habits, so a fourth level was acting as the baseline; presumably
# blank strings (TODO confirm with table(heart$Exercise.Habits)). Blanks are
# converted to NA so lm() drops those rows instead of using them as the
# reference group.
bp_model_data <- heart |>
  mutate(Exercise.Habits = na_if(Exercise.Habits, ""))

# Use bare column names: with `data =` supplied, writing heart$Age in the
# formula bypasses the data argument, clutters the coefficient names and
# breaks predict() on new data.
lm_model <- lm(
  Blood.Pressure ~ Age + BMI + Exercise.Habits,
  data = bp_model_data
)
print(lm_model)
# Standard errors, p-values and R-squared for the fitted model
print(summary(lm_model))
# NOTE: the earlier recorded coefficients were fitted with the extra
# Exercise.Habits level as baseline; re-knit to refresh.

# visualize results
# geom_col() stacks every Blood.Pressure value recorded for an age, so bar
# height mostly reflects how many people have that age. A scatter plot with
# the fitted linear trend shows the Age / Blood Pressure relationship itself.
bp_model_data |>
  drop_na(Age, Blood.Pressure) |>
  ggplot(aes(x = Age, y = Blood.Pressure)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", formula = y ~ x)