Applied Analytics Assignment-2

Association of cholesterol and exercise-induced angina with heart disease.

Sai Vamsi Chunduru - S3884753, Pragati Patidar – S3858702, Kyron Reshi – S3920193, Arjun Padmanabha Pillai – S3887231

last updated on 17 October, 2021

Introduction

Problem Statement

Data

Descriptive Statistics and Visualisation

# Load the heart disease data set.
hd <- read_csv("heart.csv")

# Give columns 5 and 9 descriptive names.
names(hd)[c(5, 9)] <- c("Cholesterol", "Exercise_Induced_Angina")

# Recode target as a factor: 0 = "no heart disease", 1 = "heart disease".
hd$target <- factor(
  hd$target,
  levels = c(0, 1),
  labels = c("no heart disease", "heart disease")
)

# Recode Exercise_Induced_Angina as a factor: 1 = "Yes", 0 = "No".
hd$Exercise_Induced_Angina <- factor(
  hd$Exercise_Induced_Angina,
  levels = c(1, 0),
  labels = c("Yes", "No")
)
# Summaries of the three variables used in the analysis:
# five-number summary plus mean for Cholesterol, level counts for the factors.
summary(hd[["Cholesterol"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   126.0   211.0   240.0   246.3   274.5   564.0
summary(hd[["Exercise_Induced_Angina"]])
## Yes  No 
##  99 204
summary(hd[["target"]])
## no heart disease    heart disease 
##              138              165

Descriptive Statistics Cont.

# Distribution of heart disease status, one panel per level of
# Exercise_Induced_Angina.
histogram(
  ~ target | Exercise_Induced_Angina,
  data = hd,
  main = "Risk of heart attack"
)

Descriptive Statistics Cont1

- In terms of cholesterol, the no heart disease group has its highest percentage of observations in the 220 to 240 range, followed by the 280 to 300 range, while the heart disease group has its highest percentage of observations in the 200 to 250 range.

# Distribution of Cholesterol, one panel per heart disease status.
histogram(
  ~ Cholesterol | target,
  data = hd,
  main = "Cholesterol observations",
  breaks = 10
)

Descriptive Statistics Cont2

- The plot below shows the distribution of cholesterol levels for the individuals in this investigation. The distribution appears to be positively skewed (right-skewed).

# Check the distribution of Cholesterol: histogram on the density scale with
# a kernel density estimate drawn on top.
# - after_stat(density) replaces the deprecated ..density.. notation.
# - bins = 30 is the geom_histogram() default; stating it explicitly makes the
#   choice visible and silences the "pick better value" message.
hd %>%
  ggplot(aes(x = Cholesterol)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30, colour = "black") +
  geom_density(alpha = 0.2, fill = "dodgerblue3")

Hypothesis Testing- Pearson’s Chi-squared test of association

# Cross-tabulate exercise-induced angina (rows) against heart disease status
# (columns). The table() arguments are left unnamed so the printed table has
# no dimension titles.
table_ang_target <- table(hd$Exercise_Induced_Angina, hd$target)

# Observed counts with row and column totals.
addmargins(table_ang_target)
##      
##       no heart disease heart disease Sum
##   Yes               76            23  99
##   No                62           142 204
##   Sum              138           165 303

# Column proportions (margin = 2): share of angina Yes/No within each
# heart disease status.
table_ang_target1 <- prop.table(table_ang_target, margin = 2)
table_ang_target1
##      
##       no heart disease heart disease
##   Yes        0.5507246     0.1393939
##   No         0.4492754     0.8606061

Hypothesis Testing cont.

# Grouped bar plot of the column proportions in table_ang_target1.
# Each group of bars is one heart disease status (table columns); the bars
# within a group are the exercise-induced angina levels (table rows).
# - ylab and the legend title now describe what is actually plotted: the
#   proportions are computed within heart disease status, and the legend keys
#   are the angina levels.
# - legend.text is spelled out instead of relying on partial matching.
# - args.legend is a list, as barplot() documents; c() would coerce every
#   value, including horiz = FALSE, to character.
barplot(
  table_ang_target1,
  beside = TRUE,
  main = "Bar plot For Exercise Induced Angina",
  xlab = "Likelihood of heart attack",
  ylab = "Proportion within heart disease status",
  ylim = c(0, 1),
  legend.text = rownames(table_ang_target1),
  args.legend = list(
    x = "topleft",
    horiz = FALSE,
    title = "Exercise induced angina"
  )
)

Hypothesis Testing cont1

# Chi-squared test of association between exercise-induced angina and heart
# disease status, run on the observed 2 x 2 count table.
# The call is kept in this exact form because chisq.test() prints the
# argument expression on the "data:" line of its output.
chi_ang_target <- chisq.test(table_ang_target) 
chi_ang_target
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table_ang_target
## X-squared = 55.945, df = 1, p-value = 7.454e-14
chi_ang_target$expected # expected cell counts under the null hypothesis of no association
##      
##       no heart disease heart disease
##   Yes         45.08911      53.91089
##   No          92.91089     111.08911

Hypothesis Testing- Independent Two sample t-Test :

Two sample t-test hypothesis cont.

# Descriptive statistics of Cholesterol for each heart disease status:
# five-number summary, mean, standard deviation, group size, and the number
# of missing values.
table1 <- hd %>%
  group_by(target) %>%
  summarise(
    Min = min(Cholesterol, na.rm = TRUE),
    Q1 = quantile(Cholesterol, probs = .25, na.rm = TRUE),
    Median = median(Cholesterol, na.rm = TRUE),
    Q3 = quantile(Cholesterol, probs = .75, na.rm = TRUE),
    Max = max(Cholesterol, na.rm = TRUE),
    Mean = mean(Cholesterol, na.rm = TRUE),
    SD = sd(Cholesterol, na.rm = TRUE),
    n = n(),
    Missing = sum(is.na(Cholesterol))
  )

# Render the summary as a formatted table.
knitr::kable(table1)
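| target           | Min |     Q1 | Median |  Q3 | Max |     Mean |       SD |   n | Missing |
|:-----------------|----:|-------:|-------:|----:|----:|---------:|---------:|----:|--------:|
| no heart disease | 131 | 217.25 |    249 | 283 | 409 | 251.0870 | 49.45461 | 138 |       0 |
| heart disease    | 126 | 208.00 |    234 | 267 | 564 | 242.2303 | 53.55287 | 165 |       0 |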

Two sample t-test hypothesis cont1

# Normal Q-Q plot of Cholesterol for the "no heart disease" group, used to
# judge the normality assumption of the t-test.
# - distribution is spelled out instead of the partially matched dist.
# - qqPlot() is called directly with an explicit ylab; when the vector is
#   piped in, the default axis label is derived from the argument expression
#   and becomes ".".
target_no <- hd %>% filter(target == "no heart disease")
qqPlot(target_no$Cholesterol, distribution = "norm", ylab = "Cholesterol")

## [1] 82 56

Two sample t-test hypothesis cont2

# Normal Q-Q plot of Cholesterol for the "heart disease" group, used to judge
# the normality assumption of the t-test.
# - distribution is spelled out instead of the partially matched dist.
# - qqPlot() is called directly with an explicit ylab; when the vector is
#   piped in, the default axis label is derived from the argument expression
#   and becomes ".".
target_yes <- hd %>% filter(target == "heart disease")
qqPlot(target_yes$Cholesterol, distribution = "norm", ylab = "Cholesterol")

## [1] 86 29

Two sample t-test hypothesis cont4

# Levene's test for homogeneity of variance of Cholesterol across the two
# heart disease groups.
leveneTest(Cholesterol ~ target, data = hd)

# Independent two-sample t-test of mean Cholesterol by heart disease status,
# two-sided, with equal variances assumed (var.equal = TRUE).
test_result <- t.test(
  Cholesterol ~ target,
  data = hd,
  alternative = "two.sided",
  var.equal = TRUE
)
test_result
## 
##  Two Sample t-test
## 
## data:  Cholesterol by target
## t = 1.4842, df = 301, p-value = 0.1388
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.885882 20.599189
## sample estimates:
## mean in group no heart disease    mean in group heart disease 
##                       251.0870                       242.2303

Two sample t-test hypothesis cont5

# Extract the p-value and the 95% confidence interval for the difference in
# means; both are used to decide whether to reject the null hypothesis.
test_result[["p.value"]]
## [1] 0.1387903
test_result[["conf.int"]]
## [1] -2.885882 20.599189
## attr(,"conf.level")
## [1] 0.95

Discussion

References

  1. Hungarian Institute of Cardiology. Budapest: Andras Janosi, M.D.
  2. University Hospital, Zurich, Switzerland: William Steinbrunn, M.D.
  3. University Hospital, Basel, Switzerland: Matthias Pfisterer, M.D.
  4. V.A. Medical Center, Long Beach and Cleveland Clinic Foundation: Robert Detrano, M.D., Ph.D.