#Note: read.csv() on Windows keeps the UTF-8 byte-order mark in the first column
#name, so the "age" column appears as "ï..age" throughout this report.
HeartDisease <- read.csv("~/Bellevue College/DA410/HeartDisease.csv")
heart <- HeartDisease
library("tidyverse")
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.5     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'readr' was built under R version 4.0.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library("tidyselect")
library("ggplot2")
library("car")
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library("ICSNP")
## Warning: package 'ICSNP' was built under R version 4.0.4
## Loading required package: mvtnorm
## Loading required package: ICS
## Warning: package 'ICS' was built under R version 4.0.4
library("tibble")
library("readr")
library("plotly")
## Warning: package 'plotly' was built under R version 4.0.4
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library("DT")
## Warning: package 'DT' was built under R version 4.0.4
library("mvnormtest")
library("dplyr")
library("caTools")
## Warning: package 'caTools' was built under R version 4.0.4
library("MASS")
## Warning: package 'MASS' was built under R version 4.0.4
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
library("ROCR")
## Warning: package 'ROCR' was built under R version 4.0.4
#interactive datatable to view data
datatable(heart)
#box plots of the 5 continuous variables, split by target (0 vs 1)
f1 <- plot_ly(HeartDisease,
              type = "box") %>% 
  add_boxplot(y = ~trestbps,
              x = ~target,
              name = "trestbps") %>% 
  layout(title = "trestbps",
         xaxis = list(title = "target"),
         yaxis = list(title = "trestbps"))
f1
f2 <- plot_ly(HeartDisease,
              type = "box") %>% 
  add_boxplot(y = ~chol,
              x = ~target,
              name = "chol") %>% 
  layout(title = "chol",
         xaxis = list(title = "target"),
         yaxis = list(title = "chol"))
f2
f3 <- plot_ly(HeartDisease,
              type = "box") %>% 
  add_boxplot(y = ~ï..age,
              x = ~target,
              name = "ï..age") %>% 
  layout(title = "ï..age",
         xaxis = list(title = "target"),
         yaxis = list(title = "ï..age"))
f3
f4 <- plot_ly(HeartDisease,
              type = "box") %>% 
  add_boxplot(y = ~thalach,
              x = ~target,
              name = "thalach") %>% 
  layout(title = "ï..age",
         xaxis = list(title = "target"),
         yaxis = list(title = "thalach"))
f4
f5 <- plot_ly(HeartDisease,
              type = "box") %>% 
  add_boxplot(y = ~oldpeak,
              x = ~target,
              name = "oldpeak") %>% 
  layout(title = "oldpeak",
         xaxis = list(title = "target"),
         yaxis = list(title = "oldpeak"))
f5
#Removing the non-continuous variables from the data set
heart.vars <- HeartDisease %>% dplyr::select(-one_of("target","cp","sex","fbs","ca"))
head(heart.vars)
##   ï..age trestbps chol thalach oldpeak
## 1     63      145  233     150     2.3
## 2     37      130  250     187     3.5
## 3     41      130  204     172     1.4
## 4     56      120  236     178     0.8
## 5     57      120  354     163     0.6
## 6     57      140  192     148     0.4
#testing for multivariate normality using the Shapiro-Wilk test
mvnormtest::mshapiro.test(t(heart.vars))
## 
##  Shapiro-Wilk normality test
## 
## data:  Z
## W = 0.93989, p-value = 9.335e-10
#the null hypothesis of the Shapiro-Wilk test is that the data are multivariate normal; a p-value below .05 means normality is rejected, and here p < .05, so the data depart from strict multivariate normality
#determinant of variance-covariance matrix
det(cov(heart.vars))
## [1] 29788150473
#The Shapiro-Wilk test rejects strict multivariate normality, but the determinant of the variance-covariance matrix is positive (the matrix is non-singular), and with roughly 300 observations Hotelling's T-squared is reasonably robust to departures from normality, so I proceed with the test.
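#As an optional cross-check, univariate Shapiro-Wilk tests on each variable give a
#rough idea of which columns drive the departure from normality (a diagnostic sketch
#only, not a requirement for Hotelling's T-squared):
sapply(heart.vars, function(x) shapiro.test(x)$p.value)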
fit <- HotellingsT2(filter(heart,
                             target == "1") [c(1, 4, 5, 7, 8)],
                      filter(heart,
                             target == "0") [c(1, 4, 5, 7, 8)])
fit
## 
##  Hotelling's two sample T2-test
## 
## data:  filter(heart, target == "1")[c(1, 4, 5, 7, 8)] and filter(heart, target == "0")[c(1, 4, 5, 7, 8)]
## T.2 = 22.924, df1 = 5, df2 = 297, p-value < 2.2e-16
## alternative hypothesis: true location difference is not equal to c(0,0,0,0,0)
#The Hotelling's T-squared test gives a p-value well below .05, so we can reject the null hypothesis that there is no multivariate difference between the two levels of target. Accepting the alternative means the multivariate means of the five continuous variables differ between the two groups.
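#A quick follow-up sketch to see which of the five variables differ on average
#between the two target groups; this just summarizes the data and is not part of
#the formal test.
heart %>%
  group_by(target) %>%
  summarise(across(c(ï..age, trestbps, chol, thalach, oldpeak), mean))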
#Viewing and comparing discrete variables
table(heart$target, heart$sex)
##    
##       0   1
##   0  24 114
##   1  72  93
table(heart$target, heart$fbs)
##    
##       0   1
##   0 116  22
##   1 142  23
#bar chart of target counts broken out by fbs
ggplot(heart) +
  aes(x = factor(target), fill = factor(fbs)) +
  geom_bar() + 
  scale_fill_hue() + 
  theme_minimal()

#using a chi-squared test to check whether each categorical variable (sex, fbs) is associated with target
chi.test <- chisq.test(table(heart$target == 1, heart$sex))
chi.test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(heart$target == 1, heart$sex)
## X-squared = 22.717, df = 1, p-value = 1.877e-06
chi.test0 <- chisq.test(table(heart$target != 1, heart$sex))
chi.test0
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(heart$target != 1, heart$sex)
## X-squared = 22.717, df = 1, p-value = 1.877e-06
chi.test1 <- chisq.test(table(heart$target == 1, heart$fbs))
chi.test1
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(heart$target == 1, heart$fbs)
## X-squared = 0.10627, df = 1, p-value = 0.7444
chi.test01 <- chisq.test(table(heart$target != 1, heart$fbs))
chi.test01
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(heart$target != 1, heart$fbs)
## X-squared = 0.10627, df = 1, p-value = 0.7444
#For SEX, the p-value is well below .05, so I can reject the null hypothesis that sex and target are independent. For FBS, the p-value is 0.74, so there is no evidence of an association between fasting blood sugar and target.
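#To see the direction of the sex association, row proportions make the table above
#easier to read (a small optional sketch):
prop.table(table(heart$target, heart$sex), margin = 1)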
#Using the split function to separate data into test and train
split = sample.split(heart$target, SplitRatio = 0.8)
train = subset(heart, split == TRUE)
test = subset(heart, split == FALSE)
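#sample.split() is random, so the exact rows in train/test (and the numbers below)
#will vary between runs; seeding before the split would make it reproducible. A
#minimal sketch, where the seed value 410 is arbitrary:
set.seed(410)
split.seeded <- sample.split(heart$target, SplitRatio = 0.8)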

head(train)
##   ï..age sex cp trestbps chol fbs thalach oldpeak ca target
## 1     63   1  3      145  233   1     150     2.3  0      1
## 2     37   1  2      130  250   0     187     3.5  0      1
## 3     41   0  1      130  204   0     172     1.4  0      1
## 4     56   1  1      120  236   0     178     0.8  0      1
## 5     57   0  0      120  354   0     163     0.6  0      1
## 6     57   1  0      140  192   0     148     0.4  0      1
#Doing a PCA to find out whether any dimensionality reduction is needed
heart.pr <- prcomp(heart.vars, center = TRUE, scale. = TRUE)
summary(heart.pr)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5
## Standard deviation     1.3441 1.0380 0.9399 0.8713 0.68799
## Proportion of Variance 0.3613 0.2155 0.1767 0.1518 0.09467
## Cumulative Proportion  0.3613 0.5768 0.7535 0.9053 1.00000
#Visualizing the variance in the PC Values
screeplot(heart.pr, type = "l", npcs = 5, main = "Screeplot of the 5 PCs")
abline(h = 1, col="red", lty=5)
legend("topright", legend=c("Eigenvalue = 1"),
       col=c("red"), lty=5, cex=0.6)

#The first 4 principal components explain about 90 percent of the variance, but I decided to keep all 5 original variables, since five dimensions is small enough not to over-complicate the model.
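#If dimensionality reduction were wanted later, the PC scores (rather than the raw
#variables) could be used as predictors in lda(); the sketch below only builds the
#score data frame for illustration.
pc.scores <- as.data.frame(predict(heart.pr, newdata = heart.vars))
pc.scores$target <- heart$target
head(pc.scores)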
#Using LDA to predict the outcome of the target variable
#creation of lda model
heart.lda <- lda(target ~ ï..age + trestbps + chol + thalach + oldpeak, data = train)

#creating prediction
heart.lda.predict <- predict(heart.lda, newdata = test)
table(heart.lda.predict$class)
## 
##  0  1 
## 18 43
heart.lda.predict.posteriors <- as.data.frame(heart.lda.predict$posterior)
heart.lda.predict.posteriors
##              0          1
## 19  0.38719062 0.61280938
## 23  0.12340338 0.87659662
## 31  0.11705413 0.88294587
## 32  0.32020725 0.67979275
## 33  0.08234908 0.91765092
## 46  0.16443415 0.83556585
## 47  0.11771415 0.88228585
## 50  0.19868898 0.80131102
## 52  0.29682702 0.70317298
## 55  0.14007556 0.85992444
## 57  0.08116591 0.91883409
## 59  0.10828163 0.89171837
## 62  0.19816728 0.80183272
## 67  0.42323963 0.57676037
## 69  0.12951817 0.87048183
## 75  0.17077343 0.82922657
## 82  0.16140806 0.83859194
## 87  0.38055146 0.61944854
## 91  0.12218370 0.87781630
## 92  0.14347149 0.85652851
## 105 0.16084750 0.83915250
## 106 0.72551815 0.27448185
## 109 0.31506320 0.68493680
## 111 0.34730606 0.65269394
## 115 0.22225928 0.77774072
## 118 0.42635988 0.57364012
## 122 0.11222928 0.88777072
## 125 0.07685671 0.92314329
## 126 0.10691203 0.89308797
## 136 0.18428170 0.81571830
## 138 0.28822386 0.71177614
## 141 0.29244906 0.70755094
## 153 0.37463426 0.62536574
## 169 0.50749903 0.49250097
## 173 0.46957217 0.53042783
## 179 0.82630463 0.17369537
## 183 0.16868002 0.83131998
## 187 0.53407156 0.46592844
## 192 0.74694401 0.25305599
## 197 0.86785497 0.13214503
## 198 0.18729528 0.81270472
## 200 0.24028283 0.75971717
## 203 0.73472409 0.26527591
## 205 0.97854727 0.02145273
## 210 0.17114549 0.82885451
## 219 0.85750031 0.14249969
## 221 0.90617426 0.09382574
## 239 0.18099366 0.81900634
## 240 0.22690228 0.77309772
## 241 0.93267857 0.06732143
## 246 0.22175901 0.77824099
## 248 0.53481882 0.46518118
## 255 0.51222647 0.48777353
## 263 0.90335961 0.09664039
## 271 0.40594806 0.59405194
## 278 0.34855771 0.65144229
## 281 0.78746618 0.21253382
## 285 0.67073193 0.32926807
## 290 0.72124510 0.27875490
## 297 0.30258292 0.69741708
## 298 0.85894009 0.14105991
#visualization of the trade-off between the true positive and false positive rates (ROC curve)
pred <- prediction(heart.lda.predict.posteriors[,2], test$target)
roc.perf = performance(pred, measure = "tpr", x.measure = "fpr")
auc.train <- performance(pred, measure = "auc")
auc.train <- auc.train@y.values

plot(roc.perf)
abline(a = 0, b = 1)
text(x = .2, y = .8 ,paste("AUC = ", round(auc.train[[1]],3), sep = ""))
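#ROCR can also plot accuracy as a function of the posterior-probability cutoff; a
#small optional sketch (the class labels used below rely on the default 0.5 cutoff,
#which is just one point on this curve):
acc.perf <- performance(pred, measure = "acc")
plot(acc.perf)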

#Another way to understand the misclassification rate
tab <- table(heart.lda.predict$class, test$target)
tab
##    
##      0  1
##   0 17  1
##   1 11 32
1-sum(diag(tab))/sum(tab)
## [1] 0.1967213
#Using all 5 continuous variables in the LDA model, I achieved an accuracy of roughly 80% (a misclassification rate of about 19.7%). One reason the error rate is not lower is that an LDA model assumes a linear decision boundary; if the true relationship between the variables and the outcome is not linear, this adds to the uncertainty of the model.
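#Since the concern above is the linearity assumption, one possible follow-up is a
#quadratic discriminant analysis (qda() in MASS), which fits a separate covariance
#matrix per class and allows a curved decision boundary. A minimal sketch on the
#same train/test split, not part of the original analysis:
heart.qda <- qda(target ~ ï..age + trestbps + chol + thalach + oldpeak, data = train)
heart.qda.predict <- predict(heart.qda, newdata = test)
mean(heart.qda.predict$class != test$target) #QDA misclassification rate on the test set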