heart_disease <- read.csv("D:/下载/heart_disease33.txt", sep="")
library(summarytools)
library(dplyr)
##
## 载入程辑包:'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tibble::view() masks summarytools::view()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(alr4)
## 载入需要的程辑包:car
## 载入需要的程辑包:carData
##
## 载入程辑包:'car'
##
## The following object is masked from 'package:purrr':
##
## some
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## 载入需要的程辑包:effects
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(class)
library(caret)
## 载入需要的程辑包:lattice
##
## 载入程辑包:'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(tree)
library(MASS)
##
## 载入程辑包:'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## 载入程辑包:'randomForest'
##
## The following object is masked from 'package:ggplot2':
##
## margin
##
## The following object is masked from 'package:dplyr':
##
## combine
library(ROCR)
library(gbm)
## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(nnet)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## 载入程辑包:'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
#Check for Missing data
str(heart_disease)
## 'data.frame': 7000 obs. of 11 variables:
## $ HeartDiseaseorAttack: int NA 0 0 1 0 1 0 0 0 0 ...
## $ HighBP : int 1 0 0 1 1 1 1 1 0 1 ...
## $ HighChol : int 0 0 1 1 0 1 1 0 0 1 ...
## $ CholCheck : int 1 1 1 1 1 1 1 1 1 1 ...
## $ BMI : int 26 26 27 28 25 40 43 42 27 29 ...
## $ Smoker : int 1 0 0 1 0 0 1 0 0 1 ...
## $ Stroke : int 0 0 0 0 0 1 0 0 0 0 ...
## $ Diabetes : int 0 2 0 0 0 0 2 0 0 0 ...
## $ Sex : int 0 0 1 0 1 1 0 0 1 1 ...
## $ Age : int 8 8 10 10 6 7 7 6 9 13 ...
## $ Education : int 5 6 6 4 4 6 5 5 5 4 ...
summary(heart_disease)
## HeartDiseaseorAttack HighBP HighChol CholCheck
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.00000 Median :0.000 Median :0.0000 Median :1.0000
## Mean :0.08868 Mean :0.437 Mean :0.4201 Mean :0.9616
## 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.000 Max. :1.0000 Max. :1.0000
## NA's :279
## BMI Smoker Stroke Diabetes
## Min. :14.00 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:24.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :27.00 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :28.25 Mean :0.4273 Mean :0.0432 Mean :0.3189
## 3rd Qu.:31.00 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :92.00 Max. :1.0000 Max. :1.0000 Max. :2.0000
## NA's :499 NA's :625
## Sex Age Education
## Min. :0.0000 Min. : 1.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.: 6.000 1st Qu.:4.000
## Median :0.0000 Median : 8.000 Median :5.000
## Mean :0.4369 Mean : 7.824 Mean :4.985
## 3rd Qu.:1.0000 3rd Qu.:10.000 3rd Qu.:6.000
## Max. :1.0000 Max. :13.000 Max. :6.000
##
freq(heart_disease)
## Variable(s) ignored: BMI
## Frequencies
## heart_disease$HeartDiseaseorAttack
## Type: Integer
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 0 6125 91.13 91.13 87.50 87.50
## 1 596 8.87 100.00 8.51 96.01
## <NA> 279 3.99 100.00
## Total 7000 100.00 100.00 100.00 100.00
##
## heart_disease$HighBP
## Type: Integer
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 0 3941 56.30 56.30 56.30 56.30
## 1 3059 43.70 100.00 43.70 100.00
## <NA> 0 0.00 100.00
## Total 7000 100.00 100.00 100.00 100.00
##
## heart_disease$HighChol
## Type: Integer
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 0 4059 57.99 57.99 57.99 57.99
## 1 2941 42.01 100.00 42.01 100.00
## <NA> 0 0.00 100.00
## Total 7000 100.00 100.00 100.00 100.00
##
## heart_disease$CholCheck
## Type: Integer
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 0 269 3.84 3.84 3.84 3.84
## 1 6731 96.16 100.00 96.16 100.00
## <NA> 0 0.00 100.00
## Total 7000 100.00 100.00 100.00 100.00
##
## heart_disease$Smoker
## Type: Integer
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 0 4009 57.27 57.27 57.27 57.27
## 1 2991 42.73 100.00 42.73 100.00
## <NA> 0 0.00 100.00
## Total 7000 100.00 100.00 100.00 100.00
##
## heart_disease$Stroke
## Type: Integer
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 0 6220 95.68 95.68 88.86 88.86
## 1 281 4.32 100.00 4.01 92.87
## <NA> 499 7.13 100.00
## Total 7000 100.00 100.00 100.00 100.00
##
## heart_disease$Diabetes
## Type: Integer
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 0 5282 82.85 82.85 75.46 75.46
## 1 153 2.40 85.25 2.19 77.64
## 2 940 14.75 100.00 13.43 91.07
## <NA> 625 8.93 100.00
## Total 7000 100.00 100.00 100.00 100.00
##
## heart_disease$Sex
## Type: Integer
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 0 3942 56.31 56.31 56.31 56.31
## 1 3058 43.69 100.00 43.69 100.00
## <NA> 0 0.00 100.00
## Total 7000 100.00 100.00 100.00 100.00
##
## heart_disease$Age
## Type: Integer
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 1 199 2.84 2.84 2.84 2.84
## 2 230 3.29 6.13 3.29 6.13
## 3 347 4.96 11.09 4.96 11.09
## 4 427 6.10 17.19 6.10 17.19
## 5 481 6.87 24.06 6.87 24.06
## 6 586 8.37 32.43 8.37 32.43
## 7 735 10.50 42.93 10.50 42.93
## 8 803 11.47 54.40 11.47 54.40
## 9 859 12.27 66.67 12.27 66.67
## 10 838 11.97 78.64 11.97 78.64
## 11 629 8.99 87.63 8.99 87.63
## 12 420 6.00 93.63 6.00 93.63
## 13 446 6.37 100.00 6.37 100.00
## <NA> 0 0.00 100.00
## Total 7000 100.00 100.00 100.00 100.00
##
## heart_disease$Education
## Type: Integer
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 1 10 0.14 0.14 0.14 0.14
## 2 213 3.04 3.19 3.04 3.19
## 3 346 4.94 8.13 4.94 8.13
## 4 1591 22.73 30.86 22.73 30.86
## 5 1982 28.31 59.17 28.31 59.17
## 6 2858 40.83 100.00 40.83 100.00
## <NA> 0 0.00 100.00
## Total 7000 100.00 100.00 100.00 100.00
freq(heart_disease$BMI)
## Frequencies
## heart_disease$BMI
## Type: Integer
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
## 14 5 0.071 0.071 0.071 0.071
## 15 3 0.043 0.114 0.043 0.114
## 16 16 0.229 0.343 0.229 0.343
## 17 27 0.386 0.729 0.386 0.729
## 18 61 0.871 1.600 0.871 1.600
## 19 113 1.614 3.214 1.614 3.214
## 20 158 2.257 5.471 2.257 5.471
## 21 273 3.900 9.371 3.900 9.371
## 22 369 5.271 14.643 5.271 14.643
## 23 442 6.314 20.957 6.314 20.957
## 24 531 7.586 28.543 7.586 28.543
## 25 473 6.757 35.300 6.757 35.300
## 26 563 8.043 43.343 8.043 43.343
## 27 657 9.386 52.729 9.386 52.729
## 28 467 6.671 59.400 6.671 59.400
## 29 411 5.871 65.271 5.871 65.271
## 30 412 5.886 71.157 5.886 71.157
## 31 373 5.329 76.486 5.329 76.486
## 32 278 3.971 80.457 3.971 80.457
## 33 250 3.571 84.029 3.571 84.029
## 34 189 2.700 86.729 2.700 86.729
## 35 158 2.257 88.986 2.257 88.986
## 36 130 1.857 90.843 1.857 90.843
## 37 107 1.529 92.371 1.529 92.371
## 38 76 1.086 93.457 1.086 93.457
## 39 82 1.171 94.629 1.171 94.629
## 40 65 0.929 95.557 0.929 95.557
## 41 46 0.657 96.214 0.657 96.214
## 42 52 0.743 96.957 0.743 96.957
## 43 45 0.643 97.600 0.643 97.600
## 44 34 0.486 98.086 0.486 98.086
## 45 24 0.343 98.429 0.343 98.429
## 46 24 0.343 98.771 0.343 98.771
## 47 16 0.229 99.000 0.229 99.000
## 48 9 0.129 99.129 0.129 99.129
## 49 10 0.143 99.271 0.143 99.271
## 50 10 0.143 99.414 0.143 99.414
## 51 4 0.057 99.471 0.057 99.471
## 52 7 0.100 99.571 0.100 99.571
## 53 4 0.057 99.629 0.057 99.629
## 54 4 0.057 99.686 0.057 99.686
## 55 6 0.086 99.771 0.086 99.771
## 56 1 0.014 99.786 0.014 99.786
## 57 2 0.029 99.814 0.029 99.814
## 58 3 0.043 99.857 0.043 99.857
## 59 4 0.057 99.914 0.057 99.914
## 62 2 0.029 99.943 0.029 99.943
## 65 1 0.014 99.957 0.014 99.957
## 73 1 0.014 99.971 0.014 99.971
## 74 1 0.014 99.986 0.014 99.986
## 92 1 0.014 100.000 0.014 100.000
## <NA> 0 0.000 100.000
## Total 7000 100.000 100.000 100.000 100.000
n=nrow(heart_disease)
##Drop NAs for all categorical variables, no NAs for continuous variable
heart_disease1 <- heart_disease %>%
drop_na(HeartDiseaseorAttack, Stroke, Diabetes)#create a dataset without NAs
heart_disease2 <- heart_disease %>%
drop_na(HeartDiseaseorAttack, Stroke, Diabetes)#create a dataset without NAs without change variable type
#Change categorical variables from integer to factor
heart_disease1$HeartDiseaseorAttack <- as.factor(heart_disease1$HeartDiseaseorAttack)
heart_disease1$Stroke <- as.factor(heart_disease1$Stroke)
heart_disease1$Diabetes <- as.factor(heart_disease1$Diabetes)
summary(heart_disease1)
## HeartDiseaseorAttack HighBP HighChol CholCheck
## 0:5229 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1: 473 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.0000 Median :0.0000 Median :1.0000
## Mean :0.4281 Mean :0.4135 Mean :0.9626
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## BMI Smoker Stroke Diabetes Sex
## Min. :14.00 Min. :0.0000 0:5472 0:4754 Min. :0.0000
## 1st Qu.:24.00 1st Qu.:0.0000 1: 230 1: 130 1st Qu.:0.0000
## Median :27.00 Median :0.0000 2: 818 Median :0.0000
## Mean :28.33 Mean :0.3653 Mean :0.4397
## 3rd Qu.:31.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :74.00 Max. :1.0000 Max. :1.0000
## Age Education
## Min. : 1.000 Min. :1.000
## 1st Qu.: 6.000 1st Qu.:4.000
## Median : 8.000 Median :5.000
## Mean : 7.782 Mean :5.074
## 3rd Qu.:10.000 3rd Qu.:6.000
## Max. :13.000 Max. :6.000
##Iterative Imputation for missing values
heart_disease3 <- heart_disease
heart_disease3$HeartDiseaseorAttack <- as.factor(heart_disease3$HeartDiseaseorAttack)
heart_disease3$Stroke <- as.factor(heart_disease3$Stroke)
heart_disease3$Diabetes <- as.character(heart_disease3$Diabetes)
summary(heart_disease3)
## HeartDiseaseorAttack HighBP HighChol CholCheck
## 0 :6125 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1 : 596 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:1.0000
## NA's: 279 Median :0.000 Median :0.0000 Median :1.0000
## Mean :0.437 Mean :0.4201 Mean :0.9616
## 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.000 Max. :1.0000 Max. :1.0000
## BMI Smoker Stroke Diabetes
## Min. :14.00 Min. :0.0000 0 :6220 Length:7000
## 1st Qu.:24.00 1st Qu.:0.0000 1 : 281 Class :character
## Median :27.00 Median :0.0000 NA's: 499 Mode :character
## Mean :28.25 Mean :0.4273
## 3rd Qu.:31.00 3rd Qu.:1.0000
## Max. :92.00 Max. :1.0000
## Sex Age Education
## Min. :0.0000 Min. : 1.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.: 6.000 1st Qu.:4.000
## Median :0.0000 Median : 8.000 Median :5.000
## Mean :0.4369 Mean : 7.824 Mean :4.985
## 3rd Qu.:1.0000 3rd Qu.:10.000 3rd Qu.:6.000
## Max. :1.0000 Max. :13.000 Max. :6.000
heart_disease4 = heart_disease3
##Impute the missings in HeartDiseaseorAttack, Stroke, Diabetes using most frequent category
heart_disease3$HeartDiseaseorAttack[is.na(heart_disease3$HeartDiseaseorAttack)]="0"
heart_disease3$Stroke[is.na(heart_disease3$Stroke)]="0"
heart_disease3$Diabetes[is.na(heart_disease3$Diabetes)]="0"
summary(heart_disease3)
## HeartDiseaseorAttack HighBP HighChol CholCheck
## 0:6404 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1: 596 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.000 Median :0.0000 Median :1.0000
## Mean :0.437 Mean :0.4201 Mean :0.9616
## 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.000 Max. :1.0000 Max. :1.0000
## BMI Smoker Stroke Diabetes Sex
## Min. :14.00 Min. :0.0000 0:6719 Length:7000 Min. :0.0000
## 1st Qu.:24.00 1st Qu.:0.0000 1: 281 Class :character 1st Qu.:0.0000
## Median :27.00 Median :0.0000 Mode :character Median :0.0000
## Mean :28.25 Mean :0.4273 Mean :0.4369
## 3rd Qu.:31.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :92.00 Max. :1.0000 Max. :1.0000
## Age Education
## Min. : 1.000 Min. :1.000
## 1st Qu.: 6.000 1st Qu.:4.000
## Median : 8.000 Median :5.000
## Mean : 7.824 Mean :4.985
## 3rd Qu.:10.000 3rd Qu.:6.000
## Max. :13.000 Max. :6.000
n_iter=10 #Set number of iterations
for(i in 1:n_iter)
{
#impute Stroke given rest
m_Stroke <- glm(Stroke~., data=heart_disease3, subset=!is.na(heart_disease$Stroke), family=binomial)
pred_Stroke <- predict(m_Stroke, heart_disease3[is.na(heart_disease$Stroke),], type="response")
heart_disease3$Stroke[is.na(heart_disease$Stroke)] <- ifelse(pred_Stroke>0.5,1,0)
#impute HeartDiseaseorAttack given rest
m_HDA <- glm(HeartDiseaseorAttack~.,heart_disease3,subset=!is.na(heart_disease$HeartDiseaseorAttack), family=binomial)
pred_HDA <- predict(m_HDA, heart_disease3[is.na(heart_disease$HeartDiseaseorAttack),], type="response")
heart_disease3$HeartDiseaseorAttack[is.na(heart_disease$HeartDiseaseorAttack)] <- ifelse(pred_HDA>0.5,1,0)
#impute Diabetes given rest
m_Diabetes <- multinom(Diabetes~., heart_disease3, subset=!is.na(heart_disease$Diabetes), trace=FALSE)
pred_Diabetes <- predict(m_Diabetes, heart_disease3[is.na(heart_disease$Diabetes),], type="class")
heart_disease3$Diabetes[is.na(heart_disease$Diabetes)] = pred_Diabetes
}
summary(heart_disease3)
## HeartDiseaseorAttack HighBP HighChol CholCheck
## 0:6400 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1: 600 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.000 Median :0.0000 Median :1.0000
## Mean :0.437 Mean :0.4201 Mean :0.9616
## 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.000 Max. :1.0000 Max. :1.0000
## BMI Smoker Stroke Diabetes Sex
## Min. :14.00 Min. :0.0000 0:6718 Length:7000 Min. :0.0000
## 1st Qu.:24.00 1st Qu.:0.0000 1: 282 Class :character 1st Qu.:0.0000
## Median :27.00 Median :0.0000 Mode :character Median :0.0000
## Mean :28.25 Mean :0.4273 Mean :0.4369
## 3rd Qu.:31.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :92.00 Max. :1.0000 Max. :1.0000
## Age Education
## Min. : 1.000 Min. :1.000
## 1st Qu.: 6.000 1st Qu.:4.000
## Median : 8.000 Median :5.000
## Mean : 7.824 Mean :4.985
## 3rd Qu.:10.000 3rd Qu.:6.000
## Max. :13.000 Max. :6.000
#Logistic Regression model for HeartDiseaseorAttack
logm1 <- glm(HeartDiseaseorAttack~., data=heart_disease1, family="binomial")
summary(logm1)
##
## Call:
## glm(formula = HeartDiseaseorAttack ~ ., family = "binomial",
## data = heart_disease1)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.520337 0.842005 -8.931 < 2e-16 ***
## HighBP 0.755290 0.124127 6.085 1.17e-09 ***
## HighChol 0.701951 0.113372 6.192 5.96e-10 ***
## CholCheck 1.301442 0.720691 1.806 0.0709 .
## BMI 0.009998 0.009237 1.082 0.2791
## Smoker 0.427345 0.105821 4.038 5.38e-05 ***
## Stroke1 1.421228 0.162229 8.761 < 2e-16 ***
## Diabetes1 0.376741 0.277667 1.357 0.1748
## Diabetes2 0.587354 0.119765 4.904 9.38e-07 ***
## Sex 0.603005 0.107132 5.629 1.82e-08 ***
## Age 0.248172 0.023369 10.620 < 2e-16 ***
## Education -0.055623 0.049741 -1.118 0.2635
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3260.7 on 5701 degrees of freedom
## Residual deviance: 2606.2 on 5690 degrees of freedom
## AIC: 2630.2
##
## Number of Fisher Scoring iterations: 7
pchisq(logm1$deviance, 5691, lower.tail=FALSE)#deviance test for good fit
## [1] 1
Pearson = sum(residuals(logm1, type="pearson")^2)#chi-square test for good fit
Pearson
## [1] 5795.519
pchisq(Pearson, 5691, lower.tail=FALSE)
## [1] 0.1635461
plot(logm1, which=5)#Diagnosis Plot

pi_logit_d1 <- predict(logm1, type="response")
y_logit_d1 <- ifelse(pi_logit_d1 >0.5, 1, 0)
ER_logit_d1 <- mean(y_logit_d1!=heart_disease1$HeartDiseaseorAttack)#Error Rate
##ROC curve and AUC
pred_logit_d1 <- prediction(pi_logit_d1, heart_disease1$HeartDiseaseorAttack)
perf_logit_d1 <- performance(pred_logit_d1, "tpr","fpr")
plot(perf_logit_d1, colorize=TRUE, main="Missing Data impute by dropping NAs")

AUC_logit_d1 <- performance(pred_logit_d1,"auc")@y.values[[1]]
##logistic regression model on iterative regression impute
logm2 <- glm(HeartDiseaseorAttack~., data=heart_disease3, family="binomial")
summary(logm2)
##
## Call:
## glm(formula = HeartDiseaseorAttack ~ ., family = "binomial",
## data = heart_disease3)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.99154 0.60013 -11.650 < 2e-16 ***
## HighBP 0.77772 0.11117 6.996 2.64e-12 ***
## HighChol 0.65317 0.10057 6.495 8.30e-11 ***
## CholCheck 0.74257 0.46671 1.591 0.11160
## BMI 0.01547 0.00797 1.941 0.05226 .
## Smoker 0.41550 0.09818 4.232 2.32e-05 ***
## Stroke1 1.33036 0.14696 9.053 < 2e-16 ***
## Diabetes1 -0.03548 0.15021 -0.236 0.81327
## Diabetes2 0.59644 0.11135 5.357 8.48e-08 ***
## Diabetes3 1.41475 0.46628 3.034 0.00241 **
## Sex 0.56535 0.09487 5.959 2.53e-09 ***
## Age 0.24734 0.02069 11.956 < 2e-16 ***
## Education -0.06762 0.04317 -1.566 0.11730
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4095.1 on 6999 degrees of freedom
## Residual deviance: 3299.0 on 6987 degrees of freedom
## AIC: 3325
##
## Number of Fisher Scoring iterations: 6
pchisq(logm2$deviance, 5691, lower.tail=FALSE)#deviance test for good fit
## [1] 1
Pearson = sum(residuals(logm2, type="pearson")^2)#chi-square test for good fit
Pearson
## [1] 7042.115
pchisq(Pearson, 5691, lower.tail=FALSE)
## [1] 2.26052e-32
plot(logm2, which=5)#Diagnosis Plot

pi_logit_d2 <- predict(logm2, type="response")
y_logit_d2 <- ifelse(pi_logit_d2 >0.5, 1, 0)
ER_logit_d2 <- mean(y_logit_d2!=heart_disease3$HeartDiseaseorAttack)#Error Rate
##ROC Curve
pred_logit_d2 <- prediction(pi_logit_d2, heart_disease3$HeartDiseaseorAttack)
perf_logit_d2 <- performance(pred_logit_d2, "tpr","fpr")
plot(perf_logit_d2, colorize=TRUE, main="Missing Data impute by Iterative Regression")

AUC_logit_d2 <- performance(pred_logit_d2,"auc")@y.values[[1]]
###Comparsion between two datasets
data.frame(model=c("Dropping NAs RF","Iterative Regression RF"),ER=c(ER_logit_d1,ER_logit_d2))
## model ER
## 1 Dropping NAs RF 0.08260260
## 2 Iterative Regression RF 0.08471429
par(mfrow=c(1,2))
plot(perf_logit_d1, colorize=TRUE, main="Dropping NAs logit")
plot(perf_logit_d2, colorize=TRUE, main="Iterative Regression logit")

data.frame(model=c("Dropping NAs RF","Iterative Regression RF"),AUC=c(AUC_logit_d1,AUC_logit_d2))
## model AUC
## 1 Dropping NAs RF 0.8281027
## 2 Iterative Regression RF 0.8219561
##Split the dataset into 50% training, and 50% validation
set.seed(4052)
train_ind <- sample(1:nrow(heart_disease1), round(0.5*nrow(heart_disease1)))
train_ind <- sample(1:nrow(heart_disease1), round(0.5*nrow(heart_disease1)))
train <- heart_disease1[train_ind,]
valid <- heart_disease1[-train_ind,]
#Use KNN to predict HeartDiseaseorAttack
train2 <- heart_disease2[train_ind,]
valid2 <- heart_disease2[-train_ind,]
train.knn <- train2[,-1]
val.knn <- valid2[,-1]
y_train <- as.numeric(train2[,1])
y_test <- as.numeric(valid2[,1])
#Use KNN to predict HeartDiseaseorAttack on iterative regression impute
train_ind2 <- sample(1:nrow(heart_disease3), round(0.5*nrow(heart_disease3)))
train3 <- heart_disease3[train_ind2,]
valid3 <- heart_disease3[-train_ind2,]
train.knn2 <- train3[,-1]
val.knn2 <- valid3[,-1]
y_train2 <- as.numeric(train3[,1])-1
y_test2 <- as.numeric(valid3[,1])-1
##Use 10-folds Cross-validation to choose K, with a list of k=1,10,70,100
klist <- c(1,10,70,100)
nfolds <- 10
fold=createFolds(1:nrow(train2),k=nfolds,list=FALSE)
kCV_err <- rep(0,4)
for(i in 1:nfolds){
pre1.CV <- knn(train.knn[fold!=i,], val.knn[fold!=i,], cl=train2$HeartDiseaseorAttack[fold!=i],k=1)
pre10.CV <- knn(train.knn[fold!=i,], val.knn[fold!=i,], cl=train2$HeartDiseaseorAttack[fold!=i],k=10)
pre70.CV <- knn(train.knn[fold!=i,], val.knn[fold!=i,], cl=train2$HeartDiseaseorAttack[fold!=i],k=70)
pre100.CV <- knn(train.knn[fold!=i,], val.knn[fold!=i,], cl=train2$HeartDiseaseorAttack[fold!=i],k=100)
kCV_err[1]=kCV_err[1]+mean(pre1.CV!=valid$HeartDiseaseorAttack[fold==i])/nfolds
kCV_err[2]=kCV_err[2]+mean(pre10.CV!=valid$HeartDiseaseorAttack[fold==i])/nfolds
kCV_err[3]=kCV_err[3]+mean(pre70.CV!=valid$HeartDiseaseorAttack[fold==i])/nfolds
kCV_err[4]=kCV_err[4]+mean(pre100.CV!=valid$HeartDiseaseorAttack[fold==i])/nfolds
}
## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
data.frame(klist, kCV_err)#Use k=70 since greater k does not lead to change in ER
## klist kCV_err
## 1 1 0.13429841
## 2 10 0.08788196
## 3 70 0.08667391
## 4 100 0.08667391
k1 <- knn(train.knn, val.knn, cl=train2[,1], k=70, prob=TRUE)
table(valid$HeartDiseaseorAttack,k1)
## k1
## 0 1
## 0 2604 0
## 1 247 0
val_err <- mean(valid$HeartDiseaseorAttack!=k1)
val_err
## [1] 0.08663627
##ROC Curves and Error Rate, and AUC
winning_pi_knn_d1 <- as.numeric(attr(k1, "prob"))
winning_class <- k1
pi_knn_d1 <- ifelse(winning_class==1, winning_pi_knn_d1, 1-winning_pi_knn_d1)
y_knn_d1 <- ifelse(pi_knn_d1 >0.5, 1, 0)
ER_knn_d1 <- mean((y_test-y_knn_d1)^2)# Error Rate
pred_knn_d1 <- prediction(pi_knn_d1, y_test)
perf_knn_d1 <- performance(pred_knn_d1, "tpr","fpr")
par(mfrow=c(1,1))
plot(perf_knn_d1, colorize=TRUE, main="Dropping NAs KNN")#ROC Curve d1

AUC_knn_d1 <- performance(pred_knn_d1,"auc")@y.values[[1]]
##Use 10-folds Cross-validtionto chooy_train##Use 10-folds Cross-validtionto choose K, with a list of k=1, 10, 70, 100 on iterative model
klist2 <- c(1,10,70,100)
nfolds <- 10
fold2=createFolds(1:nrow(train2),k=nfolds,list=FALSE)
kCV_err2 <- rep(0,4)
for(i in 1:nfolds){
pre12.CV <- knn(train.knn2[fold2!=i,], val.knn2[fold2!=i,], cl=train3$HeartDiseaseorAttack[fold2!=i],k=1)
pre102.CV <- knn(train.knn2[fold2!=i,], val.knn2[fold2!=i,], cl=train3$HeartDiseaseorAttack[fold2!=i],k=10)
pre702.CV <- knn(train.knn2[fold2!=i,], val.knn2[fold2!=i,], cl=train3$HeartDiseaseorAttack[fold2!=i],k=70)
pre1002.CV <- knn(train.knn2[fold2!=i,], val.knn2[fold2!=i,], cl=train3$HeartDiseaseorAttack[fold2!=i],k=100)
kCV_err2[1]=kCV_err2[1]+mean(pre12.CV!=valid3$HeartDiseaseorAttack[fold2==i])/nfolds
kCV_err2[2]=kCV_err2[2]+mean(pre102.CV!=valid3$HeartDiseaseorAttack[fold2==i])/nfolds
kCV_err2[3]=kCV_err2[3]+mean(pre702.CV!=valid3$HeartDiseaseorAttack[fold2==i])/nfolds
kCV_err2[4]=kCV_err2[4]+mean(pre1002.CV!=valid3$HeartDiseaseorAttack[fold2==i])/nfolds
}
## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre102.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre702.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre1002.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre102.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre702.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre1002.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre102.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre702.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre1002.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre102.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre702.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre1002.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre102.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre702.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre1002.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre102.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre702.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre1002.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre102.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre702.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre1002.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre102.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre702.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre1002.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre102.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre702.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
## Warning in `!=.default`(pre1002.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数
## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数
data.frame(klist2, kCV_err2)#Use k=70 since greater k does not lead to change in ER
## klist2 kCV_err2
## 1 1 0.13797701
## 2 10 0.09191367
## 3 70 0.09016755
## 4 100 0.09016755
k2 <- knn(train.knn2, val.knn2, cl=train3[,1], k=70, prob=TRUE)
table(valid3$HeartDiseaseorAttack,k2)
## k2
## 0 1
## 0 3184 0
## 1 316 0
val_err2 <- mean(valid3$HeartDiseaseorAttack!=k2)
val_err2
## [1] 0.09028571
##ROC Curves and Error Rate, and AUC
winning_pi_knn_d2 <- as.numeric(attr(k2, "prob"))
winning_class2 <- k2
pi_knn_d2 <- ifelse(winning_class2==1, winning_pi_knn_d2, 1-winning_pi_knn_d2)
y_knn_d2 <- ifelse(pi_knn_d2 >0.5, 1, 0)
ER_knn_d2 <- mean((y_test2-y_knn_d2)^2)# Error Rate
pred_knn_d2 <- prediction(pi_knn_d2, y_test2)
perf_knn_d2 <- performance(pred_knn_d2, "tpr","fpr")
par(mfrow=c(1,1))
plot(perf_knn_d2, colorize=TRUE, main="Iterative Regression KNN")#ROC Curve d2

AUC_knn_d2 <- performance(pred_knn_d2,"auc")@y.values[[1]]
##Compare between two datasets
data.frame(model=c("Dropping NAs RF","Iterative Regression RF"),ER=c(ER_knn_d1,ER_knn_d2))
## model ER
## 1 Dropping NAs RF 0.08663627
## 2 Iterative Regression RF 0.09028571
par(mfrow=c(1,2))
plot(perf_knn_d1, colorize=TRUE, main="Dropping NAs KNN")#ROC Curve d1
plot(perf_knn_d2, colorize=TRUE, main="Iterative Regression KNN")#ROC Curve d2

data.frame(model=c("Dropping NAs RF","Iterative Regression RF"),AUC=c(AUC_knn_d1,AUC_knn_d2))
## model AUC
## 1 Dropping NAs RF 0.7796344
## 2 Iterative Regression RF 0.7706382
#Random Forest
##Use 10-fold Cross-validation to choose mtry value with m=1,2,3,4,5,6,7
nfolds <- 10
fold3=createFolds(1:nrow(train), k=nfolds, list=FALSE)
mlist <- c(1,2,3,4,5,6,7)
ertemp <- rep(NA, 10)
erlist <- rep(NA, 7)
for(i in 1:length(mlist)){
mvalue <- mlist[i]
for(j in 1:nfolds){
rf_temp <- randomForest(HeartDiseaseorAttack~., data=train[fold3==j,], mtry=mvalue, ntree=2000, importance=TRUE)
ertemp[j] <- mean(rf_temp$err.rate[,1])
}
erlist[i] = mean(ertemp)
}
data.frame(mlist,erlist)#m=1 is selected
## mlist erlist
## 1 1 0.07931606
## 2 2 0.08072666
## 3 3 0.08310072
## 4 4 0.08476209
## 5 5 0.08847032
## 6 6 0.09147167
## 7 7 0.09241277
## Fit random Forest model with mtry=1
rf <- randomForest(HeartDiseaseorAttack~., data=heart_disease1, mtry=1, ntree=2000, importance=TRUE)
rf
##
## Call:
## randomForest(formula = HeartDiseaseorAttack ~ ., data = heart_disease1, mtry = 1, ntree = 2000, importance = TRUE)
## Type of random forest: classification
## Number of trees: 2000
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 8.3%
## Confusion matrix:
## 0 1 class.error
## 0 5229 0 0
## 1 473 0 1
varImpPlot(rf)

##Error Rate and ROC Curve, and AUC
pred_rf_d1 <- predict(rf, heart_disease1, type="response")
ER_rf_d1 <- mean(heart_disease1$HeartDiseaseorAttack!=pred_rf_d1)
pi_rf_d1 <- rf$votes[,2]
pred2_rf_d1 <- prediction(pi_rf_d1, heart_disease1$HeartDiseaseorAttack)
perf_rf_d1 <- performance(pred2_rf_d1, "tpr","fpr")
par(mfrow=c(1,1))
plot(perf_rf_d1, colorize=TRUE, main="Dropping NAs RF")# ROC Curve for d1

AUC_rf_d1 <- performance(pred2_rf_d1,"auc")@y.values[[1]]
##Use 10-fold Cross-validation to choose mtry value with m=1,2,3,4,5,6,7 on iterative regression impute
nfolds <- 10
fold4=createFolds(1:nrow(train3), k=nfolds, list=FALSE)
mlist2 <- c(1,2,3,4,5,6,7)
ertemp2 <- rep(NA, 10)
erlist2 <- rep(NA, 7)
for(i in 1:length(mlist2)){
mvalue2 <- mlist2[i]
for(j in 1:nfolds){
rf_temp2 <- randomForest(HeartDiseaseorAttack~., data=train3[fold4==j,], mtry=mvalue2, ntree=2000, importance=TRUE)
ertemp2[j] <- mean(rf_temp2$err.rate[,1])
}
erlist2[i] = mean(ertemp2)
}
data.frame(mlist2,erlist2)#m=1 is selected
## mlist2 erlist2
## 1 1 0.08114077
## 2 2 0.08059767
## 3 3 0.08291125
## 4 4 0.08584110
## 5 5 0.08671922
## 6 6 0.08844765
## 7 7 0.08883238
## Fit random Forest model with mtry=1
rf2 <- randomForest(HeartDiseaseorAttack~., data=heart_disease3, mtry=1, ntree=2000, importance=TRUE)
rf2
##
## Call:
## randomForest(formula = HeartDiseaseorAttack ~ ., data = heart_disease3, mtry = 1, ntree = 2000, importance = TRUE)
## Type of random forest: classification
## Number of trees: 2000
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 8.57%
## Confusion matrix:
## 0 1 class.error
## 0 6400 0 0
## 1 600 0 1
varImpPlot(rf2)

##Error Rate and ROC Curve, and AUC
pred_rf_d2 <- predict(rf2, heart_disease3, type="response")
ER_rf_d2 <- mean(heart_disease3$HeartDiseaseorAttack!=pred_rf_d2)
pi_rf_d2 <- rf2$votes[,2]
pred2_rf_d2 <- prediction(pi_rf_d2, heart_disease3$HeartDiseaseorAttack)
perf_rf_d2 <- performance(pred2_rf_d2, "tpr","fpr")
plot(perf_rf_d2, colorize=TRUE, main="Iterative Regression RF")# ROC Curve for d2

AUC_rf_d2 <- performance(pred2_rf_d2,"auc")@y.values[[1]]
##Compare two datasets
data.frame(model=c("Dropping NAs RF","Iterative Regression RF"),ER=c(ER_rf_d1,ER_rf_d2))
## model ER
## 1 Dropping NAs RF 0.08295335
## 2 Iterative Regression RF 0.08571429
par(mfrow=c(1,2))
plot(perf_rf_d1, colorize=TRUE, main="Dropping NAs RF")# ROC Curve for d1
plot(perf_rf_d2, colorize=TRUE, main="Iterative Regression RF")# ROC Curve for d2

data.frame(model=c("Dropping NAs RF","Iterative Regression RF"),AUC=c(AUC_rf_d1,AUC_rf_d2))
## model AUC
## 1 Dropping NAs RF 0.7184172
## 2 Iterative Regression RF 0.7076745