STAT-4052-Final-Project.R

heart_disease <- read.csv("D:/下载/heart_disease33.txt", sep="")
library(summarytools)
library(dplyr)

## 
## 载入程辑包：'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ tibble::view()  masks summarytools::view()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(alr4)

## 载入需要的程辑包：car
## 载入需要的程辑包：carData
## 
## 载入程辑包：'car'
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## 载入需要的程辑包：effects
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.

library(class)
library(caret)

## 载入需要的程辑包：lattice
## 
## 载入程辑包：'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(tree)
library(MASS)

## 
## 载入程辑包：'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select

library(randomForest)

## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## 载入程辑包：'randomForest'
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

library(ROCR)
library(gbm)

## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3

library(nnet)
library(pROC)

## Type 'citation("pROC")' for a citation.
## 
## 载入程辑包：'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

#Check for Missing data
str(heart_disease)

## 'data.frame':    7000 obs. of  11 variables:
##  $ HeartDiseaseorAttack: int  NA 0 0 1 0 1 0 0 0 0 ...
##  $ HighBP              : int  1 0 0 1 1 1 1 1 0 1 ...
##  $ HighChol            : int  0 0 1 1 0 1 1 0 0 1 ...
##  $ CholCheck           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ BMI                 : int  26 26 27 28 25 40 43 42 27 29 ...
##  $ Smoker              : int  1 0 0 1 0 0 1 0 0 1 ...
##  $ Stroke              : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ Diabetes            : int  0 2 0 0 0 0 2 0 0 0 ...
##  $ Sex                 : int  0 0 1 0 1 1 0 0 1 1 ...
##  $ Age                 : int  8 8 10 10 6 7 7 6 9 13 ...
##  $ Education           : int  5 6 6 4 4 6 5 5 5 4 ...

summary(heart_disease)

##  HeartDiseaseorAttack     HighBP         HighChol        CholCheck     
##  Min.   :0.00000      Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000      1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :0.00000      Median :0.000   Median :0.0000   Median :1.0000  
##  Mean   :0.08868      Mean   :0.437   Mean   :0.4201   Mean   :0.9616  
##  3rd Qu.:0.00000      3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.00000      Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##  NA's   :279                                                           
##       BMI            Smoker           Stroke          Diabetes     
##  Min.   :14.00   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:24.00   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :27.00   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :28.25   Mean   :0.4273   Mean   :0.0432   Mean   :0.3189  
##  3rd Qu.:31.00   3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :92.00   Max.   :1.0000   Max.   :1.0000   Max.   :2.0000  
##                                   NA's   :499      NA's   :625     
##       Sex              Age           Education    
##  Min.   :0.0000   Min.   : 1.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.: 6.000   1st Qu.:4.000  
##  Median :0.0000   Median : 8.000   Median :5.000  
##  Mean   :0.4369   Mean   : 7.824   Mean   :4.985  
##  3rd Qu.:1.0000   3rd Qu.:10.000   3rd Qu.:6.000  
##  Max.   :1.0000   Max.   :13.000   Max.   :6.000  
##

freq(heart_disease)

## Variable(s) ignored: BMI

## Frequencies  
## heart_disease$HeartDiseaseorAttack  
## Type: Integer  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           0   6125     91.13          91.13     87.50          87.50
##           1    596      8.87         100.00      8.51          96.01
##        <NA>    279                               3.99         100.00
##       Total   7000    100.00         100.00    100.00         100.00
## 
## heart_disease$HighBP  
## Type: Integer  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           0   3941     56.30          56.30     56.30          56.30
##           1   3059     43.70         100.00     43.70         100.00
##        <NA>      0                               0.00         100.00
##       Total   7000    100.00         100.00    100.00         100.00
## 
## heart_disease$HighChol  
## Type: Integer  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           0   4059     57.99          57.99     57.99          57.99
##           1   2941     42.01         100.00     42.01         100.00
##        <NA>      0                               0.00         100.00
##       Total   7000    100.00         100.00    100.00         100.00
## 
## heart_disease$CholCheck  
## Type: Integer  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           0    269      3.84           3.84      3.84           3.84
##           1   6731     96.16         100.00     96.16         100.00
##        <NA>      0                               0.00         100.00
##       Total   7000    100.00         100.00    100.00         100.00
## 
## heart_disease$Smoker  
## Type: Integer  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           0   4009     57.27          57.27     57.27          57.27
##           1   2991     42.73         100.00     42.73         100.00
##        <NA>      0                               0.00         100.00
##       Total   7000    100.00         100.00    100.00         100.00
## 
## heart_disease$Stroke  
## Type: Integer  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           0   6220     95.68          95.68     88.86          88.86
##           1    281      4.32         100.00      4.01          92.87
##        <NA>    499                               7.13         100.00
##       Total   7000    100.00         100.00    100.00         100.00
## 
## heart_disease$Diabetes  
## Type: Integer  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           0   5282     82.85          82.85     75.46          75.46
##           1    153      2.40          85.25      2.19          77.64
##           2    940     14.75         100.00     13.43          91.07
##        <NA>    625                               8.93         100.00
##       Total   7000    100.00         100.00    100.00         100.00
## 
## heart_disease$Sex  
## Type: Integer  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           0   3942     56.31          56.31     56.31          56.31
##           1   3058     43.69         100.00     43.69         100.00
##        <NA>      0                               0.00         100.00
##       Total   7000    100.00         100.00    100.00         100.00
## 
## heart_disease$Age  
## Type: Integer  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           1    199      2.84           2.84      2.84           2.84
##           2    230      3.29           6.13      3.29           6.13
##           3    347      4.96          11.09      4.96          11.09
##           4    427      6.10          17.19      6.10          17.19
##           5    481      6.87          24.06      6.87          24.06
##           6    586      8.37          32.43      8.37          32.43
##           7    735     10.50          42.93     10.50          42.93
##           8    803     11.47          54.40     11.47          54.40
##           9    859     12.27          66.67     12.27          66.67
##          10    838     11.97          78.64     11.97          78.64
##          11    629      8.99          87.63      8.99          87.63
##          12    420      6.00          93.63      6.00          93.63
##          13    446      6.37         100.00      6.37         100.00
##        <NA>      0                               0.00         100.00
##       Total   7000    100.00         100.00    100.00         100.00
## 
## heart_disease$Education  
## Type: Integer  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           1     10      0.14           0.14      0.14           0.14
##           2    213      3.04           3.19      3.04           3.19
##           3    346      4.94           8.13      4.94           8.13
##           4   1591     22.73          30.86     22.73          30.86
##           5   1982     28.31          59.17     28.31          59.17
##           6   2858     40.83         100.00     40.83         100.00
##        <NA>      0                               0.00         100.00
##       Total   7000    100.00         100.00    100.00         100.00

freq(heart_disease$BMI)

## Frequencies  
## heart_disease$BMI  
## Type: Integer  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##          14      5     0.071          0.071     0.071          0.071
##          15      3     0.043          0.114     0.043          0.114
##          16     16     0.229          0.343     0.229          0.343
##          17     27     0.386          0.729     0.386          0.729
##          18     61     0.871          1.600     0.871          1.600
##          19    113     1.614          3.214     1.614          3.214
##          20    158     2.257          5.471     2.257          5.471
##          21    273     3.900          9.371     3.900          9.371
##          22    369     5.271         14.643     5.271         14.643
##          23    442     6.314         20.957     6.314         20.957
##          24    531     7.586         28.543     7.586         28.543
##          25    473     6.757         35.300     6.757         35.300
##          26    563     8.043         43.343     8.043         43.343
##          27    657     9.386         52.729     9.386         52.729
##          28    467     6.671         59.400     6.671         59.400
##          29    411     5.871         65.271     5.871         65.271
##          30    412     5.886         71.157     5.886         71.157
##          31    373     5.329         76.486     5.329         76.486
##          32    278     3.971         80.457     3.971         80.457
##          33    250     3.571         84.029     3.571         84.029
##          34    189     2.700         86.729     2.700         86.729
##          35    158     2.257         88.986     2.257         88.986
##          36    130     1.857         90.843     1.857         90.843
##          37    107     1.529         92.371     1.529         92.371
##          38     76     1.086         93.457     1.086         93.457
##          39     82     1.171         94.629     1.171         94.629
##          40     65     0.929         95.557     0.929         95.557
##          41     46     0.657         96.214     0.657         96.214
##          42     52     0.743         96.957     0.743         96.957
##          43     45     0.643         97.600     0.643         97.600
##          44     34     0.486         98.086     0.486         98.086
##          45     24     0.343         98.429     0.343         98.429
##          46     24     0.343         98.771     0.343         98.771
##          47     16     0.229         99.000     0.229         99.000
##          48      9     0.129         99.129     0.129         99.129
##          49     10     0.143         99.271     0.143         99.271
##          50     10     0.143         99.414     0.143         99.414
##          51      4     0.057         99.471     0.057         99.471
##          52      7     0.100         99.571     0.100         99.571
##          53      4     0.057         99.629     0.057         99.629
##          54      4     0.057         99.686     0.057         99.686
##          55      6     0.086         99.771     0.086         99.771
##          56      1     0.014         99.786     0.014         99.786
##          57      2     0.029         99.814     0.029         99.814
##          58      3     0.043         99.857     0.043         99.857
##          59      4     0.057         99.914     0.057         99.914
##          62      2     0.029         99.943     0.029         99.943
##          65      1     0.014         99.957     0.014         99.957
##          73      1     0.014         99.971     0.014         99.971
##          74      1     0.014         99.986     0.014         99.986
##          92      1     0.014        100.000     0.014        100.000
##        <NA>      0                              0.000        100.000
##       Total   7000   100.000        100.000   100.000        100.000

n=nrow(heart_disease)

##Drop NAs for all categorical variables, no NAs for continuous variable
heart_disease1 <- heart_disease %>% 
  drop_na(HeartDiseaseorAttack, Stroke, Diabetes)#create a dataset without NAs
heart_disease2 <- heart_disease %>% 
  drop_na(HeartDiseaseorAttack, Stroke, Diabetes)#create a dataset without NAs without change variable type
#Change categorical variables from integer to factor
heart_disease1$HeartDiseaseorAttack <- as.factor(heart_disease1$HeartDiseaseorAttack)
heart_disease1$Stroke <- as.factor(heart_disease1$Stroke)
heart_disease1$Diabetes <- as.factor(heart_disease1$Diabetes)
summary(heart_disease1)

##  HeartDiseaseorAttack     HighBP          HighChol        CholCheck     
##  0:5229               Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1: 473               1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000  
##                       Median :0.0000   Median :0.0000   Median :1.0000  
##                       Mean   :0.4281   Mean   :0.4135   Mean   :0.9626  
##                       3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                       Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       BMI            Smoker       Stroke   Diabetes      Sex        
##  Min.   :14.00   Min.   :0.0000   0:5472   0:4754   Min.   :0.0000  
##  1st Qu.:24.00   1st Qu.:0.0000   1: 230   1: 130   1st Qu.:0.0000  
##  Median :27.00   Median :0.0000            2: 818   Median :0.0000  
##  Mean   :28.33   Mean   :0.3653                     Mean   :0.4397  
##  3rd Qu.:31.00   3rd Qu.:1.0000                     3rd Qu.:1.0000  
##  Max.   :74.00   Max.   :1.0000                     Max.   :1.0000  
##       Age           Education    
##  Min.   : 1.000   Min.   :1.000  
##  1st Qu.: 6.000   1st Qu.:4.000  
##  Median : 8.000   Median :5.000  
##  Mean   : 7.782   Mean   :5.074  
##  3rd Qu.:10.000   3rd Qu.:6.000  
##  Max.   :13.000   Max.   :6.000

##Iterative Imputation for missing values
heart_disease3 <- heart_disease
heart_disease3$HeartDiseaseorAttack <- as.factor(heart_disease3$HeartDiseaseorAttack)
heart_disease3$Stroke <- as.factor(heart_disease3$Stroke)
heart_disease3$Diabetes <- as.character(heart_disease3$Diabetes)
summary(heart_disease3)

##  HeartDiseaseorAttack     HighBP         HighChol        CholCheck     
##  0   :6125            Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1   : 596            1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:1.0000  
##  NA's: 279            Median :0.000   Median :0.0000   Median :1.0000  
##                       Mean   :0.437   Mean   :0.4201   Mean   :0.9616  
##                       3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                       Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##       BMI            Smoker        Stroke       Diabetes        
##  Min.   :14.00   Min.   :0.0000   0   :6220   Length:7000       
##  1st Qu.:24.00   1st Qu.:0.0000   1   : 281   Class :character  
##  Median :27.00   Median :0.0000   NA's: 499   Mode  :character  
##  Mean   :28.25   Mean   :0.4273                                 
##  3rd Qu.:31.00   3rd Qu.:1.0000                                 
##  Max.   :92.00   Max.   :1.0000                                 
##       Sex              Age           Education    
##  Min.   :0.0000   Min.   : 1.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.: 6.000   1st Qu.:4.000  
##  Median :0.0000   Median : 8.000   Median :5.000  
##  Mean   :0.4369   Mean   : 7.824   Mean   :4.985  
##  3rd Qu.:1.0000   3rd Qu.:10.000   3rd Qu.:6.000  
##  Max.   :1.0000   Max.   :13.000   Max.   :6.000

heart_disease4 = heart_disease3



##Impute the missings in HeartDiseaseorAttack, Stroke, Diabetes using most frequent category
heart_disease3$HeartDiseaseorAttack[is.na(heart_disease3$HeartDiseaseorAttack)]="0"
heart_disease3$Stroke[is.na(heart_disease3$Stroke)]="0"
heart_disease3$Diabetes[is.na(heart_disease3$Diabetes)]="0"
summary(heart_disease3)

##  HeartDiseaseorAttack     HighBP         HighChol        CholCheck     
##  0:6404               Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1: 596               1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:1.0000  
##                       Median :0.000   Median :0.0000   Median :1.0000  
##                       Mean   :0.437   Mean   :0.4201   Mean   :0.9616  
##                       3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                       Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##       BMI            Smoker       Stroke     Diabetes              Sex        
##  Min.   :14.00   Min.   :0.0000   0:6719   Length:7000        Min.   :0.0000  
##  1st Qu.:24.00   1st Qu.:0.0000   1: 281   Class :character   1st Qu.:0.0000  
##  Median :27.00   Median :0.0000            Mode  :character   Median :0.0000  
##  Mean   :28.25   Mean   :0.4273                               Mean   :0.4369  
##  3rd Qu.:31.00   3rd Qu.:1.0000                               3rd Qu.:1.0000  
##  Max.   :92.00   Max.   :1.0000                               Max.   :1.0000  
##       Age           Education    
##  Min.   : 1.000   Min.   :1.000  
##  1st Qu.: 6.000   1st Qu.:4.000  
##  Median : 8.000   Median :5.000  
##  Mean   : 7.824   Mean   :4.985  
##  3rd Qu.:10.000   3rd Qu.:6.000  
##  Max.   :13.000   Max.   :6.000

n_iter=10 #Set number of iterations
for(i in 1:n_iter)
{
  #impute Stroke given rest
  m_Stroke <- glm(Stroke~., data=heart_disease3, subset=!is.na(heart_disease$Stroke), family=binomial)
  pred_Stroke <- predict(m_Stroke, heart_disease3[is.na(heart_disease$Stroke),], type="response")
  heart_disease3$Stroke[is.na(heart_disease$Stroke)] <-  ifelse(pred_Stroke>0.5,1,0)
  #impute HeartDiseaseorAttack given rest
  m_HDA <- glm(HeartDiseaseorAttack~.,heart_disease3,subset=!is.na(heart_disease$HeartDiseaseorAttack), family=binomial)
  pred_HDA <- predict(m_HDA, heart_disease3[is.na(heart_disease$HeartDiseaseorAttack),], type="response")
  heart_disease3$HeartDiseaseorAttack[is.na(heart_disease$HeartDiseaseorAttack)] <- ifelse(pred_HDA>0.5,1,0)
  #impute Diabetes given rest
  m_Diabetes <- multinom(Diabetes~., heart_disease3, subset=!is.na(heart_disease$Diabetes), trace=FALSE)
  pred_Diabetes <- predict(m_Diabetes, heart_disease3[is.na(heart_disease$Diabetes),], type="class")
  heart_disease3$Diabetes[is.na(heart_disease$Diabetes)] = pred_Diabetes
}
summary(heart_disease3)

##  HeartDiseaseorAttack     HighBP         HighChol        CholCheck     
##  0:6400               Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1: 600               1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:1.0000  
##                       Median :0.000   Median :0.0000   Median :1.0000  
##                       Mean   :0.437   Mean   :0.4201   Mean   :0.9616  
##                       3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                       Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##       BMI            Smoker       Stroke     Diabetes              Sex        
##  Min.   :14.00   Min.   :0.0000   0:6718   Length:7000        Min.   :0.0000  
##  1st Qu.:24.00   1st Qu.:0.0000   1: 282   Class :character   1st Qu.:0.0000  
##  Median :27.00   Median :0.0000            Mode  :character   Median :0.0000  
##  Mean   :28.25   Mean   :0.4273                               Mean   :0.4369  
##  3rd Qu.:31.00   3rd Qu.:1.0000                               3rd Qu.:1.0000  
##  Max.   :92.00   Max.   :1.0000                               Max.   :1.0000  
##       Age           Education    
##  Min.   : 1.000   Min.   :1.000  
##  1st Qu.: 6.000   1st Qu.:4.000  
##  Median : 8.000   Median :5.000  
##  Mean   : 7.824   Mean   :4.985  
##  3rd Qu.:10.000   3rd Qu.:6.000  
##  Max.   :13.000   Max.   :6.000

#Logistic Regression model for HeartDiseaseorAttack 
logm1 <- glm(HeartDiseaseorAttack~., data=heart_disease1, family="binomial")
summary(logm1)

## 
## Call:
## glm(formula = HeartDiseaseorAttack ~ ., family = "binomial", 
##     data = heart_disease1)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.520337   0.842005  -8.931  < 2e-16 ***
## HighBP       0.755290   0.124127   6.085 1.17e-09 ***
## HighChol     0.701951   0.113372   6.192 5.96e-10 ***
## CholCheck    1.301442   0.720691   1.806   0.0709 .  
## BMI          0.009998   0.009237   1.082   0.2791    
## Smoker       0.427345   0.105821   4.038 5.38e-05 ***
## Stroke1      1.421228   0.162229   8.761  < 2e-16 ***
## Diabetes1    0.376741   0.277667   1.357   0.1748    
## Diabetes2    0.587354   0.119765   4.904 9.38e-07 ***
## Sex          0.603005   0.107132   5.629 1.82e-08 ***
## Age          0.248172   0.023369  10.620  < 2e-16 ***
## Education   -0.055623   0.049741  -1.118   0.2635    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3260.7  on 5701  degrees of freedom
## Residual deviance: 2606.2  on 5690  degrees of freedom
## AIC: 2630.2
## 
## Number of Fisher Scoring iterations: 7

pchisq(logm1$deviance, 5691, lower.tail=FALSE)#deviance test for good fit

## [1] 1

Pearson = sum(residuals(logm1, type="pearson")^2)#chi-square test for good fit
Pearson

## [1] 5795.519

pchisq(Pearson, 5691, lower.tail=FALSE)

## [1] 0.1635461

plot(logm1, which=5)#Diagnosis Plot

pi_logit_d1 <- predict(logm1, type="response")
y_logit_d1 <- ifelse(pi_logit_d1 >0.5, 1, 0)
ER_logit_d1 <- mean(y_logit_d1!=heart_disease1$HeartDiseaseorAttack)#Error Rate
##ROC curve and AUC
pred_logit_d1 <- prediction(pi_logit_d1, heart_disease1$HeartDiseaseorAttack)
perf_logit_d1 <- performance(pred_logit_d1, "tpr","fpr")
plot(perf_logit_d1, colorize=TRUE, main="Missing Data impute by dropping NAs")

AUC_logit_d1 <- performance(pred_logit_d1,"auc")@y.values[[1]]


##logistic regression model on iterative regression impute
logm2 <- glm(HeartDiseaseorAttack~., data=heart_disease3, family="binomial")
summary(logm2)

## 
## Call:
## glm(formula = HeartDiseaseorAttack ~ ., family = "binomial", 
##     data = heart_disease3)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.99154    0.60013 -11.650  < 2e-16 ***
## HighBP       0.77772    0.11117   6.996 2.64e-12 ***
## HighChol     0.65317    0.10057   6.495 8.30e-11 ***
## CholCheck    0.74257    0.46671   1.591  0.11160    
## BMI          0.01547    0.00797   1.941  0.05226 .  
## Smoker       0.41550    0.09818   4.232 2.32e-05 ***
## Stroke1      1.33036    0.14696   9.053  < 2e-16 ***
## Diabetes1   -0.03548    0.15021  -0.236  0.81327    
## Diabetes2    0.59644    0.11135   5.357 8.48e-08 ***
## Diabetes3    1.41475    0.46628   3.034  0.00241 ** 
## Sex          0.56535    0.09487   5.959 2.53e-09 ***
## Age          0.24734    0.02069  11.956  < 2e-16 ***
## Education   -0.06762    0.04317  -1.566  0.11730    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4095.1  on 6999  degrees of freedom
## Residual deviance: 3299.0  on 6987  degrees of freedom
## AIC: 3325
## 
## Number of Fisher Scoring iterations: 6

pchisq(logm2$deviance, 5691, lower.tail=FALSE)#deviance test for good fit

## [1] 1

Pearson = sum(residuals(logm2, type="pearson")^2)#chi-square test for good fit
Pearson

## [1] 7042.115

pchisq(Pearson, 5691, lower.tail=FALSE)

## [1] 2.26052e-32

plot(logm2, which=5)#Diagnosis Plot

pi_logit_d2 <- predict(logm2, type="response")
y_logit_d2 <- ifelse(pi_logit_d2 >0.5, 1, 0)
ER_logit_d2 <- mean(y_logit_d2!=heart_disease3$HeartDiseaseorAttack)#Error Rate
##ROC Curve
pred_logit_d2 <- prediction(pi_logit_d2, heart_disease3$HeartDiseaseorAttack)
perf_logit_d2 <- performance(pred_logit_d2, "tpr","fpr")
plot(perf_logit_d2, colorize=TRUE, main="Missing Data impute by Iterative Regression")

AUC_logit_d2 <- performance(pred_logit_d2,"auc")@y.values[[1]]




###Comparsion between two datasets
data.frame(model=c("Dropping NAs RF","Iterative Regression RF"),ER=c(ER_logit_d1,ER_logit_d2))

##                     model         ER
## 1         Dropping NAs RF 0.08260260
## 2 Iterative Regression RF 0.08471429

par(mfrow=c(1,2))
plot(perf_logit_d1, colorize=TRUE, main="Dropping NAs logit")
plot(perf_logit_d2, colorize=TRUE, main="Iterative Regression logit")

data.frame(model=c("Dropping NAs RF","Iterative Regression RF"),AUC=c(AUC_logit_d1,AUC_logit_d2))

##                     model       AUC
## 1         Dropping NAs RF 0.8281027
## 2 Iterative Regression RF 0.8219561

##Split the dataset into 50% training, and 50% validation
set.seed(4052)
train_ind <- sample(1:nrow(heart_disease1), round(0.5*nrow(heart_disease1)))
train_ind <- sample(1:nrow(heart_disease1), round(0.5*nrow(heart_disease1)))
train <- heart_disease1[train_ind,]
valid <- heart_disease1[-train_ind,] 



#Use KNN to predict HeartDiseaseorAttack
train2 <- heart_disease2[train_ind,] 
valid2 <- heart_disease2[-train_ind,] 
train.knn <- train2[,-1]
val.knn <- valid2[,-1]
y_train <- as.numeric(train2[,1])
y_test <- as.numeric(valid2[,1])

#Use KNN to predict HeartDiseaseorAttack on iterative regression impute
train_ind2 <- sample(1:nrow(heart_disease3), round(0.5*nrow(heart_disease3)))
train3 <- heart_disease3[train_ind2,]
valid3 <- heart_disease3[-train_ind2,]
train.knn2 <- train3[,-1]
val.knn2 <- valid3[,-1]
y_train2 <- as.numeric(train3[,1])-1
y_test2 <- as.numeric(valid3[,1])-1

##Use 10-folds Cross-validation to choose K, with a list of k=1,10,70,100
klist <- c(1,10,70,100)
nfolds <- 10
fold=createFolds(1:nrow(train2),k=nfolds,list=FALSE)
kCV_err <- rep(0,4)
for(i in 1:nfolds){
  pre1.CV <- knn(train.knn[fold!=i,], val.knn[fold!=i,], cl=train2$HeartDiseaseorAttack[fold!=i],k=1)
  pre10.CV <- knn(train.knn[fold!=i,], val.knn[fold!=i,], cl=train2$HeartDiseaseorAttack[fold!=i],k=10)
  pre70.CV <- knn(train.knn[fold!=i,], val.knn[fold!=i,], cl=train2$HeartDiseaseorAttack[fold!=i],k=70)
  pre100.CV <- knn(train.knn[fold!=i,], val.knn[fold!=i,], cl=train2$HeartDiseaseorAttack[fold!=i],k=100)
  kCV_err[1]=kCV_err[1]+mean(pre1.CV!=valid$HeartDiseaseorAttack[fold==i])/nfolds
  kCV_err[2]=kCV_err[2]+mean(pre10.CV!=valid$HeartDiseaseorAttack[fold==i])/nfolds
  kCV_err[3]=kCV_err[3]+mean(pre70.CV!=valid$HeartDiseaseorAttack[fold==i])/nfolds
  kCV_err[4]=kCV_err[4]+mean(pre100.CV!=valid$HeartDiseaseorAttack[fold==i])/nfolds
}

## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre1.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre10.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre70.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre100.CV, valid$HeartDiseaseorAttack[fold == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

data.frame(klist, kCV_err)#Use k=70 since greater k does not lead to change in ER

##   klist    kCV_err
## 1     1 0.13429841
## 2    10 0.08788196
## 3    70 0.08667391
## 4   100 0.08667391

k1 <- knn(train.knn, val.knn, cl=train2[,1], k=70, prob=TRUE)
table(valid$HeartDiseaseorAttack,k1)

##    k1
##        0    1
##   0 2604    0
##   1  247    0

val_err <- mean(valid$HeartDiseaseorAttack!=k1)
val_err

## [1] 0.08663627

##ROC Curves and Error Rate, and AUC
winning_pi_knn_d1 <- as.numeric(attr(k1, "prob"))
winning_class <- k1
pi_knn_d1 <- ifelse(winning_class==1, winning_pi_knn_d1, 1-winning_pi_knn_d1)
y_knn_d1 <- ifelse(pi_knn_d1 >0.5, 1, 0)
ER_knn_d1 <- mean((y_test-y_knn_d1)^2)# Error Rate

pred_knn_d1 <- prediction(pi_knn_d1, y_test)
perf_knn_d1 <- performance(pred_knn_d1, "tpr","fpr")
par(mfrow=c(1,1))
plot(perf_knn_d1, colorize=TRUE, main="Dropping NAs KNN")#ROC Curve d1

AUC_knn_d1 <- performance(pred_knn_d1,"auc")@y.values[[1]]



##Use 10-folds Cross-validtionto chooy_train##Use 10-folds Cross-validtionto choose K, with a list of k=1, 10, 70, 100 on iterative model
klist2 <- c(1,10,70,100)
nfolds <- 10
fold2=createFolds(1:nrow(train2),k=nfolds,list=FALSE)
kCV_err2 <- rep(0,4)
for(i in 1:nfolds){
  pre12.CV <- knn(train.knn2[fold2!=i,], val.knn2[fold2!=i,], cl=train3$HeartDiseaseorAttack[fold2!=i],k=1)
  pre102.CV <- knn(train.knn2[fold2!=i,], val.knn2[fold2!=i,], cl=train3$HeartDiseaseorAttack[fold2!=i],k=10)
  pre702.CV <- knn(train.knn2[fold2!=i,], val.knn2[fold2!=i,], cl=train3$HeartDiseaseorAttack[fold2!=i],k=70)
  pre1002.CV <- knn(train.knn2[fold2!=i,], val.knn2[fold2!=i,], cl=train3$HeartDiseaseorAttack[fold2!=i],k=100)
  kCV_err2[1]=kCV_err2[1]+mean(pre12.CV!=valid3$HeartDiseaseorAttack[fold2==i])/nfolds
  kCV_err2[2]=kCV_err2[2]+mean(pre102.CV!=valid3$HeartDiseaseorAttack[fold2==i])/nfolds
  kCV_err2[3]=kCV_err2[3]+mean(pre702.CV!=valid3$HeartDiseaseorAttack[fold2==i])/nfolds
  kCV_err2[4]=kCV_err2[4]+mean(pre1002.CV!=valid3$HeartDiseaseorAttack[fold2==i])/nfolds
}

## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre102.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre702.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre1002.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre102.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre702.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre1002.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre102.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre702.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre1002.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre102.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre702.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre1002.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre102.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre702.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre1002.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre102.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre702.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre1002.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre102.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre702.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre1002.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre102.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre702.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre1002.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre12.CV, valid3$HeartDiseaseorAttack[fold2 == i]):
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre102.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre702.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

## Warning in `!=.default`(pre1002.CV, valid3$HeartDiseaseorAttack[fold2 == :
## 长的对象长度不是短的对象长度的整倍数

## Warning in is.na(e1) | is.na(e2): 长的对象长度不是短的对象长度的整倍数

data.frame(klist2, kCV_err2)#Use k=70 since greater k does not lead to change in ER

##   klist2   kCV_err2
## 1      1 0.13797701
## 2     10 0.09191367
## 3     70 0.09016755
## 4    100 0.09016755

k2 <- knn(train.knn2, val.knn2, cl=train3[,1], k=70, prob=TRUE)
table(valid3$HeartDiseaseorAttack,k2)

##    k2
##        0    1
##   0 3184    0
##   1  316    0

val_err2 <- mean(valid3$HeartDiseaseorAttack!=k2)
val_err2

## [1] 0.09028571

##ROC Curves and Error Rate, and AUC
winning_pi_knn_d2 <- as.numeric(attr(k2, "prob"))
winning_class2 <- k2
pi_knn_d2 <- ifelse(winning_class2==1, winning_pi_knn_d2, 1-winning_pi_knn_d2)
y_knn_d2 <- ifelse(pi_knn_d2 >0.5, 1, 0)
ER_knn_d2 <- mean((y_test2-y_knn_d2)^2)# Error Rate

pred_knn_d2 <- prediction(pi_knn_d2, y_test2)
perf_knn_d2 <- performance(pred_knn_d2, "tpr","fpr")
par(mfrow=c(1,1))
plot(perf_knn_d2, colorize=TRUE, main="Iterative Regression KNN")#ROC Curve d2

AUC_knn_d2 <- performance(pred_knn_d2,"auc")@y.values[[1]]

##Compare between two datasets
data.frame(model=c("Dropping NAs RF","Iterative Regression RF"),ER=c(ER_knn_d1,ER_knn_d2))

##                     model         ER
## 1         Dropping NAs RF 0.08663627
## 2 Iterative Regression RF 0.09028571

par(mfrow=c(1,2))
plot(perf_knn_d1, colorize=TRUE, main="Dropping NAs KNN")#ROC Curve d1
plot(perf_knn_d2, colorize=TRUE, main="Iterative Regression KNN")#ROC Curve d2

data.frame(model=c("Dropping NAs RF","Iterative Regression RF"),AUC=c(AUC_knn_d1,AUC_knn_d2))

##                     model       AUC
## 1         Dropping NAs RF 0.7796344
## 2 Iterative Regression RF 0.7706382

#Random Forest
##Use 10-fold Cross-validation to choose mtry value with m=1,2,3,4,5,6,7
nfolds <- 10
fold3=createFolds(1:nrow(train), k=nfolds, list=FALSE)
mlist <- c(1,2,3,4,5,6,7)
ertemp <- rep(NA, 10)
erlist <- rep(NA, 7)


for(i in 1:length(mlist)){
  mvalue <- mlist[i]
  for(j in 1:nfolds){
    rf_temp <- randomForest(HeartDiseaseorAttack~., data=train[fold3==j,], mtry=mvalue, ntree=2000, importance=TRUE)
    ertemp[j] <- mean(rf_temp$err.rate[,1])
  }
 erlist[i] = mean(ertemp)
}
data.frame(mlist,erlist)#m=1 is selected

##   mlist     erlist
## 1     1 0.07931606
## 2     2 0.08072666
## 3     3 0.08310072
## 4     4 0.08476209
## 5     5 0.08847032
## 6     6 0.09147167
## 7     7 0.09241277

## Fit random Forest model with mtry=1
rf <- randomForest(HeartDiseaseorAttack~., data=heart_disease1, mtry=1, ntree=2000, importance=TRUE)
rf

## 
## Call:
##  randomForest(formula = HeartDiseaseorAttack ~ ., data = heart_disease1,      mtry = 1, ntree = 2000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 2000
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 8.3%
## Confusion matrix:
##      0 1 class.error
## 0 5229 0           0
## 1  473 0           1

varImpPlot(rf)

##Error Rate and ROC Curve, and AUC
pred_rf_d1 <- predict(rf, heart_disease1, type="response")
ER_rf_d1 <- mean(heart_disease1$HeartDiseaseorAttack!=pred_rf_d1)
pi_rf_d1 <- rf$votes[,2]
pred2_rf_d1 <- prediction(pi_rf_d1, heart_disease1$HeartDiseaseorAttack)
perf_rf_d1 <- performance(pred2_rf_d1, "tpr","fpr")
par(mfrow=c(1,1))
plot(perf_rf_d1, colorize=TRUE, main="Dropping NAs RF")# ROC Curve for d1

AUC_rf_d1 <- performance(pred2_rf_d1,"auc")@y.values[[1]]

##Use 10-fold Cross-validation to choose mtry value with m=1,2,3,4,5,6,7 on iterative regression impute
nfolds <- 10
fold4=createFolds(1:nrow(train3), k=nfolds, list=FALSE)
mlist2 <- c(1,2,3,4,5,6,7)
ertemp2 <- rep(NA, 10)
erlist2 <- rep(NA, 7)


for(i in 1:length(mlist2)){
  mvalue2 <- mlist2[i]
  for(j in 1:nfolds){
    rf_temp2 <- randomForest(HeartDiseaseorAttack~., data=train3[fold4==j,], mtry=mvalue2, ntree=2000, importance=TRUE)
    ertemp2[j] <- mean(rf_temp2$err.rate[,1])
  }
  erlist2[i] = mean(ertemp2)
}
data.frame(mlist2,erlist2)#m=1 is selected

##   mlist2    erlist2
## 1      1 0.08114077
## 2      2 0.08059767
## 3      3 0.08291125
## 4      4 0.08584110
## 5      5 0.08671922
## 6      6 0.08844765
## 7      7 0.08883238

## Fit random Forest model with mtry=1
rf2 <- randomForest(HeartDiseaseorAttack~., data=heart_disease3, mtry=1, ntree=2000, importance=TRUE)
rf2

## 
## Call:
##  randomForest(formula = HeartDiseaseorAttack ~ ., data = heart_disease3,      mtry = 1, ntree = 2000, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 2000
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 8.57%
## Confusion matrix:
##      0 1 class.error
## 0 6400 0           0
## 1  600 0           1

varImpPlot(rf2)

##Error Rate and ROC Curve, and AUC
pred_rf_d2 <- predict(rf2, heart_disease3, type="response")
ER_rf_d2 <- mean(heart_disease3$HeartDiseaseorAttack!=pred_rf_d2)
pi_rf_d2 <- rf2$votes[,2]
pred2_rf_d2 <- prediction(pi_rf_d2, heart_disease3$HeartDiseaseorAttack)
perf_rf_d2 <- performance(pred2_rf_d2, "tpr","fpr")
plot(perf_rf_d2, colorize=TRUE, main="Iterative Regression RF")# ROC Curve for d2

AUC_rf_d2 <- performance(pred2_rf_d2,"auc")@y.values[[1]]

##Compare two datasets
data.frame(model=c("Dropping NAs RF","Iterative Regression RF"),ER=c(ER_rf_d1,ER_rf_d2))

##                     model         ER
## 1         Dropping NAs RF 0.08295335
## 2 Iterative Regression RF 0.08571429

par(mfrow=c(1,2))
plot(perf_rf_d1, colorize=TRUE, main="Dropping NAs RF")# ROC Curve for d1
plot(perf_rf_d2, colorize=TRUE, main="Iterative Regression RF")# ROC Curve for d2

data.frame(model=c("Dropping NAs RF","Iterative Regression RF"),AUC=c(AUC_rf_d1,AUC_rf_d2))

##                     model       AUC
## 1         Dropping NAs RF 0.7184172
## 2 Iterative Regression RF 0.7076745

STAT-4052-Final-Project.R

david

2024-05-02