#import data
library(readr)

## Warning: package 'readr' was built under R version 4.0.5

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the CVD data set from CSV into a tibble.
cvd <- read_csv(
  file = "C:/Users/ADMIN/Downloads/UMP SEM 5/STATS MODELING/GROUP ASSIGNMENT/TerpalingLatestCVD.csv"
)

## New names:
## * `` -> ...1

## Rows: 733 Columns: 21

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (21): ...1, TARGET, SEX, AGE, RACE, SMOKER, ASPIRIN, DYSLIPIDEMIA, HYPER...

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Display the imported tibble to check its columns and first rows.
cvd

## # A tibble: 733 x 21
##     ...1 TARGET   SEX   AGE  RACE SMOKER ASPIRIN DYSLIPIDEMIA HYPERTENSION
##    <dbl>  <dbl> <dbl> <dbl> <dbl>  <dbl>   <dbl>        <dbl>        <dbl>
##  1     0      1     0    53     3      1       0            0            1
##  2     1      0     1    52     2      1       0            0            0
##  3     2      0     1    62     2      1       1            1            1
##  4     3      1     1    68     2      1       1            1            1
##  5     4      1     1    53     2      1       0            1            1
##  6     5      0     1    53     2      1       1            0            1
##  7     6      0     1    63     2      1       0            1            1
##  8     7      0     0    63     2      0       0            0            1
##  9     8      1     1    50     0      0       1            1            0
## 10     9      1     1    59     2      1       1            0            1
## # ... with 723 more rows, and 12 more variables: DIABETES <dbl>, OHA <dbl>,
## #   INSULIN <dbl>, NONTHERAPY <dbl>, PREMCVD <dbl>, HEARTFAIL <dbl>,
## #   HEARTRATE <dbl>, SYSTOLIC <dbl>, DIASTOLIC <dbl>, WAIST <dbl>, HIP <dbl>,
## #   BMI <dbl>

# Per-column minimum, quartiles, mean and maximum of the data set.
summary(cvd)

##       ...1         TARGET            SEX              AGE             RACE     
##  Min.   :  0   Min.   :0.0000   Min.   :0.0000   Min.   :23.00   Min.   :0.00  
##  1st Qu.:183   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:50.00   1st Qu.:2.00  
##  Median :366   Median :0.0000   Median :1.0000   Median :58.00   Median :2.00  
##  Mean   :366   Mean   :0.3056   Mean   :0.8377   Mean   :57.61   Mean   :1.73  
##  3rd Qu.:549   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:66.00   3rd Qu.:2.00  
##  Max.   :732   Max.   :1.0000   Max.   :1.0000   Max.   :88.00   Max.   :3.00  
##      SMOKER         ASPIRIN       DYSLIPIDEMIA     HYPERTENSION   
##  Min.   :0.000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.000   Median :0.000   Median :0.0000   Median :1.0000  
##  Mean   :0.708   Mean   :0.322   Mean   :0.4379   Mean   :0.6603  
##  3rd Qu.:1.000   3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##     DIABETES          OHA            INSULIN          NONTHERAPY     
##  Min.   :0.000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.000   Median :0.0000   Median :0.00000   Median :0.00000  
##  Mean   :0.382   Mean   :0.2142   Mean   :0.08458   Mean   :0.07776  
##  3rd Qu.:1.000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000  
##     PREMCVD         HEARTFAIL         HEARTRATE         SYSTOLIC    
##  Min.   :0.0000   Min.   :0.00000   Min.   :  1.00   Min.   : 60.0  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.: 70.00   1st Qu.:115.0  
##  Median :0.0000   Median :0.00000   Median : 81.00   Median :131.0  
##  Mean   :0.2838   Mean   :0.05593   Mean   : 83.07   Mean   :134.3  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.: 95.00   3rd Qu.:148.0  
##  Max.   :1.0000   Max.   :1.00000   Max.   :197.00   Max.   :251.0  
##    DIASTOLIC          WAIST             HIP              BMI       
##  Min.   : 40.00   Min.   : 33.00   Min.   : 40.00   Min.   :1.000  
##  1st Qu.: 70.00   1st Qu.: 88.00   1st Qu.: 91.00   1st Qu.:2.000  
##  Median : 80.00   Median : 91.87   Median : 94.00   Median :3.000  
##  Mean   : 80.68   Mean   : 91.89   Mean   : 94.28   Mean   :2.802  
##  3rd Qu.: 90.00   3rd Qu.: 95.00   3rd Qu.: 99.00   3rd Qu.:3.000  
##  Max.   :162.00   Max.   :135.00   Max.   :149.00   Max.   :4.000

FULL MODEL

# Full logistic regression: TARGET modelled on all 19 candidate predictors.
# NOTE(review): RACE (0-3) and BMI (1-4) look like category codes but enter
# the model as numeric terms -- confirm whether factor() was intended.
cvd_model <- glm(
  formula = TARGET ~ SEX + AGE + RACE + SMOKER + ASPIRIN + DYSLIPIDEMIA +
    HYPERTENSION + DIABETES + OHA + INSULIN + NONTHERAPY + PREMCVD +
    HEARTFAIL + HEARTRATE + SYSTOLIC + DIASTOLIC + WAIST + HIP + BMI,
  family = "binomial",
  data = cvd
)
summary(cvd_model)

## 
## Call:
## glm(formula = TARGET ~ SEX + AGE + RACE + SMOKER + ASPIRIN + 
##     DYSLIPIDEMIA + HYPERTENSION + DIABETES + OHA + INSULIN + 
##     NONTHERAPY + PREMCVD + HEARTFAIL + HEARTRATE + SYSTOLIC + 
##     DIASTOLIC + WAIST + HIP + BMI, family = "binomial", data = cvd)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3454  -0.6484  -0.3282   0.6416   2.6432  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.137592   1.290955  -0.107 0.915121    
## SEX          -0.686904   0.320765  -2.141 0.032237 *  
## AGE          -0.002218   0.010104  -0.220 0.826238    
## RACE         -0.024033   0.156109  -0.154 0.877651    
## SMOKER       -0.038734   0.280162  -0.138 0.890039    
## ASPIRIN       1.614769   0.233005   6.930  4.2e-12 ***
## DYSLIPIDEMIA  0.709931   0.217771   3.260 0.001114 ** 
## HYPERTENSION  0.967064   0.269800   3.584 0.000338 ***
## DIABETES     -0.124267   0.467902  -0.266 0.790560    
## OHA          -0.564985   0.463592  -1.219 0.222954    
## INSULIN      -0.390161   0.489568  -0.797 0.425481    
## NONTHERAPY   -0.620711   0.579945  -1.070 0.284488    
## PREMCVD       2.195430   0.237516   9.243  < 2e-16 ***
## HEARTFAIL     0.744696   0.465948   1.598 0.109990    
## HEARTRATE    -0.006420   0.005554  -1.156 0.247657    
## SYSTOLIC     -0.006602   0.005996  -1.101 0.270836    
## DIASTOLIC    -0.002204   0.009795  -0.225 0.821982    
## WAIST        -0.039065   0.019731  -1.980 0.047717 *  
## HIP           0.030461   0.018121   1.681 0.092778 .  
## BMI           0.049006   0.161962   0.303 0.762212    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 902.37  on 732  degrees of freedom
## Residual deviance: 625.75  on 713  degrees of freedom
## AIC: 665.75
## 
## Number of Fisher Scoring iterations: 5

Null deviance: 902.37 on 732 degrees of freedom
Residual deviance: 625.75 on 713 degrees of freedom
Chi = Null deviance - Residual deviance
Chi = 902.37 - 625.75
Chi = 276.62
The test has p = 19 degrees of freedom (732 - 713), one for each predictor variable in the model.

# Upper-tail p-value of the likelihood-ratio statistic (null deviance minus
# residual deviance) on 19 degrees of freedom. Requesting the upper tail
# directly avoids the cancellation in `1 - pchisq()`, which rounds to exactly
# 0 for a statistic this large.
pchisq(276.62, df = 19, lower.tail = FALSE)

## [1] 0

We use the chi-square statistic and its degrees of freedom to find the p-value, which is effectively zero (p < 0.001). Since this p-value is less than 0.05, we can conclude that our model as a whole fits significantly better than an empty (intercept-only) model.

# Coefficient estimates of the full model, rounded to four decimal places.
# The result is stored under the name `coef`, the same name as the extractor
# function called on the right-hand side.
coef <- round(coef(cvd_model), digits = 4)
coef

##  (Intercept)          SEX          AGE         RACE       SMOKER      ASPIRIN 
##      -0.1376      -0.6869      -0.0022      -0.0240      -0.0387       1.6148 
## DYSLIPIDEMIA HYPERTENSION     DIABETES          OHA      INSULIN   NONTHERAPY 
##       0.7099       0.9671      -0.1243      -0.5650      -0.3902      -0.6207 
##      PREMCVD    HEARTFAIL    HEARTRATE     SYSTOLIC    DIASTOLIC        WAIST 
##       2.1954       0.7447      -0.0064      -0.0066      -0.0022      -0.0391 
##          HIP          BMI 
##       0.0305       0.0490

Feature selection using Variable Importance from Machine Learning Algorithms

library(caret)

## Warning: package 'caret' was built under R version 4.0.5

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.0.5

## Loading required package: lattice

# Variable importance of each predictor in the fitted full model, as reported
# by caret::varImp(). The values printed below match the absolute `z value`
# column of summary(cvd_model).
rpartImp <- as.data.frame(varImp(cvd_model))
# Keep the score and the predictor name as ordinary columns.
rpartImp <- data.frame(overall = rpartImp$Overall, names = rownames(rpartImp))
# Show the predictors from most to least important. `TRUE` is spelled out
# because `T` is an ordinary variable that can be reassigned.
rpartImp[order(rpartImp$overall, decreasing = TRUE), ]

##      overall        names
## 12 9.2433042      PREMCVD
## 5  6.9301934      ASPIRIN
## 7  3.5843713 HYPERTENSION
## 6  3.2599824 DYSLIPIDEMIA
## 1  2.1414544          SEX
## 17 1.9798772        WAIST
## 18 1.6809212          HIP
## 13 1.5982375    HEARTFAIL
## 9  1.2187120          OHA
## 14 1.1560591    HEARTRATE
## 15 1.1011398     SYSTOLIC
## 11 1.0702922   NONTHERAPY
## 10 0.7969488      INSULIN
## 19 0.3025769          BMI
## 8  0.2655836     DIABETES
## 16 0.2249968    DIASTOLIC
## 2  0.2195287          AGE
## 3  0.1539472         RACE
## 4  0.1382545       SMOKER

# Print the same importance table in its original (unsorted) row order.
print(rpartImp)

##      overall        names
## 1  2.1414544          SEX
## 2  0.2195287          AGE
## 3  0.1539472         RACE
## 4  0.1382545       SMOKER
## 5  6.9301934      ASPIRIN
## 6  3.2599824 DYSLIPIDEMIA
## 7  3.5843713 HYPERTENSION
## 8  0.2655836     DIABETES
## 9  1.2187120          OHA
## 10 0.7969488      INSULIN
## 11 1.0702922   NONTHERAPY
## 12 9.2433042      PREMCVD
## 13 1.5982375    HEARTFAIL
## 14 1.1560591    HEARTRATE
## 15 1.1011398     SYSTOLIC
## 16 0.2249968    DIASTOLIC
## 17 1.9798772        WAIST
## 18 1.6809212          HIP
## 19 0.3025769          BMI

NEW MODEL

# Reduced logistic regression keeping the ten predictors that rank highest in
# the variable-importance table.
cvd_newmodel <- glm(
  formula = TARGET ~ SEX + WAIST + HIP + ASPIRIN + DYSLIPIDEMIA +
    HYPERTENSION + PREMCVD + OHA + HEARTRATE + HEARTFAIL,
  family = "binomial",
  data = cvd
)
summary(cvd_newmodel)

## 
## Call:
## glm(formula = TARGET ~ SEX + WAIST + HIP + ASPIRIN + DYSLIPIDEMIA + 
##     HYPERTENSION + PREMCVD + OHA + HEARTRATE + HEARTFAIL, family = "binomial", 
##     data = cvd)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3172  -0.6532  -0.3350   0.6579   2.6574  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.029884   0.916651  -1.124 0.261213    
## SEX          -0.563329   0.263503  -2.138 0.032529 *  
## WAIST        -0.041946   0.019267  -2.177 0.029469 *  
## HIP           0.032673   0.017867   1.829 0.067453 .  
## ASPIRIN       1.496188   0.224156   6.675 2.48e-11 ***
## DYSLIPIDEMIA  0.682370   0.214233   3.185 0.001447 ** 
## HYPERTENSION  0.860312   0.257732   3.338 0.000844 ***
## PREMCVD       2.231218   0.224387   9.944  < 2e-16 ***
## OHA          -0.567443   0.255265  -2.223 0.026219 *  
## HEARTRATE    -0.009741   0.005275  -1.847 0.064766 .  
## HEARTFAIL     0.740330   0.458685   1.614 0.106522    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 902.37  on 732  degrees of freedom
## Residual deviance: 633.73  on 722  degrees of freedom
## AIC: 655.73
## 
## Number of Fisher Scoring iterations: 5

Null deviance: 902.37 on 732 degrees of freedom
Residual deviance: 633.73 on 722 degrees of freedom
Chi = Null deviance - Residual deviance
Chi = 902.37 - 633.73
Chi = 268.64
The test has p = 10 degrees of freedom (732 - 722), one for each predictor variable in the model.

# Upper-tail p-value of the likelihood-ratio statistic (null deviance minus
# residual deviance) on 10 degrees of freedom. Requesting the upper tail
# directly avoids the cancellation in `1 - pchisq()`, which rounds to exactly
# 0 for a statistic this large.
pchisq(268.64, df = 10, lower.tail = FALSE)

## [1] 0

We use the chi-square statistic and its degrees of freedom to find the p-value, which is effectively zero (p < 0.001). Since this p-value is less than 0.05, we can conclude that our new model as a whole fits significantly better than an empty (intercept-only) model.

LACK OF FIT TEST

# Aliases matching the names used in the hypotheses:
# Model1 is the full model, Model2 is the reduced model.
Model1 <- cvd_model
Model2 <- cvd_newmodel
# NOTE(review): no glmnet function is called in the visible code; confirm
# whether this package load is still needed.
library(glmnet)

## Warning: package 'glmnet' was built under R version 4.0.5

## Loading required package: Matrix

## Loaded glmnet 4.1-3

# Likelihood-ratio (analysis of deviance) test of the full model (Model1)
# against the nested reduced model (Model2).
anova(Model1, Model2, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: TARGET ~ SEX + AGE + RACE + SMOKER + ASPIRIN + DYSLIPIDEMIA + 
##     HYPERTENSION + DIABETES + OHA + INSULIN + NONTHERAPY + PREMCVD + 
##     HEARTFAIL + HEARTRATE + SYSTOLIC + DIASTOLIC + WAIST + HIP + 
##     BMI
## Model 2: TARGET ~ SEX + WAIST + HIP + ASPIRIN + DYSLIPIDEMIA + HYPERTENSION + 
##     PREMCVD + OHA + HEARTRATE + HEARTFAIL
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       713     625.75                     
## 2       722     633.73 -9   -7.983   0.5359

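H0 : the reduced model (Model2) fits as well as the full model (Model1), i.e. the coefficients of the nine dropped predictors are all zero
H1 : the full model (Model1) fits significantly better than the reduced model (Model2)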

Based on the table, the corresponding p-value is 0.5359.
Since this p-value is greater than 0.05, we fail to reject the null hypothesis of the test and conclude that the reduced model (Model2) fits as well as the full model (Model1).

Thus, the full model (Model1) does not offer a statistically significantly better fit than the reduced model (Model2).

Because of that, we will use the simpler reduced model (Model2), which also has the lower AIC (655.73 versus 665.75 for the full model).

CVD GROUP PROJECT

1/23/2022

FULL MODEL

Feature selection using Variable Importance from Machine Learning Algorithms

NEW MODEL

LACK OF FIT TEST