#import data
library(readr)
## Warning: package 'readr' was built under R version 4.0.5
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the CVD dataset from a CSV export. The path is absolute and specific to
# the author's machine, so it has to be adjusted before running elsewhere.
# The file's first column has no header, so readr names it `...1`.
cvd <- readr::read_csv(
  file = "C:/Users/ADMIN/Downloads/UMP SEM 5/STATS MODELING/GROUP ASSIGNMENT/TerpalingLatestCVD.csv"
)
## New names:
## * `` -> ...1
## Rows: 733 Columns: 21
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (21): ...1, TARGET, SEX, AGE, RACE, SMOKER, ASPIRIN, DYSLIPIDEMIA, HYPER...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the tibble to inspect its dimensions, column types and first rows.
cvd
## # A tibble: 733 x 21
## ...1 TARGET SEX AGE RACE SMOKER ASPIRIN DYSLIPIDEMIA HYPERTENSION
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1 0 53 3 1 0 0 1
## 2 1 0 1 52 2 1 0 0 0
## 3 2 0 1 62 2 1 1 1 1
## 4 3 1 1 68 2 1 1 1 1
## 5 4 1 1 53 2 1 0 1 1
## 6 5 0 1 53 2 1 1 0 1
## 7 6 0 1 63 2 1 0 1 1
## 8 7 0 0 63 2 0 0 0 1
## 9 8 1 1 50 0 0 1 1 0
## 10 9 1 1 59 2 1 1 0 1
## # ... with 723 more rows, and 12 more variables: DIABETES <dbl>, OHA <dbl>,
## # INSULIN <dbl>, NONTHERAPY <dbl>, PREMCVD <dbl>, HEARTFAIL <dbl>,
## # HEARTRATE <dbl>, SYSTOLIC <dbl>, DIASTOLIC <dbl>, WAIST <dbl>, HIP <dbl>,
## # BMI <dbl>
# Per-column summary statistics (min, quartiles, mean, max) for the dataset.
summary(cvd)
## ...1 TARGET SEX AGE RACE
## Min. : 0 Min. :0.0000 Min. :0.0000 Min. :23.00 Min. :0.00
## 1st Qu.:183 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:50.00 1st Qu.:2.00
## Median :366 Median :0.0000 Median :1.0000 Median :58.00 Median :2.00
## Mean :366 Mean :0.3056 Mean :0.8377 Mean :57.61 Mean :1.73
## 3rd Qu.:549 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:66.00 3rd Qu.:2.00
## Max. :732 Max. :1.0000 Max. :1.0000 Max. :88.00 Max. :3.00
## SMOKER ASPIRIN DYSLIPIDEMIA HYPERTENSION
## Min. :0.000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.000 Median :0.000 Median :0.0000 Median :1.0000
## Mean :0.708 Mean :0.322 Mean :0.4379 Mean :0.6603
## 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.000 Max. :1.000 Max. :1.0000 Max. :1.0000
## DIABETES OHA INSULIN NONTHERAPY
## Min. :0.000 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.000 Median :0.0000 Median :0.00000 Median :0.00000
## Mean :0.382 Mean :0.2142 Mean :0.08458 Mean :0.07776
## 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.000 Max. :1.0000 Max. :1.00000 Max. :1.00000
## PREMCVD HEARTFAIL HEARTRATE SYSTOLIC
## Min. :0.0000 Min. :0.00000 Min. : 1.00 Min. : 60.0
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.: 70.00 1st Qu.:115.0
## Median :0.0000 Median :0.00000 Median : 81.00 Median :131.0
## Mean :0.2838 Mean :0.05593 Mean : 83.07 Mean :134.3
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.: 95.00 3rd Qu.:148.0
## Max. :1.0000 Max. :1.00000 Max. :197.00 Max. :251.0
## DIASTOLIC WAIST HIP BMI
## Min. : 40.00 Min. : 33.00 Min. : 40.00 Min. :1.000
## 1st Qu.: 70.00 1st Qu.: 88.00 1st Qu.: 91.00 1st Qu.:2.000
## Median : 80.00 Median : 91.87 Median : 94.00 Median :3.000
## Mean : 80.68 Mean : 91.89 Mean : 94.28 Mean :2.802
## 3rd Qu.: 90.00 3rd Qu.: 95.00 3rd Qu.: 99.00 3rd Qu.:3.000
## Max. :162.00 Max. :135.00 Max. :149.00 Max. :4.000
# Full logistic regression of TARGET on all 19 candidate predictors.
# Every predictor enters as a numeric term because read_csv() parsed all
# columns as doubles.
# NOTE(review): RACE (0-3) and BMI (1-4) look like category codes in the
# summary above -- confirm whether they should be modelled as factors.
cvd_model <- glm(
  TARGET ~
    SEX + AGE + RACE + SMOKER +
    ASPIRIN + DYSLIPIDEMIA + HYPERTENSION +
    DIABETES + OHA + INSULIN + NONTHERAPY +
    PREMCVD + HEARTFAIL +
    HEARTRATE + SYSTOLIC + DIASTOLIC +
    WAIST + HIP + BMI,
  family = "binomial",
  data = cvd
)

# Coefficient table, deviances and AIC for the full model.
summary(cvd_model)
##
## Call:
## glm(formula = TARGET ~ SEX + AGE + RACE + SMOKER + ASPIRIN +
## DYSLIPIDEMIA + HYPERTENSION + DIABETES + OHA + INSULIN +
## NONTHERAPY + PREMCVD + HEARTFAIL + HEARTRATE + SYSTOLIC +
## DIASTOLIC + WAIST + HIP + BMI, family = "binomial", data = cvd)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3454 -0.6484 -0.3282 0.6416 2.6432
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.137592 1.290955 -0.107 0.915121
## SEX -0.686904 0.320765 -2.141 0.032237 *
## AGE -0.002218 0.010104 -0.220 0.826238
## RACE -0.024033 0.156109 -0.154 0.877651
## SMOKER -0.038734 0.280162 -0.138 0.890039
## ASPIRIN 1.614769 0.233005 6.930 4.2e-12 ***
## DYSLIPIDEMIA 0.709931 0.217771 3.260 0.001114 **
## HYPERTENSION 0.967064 0.269800 3.584 0.000338 ***
## DIABETES -0.124267 0.467902 -0.266 0.790560
## OHA -0.564985 0.463592 -1.219 0.222954
## INSULIN -0.390161 0.489568 -0.797 0.425481
## NONTHERAPY -0.620711 0.579945 -1.070 0.284488
## PREMCVD 2.195430 0.237516 9.243 < 2e-16 ***
## HEARTFAIL 0.744696 0.465948 1.598 0.109990
## HEARTRATE -0.006420 0.005554 -1.156 0.247657
## SYSTOLIC -0.006602 0.005996 -1.101 0.270836
## DIASTOLIC -0.002204 0.009795 -0.225 0.821982
## WAIST -0.039065 0.019731 -1.980 0.047717 *
## HIP 0.030461 0.018121 1.681 0.092778 .
## BMI 0.049006 0.161962 0.303 0.762212
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 902.37 on 732 degrees of freedom
## Residual deviance: 625.75 on 713 degrees of freedom
## AIC: 665.75
##
## Number of Fisher Scoring iterations: 5
Null deviance: 902.37 on 732 degrees of freedom
Residual deviance: 625.75 on 713 degrees of freedom
Chi-square statistic = Null deviance - Residual deviance
Chi-square statistic = 902.37 - 625.75
Chi-square statistic = 276.62
The test has p = 19 (732 - 713) degrees of freedom, one for each predictor variable in the model.
# Model chi-square (likelihood-ratio) p-value for the full model: the deviance
# drop 902.37 - 625.75 = 276.62 on 732 - 713 = 19 degrees of freedom.
# The upper tail is requested directly: `1 - pchisq(...)` cancels to exactly 0
# in double precision (the `[1] 0` recorded below), whereas lower.tail = FALSE
# returns the tiny but non-zero tail probability.
pchisq(276.62, df = 19, lower.tail = FALSE)
## [1] 0
# Fitted coefficients of the full model, rounded to four decimals for reporting.
# NOTE(review): the variable shares its name with the stats::coef() function;
# the name is kept because code outside this section may rely on it.
coef <- round(stats::coef(cvd_model), digits = 4)
coef
## (Intercept) SEX AGE RACE SMOKER ASPIRIN
## -0.1376 -0.6869 -0.0022 -0.0240 -0.0387 1.6148
## DYSLIPIDEMIA HYPERTENSION DIABETES OHA INSULIN NONTHERAPY
## 0.7099 0.9671 -0.1243 -0.5650 -0.3902 -0.6207
## PREMCVD HEARTFAIL HEARTRATE SYSTOLIC DIASTOLIC WAIST
## 2.1954 0.7447 -0.0064 -0.0066 -0.0022 -0.0391
## HIP BMI
## 0.0305 0.0490
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
## Loading required package: lattice
# Variable importance for the full logistic model. The scores match the
# absolute values of the `z value` column in summary(cvd_model).
rpartImp <- as.data.frame(varImp(cvd_model))
rpartImp <- data.frame(overall = rpartImp$Overall, names = rownames(rpartImp))
# Show the predictors ranked from most to least important. The sorted copy is
# only printed, not stored, so `rpartImp` itself stays in model-term order.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
rpartImp[order(rpartImp$overall, decreasing = TRUE), ]
## overall names
## 12 9.2433042 PREMCVD
## 5 6.9301934 ASPIRIN
## 7 3.5843713 HYPERTENSION
## 6 3.2599824 DYSLIPIDEMIA
## 1 2.1414544 SEX
## 17 1.9798772 WAIST
## 18 1.6809212 HIP
## 13 1.5982375 HEARTFAIL
## 9 1.2187120 OHA
## 14 1.1560591 HEARTRATE
## 15 1.1011398 SYSTOLIC
## 11 1.0702922 NONTHERAPY
## 10 0.7969488 INSULIN
## 19 0.3025769 BMI
## 8 0.2655836 DIABETES
## 16 0.2249968 DIASTOLIC
## 2 0.2195287 AGE
## 3 0.1539472 RACE
## 4 0.1382545 SMOKER
# Print the importance table in model-term order (the sorted view printed
# earlier was not assigned back to `rpartImp`).
print(rpartImp)
## overall names
## 1 2.1414544 SEX
## 2 0.2195287 AGE
## 3 0.1539472 RACE
## 4 0.1382545 SMOKER
## 5 6.9301934 ASPIRIN
## 6 3.2599824 DYSLIPIDEMIA
## 7 3.5843713 HYPERTENSION
## 8 0.2655836 DIABETES
## 9 1.2187120 OHA
## 10 0.7969488 INSULIN
## 11 1.0702922 NONTHERAPY
## 12 9.2433042 PREMCVD
## 13 1.5982375 HEARTFAIL
## 14 1.1560591 HEARTRATE
## 15 1.1011398 SYSTOLIC
## 16 0.2249968 DIASTOLIC
## 17 1.9798772 WAIST
## 18 1.6809212 HIP
## 19 0.3025769 BMI
# Reduced logistic regression that keeps the ten predictors ranked highest in
# the importance table.
cvd_newmodel <- glm(
  TARGET ~
    SEX + WAIST + HIP +
    ASPIRIN + DYSLIPIDEMIA + HYPERTENSION +
    PREMCVD + OHA + HEARTRATE + HEARTFAIL,
  family = "binomial",
  data = cvd
)

# Coefficient table, deviances and AIC for the reduced model.
summary(cvd_newmodel)
##
## Call:
## glm(formula = TARGET ~ SEX + WAIST + HIP + ASPIRIN + DYSLIPIDEMIA +
## HYPERTENSION + PREMCVD + OHA + HEARTRATE + HEARTFAIL, family = "binomial",
## data = cvd)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3172 -0.6532 -0.3350 0.6579 2.6574
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.029884 0.916651 -1.124 0.261213
## SEX -0.563329 0.263503 -2.138 0.032529 *
## WAIST -0.041946 0.019267 -2.177 0.029469 *
## HIP 0.032673 0.017867 1.829 0.067453 .
## ASPIRIN 1.496188 0.224156 6.675 2.48e-11 ***
## DYSLIPIDEMIA 0.682370 0.214233 3.185 0.001447 **
## HYPERTENSION 0.860312 0.257732 3.338 0.000844 ***
## PREMCVD 2.231218 0.224387 9.944 < 2e-16 ***
## OHA -0.567443 0.255265 -2.223 0.026219 *
## HEARTRATE -0.009741 0.005275 -1.847 0.064766 .
## HEARTFAIL 0.740330 0.458685 1.614 0.106522
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 902.37 on 732 degrees of freedom
## Residual deviance: 633.73 on 722 degrees of freedom
## AIC: 655.73
##
## Number of Fisher Scoring iterations: 5
Null deviance: 902.37 on 732 degrees of freedom
Residual deviance: 633.73 on 722 degrees of freedom
Chi-square statistic = Null deviance - Residual deviance
Chi-square statistic = 902.37 - 633.73
Chi-square statistic = 268.64
The test has p = 10 (732 - 722) degrees of freedom, one for each predictor variable in the model.
# Model chi-square (likelihood-ratio) p-value for the reduced model: the
# deviance drop 902.37 - 633.73 = 268.64 on 732 - 722 = 10 degrees of freedom.
# The upper tail is requested directly: `1 - pchisq(...)` cancels to exactly 0
# in double precision (the `[1] 0` recorded below), whereas lower.tail = FALSE
# returns the tiny but non-zero tail probability.
pchisq(268.64, df = 10, lower.tail = FALSE)
## [1] 0
# Aliases for the two fitted models: Model1 is the full model (19 predictors),
# Model2 is the reduced model (10 predictors).
Model1 <- cvd_model
Model2 <- cvd_newmodel
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.0.5
## Loading required package: Matrix
## Loaded glmnet 4.1-3
# Analysis-of-deviance (chi-square) comparison of the full and reduced models.
# The full model is listed first, so the table reports the change as negative
# (Df = -9, Deviance = -7.983).
anova(cvd_model, cvd_newmodel, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: TARGET ~ SEX + AGE + RACE + SMOKER + ASPIRIN + DYSLIPIDEMIA +
## HYPERTENSION + DIABETES + OHA + INSULIN + NONTHERAPY + PREMCVD +
## HEARTFAIL + HEARTRATE + SYSTOLIC + DIASTOLIC + WAIST + HIP +
## BMI
## Model 2: TARGET ~ SEX + WAIST + HIP + ASPIRIN + DYSLIPIDEMIA + HYPERTENSION +
## PREMCVD + OHA + HEARTRATE + HEARTFAIL
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 713 625.75
## 2 722 633.73 -9 -7.983 0.5359