This study investigates the relationship between chronic kidney disease (CKD) and blood pressure, and whether other potential risk factors, such as red blood cells and potassium, affect the association between chronic kidney disease and blood pressure.
Null hypothesis: there is no association between a one-unit increase in blood pressure and chronic kidney disease risk (significance level 0.05).
Alternative hypothesis: there is an association between a one-unit increase in blood pressure and chronic kidney disease risk (significance level 0.05).
rm(list = ls())
options(warn = 1) # print warnings as they occur
options(digits = 4)
# Point package installs at a CRAN mirror
r <- getOption("repos")
r["CRAN"] <- "http://cran.us.r-project.org"
options(repos = r)
packs <- c(
  "sf", "date", "plyr", "dplyr", "incidence", "ggplot2", "tidyr", "earlyR",
  "readr", "scales", "VIM", "caret", "RANN", "mice"
)
if (!require("pacman"))
  install.packages("pacman")
## Loading required package: pacman
## Warning: package 'pacman' was built under R version 4.1.3
pacman::p_load(char = packs)
data <- read_csv("data.csv")
## Rows: 400 Columns: 25
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (11): class, rbc, pc, pcc, ba, htn, dm, cad, appet, pe, ane
## dbl (14): age, bp, sg, al, su, bgr, bu, sc, sod, pot, hemo, pcv, wc, rc
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
data <- data %>% select(class, rbc, bp, sg, pot, hemo, wc, age, sod)
# Recode the outcome and red blood cell status as 0/1 indicators
data$class <- ifelse(data$class == "ckd", 1, 0)
data$rbc <- ifelse(data$rbc == "normal", 1, 0)
head(data)
names(data)
## [1] "class" "rbc" "bp" "sg" "pot" "hemo" "wc" "age" "sod"
# Find missing values (NAs) in the dataset
summary(data)
## class rbc bp sg pot
## Min. :0.000 Min. :0.00 Min. : 50 Min. :1.00 Min. : 2.50
## 1st Qu.:0.000 1st Qu.:1.00 1st Qu.: 70 1st Qu.:1.01 1st Qu.: 3.80
## Median :1.000 Median :1.00 Median : 80 Median :1.02 Median : 4.40
## Mean :0.625 Mean :0.81 Mean : 78 Mean :1.02 Mean : 4.63
## 3rd Qu.:1.000 3rd Qu.:1.00 3rd Qu.: 80 3rd Qu.:1.02 3rd Qu.: 4.90
## Max. :1.000 Max. :1.00 Max. :180 Max. :1.02 Max. :47.00
## NA's :152 NA's :47 NA's :88
## hemo wc age sod
## Min. : 3.1 Min. : 2200 Min. : 2.0 Min. : 4.5
## 1st Qu.:10.3 1st Qu.: 6500 1st Qu.:41.8 1st Qu.:135.0
## Median :12.6 Median : 8000 Median :54.0 Median :138.0
## Mean :12.5 Mean : 8406 Mean :51.2 Mean :137.5
## 3rd Qu.:15.0 3rd Qu.: 9800 3rd Qu.:64.0 3rd Qu.:142.0
## Max. :17.8 Max. :26400 Max. :90.0 Max. :163.0
## NA's :52 NA's :106 NA's :87
# How many NAs are in the data?
sum(is.na(as.matrix(data)))
## [1] 532
colSums(is.na(data))
## class rbc bp sg pot hemo wc age sod
## 0 152 0 47 88 52 106 0 87
# Proportion of rows without any missing values
nrow(data[complete.cases(data), ]) / nrow(data)
## [1] 0.46
# Base graphics: distribution of blood pressure
hist(data$bp, main = "Distribution of Blood pressure", ylab = "Frequency", xlab = "Blood pressure")
boxplot(data$bp, horizontal = TRUE)
# Display the missing-data pattern (VIM::aggr)
aggr(data, numbers = TRUE, prop = c(TRUE, FALSE))
# Impute missing values with k-nearest neighbours (caret).
# knnImpute centres and scales the data, so the imputed values are
# transformed back to the original units with the stored means and SDs.
preProcValues <- preProcess(as.data.frame(data), method = "knnImpute")
data_new <- predict(preProcValues, newdata = data)
procNames <- data.frame(col = names(preProcValues$mean),
                        mean = preProcValues$mean, sd = preProcValues$std)
for (i in procNames$col) {
  data[i] <- data_new[i] * preProcValues$std[i] + preProcValues$mean[i]
}
aggr(data, numbers = TRUE, prop = c(TRUE, FALSE))
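As a quick sanity check (a minimal sketch), confirm that the imputation left no missing values before writing the data back out:
# Every NA should now be imputed
stopifnot(!anyNA(data))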
# Save the imputed data. Note: write.csv() also writes row names, which
# come back as an unnamed index column (`...1`) when the file is re-read;
# row.names = FALSE would avoid this.
write.csv(x = data, file = "data_update.csv")
data <- read_csv("data_update.csv")
## New names:
## * `` -> ...1
## Rows: 400 Columns: 10
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (10): ...1, class, rbc, bp, sg, pot, hemo, wc, age, sod
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Remove outliers flagged by the boxplot (Tukey 1.5 * IQR) rule
outliers <- boxplot.stats(data$bp )$out
outliers
## [1] 50 50 50 50 50 100 100 100 100 100 100 100 100 100 100 100 100 100 100
## [20] 100 100 100 100 100 100 100 100 100 100 100 110 110 110 120 140 180 130 130
## [39] 160 150 150 150 160 140 140
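boxplot.stats() flags values outside Tukey's fences, Q1 - 1.5 * IQR and Q3 + 1.5 * IQR; as a minimal sketch, the same cutoffs can be computed by hand before the filtering below:
# Tukey's fences for blood pressure; values outside them are the outliers above
q1 <- quantile(data$bp, 0.25, names = FALSE)
q3 <- quantile(data$bp, 0.75, names = FALSE)
iqr <- q3 - q1
c(lower = q1 - 1.5 * iqr, upper = q3 + 1.5 * iqr)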
data <- data[-which(data$bp %in% outliers), ]
boxplot(data$bp, horizontal = TRUE)
library(dplyr) # already loaded above; provides %>%, filter() and select()
# Random 80/20 train/test split
set.seed(12345)
subsample <- runif(nrow(data))
data$type <- ifelse(subsample < 0.80, "train", "test")
train <- data %>% filter(type == "train") %>% select(-type)
test <- data %>% filter(type == "test") %>% select(-type)
dim(train)
## [1] 273 10
dim(test)
## [1] 82 10
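A plain runif() split does not guarantee the train and test sets share the same CKD proportion. A stratified alternative (a sketch using caret's createDataPartition(), left commented out so the split above is preserved):
# Stratified 80/20 split that preserves the class ratio
# idx <- createDataPartition(factor(data$class), p = 0.8, list = FALSE)
# train_strat <- data[idx, ]
# test_strat <- data[-idx, ]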
# Optional: compare blood pressure densities by CKD status
# a <- subset(data, data$class == 1)
# b <- subset(data, data$class == 0)
# # the red line indicates those with the disease
# plot(density(a$bp), col = "red", main = "Comparing blood pressure distributions", xlab = "Blood pressure")
# lines(density(b$bp))
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.1.3
CrossTable(train$class, train$bp, chisq = TRUE, missing.include = TRUE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 273
##
##
## | train$bp
## train$class | 60 | 70 | 80 | 90 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 0 | 37 | 29 | 49 | 0 | 115 |
## | 7.027 | 1.017 | 3.243 | 18.114 | |
## | 0.322 | 0.252 | 0.426 | 0.000 | 0.421 |
## | 0.649 | 0.349 | 0.544 | 0.000 | |
## | 0.136 | 0.106 | 0.179 | 0.000 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 1 | 20 | 54 | 41 | 43 | 158 |
## | 5.114 | 0.740 | 2.360 | 13.184 | |
## | 0.127 | 0.342 | 0.259 | 0.272 | 0.579 |
## | 0.351 | 0.651 | 0.456 | 1.000 | |
## | 0.073 | 0.198 | 0.150 | 0.158 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 57 | 83 | 90 | 43 | 273 |
## | 0.209 | 0.304 | 0.330 | 0.158 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 50.8 d.f. = 3 p = 5.4e-11
##
##
##
model <- glm(class ~ bp, family = "binomial", data = train)
# glm() uses the same model-formula syntax as linear regression.
# family specifies the distribution of the outcome variable; for binary
# data the binomial distribution is used, with the logit link as the
# default transformation.
# The output includes the regression coefficients with their z-statistics
# and p-values; the dispersion parameter relates to the variance of the
# response variable.
summary(model)
##
## Call:
## glm(formula = class ~ bp, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.484 -1.210 0.689 1.145 1.416
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.2710 1.0021 -4.26 2.0e-05 ***
## bp 0.0621 0.0135 4.59 4.5e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 371.66 on 272 degrees of freedom
## Residual deviance: 348.67 on 271 degrees of freedom
## AIC: 352.7
##
## Number of Fisher Scoring iterations: 4
### Generate the 95% CI
confint(model)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -6.28206 -2.34369
## bp 0.03614 0.08932
### Exponentiate the coefficients
exp(coef(model)) ### Odds ratio
## (Intercept) bp
## 0.01397 1.06407
exp(confint(model)) ### 95% CI (odds ratio)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.00187 0.09597
## bp 1.03680 1.09343
# Calculate the p-value of the overall chi-square (likelihood ratio) statistic:
# X2 = Null deviance - Residual deviance, with df = Null df - Residual df
1-pchisq(371.66-348.67, 272-271)
## [1] 1.628e-06
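The same likelihood ratio test can be computed from the fitted model object directly, avoiding hand-copied deviances (a minimal sketch):
# LRT of the fitted model against the intercept-only model
with(model, pchisq(null.deviance - deviance, df.null - df.residual,
                   lower.tail = FALSE))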
# (z = 4.59, P = 0.0000045; 95% CI for the odds ratio = 1.03680 to 1.09343)
# Thus the fitted logistic model for these data is:
# logit( P(chronic kidney disease) ) = -4.2710 + 0.0621 * blood pressure.
# The coefficient for blood pressure, 0.0621, is the log odds ratio for a
# one-unit increase in blood pressure: the log odds of CKD rise by 0.0621,
# so the odds are multiplied by e^0.0621 = 1.064. On average, each
# additional unit of blood pressure raises the odds of CKD by about 6.4%.
# Since the p-value is less than 0.05, we reject the null hypothesis at the
# 0.05 alpha level: a one-unit increase in blood pressure is associated
# with chronic kidney disease risk.
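To put the coefficient on the probability scale, the fitted model can be evaluated at two illustrative blood pressures (a minimal sketch):
# Predicted probability of CKD at bp = 70 versus bp = 80
predict(model, newdata = data.frame(bp = c(70, 80)), type = "response")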
# names(train)
# #use fitted model to predict values of vs
# train$ckd = predict(model, train, type="response")
#
# #plot logistic regression curve
# plot(ckd~bp, data=train, col="steelblue")
# lines(ckd~bp, train, lwd=5)
library(GGally)
## Warning: package 'GGally' was built under R version 4.1.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
#ggpairs(train)
# improved correlation matrix
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.3
## corrplot 0.92 loaded
corrplot(cor(train),
         method = "number",
         type = "lower" # show only the lower triangle
)
### Model prediction: classify as CKD when the fitted probability exceeds 0.5
trn_pred <- ifelse(predict(model, type = "response") > 0.5, 1, 0)
head(trn_pred)
## 1 2 3 4 5 6
## 0 0 0 0 0 0
# Making predictions on the train set.
trn_tab <- table(predicted = trn_pred, actual = train$class)
trn_tab
## actual
## predicted 0 1
## 0 37 20
## 1 78 138
# Making predictions on the test set.
tst_pred <- ifelse(predict(model, newdata = test, type = "response") > 0.5, 1, 0)
tst_tab <- table(predicted = tst_pred, actual = test$class)
tst_tab
## actual
## predicted 0 1
## 0 9 6
## 1 26 41
Perhaps unsurprisingly, the most common metrics for evaluating logistic regression models are error rate and accuracy (which is simply the complement of the error rate). These metrics can be calculated directly from the confusion matrix.
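For example, accuracy can be read straight off the train confusion matrix trn_tab computed above, since the diagonal counts the correct predictions (a minimal sketch):
# Accuracy = correct predictions / all predictions; error rate = 1 - accuracy
sum(diag(trn_tab)) / sum(trn_tab)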
calc_class_err <- function(actual, predicted) {
  mean(actual != predicted)
}
calc_class_err(actual = train$class, predicted = trn_pred)
## [1] 0.359
# Test error rate should be close to train error rate if model is fitted properly.
calc_class_err(actual = test$class, predicted = tst_pred)
## [1] 0.3902
library("pROC")
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:gmodels':
##
## ci
## The following object is masked from 'package:colorspace':
##
## coords
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
test_prob <- predict(model, newdata = test, type = "response")
test_roc <- roc(test$class ~ test_prob, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# AUC is only slightly above 0.5, so blood pressure alone discriminates poorly
as.numeric(test_roc$auc)
## [1] 0.5705
\[\text{Sens} = \text{True Positive Rate} = \frac{\text{TP}}{\text{P}} = \frac{\text{TP}}{\text{TP + FN}}\] \[\text{Spec} = \text{True Negative Rate} = \frac{\text{TN}}{\text{N}} = \frac{\text{TN}}{\text{TN + FP}}\]
\[\text{Prev} = \frac{\text{P}}{\text{Total Obs}}= \frac{\text{TP + FN}}{\text{Total Obs}}\]
Calculations of metrics such as sensitivity, specificity, and prevalence are derived from the confusion matrix. The importance of these (and other) metrics depends on the nature of the data (e.g. lower values may be acceptable if the data are deemed difficult to predict), as well as on the tolerance for each type of misclassification. For example, in a screening setting like this one we may want to bias our predictions toward flagging CKD, accepting more false positives in exchange for fewer missed cases. We must carefully decide whether to prioritize sensitivity or specificity, as in the worked example below.
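As a minimal sketch, sensitivity and specificity can be computed by hand from tst_tab, treating CKD (class = 1) as the positive class:
# Rows of tst_tab are predictions, columns are actuals; CKD (1) is positive
TP <- tst_tab["1", "1"]; FN <- tst_tab["0", "1"]
TN <- tst_tab["0", "0"]; FP <- tst_tab["1", "0"]
TP / (TP + FN) # sensitivity (true positive rate)
TN / (TN + FP) # specificity (true negative rate)
# Note: confusionMatrix() below treats "0" as the positive class by default;
# pass positive = "1" to report the CKD-oriented metrics instead.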
library("caret")
confusionMatrix(trn_tab)
## Confusion Matrix and Statistics
##
## actual
## predicted 0 1
## 0 37 20
## 1 78 138
##
## Accuracy : 0.641
## 95% CI : (0.581, 0.698)
## No Information Rate : 0.579
## P-Value [Acc > NIR] : 0.0209
##
## Kappa : 0.21
##
## Mcnemar's Test P-Value : 8.52e-09
##
## Sensitivity : 0.322
## Specificity : 0.873
## Pos Pred Value : 0.649
## Neg Pred Value : 0.639
## Prevalence : 0.421
## Detection Rate : 0.136
## Detection Prevalence : 0.209
## Balanced Accuracy : 0.598
##
## 'Positive' Class : 0
##
library(GGally)
ggpairs(data)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.1.3
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
# Stepwise (both-direction) selection on a linear probability model
# (class treated as numeric), with p-value based entry/removal (olsrr)
model_ols_step_both_p <- lm(class ~ ., data = train)
ols_step_both_p(model_ols_step_both_p)
##
## Stepwise Selection Summary
## --------------------------------------------------------------------------------------
## Added/ Adj.
## Step Variable Removed R-Square R-Square C(p) AIC RMSE
## --------------------------------------------------------------------------------------
## 1 hemo addition 0.619 0.618 244.0020 132.0378 0.3059
## 2 sg addition 0.742 0.740 80.2340 27.4910 0.2522
## 3 rbc addition 0.778 0.776 33.4430 -11.8467 0.2342
## 4 wc addition 0.788 0.785 22.6030 -21.8523 0.2295
## 5 sod addition 0.795 0.792 14.4480 -29.7357 0.2258
## 6 age addition 0.802 0.797 8.0490 -36.1889 0.2228
## --------------------------------------------------------------------------------------
pred_model_ols_step_both_p <- predict(model_ols_step_both_p, test)
rmse_model_ols_step_both_p <- sqrt(1/length(pred_model_ols_step_both_p)
                                   * sum((test$class - pred_model_ols_step_both_p)^2))
rmse_model_ols_step_both_p
## [1] 0.245
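The same RMSE formula is repeated for each selection strategy below; a small helper (rmse is a name introduced here, not part of the original code) keeps the computation in one place:
# Root mean squared error between actual and predicted values
rmse <- function(actual, predicted) sqrt(mean((actual - predicted)^2))
# e.g. rmse(test$class, pred_model_ols_step_both_p) reproduces the value above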
# Forward selection on the same linear probability model
model_ols_step_forward_p <- lm(class ~ ., data = train)
ols_step_forward_p(model_ols_step_forward_p)
##
## Selection Summary
## --------------------------------------------------------------------------
## Variable Adj.
## Step Entered R-Square R-Square C(p) AIC RMSE
## --------------------------------------------------------------------------
## 1 hemo 0.6189 0.6175 244.0017 132.0378 0.3059
## 2 sg 0.7421 0.7402 80.2336 27.4910 0.2522
## 3 rbc 0.7783 0.7758 33.4426 -11.8467 0.2342
## 4 wc 0.7878 0.7847 22.6026 -21.8523 0.2295
## 5 sod 0.7954 0.7916 14.4479 -29.7357 0.2258
## 6 age 0.8016 0.7972 8.0495 -36.1889 0.2228
## 7 bp 0.8035 0.7983 7.4879 -36.8203 0.2221
## --------------------------------------------------------------------------
pred_model_ols_step_forward_p <- predict(model_ols_step_forward_p, test)
rmse_model_ols_step_forward_p <- sqrt(1/length(pred_model_ols_step_forward_p)
                                      * sum((test$class - pred_model_ols_step_forward_p)^2))
rmse_model_ols_step_forward_p
## [1] 0.245
# Backward elimination on the same linear probability model
model_ols_step_backward_p <- lm(class ~ ., data = train)
ols_step_backward_p(model_ols_step_backward_p)
##
##
## Elimination Summary
## ------------------------------------------------------------------------
## Variable Adj.
## Step Removed R-Square R-Square C(p) AIC RMSE
## ------------------------------------------------------------------------
## 1 pot 0.8041 0.7982 8.6737 -35.6619 0.2222
## 2 ...1 0.8035 0.7983 7.4879 -36.8203 0.2221
## ------------------------------------------------------------------------
pred_model_ols_step_backward_p <- predict(model_ols_step_backward_p, test)
rmse_model_ols_step_backward_p <- sqrt(1/length(pred_model_ols_step_backward_p)
                                       * sum((test$class - pred_model_ols_step_backward_p)^2))
rmse_model_ols_step_backward_p
## [1] 0.245
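All three RMSE values are identical because the ols_step_*_p() calls only print selection summaries; each predict() above still scores the full lm fit. To evaluate a selected subset, refit on the chosen variables (a sketch using the set retained by backward elimination, which dropped pot and the index column ...1):
# Refit on the retained variables and score the test set
model_selected <- lm(class ~ rbc + bp + sg + hemo + wc + age + sod, data = train)
pred_selected <- predict(model_selected, test)
sqrt(mean((test$class - pred_selected)^2))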