This study investigates the relationship between chronic kidney disease (CKD) and blood pressure, and whether other potential risk factors, such as red blood cells and potassium, affect the association between chronic kidney disease and blood pressure.
Null hypothesis: there is no association between a one-unit increase in blood pressure and chronic kidney disease risk (significance level 0.05).
Alternative hypothesis: there is an association between a one-unit increase in blood pressure and chronic kidney disease risk (significance level 0.05).
rm(list = ls())
options(warn = 1) # print warnings as they occur
options(digits = 4)
# Point package installs at a CRAN mirror
r <- getOption("repos")
r["CRAN"] <- "http://cran.us.r-project.org"
options(repos = r)
packs <- c(
  "sf", "date", "plyr", "dplyr", "incidence", "ggplot2", "tidyr", "earlyR",
  "readr", "scales", "VIM", "caret", "RANN", "mice"
)
if (!require("pacman"))
  install.packages("pacman")
## Loading required package: pacman
## Warning: package 'pacman' was built under R version 4.1.3
pacman::p_load(char = packs)
data <- read_csv("data.csv")
## Rows: 400 Columns: 25
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (11): class, rbc, pc, pcc, ba, htn, dm, cad, appet, pe, ane
## dbl (14): age, bp, sg, al, su, bgr, bu, sc, sod, pot, hemo, pcv, wc, rc
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
data <- data %>% select(class, rbc, bp, sg, pot, hemo, wc, age, sod)
# Recode the outcome and red blood cell status as 0/1 indicators
data$class <- ifelse(data$class == "ckd", 1, 0)
data$rbc <- ifelse(data$rbc == "normal", 1, 0)
head(data)
names(data)
## [1] "class" "rbc" "bp" "sg" "pot" "hemo" "wc" "age" "sod"
# Find missing values (NAs) in the dataset
summary(data)
## class rbc bp sg pot
## Min. :0.000 Min. :0.00 Min. : 50 Min. :1.00 Min. : 2.50
## 1st Qu.:0.000 1st Qu.:1.00 1st Qu.: 70 1st Qu.:1.01 1st Qu.: 3.80
## Median :1.000 Median :1.00 Median : 80 Median :1.02 Median : 4.40
## Mean :0.625 Mean :0.81 Mean : 78 Mean :1.02 Mean : 4.63
## 3rd Qu.:1.000 3rd Qu.:1.00 3rd Qu.: 80 3rd Qu.:1.02 3rd Qu.: 4.90
## Max. :1.000 Max. :1.00 Max. :180 Max. :1.02 Max. :47.00
## NA's :152 NA's :47 NA's :88
## hemo wc age sod
## Min. : 3.1 Min. : 2200 Min. : 2.0 Min. : 4.5
## 1st Qu.:10.3 1st Qu.: 6500 1st Qu.:41.8 1st Qu.:135.0
## Median :12.6 Median : 8000 Median :54.0 Median :138.0
## Mean :12.5 Mean : 8406 Mean :51.2 Mean :137.5
## 3rd Qu.:15.0 3rd Qu.: 9800 3rd Qu.:64.0 3rd Qu.:142.0
## Max. :17.8 Max. :26400 Max. :90.0 Max. :163.0
## NA's :52 NA's :106 NA's :87
# How many NAs are in the data?
sum(is.na(as.matrix(data)))
## [1] 532
colSums(is.na(data))
## class rbc bp sg pot hemo wc age sod
## 0 152 0 47 88 52 106 0 87
# Proportion of rows without any missing values
nrow(data[complete.cases(data), ]) / nrow(data)
## [1] 0.46
# Base graphics: distribution of blood pressure
hist(data$bp, main = "Distribution of Blood pressure", ylab = "Frequency", xlab = "Blood pressure")
boxplot(data$bp, horizontal = TRUE)
# Display the missing-data pattern (VIM::aggr)
aggr(data, numbers = TRUE, prop = c(TRUE, FALSE))
# Impute missing values with k-nearest neighbours (caret).
# knnImpute centres and scales the data, so the imputed values are
# transformed back to the original units with the stored means and SDs.
preProcValues <- preProcess(as.data.frame(data), method = "knnImpute")
data_new <- predict(preProcValues, newdata = data)
procNames <- data.frame(col = names(preProcValues$mean),
                        mean = preProcValues$mean, sd = preProcValues$std)
for (i in procNames$col) {
  data[i] <- data_new[i] * preProcValues$std[i] + preProcValues$mean[i]
}
aggr(data, numbers = TRUE, prop = c(TRUE, FALSE))
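As a quick sanity check (a minimal sketch), confirm that the imputation left no missing values before writing the data back out:
# Every NA should now be imputed
stopifnot(!anyNA(data))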
# Save the imputed data. Note: write.csv() also writes row names, which
# come back as an unnamed index column (`...1`) when the file is re-read;
# row.names = FALSE would avoid this.
write.csv(x = data, file = "data_update.csv")
data <- read_csv("data_update.csv")
## New names:
## * `` -> ...1
## Rows: 400 Columns: 10
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (10): ...1, class, rbc, bp, sg, pot, hemo, wc, age, sod
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Remove outliers flagged by the boxplot (Tukey 1.5 * IQR) rule
outliers <- boxplot.stats(data$bp )$out
outliers
## [1] 50 50 50 50 50 100 100 100 100 100 100 100 100 100 100 100 100 100 100
## [20] 100 100 100 100 100 100 100 100 100 100 100 110 110 110 120 140 180 130 130
## [39] 160 150 150 150 160 140 140
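boxplot.stats() flags values outside Tukey's fences, Q1 - 1.5 * IQR and Q3 + 1.5 * IQR; as a minimal sketch, the same cutoffs can be computed by hand before the filtering below:
# Tukey's fences for blood pressure; values outside them are the outliers above
q1 <- quantile(data$bp, 0.25, names = FALSE)
q3 <- quantile(data$bp, 0.75, names = FALSE)
iqr <- q3 - q1
c(lower = q1 - 1.5 * iqr, upper = q3 + 1.5 * iqr)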
data <- data[-which(data$bp %in% outliers), ]
boxplot(data$bp, horizontal = TRUE)
library(dplyr) # already loaded above; provides %>%, filter() and select()
# Random 80/20 train/test split
set.seed(12345)
subsample <- runif(nrow(data))
data$type <- ifelse(subsample < 0.80, "train", "test")
train <- data %>% filter(type == "train") %>% select(-type)
test <- data %>% filter(type == "test") %>% select(-type)
dim(train)
## [1] 273 10
dim(test)
## [1] 82 10
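A plain runif() split does not guarantee the train and test sets share the same CKD proportion. A stratified alternative (a sketch using caret's createDataPartition(), left commented out so the split above is preserved):
# Stratified 80/20 split that preserves the class ratio
# idx <- createDataPartition(factor(data$class), p = 0.8, list = FALSE)
# train_strat <- data[idx, ]
# test_strat <- data[-idx, ]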
# Optional: compare blood pressure densities by CKD status
# a <- subset(data, data$class == 1)
# b <- subset(data, data$class == 0)
# # the red line indicates those with the disease
# plot(density(a$bp), col = "red", main = "Comparing blood pressure distributions", xlab = "Blood pressure")
# lines(density(b$bp))
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.1.3
CrossTable(train$class, train$bp, chisq = TRUE, missing.include = TRUE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 273
##
##
## | train$bp
## train$class | 60 | 70 | 80 | 90 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 0 | 37 | 29 | 49 | 0 | 115 |
## | 7.027 | 1.017 | 3.243 | 18.114 | |
## | 0.322 | 0.252 | 0.426 | 0.000 | 0.421 |
## | 0.649 | 0.349 | 0.544 | 0.000 | |
## | 0.136 | 0.106 | 0.179 | 0.000 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 1 | 20 | 54 | 41 | 43 | 158 |
## | 5.114 | 0.740 | 2.360 | 13.184 | |
## | 0.127 | 0.342 | 0.259 | 0.272 | 0.579 |
## | 0.351 | 0.651 | 0.456 | 1.000 | |
## | 0.073 | 0.198 | 0.150 | 0.158 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 57 | 83 | 90 | 43 | 273 |
## | 0.209 | 0.304 | 0.330 | 0.158 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 50.8 d.f. = 3 p = 5.4e-11
##
##
##
model <- glm(class ~ bp, family = "binomial", data = train)
# glm() uses the same model-formula syntax as linear regression.
# family specifies the distribution of the outcome variable; for binary
# data the binomial distribution is used, with the logit link as the
# default transformation.
# The output includes the regression coefficients with their z-statistics
# and p-values; the dispersion parameter relates to the variance of the
# response variable.
summary(model)
##
## Call:
## glm(formula = class ~ bp, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.484 -1.210 0.689 1.145 1.416
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.2710 1.0021 -4.26 2.0e-05 ***
## bp 0.0621 0.0135 4.59 4.5e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 371.66 on 272 degrees of freedom
## Residual deviance: 348.67 on 271 degrees of freedom
## AIC: 352.7
##
## Number of Fisher Scoring iterations: 4
### Generate the 95% CI
confint(model)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -6.28206 -2.34369
## bp 0.03614 0.08932
### Exponentiate the coefficients
exp(coef(model)) ### Odds ratio
## (Intercept) bp
## 0.01397 1.06407
exp(confint(model)) ### 95% CI (odds ratio)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.00187 0.09597
## bp 1.03680 1.09343
# Calculate the p-value of the overall chi-square (likelihood ratio) statistic:
# X2 = Null deviance - Residual deviance, with df = Null df - Residual df
1-pchisq(371.66-348.67, 272-271)
## [1] 1.628e-06
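The same likelihood ratio test can be computed from the fitted model object directly, avoiding hand-copied deviances (a minimal sketch):
# LRT of the fitted model against the intercept-only model
with(model, pchisq(null.deviance - deviance, df.null - df.residual,
                   lower.tail = FALSE))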
# (z = 4.59, P = 0.0000045; 95% CI for the odds ratio = 1.03680 to 1.09343)
# Thus the fitted logistic model for these data is:
# logit( P(chronic kidney disease) ) = -4.2710 + 0.0621 * blood pressure.
# The coefficient for blood pressure, 0.0621, is the log odds ratio for a
# one-unit increase in blood pressure: the log odds of CKD rise by 0.0621,
# so the odds are multiplied by e^0.0621 = 1.064. On average, each
# additional unit of blood pressure raises the odds of CKD by about 6.4%.
# Since the p-value is less than 0.05, we reject the null hypothesis at the
# 0.05 alpha level: a one-unit increase in blood pressure is associated
# with chronic kidney disease risk.
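To put the coefficient on the probability scale, the fitted model can be evaluated at two illustrative blood pressures (a minimal sketch):
# Predicted probability of CKD at bp = 70 versus bp = 80
predict(model, newdata = data.frame(bp = c(70, 80)), type = "response")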
# names(train)
# #use fitted model to predict values of vs
# train$ckd = predict(model, train, type="response")
#
# #plot logistic regression curve
# plot(ckd~bp, data=train, col="steelblue")
# lines(ckd~bp, train, lwd=5)
library(GGally)
## Warning: package 'GGally' was built under R version 4.1.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
#ggpairs(train)
# improved correlation matrix
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.3
## corrplot 0.92 loaded
corrplot(cor(train),
         method = "number",
         type = "lower" # show only the lower triangle
)
### Model prediction: classify as CKD when the fitted probability exceeds 0.5
trn_pred <- ifelse(predict(model, type = "response") > 0.5, 1, 0)
head(trn_pred)
## 1 2 3 4 5 6
## 0 0 0 0 0 0
# Making predictions on the train set.
trn_tab <- table(predicted = trn_pred, actual = train$class)
trn_tab
## actual
## predicted 0 1
## 0 37 20
## 1 78 138
# Making predictions on the test set.
tst_pred <- ifelse(predict(model, newdata = test, type = "response") > 0.5, 1, 0)
tst_tab <- table(predicted = tst_pred, actual = test$class)
tst_tab
## actual
## predicted 0 1
## 0 9 6
## 1 26 41
Perhaps unsurprisingly, the most common metrics for evaluating logistic regression models are error rate and accuracy (which is simply the complement of the error rate). These metrics can be calculated directly from the confusion matrix.
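For example, accuracy can be read straight off the train confusion matrix trn_tab computed above, since the diagonal counts the correct predictions (a minimal sketch):
# Accuracy = correct predictions / all predictions; error rate = 1 - accuracy
sum(diag(trn_tab)) / sum(trn_tab)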
calc_class_err <- function(actual, predicted) {
  mean(actual != predicted)
}
calc_class_err(actual = train$class, predicted = trn_pred)
## [1] 0.359
# Test error rate should be close to train error rate if model is fitted properly.
calc_class_err(actual = test$class, predicted = tst_pred)
## [1] 0.3902
library("pROC")
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:gmodels':
##
## ci
## The following object is masked from 'package:colorspace':
##
## coords
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
test_prob <- predict(model, newdata = test, type = "response")
test_roc <- roc(test$class ~ test_prob, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# AUC is only slightly above 0.5, so blood pressure alone discriminates poorly
as.numeric(test_roc$auc)
## [1] 0.5705
\[\text{Sens} = \text{True Positive Rate} = \frac{\text{TP}}{\text{P}} = \frac{\text{TP}}{\text{TP + FN}}\] \[\text{Spec} = \text{True Negative Rate} = \frac{\text{TN}}{\text{N}} = \frac{\text{TN}}{\text{TN + FP}}\]
\[\text{Prev} = \frac{\text{P}}{\text{Total Obs}}= \frac{\text{TP + FN}}{\text{Total Obs}}\]
Calculations of metrics such as sensitivity, specificity, and prevalence are derived from the confusion matrix. The importance of these (and other) metrics depends on the nature of the data (e.g. lower values may be acceptable if the data are deemed difficult to predict), as well as on the tolerance for each type of misclassification. For example, in a screening setting like this one we may want to bias our predictions toward flagging CKD, accepting more false positives in exchange for fewer missed cases. We must carefully decide whether to prioritize sensitivity or specificity, as in the worked example below.
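As a minimal sketch, sensitivity and specificity can be computed by hand from tst_tab, treating CKD (class = 1) as the positive class:
# Rows of tst_tab are predictions, columns are actuals; CKD (1) is positive
TP <- tst_tab["1", "1"]; FN <- tst_tab["0", "1"]
TN <- tst_tab["0", "0"]; FP <- tst_tab["1", "0"]
TP / (TP + FN) # sensitivity (true positive rate)
TN / (TN + FP) # specificity (true negative rate)
# Note: confusionMatrix() below treats "0" as the positive class by default;
# pass positive = "1" to report the CKD-oriented metrics instead.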
library("caret")
confusionMatrix(trn_tab)
## Confusion Matrix and Statistics
##
## actual
## predicted 0 1
## 0 37 20
## 1 78 138
##
## Accuracy : 0.641
## 95% CI : (0.581, 0.698)
## No Information Rate : 0.579
## P-Value [Acc > NIR] : 0.0209
##
## Kappa : 0.21
##
## Mcnemar's Test P-Value : 8.52e-09
##
## Sensitivity : 0.322
## Specificity : 0.873
## Pos Pred Value : 0.649
## Neg Pred Value : 0.639
## Prevalence : 0.421
## Detection Rate : 0.136
## Detection Prevalence : 0.209
## Balanced Accuracy : 0.598
##
## 'Positive' Class : 0
##
library(GGally)
ggpairs(data)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.1.3
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
# Stepwise (both-direction) selection on a linear probability model
# (class treated as numeric), with p-value based entry/removal (olsrr)
model_ols_step_both_p <- lm(class ~ ., data = train)
ols_step_both_p(model_ols_step_both_p)
##
## Stepwise Selection Summary
## --------------------------------------------------------------------------------------
## Added/ Adj.
## Step Variable Removed R-Square R-Square C(p) AIC RMSE
## --------------------------------------------------------------------------------------
## 1 hemo addition 0.619 0.618 244.0020 132.0378 0.3059
## 2 sg addition 0.742 0.740 80.2340 27.4910 0.2522
## 3 rbc addition 0.778 0.776 33.4430 -11.8467 0.2342
## 4 wc addition 0.788 0.785 22.6030 -21.8523 0.2295
## 5 sod addition 0.795 0.792 14.4480 -29.7357 0.2258
## 6 age addition 0.802 0.797 8.0490 -36.1889 0.2228
## --------------------------------------------------------------------------------------
pred_model_ols_step_both_p <- predict(model_ols_step_both_p, test)
rmse_model_ols_step_both_p <- sqrt(1/length(pred_model_ols_step_both_p)
                                   * sum((test$class - pred_model_ols_step_both_p)^2))
rmse_model_ols_step_both_p
## [1] 0.245
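The same RMSE formula is repeated for each selection strategy below; a small helper (rmse is a name introduced here, not part of the original code) keeps the computation in one place:
# Root mean squared error between actual and predicted values
rmse <- function(actual, predicted) sqrt(mean((actual - predicted)^2))
# e.g. rmse(test$class, pred_model_ols_step_both_p) reproduces the value above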
# Forward selection on the same linear probability model
model_ols_step_forward_p <- lm(class ~ ., data = train)
ols_step_forward_p(model_ols_step_forward_p)
##
## Selection Summary
## --------------------------------------------------------------------------
## Variable Adj.
## Step Entered R-Square R-Square C(p) AIC RMSE
## --------------------------------------------------------------------------
## 1 hemo 0.6189 0.6175 244.0017 132.0378 0.3059
## 2 sg 0.7421 0.7402 80.2336 27.4910 0.2522
## 3 rbc 0.7783 0.7758 33.4426 -11.8467 0.2342
## 4 wc 0.7878 0.7847 22.6026 -21.8523 0.2295
## 5 sod 0.7954 0.7916 14.4479 -29.7357 0.2258
## 6 age 0.8016 0.7972 8.0495 -36.1889 0.2228
## 7 bp 0.8035 0.7983 7.4879 -36.8203 0.2221
## --------------------------------------------------------------------------
pred_model_ols_step_forward_p <- predict(model_ols_step_forward_p, test)
rmse_model_ols_step_forward_p <- sqrt(1/length(pred_model_ols_step_forward_p)
                                      * sum((test$class - pred_model_ols_step_forward_p)^2))
rmse_model_ols_step_forward_p
## [1] 0.245
# Backward elimination on the same linear probability model
model_ols_step_backward_p <- lm(class ~ ., data = train)
ols_step_backward_p(model_ols_step_backward_p)
##
##
## Elimination Summary
## ------------------------------------------------------------------------
## Variable Adj.
## Step Removed R-Square R-Square C(p) AIC RMSE
## ------------------------------------------------------------------------
## 1 pot 0.8041 0.7982 8.6737 -35.6619 0.2222
## 2 ...1 0.8035 0.7983 7.4879 -36.8203 0.2221
## ------------------------------------------------------------------------
pred_model_ols_step_backward_p <- predict(model_ols_step_backward_p, test)
rmse_model_ols_step_backward_p <- sqrt(1/length(pred_model_ols_step_backward_p)
                                       * sum((test$class - pred_model_ols_step_backward_p)^2))
rmse_model_ols_step_backward_p
## [1] 0.245
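All three RMSE values are identical because the ols_step_*_p() calls only print selection summaries; each predict() above still scores the full lm fit. To evaluate a selected subset, refit on the chosen variables (a sketch using the set retained by backward elimination, which dropped pot and the index column ...1):
# Refit on the retained variables and score the test set
model_selected <- lm(class ~ rbc + bp + sg + hemo + wc + age + sod, data = train)
pred_selected <- predict(model_selected, test)
sqrt(mean((test$class - pred_selected)^2))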