# Import data set
# library() stops with an error if data.table is missing; require() would only
# return FALSE and let the script fail later at fread().
library(data.table) # fread() (from data.table) is used as the data set is big
# NOTE(review): absolute, machine-specific path - adjust before running on
# another machine.
diabetes <- fread("C:/Users/welcome/Desktop/diabeteshypertension.csv")
str(diabetes)
## Classes 'data.table' and 'data.frame': 5651 obs. of 15 variables:
## $ V1 : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : int 31 29 29 29 26 23 43 31 34 27 ...
## $ gender : chr "female" "female" "female" "female" ...
## $ bmi : num 28.5 19.8 24.9 21.7 23.4 ...
## $ waist_size : num 32 NA NA NA 30 28 30 34 32 NA ...
## $ alcohol : int 1 0 1 0 0 0 1 0 0 0 ...
## $ smoke : int 0 0 0 0 0 0 0 0 0 0 ...
## $ exercise : int 0 0 1 1 0 0 0 0 1 1 ...
## $ hypertension : chr NA "Normal" "Normal" "Normal" ...
## $ parents_hypertensive: int 1 NA NA NA NA NA 1 NA NA NA ...
## $ parents_diabetic : int 1 NA 1 NA 2 NA 2 2 1 NA ...
## $ parents_heartcondn : int 1 NA 1 NA 2 NA 2 2 1 NA ...
## $ stress_factor : int 5 NA NA NA 4 NA 4 NA NA NA ...
## $ diet_factor : int NA NA NA NA 6 NA 7 NA NA NA ...
## $ diabetes : chr "Normal" "Normal" "Normal" "Normal" ...
## - attr(*, ".internal.selfref")=<externalptr>
### DATA CLEANING AND PREPARATION ###
# Recode the text labels to numeric codes. Columns are addressed by name
# rather than by position so the recoding survives a change in column order,
# and the codes are written as strings because the target columns are still
# character here (they are turned into factors further down the script).

# gender: "female" -> 1, "male" -> 0
diabetes[gender == "female", gender := "1"]
diabetes[gender == "male", gender := "0"]

# diabetes: "Normal" -> 0, "Pre-Diabetic" -> 1, "Diabetic" -> 2
# (inside `[ ]` the name `diabetes` resolves to the column, not the table)
diabetes[diabetes == "Normal", diabetes := "0"]
diabetes[diabetes == "Pre-Diabetic", diabetes := "1"]
diabetes[diabetes == "Diabetic", diabetes := "2"]

# hypertension: "Normal" -> 0, "Pre-Hypertension" -> 1, "Hypertension" -> 2
# Rows where hypertension is NA match none of the conditions and stay NA.
diabetes[hypertension == "Normal", hypertension := "0"]
diabetes[hypertension == "Pre-Hypertension", hypertension := "1"]
diabetes[hypertension == "Hypertension", hypertension := "2"]

# Remove the row-index column and the family-history / lifestyle variables.
# These are the former columns 1 and 10:14; note that parents_hypertensive is
# dropped as well.
drop_cols <- c(
  "V1",
  "parents_hypertensive",
  "parents_diabetic",
  "parents_heartcondn",
  "stress_factor",
  "diet_factor"
)
diabetes <- diabetes[, setdiff(names(diabetes), drop_cols), with = FALSE]

# hypertension becomes a factor; NA values remain NA.
diabetes[, hypertension := as.factor(hypertension)]
# Visualize missing values
# library() fails loudly if VIM is not installed; require() would only return
# FALSE and the aggr() call below would then fail with a less helpful error.
library(VIM)
# Missing-value overview of the cleaned table; prop = FALSE and numbers = TRUE
# presumably request absolute counts with numeric labels - see ?VIM::aggr.
aggr(diabetes, prop = FALSE, numbers = TRUE)
#### Decision tree and random forest models #####
# For Diabetes
#
# The tree and the forest use exactly the same prepared table, so it is built
# once (diabetes_dt) and then copied (diabetes_rf) instead of repeating the
# whole preparation twice.

# copy() is required: `:=` and set() below modify by reference and must not
# touch the cleaned `diabetes` table that the other sections still read.
diabetes_dt <- copy(diabetes) # Data set for Decision tree

# hypertension is a factor here; as.numeric() returns its integer level codes
# (1, 2, 3, ...), with NA kept as NA.
diabetes_dt[, hypertension := as.numeric(hypertension)]

# Replace NA with -1 in every column, so missing values become their own
# value / factor level.
# NOTE(review): this includes the response column `diabetes`; if the response
# contains NA the model gets an extra "-1" class - confirm this is intended.
for (na_col in names(diabetes_dt)) {
  na_rows <- which(is.na(diabetes_dt[[na_col]]))
  if (length(na_rows) > 0) {
    # Match the fill value to the column type to avoid a coercion warning.
    na_fill <- if (is.character(diabetes_dt[[na_col]])) "-1" else -1
    set(diabetes_dt, i = na_rows, j = na_col, value = na_fill)
  }
}

# Map the level codes 1/2/3 back to the labels 0/1/2; the -1 placeholder for
# missing values is left untouched.
diabetes_dt[hypertension %in% 1:3, hypertension := hypertension - 1]

# Variables for class conversion (former positions 2 and 5:9)
cols_diab_dt <- c(
  "gender", "alcohol", "smoke", "exercise", "hypertension", "diabetes"
)
# NOTE(review): factor() creates unordered factors; if the models are meant to
# treat the response as ordinal it probably needs ordered() - confirm.
diabetes_dt[, (cols_diab_dt) := lapply(.SD, factor), .SDcols = cols_diab_dt]

# Dataset for Random Forest: identical preparation, independent copy
diabetes_rf <- copy(diabetes_dt)
cols_diab_rf <- cols_diab_dt
#### Decision tree and random forest models #####
# For Hypertension
#
# The tree and the forest use exactly the same prepared table, so it is built
# once (hypertension_dt) and then copied (hypertension_rf). The duplicated
# version assigned the forest's factor columns through `cols_hyp_dt` while
# reading them through `cols_hyp_rf`; a single code path removes that mix-up.

# copy() is required: `:=` and set() below modify by reference and must not
# touch the cleaned `diabetes` table that the other sections still read.
hypertension_dt <- copy(diabetes) # Data for Decision tree

# hypertension is a factor here; as.numeric() returns its integer level codes
# (1, 2, 3, ...), with NA kept as NA.
hypertension_dt[, hypertension := as.numeric(hypertension)]

# Replace NA with -1 in the seven predictor columns only (former positions
# 1:7); the hypertension response and the diabetes column are left alone.
predictor_cols <- c(
  "age", "gender", "bmi", "waist_size", "alcohol", "smoke", "exercise"
)
for (na_col in predictor_cols) {
  na_rows <- which(is.na(hypertension_dt[[na_col]]))
  if (length(na_rows) > 0) {
    # Match the fill value to the column type to avoid a coercion warning.
    na_fill <- if (is.character(hypertension_dt[[na_col]])) "-1" else -1
    set(hypertension_dt, i = na_rows, j = na_col, value = na_fill)
  }
}

# Remove rows with a missing hypertension value (the response)
hypertension_dt <- hypertension_dt[!is.na(hypertension)]

# Map the level codes 1/2/3 back to the labels 0/1/2.
hypertension_dt[hypertension %in% 1:3, hypertension := hypertension - 1]

# Variables for class conversion (former positions 2 and 5:9)
cols_hyp_dt <- c(
  "gender", "alcohol", "smoke", "exercise", "hypertension", "diabetes"
)
# NOTE(review): factor() creates unordered factors; if the models are meant to
# treat the response as ordinal it probably needs ordered() - confirm.
hypertension_dt[, (cols_hyp_dt) := lapply(.SD, factor), .SDcols = cols_hyp_dt]

# Data for Random Forest: identical preparation, independent copy
hypertension_rf <- copy(hypertension_dt)
cols_hyp_rf <- cols_hyp_dt
#### Exploratory data analysis (EDA) ####
# Complete cases only: every row containing at least one NA is dropped.
# copy() keeps the by-reference update below away from `diabetes` itself.
diabetes_eda <- copy(na.omit(diabetes))
# Columns at positions 2 and 5:9 are treated as categorical.
cols_eda <- names(diabetes_eda)[c(2, 5:9)]
diabetes_eda[, (cols_eda) := lapply(.SD, factor), .SDcols = cols_eda]
# Summary statistics of the complete-case table
summary(diabetes_eda)
## age gender bmi waist_size alcohol
## Min. :18.00 0:2271 Min. :14.47 Min. :26.00 0:1500
## 1st Qu.:27.00 1: 599 1st Qu.:22.60 1st Qu.:32.00 1:1370
## Median :30.00 Median :24.71 Median :33.00
## Mean :30.61 Mean :25.01 Mean :33.09
## 3rd Qu.:34.00 3rd Qu.:27.21 3rd Qu.:35.00
## Max. :56.00 Max. :49.14 Max. :42.00
## smoke exercise hypertension diabetes
## 0:2331 0:2361 0:1816 0:2438
## 1: 539 1: 509 1: 673 1: 143
## 2: 381 2: 289
##
##
##
# Model 1: Conditional inference ordinal response tree (CIORT) - diabetes
#### Model Building ####
# library() fails immediately if party is missing; require() would only
# return FALSE and ctree() would then fail further down.
library(party)
# Conditional inference ordinal response tree (CIORT)
N_diabetes_dt <- nrow(diabetes_dt) # Number of rows
# 70 % of the rows are used for training. The size is derived from the data
# instead of being hard-coded; for the 5651-row table it is round(3955.7),
# i.e. the 3956 used before.
n_train_diabetes_dt <- round(N_diabetes_dt * 0.7)
set.seed(1234)
# Random sampling of training row numbers, without replacement
train_diabetes_dt <- sample(
  seq_len(N_diabetes_dt), n_train_diabetes_dt, replace = FALSE
)
train_diab_data_dt <- diabetes_dt[train_diabetes_dt, ] # Train dataset
test_diab_data_dt <- diabetes_dt[-train_diabetes_dt, ] # Test dataset
# Train CIORT
fit_dib_dt <- ctree(
  diabetes ~ .,
  data = train_diab_data_dt,
  controls = ctree_control(mincriterion = 0.95)
)

# Training accuracy and misclassification error (MCE).
# Confusion table: rows = predicted class, columns = observed class.
tab_diab_dt_train <- table(
  predict(fit_dib_dt, newdata = train_diab_data_dt),
  train_diab_data_dt$diabetes
)
# Accuracy = correctly classified / ALL observations. Dividing by the first
# row only (sum(tab[1, ])) counts just the cases predicted as the first class
# and inflates the accuracy whenever any other class is predicted.
accuracy1 <- sum(diag(tab_diab_dt_train)) / sum(tab_diab_dt_train)
missclass1 <- 1 - accuracy1 # MCE

# Test accuracy and misclassification error (MCE)
tab_diab_dt_test <- table(
  predict(fit_dib_dt, newdata = test_diab_data_dt),
  test_diab_data_dt$diabetes
)
accuracy2 <- sum(diag(tab_diab_dt_test)) / sum(tab_diab_dt_test)
missclass2 <- 1 - accuracy2 # MCE
# Model 2: Conditional inference ordinal random forest (CIORF) - diabetes
# Conditional inference ordinal random forest (CIORF)
N_diabetes_rf <- nrow(diabetes_rf) # Number of rows
# 70 % of the rows are used for training. The size is derived from the data
# instead of being hard-coded; for the 5651-row table it is round(3955.7),
# i.e. the 3956 used before.
n_train_diabetes_rf <- round(N_diabetes_rf * 0.7)
set.seed(1234)
# Random sampling of training row numbers, without replacement
train_diabetes_rf <- sample(
  seq_len(N_diabetes_rf), n_train_diabetes_rf, replace = FALSE
)
train_diab_data_rf <- diabetes_rf[train_diabetes_rf, ] # Train dataset
test_diab_data_rf <- diabetes_rf[-train_diabetes_rf, ] # Test dataset
# Train CIORF
fit_dib_rf <- cforest(
  diabetes ~ .,
  data = train_diab_data_rf,
  controls = cforest_unbiased(mtry = 3)
)

# Training accuracy and misclassification error (MCE).
# Out-of-bag predictions are requested without `newdata`, as in the ?cforest
# example: OOB is defined in terms of the rows the forest was grown on, and
# the predictions come back in training-row order.
# NOTE(review): confirm how the installed party version treats OOB = TRUE
# combined with newdata, which is what this call used before.
tab_diab_rf_train <- table(
  predict(fit_dib_rf, OOB = TRUE, type = "response"),
  train_diab_data_rf$diabetes
)
# Accuracy = correctly classified / ALL observations. Dividing by the first
# row only (sum(tab[1, ])) counts just the cases predicted as the first class
# and inflates the accuracy whenever any other class is predicted.
accuracy1_rf <- sum(diag(tab_diab_rf_train)) / sum(tab_diab_rf_train)
missclass1_rf <- 1 - accuracy1_rf # MCE

# Test accuracy and misclassification error (MCE).
# Held-out rows were never in any bootstrap sample, so OOB is not requested.
tab_diab_rf_test <- table(
  predict(fit_dib_rf, newdata = test_diab_data_rf, type = "response"),
  test_diab_data_rf$diabetes
)
accuracy2_rf <- sum(diag(tab_diab_rf_test)) / sum(tab_diab_rf_test)
missclass2_rf <- 1 - accuracy2_rf # MCE
# Model 1: Conditional inference ordinal response tree (CIORT) - hypertension
# Model 1
# Conditional inference ordinal response tree (CIORT)
N_hypertension_dt <- nrow(hypertension_dt) # Number of rows
# 70 % of the rows are used for training. The size is derived from the data
# instead of being hard-coded; for the 5282-row table it is round(3697.4),
# i.e. the 3697 used before.
n_train_hypertension_dt <- round(N_hypertension_dt * 0.7)
set.seed(1234)
# Random sampling of training row numbers, without replacement
train_hypertension_dt <- sample(
  seq_len(N_hypertension_dt), n_train_hypertension_dt, replace = FALSE
)
train_hyp_data_dt <- hypertension_dt[train_hypertension_dt, ] # Train dataset
test_hyp_data_dt <- hypertension_dt[-train_hypertension_dt, ] # Test dataset
# Train CIORT
fit_hyp_dt <- ctree(
  hypertension ~ .,
  data = train_hyp_data_dt,
  controls = ctree_control(mincriterion = 0.95)
)

# Training accuracy and misclassification error (MCE).
# Confusion table: rows = predicted class, columns = observed class.
tab_hyp_dt_train <- table(
  predict(fit_hyp_dt, newdata = train_hyp_data_dt),
  train_hyp_data_dt$hypertension
)
# Accuracy = correctly classified / ALL observations. Dividing by the first
# row only (sum(tab[1, ])) counts just the cases predicted as the first class
# and inflates the accuracy whenever any other class is predicted.
accuracy1_dt_hyp <- sum(diag(tab_hyp_dt_train)) / sum(tab_hyp_dt_train)
missclass1_dt_hyp <- 1 - accuracy1_dt_hyp # MCE

# Test accuracy and misclassification error (MCE)
tab_hyp_dt_test <- table(
  predict(fit_hyp_dt, newdata = test_hyp_data_dt),
  test_hyp_data_dt$hypertension
)
accuracy2_dt_hyp <- sum(diag(tab_hyp_dt_test)) / sum(tab_hyp_dt_test)
missclass2_dt_hyp <- 1 - accuracy2_dt_hyp # MCE
# Model 2: Conditional inference ordinal random forest (CIORF) - hypertension
# Model 2
# Conditional inference ordinal random forest (CIORF)
N_hypertension_rf <- nrow(hypertension_rf) # Number of rows
# 70 % of the rows are used for training. The size is derived from the data
# instead of being hard-coded; for the 5282-row table it is round(3697.4),
# i.e. the 3697 used before.
n_train_hypertension_rf <- round(N_hypertension_rf * 0.7)
set.seed(1234)
# Random sampling of training row numbers, without replacement
train_hypertension_rf <- sample(
  seq_len(N_hypertension_rf), n_train_hypertension_rf, replace = FALSE
)
train_hyp_data_rf <- hypertension_rf[train_hypertension_rf, ] # Train dataset
test_hyp_data_rf <- hypertension_rf[-train_hypertension_rf, ] # Test dataset
# Train CIORF
fit_hyp_rf <- cforest(
  hypertension ~ .,
  data = train_hyp_data_rf,
  controls = cforest_unbiased(mtry = 3)
)

# Training accuracy and misclassification error (MCE).
# Out-of-bag predictions are requested without `newdata`, as in the ?cforest
# example: OOB is defined in terms of the rows the forest was grown on, and
# the predictions come back in training-row order.
# NOTE(review): confirm how the installed party version treats OOB = TRUE
# combined with newdata, which is what this call used before.
tab_hyp_rf_train <- table(
  predict(fit_hyp_rf, OOB = TRUE, type = "response"),
  train_hyp_data_rf$hypertension
)
# Accuracy = correctly classified / ALL observations. Dividing by the first
# row only (sum(tab[1, ])) counts just the cases predicted as the first class
# and inflates the accuracy whenever any other class is predicted.
accuracy1_hyp_rf <- sum(diag(tab_hyp_rf_train)) / sum(tab_hyp_rf_train)
missclass1_hyp_rf <- 1 - accuracy1_hyp_rf # MCE

# Test accuracy and misclassification error (MCE).
# Held-out rows were never in any bootstrap sample, so OOB is not requested.
tab_hyp_rf_test <- table(
  predict(fit_hyp_rf, newdata = test_hyp_data_rf, type = "response"),
  test_hyp_data_rf$hypertension
)
accuracy2_hyp_rf <- sum(diag(tab_hyp_rf_test)) / sum(tab_hyp_rf_test)
missclass2_hyp_rf <- 1 - accuracy2_hyp_rf # MCE
# Collect the diabetes metrics of both classifiers, one row per model
# (tree first, forest second), and render them as a table.
diab_train_acc <- c(accuracy1, accuracy1_rf)
diab_train_mce <- c(missclass1, missclass1_rf)
diab_test_acc <- c(accuracy2, accuracy2_rf)
diab_test_mce <- c(missclass2, missclass2_rf)
results_diabetes <- data.frame(
  Model = c("CIORT", "CIORF"),
  Train_Accuracy = diab_train_acc,
  Train_MCE = diab_train_mce,
  Test_Accuracy = diab_test_acc,
  Test_MCE = diab_test_mce
)
knitr::kable(
  head(results_diabetes),
  booktabs = TRUE,
  caption = "Classfiers for prediction of diabetes"
)
Model | Train_Accuracy | Train_MCE | Test_Accuracy | Test_MCE |
---|---|---|---|---|
CIORT | 0.8850370 | 0.1149630 | 0.8964902 | 0.1035098 |
CIORF | 0.8827569 | 0.1172431 | 0.8943620 | 0.1056380 |
# Collect the hypertension metrics of both classifiers, one row per model
# (tree first, forest second), and render them as a table.
hyp_train_acc <- c(accuracy1_dt_hyp, accuracy1_hyp_rf)
hyp_train_mce <- c(missclass1_dt_hyp, missclass1_hyp_rf)
hyp_test_acc <- c(accuracy2_dt_hyp, accuracy2_hyp_rf)
hyp_test_mce <- c(missclass2_dt_hyp, missclass2_hyp_rf)
results_hypertension <- data.frame(
  Model = c("CIORT", "CIORF"),
  Train_Accuracy = hyp_train_acc,
  Train_MCE = hyp_train_mce,
  Test_Accuracy = hyp_test_acc,
  Test_MCE = hyp_test_mce
)
knitr::kable(
  head(results_hypertension),
  booktabs = TRUE,
  caption = "Classfiers for prediction of hypertension"
)
Model | Train_Accuracy | Train_MCE | Test_Accuracy | Test_MCE |
---|---|---|---|---|
CIORT | 0.7782821 | 0.2217179 | 0.7757085 | 0.2242915 |
CIORF | 0.8009003 | 0.1990997 | 0.7789389 | 0.2210611 |