Data Cleaning and Preparation

# Import data set

# fread() comes from data.table and is used because the data set is large.
# library() stops with an error if the package is missing, whereas require()
# would only return FALSE and let the script fail later at fread().
library(data.table)

# Read the raw CSV into a data.table (path is specific to the author's machine)
diabetes <- fread("C:/Users/welcome/Desktop/diabeteshypertension.csv")

# Inspect column names and types
str(diabetes)

## Classes 'data.table' and 'data.frame':   5651 obs. of  15 variables:
##  $ V1                  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age                 : int  31 29 29 29 26 23 43 31 34 27 ...
##  $ gender              : chr  "female" "female" "female" "female" ...
##  $ bmi                 : num  28.5 19.8 24.9 21.7 23.4 ...
##  $ waist_size          : num  32 NA NA NA 30 28 30 34 32 NA ...
##  $ alcohol             : int  1 0 1 0 0 0 1 0 0 0 ...
##  $ smoke               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ exercise            : int  0 0 1 1 0 0 0 0 1 1 ...
##  $ hypertension        : chr  NA "Normal" "Normal" "Normal" ...
##  $ parents_hypertensive: int  1 NA NA NA NA NA 1 NA NA NA ...
##  $ parents_diabetic    : int  1 NA 1 NA 2 NA 2 2 1 NA ...
##  $ parents_heartcondn  : int  1 NA 1 NA 2 NA 2 2 1 NA ...
##  $ stress_factor       : int  5 NA NA NA 4 NA 4 NA NA NA ...
##  $ diet_factor         : int  NA NA NA NA 6 NA 7 NA NA NA ...
##  $ diabetes            : chr  "Normal" "Normal" "Normal" "Normal" ...
##  - attr(*, ".internal.selfref")=<externalptr>

### DATA CLEANING AND PREPARATION ###

# All recodes below address columns by name and assign character codes
# directly, so they depend neither on the column order of the CSV nor on an
# implicit numeric-to-character coercion. `:=` updates the data.table in place.


# Change class label "female" to "1" and "male" to "0" for gender

diabetes[gender == "female", gender := "1"]

diabetes[gender == "male", gender := "0"]


# Change class labels for diabetes:
# "Normal" to "0"
# "Pre-Diabetic" to "1"
# "Diabetic" to "2"

diabetes[diabetes == "Normal", diabetes := "0"]

diabetes[diabetes == "Diabetic", diabetes := "2"]

diabetes[diabetes == "Pre-Diabetic", diabetes := "1"]


# Change class labels for hypertension:
# "Normal" to "0"
# "Pre-Hypertension" to "1"
# "Hypertension" to "2"

diabetes[hypertension == "Normal", hypertension := "0"]

diabetes[hypertension == "Pre-Hypertension", hypertension := "1"]

diabetes[hypertension == "Hypertension", hypertension := "2"]

# Remove the following variables from the data set (columns 1 and 10 to 14
# of the raw file): the row index V1, the three parental-history columns and
# the stress/diet factors.

dropped_cols <- c("V1", "parents_hypertensive", "parents_diabetic",
                  "parents_heartcondn", "stress_factor", "diet_factor")

diabetes[, (dropped_cols) := NULL]

# Store hypertension as a factor (levels "0", "1", "2"; missing values stay NA)

diabetes[, hypertension := as.factor(hypertension)]

# Visualize missing values

# library() fails loudly if VIM is not installed; require() would only return
# FALSE and the aggr() call below would then fail with a less helpful error.
library(VIM)

# Missing-value overview per variable and per combination of variables,
# reported as absolute counts (prop = FALSE) with the counts printed.
aggr(diabetes, prop = FALSE, numbers = TRUE)

#### Decision tree and random forest models #####
# For diabetes

# Builds the modelling data set for the diabetes decision tree: missing
# values are replaced by -1 and the categorical columns are turned into factors.

diabetes_dt <- diabetes   # Data set for the decision tree

# hypertension is a factor at this point; as.numeric() returns its integer
# level codes (compared against 1, 2 and 3 below), and NA stays NA.
diabetes_dt$hypertension <- as.numeric(diabetes_dt$hypertension)

diabetes_dt[, 1:9][is.na(diabetes_dt[, 1:9])] <- -1 # Replace NA with -1 in columns 1 to 9

# Map the level codes 1/2/3 back to the labels 0/1/2 (column 8 is
# hypertension). The statements must run in this ascending order: each step
# writes a value that an earlier step has already handled.
diabetes_dt[hypertension == 1, 8] <- 0 

diabetes_dt[hypertension == 2, 8] <- 1

diabetes_dt[hypertension == 3, 8] <- 2

cols_diab_dt <- names(diabetes_dt[,c(2,5:9)]) # Names of columns 2 and 5 to 9, the variables to convert

diabetes_dt[,cols_diab_dt] <- lapply(diabetes_dt[,cols_diab_dt, with = FALSE], factor)# Convert those columns to factor



# Builds the modelling data set for the diabetes random forest. The steps
# mirror the decision-tree data set: missing values become -1 and the
# categorical columns are turned into factors.
diabetes_rf <- diabetes  # Data set for the random forest

# hypertension is a factor at this point; as.numeric() returns its integer
# level codes (compared against 1, 2 and 3 below), and NA stays NA.
diabetes_rf$hypertension <- as.numeric(diabetes_rf$hypertension)

diabetes_rf[, 1:9][is.na(diabetes_rf[, 1:9])] <- -1 # Replace NA with -1 in columns 1 to 9

# Map the level codes 1/2/3 back to the labels 0/1/2 (column 8 is
# hypertension). The statements must run in this ascending order: each step
# writes a value that an earlier step has already handled.
diabetes_rf[hypertension == 1, 8] <- 0 

diabetes_rf[hypertension == 2, 8] <- 1

diabetes_rf[hypertension == 3, 8] <- 2

cols_diab_rf <- names(diabetes_rf[,c(2,5:9)]) # Names of columns 2 and 5 to 9, the variables to convert

diabetes_rf[,cols_diab_rf] <- lapply(diabetes_rf[,cols_diab_rf, with = FALSE], factor)# Convert those columns to factor

#### Decision tree and random forest models #####
# For hypertension

# Builds the modelling data set for the hypertension decision tree. Unlike
# the diabetes data sets, only columns 1 to 7 get the -1 placeholder, and
# rows without a hypertension label are dropped rather than imputed.

hypertension_dt <- diabetes  # Data for the decision tree

# hypertension is a factor at this point; as.numeric() returns its integer
# level codes (compared against 1, 2 and 3 below), and NA stays NA.
hypertension_dt$hypertension <- as.numeric(hypertension_dt$hypertension)

hypertension_dt[, 1:7][is.na(hypertension_dt[, 1:7])] <- -1 # Replace NA with -1 in columns 1 to 7 only

hypertension_dt <- hypertension_dt[!is.na(hypertension_dt$hypertension),] # Drop rows whose hypertension value is missing

# Map the level codes 1/2/3 back to the labels 0/1/2 (column 8 is
# hypertension). The statements must run in this ascending order: each step
# writes a value that an earlier step has already handled.
hypertension_dt[hypertension == 1, 8] <- 0 

hypertension_dt[hypertension == 2, 8] <- 1

hypertension_dt[hypertension == 3, 8] <- 2


cols_hyp_dt <- names(hypertension_dt[,c(2,5:9)]) # Names of columns 2 and 5 to 9, the variables to convert

hypertension_dt[,cols_hyp_dt] <- lapply(hypertension_dt[,cols_hyp_dt, with = FALSE], factor) # Convert those columns to factor


# Builds the modelling data set for the hypertension random forest: columns
# 1 to 7 get -1 in place of NA, rows without a hypertension label are dropped,
# and the categorical columns are turned into factors.
hypertension_rf <- diabetes    # Data for the random forest

# hypertension is a factor at this point; as.numeric() returns its integer
# level codes (compared against 1, 2 and 3 below), and NA stays NA.
hypertension_rf$hypertension <- as.numeric(hypertension_rf$hypertension)

hypertension_rf[, 1:7][is.na(hypertension_rf[, 1:7])] <- -1 # Replace NA with -1 in columns 1 to 7 only

hypertension_rf <- hypertension_rf[!is.na(hypertension_rf$hypertension), ] # Drop rows whose hypertension value is missing

# Map the level codes 1/2/3 back to the labels 0/1/2 (column 8 is
# hypertension). The statements must run in this ascending order: each step
# writes a value that an earlier step has already handled.
hypertension_rf[hypertension == 1, 8] <- 0

hypertension_rf[hypertension == 2, 8] <- 1

hypertension_rf[hypertension == 3, 8] <- 2

cols_hyp_rf <- names(hypertension_rf[, c(2, 5:9)]) # Names of columns 2 and 5 to 9, the variables to convert

# Assign into this block's own column list. The earlier version indexed the
# left-hand side with cols_hyp_dt, which belongs to the decision-tree data set.
hypertension_rf[, cols_hyp_rf] <- lapply(hypertension_rf[, cols_hyp_rf, with = FALSE], factor) # Convert those columns to factor

#### Exploratory data analysis (EDA) ####

# The EDA works on complete cases only, so drop every row that has an NA

diabetes_eda <- na.omit(diabetes)

# Columns 2 and 5 to 9 hold the variables that are re-encoded as factors

cols_eda <- names(diabetes_eda)[c(2, 5:9)]

diabetes_eda[, cols_eda] <- lapply(
  diabetes_eda[, cols_eda, with = FALSE],
  function(column) factor(column)
)

Exploratory Data Analysis (EDA)

summary(object = diabetes_eda) # Summary statistics of the complete-case data

##       age        gender        bmi          waist_size    alcohol 
##  Min.   :18.00   0:2271   Min.   :14.47   Min.   :26.00   0:1500  
##  1st Qu.:27.00   1: 599   1st Qu.:22.60   1st Qu.:32.00   1:1370  
##  Median :30.00            Median :24.71   Median :33.00           
##  Mean   :30.61            Mean   :25.01   Mean   :33.09           
##  3rd Qu.:34.00            3rd Qu.:27.21   3rd Qu.:35.00           
##  Max.   :56.00            Max.   :49.14   Max.   :42.00           
##  smoke    exercise hypertension diabetes
##  0:2331   0:2361   0:1816       0:2438  
##  1: 539   1: 509   1: 673       1: 143  
##                    2: 381       2: 289  
##                                         
##                                         
##

Patient related factors for diabetes

# Patient related factors plotted against diabetes status. diabetes is a
# factor in diabetes_eda, so the formula plots of the numeric variables are
# drawn once per diabetes class.

# Age vs Diabetes

plot(age ~ diabetes, data = diabetes_eda, col = colors()[100:102],
     main = "Age vs Diabetes")

# Waist_size vs Diabetes

plot(waist_size ~ diabetes, data = diabetes_eda, col = colors()[100:102],
     main = "Waist_size vs Diabetes")

# library() fails loudly if ggplot2 is missing; require() would only return
# FALSE and the ggplot() calls would fail later.
library(ggplot2)

# Gender vs Diabetes (counts per diabetes class, bars split by gender)

ggplot(diabetes_eda, aes(diabetes, ..count..)) +
  geom_bar(aes(fill = gender), position = "dodge") +
  labs(title = "Gender vs Diabetes",
       subtitle = "Patient related factors",
       caption = " * Higher number of male patients
         with pre-diabetes and diabetes than female patients")

# BMI vs Diabetes

plot(bmi ~ diabetes, data = diabetes_eda, col = colors()[100:102],
     main = "BMI vs Diabetes")

Patient related factors for hypertension

# Gender vs Hypertension: counts per hypertension class, bars split by gender

ggplot(data = diabetes_eda, mapping = aes(x = hypertension, y = ..count..)) +
  geom_bar(mapping = aes(fill = gender), position = "dodge") +
  labs(
    caption = " * Higher number of male patients
         with pre-hypertension and hypertension than female patients",
    subtitle = "Patient related factors",
    title = "Gender vs Hypertension"
  )

# Age vs Hypertension

plot(age ~ hypertension,
     main = "Age vs Hypertension",
     col = colors()[100:102],
     data = diabetes_eda)

# Waist_size vs Hypertension

plot(waist_size ~ hypertension,
     main = "Waist_size vs Hypertension",
     col = colors()[100:102],
     data = diabetes_eda)

# BMI vs Hypertension

plot(bmi ~ hypertension,
     main = "BMI vs Hypertension",
     col = colors()[100:102],
     data = diabetes_eda)

Lifestyle related factors for diabetes

# Lifestyle related factors plotted against diabetes status. Each chart shows
# counts per diabetes class with bars split by the lifestyle factor.
# The captions had typos ("pateints") and the smoking caption was missing
# "than"; they are corrected here because they are shown on the charts.

# Alcohol vs diabetes

ggplot(diabetes_eda, aes(diabetes, ..count..)) +
  geom_bar(aes(fill = alcohol), position = "dodge") +
  labs(title = "Alcohol vs Diabetes",
       subtitle = "Lifestyle related factors for diabetes",
       caption = " *Alcohol consumption among diabetes patients is higher than 
                   patients with pre-diabetes")

# Smoking vs diabetes

ggplot(diabetes_eda, aes(diabetes, ..count..)) +
  geom_bar(aes(fill = smoke), position = "dodge") +
  labs(title = "Smoking vs Diabetes",
       subtitle = "Lifestyle related factors for diabetes",
       caption = " *Number of habitual smokers is higher in
                   diabetic patients than patients with pre-diabetes")

# Exercise vs Diabetes
ggplot(diabetes_eda, aes(diabetes, ..count..)) +
  geom_bar(aes(fill = exercise), position = "dodge") +
  labs(title = "Exercise vs Diabetes",
       subtitle = "Lifestyle related factors",
       caption = " * Lower incidence of diabetes in patients who exercise")

Lifestyle related factors for hypertension

# Alcohol vs hypertension: counts per hypertension class, split by alcohol use

ggplot(data = diabetes_eda, mapping = aes(x = hypertension, y = ..count..)) +
  geom_bar(mapping = aes(fill = alcohol), position = "dodge") +
  labs(
    caption = " *Alcohol consumption is more common in patients
       with pre-hypertension than with hypertension",
    subtitle = "Lifestyle related factors for hypertension",
    title = "Alcohol vs Hypertension"
  )

# Smoking vs hypertension: counts per hypertension class, split by smoking

ggplot(data = diabetes_eda, mapping = aes(x = hypertension, y = ..count..)) +
  geom_bar(mapping = aes(fill = smoke), position = "dodge") +
  labs(
    caption = " *Smoking is slightly more common in patients
       with pre-hypertension than with hypertension",
    subtitle = "Lifestyle related factors for hypertension",
    title = "Smoking vs Hypertension"
  )

# Exercise vs Hypertension: counts per hypertension class, split by exercise

ggplot(data = diabetes_eda, mapping = aes(x = hypertension, y = ..count..)) +
  geom_bar(mapping = aes(fill = exercise), position = "dodge") +
  labs(
    caption = " * Lower incidence of hypertension in patients who exercise",
    subtitle = "Lifestyle related factors",
    title = "Exercise vs Hypertension"
  )

Model Building

Classification models for diabetes

Model 1: Conditional inference ordinal response tree (CIORT)

#### Model Building ####

# library() fails loudly if party is missing; require() would only return
# FALSE and ctree() would fail later.
library(party)


# Conditional inference ordinal response tree (CIORT) for diabetes
# NOTE(review): the response is created with factor(), not ordered(), so
# ctree() presumably treats it as nominal -- confirm whether an ordered
# factor was intended.

N_diabetes_dt <- nrow(diabetes_dt) # Number of rows

# 70 % of the rows are used for training. Computed from the data instead of
# hard-coded (gives 3956 for the 5651-row data set used originally).
n_train_diabetes_dt <- round(N_diabetes_dt * 0.7)

set.seed(1234) # Reproducible split

train_diabetes_dt <- sample(seq_len(N_diabetes_dt), n_train_diabetes_dt, replace = FALSE) # Random row indices

train_diab_data_dt <- diabetes_dt[train_diabetes_dt, ] # Train data set

test_diab_data_dt <- diabetes_dt[-train_diabetes_dt, ] # Test data set

fit_dib_dt <- ctree(diabetes ~ ., data = train_diab_data_dt,
                    controls = ctree_control(mincriterion = 0.95)) # Train CIORT


# Training accuracy and misclassification error (MCE)
# Confusion table: rows = predicted class, columns = observed class
tab_diab_dt_train <- table(predict(fit_dib_dt, train_diab_data_dt), train_diab_data_dt$diabetes)

# Accuracy = correctly classified (diagonal) / all observations. The previous
# denominator, sum(tab[1, ]), only counted the rows predicted as the first class.
accuracy1 <- sum(diag(tab_diab_dt_train)) / sum(tab_diab_dt_train)

missclass1 <- 1 - accuracy1 # MCE

# Test accuracy and misclassification error (MCE)
tab_diab_dt_test <- table(predict(fit_dib_dt, test_diab_data_dt), test_diab_data_dt$diabetes)

accuracy2 <- sum(diag(tab_diab_dt_test)) / sum(tab_diab_dt_test)

missclass2 <- 1 - accuracy2 # MCE

Model 2: Conditional inference ordinal random forest (CIORF)

# Conditional inference ordinal random forest (CIORF) for diabetes

N_diabetes_rf <- nrow(diabetes_rf) # Number of rows

# 70 % of the rows are used for training. Computed from the data instead of
# hard-coded (gives 3956 for the 5651-row data set used originally).
n_train_diabetes_rf <- round(N_diabetes_rf * 0.7)

set.seed(1234) # Reproducible split and forest

train_diabetes_rf <- sample(seq_len(N_diabetes_rf), n_train_diabetes_rf, replace = FALSE) # Random row indices

train_diab_data_rf <- diabetes_rf[train_diabetes_rf, ] # Train data set

test_diab_data_rf <- diabetes_rf[-train_diabetes_rf, ] # Test data set

fit_dib_rf <- cforest(diabetes ~ ., data = train_diab_data_rf,
                      controls = cforest_unbiased(mtry = 3)) # Train CIORF

# Training accuracy and misclassification error (MCE)
# Confusion table: rows = predicted class, columns = observed class.
# NOTE(review): party documents OOB as applying only when no newdata is
# given; since newdata is passed here the flag is presumably ignored -- confirm.
tab_diab_rf_train <- table(
  predict(fit_dib_rf, train_diab_data_rf, OOB = TRUE, type = "response"),
  train_diab_data_rf$diabetes
)

# Accuracy = correctly classified (diagonal) / all observations. The previous
# denominator, sum(tab[1, ]), only counted the rows predicted as the first class.
accuracy1_rf <- sum(diag(tab_diab_rf_train)) / sum(tab_diab_rf_train)

missclass1_rf <- 1 - accuracy1_rf # MCE

# Test accuracy and misclassification error (MCE)
tab_diab_rf_test <- table(
  predict(fit_dib_rf, test_diab_data_rf, OOB = TRUE, type = "response"),
  test_diab_data_rf$diabetes
)

accuracy2_rf <- sum(diag(tab_diab_rf_test)) / sum(tab_diab_rf_test)

missclass2_rf <- 1 - accuracy2_rf # MCE

Classification models for Hypertension

Model 1: Conditional inference ordinal response tree (CIORT)

# Model 1
# Conditional inference ordinal response tree (CIORT) for hypertension
# NOTE(review): the response is created with factor(), not ordered(), so
# ctree() presumably treats it as nominal -- confirm whether an ordered
# factor was intended.


N_hypertension_dt <- nrow(hypertension_dt) # Number of rows

# 70 % of the rows are used for training. Computed from the data instead of
# hard-coded (the original run printed 3697.4, which rounds to the 3697 used before).
n_train_hypertension_dt <- round(N_hypertension_dt * 0.7)

set.seed(1234) # Reproducible split

train_hypertension_dt <- sample(seq_len(N_hypertension_dt), n_train_hypertension_dt, replace = FALSE) # Random row indices

train_hyp_data_dt <- hypertension_dt[train_hypertension_dt, ] # Train data set

test_hyp_data_dt <- hypertension_dt[-train_hypertension_dt, ] # Test data set


fit_hyp_dt <- ctree(hypertension ~ ., data = train_hyp_data_dt,
                    controls = ctree_control(mincriterion = 0.95)) # Train CIORT

# Training accuracy and misclassification error (MCE)
# Confusion table: rows = predicted class, columns = observed class
tab_hyp_dt_train <- table(predict(fit_hyp_dt, train_hyp_data_dt), train_hyp_data_dt$hypertension)

# Accuracy = correctly classified (diagonal) / all observations. The previous
# denominator, sum(tab[1, ]), only counted the rows predicted as the first class.
accuracy1_dt_hyp <- sum(diag(tab_hyp_dt_train)) / sum(tab_hyp_dt_train)

missclass1_dt_hyp <- 1 - accuracy1_dt_hyp # MCE

# Test accuracy and misclassification error (MCE)
tab_hyp_dt_test <- table(predict(fit_hyp_dt, test_hyp_data_dt), test_hyp_data_dt$hypertension)

accuracy2_dt_hyp <- sum(diag(tab_hyp_dt_test)) / sum(tab_hyp_dt_test)

missclass2_dt_hyp <- 1 - accuracy2_dt_hyp # MCE

Model 2: Conditional inference ordinal random forest (CIORF)

# Model 2
# Conditional inference ordinal random forest (CIORF) for hypertension

N_hypertension_rf <- nrow(hypertension_rf) # Number of rows

# 70 % of the rows are used for training. Computed from the data instead of
# hard-coded (the original run printed 3697.4, which rounds to the 3697 used before).
n_train_hypertension_rf <- round(N_hypertension_rf * 0.7)

set.seed(1234) # Reproducible split and forest

train_hypertension_rf <- sample(seq_len(N_hypertension_rf), n_train_hypertension_rf, replace = FALSE) # Random row indices

train_hyp_data_rf <- hypertension_rf[train_hypertension_rf, ] # Train data set

test_hyp_data_rf <- hypertension_rf[-train_hypertension_rf, ] # Test data set


fit_hyp_rf <- cforest(hypertension ~ ., data = train_hyp_data_rf,
                      controls = cforest_unbiased(mtry = 3)) # Train CIORF


# Training accuracy and misclassification error (MCE)
# Confusion table: rows = predicted class, columns = observed class.
# NOTE(review): party documents OOB as applying only when no newdata is
# given; since newdata is passed here the flag is presumably ignored -- confirm.
tab_hyp_rf_train <- table(
  predict(fit_hyp_rf, train_hyp_data_rf, OOB = TRUE, type = "response"),
  train_hyp_data_rf$hypertension
)

# Accuracy = correctly classified (diagonal) / all observations. The previous
# denominator, sum(tab[1, ]), only counted the rows predicted as the first class.
accuracy1_hyp_rf <- sum(diag(tab_hyp_rf_train)) / sum(tab_hyp_rf_train)

missclass1_hyp_rf <- 1 - accuracy1_hyp_rf # MCE

# Test accuracy and misclassification error (MCE)
tab_hyp_rf_test <- table(
  predict(fit_hyp_rf, test_hyp_data_rf, OOB = TRUE, type = "response"),
  test_hyp_data_rf$hypertension
)

accuracy2_hyp_rf <- sum(diag(tab_hyp_rf_test)) / sum(tab_hyp_rf_test)

missclass2_hyp_rf <- 1 - accuracy2_hyp_rf # MCE

Results

Results for Diabetes classifiers

# Collect train/test accuracy and misclassification error (MCE) of both
# diabetes classifiers in one table: first row CIORT, second row CIORF.
results_diabetes <- data.frame(
  Model = c("CIORT", "CIORF"),
  Train_Accuracy = c(accuracy1, accuracy1_rf),
  Train_MCE = c(missclass1, missclass1_rf),
  Test_Accuracy = c(accuracy2, accuracy2_rf),
  Test_MCE = c(missclass2, missclass2_rf)
)

# Render the table for the report (caption typo "Classfiers" corrected)
knitr::kable(
  head(results_diabetes), booktabs = TRUE,
  caption = "Classifiers for prediction of diabetes"
)

Classifiers for prediction of diabetes
Model	Train_Accuracy	Train_MCE	Test_Accuracy	Test_MCE
CIORT	0.8850370	0.1149630	0.8964902	0.1035098
CIORF	0.8827569	0.1172431	0.8943620	0.1056380

Results for Hypertension classifiers

# Collect train/test accuracy and misclassification error (MCE) of both
# hypertension classifiers in one table: first row CIORT, second row CIORF.
results_hypertension <- data.frame(
  Model = c("CIORT", "CIORF"),
  Train_Accuracy = c(accuracy1_dt_hyp, accuracy1_hyp_rf),
  Train_MCE = c(missclass1_dt_hyp, missclass1_hyp_rf),
  Test_Accuracy = c(accuracy2_dt_hyp, accuracy2_hyp_rf),
  Test_MCE = c(missclass2_dt_hyp, missclass2_hyp_rf)
)

# Render the table for the report (caption typo "Classfiers" corrected)
knitr::kable(
  head(results_hypertension), booktabs = TRUE,
  caption = "Classifiers for prediction of hypertension"
)

Classifiers for prediction of hypertension
Model	Train_Accuracy	Train_MCE	Test_Accuracy	Test_MCE
CIORT	0.7782821	0.2217179	0.7757085	0.2242915
CIORF	0.8009003	0.1990997	0.7789389	0.2210611

Classification models for prediction of diabetes and hypertension

karthik

November 27, 2017