library(caret)
library(magrittr)
library(DMwR)
library(knitr)
library(tibble)
library(pROC)
library(ggplot2)
library(randomForest)
library(class)
library(sgd)

Model A

Logistic regression
    set.seed(126)
    
    #data = readRDS(file = 'exer_trainD.rds')
    mydata = readRDS(file = 'exer_data1.rds')
    #data[] = lapply(data[, c(2:8)], factor)
    
    F1 <- createDataPartition(mydata$CVDCRHD4_F, p = 0.5, groups = 2)
    data_sub = mydata[F1$Resample1,]
    F2 <- createDataPartition(data_sub$CHCOCNCR_F, p = 0.8, groups = 2)
    
    select = c('CVDCRHD4_F', 'X_HCVU651_F', 
               'cutPADUR', 'cutPAFRE', 'X_AGE_G',
               'SEX',  'EXRACT11',
               'X_BMI5CAT')
               
    select_col <- which(names(mydata) %in% select)
    
    train = data_sub[F2$Resample1,select_col]
    test = data_sub[-F2$Resample1,select_col]
    y = 'CVDCRHD4_F'
    x = select[2:8]
    fmla <- paste(y, paste(x, collapse = "+"), sep = '~')
    model <- glm(as.formula(fmla), data = train, family = binomial(link = 'logit'))
    modelA <- model
Extracting advice from the logistic model

By looking at the coefficients of our glm model, we find that the variables positively correlated with the cardio recommendation (TRUE) include:

* No health care coverage
* BMI category 4 (BMI > 30)
* Exercise types such as frisbee, mowing the lawn, pilates, tai chi, volleyball, and yoga (these are low-intensity activities)

Other variables are negatively correlated with the cardio recommendation.

model$coefficients
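
Since a logistic model works on the log-odds scale, exponentiating the coefficients turns them into odds ratios, which are easier to read; a minimal sketch using the fitted model above:

## odds ratios: values > 1 increase the odds of a TRUE outcome,
## values < 1 decrease them
odds_ratios <- exp(coef(model))
head(sort(odds_ratios, decreasing = TRUE), 10)
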
Variable importance

The three variables with the highest importance are SEX, age, and exercise duration.

impDf = varImp(model)
class(impDf)
impDf_order = data.frame( varNames = rownames(impDf)[order(impDf$Overall, decreasing = T)],
                          imp = impDf[order(impDf$Overall, decreasing = T),])
#impDf_order$varNames
kable(head(impDf_order, 15))
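
The same ranking can be visualized as a bar chart; a quick sketch using the impDf_order frame built above:

## top 15 variables by importance, largest at the top
ggplot(head(impDf_order, 15), aes(x = reorder(varNames, imp), y = imp)) +
    geom_bar(stat = 'identity') +
    coord_flip() +
    labs(x = NULL, y = 'importance')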

Make and evaluate predictions

Predict
train$pred <- predict(model, newdata = train, type = "response")
test$pred <- predict(model, newdata = test, type = "response")
Characterize prediction quality

By looking at the density curves of the predicted probabilities, we can see that the positive cases (recommend more cardio exercise) are concentrated on the right and the negative cases (no recommendation for more cardio exercise) are concentrated on the left. The two classes are reasonably well separated. To turn the model into a classifier, we need to pick a threshold.

ggplot(train, aes(x= pred, color = CVDCRHD4_F, linetype = CVDCRHD4_F)) +
    geom_density()
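
For illustration, here is a minimal sketch of turning the probabilities into class labels; the 0.02 cutoff is a hypothetical value, not a tuned threshold:

## hypothetical cutoff; in practice it should be tuned against the metric of interest
threshold <- 0.02
test$pred_class <- test$pred > threshold
table(test$pred_class, test$CVDCRHD4_F)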

Model A2: GLM on SMOTE-processed data

    set.seed(126)
    
    #data = readRDS(file = 'exer_trainD.rds')
    mydata = readRDS(file = 'exer_data1.rds')
    #data[] = lapply(data[, c(2:8)], factor)
    
    F1 <- createDataPartition(mydata$CVDCRHD4_F, p = 0.5, groups = 2)
    data_sub = mydata[F1$Resample1,]
    F2 <- createDataPartition(data_sub$CHCOCNCR_F, p = 0.8, groups = 2)
    
    select = c('CVDCRHD4_F', 'X_HCVU651_F', 
               'cutPADUR', 'cutPAFRE', 'X_AGE_G',
               'SEX',  'EXRACT11',
               'X_BMI5CAT')
               
    select_col <- which(names(mydata) %in% select)
    
    train = data_sub[F2$Resample1,select_col]
    test = data_sub[-F2$Resample1,select_col]
    
    train_smote <- SMOTE(CVDCRHD4_F~., train, 
                         perc.over = 100, perc.under=200)
    
    y = 'CVDCRHD4_F'
    x = select[2:8]
    fmla <- paste(y, paste(x, collapse = "+"), sep = '~')
    model <- glm(as.formula(fmla), data = train_smote, family = binomial(link = 'logit'))
    #model <- readRDS('model_gbm_heart.rds')
    train_smote$pred <- predict(model, newdata = train_smote, type = "response")
    test$pred <- predict(model, newdata = test, type = "response")
    
    ggplot(train_smote, 
           aes(x= pred, color = CVDCRHD4_F, linetype = CVDCRHD4_F)) +
    geom_density()
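
As a quick check of the SMOTE-trained model on the untouched test set, the AUC can be computed with pROC (a minimal sketch with the objects defined above):

    auc_a2 <- roc(as.numeric(test$CVDCRHD4_F), test$pred)
    print(auc_a2)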

Model B: Random forest model

## set up a CV grid to tune the mtry parameter
set.seed(999)
# mtry = sqrt(length(select))
# tunegrid <- expand.grid(.mtry=mtry)
trCtrl <- trainControl(method = 'cv', number = 3, summaryFunction=defaultSummary)
Grid <- expand.grid(.mtry=c(1:7))

select = c('CVDCRHD4_F', 'X_HCVU651_F', 
           'cutPADUR', 'cutPAFRE', 'X_AGE_G',
           'SEX',  'met',
           'X_BMI5CAT')

select_col <- which(names(mydata) %in% select)

train <- data_sub[F2$Resample1, select_col]
test <- data_sub[-F2$Resample1, select_col]

train <- train[complete.cases(train), ]
test <- test[complete.cases(test),]

select2 <- c('X_HCVU651_F',
               'cutPADUR', 'cutPAFRE', 'X_AGE_G',
               'SEX',  'met', 'X_BMI5CAT')

#tune.rf <- tuneRF( train[,select2], train$CVDCRHD4_F, stepFactor=2)
#plot(tune.rf)
#best.rf <- readRDS('rf-model.rds')
best.rf <- randomForest(CVDCRHD4_F ~ .,
                        data = train,
                        mtry = 2)
varImpPlot(best.rf)
pred <- predict(best.rf, newdata = test[,select2])
ctab.test = table(pred, test$CVDCRHD4_F)
precision <- ctab.test[2,2]/sum(ctab.test[2,])
print(precision)
ctab.test

Oversampling and undersampling to resolve the issue of imbalance

Oops, something is wrong with both the glm and the random forest model: the predicted outcomes are all FALSE, without a single positive prediction. This is because only about 2% of the population has heart disease (a rare event). There are multiple ways to mitigate this problem (see this post). I first try synthetic oversampling of the heart-disease-affected population, using the SMOTE method in the DMwR package, which creates synthetic minority cases by interpolating between each minority case and its nearest minority-class neighbours.

train_smote <- SMOTE(CVDCRHD4_F~., train, perc.over = 100, perc.under=200)
summary(train_smote$CVDCRHD4_F)

After SMOTE sampling, the ratio of positive to negative outcomes in the training data is 1:1.
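
This can be verified directly from the SMOTE output:

## class shares after SMOTE; expected to be roughly 0.5 / 0.5
prop.table(table(train_smote$CVDCRHD4_F))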

Model C: Random forest model on SMOTE-processed training data

## use tuneRF to tune the mtry parameter
set.seed(188)

select2 <- c('X_HCVU651_F',
               'cutPADUR', 'cutPAFRE', 'X_AGE_G',
               'SEX',  'met', 'X_BMI5CAT')
tune.rf <- tuneRF( train_smote[,select2], train_smote$CVDCRHD4_F, stepFactor=2)
best.rf <- randomForest(CVDCRHD4_F ~ .,
            data = train_smote, mtry = 2)
pred <- predict(best.rf, newdata = test[,select2])
ctab.test = table(pred, test$CVDCRHD4_F)
precision <- ctab.test[2,2]/sum(ctab.test[2,])
print(precision)

The random forest model manages to achieve 5% precision after training on the SMOTE-processed data, but there are a large number of false positives among the predicted outcomes. Because of the precision paradox, precision is not a good metric for evaluating models of rare events like heart disease risk. Therefore, I try ROC analysis to evaluate the model.
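
For a rare outcome, recall (sensitivity) is at least as informative as precision. A small sketch computed from the same confusion table (assuming the outcome factor levels are FALSE/TRUE):

## recall = TP / (TP + FN): predicted TRUE among the actually TRUE
recall <- ctab.test[2, 2] / sum(ctab.test[, 2])
print(recall)
## caret can report the full set of metrics from the same table
confusionMatrix(ctab.test, positive = 'TRUE')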

pROC to measure the AUC
auc <- roc(as.numeric(test$CVDCRHD4_F), 
           as.numeric(pred))
print(auc)
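
The curve itself can be plotted from the same roc object:

plot(auc, legacy.axes = TRUE)   # the AUC is reported by print(auc) above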

Model D: SMOTE with tuned parameter

There are several parameters for tuning the SMOTE function: perc.over controls how many synthetic minority-class cases are generated (as a percentage of the existing minority cases), and perc.under controls how many majority-class cases are sampled for each synthetic minority case generated.
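
To make the arithmetic concrete, here is a sketch of the expected class counts under these settings, for a hypothetical n = 100 minority cases:

n       <- 100                 # hypothetical number of minority cases
new_pos <- n * 100/100         # synthetic positives from perc.over = 100
pos     <- n + new_pos         # original plus synthetic positives
neg     <- new_pos * 1000/100  # majority cases from perc.under = 1000
pos / (pos + neg)              # about 0.17 positive share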

train_smote2 <- SMOTE(CVDCRHD4_F~., train, perc.over = 100, perc.under=1000)
summary(train_smote2$CVDCRHD4_F)
## use tuneRF to tune the mtry parameter
set.seed(188)

select2 <- c('X_HCVU651_F',
               'cutPADUR', 'cutPAFRE', 'X_AGE_G',
               'SEX',  'met', 'X_BMI5CAT')
tune.rf <- tuneRF( train_smote2[,select2], train_smote2$CVDCRHD4_F, stepFactor=2)
best.rf2 <- randomForest(CVDCRHD4_F ~ .,
            data = train_smote2, mtry = 2)
pred <- predict(best.rf2, newdata = test[,select2])
ctab.test = table(pred, test$CVDCRHD4_F)
precision <- ctab.test[2,2]/sum(ctab.test[2,])
print(precision)
pROC to measure the AUC
auc <- roc(as.numeric(test$CVDCRHD4_F), 
           as.numeric(pred))
print(auc)

So a more realistic SMOTE sampling (about 15% positive outcomes) actually does not confer better performance.

Model E: KNN on SMOTE-processed data

Because of the low performance of both the random forest and the GLM, I try KNN on the numeric variables to create a new model.

mydata = readRDS(file = 'exer_data1.rds')

    F1 <- createDataPartition(mydata$CVDCRHD4_F, p = 0.5, groups = 2)
    data_sub = mydata[F1$Resample1,]
    F2 <- createDataPartition(data_sub$CVDCRHD4_F, p = 0.8, groups = 2)
    
    select = c('CVDCRHD4_F', 
               'PADUR1_', 'PAFREQ1_')
    
    data_sub = data_sub[, select]
    data_sub[, c(2:3)] = lapply(data_sub[, c(2:3)], as.numeric)
    
    ## normalize the numeric columns to [0, 1]
    normalize <- function(x) {
        (x - min(x)) / (max(x) - min(x))
    }
    data_sub[, c(2:3)] = lapply(data_sub[, c(2:3)], normalize)

    
    train = data_sub[F2$Resample1,select]
    
    train_smote <- SMOTE(CVDCRHD4_F~., train, 
                         perc.over = 100, perc.under=200)
    summary(train_smote$CVDCRHD4_F)
    test = data_sub[-F2$Resample1,select]
# prc_test_pred <- class::knn(train = train_smote[,c(2:3)],
#                      test = test[,c(2:3)],
#                      cl = train_smote[, 1], k=1,
#                      use.all = FALSE)

## too many ties in the data

It looks like there are too many ties in the data (some variables, such as frequency and duration, have many repeated values).
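
One common workaround, sketched below under the assumption of complete cases, is to break the exact distance ties by adding a tiny jitter to the repeated numeric features before calling knn (the choice of k = 5 is arbitrary here):

## add a tiny random jitter so nearest-neighbour distances are no
## longer exactly tied (hypothetical fix, not the original analysis)
train_j <- as.data.frame(lapply(train_smote[, c(2:3)], jitter, amount = 1e-6))
test_cc <- test[complete.cases(test), ]
test_j  <- as.data.frame(lapply(test_cc[, c(2:3)], jitter, amount = 1e-6))
knn_pred <- class::knn(train = train_j, test = test_j,
                       cl = train_smote[, 1], k = 5)
table(knn_pred, test_cc$CVDCRHD4_F)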

Model F: GBM on SMOTE-processed data

mydata = readRDS(file = 'exer_data1.rds')

    F1 <- createDataPartition(mydata$CVDCRHD4_F, p = 0.5, groups = 2)
    data_sub = mydata[F1$Resample1,]
    F2 <- createDataPartition(data_sub$CVDCRHD4_F, p = 0.8, groups = 2)
    select = c('CVDCRHD4_F', 'X_HCVU651_F', 
               'cutPADUR', 'cutPAFRE', 'X_AGE_G',
               'SEX',  'met',
               'X_BMI5CAT')

    select_col <- which(names(mydata) %in% select)

    train <- data_sub[F2$Resample1, select_col]
    test <- data_sub[-F2$Resample1, select_col]

train <- train[complete.cases(train), ]
test <- test[complete.cases(test),]

train_smote <- SMOTE(CVDCRHD4_F~., train, perc.over = 100, perc.under=200)
#summary(train_smote$CVDCRHD4_F)

trCtl <- trainControl(method = 'cv', number = 3, 
                      summaryFunction=defaultSummary)
Grid <- expand.grid( n.trees = seq(5, 100, 20), 
                     interaction.depth = c(10), 
                     shrinkage = c(0.1), n.minobsinnode = 20)
fit.gbm <- train(CVDCRHD4_F~., data = train_smote,
                 method = 'gbm', trControl = trCtl, tuneGrid = Grid)
#fit.gbm <- readRDS('rpart-gbm.rds')
plot(fit.gbm)
pred <- predict(fit.gbm, newdata = test)
ctab.test <- table(pred, test$CVDCRHD4_F)
precision <- ctab.test[2,2]/sum(ctab.test[2,])
print(precision)
#library(pROC)
auc <- roc(as.numeric(test$CVDCRHD4_F), 
           as.numeric(pred))
print(auc)

Here the GBM model performs a bit better than the first random forest model.

Model G: Stochastic Gradient Descent (SGD) model

The SGD model is particularly efficient for large-scale classification problems. Here I use the sgd package to fit the model with default hyperparameters.

mydata = readRDS(file = 'exer_data1.rds')

    F1 <- createDataPartition(mydata$CVDCRHD4_F, p = 0.5, groups = 2)
    select = c('CVDCRHD4_F', 'PADUR1_',
           'PAFREQ1_', 'X_BMI5',
           'met', 'X_AGE_G')
    data_sub = mydata[F1$Resample1, select]
    ## SGD is sensitive to feature scaling, so scale the numeric features first (http://scikit-learn.org/stable/modules/sgd.html#sgd)
    data_sub[, c(2:6)] = lapply(data_sub[,c(2:6)], as.numeric)
    data_sub[, c(2:6)] = lapply(data_sub[,c(2:6)], scale)
    
    F2 <- createDataPartition(data_sub$CVDCRHD4_F, p = 0.8, groups = 2)
    
    train <- data_sub[F2$Resample1, select]
    test <- data_sub[-F2$Resample1, select]

    train <- train[complete.cases(train), ]
    test <- test[complete.cases(test),]


train_smote <- SMOTE(CVDCRHD4_F~., train, perc.over = 100, perc.under=200)
##false = 0, true = 1
train_smote[, 1] = as.integer(c(0, 1)[as.numeric(train_smote[, 1])])
train_smote[, c(2:6)] = lapply(train_smote[, c(2:6)], as.numeric)
fit.sgd <- sgd(CVDCRHD4_F~., data = train_smote, 
               model = 'glm',
               model.control = list(family = "binomial"))

## the fitted model has an intercept plus 5 predictor coefficients, so the
## matrix passed to predict() should be an intercept column plus the predictors,
## not the outcome column (an assumption about the sgd predict interface; the
## original code passed the outcome as the first column, which was the flagged bug)
train_matrix <- cbind(1, as.matrix(sapply(train_smote[, c(2:6)], as.numeric)))
train_smote$pred <- predict(fit.sgd, x_test = train_matrix, 
                             type = 'response')
train_smote$pred <- as.numeric(train_smote$pred)
train_smote$CVDCRHD4_F <- as.factor(train_smote$CVDCRHD4_F)
ggplot(train_smote, aes(x= pred, 
                        color = CVDCRHD4_F, 
                        linetype = CVDCRHD4_F)) +
    geom_density()

## intercept column plus the 5 predictors (same assumption as above)
test_matrix <- cbind(1, as.matrix(sapply(test[, c(2:6)], as.numeric)))
test$pred <- predict(fit.sgd, x_test = test_matrix, type='response')
summary(test$pred[test$CVDCRHD4_F == TRUE])
summary(test$pred[test$CVDCRHD4_F == FALSE])
## cutoff read from the summaries above; its value and direction should be
## re-checked after the design-matrix fix
test$pred_f <- ifelse(test$pred < 0.0007173, TRUE, FALSE)
ctab.test <- table(test$pred_f, test$CVDCRHD4_F)
precision <- ctab.test[2,2]/sum(ctab.test[2,])
print(precision)
#library(pROC)
auc <- roc(as.numeric(test$CVDCRHD4_F), 
           as.numeric(test$pred))
print(auc)