library(caret)
library(magrittr)
library(DMwR)
library(knitr)
library(tibble)
library(pROC)
library(ggplot2)
library(randomForest)
library(class)
library(sgd)
set.seed(126)
#data = readRDS(file = 'exer_trainD.rds')
mydata = readRDS(file = 'exer_data1.rds')
#data[] = lapply(data[, c(2:8)], factor)
F1 <- createDataPartition(mydata$CVDCRHD4_F, p = 0.5, groups = 2)
data_sub = mydata[F1$Resample1,]
F2 <- createDataPartition(data_sub$CHCOCNCR_F, p = 0.8, groups = 2)
select = c('CVDCRHD4_F', 'X_HCVU651_F',
'cutPADUR', 'cutPAFRE', 'X_AGE_G',
'SEX', 'EXRACT11',
'X_BMI5CAT')
select_col = c()
for (i in 1:length(names(mydata))) {
if (names(mydata)[i] %in% select) {
select_col = c(select_col, i)
}
}
train = data_sub[F2$Resample1,select_col]
test = data_sub[-F2$Resample1,select_col]
y = 'CVDCRHD4_F'
x = select[2:8]
fmla <- paste(y, paste(x, collapse = "+"), sep = '~')
#model <- glm(fmla, data = train, family = binomial(link = 'logit'))
# modelA <- model
By looking at coefficients of our glm model, we find variables that are positively correlated with the cardio recommendation (TRUE) includes: * No health care coverage * BMI category 4: BMI > 30 * Exercise types that are correlated with the cardio recommendation: frisbee, mowing lawn, pilates, tai chi, volley ball, yoga, etc (those activities are low intensity)
Other variables are negatively correlated with the cardio recommendation.
model$coefficients
The variables with the top 3 highest importance includes SEX, Age, and duration of exercise.
impDf = varImp(model)
class(impDf)
impDf_order = data.frame( varNames = rownames(impDf)[order(impDf$Overall, decreasing = T)],
imp = impDf[order(impDf$Overall, decreasing = T),])
#impDf_order$varNames
kable(head(impDf_order, 15))
train$pred <- predict(model, newdata = train, type = "response")
test$pred <- predict(model, newdata = test, type = "response")
By looking at the density curve of predicted probabilities, we can see that the positive predictions (recommendation on more cardio exercise) are concentrated on the right and the negative (not giving recommendation on more cardio exercise) are concentrated on the left. Those two instances are well separated. To turn the model into a classifier, we need to pick a thresold.
ggplot(train, aes(x= pred, color = CVDCRHD4_F, linetype = CVDCRHD4_F)) +
geom_density()
set.seed(126)
#data = readRDS(file = 'exer_trainD.rds')
mydata = readRDS(file = 'exer_data1.rds')
#data[] = lapply(data[, c(2:8)], factor)
F1 <- createDataPartition(mydata$CVDCRHD4_F, p = 0.5, groups = 2)
data_sub = mydata[F1$Resample1,]
F2 <- createDataPartition(data_sub$CHCOCNCR_F, p = 0.8, groups = 2)
select = c('CVDCRHD4_F', 'X_HCVU651_F',
'cutPADUR', 'cutPAFRE', 'X_AGE_G',
'SEX', 'EXRACT11',
'X_BMI5CAT')
select_col = c()
for (i in 1:length(names(mydata))) {
if (names(mydata)[i] %in% select) {
select_col = c(select_col, i)
}
}
train = data_sub[F2$Resample1,select_col]
test = data_sub[-F2$Resample1,select_col]
train_smote <- SMOTE(CVDCRHD4_F~., train,
perc.over = 100, perc.under=200)
y = 'CVDCRHD4_F'
x = select[2:8]
fmla <- paste(y, paste(x, collapse = "+"), sep = '~')
#model <- glm(fmla, data = train_smote, family = binomial(link = 'logit'))
model <- readRDS('model_gbm_heart.rds')
train_smote$pred <- predict(model, newdata = train_smote, type = "response")
test$pred <- predict(model, newdata = test, type = "response")
ggplot(train_smote,
aes(x= pred, color = CVDCRHD4_F, linetype = CVDCRHD4_F)) +
geom_density()
## use cvgrid search to tune the mtry paramter
set.seed(999)
# mtry = sqrt(length(select))
# tunegrid <- expand.grid(.mtry=mtry)
trCtrl <- trainControl(method = 'cv', number = 3, summaryFunction=defaultSummary)
Grid <- expand.grid(.mtry=c(1:7))
select = c('CVDCRHD4_F', 'X_HCVU651_F',
'cutPADUR', 'cutPAFRE', 'X_AGE_G',
'SEX', 'met',
'X_BMI5CAT')
select_col <- c()
for (i in 1:length(names(mydata))) {
if (names(mydata)[i] %in% select) {
select_col = c(select_col, i)
}
}
train <- data_sub[F2$Resample1,select_col]
test <- data_sub[-F2$Resample1,select_col]
train <- train[complete.cases(train), ]
test <- test[complete.cases(test),]
select2 <- c('X_HCVU651_F',
'cutPADUR', 'cutPAFRE', 'X_AGE_G',
'SEX', 'met', 'X_BMI5CAT')
#tune.rf <- tuneRF( train[,select2], train$CVDCRHD4_F, stepFactor=2)
#plot(tune.rf)
#best.rf <- readRDS('rf-model.rds')
best.rf <- randomForest(train$CVDCRHD4_F~.,
data = train,
mtry = 2)
# best.rf <- fit
# trCtrl <- trainControl(method = 'cv',
# number = 3, summaryFunction=defaultSummary)
# grid1 = expand.grid(.mtry = 2)
# best.rf <- train(train$X_PASTRNG_F_logic~.,
# trainControl = trCtl, data = train,
# method = 'rf', tuneGrid = grid1)
# #Grid <- expand.grid(cp = seq(0, 0.03, 0.01))
varImpPlot(best.rf)
pred <- predict(best.rf, newdata = test[,select2])
ctab.test = table(pred, test$CVDCRHD4_F)
precision <- ctab.test[2,2]/sum(ctab.test[2,])
print(precision)
ctab.test
Oops, something is wrong with both glm and random forest model. The predicted outcomes are all FALSE and there is not a single positive prediction. This is because only 2% population has heart disease (a rare event). There are multiple ways to eliminate this problem(see this post). I first try to perform synthetic oversampling on the heart disease affected population. I use SMOT method in DMwR pacakge, which is to take
train_smote <- SMOTE(CVDCRHD4_F~., train, perc.over = 100, perc.under=200)
summary(train_smote$CVDCRHD4_F)
After SWOT sampling, the ratio of positive and negative outcome in training data is 1:1.
## use cvgrid search to tune the mtry paramter
set.seed(188)
select2 <- c('X_HCVU651_F',
'cutPADUR', 'cutPAFRE', 'X_AGE_G',
'SEX', 'met', 'X_BMI5CAT')
tune.rf <- tuneRF( train_smote[,select2], train_smote$CVDCRHD4_F, stepFactor=2)
best.rf <- randomForest(train_smote$CVDCRHD4_F~.,
data = train_smote, mtry = 2)
pred <- predict(best.rf, newdata = test[,select2])
ctab.test = table(pred, test$CVDCRHD4_F)
precision <- ctab.test[2,2]/sum(ctab.test[2,])
print(precision)
The random forest model managed to acheieve 5% precision after training on SMOT processed data. There are a large number of false positive in the predicted outcome. Because of the precision paradox, precision is not a good paramter to evaluate models with rare event like heart disease risk. Therefore, I try to use ROC to evaluate the model.
auc <- roc(as.numeric(test$CVDCRHD4_F),
as.numeric(pred))
print(auc)
There ara several parameters to tune the SMOTE function. perc.over is how many extra samples to generate from the minority class. perc.under is how many extra samples are selected from the majority class given each minority cases.
train_smote2 <- SMOTE(CVDCRHD4_F~., train, perc.over = 100, perc.under=1000)
summary(train_smote2$CVDCRHD4_F)
## use cvgrid search to tune the mtry paramter
set.seed(188)
select2 <- c('X_HCVU651_F',
'cutPADUR', 'cutPAFRE', 'X_AGE_G',
'SEX', 'met', 'X_BMI5CAT')
tune.rf <- tuneRF( train_smote2[,select2], train_smote2$CVDCRHD4_F, stepFactor=2)
best.rf2 <- randomForest(train_smote2$CVDCRHD4_F~.,
data = train_smote2, mtry = 2)
pred <- predict(best.rf2, newdata = test[,select2])
ctab.test = table(pred, test$CVDCRHD4_F)
precision <- ctab.test[2,2]/sum(ctab.test[2,])
print(precision)
auc <- roc(as.numeric(test$CVDCRHD4_F),
as.numeric(pred))
print(auc)
So a more realistic SMOTE sampling (15% positive outcome) actually does not confer a better performance.
Because of the low performance of both random forest and the GLM, I try to use KNN on numeric variables to creat a now model.
mydata = readRDS(file = 'exer_data1.rds')
F1 <- createDataPartition(mydata$CVDCRHD4_F, p = 0.5, groups = 2)
data_sub = mydata[F1$Resample1,]
F2 <- createDataPartition(data_sub$CVDCRHD4_F, p = 0.8, groups = 2)
select = c('CVDCRHD4_F',
'PADUR1_', 'PAFREQ1_')
data_sub = data_sub[, select]
data_sub[, c(2:3)] = lapply(data_sub[, c(2:3)], as.numeric)
##normalize the data
normalize <- function(x) {
y <- (x - min(x))/(max(x) - min(x))
y
}
data_sub[, c(2:3)] = lapply(data_sub[,c(2:3)], normalize)
train = data_sub[F2$Resample1,select]
train_smote <- SMOTE(CVDCRHD4_F~., train,
perc.over = 100, perc.under=200)
summary(train_smote$CVDCRHD4_F)
test = data_sub[-F2$Resample1,select]
# prc_test_pred <- class::knn(train = train_smote[,c(2:3)],
# test = test[,c(2:3)],
# cl = train_smote[, 1], k=1,
# use.all = FALSE)
## too many ties in the data
Looks like there are too many ties in the data (some variables, such as frequency and duration, have many repetitive values).
mydata = readRDS(file = 'exer_data1.rds')
F1 <- createDataPartition(mydata$CVDCRHD4_F, p = 0.5, groups = 2)
data_sub = mydata[F1$Resample1,]
F2 <- createDataPartition(data_sub$CVDCRHD4_F, p = 0.8, groups = 2)
select = c('CVDCRHD4_F', 'X_HCVU651_F',
'cutPADUR', 'cutPAFRE', 'X_AGE_G',
'SEX', 'met',
'X_BMI5CAT')
select_col <- c()
for (i in 1:length(names(mydata))) {
if (names(mydata)[i] %in% select) {
select_col = c(select_col, i)
}
}
train <- data_sub[F2$Resample1,select_col]
test <- data_sub[-F2$Resample1,select_col]
train <- train[complete.cases(train), ]
test <- test[complete.cases(test),]
train_smote <- SMOTE(CVDCRHD4_F~., train, perc.over = 100, perc.under=200)
#summary(train_smote$CVDCRHD4_F)
trCtl <- trainControl(method = 'cv', number = 3,
summaryFunction=defaultSummary)
Grid <- expand.grid( n.trees = seq(5, 100, 20),
interaction.depth = c(10),
shrinkage = c(0.1), n.minobsinnode = 20)
fit.gbm <- train(CVDCRHD4_F~., data = train_smote,
method = 'gbm', trControl = trCtl, tuneGrid = Grid)
#fit.gbm <- readRDS('rpart-gbm.rds')
plot(fit.gbm)
pred <- predict(fit.gbm, newdata = test)
ctab.test <- table(pred, test$CVDCRHD4_F)
precision <- ctab.test[2,2]/sum(ctab.test[2,])
print(precision)
#library(pROC)
auc <- roc(as.numeric(test$CVDCRHD4_F),
as.numeric(pred))
print(auc)
Here the GBM model performs a bit better than the first random forest model.
SGD model is particularly efficient for large scale classification problem. Here I use sgd package to implement this model with default hyperparamters.
mydata = readRDS(file = 'exer_data1.rds')
F1 <- createDataPartition(mydata$CVDCRHD4_F, p = 0.5, groups = 2)
select = c('CVDCRHD4_F', 'PADUR1_',
'PAFREQ1_', 'X_BMI5',
'met', 'X_AGE_G')
data_sub = mydata[F1$Resample1, select]
## as SGD is sensitive to feature scaling, scale the data first (http://scikit-learn.org/stable/modules/sgd.html#sgd)if it is numeric data
data_sub[, c(2:6)] = lapply(data_sub[,c(2:6)], as.numeric)
data_sub[, c(2:6)] = lapply(data_sub[,c(2:6)], scale)
F2 <- createDataPartition(data_sub$CVDCRHD4_F, p = 0.8, groups = 2)
train <- data_sub[F2$Resample1, select]
test <- data_sub[-F2$Resample1, select]
train <- train[complete.cases(train), ]
test <- test[complete.cases(test),]
train_smote <- SMOTE(CVDCRHD4_F~., train, perc.over = 100, perc.under=200)
##false = 0, true = 1
train_smote[, 1] = as.integer(c(0, 1)[as.numeric(train_smote[, 1])])
train_smote[, c(2:6)] = lapply(train_smote[, c(2:6)], as.numeric)
fit.sgd <- sgd(CVDCRHD4_F~., data = train_smote,
model = 'glm',
model.control = list(family = "binomial"))
train_matrix <- as.matrix(sapply(train_smote, as.numeric))
train_smote$pred <- predict(fit.sgd, x_test = train_matrix,
type = 'response')
train_smote$pred <- as.numeric(train_smote$pred)
train_smote$CVDCRHD4_F <- as.factor(train_smote$CVDCRHD4_F)
ggplot(train_smote, aes(x= pred,
color = CVDCRHD4_F,
linetype = CVDCRHD4_F)) +
geom_density()
test_matrix <- as.matrix(sapply(test, as.numeric))
test$pred <- predict(fit.sgd, x_test = test_matrix, type='response')
summary(test$pred[test$CVDCRHD4_F == TRUE])
summary(test$pred[test$CVDCRHD4_F == FALSE])
test$pred_f <- ifelse(test$pred < 0.0007173, TRUE, FALSE)
#########bug here#######
ctab.test <- table(test$pred_f, test$CVDCRHD4_F)
precision <- ctab.test[2,2]/sum(ctab.test[2,])
print(precision)
#library(pROC)
auc <- roc(as.numeric(test$CVDCRHD4_F),
as.numeric(test$pred))
print(auc)