In this project, you have to predict the probability of death of a patient that is entering an ICU (Intensive Care Unit), programming in R.
To load all the libraries to be used we are using some code found on stack exchange to do it neatly. In addition, as in previous assigments, to avoid problems and run this code from any machine we are loading the data using links to our own Github repository. Finally, we will load some user defined functions. As in previous assigments, if something is done twice better wrapped it up as a function to avoid repetition.
#Install and load the packages needed
list.of.packages <- c('caret', 'ggplot2','gsubfn' ,'FNN','kknn','fBasics' , 'mvtnorm' ,
'dplyr', 'mice' , 'fastDummies','glmnet', 'MLmetrics','ROCR','tidyr',
'tidyverse', 'magrittr', 'Hmisc')
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
sapply(list.of.packages, require, character.only = TRUE)
## caret ggplot2 gsubfn FNN kknn fBasics
## TRUE TRUE TRUE TRUE TRUE TRUE
## mvtnorm dplyr mice fastDummies glmnet MLmetrics
## TRUE TRUE TRUE TRUE TRUE TRUE
## ROCR tidyr tidyverse magrittr Hmisc
## TRUE TRUE TRUE TRUE TRUE
####Load the datasets ####
data_train<-read.csv('https://github.com/AndresMtnezGlez/heptaomicron/raw/main/train_prob_death.csv') # this is a relative path from current folder to the folder where you have the dataset
data_test<-read.csv('https://github.com/AndresMtnezGlez/heptaomicron/raw/main/test_prob_death.csv')
#head(data_train) #Peak the dataset first rows
#head(data_test) #Peak the dataset first rows
#str(data_train) #Check the structure of the variables
User defined functions(getmode() and plot_miss())
set.seed(100)
getmode <- function(v){
v=v[nchar(as.character(v))>0]
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
plot_miss <- function(data_train) {
df <- as.data.frame(colSums(is.na(data_train))/nrow(data_train))
df <- data.frame(row.names(df), df, row.names = NULL)
ggplot(df, aes(x=row.names.df., y=colSums.is.na.data_train...nrow.data_train.)) +
geom_bar(stat = "identity",fill="#4199cb" )+
ggtitle("Share of missing values per category")+
coord_flip()
}
The dataset and its variables is already well known from previous assignments. Therefore we will focus our efforts in creating different methods of prediction instead of a profound exploration of the dataset.
One problem that we can easily solve at first stance is to change the structure of the dates from ‘character’ to actual ‘dates’.
data_train$DOB <- as.Date(data_train$DOB)
data_train$ADMITTIME <- as.Date(data_train$ADMITTIME)
str(data_train) #To check that it has worked
## 'data.frame': 13840 obs. of 39 variables:
## $ HOSPITAL_EXPIRE_FLAG: int 0 0 0 0 0 0 1 0 0 0 ...
## $ subject_id : int 55440 28424 86233 53787 99384 70776 68623 94431 76652 53782 ...
## $ hadm_id : int 195768 127337 184606 174772 168087 183743 179098 156573 103471 190648 ...
## $ icustay_id : int 228357 225281 237514 244413 298919 278855 207692 277837 231233 246339 ...
## $ HeartRate_Min : num 89 NA 62 84 74 94 56 52 59 72 ...
## $ HeartRate_Max : num 145 NA 100 109 98 109 77 127 91 100 ...
## $ HeartRate_Mean : num 121 NA 82.9 94.7 81.1 ...
## $ SysBP_Min : num 74 NA 62 81 84 128 83 92 105 81 ...
## $ SysBP_Max : num 127 NA 154 163 140 151 148 144 149 150 ...
## $ SysBP_Mean : num 107 NA 115 122 114 ...
## $ DiasBP_Min : num 42 NA 34 29 35 60 46 41 46 47 ...
## $ DiasBP_Max : num 90 NA 113 77 72 82 91 67 117 69 ...
## $ DiasBP_Mean : num 61.2 NA 57 47.9 54.3 ...
## $ MeanBP_Min : num 59 NA 48 49 31 78 59 59 60 52 ...
## $ MeanBP_Max : num 94 NA 122 87 81 109 98 97 124 93 ...
## $ MeanBP_Mean : num 74.5 NA 72.8 65.7 66.8 ...
## $ RespRate_Min : num 15 NA 11 15 17 11 16 9 9 10 ...
## $ RespRate_Max : num 30 NA 26 25 28 27 28 31 21 36 ...
## $ RespRate_Mean : num 22.3 NA 18.9 19.9 23.3 ...
## $ TempC_Min : num 35.1 NA 36.1 35.6 35.9 ...
## $ TempC_Max : num 36.9 NA 37.7 36.9 37.1 ...
## $ TempC_Mean : num 36.1 NA 36.9 36.2 36.7 ...
## $ SpO2_Min : num 90 NA 87 89 88 92 94 95 97 94 ...
## $ SpO2_Max : num 99 NA 100 100 99 97 100 100 100 100 ...
## $ SpO2_Mean : num 95.7 NA 96.9 92.9 94.6 ...
## $ Glucose_Min : num 111 97 116 233 85 142 137 78 97 62 ...
## $ Glucose_Max : num 230 137 183 484 161 173 164 185 125 164 ...
## $ Glucose_Mean : num 161 113 142 361 112 ...
## $ GENDER : chr "F" "F" "F" "F" ...
## $ DOB : Date, format: "1938-11-23" "1929-04-30" ...
## $ ADMITTIME : Date, format: "2008-06-15" "2008-09-12" ...
## $ ADMISSION_TYPE : chr "EMERGENCY" "EMERGENCY" "ELECTIVE" "EMERGENCY" ...
## $ INSURANCE : chr "Medicare" "Medicare" "Medicare" "Medicare" ...
## $ RELIGION : chr "PROTESTANT QUAKER" "JEWISH" "PROTESTANT QUAKER" "CATHOLIC" ...
## $ MARITAL_STATUS : chr "SINGLE" "WIDOWED" "MARRIED" "DIVORCED" ...
## $ ETHNICITY : chr "WHITE" "WHITE" "WHITE" "WHITE" ...
## $ DIAGNOSIS : chr "GASTROINTESTINAL BLEED" "ABDOMINAL PAIN" "LEFT LUNG CANCER/SDA" "ASTHMA;COPD EXACERBATION" ...
## $ ICD9_diagnosis : chr "5789" "56211" "1625" "49322" ...
## $ FIRST_CAREUNIT : chr "MICU" "TSICU" "SICU" "MICU" ...
The next step is to deal with missing variables. First we plot the share of missing to guage the problem. Second we group the numerical variables that encode features (meaning that we do not care of ordinal variables such as ID’s). Third we input missing values with the package MICE as Professor Verdu suggested in the Slack channel. The first step is to start the process imputing the unconditonal mean values, and the second process is to iterate multiple imputations. Finally we complete our dataframe with the imputations and plot again the share of missings to see if it has worked.
####Missing values####
plot_miss(data_train)
#Replace missing in categorical by most frecuent value
l_num_features <- c('HeartRate_Min','HeartRate_Max','HeartRate_Mean','SysBP_Min','SysBP_Max','SysBP_Mean','DiasBP_Min','DiasBP_Max',
'DiasBP_Mean','MeanBP_Min','MeanBP_Max','MeanBP_Mean','RespRate_Min','RespRate_Max','RespRate_Mean','TempC_Min',
'TempC_Max','TempC_Mean','SpO2_Min','SpO2_Max','SpO2_Mean','Glucose_Min','Glucose_Max','Glucose_Mean')
df_num_feautes <- data_train[,l_num_features]
init <- mice(df_num_feautes, meth="mean", maxit=0)
imputation <- mice(df_num_feautes, method=init$method, maxit=5, m = 3)
##
## iter imp variable
## 1 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 1 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 1 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 2 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 2 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 2 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 3 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 3 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 3 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 4 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 4 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 4 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 5 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 5 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 5 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
data_train[,l_num_features]<-complete(imputation, 2)
plot_miss(data_train)
To simplify the exercise, as well as we previously did, we create an ‘AGE’ variable by substracting the ADMITIME to DOB.
#Getting only the "year" from the dates and calculating the age
data_train$DOB <- format(as.Date(data_train$DOB, format="%Y/%m/%d"),"%Y")
data_train$ADMITTIME <- format(as.Date(data_train$ADMITTIME, format="%Y/%m/%d"),"%Y")
data_train$DOB<- as.numeric(as.character(data_train$DOB))
data_train$ADMITTIME<- as.numeric(as.character(data_train$ADMITTIME))
data_train %<>% mutate(AGE = DOB - ADMITTIME)
data_train %<>% select(-DOB, -ADMITTIME)
In previous corrections we have been told that cummulative dummify our categorical variables might misslead the model because it can mean that the more the better or teh other way around when in reality the cateogories do not embded this information. As to dummy a category with too many sub categories leads to too many variables and lost of tractability we will dummify only the two variables we think might impact in someones health: Gender and the First Care Unit (meaning, there should be Units where the more seve cases go and other for more normals)
#Dummify cateogorical variables of interest for probability of death
dta <- fastDummies::dummy_cols(data_train, select_columns =c("GENDER","FIRST_CAREUNIT" ))
dta %<>% select(-subject_id, -hadm_id,-icustay_id, -GENDER,-ADMISSION_TYPE,
-INSURANCE, -RELIGION, -MARITAL_STATUS, -ETHNICITY, -DIAGNOSIS,
-ICD9_diagnosis, -FIRST_CAREUNIT)
An issue we have encountered several times in the course is the risk of missleading information because of lack of standarization of the numerical features. As a clean solution we will normalize with the function scale() that normalizes the variables to zero mean and unitary standard deviation.
#Standarize numerical variables (all but dummies and exitus flag)
l_num_var = names(dta)
l_no_num = c('HOSPITAL_EXPIRE_FLAG','AGE',
"GENDER_F" ,"GENDER_M" ,"FIRST_CAREUNIT_CCU",
"FIRST_CAREUNIT_CSRU" ,"FIRST_CAREUNIT_MICU" ,"FIRST_CAREUNIT_SICU",
"FIRST_CAREUNIT_TSICU" )
l_num_var <- l_num_var[!(l_num_var %in% l_no_num)]
dta %<>% mutate_at(l_num_var, funs(c(scale(.))))
## Warning: `funs()` is deprecated as of dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
To quickly see if our normalization has worked we can plot a probability density function of two variables such as “TemC_Min” and “Glucose_Min”. If there where no normalization they should have very different first and second moment. As we can see in te plot below it has worked, they are standardized to follow a (0,1)
#Normalization worked
aux <- select(dta,TempC_Min, Glucose_Min )#Let's sample two variables to see if
df_tidy <- gather(aux, cols, value)
ggplot(df_tidy, aes(x = value)) +
geom_density(aes(color=cols))+
ggtitle("Density plot of normalize variables")
# 5. Prepare the data for the models Once we have deal with missing, dummify and normalize our data, it is time to get it ready for the modelling phase. We have been given a test data set. Nevertheless to check how our predictions do or do not overfit we will split our training dataset in two parts.
Below we first divide the dataframe between dependent variable and inedependent variables. Later we factorize our dependent variable
X <- select(dta,-'HOSPITAL_EXPIRE_FLAG')
y<- select(dta,'HOSPITAL_EXPIRE_FLAG')
data <- cbind(X,y)
data$HOSPITAL_EXPIRE_FLAG<-as.character(data$HOSPITAL_EXPIRE_FLAG)
data$HOSPITAL_EXPIRE_FLAG[data$HOSPITAL_EXPIRE_FLAG=='1']<-'D' #1 now becomes "D" from Death
data$HOSPITAL_EXPIRE_FLAG[data$HOSPITAL_EXPIRE_FLAG!='D']<-'S' #0 now becomes "S" from Survival
data$HOSPITAL_EXPIRE_FLAG<-as.factor(data$HOSPITAL_EXPIRE_FLAG)
Now we split the data into two groups by randomnly sampling with the sample function. It is important to note that the data frame “datatest” is just a split of the original data_train and is not the the for test pourposes data frame that we load at the beginning which is called “data_test”
sample_train <- sample(seq_len(nrow(data_train)), size = round(.95 * nrow(data_train)))
datatrain<-data[sample_train,]
Xtrain<-X[sample_train,]
ytrain<-y[sample_train,]
datatest<-data[-sample_train,]
Xtest<-X[-sample_train,]
ytest<-y[-sample_train,]
Finally, we are going to calculate the weights of the exitus (death) and survival (0) result in both, training and test split. The reason is that it is going to be needed as argument for our regressions.
fraction_0 <- 1.5*rep(1 - sum(y == '0') / nrow(y), sum(y == '0'))
fraction_1 <- rep(1 - sum(y == '1') / nrow(y), sum(y == '1'))
weights <- numeric(nrow(y))
weighted <-TRUE
if (weighted == TRUE) {
weights[y == '0'] <- fraction_0
weights[y == '1'] <- fraction_1
} else {
weights <- rep(1, nrow(y))
}
weights_train <- weights[sample_train]
weights_test <- weights[-sample_train]
To see how many individuals survive in our sample we can make use of the geom_count() function to plot the dummy variable whose size is realtive to the magnitude.
ggplot(y, aes(y=HOSPITAL_EXPIRE_FLAG, x="", colour = "#4199cb")) + geom_count() +
ggtitle("Size of individuals with reasult death vs survivals") +
xlab("0 means survival, 1 means death")
# 6. Modelling strategy We will try out two penalized logistic regressions. The virtue of these regression is that they imposes a penalty to the logistic model for having too many variables. This results in shrinking the coefficients of the less contributive variables toward zero. This is also known as regularization.
In the ridge regression variables with minor contribution have their coefficients close to zero. However, all the variables are incorporated in the model. This is useful when all variables need to be incorporated in the model according to domain knowledge.
The plot be displays the cross-validation error according to the log of lambda. The left dashed vertical line indicates that the log of the optimal value of lambda is approximately 0.1, which is the one that minimizes the prediction error. This lambda value will give the most accurate model.
Ridge_fit=cv.glmnet(as.matrix(Xtrain),y=factor(ytrain),type.measure = 'auc',family='binomial',alpha=0,weights=weights_train)
plot(Ridge_fit)
This lambda value will give the most accurate model. The exact value of lambda can be viewed as follow:
Ridge_lambda = Ridge_fit$lambda.min
Ridge_fit$lambda.min
## [1] 0.0125702
summary(Ridge_fit)
## Length Class Mode
## lambda 100 -none- numeric
## cvm 100 -none- numeric
## cvsd 100 -none- numeric
## cvup 100 -none- numeric
## cvlo 100 -none- numeric
## nzero 100 -none- numeric
## call 7 -none- call
## name 1 -none- character
## glmnet.fit 13 lognet list
## lambda.min 1 -none- numeric
## lambda.1se 1 -none- numeric
## index 2 -none- numeric
Finally, we have to obtain the AUC. AUC - ROC curve is a performance measurement for the classification problems at various threshold settings. ROC is a probability curve and AUC represents the degree or measure of separability. It tells how much the model is capable of distinguishing between classes. Higher the AUC, the better the model is at predicting 0s as 0s and 1s as 1s.
We have obtained an AUC of 0.79 which is a huge improvement from the “flip coin” previous stituation when answering the quesiton if the patient in the ICU will survive or not with the data obtained in the first minutes.
preds = predict(Ridge_fit,newx = as.matrix(Xtest), type="response",s='lambda.min')
y_pred_acc<-as.factor(as.numeric(preds>0.5))# Compute different Evaluation metrics
ytest<-as.factor(ytest)
# Prin and plot the AUC - ROC
pred <- prediction(preds,as.integer(ytest))
auc<-performance(pred,'auc')@y.values[[1]]
print(paste0('The AUC with Ridge Regression is:: ',auc))
## [1] "The AUC with Ridge Regression is:: 0.794189171893"
y_bin<-as.integer(ytest=='1')
auc<-performance(pred,'auc')@y.values[[1]]
perf <- performance(pred,"tpr","fpr")
plot(perf)
## 6.2 Lasso Regression
Now we jump to a Lasso regression. In the Lasso regression the coefficients of some less contributive variables are forced to be exactly zero. Only the most significant variables are kept in the final model.
As before, the plot be displays the cross-validation error according to the log of lambda. log of the optimal value of lambda is approximately -3.9, which is the one that minimizes the prediction error.
Lasso_fit=cv.glmnet(as.matrix(Xtrain),y=factor(ytrain),family='binomial',alpha=1,type.measure = 'auc',weights = weights_train)
plot(Lasso_fit)
The Lasso regression performs very well too. However, its AUC is a little bit lower than the AUC that we get in the Ridge Regression. So the Ridge Regression is going to be the model from which we withdraw our predictions.
preds = predict(Lasso_fit,newx = as.matrix(Xtest), type="response",s='lambda.min')
y_pred_acc<-as.factor(as.numeric(preds>0.5))# Compute different Evaluation metrics
ytest<-as.factor(ytest)
y_bin<-as.integer(ytest=='1')
auc<-performance(pred,'auc')@y.values[[1]]
print(paste0('The AUC with Lasso Regression is:: ',auc))
## [1] "The AUC with Lasso Regression is:: 0.794189171893"
pred <- prediction(preds,y_bin)
perf <- performance(pred,"tpr","fpr")
plot(perf)
# 7. Predictions
Now that we have our model (the Ridge regression) proven in our training dataset we have to applied to our datatest doing the same changes
data_test$DOB <- as.Date(data_test$DOB)
data_test$ADMITTIME <- as.Date(data_test$ADMITTIME)
l_num_features <- c('HeartRate_Min','HeartRate_Max','HeartRate_Mean','SysBP_Min','SysBP_Max','SysBP_Mean','DiasBP_Min','DiasBP_Max',
'DiasBP_Mean','MeanBP_Min','MeanBP_Max','MeanBP_Mean','RespRate_Min','RespRate_Max','RespRate_Mean','TempC_Min',
'TempC_Max','TempC_Mean','SpO2_Min','SpO2_Max','SpO2_Mean','Glucose_Min','Glucose_Max','Glucose_Mean')
df_num_feautes <- data_test[,l_num_features]
init <- mice(df_num_feautes, meth="mean", maxit=0)
## Warning: Number of logged events: 1
imputation <- mice(df_num_feautes, method=init$method, maxit=5, m = 3)
##
## iter imp variable
## 1 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 1 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 1 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 2 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 2 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 2 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 3 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 3 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 3 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 4 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 4 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 4 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 5 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 5 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 5 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## Warning: Number of logged events: 1
data_test[,l_num_features]<-complete(imputation, 2)
plot_miss(data_test)
For some reason, the MICE function has not worked with the column Gloucose Mean. We will directly tackle the issue replacing by column mean.
data_test$Glucose_Mean[is.na(data_test$Glucose_Mean)]<-mean(data_test$Glucose_Mean,na.rm=TRUE)
plot_miss(data_test)
Again, we calculate the age
#Getting only the "year" from the dates and calculating the age
data_test$DOB <- format(as.Date(data_test$DOB, format="%Y/%m/%d"),"%Y")
data_test$ADMITTIME <- format(as.Date(data_test$ADMITTIME, format="%Y/%m/%d"),"%Y")
data_test$DOB<- as.numeric(as.character(data_test$DOB))
data_test$ADMITTIME<- as.numeric(as.character(data_test$ADMITTIME))
data_test %<>% mutate(AGE = DOB - ADMITTIME)
data_test %<>% select(-DOB, -ADMITTIME)
And again we dumify the catogical variables
#Dummify cateogorical variables of interest for probability of death
dta <- fastDummies::dummy_cols(data_test, select_columns =c("GENDER","FIRST_CAREUNIT" ))
dta_test_ic <- select(dta, icustay_id)
dta %<>% select(-subject_id, -hadm_id,-icustay_id, -GENDER,-ADMISSION_TYPE,
-INSURANCE, -RELIGION, -MARITAL_STATUS, -ETHNICITY, -DIAGNOSIS,
-ICD9_diagnosis, -FIRST_CAREUNIT)
Again, we standarize the numerical variables
#Standarize numerical variables (all but dummies and exitus flag)
l_num_var = names(dta)
l_no_num = c('HOSPITAL_EXPIRE_FLAG','AGE',
"GENDER_F" ,"GENDER_M" ,"FIRST_CAREUNIT_CCU",
"FIRST_CAREUNIT_CSRU" ,"FIRST_CAREUNIT_MICU" ,"FIRST_CAREUNIT_SICU",
"FIRST_CAREUNIT_TSICU" )
l_num_var <- l_num_var[!(l_num_var %in% l_no_num)]
dta %<>% mutate_at(l_num_var, funs(c(scale(.))))
Load the dta as dta_test
dta_test <- dta
Now, we make predictions
preds_test = predict(Ridge_fit,newx = as.matrix(dta_test), type="response",s=Ridge_lambda)
y_preds_test<-as.factor(as.numeric(preds_test>=0.5))# Compute different Evaluation metrics
df_y_preds_test <- as.data.frame(y_preds_test)
And finally merge it as the mock solution to submission to kaggle
str(dta_test_ic)
## 'data.frame': 12065 obs. of 1 variable:
## $ icustay_id: int 221004 296315 245557 287519 231164 280367 258379 255605 211153 254678 ...
str(df_y_preds_test)
## 'data.frame': 12065 obs. of 1 variable:
## $ y_preds_test: Factor w/ 2 levels "0","1": 1 1 2 1 1 2 1 2 1 1 ...
names(dta_test_ic) <- "icustay_id"
names(df_y_preds_test) <- "HOSPITAL_EXPIRE_FLAG"
result_prob_death_AMG <- cbind(dta_test_ic,df_y_preds_test)
write.csv(result_prob_death_AMG,"result_prob_death_AMG.csv", row.names = FALSE)
The score that I have received in kaggle with this iteration is 0.68423. .