In this project, you have to predict the probability of death of a patient that is entering an ICU (Intensive Care Unit), programming in R.
To load all the libraries we use a snippet found on Stack Exchange that installs any missing package and then attaches them all. In addition, as in previous assignments, to avoid problems and to be able to run this code from any machine, we load the data using links to our own GitHub repository. Finally, we define some helper functions. As in previous assignments, if something is done twice, it is better to wrap it up as a function to avoid repetition.
#Load the libraries
list.of.packages <- c('caret', 'ggplot2','gsubfn' ,'FNN','kknn','fBasics' , 'mvtnorm' ,
'dplyr', 'mice' , 'fastDummies','glmnet', 'MLmetrics','ROCR','tidyr',
'tidyverse', 'magrittr', 'Hmisc')
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
sapply(list.of.packages, require, character.only = TRUE)
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: FNN
## Loading required package: kknn
##
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
##
## contr.dummy
## Loading required package: fBasics
## Loading required package: timeDate
## Loading required package: timeSeries
## Loading required package: mvtnorm
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:timeSeries':
##
## filter, lag
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: mice
##
## Attaching package: 'mice'
## The following object is masked from 'package:timeSeries':
##
## filter
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
## Loading required package: fastDummies
## Loading required package: glmnet
## Loading required package: Matrix
## Loaded glmnet 4.1
## Loading required package: MLmetrics
##
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
## Loading required package: ROCR
## Loading required package: tidyr
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
## Loading required package: tidyverse
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.4 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks Matrix::expand()
## x mice::filter() masks dplyr::filter(), timeSeries::filter(), stats::filter()
## x dplyr::lag() masks timeSeries::lag(), stats::lag()
## x purrr::lift() masks caret::lift()
## x tidyr::pack() masks Matrix::pack()
## x tidyr::unpack() masks Matrix::unpack()
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
## Loading required package: Hmisc
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## caret ggplot2 gsubfn FNN kknn fBasics
## TRUE TRUE TRUE TRUE TRUE TRUE
## mvtnorm dplyr mice fastDummies glmnet MLmetrics
## TRUE TRUE TRUE TRUE TRUE TRUE
## ROCR tidyr tidyverse magrittr Hmisc
## TRUE TRUE TRUE TRUE TRUE
Now we load the datasets
# Training set, test set and the per-admission extra diagnoses are read
# straight from the GitHub repository; read.csv() accepts a URL as `file`.
data <- read.csv(
  file = "https://github.com/AndresMtnezGlez/heptaomicron/raw/main/train_prob_death.csv"
)
data_test <- read.csv(
  file = "https://github.com/AndresMtnezGlez/heptaomicron/raw/main/test_prob_death.csv"
)
extra_d <- read.csv(
  file = "https://github.com/AndresMtnezGlez/heptaomicron/raw/main/extra_diagnoses_R.csv"
)
Now we define the helper functions
# Compute a patient's age, in calendar years, at admission.
#
# Args:
#   DOB:       date of birth; a Date, or a character vector in
#              "YYYY-MM-DD" or "YYYY/MM/DD" form.
#   ADMITTIME: admission date, same accepted forms as DOB.
# Returns:
#   Numeric vector: admission year minus birth year. The day and month are
#   ignored, so this is a year difference, not an exact age.
age_calc <- function(DOB, ADMITTIME) {
  # as.Date() without an explicit format accepts both "-" and "/" separated
  # dates; the previous fixed "%Y/%m/%d" format turned ISO dates into NA.
  year_of <- function(x) as.numeric(format(as.Date(x), "%Y"))
  # Admission year minus birth year. The previous version subtracted the
  # other way round (negative ages) and returned an undefined `result`.
  AGE <- year_of(ADMITTIME) - year_of(DOB)
  AGE
}
# Most frequent value of a vector, ignoring empty strings.
# Ties are resolved in favour of the value that appears first.
getmode <- function(v) {
  non_empty <- v[nchar(as.character(v)) > 0]
  candidates <- unique(non_empty)
  occurrences <- tabulate(match(non_empty, candidates))
  candidates[which.max(occurrences)]
}
# Bar chart with the share of missing values in every column.
#
# Args:
#   data_train: a data frame (any dataset, not only the training one).
# Returns:
#   A ggplot object with one horizontal bar per column of `data_train`.
plot_miss <- function(data_train) {
  # Fraction of NA per column; same value as colSums(is.na(.)) / nrow(.).
  miss_share <- colMeans(is.na(data_train))
  # Explicit column names: the previous version relied on the names that
  # data.frame() auto-generates from the deparsed expressions, which is
  # fragile and leaks unreadable labels onto the axes.
  df <- data.frame(
    variable = names(miss_share),
    share = unname(miss_share)
  )
  ggplot(df, aes(x = variable, y = share)) +
    geom_bar(stat = "identity", fill = "#4199cb") +
    ggtitle("Share of missing values per category") +
    labs(x = "Variable", y = "Share of missing values") +
    coord_flip()
}
# Replace every NA in `x` with the mean of the observed (non-NA) values.
NA2mean <- function(x) {
  observed_mean <- mean(x, na.rm = TRUE)
  replace(x, is.na(x), observed_mean)
}
The next step is to deal with missing values. First, we plot the share of missing values to gauge the problem. Second, we group the numerical variables that encode features (that is, we leave out identifier variables such as IDs). Third, we impute the missing values with the MICE package, as Professor Verdu suggested in the Slack channel. The first step of the process is to initialise the imputation with the unconditional mean values, and the second step is to iterate multiple imputations. Finally, we complete our data frame with the imputations and plot the share of missing values again to see whether it has worked.
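Before that, we compute each patient's age as the year of admission minus the year of birth and plot its distribution for both the training and the test datasets; we then merge in the extra diagnoses: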
## Calculate AGE
# AGE is the admission year minus the birth year; the two source columns
# are dropped once AGE has been derived.
data %<>%
  mutate(
    DOB = as.numeric(format(as.Date(DOB), "%Y")),
    ADMITTIME = as.numeric(format(as.Date(ADMITTIME), "%Y")),
    AGE = ADMITTIME - DOB
  ) %>%
  select(-DOB, -ADMITTIME)
# Plot a histogram (density scale) of AGE in the training dataset
hist(
  data$AGE,
  freq = FALSE,
  col = "darkmagenta",
  xlim = c(0, 120),
  xlab = "Years",
  main = "Histogram of AGE"
)
# Same AGE derivation for the test dataset: admission year minus birth
# year, then drop the two source columns.
data_test %<>%
  mutate(
    DOB = as.numeric(format(as.Date(DOB), "%Y")),
    ADMITTIME = as.numeric(format(as.Date(ADMITTIME), "%Y")),
    AGE = ADMITTIME - DOB
  ) %>%
  select(-DOB, -ADMITTIME)
# Plot a histogram (density scale) of AGE in the test dataset
hist(
  data_test$AGE,
  freq = FALSE,
  col = "navyblue",
  xlim = c(0, 120),
  xlab = "Years",
  main = "Histogram of AGE in Test Dataset"
)
# Reshape the extra diagnoses from long format (one row per diagnosis) to
# wide format (one row per admission) and count the diagnoses per admission.
# The statements below depend on the column order produced by spread(), so
# their order must not be changed.
plot_miss(extra_d)
extra_d %<>% spread(SEQ_NUM, ICD9_CODE) # One column per SEQ_NUM value, filled with the ICD9_CODE
extra_d <- extra_d[-ncol(extra_d)] # Drop the last column; presumably the one spread() creates for rows whose SEQ_NUM is NA - TODO confirm
aux <- names(extra_d) # Store the current column names in an auxiliary vector
names(extra_d) <- c("subject_id", "hadm_id", paste0("dis_",aux[-c(1,2)] )) # Rename: two id columns, then dis_<SEQ_NUM>
extra_d$na_count <- apply(extra_d, 1, function(x) sum(is.na(x))) # Number of NA cells in each row
extra_d$dis_tot <- (ncol(extra_d)-3)-extra_d$na_count # Diagnoses per row: columns other than the two ids and na_count, minus the NA cells
summary(extra_d$dis_tot) # The mean no. of diseases is 11; the 1st and 3rd quartiles are 6 and 15
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 6.00 9.00 11.04 15.00 39.00
# Keep only the identifiers, the first 20 diagnosis slots and the diagnosis
# count. Columns are selected by name: the previous positional drop
# (-c(23:42)) only removed the right columns when spread() had produced
# exactly 39 diagnosis columns, and would silently drop the wrong ones
# otherwise.
keep_cols <- c("subject_id", "hadm_id", paste0("dis_", 1:20), "dis_tot")
extra_d <- extra_d[, names(extra_d) %in% keep_cols, drop = FALSE]
# Attach the diagnoses to the training data, keeping every training row.
data <- merge(x = extra_d, y = data, by = c("subject_id", "hadm_id"), all.y = TRUE)
# Strip stray whitespace from the keys so they compare equal in the merge.
data_test$subject_id <- trimws(data_test$subject_id)
extra_d$subject_id <- trimws(extra_d$subject_id)
# all.y = TRUE, as for the training data: an inner join would silently drop
# every test admission without a matching diagnosis record, and those rows
# would then be missing from the exported test set.
data_test <- merge(x = extra_d, y = data_test, by = c("subject_id", "hadm_id"), all.y = TRUE)
Next, we group the primary ICD-9 diagnosis codes into broad disease categories:
# Cluster for categorical variables
# By disease

# Map the first three characters of an ICD-9 code to a disease group.
#
# Args:
#   icd_3: character vector with the first three characters of an ICD-9 code.
# Returns:
#   Character vector of group labels. All-digit codes are assigned by numeric
#   range; anything else (e.g. codes starting with a letter) is 'Others';
#   NA stays NA.
# The previous nested ifelse() compared the character codes against numbers,
# which R evaluates as a string comparison; it produced these same groups
# for three-digit codes, but only by relying on string collation.
# NOTE(review): 'Poisoning' (780-799) and 'External' (800-999) do not seem
# to match the usual ICD-9 chapter names for those ranges. The labels are
# kept unchanged because the dummy-column names derived from them are
# referenced further down.
icd9_chapter <- function(icd_3) {
  chapter_start <- c(0, 140, 240, 280, 290, 320, 390, 460, 520,
                     580, 630, 680, 710, 740, 760, 780, 800)
  chapter_label <- c('Infectious', 'Neoplasms', 'Endocrine', 'Blood',
                     'Mental', 'Nervous_Sys', 'Circulatory_Sys',
                     'Respiratory_Sys', 'Digestive_Sys', 'Genitorinary_Sys',
                     'Childbirth', 'Skin', 'Muscoloskeletal', 'Congenital',
                     'Perinatal', 'Poisoning', 'External')
  is_numeric_code <- !is.na(icd_3) & grepl("^[0-9]+$", icd_3)
  code <- rep(NA_real_, length(icd_3))
  code[is_numeric_code] <- as.numeric(icd_3[is_numeric_code])
  chapter <- rep('Others', length(icd_3))
  in_range <- is_numeric_code & code <= 999
  # findInterval() returns the index of the last chapter start <= code.
  chapter[in_range] <- chapter_label[findInterval(code[in_range], chapter_start)]
  chapter[is.na(icd_3)] <- NA_character_
  chapter
}

data$ICD_3 <- substr(data$ICD9_diagnosis, 1, 3)
data_test$ICD_3 <- substr(data_test$ICD9_diagnosis, 1, 3)
data$Disease <- icd9_chapter(data$ICD_3)
data_test$Disease <- icd9_chapter(data_test$ICD_3)
We then plot the in-hospital mortality rate of each disease category:
# In-hospital mortality rate (%) per disease group.
# The rate is the share of rows with HOSPITAL_EXPIRE_FLAG == 1 within each
# group. The previous version merged separate death and survivor counts, so
# a group with no deaths got an NA rate (and was removed from the plot) and
# a group with no survivors was dropped by the merge.
death_share <- tapply(data$HOSPITAL_EXPIRE_FLAG == 1, data$Disease, mean, na.rm = TRUE)
aux <- data.frame(
  Disease = names(death_share),
  mortatility = 100 * as.numeric(death_share),
  row.names = NULL
)
aux <- aux[order(aux$mortatility), ]
# reorder() is what actually sorts the bars: ggplot orders a character axis
# alphabetically, regardless of the row order of the data frame.
ggplot(aux, aes(x = reorder(Disease, mortatility), y = mortatility)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(x = "Disease") +
  theme_minimal()
## Warning: Removed 2 rows containing missing values (position_stack).
## Fill the missing values
# Numerical features to impute: min / max / mean of each vital sign.
l_num_features <- c(
  "HeartRate_Min", "HeartRate_Max", "HeartRate_Mean",
  "SysBP_Min", "SysBP_Max", "SysBP_Mean",
  "DiasBP_Min", "DiasBP_Max", "DiasBP_Mean",
  "MeanBP_Min", "MeanBP_Max", "MeanBP_Mean",
  "RespRate_Min", "RespRate_Max", "RespRate_Mean",
  "TempC_Min", "TempC_Max", "TempC_Mean",
  "SpO2_Min", "SpO2_Max", "SpO2_Mean",
  "Glucose_Min", "Glucose_Max", "Glucose_Mean"
)
df_num_feautes <- data[, l_num_features]
# Dry run (maxit = 0) to obtain the per-column method vector.
init <- mice(df_num_feautes, method = "mean", maxit = 0)
# NOTE(review): init$method presumably holds "mean" for every incomplete
# column, in which case the m = 3 imputations are the same mean fill - confirm.
imputation <- mice(df_num_feautes, method = init$method, m = 3, maxit = 5)
##
## iter imp variable
## 1 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 1 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 1 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 2 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 2 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 2 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 3 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 3 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 3 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 4 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 4 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 4 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 5 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 5 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
## 5 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max Glucose_Mean
# Write the second completed dataset back into the training data and check
# the share of missing values again.
data[, l_num_features] <- mice::complete(imputation, 2)
plot_miss(data)
# Same imputation set-up, now for the test dataset
df_num_feautes <- data_test[, l_num_features]
# Dry run (maxit = 0) to obtain the per-column method vector.
init <- mice(df_num_feautes, method = "mean", maxit = 0)
## Warning: Number of logged events: 1
imputation <- mice(df_num_feautes, method = init$method, m = 3, maxit = 5)
##
## iter imp variable
## 1 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 1 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 1 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 2 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 2 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 2 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 3 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 3 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 3 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 4 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 4 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 4 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 5 1 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 5 2 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## 5 3 HeartRate_Min HeartRate_Max HeartRate_Mean SysBP_Min SysBP_Max SysBP_Mean DiasBP_Min DiasBP_Max DiasBP_Mean MeanBP_Min MeanBP_Max MeanBP_Mean RespRate_Min RespRate_Max RespRate_Mean TempC_Min TempC_Max TempC_Mean SpO2_Min SpO2_Max SpO2_Mean Glucose_Min Glucose_Max
## Warning: Number of logged events: 1
# Write the second completed dataset back into the test data and check the
# share of missing values again.
data_test[, l_num_features] <- mice::complete(imputation, 2)
plot_miss(data_test)
# One-hot encode Disease and GENDER.
# The encoder is fitted once, on the training data, and then applied to
# both datasets, so train and test always get the same dummy columns in the
# same order. Previously a second encoder was fitted on the test data, so a
# category present in only one dataset produced mismatched columns.
# (The earlier RELIGION factor conversion was dropped: RELIGION is not in
# the encoding formula, so it had no effect.)
new_data <- data
df <- data
new_data$Disease <- factor(new_data$Disease, exclude = NULL)
dv <- caret::dummyVars(" ~ Disease + GENDER", data = new_data)
new_data <- data.frame(predict(dv, newdata = df))
data <- cbind(df, new_data)
# Test dataset, encoded with the training levels.
# NOTE(review): a Disease or GENDER value never seen in training presumably
# yields NA dummies for that row - confirm against the test data.
df <- data_test
new_data <- data.frame(predict(dv, newdata = df))
data_test <- cbind(df, new_data)
# Columns removed from both datasets: the raw diagnosis slots and their
# count, the two categorical variables that are now dummy-encoded (GENDER,
# Disease), and the remaining categorical / free-text columns.
# The list is written once so that train and test cannot drift apart.
cols_to_drop <- c(
  paste0("dis_", 1:20), "dis_tot",
  "GENDER", "ADMISSION_TYPE", "INSURANCE", "RELIGION", "MARITAL_STATUS",
  "ETHNICITY", "DIAGNOSIS", "ICD9_diagnosis", "FIRST_CAREUNIT", "Disease",
  "ICD_3"
)
# Fail loudly, as subset(select = -c(...)) did, if a column is missing.
stopifnot(
  all(cols_to_drop %in% names(data)),
  all(cols_to_drop %in% names(data_test))
)
data <- data[, !(names(data) %in% cols_to_drop), drop = FALSE]
data_test <- data_test[, !(names(data_test) %in% cols_to_drop), drop = FALSE]
# List the test-set columns that still contain missing values.
rows_with_na <- rowSums(is.na(data_test)) > 0
new_DF <- data_test[rows_with_na, ]
cols_with_na <- colSums(is.na(new_DF)) > 0
colnames(new_DF)[cols_with_na]
## [1] "Glucose_Mean"
# Glucose_Mean is still incomplete in the test set: fill it with its mean.
data_test$Glucose_Mean <- NA2mean(data_test$Glucose_Mean)
# Standardise (centre and scale) the numerical features of the training data.
l_num_var <- names(data)
# Columns that must not be scaled: identifiers, the target and the dummies.
l_no_num <- c("subject_id", "hadm_id",
              "HOSPITAL_EXPIRE_FLAG", "icustay_id",
              "Disease.Childbirth", "Disease.Circulatory_Sys",
              "Disease.Congenital", "Disease.Digestive_Sys",
              "Disease.Endocrine", "Disease.External",
              "Disease.Genitorinary_Sys", "Disease.Infectious",
              "Disease.Mental", "Disease.Muscoloskeletal",
              "Disease.Neoplasms", "Disease.Nervous_Sys",
              "Disease.Others", "Disease.Poisoning",
              "Disease.Respiratory_Sys", "Disease.Skin",
              "GENDERF", "GENDERM"
)
# The hand-written list omits some possible dummies (e.g. Disease.Blood and
# Disease.Perinatal), which would then be standardised as if they were
# numerical features. Exclude every dummy column by its name prefix as well.
l_no_num <- union(l_no_num, grep("^(Disease\\.|GENDER)", l_num_var, value = TRUE))
l_num_var <- l_num_var[!(l_num_var %in% l_no_num)]
# A formula lambda replaces the deprecated funs(); c() drops the matrix
# attributes that scale() attaches to its result.
data %<>% mutate_at(l_num_var, ~ c(scale(.)))
## Warning: `funs()` is deprecated as of dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Plot an example: densities of two scaled variables, to check visually
# that the normalisation worked
aux <- data %>% select(TempC_Min, Glucose_Min)
df_tidy <- aux %>% gather(key = "cols", value = "value")
ggplot(df_tidy, aes(x = value)) +
  geom_density(aes(color = cols)) +
  ggtitle("Density plot of normalize variables")
# Scaling with test data
# NOTE(review): the test set is standardised with its own mean and sd, not
# with the training statistics - confirm this is intended.
l_num_var <- names(data_test)
l_num_var <- l_num_var[!(l_num_var %in% l_no_num)]
# l_no_num may not list every dummy column (e.g. Disease.Blood), so also
# exclude the dummies by their name prefix; they must stay 0/1.
l_num_var <- l_num_var[!grepl("^(Disease\\.|GENDER)", l_num_var)]
# A formula lambda replaces the deprecated funs(); c() drops the matrix
# attributes that scale() attaches to its result.
data_test %<>% mutate_at(l_num_var, ~ c(scale(.)))
# Densities of two scaled variables, to check the normalisation visually
aux <- select(data_test, TempC_Min, Glucose_Min)
df_tidy <- gather(aux, cols, value)
ggplot(df_tidy, aes(x = value)) +
  geom_density(aes(color = cols)) +
  ggtitle("Density plot of normalize variables")
# Keep the ICU-stay identifiers of the test rows in their own data frame.
# NOTE(review): the column of data_id is presumably named after the
# expression `data_test$icustay_id`, which becomes the CSV header - confirm
# that downstream readers expect it.
data_id <- as.data.frame(data_test$icustay_id)
# Drop the identifier columns from both datasets before exporting them.
data <- select(data, -subject_id, -hadm_id, -icustay_id)
data_test <- select(data_test, -subject_id, -hadm_id, -icustay_id)
# Export the processed training set, the test identifiers and the test set.
write.csv(data, file = "data_train_19_Feb.csv", row.names = FALSE)
write.csv(data_id, file = "ID_data_test_19_Feb.csv", row.names = FALSE)
write.csv(data_test, file = "data_test_19_Feb.csv", row.names = FALSE)