In the First Section I load libraries and import the data

# Initilization
#cat("\014") # Clear the console
rm(list=ls())
graphics.off()

# If you can not load Biocomb package, first remove Java based on the removal software from java.com
# install the java based on your R version (64 or 32 bit)
# set the system environment based by running either below codes:
# Sys.setenv(JAVA_HOME='C:\\Program Files\\Java\\jre1.8.0_191') # for 64-bit version
# Sys.setenv(JAVA_HOME='C:\\Program Files (x86)\\Java\\jre1.8.0_191') # for 32-bit version

require(pacman)
pacman::p_load(dplyr,caret,foreign,
       lubridate,dataPreparation,httr, DT,stringr,AUC,parallel, testit,caretEnsemble,
       C50,
       randomForest,
       kernlab,
       e1071,DT,Biocomb,Boruta)
# Biocomb is for faster filter based feature selection

source("https://raw.githubusercontent.com/hamidahady/transplant/Hamid/isotonic_paper_functions.R")

############################ Data Gathering #########################
# first I download the data from github and then merge them into one large file named heart.df
{
  # file1<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_1_9000L.rds?raw=true"))
  # file2<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_9001_18000L.rds?raw=true"))
  # file3<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_18001_27000L.rds?raw=true"))
  # file4<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_27001_36000L.rds?raw=true"))
  # file5<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_36001_45000L.rds?raw=true"))
  # file6<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_45001_54000L.rds?raw=true"))
  # file7<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_54001_63000L.rds?raw=true"))
  # file8<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_63001_72000L.rds?raw=true"))
  # file9<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_72001_81000L.rds?raw=true"))
  # file10<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_81001_90000L.rds?raw=true"))
  # file11<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_90001_99000L.rds?raw=true"))
  # file12<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_99001_108000L.rds?raw=true"))
  # file13<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_108001_117000L.rds?raw=true"))
  # file14<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_117001_126000L.rds?raw=true"))
  # file15<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_126001_135000L.rds?raw=true"))
  # file16<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_135001_144000L.rds?raw=true"))
  # file17<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_144001_153000L.rds?raw=true"))
  # file18<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_153001_159318L.rds?raw=true"))
  
  file1<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_1_9000L.rds"))
  file2<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_9001_18000L.rds"))
  file3<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_18001_27000L.rds"))
  file4<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_27001_36000L.rds"))
  file5<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_36001_45000L.rds"))
  file6<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_45001_54000L.rds"))
  file7<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_54001_63000L.rds"))
  file8<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_63001_72000L.rds"))
  file9<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_72001_81000L.rds"))
  file10<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_81001_90000L.rds"))
  file11<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_90001_99000L.rds"))
  file12<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_99001_108000L.rds"))
  file13<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_108001_117000L.rds"))
  file14<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_117001_126000L.rds"))
  file15<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_126001_135000L.rds"))
  file16<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_135001_144000L.rds"))
  file17<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_144001_153000L.rds"))
  file18<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_153001_159318L.rds"))
  
  
  heart.df<-rbind(file1,file2,file3,file4,file5,file6,file7,file8,file9,file10,file11,file12,file13,file14,file15,file16,
                  file17,file18)
  heart.df$ID<-row.names(heart.df)
  
  rm(list = c("file1","file2","file3","file4","file5","file6","file7","file8","file9","file10","file11","file12","file13",
              "file14","file15","file16","file17","file18"))
}
#####end of importing the data

# heart.form contains the variable definitions of each variable in data
heart.form <- read.csv("https://raw.githubusercontent.com/hamidahady/transplant/Hamid/data/var_desc.csv")  

In the next Section, we drop some variables and observations, based on these criterias:

  • variables associated to before the surgery data
  • variables that did not end & those added before 2000
  • variables that are not interesting/relevant for survival analysis (e.g. follow up number, Donor ID, Dates, and …)
  • patients who are older than 18 years old
  • patients that are associated extreme 0.1 lightest people
  • patients that are associated extreme 0.1 shortest people
  • patient who did not have info. of transplanted organ
  • patients who has lung transplant, too.
# Identifying the variables that are discarded for further analysis
vars_discarded <- heart.form %>% # We created a column a colum INTERPRETATION_TYPE  
  subset(INTERPRETATION_TYPE=="D", select=c(1,2))
heart.discard <- vars_discarded$VARIABLE.NAME %>% as.character()

# Identifying the variables that did not end & those added before 2000
var.end.dates <- trimws(heart.form$VAR.END.DATE) %>% str_trim()
if.var.did.not.end <- which(var.end.dates=="")
names.of.var.did.not.end <- heart.form[if.var.did.not.end, 1] 

names.of.var.did.not.end <- c(as.character(names.of.var.did.not.end),"ID")

# Identifying the variables that ended
vars_ended <-heart.form[which(heart.form$VARIABLE.NAME %in% names(heart.df)[!(names(heart.df) %in% names.of.var.did.not.end)]),(1:2)]


vars.added.dates <- heart.form$VAR.START.DATE %>% as.character.Date()
vars.added.dates[which(vars.added.dates=="01-Oct-87, 01-Oct-90")] <- "01-Oct-90"
vars.added.years<- sapply(vars.added.dates, function(x) str_extract_all(x,"[0-9]{1,2}")[[1]][2]) %>% as.integer()
heart.form$YR_ADDED <- vars.added.years
vars.added.before.2000 <- subset(heart.form,YR_ADDED>=87,
                                 select = c(1))  # available variables added before 2000
vars.added.NA <- subset(heart.form,is.na(YR_ADDED),select = c(1))
vars.added.all <- rbind(vars.added.before.2000,vars.added.NA)
vars.added.all <- vars.added.all[["VARIABLE.NAME"]] %>% 
  as.character()

vars.added.all<- c(as.character(vars.added.all),"ID")

# Subsetting the data nd getting rid of variables that are ended and extreme patients or patients that are less than 18
heart.df.cleaned <- subset(heart.df, WL_ORG=="HR") %>% # Heart 
  subset(AGE>=18) %>% # Adults only
  # we excluded too light or too short people
  subset(WGT_KG_DON_CALC >= quantile(WGT_KG_DON_CALC, 0.0001, na.rm = TRUE)) %>% 
  subset(WGT_KG_TCR >= quantile(WGT_KG_TCR, 0.0001, na.rm = TRUE)) %>%
  subset(HGT_CM_DON_CALC >= quantile(HGT_CM_DON_CALC, 0.0001, na.rm = TRUE)) %>% 
  subset(HGT_CM_TCR >= quantile(HGT_CM_TCR, 0.0001, na.rm = TRUE)) %>% 
  subset(select=intersect(names.of.var.did.not.end,vars.added.all))


#Identifying and removing the variables that are post transplant
# Based on observations from the Form and Form Section Descriptors
vars.post.trans.index1 <-  sapply(heart.form$FORM.SECTION,
                                  function(x) str_detect(x,
                                                         "POST TRANSPLANT CLINICAL INFORMATION"))
vars.post.trans.index2 <-  sapply(heart.form$FORM,
                                  function(x) str_detect(x,
                                                         "TRF/TRR|TRR/TRF-CALCULATED|TRR/TRF|TRF"))
vars_post<- heart.form[as.logical(vars.post.trans.index1+vars.post.trans.index2),][,1] %>%
  as.character()

# Identifying the post transplant variables
vars_posttrans<-heart.form[which(heart.form$VARIABLE.NAME %in% vars_post),(1:2)]

vars.post.trans <- intersect(colnames(heart.df.cleaned),
                             vars_post)
heart.df.cleaned <- select(heart.df.cleaned,-vars.post.trans)
heart.discard <- intersect(heart.discard, colnames(heart.df.cleaned))
heart.df.cleaned <- select(heart.df.cleaned, -heart.discard) 



Here are the variables that in the initial check we dropped them for being not interesting/irrelavence

DT::datatable(vars_discarded)



Here are the variables that ended. Their adding date is checked, even if they dropped late (2015) but the added early so they are not intersting (e.g. PRAMR_CL2).

DT::datatable(vars_ended)



Here are the variables that are relevent to the post transplant info. So, we dropped them.

DT::datatable(vars_posttrans)



Here are a report about the remaining variables (discarded means irrelevant/not interesting variables) We lready got rid of discarded variables so their frequency is zero

rem_type<-as.data.frame(table(heart.form[which(heart.form$VARIABLE.NAME %in% names(heart.df.cleaned)), "INTERPRETATION_TYPE"]))
names(rem_type)<-c("Variable Type","Frequency")
rem_type$`Variable Type`<-c("Categorical","Initially Discarded","Date","Numerical")

org_type<-as.data.frame(table(heart.df$WL_ORG))
names(org_type)<-c("Organ Type","No. of Patients")
org_type$`Variable Type`<-c("UNKOWN","Heart & Lung","Heart","Lung")
DT::datatable(org_type)
DT::datatable(rem_type)
cat("Number of patients after dropping irrelevant patients: ",nrow(heart.df.cleaned))
## Number of patients after dropping irrelevant patients:  48442





inthe following section variables are recategorized, drived, and developed based on literature considering the pool of patients

# ref1: Medved, Dennis, et al. "Improving prediction of heart transplantation outcome using deep learning techniques." Scientific reports 8.1 (2018): 3613.
  # refer to a tool provided in the: http://ihtsa.cs.lth.se/ , which is product of this paper:
  # https://www.nature.com/articles/s41598-018-21417-7.pdf
  
  # In this paper they used these variables for recipients:
  # Diagnosis, Age, Gender, Height, Weight, insulin treated diabetes, infection within two weeks, Blood group, previous blood transfusion,
  # previously transplanted, previous cardiac surgery, intensive care unit, mechanical ventilation,
  # ECMO, ventiricular assist device, transplant era, SPP, PVR, creatinine, serum bilirubin,
  # use (mg/dl) instead of (mio mol/l), PRA>10%, HLA-DR 2 mismatch
  # Also, they used these variables for donor:
  # Age, Gender, Height, Weight, Duration of ischemia, Blood group, cause of death

  # we also used the following variables without any change:
  # Age / AGE, AGE_DON
  # Height / HGT_CM_CALC, HGT_CM_DON_CALC, HGT_CM_TCR, INIT_HGT_CM_CALC, 
  # Gender / GENDER, GENDER_DON
  # Weight / WGT_KG_DON_CALC, WGT_KG_TCR, WGT_KG_CALC, PERCENT_WGT_CHANGE (we made it)
  # infection within two weeks / INFECT_IV_DRUG_TRR
  # previous blood transfusion / TRANSFUSIONS
  # Previously transplanted / PREV_TX
  # Intensive care unit / ICU
  # ventiricular assist device / VAS
  # Mechanical ventilation / VENTILATOR_TRR
  # SPP (mmHG), systolic pulmonary pressure  / HEMO_SYS_TRR
  # Creatinine / CREAT_TRR
  # Serum bilirubin (μmol/l) / TBILI
  # Use (mg/dl) instead of (μmol/l) / TBILI
  # we used mg/dl
  # we did not work with PRAMR since it is added after 2004

# ref2: Dag, Ali, et al. "Predicting heart transplantation outcomes through data analytics." Decision Support Systems 94 (2017): 42-52.

# Subsetting the data and creating other variables based on literature or to capture time effects

heart.df.cleaned <- subset(heart.df.cleaned) %>%  
  #ref1
  mutate(PVR = (HEMO_PA_MN_TRR- HEMO_PCW_TRR)*79.72/HEMO_CO_TRR) %>% 
  #ref1
  mutate(ISCHTIME = ISCHTIME*60) %>% 
  #ref1
  mutate(ECMO = ifelse(ECMO_TCR + ECMO_TRR == 0, 0, 1)) %>% 
  # PVR, pulmonary vascular resistance / its calculation is based on the below mentioned links:
  # https://en.wikipedia.org/wiki/Vascular_resistance
  # http://www.scymed.com/en/smnxph/phkhr013.htm
  # https://radiopaedia.org/articles/mean-pulmonary-arterial-pressure  (calculation of Mean Pulmonary Arterial Pressure)
  # PVR= (Mean Pulmonary Arterial Pressure (mmHg) - Pulmonary Capillary Wedge Pressure (mmHg)) * 79.72 / Cardiac Output (L/min)
  # PVR = (HEMO_PA_MN_TRR - HEMO_PCW_TRR)* 79.72 / HEMO_CO_TRR

  # ECMO / merge of (ECMO_TCR, ECMO_TRR)

  # The following variables are mutated by the author
  mutate(BMI_CHNG = 100*(BMI_CALC- INIT_BMI_CALC)/INIT_BMI_CALC) %>%
  # mutate(WAITING_TIME = TX_DATE - INIT_DATE)  %>%  #no need, because it is already in there as "DAYSWAIT_CHRON"
  mutate(WGT_CHNG = 100*(WGT_KG_CALC - INIT_WGT_KG_CALC)/INIT_WGT_KG_CALC) %>%
  mutate(HGT_CHNG = 100*(HGT_CM_CALC - INIT_HGT_CM_CALC)/INIT_HGT_CM_CALC) %>%
  mutate(AGE_MAT = abs(AGE - AGE_DON)) %>%
  mutate(BMI_MAT = abs(BMI_CALC - BMI_DON_CALC))


 # DIAG was used in ref1, but it has same form with TCR_DGN and THORACIC_DGN so we recategorize all three together

  val_old<-c(1000,1001,1002,1003,1004,1005,1006,1049,1007,1200)
  val_new<-c("DILATED_MYOPATHY_IDI","DILATED_MYOPATHY_OTH","DILATED_MYOPATHY_OTH","DILATED_MYOPATHY_OTH",
             "DILATED_MYOPATHY_OTH","DILATED_MYOPATHY_OTH","DILATED_MYOPATHY_OTH","DILATED_MYOPATHY_OTH",
             "DILATED_MYOPATHY_ISC","CORONARY")
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="DIAG",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="TCR_DGN",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="THORACIC_DGN",val_old,val_new)


 #Due to discrepency, I drop these 2 variables especially natural cause for death is not same in both
 heart.df.cleaned[c("DEATH_CIRCUM_DON","DEATH_MECH_DON")]<-NULL

val_old<-c(1,2,3,4,999,"Unknown")
val_new<-c("ANOXIA","CEREBROVASCULAR_STROKE","HEAD_TRAUMA","OTHER","OTHER",NA)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="COD_CAD_DON",val_old,val_new)
  
val_old<-c("A","A1","A2","B","O","AB","A1B","A2B")
val_new<-c("A","A","A","B","O","AB","AB","AB")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="ABO",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="ABO_DON",val_old,val_new)

val_old<-c(1,2,3,4,5,998)
val_new<-c("no","one","two","other","other",NA)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="DIAB",val_old,val_new)

  # previous cardiac surgery /merge of (PRIOR_CARD_SURG_TCR, PRIOR_CARD_SURG_TRR)
  
  heart.df.cleaned$CARD_SURG<-NA
  for(i in 1:nrow(heart.df.cleaned)){
    if(!is.na(heart.df.cleaned$PRIOR_CARD_SURG_TCR[i])){
      if(heart.df.cleaned$PRIOR_CARD_SURG_TCR[i]=="Y"){heart.df.cleaned$CARD_SURG[i]<-"Y"}
      
      if(heart.df.cleaned$PRIOR_CARD_SURG_TCR[i]=="N"){
        
        if(!is.na(heart.df.cleaned$PRIOR_CARD_SURG_TRR[i])){
          
          if(heart.df.cleaned$PRIOR_CARD_SURG_TRR[i]=="N"){heart.df.cleaned$CARD_SURG[i]<-"N"}
        }
      }
    }
    if(!is.na(heart.df.cleaned$PRIOR_CARD_SURG_TRR[i])){
      if(heart.df.cleaned$PRIOR_CARD_SURG_TRR[i]=="Y"){heart.df.cleaned$CARD_SURG[i]<-"Y"}}
  }


# the following section is associated to antigen matching we have the matching parameter which means if donor-recepient 
# number of matched antigen coming from parent (0: no match, 1: one matched, 2: both matched)
# we drop the antigen location numbers (trivial) and just keep the matching parameters
heart.df.cleaned[c("DA1","DA2","RA1","RA2","DB1","DB2","RB1","RB2","RDR1","RDR2","DDR1","DDR2")]<-NULL 
#these are the matching parameters that we keep
# HLAMIS, AMIS, BMIS , DRMIS


# refrence for HLAMIS
# Weisdorf, Daniel, et al. "Classification of HLA-matching for retrospective analysis of unrelated donor transplantation: revised definitions # to predict survival." Biology of Blood and Marrow Transplantation 14.7 (2008): 748-758.

# refrences for AMIS, BMIS, DRMIS:
# Parham, Peter. The immune system. Garland Science, 2014.
val_old<-c(0,1,2,3,4,5,6)
val_new<-c("a","a","a","b","c","f","e")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HLAMIS",val_old,val_new)

# previously, I categorized the transplant year in different groups based on the Nature paper, however Dr. Tessa believes to leave it as it is.
  # # Transplant era / TX_YEAR:
  # # I focused on after 2000. however, I can do for the rest, together or 4 different modeling
  # # -1995
  # # 1996-2000
  # # 2001-2005
  # # 2006-
  # heart.df.cleaned$TX_YEAR_CAT<-heart.df.cleaned$TX_YEAR
  # 
  # # categoirizing the transplantation date
  # for(i in 1:nrow(heart.df.cleaned)){
  #   if(!is.na(heart.df.cleaned$TX_YEAR[i])){ 
  #     if(heart.df.cleaned$TX_YEAR[i]<=1995){heart.df.cleaned$TX_YEAR_CAT[i]<-"A"}
  #     else if(heart.df.cleaned$TX_YEAR[i]>1995 && heart.df.cleaned$TX_YEAR[i]<=2000){heart.df.cleaned$TX_YEAR_CAT[i]<-"B"}
  #     else if(heart.df.cleaned$TX_YEAR[i]>2000 && heart.df.cleaned$TX_YEAR[i]<=2005){heart.df.cleaned$TX_YEAR_CAT[i]<-"C"}
  #     else if(heart.df.cleaned$TX_YEAR[i]>2005){heart.df.cleaned$TX_YEAR_CAT[i]<-"D"}
  #   }
  # }
  # heart.df.cleaned$TX_YEAR<-NULL


 # the following block is our variable manipulation based on the other literature as specified in Ali Dag's paper
  
  heart.df.cleaned$ETH_MAT<-NA
  for(i in 1:nrow(heart.df.cleaned)){
    if(!is.na(heart.df.cleaned$ETHCAT[i])){
      if(!is.na(heart.df.cleaned$ETHCAT_DON[i])){
        if(heart.df.cleaned$ETHCAT_DON[i]==heart.df.cleaned$ETHCAT[i]){
          heart.df.cleaned$ETH_MAT[i]<-"Y"}else{heart.df.cleaned$ETH_MAT[i]<-"N"}
      }
    }
  }
  
  heart.df.cleaned$GENDER_MAT<-NA
  for(i in 1:nrow(heart.df.cleaned)){
    if(!is.na(heart.df.cleaned$GENDER[i])){
      if(!is.na(heart.df.cleaned$GENDER_DON[i])){
        if(heart.df.cleaned$GENDER[i]==heart.df.cleaned$GENDER_DON[i]){
          heart.df.cleaned$GENDER_MAT[i]<-"Y"}else{heart.df.cleaned$GENDER_MAT[i]<-"N"}
      }
    }
  }  
  
  # PROC_TY_HR, from literature
  val_old<-c(1,2)
  val_new<-c("Bicaval","Traditional")
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PROC_TY_HR",val_old,val_new)
  
  # SHARE_TY, ALLOCATION TYPE-LOCAL/REGIONAL/NATIONAL - 3=LOCAL/4=REGIONAL/5=NATIONAL/6=FOREIGN
  val_old<-c(3,4)
  val_new<-c("LOCAL","REGIONAL")
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="SHARE_TY",val_old,val_new)
  
  # EDUCATION
  val_old<-c(1,2,3,4,5,6,996,998)
  val_new<-c("a","a","b","c","d","d",NA,NA)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="EDUCATION",val_old,val_new)
  
  # ETHCAT, ethnicity of recepients
  val_old<-c(1,2,4,5,6,7,9,998)
  val_new<-c("w","b","h","o","o","o","o",NA)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="ETHCAT",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="ETHCAT_DON",val_old,val_new)
  # the following variables identifies if they are hispanic/latino or not
  # ETHNICITY
  
  # I dropped PRI_PAYMENT_CTRY_TRR and PRI_PAYMENT_CTRY_TRR because too many NAs
  heart.df.cleaned[c("PRI_PAYMENT_CTRY_TCR","PRI_PAYMENT_CTRY_TRR")]<-NULL


  val_old<-c(1,2,3,4,5,6,7,8,9,10,11,12,13,14)
  val_new<-c("pv","pbma","pbmcffs","pbmoth","pbmoth","pbmoth","pbmoth","other","other","other","other","other","other","other")
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PRI_PAYMENT_TCR",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PRI_PAYMENT_TRR",val_old,val_new)

  val_old<-c(1,2,3,4,5,6,7,8,9,10,11)
  val_new<-c("NE","NE","SE","SE","W","W","MW","MW","NE","MW","SE")
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="REGION",val_old,val_new)

  # here we recategrize FUNC_STAT_TCR, based on their activitiy level and hospitalization status
  # https://www.communitycarenc.org/media/tool-resource-files/what-does-it-take-qualify-personal-care-services-d.pdf
  # http://www.npcrc.org/files/news/karnofsky_performance_scale.pdf

  val_old<-c(1,2,3,996,998,2010,2020,2030,2040,2050,2060,2070,2080,2090,2100)
  val_new<-c("A","B","B",NA,NA,"C","C","C","C","D","D","D","E","E","E")
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="FUNC_STAT_TRR",val_old,val_new)
  # level OTHER in the next variable has low observations but it does not seem to be merged with other categories
  # I rather keep them than drop them because if this category will not important anyway and not selected as an important feature
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="FUNC_STAT_TCR",val_old,val_new)

In the following section variables are recategorized, drived, and developed based on studying their definitions and distributions:

  • Variables are recategorized after studying their values
  • If more than 90% of the observations for a variable is NA, that variable is dropped
  • IF more than 90% of the observations for a variable is in just one category, that variable is dropped
 #these variables are not useful because their (pattern of) distribution is not helpful
 # I drop maligancy (MALIG_TY,MALIG_TY_TCR) variables because the levels of categories are not distinguishable very well, also there is too many NAs
 #one way is divide it by larger categories like different cancers, but even after this the number of observations in the levels
 # will be too low.
 heart.df.cleaned[ c("WL_ID_CODE", "WL_ORG","INIT_DATE","TX_DATE","CTR_CODE","DATA_TRANSPLANT",
                     "DATA_WAITLIST","DISTANCE", "DON_RETYP","ECD_DONOR","END_OPO_CTR_CODE","HOME_STATE_DON",
                     "INIT_OPO_CTR_CODE", "INOTROP_VASO_CO_TRR","INOTROP_VASO_DIA_TRR","INOTROP_VASO_MN_TRR", 
                     "INOTROP_VASO_PCW_TRR","INOTROP_VASO_SYS_TCR","INOTROP_VASO_SYS_TRR","LISTING_CTR_CODE","LOS",
                     "MALIG_TY","MALIG_TY_TCR","OPO_CTR_CODE","ORGAN","OTH_LIFE_SUP_OSTXT_TCR","OTH_LIFE_SUP_OSTXT_TRR",
                     "PERM_STATE","PRIOR_CARD_SURG_TYPE_OSTXT_TCR","PRIOR_CARD_SURG_TYPE_OSTXT_TRR","PT_CODE",
                     "TRR_ID_CODE")]<-NULL

   # Here the NA equivalent characters would change to NA
  {
    NA_cells<-c(""," ","U")
    
    for(i in 1:length(NA_cells)){
      heart.df.cleaned[heart.df.cleaned == NA_cells[i]] <- NA
      gc()}
  }

 # here the categorical variables that are fine and we use them for feature selection
 # ABO_MAT (the last category just has 13 observations), ANTIHYPE_DON , BLOOD_INF_DON, CARDARREST_NEURO
 # CLIN_INFECT_DON, CONTIN_CIG_DON, CONTIN_COCAINE_DON, CONTIN_OTH_DRUG_DON , CRSMATCH_DONE, DDAVP_DON, DIABETES_DON
 # DIAL_AFTER_LIST , DIAL_PRIOR_TX, DIET_DON, GSTATUS, GTIME,HEPARIN_DON,HIST_CIG_DON,HIST_COCAINE_DON
 # HIST_HYPERTENS_DON, HIST_OTH_DRUG_DON, IABP_TCR ,IABP_TRR, IMPL_DEFIBRIL, INOTROPES_TCR ,INOTROPES_TRR, INOTROPIC  
 # check: https://en.wikipedia.org/wiki/Inotrope, LIFE_SUP_TRR, LIFE_SUP_TCR, LV_EJECT_METH, ,"MALIG", "MALIG_TCR", "MED_COND_TRR"
 # ONVENT,"OTH_LIFE_SUP_TCR","OTH_LIFE_SUP_TRR","OTHER_HYPERTENS_MED_DON", "OTHER_INF_DON","PRIOR_LUNG_SURG_TRR","PROTEIN_URINE",
 # "PT_DIURETICS_DON","PT_STEROIDS_DON","PT_T3_DON","PT_T4_DON","PULM_CATH_DON","PULM_INF_DON","STEROID","SUD_DEATH","TATTOOS","TRTREJ1Y" ,
 # "URINE_INF_CONF_DON","URINE_INF_DON","VASODIL_DON" ,"VENT_SUPPORT_AFTER_LIST","VENT_SUPPORT_TRR"
 # VENT_TIMEFRAME_TRR # 1: At time of transplant, 2:Within three months of transplant, 3: > three months prior to transplant
 # "VENTILATOR_TCR"
 
 #here the numerical variable the we use them for feature selection 
 # BUN_DON CREAT_DON 
 # patients number of days in different levels of ptiorities:
 # "DAYS_STAT1", "DAYS_STAT1A" , "DAYS_STAT1B"   ,  "DAYS_STAT2"
 # Slaughter, Mark S. "UNOS status of heart transplant patients supported with a left ventricular assist device: 
 # is it time to reconsider the status criteria?." Texas Heart Institute Journal 38.5 (2011): 549.
 # DAYSWAIT_CHRON, END_BMI_CALC, HEMO_CO_TCR,HEMO_PA_DIA_TCR,HEMO_PA_DIA_TRR,HEMO_PA_MN_TCR,HEMO_PCW_TCR,HEMO_SYS_TCR
 # LV_EJECT :https://www.uwhealth.org/health/topic/special/heart-failure-with-reduced-ejection-fraction-systolic-heart-failure/tx4090abc.html 
 # "PO2" ,"PO2_FIO2_DON", "SGOT_DON","SGPT_DON", "TBILI_DON", "TOT_SERUM_ALBUM"

 
  val_old<-c(1,2,3,4,5,6,7,998)
  val_new<-c("NO","BNOR","BAPS","BAOTH","BAOTH","BAOTH","BAOTH",NA)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="BRONCHO_LT_DON",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="BRONCHO_RT_DON",val_old,val_new)
  
  val_old<-c(0,1,2,3,4,5,998,999)
  val_new<-c(NA,NA,"NOR","AB","AB","ABboth",NA,NA)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="CHEST_XRAY_DON",val_old,val_new)
  
  val_old<-c("C","I","N","ND","P","U")
  val_new<-c(NA,NA,"Neg",NA,"POS",NA)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="CMV_DON",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="EBV_SEROSTATUS",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HBV_CORE",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HBV_CORE_DON",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HBV_SUR_ANTIGEN",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HCV_SEROSTATUS",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HEP_C_ANTI_DON",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HTLV2_OLD_DON",val_old,val_new)
  # the next one does not have SERStAT how ever after checking it's values I understand I can use same function as above 
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HIV_SEROSTATUS",val_old,val_new)
  
  val_old<-c(1,2,3)
  val_new<-c("NO", "YES","YES")
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="CORONARY_ANGIO",val_old,val_new)
  
  val_old<-c(1,2,3,4,5,998)
  val_new<-c("NO", "YES","YES","YES","YES",NA)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HIST_DIABETES_DON",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HYPERTENS_DUR_DON",val_old,val_new)
  
  # although 2099 has low frequeency I can not merge due to this paper:
  # Huang, Edmund, et al. "Incidence of conversion to active waitlist status among temporarily inactive obese renal transplant candidates." 
  # Transplantation 98.2 (2014): 177-186.
  val_old<-c(2010,2020,2030,2090,2999)
  val_new<-c("ONE", "ONE","TWO","ONE","OTHER")
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="END_STAT",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="INIT_STAT",val_old,val_new)
  
# LAST_INACT_REASON definition , I got it from: https://www.srtr.org/requesting-srtr-data/saf-data-dictionary/
# 1: Candidate cannot be contacted
# 2: Candidate choice
# 3: Candidate work-up incomplete
# 4: Insurance issues
# 5: Medical non-compliance
# 6: Inappropriate substance use
# 7: Temporarily too sick
# 8: Temporarily too well
# 9: Weight currently inappropriate for transplant
# 10: TX'ed - removal pending UNET data correction
# 11: Inactivation due to VAD implantation and/or VAD complication
# 12: TX Pending
# 13: Physician/Surgeon unavailable
# 14: Candidate for living donor transplant only
# I regroup if it's inactive because of health issue
val_old<-c(1,2,3,4,5,6,7,8,9,10,11,12,13,14)
val_new<-c("ONE", "ONE","ONE","ONE","ONE", "ONE","TWO","ONE","TWO", "ONE","ONE","ONE","ONE","ONE")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="LAST_INACT_REASON",val_old,val_new)

val_old<-c(0,1,2,3,4,5,6,7,8,9,10)
val_new<-c("ZERO", "MORE","MORE","MORE","MORE", "MORE","MORE","MORE","MORE", "MORE", "MORE")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="NUM_PREV_TX",val_old,val_new)

# CABG: Coronary artery bypass graft
val_old<-c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31)
val_new<-c("CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG","OTHER", "CABG","VALV",
           "CABG","OTHER", "CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG","OTHER",
           "CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PRIOR_CARD_SURG_TYPE_TCR",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PRIOR_CARD_SURG_TYPE_TRR",val_old,val_new)

# search for procurement in this form: All_Forms_eg_RH.pdf
# CABG: Coronary artery bypass graft
val_old<-c("HEPARIN","ANCEF","DOPAMINE",4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31)
val_new<-c("CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG","OTHER", "CABG","VALV",
           "CABG","OTHER", "CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG","OTHER",
           "CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PRIOR_CARD_SURG_TYPE_TCR",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PRIOR_CARD_SURG_TYPE_TRR",val_old,val_new)

# I change the following medicines to "HEPARIN", "ANCEF", "DOPAMINE", "ZOSYN", these are the most reapeted medicines
temp<-heart.df.cleaned[c("PT_OTH1_OSTXT_DON","PT_OTH2_OSTXT_DON","PT_OTH3_OSTXT_DON","PT_OTH4_OSTXT_DON")]
new_vars<-temp
names(new_vars)<-c("HEPARIN", "ANCEF", "DOPAMINE", "ZOSYN")
new_vars[,]<-NA

for(i in 1:nrow(temp)){
 if(row_missing_function(temp[i,])<1){new_vars[i,]<-"NO"}}


for(i in 1:nrow(temp)){
  if("HEPARIN" %in% gsub("[[:space:]]", "",c(strsplit(temp[i,1],",")[[1]],strsplit(temp[i,2],",")[[1]],strsplit(temp[i,3],",")[[1]],strsplit(temp[i,4],",")[[1]]))){
    new_vars[i,1]<-"YES"}
  if("ANCEF" %in% gsub("[[:space:]]", "",c(strsplit(temp[i,1],",")[[1]],strsplit(temp[i,2],",")[[1]],strsplit(temp[i,3],",")[[1]],strsplit(temp[i,4],",")[[1]]))){
    new_vars[i,2]<-"YES"}
  if("DOPAMINE" %in% gsub("[[:space:]]", "",c(strsplit(temp[i,1],",")[[1]],strsplit(temp[i,2],",")[[1]],strsplit(temp[i,3],",")[[1]],strsplit(temp[i,4],",")[[1]]))){
    new_vars[i,3]<-"YES"}
  if("ZOSYN" %in% gsub("[[:space:]]", "",c(strsplit(temp[i,1],",")[[1]],strsplit(temp[i,2],",")[[1]],strsplit(temp[i,3],",")[[1]],strsplit(temp[i,4],",")[[1]]))){
    new_vars[i,4]<-"YES"}
}

heart.df.cleaned[c("PT_OTH1_OSTXT_DON","PT_OTH2_OSTXT_DON","PT_OTH3_OSTXT_DON","PT_OTH4_OSTXT_DON")]<-NULL
heart.df.cleaned<-cbind(heart.df.cleaned,new_vars)


val_old<-c(1,2,3,998)
val_new<-c("ONE", "MORE","MORE",NA)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="STERNOTOMY_TRR",val_old,val_new)

rm(list=c("new_vars"))

#===================================================================
# Here I drop columns that 90% of their data is NA
NA_Col_Rate<-col_missing_function(heart.df.cleaned)
NA_Col_Rate$varname<-rownames(NA_Col_Rate)
NA_Col_Rate<-NA_Col_Rate[which(NA_Col_Rate$na_count_col>0.9),]
NA_Col_Rate<-NA_Col_Rate$varname
heart.df.cleaned[NA_Col_Rate]<-NULL
#===================================================================
 # Here are drop variables that more than 90% of the observations are in one category
 cat_dis <- vector(mode="numeric", length=ncol(heart.df.cleaned))
 for(i in 1:ncol(heart.df.cleaned)){
   struct<-as.data.frame(table(heart.df.cleaned[i]))
   max_cat<-max(struct$Freq)
   all_freq<-sum(struct$Freq)
   if((max_cat/all_freq)>0.9){cat_dis[i]<-1}
 }
 heart.df.cleaned[(cat_dis==1)]<-NULL
#===================================================================
#temp_save
# Here I define categorical and numerical variables
initial_num<-heart.form$VARIABLE.NAME[which(heart.form$INTERPRETATION_TYPE=="NUM")]
mutated_num<-c("BMI_CHNG" ,"WGT_CHNG" ,"HGT_CHNG" ,"AGE_MAT","BMI_MAT","PVR")
pool_num<-c(as.character(initial_num),mutated_num)
# the below is the numerical variables in the cleaned dataset
pool_num_clean<-pool_num[which(pool_num %in% names(heart.df.cleaned))]

initial_char<-heart.form$VARIABLE.NAME[which(heart.form$INTERPRETATION_TYPE=="CHAR")] 
mutated_char<-c("GENDER_MAT","ETH_MAT" ,"CARD_SURG" ,"HEPARIN" ,"ANCEF" ,"DOPAMINE","ZOSYN")
pool_char<-c(as.character(initial_char),mutated_char)
# the below is the character variables in the cleaned dataset
pool_char_clean<-pool_char[which(pool_char %in% names(heart.df.cleaned))]

#=================================================================== 
# Here are drop the numerical variables that more than 30% of the observations are NA
# we decided not to impute numerical values so a more conservative approach is adopted
NA_Col_Rate<-col_missing_function(heart.df.cleaned[pool_num_clean])
NA_Col_Rate$varname<-rownames(NA_Col_Rate)
NA_Col_Rate<-NA_Col_Rate[which(NA_Col_Rate$na_count_col>0.3),]
NA_Col_Rate<-NA_Col_Rate$varname
heart.df.cleaned[NA_Col_Rate]<-NULL
#===================================================================
# the below is the numerical variables in the cleaned dataset (I update it after dropping some variables)
pool_num_clean<-pool_num[which(pool_num %in% names(heart.df.cleaned))] 

#save1





Here are a report about the remaining variables (discarded means irrelevant/not interesting variables)

rem_type2<-as.data.frame(table(heart.form[which(heart.form$VARIABLE.NAME %in% names(heart.df.cleaned)), "INTERPRETATION_TYPE"]))
names(rem_type2)<-c("Variable Type","Frequency")
rem_type2$`Variable Type`<-c("Categorical","Initially Discarded","Date","Numerical")

cat("here is the brand new variables that I developed from the dataset")
## here is the brand new variables that I developed from the dataset
str(heart.df.cleaned[,c("PVR","BMI_CHNG","WGT_CHNG","HGT_CHNG","AGE_MAT","BMI_MAT","GENDER_MAT","ETH_MAT","CARD_SURG","HEPARIN", "ANCEF", "DOPAMINE", "ZOSYN")])
## 'data.frame':    48442 obs. of  13 variables:
##  $ PVR       : num  NA NA NA NA NA ...
##  $ BMI_CHNG  : num  0 0 0 0 0 ...
##  $ WGT_CHNG  : num  0 0 0 0 0 ...
##  $ HGT_CHNG  : num  0 0 0 0 0 ...
##  $ AGE_MAT   : num  28 26 19 32 28 24 14 6 43 19 ...
##  $ BMI_MAT   : num  5.197 9.388 2.725 0.249 5.64 ...
##  $ GENDER_MAT: chr  "Y" "N" "Y" "N" ...
##  $ ETH_MAT   : chr  "N" "Y" "N" "Y" ...
##  $ CARD_SURG : chr  NA NA NA NA ...
##  $ HEPARIN   : chr  NA NA NA NA ...
##  $ ANCEF     : chr  NA NA NA NA ...
##  $ DOPAMINE  : chr  NA NA NA NA ...
##  $ ZOSYN     : chr  NA NA NA NA ...
cat("Number of patients: ",nrow(heart.df.cleaned),"& Number of variables: ",ncol(heart.df.cleaned))
## Number of patients:  48442 & Number of variables:  123

In the following section is for identifying holdout and train sets and making them ready for the last two sections which feature selection and prediction:

  • categorical variables are converted to dummy variables
  • “UNKOWN” is imputed for NA values in the categorical variables
  • in case of having NA for numerical values, associated row is dropped
  • IDs of the holdout and train sets are selected
  • a portion of holdout sets have same observations, so later they can be used for monotonicity evaluation
# here I made 11 consecutive dependent variables, month1, year1, year2, year3, ..., year10
{
  p_unit<-c(1/12,1,2,3,4,5,6,7,8,9,10)
  predict_length<-365
  
  heart.df.cleaned$year0<-NA
  heart.df.cleaned$year1<-NA
  heart.df.cleaned$year2<-NA
  heart.df.cleaned$year3<-NA
  heart.df.cleaned$year4<-NA
  heart.df.cleaned$year5<-NA
  heart.df.cleaned$year6<-NA
  heart.df.cleaned$year7<-NA
  heart.df.cleaned$year8<-NA
  heart.df.cleaned$year9<-NA
  heart.df.cleaned$year10<-NA
  
  heart.df.cleaned$GSTATUS<-heart.df.cleaned$GSTATUS
  heart.df.cleaned$GTIME<-heart.df.cleaned$GTIME
  
  heart.df.cleaned$GSTATUS<-as.character(heart.df.cleaned$GSTATUS)
  
  heart.df.cleaned<-heart.df.cleaned[complete.cases(heart.df.cleaned$GTIME),]
  
  for(i in 1:nrow(heart.df.cleaned)){
    
    heart.df.cleaned$year0[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[1],predict_length)
    heart.df.cleaned$year1[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[2],predict_length)
    heart.df.cleaned$year2[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[3],predict_length)
    heart.df.cleaned$year3[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[4],predict_length)
    heart.df.cleaned$year4[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[5],predict_length)
    heart.df.cleaned$year5[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[6],predict_length)
    heart.df.cleaned$year6[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[7],predict_length)
    heart.df.cleaned$year7[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[8],predict_length)
    heart.df.cleaned$year8[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[9],predict_length)
    heart.df.cleaned$year9[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[10],predict_length)
    heart.df.cleaned$year10[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[11],predict_length)
  }
  
  #heart.df.cleaned<-heart.df.cleaned[complete.cases(heart.df.cleaned$year10),]
  heart.df.cleaned$GSTATUS<-NULL
  heart.df.cleaned$GTIME<-NULL
  
}
#temp
#####################

# I update numerical and categorical variables here:
pool_num_clean<-pool_num[which(pool_num %in% names(heart.df.cleaned))] 
pool_char_clean<-pool_char[which(pool_char %in% names(heart.df.cleaned))]

heart.df.cleaned<-heart.df.cleaned[complete.cases(heart.df.cleaned[pool_num_clean]),]

# I made a new version of data and then drop the numerical NAs and change the NA in categorical data to "UNKOWN"

heart.df.cleaned_char<-heart.df.cleaned[pool_char_clean]
heart.df.cleaned_char[is.na(heart.df.cleaned_char)] <- "UNKOWN"
heart.df.cleaned_num<-heart.df.cleaned[pool_num_clean]

heart.df.cleaned<-cbind(heart.df.cleaned_num,heart.df.cleaned_char,heart.df.cleaned[c("year0", "year1", "year2", "year3",
                                                                                      "year4", "year5", "year6", "year7","year8", "year9", "year10","ID")])


pool_char_clean<-c(pool_char_clean,"year0", "year1", "year2", "year3", "year4", "year5", "year6", "year7","year8", "year9", "year10")


rm(list=c("heart.df.cleaned_char","heart.df.cleaned_num"))


#next part for ensuring if type of each variable is recorded corrctly
for(i in names(heart.df.cleaned)){
  if(i %in% pool_char_clean){heart.df.cleaned[i]<-as.character(heart.df.cleaned[,i])}
  if(i %in% pool_num_clean){heart.df.cleaned[i]<-as.numeric(heart.df.cleaned[,i])}
}

{
  keep_NA <- list()
  
  # Here I keep ID 
  keep_NA$ID0 <- heart.df.cleaned[!is.na(heart.df.cleaned$year0),"ID"]
  keep_NA$ID1 <- heart.df.cleaned[!is.na(heart.df.cleaned$year1),"ID"]
  keep_NA$ID2 <- heart.df.cleaned[!is.na(heart.df.cleaned$year2),"ID"]
  keep_NA$ID3 <- heart.df.cleaned[!is.na(heart.df.cleaned$year3),"ID"]
  keep_NA$ID4 <- heart.df.cleaned[!is.na(heart.df.cleaned$year4),"ID"]
  keep_NA$ID5 <- heart.df.cleaned[!is.na(heart.df.cleaned$year5),"ID"]
  keep_NA$ID6 <- heart.df.cleaned[!is.na(heart.df.cleaned$year6),"ID"]
  keep_NA$ID7 <- heart.df.cleaned[!is.na(heart.df.cleaned$year7),"ID"]
  keep_NA$ID8 <- heart.df.cleaned[!is.na(heart.df.cleaned$year8),"ID"]
  keep_NA$ID9 <- heart.df.cleaned[!is.na(heart.df.cleaned$year9),"ID"]
  keep_NA$ID10 <- heart.df.cleaned[!is.na(heart.df.cleaned$year10),"ID"]
  all_sizes <- unname(unlist(lapply(keep_NA, length)))
  test_sizes <- round(all_sizes*0.2)
  train_sizes <- all_sizes - test_sizes
  
  # I select the holdout sets that contain the year 10 test data
  # Then I exclude those IDs from each year to find IDs for other train sets
  set.seed(2019)
  keep_NA$ID_holdout10 <- sample(keep_NA$ID10,test_sizes[11])
  keep_NA$ID_holdout9 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID9, keep_NA$ID_holdout10), (test_sizes[10]-length(keep_NA$ID_holdout10))))
  keep_NA$ID_holdout8 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID8, keep_NA$ID_holdout10), (test_sizes[9]-length(keep_NA$ID_holdout10))))
  keep_NA$ID_holdout7 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID7, keep_NA$ID_holdout10), (test_sizes[8]-length(keep_NA$ID_holdout10))))
  keep_NA$ID_holdout6 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID6, keep_NA$ID_holdout10), (test_sizes[7]-length(keep_NA$ID_holdout10))))
  keep_NA$ID_holdout5 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID5, keep_NA$ID_holdout10), (test_sizes[6]-length(keep_NA$ID_holdout10))))
  keep_NA$ID_holdout4 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID4, keep_NA$ID_holdout10), (test_sizes[5]-length(keep_NA$ID_holdout10))))
  keep_NA$ID_holdout3 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID3, keep_NA$ID_holdout10), (test_sizes[4]-length(keep_NA$ID_holdout10))))
  keep_NA$ID_holdout2 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID2, keep_NA$ID_holdout10), (test_sizes[3]-length(keep_NA$ID_holdout10))))
  keep_NA$ID_holdout1 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID1, keep_NA$ID_holdout10), (test_sizes[2]-length(keep_NA$ID_holdout10))))
  keep_NA$ID_holdout0 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID0, keep_NA$ID_holdout10), (test_sizes[1]-length(keep_NA$ID_holdout10))))
  
  keep_NA$ID_train0 <- setdiff(keep_NA$ID0, keep_NA$ID_holdout0)
  keep_NA$ID_train1 <- setdiff(keep_NA$ID1, keep_NA$ID_holdout1)
  keep_NA$ID_train2 <- setdiff(keep_NA$ID2, keep_NA$ID_holdout2)
  keep_NA$ID_train3 <- setdiff(keep_NA$ID3, keep_NA$ID_holdout3)
  keep_NA$ID_train4 <- setdiff(keep_NA$ID4, keep_NA$ID_holdout4)
  keep_NA$ID_train5 <- setdiff(keep_NA$ID5, keep_NA$ID_holdout5)
  keep_NA$ID_train6 <- setdiff(keep_NA$ID6, keep_NA$ID_holdout6)
  keep_NA$ID_train7 <- setdiff(keep_NA$ID7, keep_NA$ID_holdout7)
  keep_NA$ID_train8 <- setdiff(keep_NA$ID8, keep_NA$ID_holdout8)
  keep_NA$ID_train9 <- setdiff(keep_NA$ID9, keep_NA$ID_holdout9)
  keep_NA$ID_train10 <- setdiff(keep_NA$ID10, keep_NA$ID_holdout10)
  
  # here I make dummy variables based different data sets by making dummy variables over categorical variables
  exclud<-c("year0","year1", "year2", "year3", "year4", "year5", "year6", "year7","year8", "year9", "year10","ID")
  var_ind_char<-pool_char_clean[!pool_char_clean %in% exclud]
  
  # the following section is for creating dummy maker variables for indipendent categorical variables
  heart.df.cleaned.dum<-dummy_maker(heart.df.cleaned,var_ind_char)
 
}

#save2

The following section is for feature selection:

  • Three types of feature selections is adopted (wrapper, filter, and hybrid)
  • Wrapper method: Random Forest “best subset”
  • Filter Method: Fast correlation based feature selection “correlation”
  • Embedded method (hybrid): LASSO
# for checking th definition of the feature selection algorithms, check this paper:
# Fonti, Valeria, and Eduard Belitser. "Feature Selection using LASSO." 
     
      features<-list()   

#=============================================================
#==============Fast Feature selection
#=============================================================
features$FFS<-list()
      
# for(i in 0:10){
#         data<-heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
#                                    keep_NA[[paste("ID",i,sep="")]],c(names(heart.df.cleaned.dum)[
#                                    !names(heart.df.cleaned.dum) %in% exclud],paste("year",i,sep=""))]
#         features$FFS[[paste("year",i,sep="")]]<-FFS_bin(data)
# }

tasks_FFS <- list(
  
  FFS_0 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                 keep_NA[["ID0"]],c(names(heart.df.cleaned.dum)[
                                 !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_1 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                 keep_NA[["ID1"]],c(names(heart.df.cleaned.dum)[
                                 !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_2 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                 keep_NA[["ID2"]],c(names(heart.df.cleaned.dum)[
                                 !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_3 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID3"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_4 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                 keep_NA[["ID4"]],c(names(heart.df.cleaned.dum)[
                                !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_5 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                 keep_NA[["ID5"]],c(names(heart.df.cleaned.dum)[
                                 !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_6 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                 keep_NA[["ID6"]],c(names(heart.df.cleaned.dum)[
                                 !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_7 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                 keep_NA[["ID7"]],c(names(heart.df.cleaned.dum)[
                                 !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_8 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                 keep_NA[["ID8"]],c(names(heart.df.cleaned.dum)[
                                 !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_9 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                 keep_NA[["ID9"]],c(names(heart.df.cleaned.dum)[
                                 !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_10 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID10"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud])])}
  
)
out_FFS <- parallel::mclapply( 
  tasks_FFS, 
  function(f) f())

for(i in 0:10){
  features$FFS[[paste("year",i,sep="")]]<-eval(parse(text = paste("out_FFS$FFS_",i,sep = "")))
}

#=============================================================
#==============Lasso Feature selection for Binomial TARGETS
#=============================================================
# LASSO takes more time so I do it in parallel
features$LASSO<-list()

  # i did parallel processing fo this part
  tasks_LASSO <- list(
    
    LASSO_0 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID0"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year0")],
                          yvar="year0",folds=5,trace=F,alpha=1)},
    
    LASSO_1 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID1"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year1")],
                          yvar="year1",folds=5,trace=F,alpha=1)},
    
    LASSO_2 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID2"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year2")],
                          yvar="year2",folds=5,trace=F,alpha=1)},
    
    LASSO_3 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID3"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year3")],
                          yvar="year3",folds=5,trace=F,alpha=1)},
    
    LASSO_4 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID4"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year4")],
                          yvar="year4",folds=5,trace=F,alpha=1)},

    LASSO_5 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID5"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year5")],
                          yvar="year5",folds=5,trace=F,alpha=1)},
    
    LASSO_6 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID6"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year6")],
                          yvar="year6",folds=5,trace=F,alpha=1)},
    
    LASSO_7 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID7"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year7")],
                          yvar="year7",folds=5,trace=F,alpha=1)},
    
    LASSO_8 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID8"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year8")],
                          yvar="year8",folds=5,trace=F,alpha=1)},
    
    LASSO_9 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID9"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year9")],
                          yvar="year9",folds=5,trace=F,alpha=1)},
    
    LASSO_10 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID10"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year10")],
                          yvar="year10",folds=5,trace=F,alpha=1)}
    
  )
  out_LASSO <- parallel::mclapply( 
    tasks_LASSO, 
    function(f) f())
  
for(i in 0:10){
    features$LASSO[[paste("year",i,sep="")]]<-eval(parse(text = paste("out_LASSO$LASSO_",i,sep = "")))
  }
  
#=============================================
#==============Random Forest Feature Selection
#=============================================
# RF takes more time so I do it in parallel
features$RF<-list()
# feature selection with rnadom forrest takes a significant time, I already did this part and load
# from github, however by changing "pre_calculated_RF_features" to know, you can run this feature 
# selection algorithm
  
pre_calculated_RF_features<-"YES"
#pre_calculated_RF_features<-"NO"

if(pre_calculated_RF_features=="NO"){
# I did parallel processing fo this part
tasks_RF <- list(
  
  RF_0 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID0"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year0")],
                                  TARGET="year0")},
  
  RF_1 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID1"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year1")],
                                  TARGET="year1")},
  
  RF_2 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID2"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year2")],
                                  TARGET="year2")},
  
  RF_3 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID3"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year3")],
                                  TARGET="year3")},
  
  RF_4 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID4"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year4")],
                                  TARGET="year4")},
  
  RF_5 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID5"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year5")],
                                  TARGET="year5")},
  
  RF_6 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID6"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year6")],
                                  TARGET="year6")},
  
  RF_7 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID7"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year7")],
                                  TARGET="year7")},
  
  RF_8 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID8"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year8")],
                                  TARGET="year8")},
  
  RF_9 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID9"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year9")],
                                  TARGET="year9")},
  
  RF_10 = function() {RF_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                   keep_NA[["ID10"]],c(names(heart.df.cleaned.dum)[
                                   !names(heart.df.cleaned.dum) %in% exclud],"year10")],
                                   TARGET="year10")}
  
)
out_RF <- parallel::mclapply( 
  tasks_RF, 
  function(f) f())

for(i in 0:10){
  features$RF[[paste("year",i,sep="")]]<-eval(parse(text = paste("out_RF$RF_",i,sep = "")))
}
}else{
  
  for(i in 0:10){
    
  features$RF[[paste("year",i,sep="")]]<-read.csv( paste("https://raw.githubusercontent.com/hamidahady/transplant/Hamid/data/features/RF_Features_year",i,".csv",sep = ""))
                }
  
}


#now I merge all the features
features$all<-list()
for(i in 0:10){
  features$all[[paste("year",i,sep="")]]<-merge(merge(features$FFS[[paste("year",i,sep="")]],features$LASSO[[paste("year",i,sep="")]],
                                                all.x=TRUE, all.y=TRUE),features$RF[[paste("year",i,sep="")]],
                                                all.x=TRUE, all.y=TRUE)
}

#save3

  1. Aubrun University, hamid@auburn.edu

  2. Miami University, fmegahed@miamioh.edu

  3. University of Dayton, ychen4@udayton.edu