Data Cleaning for transplant surgeries

In the First Section I load libraries and import the data

# Initilization
#cat("\014") # Clear the console
rm(list=ls())
graphics.off()

# If you can not load Biocomb package, first remove Java based on the removal software from java.com
# install the java based on your R version (64 or 32 bit)
# set the system environment based by running either below codes:
# Sys.setenv(JAVA_HOME='C:\\Program Files\\Java\\jre1.8.0_191') # for 64-bit version
# Sys.setenv(JAVA_HOME='C:\\Program Files (x86)\\Java\\jre1.8.0_191') # for 32-bit version

require(pacman)
pacman::p_load(dplyr,caret,foreign,
       lubridate,dataPreparation,httr, DT,stringr,AUC,parallel, testit,caretEnsemble,
       C50,
       randomForest,
       kernlab,
       e1071,DT,Biocomb,Boruta)
# Biocomb is for faster filter based feature selection

source("https://raw.githubusercontent.com/hamidahady/transplant/Hamid/isotonic_paper_functions.R")

############################ Data Gathering #########################
# first I download the data from github and then merge them into one large file named heart.df
{
  # file1<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_1_9000L.rds?raw=true"))
  # file2<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_9001_18000L.rds?raw=true"))
  # file3<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_18001_27000L.rds?raw=true"))
  # file4<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_27001_36000L.rds?raw=true"))
  # file5<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_36001_45000L.rds?raw=true"))
  # file6<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_45001_54000L.rds?raw=true"))
  # file7<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_54001_63000L.rds?raw=true"))
  # file8<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_63001_72000L.rds?raw=true"))
  # file9<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_72001_81000L.rds?raw=true"))
  # file10<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_81001_90000L.rds?raw=true"))
  # file11<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_90001_99000L.rds?raw=true"))
  # file12<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_99001_108000L.rds?raw=true"))
  # file13<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_108001_117000L.rds?raw=true"))
  # file14<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_117001_126000L.rds?raw=true"))
  # file15<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_126001_135000L.rds?raw=true"))
  # file16<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_135001_144000L.rds?raw=true"))
  # file17<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_144001_153000L.rds?raw=true"))
  # file18<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_153001_159318L.rds?raw=true"))
  
  file1<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_1_9000L.rds"))
  file2<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_9001_18000L.rds"))
  file3<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_18001_27000L.rds"))
  file4<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_27001_36000L.rds"))
  file5<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_36001_45000L.rds"))
  file6<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_45001_54000L.rds"))
  file7<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_54001_63000L.rds"))
  file8<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_63001_72000L.rds"))
  file9<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_72001_81000L.rds"))
  file10<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_81001_90000L.rds"))
  file11<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_90001_99000L.rds"))
  file12<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_99001_108000L.rds"))
  file13<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_108001_117000L.rds"))
  file14<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_117001_126000L.rds"))
  file15<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_126001_135000L.rds"))
  file16<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_135001_144000L.rds"))
  file17<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_144001_153000L.rds"))
  file18<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_153001_159318L.rds"))
  
  
  heart.df<-rbind(file1,file2,file3,file4,file5,file6,file7,file8,file9,file10,file11,file12,file13,file14,file15,file16,
                  file17,file18)
  heart.df$ID<-row.names(heart.df)
  
  rm(list = c("file1","file2","file3","file4","file5","file6","file7","file8","file9","file10","file11","file12","file13",
              "file14","file15","file16","file17","file18"))
}
#####end of importing the data

# heart.form contains the variable definitions of each variable in data
heart.form <- read.csv("https://raw.githubusercontent.com/hamidahady/transplant/Hamid/data/var_desc.csv")

In the next Section, we drop some variables and observations, based on these criterias:

variables associated to before the surgery data
variables that did not end & those added before 2000
variables that are not interesting/relevant for survival analysis (e.g. follow up number, Donor ID, Dates, and …)
patients who are older than 18 years old
patients that are associated extreme 0.1 lightest people
patients that are associated extreme 0.1 shortest people
patient who did not have info. of transplanted organ
patients who has lung transplant, too.

# Identifying the variables that are discarded for further analysis
vars_discarded <- heart.form %>% # We created a column a colum INTERPRETATION_TYPE  
  subset(INTERPRETATION_TYPE=="D", select=c(1,2))
heart.discard <- vars_discarded$VARIABLE.NAME %>% as.character()

# Identifying the variables that did not end & those added before 2000
var.end.dates <- trimws(heart.form$VAR.END.DATE) %>% str_trim()
if.var.did.not.end <- which(var.end.dates=="")
names.of.var.did.not.end <- heart.form[if.var.did.not.end, 1] 

names.of.var.did.not.end <- c(as.character(names.of.var.did.not.end),"ID")

# Identifying the variables that ended
vars_ended <-heart.form[which(heart.form$VARIABLE.NAME %in% names(heart.df)[!(names(heart.df) %in% names.of.var.did.not.end)]),(1:2)]


vars.added.dates <- heart.form$VAR.START.DATE %>% as.character.Date()
vars.added.dates[which(vars.added.dates=="01-Oct-87, 01-Oct-90")] <- "01-Oct-90"
vars.added.years<- sapply(vars.added.dates, function(x) str_extract_all(x,"[0-9]{1,2}")[[1]][2]) %>% as.integer()
heart.form$YR_ADDED <- vars.added.years
vars.added.before.2000 <- subset(heart.form,YR_ADDED>=87,
                                 select = c(1))  # available variables added before 2000
vars.added.NA <- subset(heart.form,is.na(YR_ADDED),select = c(1))
vars.added.all <- rbind(vars.added.before.2000,vars.added.NA)
vars.added.all <- vars.added.all[["VARIABLE.NAME"]] %>% 
  as.character()

vars.added.all<- c(as.character(vars.added.all),"ID")

# Subsetting the data nd getting rid of variables that are ended and extreme patients or patients that are less than 18
heart.df.cleaned <- subset(heart.df, WL_ORG=="HR") %>% # Heart 
  subset(AGE>=18) %>% # Adults only
  # we excluded too light or too short people
  subset(WGT_KG_DON_CALC >= quantile(WGT_KG_DON_CALC, 0.0001, na.rm = TRUE)) %>% 
  subset(WGT_KG_TCR >= quantile(WGT_KG_TCR, 0.0001, na.rm = TRUE)) %>%
  subset(HGT_CM_DON_CALC >= quantile(HGT_CM_DON_CALC, 0.0001, na.rm = TRUE)) %>% 
  subset(HGT_CM_TCR >= quantile(HGT_CM_TCR, 0.0001, na.rm = TRUE)) %>% 
  subset(select=intersect(names.of.var.did.not.end,vars.added.all))


#Identifying and removing the variables that are post transplant
# Based on observations from the Form and Form Section Descriptors
vars.post.trans.index1 <-  sapply(heart.form$FORM.SECTION,
                                  function(x) str_detect(x,
                                                         "POST TRANSPLANT CLINICAL INFORMATION"))
vars.post.trans.index2 <-  sapply(heart.form$FORM,
                                  function(x) str_detect(x,
                                                         "TRF/TRR|TRR/TRF-CALCULATED|TRR/TRF|TRF"))
vars_post<- heart.form[as.logical(vars.post.trans.index1+vars.post.trans.index2),][,1] %>%
  as.character()

# Identifying the post transplant variables
vars_posttrans<-heart.form[which(heart.form$VARIABLE.NAME %in% vars_post),(1:2)]

vars.post.trans <- intersect(colnames(heart.df.cleaned),
                             vars_post)
heart.df.cleaned <- select(heart.df.cleaned,-vars.post.trans)
heart.discard <- intersect(heart.discard, colnames(heart.df.cleaned))
heart.df.cleaned <- select(heart.df.cleaned, -heart.discard)

Here are the variables that in the initial check we dropped them for being not interesting/irrelavence

DT::datatable(vars_discarded)

Here are the variables that ended. Their adding date is checked, even if they dropped late (2015) but the added early so they are not intersting (e.g. PRAMR_CL2).

DT::datatable(vars_ended)

Here are the variables that are relevent to the post transplant info. So, we dropped them.

DT::datatable(vars_posttrans)

Here are a report about the remaining variables (discarded means irrelevant/not interesting variables) We lready got rid of discarded variables so their frequency is zero

rem_type<-as.data.frame(table(heart.form[which(heart.form$VARIABLE.NAME %in% names(heart.df.cleaned)), "INTERPRETATION_TYPE"]))
names(rem_type)<-c("Variable Type","Frequency")
rem_type$`Variable Type`<-c("Categorical","Initially Discarded","Date","Numerical")

org_type<-as.data.frame(table(heart.df$WL_ORG))
names(org_type)<-c("Organ Type","No. of Patients")
org_type$`Variable Type`<-c("UNKOWN","Heart & Lung","Heart","Lung")
DT::datatable(org_type)

DT::datatable(rem_type)

cat("Number of patients after dropping irrelevant patients: ",nrow(heart.df.cleaned))

## Number of patients after dropping irrelevant patients:  48442

inthe following section variables are recategorized, drived, and developed based on literature considering the pool of patients

# ref1: Medved, Dennis, et al. "Improving prediction of heart transplantation outcome using deep learning techniques." Scientific reports 8.1 (2018): 3613.
  # refer to a tool provided in the: http://ihtsa.cs.lth.se/ , which is product of this paper:
  # https://www.nature.com/articles/s41598-018-21417-7.pdf
  
  # In this paper they used these variables for recipients:
  # Diagnosis, Age, Gender, Height, Weight, insulin treated diabetes, infection within two weeks, Blood group, previous blood transfusion,
  # previously transplanted, previous cardiac surgery, intensive care unit, mechanical ventilation,
  # ECMO, ventiricular assist device, transplant era, SPP, PVR, creatinine, serum bilirubin,
  # use (mg/dl) instead of (mio mol/l), PRA>10%, HLA-DR 2 mismatch
  # Also, they used these variables for donor:
  # Age, Gender, Height, Weight, Duration of ischemia, Blood group, cause of death

  # we also used the following variables without any change:
  # Age / AGE, AGE_DON
  # Height / HGT_CM_CALC, HGT_CM_DON_CALC, HGT_CM_TCR, INIT_HGT_CM_CALC, 
  # Gender / GENDER, GENDER_DON
  # Weight / WGT_KG_DON_CALC, WGT_KG_TCR, WGT_KG_CALC, PERCENT_WGT_CHANGE (we made it)
  # infection within two weeks / INFECT_IV_DRUG_TRR
  # previous blood transfusion / TRANSFUSIONS
  # Previously transplanted / PREV_TX
  # Intensive care unit / ICU
  # ventiricular assist device / VAS
  # Mechanical ventilation / VENTILATOR_TRR
  # SPP (mmHG), systolic pulmonary pressure  / HEMO_SYS_TRR
  # Creatinine / CREAT_TRR
  # Serum bilirubin (Î¼mol/l) / TBILI
  # Use (mg/dl) instead of (Î¼mol/l) / TBILI
  # we used mg/dl
  # we did not work with PRAMR since it is added after 2004

# ref2: Dag, Ali, et al. "Predicting heart transplantation outcomes through data analytics." Decision Support Systems 94 (2017): 42-52.

# Subsetting the data and creating other variables based on literature or to capture time effects

heart.df.cleaned <- subset(heart.df.cleaned) %>%  
  #ref1
  mutate(PVR = (HEMO_PA_MN_TRR- HEMO_PCW_TRR)*79.72/HEMO_CO_TRR) %>% 
  #ref1
  mutate(ISCHTIME = ISCHTIME*60) %>% 
  #ref1
  mutate(ECMO = ifelse(ECMO_TCR + ECMO_TRR == 0, 0, 1)) %>% 
  # PVR, pulmonary vascular resistance / its calculation is based on the below mentioned links:
  # https://en.wikipedia.org/wiki/Vascular_resistance
  # http://www.scymed.com/en/smnxph/phkhr013.htm
  # https://radiopaedia.org/articles/mean-pulmonary-arterial-pressure  (calculation of Mean Pulmonary Arterial Pressure)
  # PVR= (Mean Pulmonary Arterial Pressure (mmHg) - Pulmonary Capillary Wedge Pressure (mmHg)) * 79.72 / Cardiac Output (L/min)
  # PVR = (HEMO_PA_MN_TRR - HEMO_PCW_TRR)* 79.72 / HEMO_CO_TRR

  # ECMO / merge of (ECMO_TCR, ECMO_TRR)

  # The following variables are mutated by the author
  mutate(BMI_CHNG = 100*(BMI_CALC- INIT_BMI_CALC)/INIT_BMI_CALC) %>%
  # mutate(WAITING_TIME = TX_DATE - INIT_DATE)  %>%  #no need, because it is already in there as "DAYSWAIT_CHRON"
  mutate(WGT_CHNG = 100*(WGT_KG_CALC - INIT_WGT_KG_CALC)/INIT_WGT_KG_CALC) %>%
  mutate(HGT_CHNG = 100*(HGT_CM_CALC - INIT_HGT_CM_CALC)/INIT_HGT_CM_CALC) %>%
  mutate(AGE_MAT = abs(AGE - AGE_DON)) %>%
  mutate(BMI_MAT = abs(BMI_CALC - BMI_DON_CALC))


 # DIAG was used in ref1, but it has same form with TCR_DGN and THORACIC_DGN so we recategorize all three together

  val_old<-c(1000,1001,1002,1003,1004,1005,1006,1049,1007,1200)
  val_new<-c("DILATED_MYOPATHY_IDI","DILATED_MYOPATHY_OTH","DILATED_MYOPATHY_OTH","DILATED_MYOPATHY_OTH",
             "DILATED_MYOPATHY_OTH","DILATED_MYOPATHY_OTH","DILATED_MYOPATHY_OTH","DILATED_MYOPATHY_OTH",
             "DILATED_MYOPATHY_ISC","CORONARY")
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="DIAG",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="TCR_DGN",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="THORACIC_DGN",val_old,val_new)


 #Due to discrepency, I drop these 2 variables especially natural cause for death is not same in both
 heart.df.cleaned[c("DEATH_CIRCUM_DON","DEATH_MECH_DON")]<-NULL

val_old<-c(1,2,3,4,999,"Unknown")
val_new<-c("ANOXIA","CEREBROVASCULAR_STROKE","HEAD_TRAUMA","OTHER","OTHER",NA)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="COD_CAD_DON",val_old,val_new)
  
val_old<-c("A","A1","A2","B","O","AB","A1B","A2B")
val_new<-c("A","A","A","B","O","AB","AB","AB")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="ABO",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="ABO_DON",val_old,val_new)

val_old<-c(1,2,3,4,5,998)
val_new<-c("no","one","two","other","other",NA)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="DIAB",val_old,val_new)

  # previous cardiac surgery /merge of (PRIOR_CARD_SURG_TCR, PRIOR_CARD_SURG_TRR)
  
  heart.df.cleaned$CARD_SURG<-NA
  for(i in 1:nrow(heart.df.cleaned)){
    if(!is.na(heart.df.cleaned$PRIOR_CARD_SURG_TCR[i])){
      if(heart.df.cleaned$PRIOR_CARD_SURG_TCR[i]=="Y"){heart.df.cleaned$CARD_SURG[i]<-"Y"}
      
      if(heart.df.cleaned$PRIOR_CARD_SURG_TCR[i]=="N"){
        
        if(!is.na(heart.df.cleaned$PRIOR_CARD_SURG_TRR[i])){
          
          if(heart.df.cleaned$PRIOR_CARD_SURG_TRR[i]=="N"){heart.df.cleaned$CARD_SURG[i]<-"N"}
        }
      }
    }
    if(!is.na(heart.df.cleaned$PRIOR_CARD_SURG_TRR[i])){
      if(heart.df.cleaned$PRIOR_CARD_SURG_TRR[i]=="Y"){heart.df.cleaned$CARD_SURG[i]<-"Y"}}
  }


# the following section is associated to antigen matching we have the matching parameter which means if donor-recepient 
# number of matched antigen coming from parent (0: no match, 1: one matched, 2: both matched)
# we drop the antigen location numbers (trivial) and just keep the matching parameters
heart.df.cleaned[c("DA1","DA2","RA1","RA2","DB1","DB2","RB1","RB2","RDR1","RDR2","DDR1","DDR2")]<-NULL 
#these are the matching parameters that we keep
# HLAMIS, AMIS, BMIS , DRMIS


# refrence for HLAMIS
# Weisdorf, Daniel, et al. "Classification of HLA-matching for retrospective analysis of unrelated donor transplantation: revised definitions # to predict survival." Biology of Blood and Marrow Transplantation 14.7 (2008): 748-758.

# refrences for AMIS, BMIS, DRMIS:
# Parham, Peter. The immune system. Garland Science, 2014.
val_old<-c(0,1,2,3,4,5,6)
val_new<-c("a","a","a","b","c","f","e")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HLAMIS",val_old,val_new)

# previously, I categorized the transplant year in different groups based on the Nature paper, however Dr. Tessa believes to leave it as it is.
  # # Transplant era / TX_YEAR:
  # # I focused on after 2000. however, I can do for the rest, together or 4 different modeling
  # # -1995
  # # 1996-2000
  # # 2001-2005
  # # 2006-
  # heart.df.cleaned$TX_YEAR_CAT<-heart.df.cleaned$TX_YEAR
  # 
  # # categoirizing the transplantation date
  # for(i in 1:nrow(heart.df.cleaned)){
  #   if(!is.na(heart.df.cleaned$TX_YEAR[i])){ 
  #     if(heart.df.cleaned$TX_YEAR[i]<=1995){heart.df.cleaned$TX_YEAR_CAT[i]<-"A"}
  #     else if(heart.df.cleaned$TX_YEAR[i]>1995 && heart.df.cleaned$TX_YEAR[i]<=2000){heart.df.cleaned$TX_YEAR_CAT[i]<-"B"}
  #     else if(heart.df.cleaned$TX_YEAR[i]>2000 && heart.df.cleaned$TX_YEAR[i]<=2005){heart.df.cleaned$TX_YEAR_CAT[i]<-"C"}
  #     else if(heart.df.cleaned$TX_YEAR[i]>2005){heart.df.cleaned$TX_YEAR_CAT[i]<-"D"}
  #   }
  # }
  # heart.df.cleaned$TX_YEAR<-NULL


 # the following block is our variable manipulation based on the other literature as specified in Ali Dag's paper
  
  heart.df.cleaned$ETH_MAT<-NA
  for(i in 1:nrow(heart.df.cleaned)){
    if(!is.na(heart.df.cleaned$ETHCAT[i])){
      if(!is.na(heart.df.cleaned$ETHCAT_DON[i])){
        if(heart.df.cleaned$ETHCAT_DON[i]==heart.df.cleaned$ETHCAT[i]){
          heart.df.cleaned$ETH_MAT[i]<-"Y"}else{heart.df.cleaned$ETH_MAT[i]<-"N"}
      }
    }
  }
  
  heart.df.cleaned$GENDER_MAT<-NA
  for(i in 1:nrow(heart.df.cleaned)){
    if(!is.na(heart.df.cleaned$GENDER[i])){
      if(!is.na(heart.df.cleaned$GENDER_DON[i])){
        if(heart.df.cleaned$GENDER[i]==heart.df.cleaned$GENDER_DON[i]){
          heart.df.cleaned$GENDER_MAT[i]<-"Y"}else{heart.df.cleaned$GENDER_MAT[i]<-"N"}
      }
    }
  }  
  
  # PROC_TY_HR, from literature
  val_old<-c(1,2)
  val_new<-c("Bicaval","Traditional")
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PROC_TY_HR",val_old,val_new)
  
  # SHARE_TY, ALLOCATION TYPE-LOCAL/REGIONAL/NATIONAL - 3=LOCAL/4=REGIONAL/5=NATIONAL/6=FOREIGN
  val_old<-c(3,4)
  val_new<-c("LOCAL","REGIONAL")
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="SHARE_TY",val_old,val_new)
  
  # EDUCATION
  val_old<-c(1,2,3,4,5,6,996,998)
  val_new<-c("a","a","b","c","d","d",NA,NA)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="EDUCATION",val_old,val_new)
  
  # ETHCAT, ethnicity of recepients
  val_old<-c(1,2,4,5,6,7,9,998)
  val_new<-c("w","b","h","o","o","o","o",NA)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="ETHCAT",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="ETHCAT_DON",val_old,val_new)
  # the following variables identifies if they are hispanic/latino or not
  # ETHNICITY
  
  # I dropped PRI_PAYMENT_CTRY_TRR and PRI_PAYMENT_CTRY_TRR because too many NAs
  heart.df.cleaned[c("PRI_PAYMENT_CTRY_TCR","PRI_PAYMENT_CTRY_TRR")]<-NULL


  val_old<-c(1,2,3,4,5,6,7,8,9,10,11,12,13,14)
  val_new<-c("pv","pbma","pbmcffs","pbmoth","pbmoth","pbmoth","pbmoth","other","other","other","other","other","other","other")
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PRI_PAYMENT_TCR",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PRI_PAYMENT_TRR",val_old,val_new)

  val_old<-c(1,2,3,4,5,6,7,8,9,10,11)
  val_new<-c("NE","NE","SE","SE","W","W","MW","MW","NE","MW","SE")
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="REGION",val_old,val_new)

  # here we recategrize FUNC_STAT_TCR, based on their activitiy level and hospitalization status
  # https://www.communitycarenc.org/media/tool-resource-files/what-does-it-take-qualify-personal-care-services-d.pdf
  # http://www.npcrc.org/files/news/karnofsky_performance_scale.pdf

  val_old<-c(1,2,3,996,998,2010,2020,2030,2040,2050,2060,2070,2080,2090,2100)
  val_new<-c("A","B","B",NA,NA,"C","C","C","C","D","D","D","E","E","E")
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="FUNC_STAT_TRR",val_old,val_new)
  # level OTHER in the next variable has low observations but it does not seem to be merged with other categories
  # I rather keep them than drop them because if this category will not important anyway and not selected as an important feature
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="FUNC_STAT_TCR",val_old,val_new)

In the following section variables are recategorized, drived, and developed based on studying their definitions and distributions:

Variables are recategorized after studying their values
If more than 90% of the observations for a variable is NA, that variable is dropped
IF more than 90% of the observations for a variable is in just one category, that variable is dropped

 #these variables are not useful because their (pattern of) distribution is not helpful
 # I drop maligancy (MALIG_TY,MALIG_TY_TCR) variables because the levels of categories are not distinguishable very well, also there is too many NAs
 #one way is divide it by larger categories like different cancers, but even after this the number of observations in the levels
 # will be too low.
 heart.df.cleaned[ c("WL_ID_CODE", "WL_ORG","INIT_DATE","TX_DATE","CTR_CODE","DATA_TRANSPLANT",
                     "DATA_WAITLIST","DISTANCE", "DON_RETYP","ECD_DONOR","END_OPO_CTR_CODE","HOME_STATE_DON",
                     "INIT_OPO_CTR_CODE", "INOTROP_VASO_CO_TRR","INOTROP_VASO_DIA_TRR","INOTROP_VASO_MN_TRR", 
                     "INOTROP_VASO_PCW_TRR","INOTROP_VASO_SYS_TCR","INOTROP_VASO_SYS_TRR","LISTING_CTR_CODE","LOS",
                     "MALIG_TY","MALIG_TY_TCR","OPO_CTR_CODE","ORGAN","OTH_LIFE_SUP_OSTXT_TCR","OTH_LIFE_SUP_OSTXT_TRR",
                     "PERM_STATE","PRIOR_CARD_SURG_TYPE_OSTXT_TCR","PRIOR_CARD_SURG_TYPE_OSTXT_TRR","PT_CODE",
                     "TRR_ID_CODE")]<-NULL

   # Here the NA equivalent characters would change to NA
  {
    NA_cells<-c(""," ","U")
    
    for(i in 1:length(NA_cells)){
      heart.df.cleaned[heart.df.cleaned == NA_cells[i]] <- NA
      gc()}
  }

 # here the categorical variables that are fine and we use them for feature selection
 # ABO_MAT (the last category just has 13 observations), ANTIHYPE_DON , BLOOD_INF_DON, CARDARREST_NEURO
 # CLIN_INFECT_DON, CONTIN_CIG_DON, CONTIN_COCAINE_DON, CONTIN_OTH_DRUG_DON , CRSMATCH_DONE, DDAVP_DON, DIABETES_DON
 # DIAL_AFTER_LIST , DIAL_PRIOR_TX, DIET_DON, GSTATUS, GTIME,HEPARIN_DON,HIST_CIG_DON,HIST_COCAINE_DON
 # HIST_HYPERTENS_DON, HIST_OTH_DRUG_DON, IABP_TCR ,IABP_TRR, IMPL_DEFIBRIL, INOTROPES_TCR ,INOTROPES_TRR, INOTROPIC  
 # check: https://en.wikipedia.org/wiki/Inotrope, LIFE_SUP_TRR, LIFE_SUP_TCR, LV_EJECT_METH, ,"MALIG", "MALIG_TCR", "MED_COND_TRR"
 # ONVENT,"OTH_LIFE_SUP_TCR","OTH_LIFE_SUP_TRR","OTHER_HYPERTENS_MED_DON", "OTHER_INF_DON","PRIOR_LUNG_SURG_TRR","PROTEIN_URINE",
 # "PT_DIURETICS_DON","PT_STEROIDS_DON","PT_T3_DON","PT_T4_DON","PULM_CATH_DON","PULM_INF_DON","STEROID","SUD_DEATH","TATTOOS","TRTREJ1Y" ,
 # "URINE_INF_CONF_DON","URINE_INF_DON","VASODIL_DON" ,"VENT_SUPPORT_AFTER_LIST","VENT_SUPPORT_TRR"
 # VENT_TIMEFRAME_TRR # 1: At time of transplant, 2:Within three months of transplant, 3: > three months prior to transplant
 # "VENTILATOR_TCR"
 
 #here the numerical variable the we use them for feature selection 
 # BUN_DON CREAT_DON 
 # patients number of days in different levels of ptiorities:
 # "DAYS_STAT1", "DAYS_STAT1A" , "DAYS_STAT1B"   ,  "DAYS_STAT2"
 # Slaughter, Mark S. "UNOS status of heart transplant patients supported with a left ventricular assist device: 
 # is it time to reconsider the status criteria?." Texas Heart Institute Journal 38.5 (2011): 549.
 # DAYSWAIT_CHRON, END_BMI_CALC, HEMO_CO_TCR,HEMO_PA_DIA_TCR,HEMO_PA_DIA_TRR,HEMO_PA_MN_TCR,HEMO_PCW_TCR,HEMO_SYS_TCR
 # LV_EJECT :https://www.uwhealth.org/health/topic/special/heart-failure-with-reduced-ejection-fraction-systolic-heart-failure/tx4090abc.html 
 # "PO2" ,"PO2_FIO2_DON", "SGOT_DON","SGPT_DON", "TBILI_DON", "TOT_SERUM_ALBUM"

 
  val_old<-c(1,2,3,4,5,6,7,998)
  val_new<-c("NO","BNOR","BAPS","BAOTH","BAOTH","BAOTH","BAOTH",NA)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="BRONCHO_LT_DON",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="BRONCHO_RT_DON",val_old,val_new)
  
  val_old<-c(0,1,2,3,4,5,998,999)
  val_new<-c(NA,NA,"NOR","AB","AB","ABboth",NA,NA)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="CHEST_XRAY_DON",val_old,val_new)
  
  val_old<-c("C","I","N","ND","P","U")
  val_new<-c(NA,NA,"Neg",NA,"POS",NA)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="CMV_DON",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="EBV_SEROSTATUS",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HBV_CORE",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HBV_CORE_DON",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HBV_SUR_ANTIGEN",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HCV_SEROSTATUS",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HEP_C_ANTI_DON",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HTLV2_OLD_DON",val_old,val_new)
  # the next one does not have SERStAT how ever after checking it's values I understand I can use same function as above 
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HIV_SEROSTATUS",val_old,val_new)
  
  val_old<-c(1,2,3)
  val_new<-c("NO", "YES","YES")
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="CORONARY_ANGIO",val_old,val_new)
  
  val_old<-c(1,2,3,4,5,998)
  val_new<-c("NO", "YES","YES","YES","YES",NA)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HIST_DIABETES_DON",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HYPERTENS_DUR_DON",val_old,val_new)
  
  # although 2099 has low frequeency I can not merge due to this paper:
  # Huang, Edmund, et al. "Incidence of conversion to active waitlist status among temporarily inactive obese renal transplant candidates." 
  # Transplantation 98.2 (2014): 177-186.
  val_old<-c(2010,2020,2030,2090,2999)
  val_new<-c("ONE", "ONE","TWO","ONE","OTHER")
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="END_STAT",val_old,val_new)
  heart.df.cleaned<-cat_changer(heart.df.cleaned,var="INIT_STAT",val_old,val_new)
  
# LAST_INACT_REASON definition , I got it from: https://www.srtr.org/requesting-srtr-data/saf-data-dictionary/
# 1: Candidate cannot be contacted
# 2: Candidate choice
# 3: Candidate work-up incomplete
# 4: Insurance issues
# 5: Medical non-compliance
# 6: Inappropriate substance use
# 7: Temporarily too sick
# 8: Temporarily too well
# 9: Weight currently inappropriate for transplant
# 10: TX'ed - removal pending UNET data correction
# 11: Inactivation due to VAD implantation and/or VAD complication
# 12: TX Pending
# 13: Physician/Surgeon unavailable
# 14: Candidate for living donor transplant only
# I regroup if it's inactive because of health issue
val_old<-c(1,2,3,4,5,6,7,8,9,10,11,12,13,14)
val_new<-c("ONE", "ONE","ONE","ONE","ONE", "ONE","TWO","ONE","TWO", "ONE","ONE","ONE","ONE","ONE")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="LAST_INACT_REASON",val_old,val_new)

val_old<-c(0,1,2,3,4,5,6,7,8,9,10)
val_new<-c("ZERO", "MORE","MORE","MORE","MORE", "MORE","MORE","MORE","MORE", "MORE", "MORE")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="NUM_PREV_TX",val_old,val_new)

# CABG: Coronary artery bypass graft
val_old<-c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31)
val_new<-c("CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG","OTHER", "CABG","VALV",
           "CABG","OTHER", "CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG","OTHER",
           "CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PRIOR_CARD_SURG_TYPE_TCR",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PRIOR_CARD_SURG_TYPE_TRR",val_old,val_new)

# search for procurement in this form: All_Forms_eg_RH.pdf
# CABG: Coronary artery bypass graft
val_old<-c("HEPARIN","ANCEF","DOPAMINE",4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31)
val_new<-c("CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG","OTHER", "CABG","VALV",
           "CABG","OTHER", "CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG","OTHER",
           "CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PRIOR_CARD_SURG_TYPE_TCR",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PRIOR_CARD_SURG_TYPE_TRR",val_old,val_new)

# I change the following medicines to "HEPARIN", "ANCEF", "DOPAMINE", "ZOSYN", these are the most reapeted medicines
temp<-heart.df.cleaned[c("PT_OTH1_OSTXT_DON","PT_OTH2_OSTXT_DON","PT_OTH3_OSTXT_DON","PT_OTH4_OSTXT_DON")]
new_vars<-temp
names(new_vars)<-c("HEPARIN", "ANCEF", "DOPAMINE", "ZOSYN")
new_vars[,]<-NA

for(i in 1:nrow(temp)){
 if(row_missing_function(temp[i,])<1){new_vars[i,]<-"NO"}}


for(i in 1:nrow(temp)){
  if("HEPARIN" %in% gsub("[[:space:]]", "",c(strsplit(temp[i,1],",")[[1]],strsplit(temp[i,2],",")[[1]],strsplit(temp[i,3],",")[[1]],strsplit(temp[i,4],",")[[1]]))){
    new_vars[i,1]<-"YES"}
  if("ANCEF" %in% gsub("[[:space:]]", "",c(strsplit(temp[i,1],",")[[1]],strsplit(temp[i,2],",")[[1]],strsplit(temp[i,3],",")[[1]],strsplit(temp[i,4],",")[[1]]))){
    new_vars[i,2]<-"YES"}
  if("DOPAMINE" %in% gsub("[[:space:]]", "",c(strsplit(temp[i,1],",")[[1]],strsplit(temp[i,2],",")[[1]],strsplit(temp[i,3],",")[[1]],strsplit(temp[i,4],",")[[1]]))){
    new_vars[i,3]<-"YES"}
  if("ZOSYN" %in% gsub("[[:space:]]", "",c(strsplit(temp[i,1],",")[[1]],strsplit(temp[i,2],",")[[1]],strsplit(temp[i,3],",")[[1]],strsplit(temp[i,4],",")[[1]]))){
    new_vars[i,4]<-"YES"}
}

heart.df.cleaned[c("PT_OTH1_OSTXT_DON","PT_OTH2_OSTXT_DON","PT_OTH3_OSTXT_DON","PT_OTH4_OSTXT_DON")]<-NULL
heart.df.cleaned<-cbind(heart.df.cleaned,new_vars)


val_old<-c(1,2,3,998)
val_new<-c("ONE", "MORE","MORE",NA)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="STERNOTOMY_TRR",val_old,val_new)

rm(list=c("new_vars"))

#===================================================================
# Here I drop columns that 90% of their data is NA
NA_Col_Rate<-col_missing_function(heart.df.cleaned)
NA_Col_Rate$varname<-rownames(NA_Col_Rate)
NA_Col_Rate<-NA_Col_Rate[which(NA_Col_Rate$na_count_col>0.9),]
NA_Col_Rate<-NA_Col_Rate$varname
heart.df.cleaned[NA_Col_Rate]<-NULL
#===================================================================
 # Here are drop variables that more than 90% of the observations are in one category
 cat_dis <- vector(mode="numeric", length=ncol(heart.df.cleaned))
 for(i in 1:ncol(heart.df.cleaned)){
   struct<-as.data.frame(table(heart.df.cleaned[i]))
   max_cat<-max(struct$Freq)
   all_freq<-sum(struct$Freq)
   if((max_cat/all_freq)>0.9){cat_dis[i]<-1}
 }
 heart.df.cleaned[(cat_dis==1)]<-NULL
#===================================================================
#temp_save
# Here I define categorical and numerical variables
initial_num<-heart.form$VARIABLE.NAME[which(heart.form$INTERPRETATION_TYPE=="NUM")]
mutated_num<-c("BMI_CHNG" ,"WGT_CHNG" ,"HGT_CHNG" ,"AGE_MAT","BMI_MAT","PVR")
pool_num<-c(as.character(initial_num),mutated_num)
# the below is the numerical variables in the cleaned dataset
pool_num_clean<-pool_num[which(pool_num %in% names(heart.df.cleaned))]

initial_char<-heart.form$VARIABLE.NAME[which(heart.form$INTERPRETATION_TYPE=="CHAR")] 
mutated_char<-c("GENDER_MAT","ETH_MAT" ,"CARD_SURG" ,"HEPARIN" ,"ANCEF" ,"DOPAMINE","ZOSYN")
pool_char<-c(as.character(initial_char),mutated_char)
# the below is the character variables in the cleaned dataset
pool_char_clean<-pool_char[which(pool_char %in% names(heart.df.cleaned))]

#=================================================================== 
# Here are drop the numerical variables that more than 30% of the observations are NA
# we decided not to impute numerical values so a more conservative approach is adopted
NA_Col_Rate<-col_missing_function(heart.df.cleaned[pool_num_clean])
NA_Col_Rate$varname<-rownames(NA_Col_Rate)
NA_Col_Rate<-NA_Col_Rate[which(NA_Col_Rate$na_count_col>0.3),]
NA_Col_Rate<-NA_Col_Rate$varname
heart.df.cleaned[NA_Col_Rate]<-NULL
#===================================================================
# the below is the numerical variables in the cleaned dataset (I update it after dropping some variables)
pool_num_clean<-pool_num[which(pool_num %in% names(heart.df.cleaned))] 

#save1

Here are a report about the remaining variables (discarded means irrelevant/not interesting variables)

rem_type2<-as.data.frame(table(heart.form[which(heart.form$VARIABLE.NAME %in% names(heart.df.cleaned)), "INTERPRETATION_TYPE"]))
names(rem_type2)<-c("Variable Type","Frequency")
rem_type2$`Variable Type`<-c("Categorical","Initially Discarded","Date","Numerical")

cat("here is the brand new variables that I developed from the dataset")

## here is the brand new variables that I developed from the dataset

str(heart.df.cleaned[,c("PVR","BMI_CHNG","WGT_CHNG","HGT_CHNG","AGE_MAT","BMI_MAT","GENDER_MAT","ETH_MAT","CARD_SURG","HEPARIN", "ANCEF", "DOPAMINE", "ZOSYN")])

## 'data.frame':    48442 obs. of  13 variables:
##  $ PVR       : num  NA NA NA NA NA ...
##  $ BMI_CHNG  : num  0 0 0 0 0 ...
##  $ WGT_CHNG  : num  0 0 0 0 0 ...
##  $ HGT_CHNG  : num  0 0 0 0 0 ...
##  $ AGE_MAT   : num  28 26 19 32 28 24 14 6 43 19 ...
##  $ BMI_MAT   : num  5.197 9.388 2.725 0.249 5.64 ...
##  $ GENDER_MAT: chr  "Y" "N" "Y" "N" ...
##  $ ETH_MAT   : chr  "N" "Y" "N" "Y" ...
##  $ CARD_SURG : chr  NA NA NA NA ...
##  $ HEPARIN   : chr  NA NA NA NA ...
##  $ ANCEF     : chr  NA NA NA NA ...
##  $ DOPAMINE  : chr  NA NA NA NA ...
##  $ ZOSYN     : chr  NA NA NA NA ...

cat("Number of patients: ",nrow(heart.df.cleaned),"& Number of variables: ",ncol(heart.df.cleaned))

## Number of patients:  48442 & Number of variables:  123

In the following section is for identifying holdout and train sets and making them ready for the last two sections which feature selection and prediction:

categorical variables are converted to dummy variables
“UNKOWN” is imputed for NA values in the categorical variables
in case of having NA for numerical values, associated row is dropped
IDs of the holdout and train sets are selected
a portion of holdout sets have same observations, so later they can be used for monotonicity evaluation

# here I made 11 consecutive dependent variables, month1, year1, year2, year3, ..., year10
{
  p_unit<-c(1/12,1,2,3,4,5,6,7,8,9,10)
  predict_length<-365
  
  heart.df.cleaned$year0<-NA
  heart.df.cleaned$year1<-NA
  heart.df.cleaned$year2<-NA
  heart.df.cleaned$year3<-NA
  heart.df.cleaned$year4<-NA
  heart.df.cleaned$year5<-NA
  heart.df.cleaned$year6<-NA
  heart.df.cleaned$year7<-NA
  heart.df.cleaned$year8<-NA
  heart.df.cleaned$year9<-NA
  heart.df.cleaned$year10<-NA
  
  heart.df.cleaned$GSTATUS<-heart.df.cleaned$GSTATUS
  heart.df.cleaned$GTIME<-heart.df.cleaned$GTIME
  
  heart.df.cleaned$GSTATUS<-as.character(heart.df.cleaned$GSTATUS)
  
  heart.df.cleaned<-heart.df.cleaned[complete.cases(heart.df.cleaned$GTIME),]
  
  for(i in 1:nrow(heart.df.cleaned)){
    
    heart.df.cleaned$year0[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[1],predict_length)
    heart.df.cleaned$year1[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[2],predict_length)
    heart.df.cleaned$year2[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[3],predict_length)
    heart.df.cleaned$year3[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[4],predict_length)
    heart.df.cleaned$year4[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[5],predict_length)
    heart.df.cleaned$year5[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[6],predict_length)
    heart.df.cleaned$year6[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[7],predict_length)
    heart.df.cleaned$year7[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[8],predict_length)
    heart.df.cleaned$year8[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[9],predict_length)
    heart.df.cleaned$year9[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[10],predict_length)
    heart.df.cleaned$year10[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[11],predict_length)
  }
  
  #heart.df.cleaned<-heart.df.cleaned[complete.cases(heart.df.cleaned$year10),]
  heart.df.cleaned$GSTATUS<-NULL
  heart.df.cleaned$GTIME<-NULL
  
}
#temp
#####################

# I update numerical and categorical variables here:
pool_num_clean<-pool_num[which(pool_num %in% names(heart.df.cleaned))] 
pool_char_clean<-pool_char[which(pool_char %in% names(heart.df.cleaned))]

heart.df.cleaned<-heart.df.cleaned[complete.cases(heart.df.cleaned[pool_num_clean]),]

# I made a new version of data and then drop the numerical NAs and change the NA in categorical data to "UNKOWN"

heart.df.cleaned_char<-heart.df.cleaned[pool_char_clean]
heart.df.cleaned_char[is.na(heart.df.cleaned_char)] <- "UNKOWN"
heart.df.cleaned_num<-heart.df.cleaned[pool_num_clean]

heart.df.cleaned<-cbind(heart.df.cleaned_num,heart.df.cleaned_char,heart.df.cleaned[c("year0", "year1", "year2", "year3",
                                                                                      "year4", "year5", "year6", "year7","year8", "year9", "year10","ID")])


pool_char_clean<-c(pool_char_clean,"year0", "year1", "year2", "year3", "year4", "year5", "year6", "year7","year8", "year9", "year10")


rm(list=c("heart.df.cleaned_char","heart.df.cleaned_num"))


#next part for ensuring if type of each variable is recorded corrctly
for(i in names(heart.df.cleaned)){
  if(i %in% pool_char_clean){heart.df.cleaned[i]<-as.character(heart.df.cleaned[,i])}
  if(i %in% pool_num_clean){heart.df.cleaned[i]<-as.numeric(heart.df.cleaned[,i])}
}

{
  keep_NA <- list()
  
  # Here I keep ID 
  keep_NA$ID0 <- heart.df.cleaned[!is.na(heart.df.cleaned$year0),"ID"]
  keep_NA$ID1 <- heart.df.cleaned[!is.na(heart.df.cleaned$year1),"ID"]
  keep_NA$ID2 <- heart.df.cleaned[!is.na(heart.df.cleaned$year2),"ID"]
  keep_NA$ID3 <- heart.df.cleaned[!is.na(heart.df.cleaned$year3),"ID"]
  keep_NA$ID4 <- heart.df.cleaned[!is.na(heart.df.cleaned$year4),"ID"]
  keep_NA$ID5 <- heart.df.cleaned[!is.na(heart.df.cleaned$year5),"ID"]
  keep_NA$ID6 <- heart.df.cleaned[!is.na(heart.df.cleaned$year6),"ID"]
  keep_NA$ID7 <- heart.df.cleaned[!is.na(heart.df.cleaned$year7),"ID"]
  keep_NA$ID8 <- heart.df.cleaned[!is.na(heart.df.cleaned$year8),"ID"]
  keep_NA$ID9 <- heart.df.cleaned[!is.na(heart.df.cleaned$year9),"ID"]
  keep_NA$ID10 <- heart.df.cleaned[!is.na(heart.df.cleaned$year10),"ID"]
  all_sizes <- unname(unlist(lapply(keep_NA, length)))
  test_sizes <- round(all_sizes*0.2)
  train_sizes <- all_sizes - test_sizes
  
  # I select the holdout sets that contain the year 10 test data
  # Then I exclude those IDs from each year to find IDs for other train sets
  set.seed(2019)
  keep_NA$ID_holdout10 <- sample(keep_NA$ID10,test_sizes[11])
  keep_NA$ID_holdout9 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID9, keep_NA$ID_holdout10), (test_sizes[10]-length(keep_NA$ID_holdout10))))
  keep_NA$ID_holdout8 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID8, keep_NA$ID_holdout10), (test_sizes[9]-length(keep_NA$ID_holdout10))))
  keep_NA$ID_holdout7 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID7, keep_NA$ID_holdout10), (test_sizes[8]-length(keep_NA$ID_holdout10))))
  keep_NA$ID_holdout6 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID6, keep_NA$ID_holdout10), (test_sizes[7]-length(keep_NA$ID_holdout10))))
  keep_NA$ID_holdout5 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID5, keep_NA$ID_holdout10), (test_sizes[6]-length(keep_NA$ID_holdout10))))
  keep_NA$ID_holdout4 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID4, keep_NA$ID_holdout10), (test_sizes[5]-length(keep_NA$ID_holdout10))))
  keep_NA$ID_holdout3 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID3, keep_NA$ID_holdout10), (test_sizes[4]-length(keep_NA$ID_holdout10))))
  keep_NA$ID_holdout2 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID2, keep_NA$ID_holdout10), (test_sizes[3]-length(keep_NA$ID_holdout10))))
  keep_NA$ID_holdout1 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID1, keep_NA$ID_holdout10), (test_sizes[2]-length(keep_NA$ID_holdout10))))
  keep_NA$ID_holdout0 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID0, keep_NA$ID_holdout10), (test_sizes[1]-length(keep_NA$ID_holdout10))))
  
  keep_NA$ID_train0 <- setdiff(keep_NA$ID0, keep_NA$ID_holdout0)
  keep_NA$ID_train1 <- setdiff(keep_NA$ID1, keep_NA$ID_holdout1)
  keep_NA$ID_train2 <- setdiff(keep_NA$ID2, keep_NA$ID_holdout2)
  keep_NA$ID_train3 <- setdiff(keep_NA$ID3, keep_NA$ID_holdout3)
  keep_NA$ID_train4 <- setdiff(keep_NA$ID4, keep_NA$ID_holdout4)
  keep_NA$ID_train5 <- setdiff(keep_NA$ID5, keep_NA$ID_holdout5)
  keep_NA$ID_train6 <- setdiff(keep_NA$ID6, keep_NA$ID_holdout6)
  keep_NA$ID_train7 <- setdiff(keep_NA$ID7, keep_NA$ID_holdout7)
  keep_NA$ID_train8 <- setdiff(keep_NA$ID8, keep_NA$ID_holdout8)
  keep_NA$ID_train9 <- setdiff(keep_NA$ID9, keep_NA$ID_holdout9)
  keep_NA$ID_train10 <- setdiff(keep_NA$ID10, keep_NA$ID_holdout10)
  
  # here I make dummy variables based different data sets by making dummy variables over categorical variables
  exclud<-c("year0","year1", "year2", "year3", "year4", "year5", "year6", "year7","year8", "year9", "year10","ID")
  var_ind_char<-pool_char_clean[!pool_char_clean %in% exclud]
  
  # the following section is for creating dummy maker variables for indipendent categorical variables
  heart.df.cleaned.dum<-dummy_maker(heart.df.cleaned,var_ind_char)
 
}

#save2

The following section is for feature selection:

Three types of feature selections is adopted (wrapper, filter, and hybrid)
Wrapper method: Random Forest “best subset”
Filter Method: Fast correlation based feature selection “correlation”
Embedded method (hybrid): LASSO

# for checking th definition of the feature selection algorithms, check this paper:
# Fonti, Valeria, and Eduard Belitser. "Feature Selection using LASSO." 
     
      features<-list()   

#=============================================================
#==============Fast Feature selection
#=============================================================
features$FFS<-list()
      
# for(i in 0:10){
#         data<-heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
#                                    keep_NA[[paste("ID",i,sep="")]],c(names(heart.df.cleaned.dum)[
#                                    !names(heart.df.cleaned.dum) %in% exclud],paste("year",i,sep=""))]
#         features$FFS[[paste("year",i,sep="")]]<-FFS_bin(data)
# }

tasks_FFS <- list(
  
  FFS_0 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                 keep_NA[["ID0"]],c(names(heart.df.cleaned.dum)[
                                 !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_1 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                 keep_NA[["ID1"]],c(names(heart.df.cleaned.dum)[
                                 !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_2 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                 keep_NA[["ID2"]],c(names(heart.df.cleaned.dum)[
                                 !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_3 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID3"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_4 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                 keep_NA[["ID4"]],c(names(heart.df.cleaned.dum)[
                                !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_5 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                 keep_NA[["ID5"]],c(names(heart.df.cleaned.dum)[
                                 !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_6 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                 keep_NA[["ID6"]],c(names(heart.df.cleaned.dum)[
                                 !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_7 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                 keep_NA[["ID7"]],c(names(heart.df.cleaned.dum)[
                                 !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_8 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                 keep_NA[["ID8"]],c(names(heart.df.cleaned.dum)[
                                 !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_9 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                 keep_NA[["ID9"]],c(names(heart.df.cleaned.dum)[
                                 !names(heart.df.cleaned.dum) %in% exclud])])},
  
  FFS_10 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID10"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud])])}
  
)
out_FFS <- parallel::mclapply( 
  tasks_FFS, 
  function(f) f())

for(i in 0:10){
  features$FFS[[paste("year",i,sep="")]]<-eval(parse(text = paste("out_FFS$FFS_",i,sep = "")))
}

#=============================================================
#==============Lasso Feature selection for Binomial TARGETS
#=============================================================
# LASSO takes more time so I do it in parallel
features$LASSO<-list()

  # i did parallel processing fo this part
  tasks_LASSO <- list(
    
    LASSO_0 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID0"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year0")],
                          yvar="year0",folds=5,trace=F,alpha=1)},
    
    LASSO_1 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID1"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year1")],
                          yvar="year1",folds=5,trace=F,alpha=1)},
    
    LASSO_2 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID2"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year2")],
                          yvar="year2",folds=5,trace=F,alpha=1)},
    
    LASSO_3 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID3"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year3")],
                          yvar="year3",folds=5,trace=F,alpha=1)},
    
    LASSO_4 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID4"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year4")],
                          yvar="year4",folds=5,trace=F,alpha=1)},

    LASSO_5 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID5"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year5")],
                          yvar="year5",folds=5,trace=F,alpha=1)},
    
    LASSO_6 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID6"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year6")],
                          yvar="year6",folds=5,trace=F,alpha=1)},
    
    LASSO_7 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID7"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year7")],
                          yvar="year7",folds=5,trace=F,alpha=1)},
    
    LASSO_8 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID8"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year8")],
                          yvar="year8",folds=5,trace=F,alpha=1)},
    
    LASSO_9 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID9"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year9")],
                          yvar="year9",folds=5,trace=F,alpha=1)},
    
    LASSO_10 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                          keep_NA[["ID10"]],c(names(heart.df.cleaned.dum)[
                          !names(heart.df.cleaned.dum) %in% exclud],"year10")],
                          yvar="year10",folds=5,trace=F,alpha=1)}
    
  )
  out_LASSO <- parallel::mclapply( 
    tasks_LASSO, 
    function(f) f())
  
for(i in 0:10){
    features$LASSO[[paste("year",i,sep="")]]<-eval(parse(text = paste("out_LASSO$LASSO_",i,sep = "")))
  }
  
#=============================================
#==============Random Forest Feature Selection
#=============================================
# RF takes more time so I do it in parallel
features$RF<-list()
# feature selection with rnadom forrest takes a significant time, I already did this part and load
# from github, however by changing "pre_calculated_RF_features" to know, you can run this feature 
# selection algorithm
  
pre_calculated_RF_features<-"YES"
#pre_calculated_RF_features<-"NO"

if(pre_calculated_RF_features=="NO"){
# I did parallel processing fo this part
tasks_RF <- list(
  
  RF_0 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID0"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year0")],
                                  TARGET="year0")},
  
  RF_1 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID1"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year1")],
                                  TARGET="year1")},
  
  RF_2 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID2"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year2")],
                                  TARGET="year2")},
  
  RF_3 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID3"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year3")],
                                  TARGET="year3")},
  
  RF_4 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID4"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year4")],
                                  TARGET="year4")},
  
  RF_5 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID5"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year5")],
                                  TARGET="year5")},
  
  RF_6 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID6"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year6")],
                                  TARGET="year6")},
  
  RF_7 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID7"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year7")],
                                  TARGET="year7")},
  
  RF_8 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID8"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year8")],
                                  TARGET="year8")},
  
  RF_9 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                  keep_NA[["ID9"]],c(names(heart.df.cleaned.dum)[
                                  !names(heart.df.cleaned.dum) %in% exclud],"year9")],
                                  TARGET="year9")},
  
  RF_10 = function() {RF_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in% 
                                   keep_NA[["ID10"]],c(names(heart.df.cleaned.dum)[
                                   !names(heart.df.cleaned.dum) %in% exclud],"year10")],
                                   TARGET="year10")}
  
)
out_RF <- parallel::mclapply( 
  tasks_RF, 
  function(f) f())

for(i in 0:10){
  features$RF[[paste("year",i,sep="")]]<-eval(parse(text = paste("out_RF$RF_",i,sep = "")))
}
}else{
  
  for(i in 0:10){
    
  features$RF[[paste("year",i,sep="")]]<-read.csv( paste("https://raw.githubusercontent.com/hamidahady/transplant/Hamid/data/features/RF_Features_year",i,".csv",sep = ""))
                }
  
}


#now I merge all the features
features$all<-list()
for(i in 0:10){
  features$all[[paste("year",i,sep="")]]<-merge(merge(features$FFS[[paste("year",i,sep="")]],features$LASSO[[paste("year",i,sep="")]],
                                                all.x=TRUE, all.y=TRUE),features$RF[[paste("year",i,sep="")]],
                                                all.x=TRUE, all.y=TRUE)
}

#save3

Aubrun University, hamid@auburn.edu ↩
Miami University, fmegahed@miamioh.edu ↩
University of Dayton, ychen4@udayton.edu ↩

Data Cleaning for transplant surgeries

Hamidreza Ahady Dolatsara¹

Fadel M. Megahed²

Ying-Ju Tessa Chen³

December 22, 2018

In the next Section, we drop some variables and observations, based on these criterias:

In the following section variables are recategorized, drived, and developed based on studying their definitions and distributions:

In the following section is for identifying holdout and train sets and making them ready for the last two sections which feature selection and prediction:

The following section is for feature selection:

Data Cleaning for transplant surgeries

Hamidreza Ahady Dolatsara1

Fadel M. Megahed2

Ying-Ju Tessa Chen3

December 22, 2018

In the next Section, we drop some variables and observations, based on these criterias:

In the following section variables are recategorized, drived, and developed based on studying their definitions and distributions:

In the following section is for identifying holdout and train sets and making them ready for the last two sections which feature selection and prediction:

The following section is for feature selection:

Hamidreza Ahady Dolatsara¹

Fadel M. Megahed²

Ying-Ju Tessa Chen³