In the First Section I load libraries and import the data
# Initilization
#cat("\014") # Clear the console
rm(list=ls())
graphics.off()
# If you can not load Biocomb package, first remove Java based on the removal software from java.com
# install the java based on your R version (64 or 32 bit)
# set the system environment based by running either below codes:
# Sys.setenv(JAVA_HOME='C:\\Program Files\\Java\\jre1.8.0_191') # for 64-bit version
# Sys.setenv(JAVA_HOME='C:\\Program Files (x86)\\Java\\jre1.8.0_191') # for 32-bit version
require(pacman)
pacman::p_load(dplyr,caret,foreign,
lubridate,dataPreparation,httr, DT,stringr,AUC,parallel, testit,caretEnsemble,
C50,
randomForest,
kernlab,
e1071,DT,Biocomb,Boruta)
# Biocomb is for faster filter based feature selection
source("https://raw.githubusercontent.com/hamidahady/transplant/Hamid/isotonic_paper_functions.R")
############################ Data Gathering #########################
# first I download the data from github and then merge them into one large file named heart.df
{
# file1<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_1_9000L.rds?raw=true"))
# file2<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_9001_18000L.rds?raw=true"))
# file3<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_18001_27000L.rds?raw=true"))
# file4<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_27001_36000L.rds?raw=true"))
# file5<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_36001_45000L.rds?raw=true"))
# file6<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_45001_54000L.rds?raw=true"))
# file7<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_54001_63000L.rds?raw=true"))
# file8<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_63001_72000L.rds?raw=true"))
# file9<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_72001_81000L.rds?raw=true"))
# file10<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_81001_90000L.rds?raw=true"))
# file11<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_90001_99000L.rds?raw=true"))
# file12<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_99001_108000L.rds?raw=true"))
# file13<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_108001_117000L.rds?raw=true"))
# file14<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_117001_126000L.rds?raw=true"))
# file15<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_126001_135000L.rds?raw=true"))
# file16<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_135001_144000L.rds?raw=true"))
# file17<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_144001_153000L.rds?raw=true"))
# file18<-readRDS(url("https://github.com/hamidahady/transplant/blob/master/data/raw/thoracic_data_153001_159318L.rds?raw=true"))
file1<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_1_9000L.rds"))
file2<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_9001_18000L.rds"))
file3<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_18001_27000L.rds"))
file4<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_27001_36000L.rds"))
file5<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_36001_45000L.rds"))
file6<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_45001_54000L.rds"))
file7<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_54001_63000L.rds"))
file8<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_63001_72000L.rds"))
file9<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_72001_81000L.rds"))
file10<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_81001_90000L.rds"))
file11<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_90001_99000L.rds"))
file12<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_99001_108000L.rds"))
file13<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_108001_117000L.rds"))
file14<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_117001_126000L.rds"))
file15<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_126001_135000L.rds"))
file16<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_135001_144000L.rds"))
file17<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_144001_153000L.rds"))
file18<-readRDS(("C:/Users/hza0020/Box/Transplant/Thesis/github/transplant/transplant/data/raw/thoracic_data_153001_159318L.rds"))
heart.df<-rbind(file1,file2,file3,file4,file5,file6,file7,file8,file9,file10,file11,file12,file13,file14,file15,file16,
file17,file18)
heart.df$ID<-row.names(heart.df)
rm(list = c("file1","file2","file3","file4","file5","file6","file7","file8","file9","file10","file11","file12","file13",
"file14","file15","file16","file17","file18"))
}
#####end of importing the data
# heart.form contains the variable definitions of each variable in data
heart.form <- read.csv("https://raw.githubusercontent.com/hamidahady/transplant/Hamid/data/var_desc.csv")
In the next Section, we drop some variables and observations, based on these criterias:
# Identifying the variables that are discarded for further analysis
vars_discarded <- heart.form %>% # We created a column a colum INTERPRETATION_TYPE
subset(INTERPRETATION_TYPE=="D", select=c(1,2))
heart.discard <- vars_discarded$VARIABLE.NAME %>% as.character()
# Identifying the variables that did not end & those added before 2000
var.end.dates <- trimws(heart.form$VAR.END.DATE) %>% str_trim()
if.var.did.not.end <- which(var.end.dates=="")
names.of.var.did.not.end <- heart.form[if.var.did.not.end, 1]
names.of.var.did.not.end <- c(as.character(names.of.var.did.not.end),"ID")
# Identifying the variables that ended
vars_ended <-heart.form[which(heart.form$VARIABLE.NAME %in% names(heart.df)[!(names(heart.df) %in% names.of.var.did.not.end)]),(1:2)]
vars.added.dates <- heart.form$VAR.START.DATE %>% as.character.Date()
vars.added.dates[which(vars.added.dates=="01-Oct-87, 01-Oct-90")] <- "01-Oct-90"
vars.added.years<- sapply(vars.added.dates, function(x) str_extract_all(x,"[0-9]{1,2}")[[1]][2]) %>% as.integer()
heart.form$YR_ADDED <- vars.added.years
vars.added.before.2000 <- subset(heart.form,YR_ADDED>=87,
select = c(1)) # available variables added before 2000
vars.added.NA <- subset(heart.form,is.na(YR_ADDED),select = c(1))
vars.added.all <- rbind(vars.added.before.2000,vars.added.NA)
vars.added.all <- vars.added.all[["VARIABLE.NAME"]] %>%
as.character()
vars.added.all<- c(as.character(vars.added.all),"ID")
# Subsetting the data nd getting rid of variables that are ended and extreme patients or patients that are less than 18
heart.df.cleaned <- subset(heart.df, WL_ORG=="HR") %>% # Heart
subset(AGE>=18) %>% # Adults only
# we excluded too light or too short people
subset(WGT_KG_DON_CALC >= quantile(WGT_KG_DON_CALC, 0.0001, na.rm = TRUE)) %>%
subset(WGT_KG_TCR >= quantile(WGT_KG_TCR, 0.0001, na.rm = TRUE)) %>%
subset(HGT_CM_DON_CALC >= quantile(HGT_CM_DON_CALC, 0.0001, na.rm = TRUE)) %>%
subset(HGT_CM_TCR >= quantile(HGT_CM_TCR, 0.0001, na.rm = TRUE)) %>%
subset(select=intersect(names.of.var.did.not.end,vars.added.all))
#Identifying and removing the variables that are post transplant
# Based on observations from the Form and Form Section Descriptors
vars.post.trans.index1 <- sapply(heart.form$FORM.SECTION,
function(x) str_detect(x,
"POST TRANSPLANT CLINICAL INFORMATION"))
vars.post.trans.index2 <- sapply(heart.form$FORM,
function(x) str_detect(x,
"TRF/TRR|TRR/TRF-CALCULATED|TRR/TRF|TRF"))
vars_post<- heart.form[as.logical(vars.post.trans.index1+vars.post.trans.index2),][,1] %>%
as.character()
# Identifying the post transplant variables
vars_posttrans<-heart.form[which(heart.form$VARIABLE.NAME %in% vars_post),(1:2)]
vars.post.trans <- intersect(colnames(heart.df.cleaned),
vars_post)
heart.df.cleaned <- select(heart.df.cleaned,-vars.post.trans)
heart.discard <- intersect(heart.discard, colnames(heart.df.cleaned))
heart.df.cleaned <- select(heart.df.cleaned, -heart.discard)
Here are the variables that in the initial check we dropped them for being not interesting/irrelavence
DT::datatable(vars_discarded)
DT::datatable(vars_ended)
DT::datatable(vars_posttrans)
rem_type<-as.data.frame(table(heart.form[which(heart.form$VARIABLE.NAME %in% names(heart.df.cleaned)), "INTERPRETATION_TYPE"]))
names(rem_type)<-c("Variable Type","Frequency")
rem_type$`Variable Type`<-c("Categorical","Initially Discarded","Date","Numerical")
org_type<-as.data.frame(table(heart.df$WL_ORG))
names(org_type)<-c("Organ Type","No. of Patients")
org_type$`Variable Type`<-c("UNKOWN","Heart & Lung","Heart","Lung")
DT::datatable(org_type)
DT::datatable(rem_type)
cat("Number of patients after dropping irrelevant patients: ",nrow(heart.df.cleaned))
## Number of patients after dropping irrelevant patients: 48442
inthe following section variables are recategorized, drived, and developed based on literature considering the pool of patients
# ref1: Medved, Dennis, et al. "Improving prediction of heart transplantation outcome using deep learning techniques." Scientific reports 8.1 (2018): 3613.
# refer to a tool provided in the: http://ihtsa.cs.lth.se/ , which is product of this paper:
# https://www.nature.com/articles/s41598-018-21417-7.pdf
# In this paper they used these variables for recipients:
# Diagnosis, Age, Gender, Height, Weight, insulin treated diabetes, infection within two weeks, Blood group, previous blood transfusion,
# previously transplanted, previous cardiac surgery, intensive care unit, mechanical ventilation,
# ECMO, ventiricular assist device, transplant era, SPP, PVR, creatinine, serum bilirubin,
# use (mg/dl) instead of (mio mol/l), PRA>10%, HLA-DR 2 mismatch
# Also, they used these variables for donor:
# Age, Gender, Height, Weight, Duration of ischemia, Blood group, cause of death
# we also used the following variables without any change:
# Age / AGE, AGE_DON
# Height / HGT_CM_CALC, HGT_CM_DON_CALC, HGT_CM_TCR, INIT_HGT_CM_CALC,
# Gender / GENDER, GENDER_DON
# Weight / WGT_KG_DON_CALC, WGT_KG_TCR, WGT_KG_CALC, PERCENT_WGT_CHANGE (we made it)
# infection within two weeks / INFECT_IV_DRUG_TRR
# previous blood transfusion / TRANSFUSIONS
# Previously transplanted / PREV_TX
# Intensive care unit / ICU
# ventiricular assist device / VAS
# Mechanical ventilation / VENTILATOR_TRR
# SPP (mmHG), systolic pulmonary pressure / HEMO_SYS_TRR
# Creatinine / CREAT_TRR
# Serum bilirubin (μmol/l) / TBILI
# Use (mg/dl) instead of (μmol/l) / TBILI
# we used mg/dl
# we did not work with PRAMR since it is added after 2004
# ref2: Dag, Ali, et al. "Predicting heart transplantation outcomes through data analytics." Decision Support Systems 94 (2017): 42-52.
# Subsetting the data and creating other variables based on literature or to capture time effects
heart.df.cleaned <- subset(heart.df.cleaned) %>%
#ref1
mutate(PVR = (HEMO_PA_MN_TRR- HEMO_PCW_TRR)*79.72/HEMO_CO_TRR) %>%
#ref1
mutate(ISCHTIME = ISCHTIME*60) %>%
#ref1
mutate(ECMO = ifelse(ECMO_TCR + ECMO_TRR == 0, 0, 1)) %>%
# PVR, pulmonary vascular resistance / its calculation is based on the below mentioned links:
# https://en.wikipedia.org/wiki/Vascular_resistance
# http://www.scymed.com/en/smnxph/phkhr013.htm
# https://radiopaedia.org/articles/mean-pulmonary-arterial-pressure (calculation of Mean Pulmonary Arterial Pressure)
# PVR= (Mean Pulmonary Arterial Pressure (mmHg) - Pulmonary Capillary Wedge Pressure (mmHg)) * 79.72 / Cardiac Output (L/min)
# PVR = (HEMO_PA_MN_TRR - HEMO_PCW_TRR)* 79.72 / HEMO_CO_TRR
# ECMO / merge of (ECMO_TCR, ECMO_TRR)
# The following variables are mutated by the author
mutate(BMI_CHNG = 100*(BMI_CALC- INIT_BMI_CALC)/INIT_BMI_CALC) %>%
# mutate(WAITING_TIME = TX_DATE - INIT_DATE) %>% #no need, because it is already in there as "DAYSWAIT_CHRON"
mutate(WGT_CHNG = 100*(WGT_KG_CALC - INIT_WGT_KG_CALC)/INIT_WGT_KG_CALC) %>%
mutate(HGT_CHNG = 100*(HGT_CM_CALC - INIT_HGT_CM_CALC)/INIT_HGT_CM_CALC) %>%
mutate(AGE_MAT = abs(AGE - AGE_DON)) %>%
mutate(BMI_MAT = abs(BMI_CALC - BMI_DON_CALC))
# DIAG was used in ref1, but it has same form with TCR_DGN and THORACIC_DGN so we recategorize all three together
val_old<-c(1000,1001,1002,1003,1004,1005,1006,1049,1007,1200)
val_new<-c("DILATED_MYOPATHY_IDI","DILATED_MYOPATHY_OTH","DILATED_MYOPATHY_OTH","DILATED_MYOPATHY_OTH",
"DILATED_MYOPATHY_OTH","DILATED_MYOPATHY_OTH","DILATED_MYOPATHY_OTH","DILATED_MYOPATHY_OTH",
"DILATED_MYOPATHY_ISC","CORONARY")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="DIAG",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="TCR_DGN",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="THORACIC_DGN",val_old,val_new)
#Due to discrepency, I drop these 2 variables especially natural cause for death is not same in both
heart.df.cleaned[c("DEATH_CIRCUM_DON","DEATH_MECH_DON")]<-NULL
val_old<-c(1,2,3,4,999,"Unknown")
val_new<-c("ANOXIA","CEREBROVASCULAR_STROKE","HEAD_TRAUMA","OTHER","OTHER",NA)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="COD_CAD_DON",val_old,val_new)
val_old<-c("A","A1","A2","B","O","AB","A1B","A2B")
val_new<-c("A","A","A","B","O","AB","AB","AB")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="ABO",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="ABO_DON",val_old,val_new)
val_old<-c(1,2,3,4,5,998)
val_new<-c("no","one","two","other","other",NA)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="DIAB",val_old,val_new)
# previous cardiac surgery /merge of (PRIOR_CARD_SURG_TCR, PRIOR_CARD_SURG_TRR)
heart.df.cleaned$CARD_SURG<-NA
for(i in 1:nrow(heart.df.cleaned)){
if(!is.na(heart.df.cleaned$PRIOR_CARD_SURG_TCR[i])){
if(heart.df.cleaned$PRIOR_CARD_SURG_TCR[i]=="Y"){heart.df.cleaned$CARD_SURG[i]<-"Y"}
if(heart.df.cleaned$PRIOR_CARD_SURG_TCR[i]=="N"){
if(!is.na(heart.df.cleaned$PRIOR_CARD_SURG_TRR[i])){
if(heart.df.cleaned$PRIOR_CARD_SURG_TRR[i]=="N"){heart.df.cleaned$CARD_SURG[i]<-"N"}
}
}
}
if(!is.na(heart.df.cleaned$PRIOR_CARD_SURG_TRR[i])){
if(heart.df.cleaned$PRIOR_CARD_SURG_TRR[i]=="Y"){heart.df.cleaned$CARD_SURG[i]<-"Y"}}
}
# the following section is associated to antigen matching we have the matching parameter which means if donor-recepient
# number of matched antigen coming from parent (0: no match, 1: one matched, 2: both matched)
# we drop the antigen location numbers (trivial) and just keep the matching parameters
heart.df.cleaned[c("DA1","DA2","RA1","RA2","DB1","DB2","RB1","RB2","RDR1","RDR2","DDR1","DDR2")]<-NULL
#these are the matching parameters that we keep
# HLAMIS, AMIS, BMIS , DRMIS
# refrence for HLAMIS
# Weisdorf, Daniel, et al. "Classification of HLA-matching for retrospective analysis of unrelated donor transplantation: revised definitions # to predict survival." Biology of Blood and Marrow Transplantation 14.7 (2008): 748-758.
# refrences for AMIS, BMIS, DRMIS:
# Parham, Peter. The immune system. Garland Science, 2014.
val_old<-c(0,1,2,3,4,5,6)
val_new<-c("a","a","a","b","c","f","e")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HLAMIS",val_old,val_new)
# previously, I categorized the transplant year in different groups based on the Nature paper, however Dr. Tessa believes to leave it as it is.
# # Transplant era / TX_YEAR:
# # I focused on after 2000. however, I can do for the rest, together or 4 different modeling
# # -1995
# # 1996-2000
# # 2001-2005
# # 2006-
# heart.df.cleaned$TX_YEAR_CAT<-heart.df.cleaned$TX_YEAR
#
# # categoirizing the transplantation date
# for(i in 1:nrow(heart.df.cleaned)){
# if(!is.na(heart.df.cleaned$TX_YEAR[i])){
# if(heart.df.cleaned$TX_YEAR[i]<=1995){heart.df.cleaned$TX_YEAR_CAT[i]<-"A"}
# else if(heart.df.cleaned$TX_YEAR[i]>1995 && heart.df.cleaned$TX_YEAR[i]<=2000){heart.df.cleaned$TX_YEAR_CAT[i]<-"B"}
# else if(heart.df.cleaned$TX_YEAR[i]>2000 && heart.df.cleaned$TX_YEAR[i]<=2005){heart.df.cleaned$TX_YEAR_CAT[i]<-"C"}
# else if(heart.df.cleaned$TX_YEAR[i]>2005){heart.df.cleaned$TX_YEAR_CAT[i]<-"D"}
# }
# }
# heart.df.cleaned$TX_YEAR<-NULL
# the following block is our variable manipulation based on the other literature as specified in Ali Dag's paper
heart.df.cleaned$ETH_MAT<-NA
for(i in 1:nrow(heart.df.cleaned)){
if(!is.na(heart.df.cleaned$ETHCAT[i])){
if(!is.na(heart.df.cleaned$ETHCAT_DON[i])){
if(heart.df.cleaned$ETHCAT_DON[i]==heart.df.cleaned$ETHCAT[i]){
heart.df.cleaned$ETH_MAT[i]<-"Y"}else{heart.df.cleaned$ETH_MAT[i]<-"N"}
}
}
}
heart.df.cleaned$GENDER_MAT<-NA
for(i in 1:nrow(heart.df.cleaned)){
if(!is.na(heart.df.cleaned$GENDER[i])){
if(!is.na(heart.df.cleaned$GENDER_DON[i])){
if(heart.df.cleaned$GENDER[i]==heart.df.cleaned$GENDER_DON[i]){
heart.df.cleaned$GENDER_MAT[i]<-"Y"}else{heart.df.cleaned$GENDER_MAT[i]<-"N"}
}
}
}
# PROC_TY_HR, from literature
val_old<-c(1,2)
val_new<-c("Bicaval","Traditional")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PROC_TY_HR",val_old,val_new)
# SHARE_TY, ALLOCATION TYPE-LOCAL/REGIONAL/NATIONAL - 3=LOCAL/4=REGIONAL/5=NATIONAL/6=FOREIGN
val_old<-c(3,4)
val_new<-c("LOCAL","REGIONAL")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="SHARE_TY",val_old,val_new)
# EDUCATION
val_old<-c(1,2,3,4,5,6,996,998)
val_new<-c("a","a","b","c","d","d",NA,NA)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="EDUCATION",val_old,val_new)
# ETHCAT, ethnicity of recepients
val_old<-c(1,2,4,5,6,7,9,998)
val_new<-c("w","b","h","o","o","o","o",NA)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="ETHCAT",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="ETHCAT_DON",val_old,val_new)
# the following variables identifies if they are hispanic/latino or not
# ETHNICITY
# I dropped PRI_PAYMENT_CTRY_TRR and PRI_PAYMENT_CTRY_TRR because too many NAs
heart.df.cleaned[c("PRI_PAYMENT_CTRY_TCR","PRI_PAYMENT_CTRY_TRR")]<-NULL
val_old<-c(1,2,3,4,5,6,7,8,9,10,11,12,13,14)
val_new<-c("pv","pbma","pbmcffs","pbmoth","pbmoth","pbmoth","pbmoth","other","other","other","other","other","other","other")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PRI_PAYMENT_TCR",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PRI_PAYMENT_TRR",val_old,val_new)
val_old<-c(1,2,3,4,5,6,7,8,9,10,11)
val_new<-c("NE","NE","SE","SE","W","W","MW","MW","NE","MW","SE")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="REGION",val_old,val_new)
# here we recategrize FUNC_STAT_TCR, based on their activitiy level and hospitalization status
# https://www.communitycarenc.org/media/tool-resource-files/what-does-it-take-qualify-personal-care-services-d.pdf
# http://www.npcrc.org/files/news/karnofsky_performance_scale.pdf
val_old<-c(1,2,3,996,998,2010,2020,2030,2040,2050,2060,2070,2080,2090,2100)
val_new<-c("A","B","B",NA,NA,"C","C","C","C","D","D","D","E","E","E")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="FUNC_STAT_TRR",val_old,val_new)
# level OTHER in the next variable has low observations but it does not seem to be merged with other categories
# I rather keep them than drop them because if this category will not important anyway and not selected as an important feature
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="FUNC_STAT_TCR",val_old,val_new)
In the following section variables are recategorized, drived, and developed based on studying their definitions and distributions:
#these variables are not useful because their (pattern of) distribution is not helpful
# I drop maligancy (MALIG_TY,MALIG_TY_TCR) variables because the levels of categories are not distinguishable very well, also there is too many NAs
#one way is divide it by larger categories like different cancers, but even after this the number of observations in the levels
# will be too low.
heart.df.cleaned[ c("WL_ID_CODE", "WL_ORG","INIT_DATE","TX_DATE","CTR_CODE","DATA_TRANSPLANT",
"DATA_WAITLIST","DISTANCE", "DON_RETYP","ECD_DONOR","END_OPO_CTR_CODE","HOME_STATE_DON",
"INIT_OPO_CTR_CODE", "INOTROP_VASO_CO_TRR","INOTROP_VASO_DIA_TRR","INOTROP_VASO_MN_TRR",
"INOTROP_VASO_PCW_TRR","INOTROP_VASO_SYS_TCR","INOTROP_VASO_SYS_TRR","LISTING_CTR_CODE","LOS",
"MALIG_TY","MALIG_TY_TCR","OPO_CTR_CODE","ORGAN","OTH_LIFE_SUP_OSTXT_TCR","OTH_LIFE_SUP_OSTXT_TRR",
"PERM_STATE","PRIOR_CARD_SURG_TYPE_OSTXT_TCR","PRIOR_CARD_SURG_TYPE_OSTXT_TRR","PT_CODE",
"TRR_ID_CODE")]<-NULL
# Here the NA equivalent characters would change to NA
{
NA_cells<-c(""," ","U")
for(i in 1:length(NA_cells)){
heart.df.cleaned[heart.df.cleaned == NA_cells[i]] <- NA
gc()}
}
# here the categorical variables that are fine and we use them for feature selection
# ABO_MAT (the last category just has 13 observations), ANTIHYPE_DON , BLOOD_INF_DON, CARDARREST_NEURO
# CLIN_INFECT_DON, CONTIN_CIG_DON, CONTIN_COCAINE_DON, CONTIN_OTH_DRUG_DON , CRSMATCH_DONE, DDAVP_DON, DIABETES_DON
# DIAL_AFTER_LIST , DIAL_PRIOR_TX, DIET_DON, GSTATUS, GTIME,HEPARIN_DON,HIST_CIG_DON,HIST_COCAINE_DON
# HIST_HYPERTENS_DON, HIST_OTH_DRUG_DON, IABP_TCR ,IABP_TRR, IMPL_DEFIBRIL, INOTROPES_TCR ,INOTROPES_TRR, INOTROPIC
# check: https://en.wikipedia.org/wiki/Inotrope, LIFE_SUP_TRR, LIFE_SUP_TCR, LV_EJECT_METH, ,"MALIG", "MALIG_TCR", "MED_COND_TRR"
# ONVENT,"OTH_LIFE_SUP_TCR","OTH_LIFE_SUP_TRR","OTHER_HYPERTENS_MED_DON", "OTHER_INF_DON","PRIOR_LUNG_SURG_TRR","PROTEIN_URINE",
# "PT_DIURETICS_DON","PT_STEROIDS_DON","PT_T3_DON","PT_T4_DON","PULM_CATH_DON","PULM_INF_DON","STEROID","SUD_DEATH","TATTOOS","TRTREJ1Y" ,
# "URINE_INF_CONF_DON","URINE_INF_DON","VASODIL_DON" ,"VENT_SUPPORT_AFTER_LIST","VENT_SUPPORT_TRR"
# VENT_TIMEFRAME_TRR # 1: At time of transplant, 2:Within three months of transplant, 3: > three months prior to transplant
# "VENTILATOR_TCR"
#here the numerical variable the we use them for feature selection
# BUN_DON CREAT_DON
# patients number of days in different levels of ptiorities:
# "DAYS_STAT1", "DAYS_STAT1A" , "DAYS_STAT1B" , "DAYS_STAT2"
# Slaughter, Mark S. "UNOS status of heart transplant patients supported with a left ventricular assist device:
# is it time to reconsider the status criteria?." Texas Heart Institute Journal 38.5 (2011): 549.
# DAYSWAIT_CHRON, END_BMI_CALC, HEMO_CO_TCR,HEMO_PA_DIA_TCR,HEMO_PA_DIA_TRR,HEMO_PA_MN_TCR,HEMO_PCW_TCR,HEMO_SYS_TCR
# LV_EJECT :https://www.uwhealth.org/health/topic/special/heart-failure-with-reduced-ejection-fraction-systolic-heart-failure/tx4090abc.html
# "PO2" ,"PO2_FIO2_DON", "SGOT_DON","SGPT_DON", "TBILI_DON", "TOT_SERUM_ALBUM"
val_old<-c(1,2,3,4,5,6,7,998)
val_new<-c("NO","BNOR","BAPS","BAOTH","BAOTH","BAOTH","BAOTH",NA)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="BRONCHO_LT_DON",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="BRONCHO_RT_DON",val_old,val_new)
val_old<-c(0,1,2,3,4,5,998,999)
val_new<-c(NA,NA,"NOR","AB","AB","ABboth",NA,NA)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="CHEST_XRAY_DON",val_old,val_new)
val_old<-c("C","I","N","ND","P","U")
val_new<-c(NA,NA,"Neg",NA,"POS",NA)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="CMV_DON",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="EBV_SEROSTATUS",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HBV_CORE",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HBV_CORE_DON",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HBV_SUR_ANTIGEN",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HCV_SEROSTATUS",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HEP_C_ANTI_DON",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HTLV2_OLD_DON",val_old,val_new)
# the next one does not have SERStAT how ever after checking it's values I understand I can use same function as above
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HIV_SEROSTATUS",val_old,val_new)
val_old<-c(1,2,3)
val_new<-c("NO", "YES","YES")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="CORONARY_ANGIO",val_old,val_new)
val_old<-c(1,2,3,4,5,998)
val_new<-c("NO", "YES","YES","YES","YES",NA)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HIST_DIABETES_DON",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="HYPERTENS_DUR_DON",val_old,val_new)
# although 2099 has low frequeency I can not merge due to this paper:
# Huang, Edmund, et al. "Incidence of conversion to active waitlist status among temporarily inactive obese renal transplant candidates."
# Transplantation 98.2 (2014): 177-186.
val_old<-c(2010,2020,2030,2090,2999)
val_new<-c("ONE", "ONE","TWO","ONE","OTHER")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="END_STAT",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="INIT_STAT",val_old,val_new)
# LAST_INACT_REASON definition , I got it from: https://www.srtr.org/requesting-srtr-data/saf-data-dictionary/
# 1: Candidate cannot be contacted
# 2: Candidate choice
# 3: Candidate work-up incomplete
# 4: Insurance issues
# 5: Medical non-compliance
# 6: Inappropriate substance use
# 7: Temporarily too sick
# 8: Temporarily too well
# 9: Weight currently inappropriate for transplant
# 10: TX'ed - removal pending UNET data correction
# 11: Inactivation due to VAD implantation and/or VAD complication
# 12: TX Pending
# 13: Physician/Surgeon unavailable
# 14: Candidate for living donor transplant only
# I regroup if it's inactive because of health issue
val_old<-c(1,2,3,4,5,6,7,8,9,10,11,12,13,14)
val_new<-c("ONE", "ONE","ONE","ONE","ONE", "ONE","TWO","ONE","TWO", "ONE","ONE","ONE","ONE","ONE")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="LAST_INACT_REASON",val_old,val_new)
val_old<-c(0,1,2,3,4,5,6,7,8,9,10)
val_new<-c("ZERO", "MORE","MORE","MORE","MORE", "MORE","MORE","MORE","MORE", "MORE", "MORE")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="NUM_PREV_TX",val_old,val_new)
# CABG: Coronary artery bypass graft
val_old<-c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31)
val_new<-c("CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG","OTHER", "CABG","VALV",
"CABG","OTHER", "CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG","OTHER",
"CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PRIOR_CARD_SURG_TYPE_TCR",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PRIOR_CARD_SURG_TYPE_TRR",val_old,val_new)
# search for procurement in this form: All_Forms_eg_RH.pdf
# CABG: Coronary artery bypass graft
val_old<-c("HEPARIN","ANCEF","DOPAMINE",4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31)
val_new<-c("CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG","OTHER", "CABG","VALV",
"CABG","OTHER", "CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG","OTHER",
"CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG","OTHER", "CABG","VALV", "CABG")
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PRIOR_CARD_SURG_TYPE_TCR",val_old,val_new)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="PRIOR_CARD_SURG_TYPE_TRR",val_old,val_new)
# I change the following medicines to "HEPARIN", "ANCEF", "DOPAMINE", "ZOSYN", these are the most reapeted medicines
temp<-heart.df.cleaned[c("PT_OTH1_OSTXT_DON","PT_OTH2_OSTXT_DON","PT_OTH3_OSTXT_DON","PT_OTH4_OSTXT_DON")]
new_vars<-temp
names(new_vars)<-c("HEPARIN", "ANCEF", "DOPAMINE", "ZOSYN")
new_vars[,]<-NA
for(i in 1:nrow(temp)){
if(row_missing_function(temp[i,])<1){new_vars[i,]<-"NO"}}
for(i in 1:nrow(temp)){
if("HEPARIN" %in% gsub("[[:space:]]", "",c(strsplit(temp[i,1],",")[[1]],strsplit(temp[i,2],",")[[1]],strsplit(temp[i,3],",")[[1]],strsplit(temp[i,4],",")[[1]]))){
new_vars[i,1]<-"YES"}
if("ANCEF" %in% gsub("[[:space:]]", "",c(strsplit(temp[i,1],",")[[1]],strsplit(temp[i,2],",")[[1]],strsplit(temp[i,3],",")[[1]],strsplit(temp[i,4],",")[[1]]))){
new_vars[i,2]<-"YES"}
if("DOPAMINE" %in% gsub("[[:space:]]", "",c(strsplit(temp[i,1],",")[[1]],strsplit(temp[i,2],",")[[1]],strsplit(temp[i,3],",")[[1]],strsplit(temp[i,4],",")[[1]]))){
new_vars[i,3]<-"YES"}
if("ZOSYN" %in% gsub("[[:space:]]", "",c(strsplit(temp[i,1],",")[[1]],strsplit(temp[i,2],",")[[1]],strsplit(temp[i,3],",")[[1]],strsplit(temp[i,4],",")[[1]]))){
new_vars[i,4]<-"YES"}
}
heart.df.cleaned[c("PT_OTH1_OSTXT_DON","PT_OTH2_OSTXT_DON","PT_OTH3_OSTXT_DON","PT_OTH4_OSTXT_DON")]<-NULL
heart.df.cleaned<-cbind(heart.df.cleaned,new_vars)
val_old<-c(1,2,3,998)
val_new<-c("ONE", "MORE","MORE",NA)
heart.df.cleaned<-cat_changer(heart.df.cleaned,var="STERNOTOMY_TRR",val_old,val_new)
rm(list=c("new_vars"))
#===================================================================
# Here I drop columns that 90% of their data is NA
NA_Col_Rate<-col_missing_function(heart.df.cleaned)
NA_Col_Rate$varname<-rownames(NA_Col_Rate)
NA_Col_Rate<-NA_Col_Rate[which(NA_Col_Rate$na_count_col>0.9),]
NA_Col_Rate<-NA_Col_Rate$varname
heart.df.cleaned[NA_Col_Rate]<-NULL
#===================================================================
# Here are drop variables that more than 90% of the observations are in one category
cat_dis <- vector(mode="numeric", length=ncol(heart.df.cleaned))
for(i in 1:ncol(heart.df.cleaned)){
struct<-as.data.frame(table(heart.df.cleaned[i]))
max_cat<-max(struct$Freq)
all_freq<-sum(struct$Freq)
if((max_cat/all_freq)>0.9){cat_dis[i]<-1}
}
heart.df.cleaned[(cat_dis==1)]<-NULL
#===================================================================
#temp_save
# Here I define categorical and numerical variables
initial_num<-heart.form$VARIABLE.NAME[which(heart.form$INTERPRETATION_TYPE=="NUM")]
mutated_num<-c("BMI_CHNG" ,"WGT_CHNG" ,"HGT_CHNG" ,"AGE_MAT","BMI_MAT","PVR")
pool_num<-c(as.character(initial_num),mutated_num)
# the below is the numerical variables in the cleaned dataset
pool_num_clean<-pool_num[which(pool_num %in% names(heart.df.cleaned))]
initial_char<-heart.form$VARIABLE.NAME[which(heart.form$INTERPRETATION_TYPE=="CHAR")]
mutated_char<-c("GENDER_MAT","ETH_MAT" ,"CARD_SURG" ,"HEPARIN" ,"ANCEF" ,"DOPAMINE","ZOSYN")
pool_char<-c(as.character(initial_char),mutated_char)
# the below is the character variables in the cleaned dataset
pool_char_clean<-pool_char[which(pool_char %in% names(heart.df.cleaned))]
#===================================================================
# Here are drop the numerical variables that more than 30% of the observations are NA
# we decided not to impute numerical values so a more conservative approach is adopted
NA_Col_Rate<-col_missing_function(heart.df.cleaned[pool_num_clean])
NA_Col_Rate$varname<-rownames(NA_Col_Rate)
NA_Col_Rate<-NA_Col_Rate[which(NA_Col_Rate$na_count_col>0.3),]
NA_Col_Rate<-NA_Col_Rate$varname
heart.df.cleaned[NA_Col_Rate]<-NULL
#===================================================================
# the below is the numerical variables in the cleaned dataset (I update it after dropping some variables)
pool_num_clean<-pool_num[which(pool_num %in% names(heart.df.cleaned))]
#save1
Here are a report about the remaining variables (discarded means irrelevant/not interesting variables)
rem_type2<-as.data.frame(table(heart.form[which(heart.form$VARIABLE.NAME %in% names(heart.df.cleaned)), "INTERPRETATION_TYPE"]))
names(rem_type2)<-c("Variable Type","Frequency")
rem_type2$`Variable Type`<-c("Categorical","Initially Discarded","Date","Numerical")
cat("here is the brand new variables that I developed from the dataset")
## here is the brand new variables that I developed from the dataset
str(heart.df.cleaned[,c("PVR","BMI_CHNG","WGT_CHNG","HGT_CHNG","AGE_MAT","BMI_MAT","GENDER_MAT","ETH_MAT","CARD_SURG","HEPARIN", "ANCEF", "DOPAMINE", "ZOSYN")])
## 'data.frame': 48442 obs. of 13 variables:
## $ PVR : num NA NA NA NA NA ...
## $ BMI_CHNG : num 0 0 0 0 0 ...
## $ WGT_CHNG : num 0 0 0 0 0 ...
## $ HGT_CHNG : num 0 0 0 0 0 ...
## $ AGE_MAT : num 28 26 19 32 28 24 14 6 43 19 ...
## $ BMI_MAT : num 5.197 9.388 2.725 0.249 5.64 ...
## $ GENDER_MAT: chr "Y" "N" "Y" "N" ...
## $ ETH_MAT : chr "N" "Y" "N" "Y" ...
## $ CARD_SURG : chr NA NA NA NA ...
## $ HEPARIN : chr NA NA NA NA ...
## $ ANCEF : chr NA NA NA NA ...
## $ DOPAMINE : chr NA NA NA NA ...
## $ ZOSYN : chr NA NA NA NA ...
cat("Number of patients: ",nrow(heart.df.cleaned),"& Number of variables: ",ncol(heart.df.cleaned))
## Number of patients: 48442 & Number of variables: 123
In the following section is for identifying holdout and train sets and making them ready for the last two sections which feature selection and prediction:
# here I made 11 consecutive dependent variables, month1, year1, year2, year3, ..., year10
{
p_unit<-c(1/12,1,2,3,4,5,6,7,8,9,10)
predict_length<-365
heart.df.cleaned$year0<-NA
heart.df.cleaned$year1<-NA
heart.df.cleaned$year2<-NA
heart.df.cleaned$year3<-NA
heart.df.cleaned$year4<-NA
heart.df.cleaned$year5<-NA
heart.df.cleaned$year6<-NA
heart.df.cleaned$year7<-NA
heart.df.cleaned$year8<-NA
heart.df.cleaned$year9<-NA
heart.df.cleaned$year10<-NA
heart.df.cleaned$GSTATUS<-heart.df.cleaned$GSTATUS
heart.df.cleaned$GTIME<-heart.df.cleaned$GTIME
heart.df.cleaned$GSTATUS<-as.character(heart.df.cleaned$GSTATUS)
heart.df.cleaned<-heart.df.cleaned[complete.cases(heart.df.cleaned$GTIME),]
for(i in 1:nrow(heart.df.cleaned)){
heart.df.cleaned$year0[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[1],predict_length)
heart.df.cleaned$year1[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[2],predict_length)
heart.df.cleaned$year2[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[3],predict_length)
heart.df.cleaned$year3[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[4],predict_length)
heart.df.cleaned$year4[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[5],predict_length)
heart.df.cleaned$year5[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[6],predict_length)
heart.df.cleaned$year6[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[7],predict_length)
heart.df.cleaned$year7[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[8],predict_length)
heart.df.cleaned$year8[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[9],predict_length)
heart.df.cleaned$year9[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[10],predict_length)
heart.df.cleaned$year10[i]<-class_generator_bino(heart.df.cleaned$GSTATUS[i], heart.df.cleaned$GTIME[i] ,p_unit[11],predict_length)
}
#heart.df.cleaned<-heart.df.cleaned[complete.cases(heart.df.cleaned$year10),]
heart.df.cleaned$GSTATUS<-NULL
heart.df.cleaned$GTIME<-NULL
}
#temp
#####################
# I update numerical and categorical variables here:
pool_num_clean<-pool_num[which(pool_num %in% names(heart.df.cleaned))]
pool_char_clean<-pool_char[which(pool_char %in% names(heart.df.cleaned))]
heart.df.cleaned<-heart.df.cleaned[complete.cases(heart.df.cleaned[pool_num_clean]),]
# I made a new version of data and then drop the numerical NAs and change the NA in categorical data to "UNKOWN"
heart.df.cleaned_char<-heart.df.cleaned[pool_char_clean]
heart.df.cleaned_char[is.na(heart.df.cleaned_char)] <- "UNKOWN"
heart.df.cleaned_num<-heart.df.cleaned[pool_num_clean]
heart.df.cleaned<-cbind(heart.df.cleaned_num,heart.df.cleaned_char,heart.df.cleaned[c("year0", "year1", "year2", "year3",
"year4", "year5", "year6", "year7","year8", "year9", "year10","ID")])
pool_char_clean<-c(pool_char_clean,"year0", "year1", "year2", "year3", "year4", "year5", "year6", "year7","year8", "year9", "year10")
rm(list=c("heart.df.cleaned_char","heart.df.cleaned_num"))
#next part for ensuring if type of each variable is recorded corrctly
for(i in names(heart.df.cleaned)){
if(i %in% pool_char_clean){heart.df.cleaned[i]<-as.character(heart.df.cleaned[,i])}
if(i %in% pool_num_clean){heart.df.cleaned[i]<-as.numeric(heart.df.cleaned[,i])}
}
{
keep_NA <- list()
# Here I keep ID
keep_NA$ID0 <- heart.df.cleaned[!is.na(heart.df.cleaned$year0),"ID"]
keep_NA$ID1 <- heart.df.cleaned[!is.na(heart.df.cleaned$year1),"ID"]
keep_NA$ID2 <- heart.df.cleaned[!is.na(heart.df.cleaned$year2),"ID"]
keep_NA$ID3 <- heart.df.cleaned[!is.na(heart.df.cleaned$year3),"ID"]
keep_NA$ID4 <- heart.df.cleaned[!is.na(heart.df.cleaned$year4),"ID"]
keep_NA$ID5 <- heart.df.cleaned[!is.na(heart.df.cleaned$year5),"ID"]
keep_NA$ID6 <- heart.df.cleaned[!is.na(heart.df.cleaned$year6),"ID"]
keep_NA$ID7 <- heart.df.cleaned[!is.na(heart.df.cleaned$year7),"ID"]
keep_NA$ID8 <- heart.df.cleaned[!is.na(heart.df.cleaned$year8),"ID"]
keep_NA$ID9 <- heart.df.cleaned[!is.na(heart.df.cleaned$year9),"ID"]
keep_NA$ID10 <- heart.df.cleaned[!is.na(heart.df.cleaned$year10),"ID"]
all_sizes <- unname(unlist(lapply(keep_NA, length)))
test_sizes <- round(all_sizes*0.2)
train_sizes <- all_sizes - test_sizes
# I select the holdout sets that contain the year 10 test data
# Then I exclude those IDs from each year to find IDs for other train sets
set.seed(2019)
keep_NA$ID_holdout10 <- sample(keep_NA$ID10,test_sizes[11])
keep_NA$ID_holdout9 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID9, keep_NA$ID_holdout10), (test_sizes[10]-length(keep_NA$ID_holdout10))))
keep_NA$ID_holdout8 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID8, keep_NA$ID_holdout10), (test_sizes[9]-length(keep_NA$ID_holdout10))))
keep_NA$ID_holdout7 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID7, keep_NA$ID_holdout10), (test_sizes[8]-length(keep_NA$ID_holdout10))))
keep_NA$ID_holdout6 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID6, keep_NA$ID_holdout10), (test_sizes[7]-length(keep_NA$ID_holdout10))))
keep_NA$ID_holdout5 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID5, keep_NA$ID_holdout10), (test_sizes[6]-length(keep_NA$ID_holdout10))))
keep_NA$ID_holdout4 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID4, keep_NA$ID_holdout10), (test_sizes[5]-length(keep_NA$ID_holdout10))))
keep_NA$ID_holdout3 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID3, keep_NA$ID_holdout10), (test_sizes[4]-length(keep_NA$ID_holdout10))))
keep_NA$ID_holdout2 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID2, keep_NA$ID_holdout10), (test_sizes[3]-length(keep_NA$ID_holdout10))))
keep_NA$ID_holdout1 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID1, keep_NA$ID_holdout10), (test_sizes[2]-length(keep_NA$ID_holdout10))))
keep_NA$ID_holdout0 <- c(keep_NA$ID_holdout10, sample(setdiff(keep_NA$ID0, keep_NA$ID_holdout10), (test_sizes[1]-length(keep_NA$ID_holdout10))))
keep_NA$ID_train0 <- setdiff(keep_NA$ID0, keep_NA$ID_holdout0)
keep_NA$ID_train1 <- setdiff(keep_NA$ID1, keep_NA$ID_holdout1)
keep_NA$ID_train2 <- setdiff(keep_NA$ID2, keep_NA$ID_holdout2)
keep_NA$ID_train3 <- setdiff(keep_NA$ID3, keep_NA$ID_holdout3)
keep_NA$ID_train4 <- setdiff(keep_NA$ID4, keep_NA$ID_holdout4)
keep_NA$ID_train5 <- setdiff(keep_NA$ID5, keep_NA$ID_holdout5)
keep_NA$ID_train6 <- setdiff(keep_NA$ID6, keep_NA$ID_holdout6)
keep_NA$ID_train7 <- setdiff(keep_NA$ID7, keep_NA$ID_holdout7)
keep_NA$ID_train8 <- setdiff(keep_NA$ID8, keep_NA$ID_holdout8)
keep_NA$ID_train9 <- setdiff(keep_NA$ID9, keep_NA$ID_holdout9)
keep_NA$ID_train10 <- setdiff(keep_NA$ID10, keep_NA$ID_holdout10)
# here I make dummy variables based different data sets by making dummy variables over categorical variables
exclud<-c("year0","year1", "year2", "year3", "year4", "year5", "year6", "year7","year8", "year9", "year10","ID")
var_ind_char<-pool_char_clean[!pool_char_clean %in% exclud]
# the following section is for creating dummy maker variables for indipendent categorical variables
heart.df.cleaned.dum<-dummy_maker(heart.df.cleaned,var_ind_char)
}
#save2
The following section is for feature selection:
# for checking th definition of the feature selection algorithms, check this paper:
# Fonti, Valeria, and Eduard Belitser. "Feature Selection using LASSO."
features<-list()
#=============================================================
#==============Fast Feature selection
#=============================================================
features$FFS<-list()
# for(i in 0:10){
# data<-heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
# keep_NA[[paste("ID",i,sep="")]],c(names(heart.df.cleaned.dum)[
# !names(heart.df.cleaned.dum) %in% exclud],paste("year",i,sep=""))]
# features$FFS[[paste("year",i,sep="")]]<-FFS_bin(data)
# }
tasks_FFS <- list(
FFS_0 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID0"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud])])},
FFS_1 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID1"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud])])},
FFS_2 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID2"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud])])},
FFS_3 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID3"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud])])},
FFS_4 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID4"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud])])},
FFS_5 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID5"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud])])},
FFS_6 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID6"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud])])},
FFS_7 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID7"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud])])},
FFS_8 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID8"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud])])},
FFS_9 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID9"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud])])},
FFS_10 = function() {FFS_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID10"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud])])}
)
out_FFS <- parallel::mclapply(
tasks_FFS,
function(f) f())
for(i in 0:10){
features$FFS[[paste("year",i,sep="")]]<-eval(parse(text = paste("out_FFS$FFS_",i,sep = "")))
}
#=============================================================
#==============Lasso Feature selection for Binomial TARGETS
#=============================================================
# LASSO takes more time so I do it in parallel
features$LASSO<-list()
# i did parallel processing fo this part
tasks_LASSO <- list(
LASSO_0 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID0"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year0")],
yvar="year0",folds=5,trace=F,alpha=1)},
LASSO_1 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID1"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year1")],
yvar="year1",folds=5,trace=F,alpha=1)},
LASSO_2 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID2"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year2")],
yvar="year2",folds=5,trace=F,alpha=1)},
LASSO_3 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID3"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year3")],
yvar="year3",folds=5,trace=F,alpha=1)},
LASSO_4 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID4"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year4")],
yvar="year4",folds=5,trace=F,alpha=1)},
LASSO_5 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID5"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year5")],
yvar="year5",folds=5,trace=F,alpha=1)},
LASSO_6 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID6"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year6")],
yvar="year6",folds=5,trace=F,alpha=1)},
LASSO_7 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID7"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year7")],
yvar="year7",folds=5,trace=F,alpha=1)},
LASSO_8 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID8"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year8")],
yvar="year8",folds=5,trace=F,alpha=1)},
LASSO_9 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID9"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year9")],
yvar="year9",folds=5,trace=F,alpha=1)},
LASSO_10 = function() {Lasso_bin(df=heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID10"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year10")],
yvar="year10",folds=5,trace=F,alpha=1)}
)
out_LASSO <- parallel::mclapply(
tasks_LASSO,
function(f) f())
for(i in 0:10){
features$LASSO[[paste("year",i,sep="")]]<-eval(parse(text = paste("out_LASSO$LASSO_",i,sep = "")))
}
#=============================================
#==============Random Forest Feature Selection
#=============================================
# RF takes more time so I do it in parallel
features$RF<-list()
# feature selection with rnadom forrest takes a significant time, I already did this part and load
# from github, however by changing "pre_calculated_RF_features" to know, you can run this feature
# selection algorithm
pre_calculated_RF_features<-"YES"
#pre_calculated_RF_features<-"NO"
if(pre_calculated_RF_features=="NO"){
# I did parallel processing fo this part
tasks_RF <- list(
RF_0 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID0"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year0")],
TARGET="year0")},
RF_1 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID1"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year1")],
TARGET="year1")},
RF_2 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID2"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year2")],
TARGET="year2")},
RF_3 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID3"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year3")],
TARGET="year3")},
RF_4 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID4"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year4")],
TARGET="year4")},
RF_5 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID5"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year5")],
TARGET="year5")},
RF_6 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID6"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year6")],
TARGET="year6")},
RF_7 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID7"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year7")],
TARGET="year7")},
RF_8 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID8"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year8")],
TARGET="year8")},
RF_9 = function() {RF_bin(!nheart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID9"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year9")],
TARGET="year9")},
RF_10 = function() {RF_bin(heart.df.cleaned.dum[heart.df.cleaned.dum$ID %in%
keep_NA[["ID10"]],c(names(heart.df.cleaned.dum)[
!names(heart.df.cleaned.dum) %in% exclud],"year10")],
TARGET="year10")}
)
out_RF <- parallel::mclapply(
tasks_RF,
function(f) f())
for(i in 0:10){
features$RF[[paste("year",i,sep="")]]<-eval(parse(text = paste("out_RF$RF_",i,sep = "")))
}
}else{
for(i in 0:10){
features$RF[[paste("year",i,sep="")]]<-read.csv( paste("https://raw.githubusercontent.com/hamidahady/transplant/Hamid/data/features/RF_Features_year",i,".csv",sep = ""))
}
}
#now I merge all the features
features$all<-list()
for(i in 0:10){
features$all[[paste("year",i,sep="")]]<-merge(merge(features$FFS[[paste("year",i,sep="")]],features$LASSO[[paste("year",i,sep="")]],
all.x=TRUE, all.y=TRUE),features$RF[[paste("year",i,sep="")]],
all.x=TRUE, all.y=TRUE)
}
#save3
Aubrun University, hamid@auburn.edu↩
Miami University, fmegahed@miamioh.edu↩
University of Dayton, ychen4@udayton.edu↩