library("ggplot2")
library("ggdendro")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
# Directory holding the CSV extracts. Kept in one place so the script can be
# pointed at another location without editing every read.csv() call.
data_dir <- 'C:/Users/u6977/Desktop'

patientData <- read.csv(file.path(data_dir, 'Patient.csv'))
transplantData <- read.csv(file.path(data_dir, 'Transplant.csv'))

# NOTE(review): creatineData is loaded but not referenced anywhere in the
# portion of the script visible here -- confirm whether it is still needed.
creatineData <- read.csv(file.path(data_dir, 'SerumCreatinine.csv'))

# Keep only the first row for each id so that the join below matches at most
# one transplant row per id.
transplantdata2 <- transplantData[!duplicated(transplantData$id), ]

# Join patientData and transplantData
Data <- full_join(patientData, transplantdata2, by = "id")

# Extract some interesting variables
use <- c(
  'gendercode', 'creatinineatentry', 'height', 'weight', 'smokingcode',
  'cancerever', 'chroniclungcode', 'coronaryarterycode',
  'peripheralvascularcode', 'cerebrovasularcode', 'diabetescode', 'graftno',
  'transplantcentrestate', 'recipientantibodycmvcode',
  'recipientantibodyebvcode', 'donorsourcecode', 'donorage',
  'donorgendercode', 'ischaemia', 'ageattransplant', 'hlamismatchesa',
  'hlamismatchesb', 'hlamismatchesdr', 'hlamismatchesdq',
  'maxcytotoxicantibodies', 'currentcytotoxicantibodies', 'timeondialysis',
  'transplantstatus', 'aliveperiod', 'transplantperiod'
)

Data <- Data[, use]

# Recode some NA: empty strings and "-" placeholders become missing values.
Data[Data == ""] <- NA
Data[Data == "-"] <- NA

library(tidyverse)
library(dplyr)
library(naniar)

# Plot the missingness pattern of the selected columns.
naniar::vis_miss(Data)

R Markdown

Create a vector qualitative that contains the names of the qualitative (categorical) variables in the dataset. These are the variables that will be used in further analysis.

Subset the original dataset Data using the variable names in the qualitative vector and store this new dataset in qualitative_data.

Create dummy variables for the categorical variables in the qualitative_data dataset using the dummyVars function from the caret package. This function generates a dummy variable matrix, where each categorical variable is replaced by a set of binary (0/1) variables that indicate the presence or absence of each level (category) of the original variable.

Transform the qualitative_data dataset using the generated dummies matrix. This is done using the predict function, which applies the dummy variable transformation to the dataset. The transformed dataset is stored in data_dummies.

Merge the transformed dataset data_dummies with the original dataset qualitative_data using the cbind function. This function combines the two datasets column-wise (side-by-side) and stores the resulting dataset in data_final.

In summary, this R code is preparing a dataset by creating dummy variables for categorical variables and then merging the transformed data with the original data. This is a common preprocessing step when working with categorical data in statistical analysis or machine learning.

# Number of missing values per column, smallest count first.
Data %>%
  is.na() %>%
  colSums() %>%
  sort()
##                 gendercode                 cancerever 
##                          0                          0 
##                    graftno            ageattransplant 
##                          0                          0 
##             timeondialysis           transplantstatus 
##                          0                          0 
##                aliveperiod           transplantperiod 
##                          0                          0 
##         coronaryarterycode     peripheralvascularcode 
##                          4                          4 
##         cerebrovasularcode               diabetescode 
##                          4                          4 
##            chroniclungcode            donorsourcecode 
##                          5                          5 
##                   donorage                smokingcode 
##                         13                         15 
##      transplantcentrestate                     height 
##                         24                         25 
##                     weight   recipientantibodycmvcode 
##                         27                         35 
##   recipientantibodyebvcode             hlamismatchesa 
##                         38                         38 
##             hlamismatchesb            hlamismatchesdr 
##                         38                         41 
## currentcytotoxicantibodies            donorgendercode 
##                         53                         54 
##                  ischaemia     maxcytotoxicantibodies 
##                         59                         75 
##          creatinineatentry            hlamismatchesdq 
##                         85                        740
# hlamismatchesdq is missing far more often than any other column (740 in the
# counts above), so drop the column before removing incomplete rows.
Data <- Data %>% dplyr::select(-hlamismatchesdq)
vis_miss(Data)

# Keep complete rows only, then discard factor levels that no longer occur.
Data <- droplevels(na.omit(Data))
vis_miss(Data)

Including Plots

You can also embed plots, for example:

# Columns the analysis labels as qualitative.
qualitative <- c(
  'gendercode', 'smokingcode', 'cancerever', 'chroniclungcode',
  'coronaryarterycode', 'peripheralvascularcode', 'cerebrovasularcode',
  'diabetescode', 'donorsourcecode', 'donorgendercode', 'transplantstatus',
  'aliveperiod'
)

# Restrict the cleaned data to those columns.
qualitative_data <- Data[, qualitative]

# Build the dummy-variable encoder over every selected column and apply it.
# qualitative_data already consists of exactly the `qualitative` columns, so
# it is passed as-is.
dummies <- dummyVars(~ ., data = qualitative_data)
data_dummies <- predict(dummies, newdata = qualitative_data)

# Put the encoded columns side by side with the original ones.
data_final <- cbind(qualitative_data, data_dummies)


# Fit a multiple linear regression of aliveperiod on the qualitative
# predictors. The original (non-dummy) columns of data_final are referenced
# directly; lm() expands them into per-level coefficients itself, as the
# coefficient table below shows.
# The response must not be repeated on the right-hand side: listing
# aliveperiod there only made model.matrix() drop the term with a warning,
# so removing it leaves the fitted model unchanged.
model2 <- lm(
  aliveperiod ~ gendercode + smokingcode + cancerever + chroniclungcode +
    coronaryarterycode + peripheralvascularcode + cerebrovasularcode +
    diabetescode + donorsourcecode + donorgendercode + transplantstatus,
  data = data_final
)
summary(model2)
## 
## Call:
## lm(formula = aliveperiod ~ gendercode + smokingcode + cancerever + 
##     chroniclungcode + coronaryarterycode + peripheralvascularcode + 
##     cerebrovasularcode + diabetescode + donorsourcecode + donorgendercode + 
##     transplantstatus, data = data_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3261.7 -1351.2   -72.8  1158.9  4190.1 
## 
## Coefficients:
##                                                       Estimate Std. Error
## (Intercept)                                           2130.029   1709.551
## gendercodeM                                            -32.382    144.662
## smokingcodeFormer                                      319.735    266.157
## smokingcodeNever                                       244.353    247.895
## smokingcodeUnknown                                   -1816.892   1221.693
## cancereverYes                                          369.496    215.777
## chroniclungcodeS                                       574.055    782.230
## chroniclungcodeY                                       453.925    377.270
## coronaryarterycodeS                                   -770.839    435.829
## coronaryarterycodeY                                   -384.167    254.856
## peripheralvascularcodeS                               -949.002    613.201
## peripheralvascularcodeY                               -175.373    328.954
## cerebrovasularcodeS                                    411.878    659.249
## cerebrovasularcodeY                                    430.119    416.542
## diabetescodeO                                         -103.225    258.847
## diabetescodeP                                         -738.121    302.968
## diabetescodeQ                                         -950.654    290.455
## donorsourcecodeBrother                                 511.567   1730.476
## donorsourcecodeBrother-in-law                         2459.651   2451.393
## donorsourcecodeCousin                                  413.327   1795.902
## donorsourcecodeDaughter                                 55.030   1827.719
## donorsourcecodeDeceased                                  7.100   1692.329
## donorsourcecodeDirected kidney exchange              -1626.317   1887.366
## donorsourcecodeFather                                 1041.985   1721.498
## donorsourcecodeFather-in-law                          -243.510   2081.331
## donorsourcecodeFriend                                   84.581   1746.899
## donorsourcecodeGrandfather                            2177.618   2388.473
## donorsourcecodeHusband                                1426.172   1743.658
## donorsourcecodeMonzygotic (Identical Twin Boy)        2833.361   2393.245
## donorsourcecodeMonzygotic (Identical Twin Girl)       -554.680   2423.839
## donorsourcecodeMother                                  623.203   1712.093
## donorsourcecodeNiece                                  -151.752   2070.222
## donorsourcecodeNon-directed, waiting list             -716.457   1893.909
## donorsourcecodeOther related (Emotionally - specify)  1206.923   2388.468
## donorsourcecodePartner                                -879.696   1886.853
## donorsourcecodePathological                            -57.448   1912.964
## donorsourcecodeSister                                    7.092   1730.140
## donorsourcecodeSister-in-law                          1077.809   1961.270
## donorsourcecodeSon                                     558.100   1828.556
## donorsourcecodeStepfather                             1718.439   2387.144
## donorsourcecodeStepmother                             1978.980   2398.896
## donorsourcecodeUncle                                   879.134   1893.736
## donorsourcecodeWife                                    169.113   1720.105
## donorgendercodeM                                       102.561    140.368
## transplantstatus                                       328.077    161.321
##                                                      t value Pr(>|t|)   
## (Intercept)                                            1.246  0.21329   
## gendercodeM                                           -0.224  0.82295   
## smokingcodeFormer                                      1.201  0.23013   
## smokingcodeNever                                       0.986  0.32469   
## smokingcodeUnknown                                    -1.487  0.13752   
## cancereverYes                                          1.712  0.08737 . 
## chroniclungcodeS                                       0.734  0.46333   
## chroniclungcodeY                                       1.203  0.22940   
## coronaryarterycodeS                                   -1.769  0.07748 . 
## coronaryarterycodeY                                   -1.507  0.13226   
## peripheralvascularcodeS                               -1.548  0.12227   
## peripheralvascularcodeY                               -0.533  0.59416   
## cerebrovasularcodeS                                    0.625  0.53237   
## cerebrovasularcodeY                                    1.033  0.30223   
## diabetescodeO                                         -0.399  0.69020   
## diabetescodeP                                         -2.436  0.01514 * 
## diabetescodeQ                                         -3.273  0.00113 **
## donorsourcecodeBrother                                 0.296  0.76763   
## donorsourcecodeBrother-in-law                          1.003  0.31611   
## donorsourcecodeCousin                                  0.230  0.81806   
## donorsourcecodeDaughter                                0.030  0.97599   
## donorsourcecodeDeceased                                0.004  0.99665   
## donorsourcecodeDirected kidney exchange               -0.862  0.38922   
## donorsourcecodeFather                                  0.605  0.54523   
## donorsourcecodeFather-in-law                          -0.117  0.90690   
## donorsourcecodeFriend                                  0.048  0.96140   
## donorsourcecodeGrandfather                             0.912  0.36230   
## donorsourcecodeHusband                                 0.818  0.41374   
## donorsourcecodeMonzygotic (Identical Twin Boy)         1.184  0.23694   
## donorsourcecodeMonzygotic (Identical Twin Girl)       -0.229  0.81907   
## donorsourcecodeMother                                  0.364  0.71599   
## donorsourcecodeNiece                                  -0.073  0.94159   
## donorsourcecodeNon-directed, waiting list             -0.378  0.70535   
## donorsourcecodeOther related (Emotionally - specify)   0.505  0.61353   
## donorsourcecodePartner                                -0.466  0.64123   
## donorsourcecodePathological                           -0.030  0.97605   
## donorsourcecodeSister                                  0.004  0.99673   
## donorsourcecodeSister-in-law                           0.550  0.58284   
## donorsourcecodeSon                                     0.305  0.76031   
## donorsourcecodeStepfather                              0.720  0.47190   
## donorsourcecodeStepmother                              0.825  0.40974   
## donorsourcecodeUncle                                   0.464  0.64266   
## donorsourcecodeWife                                    0.098  0.92172   
## donorgendercodeM                                       0.731  0.46529   
## transplantstatus                                       2.034  0.04244 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1685 on 572 degrees of freedom
## Multiple R-squared:  0.1262, Adjusted R-squared:  0.05901 
## F-statistic: 1.878 on 44 and 572 DF,  p-value: 0.0007408
# Columns the analysis labels as quantitative; aliveperiod is the response of
# the model fitted on this subset.
# NOTE(review): recipientantibodycmvcode is expanded into levels ("Not Done",
# "Positive") in the regression output, so it is categorical despite being
# listed here -- confirm whether that is intended.
quantitative <- c(
  'creatinineatentry', 'height', 'weight', 'recipientantibodycmvcode',
  'donorage', 'ischaemia', 'ageattransplant', 'currentcytotoxicantibodies',
  'timeondialysis', 'aliveperiod', 'transplantperiod'
)

# Restrict the cleaned data to those columns.
quantitative_data <- Data[, quantitative]

## Linear Regression Model

# Regress aliveperiod on the columns of quantitative_data.
# The response must not be repeated on the right-hand side: listing
# aliveperiod there only made model.matrix() drop the term with a warning,
# so removing it leaves the fitted model unchanged.
model <- lm(
  aliveperiod ~ creatinineatentry + height + weight +
    recipientantibodycmvcode + donorage + ischaemia + ageattransplant +
    currentcytotoxicantibodies + timeondialysis + transplantperiod,
  data = quantitative_data
)
summary(model)
## 
## Call:
## lm(formula = aliveperiod ~ creatinineatentry + height + weight + 
##     recipientantibodycmvcode + donorage + ischaemia + ageattransplant + 
##     currentcytotoxicantibodies + timeondialysis + transplantperiod, 
##     data = quantitative_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -867.8 -338.4 -189.5  -11.8 5365.0 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      1034.39199  366.95916   2.819  0.00498 ** 
## creatinineatentry                  -0.06426    0.08452  -0.760  0.44740    
## height                             -1.85167    2.67090  -0.693  0.48840    
## weight                              1.68572    2.22374   0.758  0.44871    
## recipientantibodycmvcodeNot Done -161.07010  813.16826  -0.198  0.84305    
## recipientantibodycmvcodePositive  -58.46302   70.68190  -0.827  0.40849    
## donorage                            3.68906    1.86095   1.982  0.04789 *  
## ischaemia                          -0.97740    5.24024  -0.187  0.85210    
## ageattransplant                   -10.45835    2.52005  -4.150  3.8e-05 ***
## currentcytotoxicantibodies          0.94083    1.95900   0.480  0.63122    
## timeondialysis                     10.74696    7.53804   1.426  0.15447    
## transplantperiod                    0.90105    0.02036  44.266  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 809.2 on 605 degrees of freedom
## Multiple R-squared:  0.7869, Adjusted R-squared:  0.783 
## F-statistic: 203.1 on 11 and 605 DF,  p-value: < 2.2e-16

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.