HCC Data Survival Prediction with Logistic Regression

Load used Packages

#Load Packages
library(dplyr)
library(reshape2)
library(corrplot)
library(ggplot2)
library(gmodels)
library(caret) 
library(class)
library(rpart)
library(arm)
library(rpart.plot)
library(glmnet)
library(data.table)
library(mltools)

Load the Dataset

#Set Work Directory
#setwd("directory")

#Read Dataset
hccdata <- read.csv("hcc-data.csv", h = TRUE, sep = ",", na.strings = "?", dec = '.',
                    col.names = c('Gender', 'Symptoms', 'Alcohol',
                                  'HBsAg', 'HBeAg', 'HBcAb', 'HCVAb',
                                  'Cirrhosis', 'Endemic','Smoking',
                                  'Diabetes',   'Obesity','Hemochro',
                                  'AHT', 'CRI', 'HIV', 'NASH',
                                  'Varices', 'Spleno','PHT', 'PVT',
                                  'Metastasis',  'Hallmark', 'Age',
                                  'Grams_day', 'Packs_year', 'PS',
                                  'Encephalopathy', 'Ascites', 'INR',
                                  'AFP', 'Hemoglobin', 'MCV', 'Leucocytes',
                                  'Platelets', 'Albumin', 'Total_Bil',
                                  'ALT', 'AST', 'GGT', 'ALP', 'TP',
                                  'Creatinine', 'Nodules','Major_Dim',
                                  'Dir_Bil', 'Iron', 'Sat', 'Ferritin',
                                  'Class'))
#Look at the Dataset Organization
#str(hccdata)
#summary(hccdata)
#dim(hccdata)

Exploratory Data Analysis

#Check NAs
sapply(hccdata, function(x) sum(is.na(x)))

##         Gender       Symptoms        Alcohol          HBsAg          HBeAg 
##              0             18              0             17             39 
##          HBcAb          HCVAb      Cirrhosis        Endemic        Smoking 
##             24              9              0             39             41 
##       Diabetes        Obesity       Hemochro            AHT            CRI 
##              3              9             23              3              2 
##            HIV           NASH        Varices         Spleno            PHT 
##             14             22             52             15             11 
##            PVT     Metastasis       Hallmark            Age      Grams_day 
##              3              4              2              0             48 
##     Packs_year             PS Encephalopathy        Ascites            INR 
##             53              0              1              2              4 
##            AFP     Hemoglobin            MCV     Leucocytes      Platelets 
##              8              3              3              3              3 
##        Albumin      Total_Bil            ALT            AST            GGT 
##              6              5              4              3              3 
##            ALP             TP     Creatinine        Nodules      Major_Dim 
##              3             11              7              2             20 
##        Dir_Bil           Iron            Sat       Ferritin          Class 
##             44             78             79             79              0

#Variables Distribution
cbind(freq=table(hccdata$Class), percentage=prop.table(table(hccdata$Class))*100)

##   freq percentage
## 0   63   38.41463
## 1  101   61.58537

#Distribution histogram
par(mfrow=c(4,4))
       for(i in 1:48) {
         hist(hccdata[,i], main=names(hccdata)[i])
       }

#Density Distribution
par(mfrow=c(3,3))

for(i in 1:48) {
plot(density(na.omit(hccdata[,i])), main=names(hccdata)[i])
}

#Boxplot
par(mfrow=c(1,4))

for(i in 1:48) {
boxplot(na.omit(hccdata[,i]), main=names(hccdata)[i])
}

#Atributes by Class - Fazer por partes

jittered_x <- sapply(hccdata[,1:5], jitter)
pairs(jittered_x, names(hccdata[,1:5]), col=c('blue', 'red') )

par(mfrow=c(3,2))
for(i in 1:48) {
barplot(table(hccdata$Class,hccdata[,i]), main=names(hccdata)[i],)
}

Feature Engineering

## Variables type
numeric.vars     <- c('Grams_day',  'Packs_year', 'INR',    'AFP',
                      'Hemoglobin', 'MCV',  'Leucocytes',   'Platelets', 
                      'Albumin',    'Total_Bil', 'ALT', 'AST',  'GGT',  'ALP',
                      'TP', 'Creatinine',   'Major_Dim',    'Dir_Bil',
                      'Iron',   'Sat',  'Ferritin')
categorical.vars <- c('Gender', 'Symptoms', 'Alcohol',  'HBsAg',
                      'HBcAb',  'HCVAb', 'Cirrhosis',   'Endemic',  'Smoking',
                      'Diabetes',   'Obesity',  'Hemochro', 'AHT',  'CRI',  
                      'HIV',    'NASH', 'Varices',  'Spleno',   'PHT',  'PVT',
                      'Metastasis',  'Hallmark', 'Class', 'PS', 'Encephalopathy', 'Ascites', 'Nodules')

ordinal.vars     <- c('PS', 'Encephalopathy', 'Ascites', 'Nodules')

I decided exclude the column HBeAg because it presents missing values and only one value of the Class 1.

#Removing column: HBeAg(0:124, 1:1, NA:39)
hccdata$HBeAg <- NULL

Fill Missing Values

Filling NA values in Packs by Year with a random sampled value. Once I have this value, I can complete the Smoking column. If Packs_Year is bigger than 0, Smoking receives value 1.

#Smoking and Packs_Year

for(i in 1:nrow(hccdata)){
  if(is.na(hccdata$Packs_year[i]==TRUE)){
    hccdata$Packs_year[i] <- sample(na.omit(hccdata$Packs_year), 1)
  }
}

for(i in 1:nrow(hccdata)){
  if(hccdata$Packs_year[i]>0){
    hccdata$Smoking[i] <- 1
  } else {
    hccdata$Smoking[i] <- 0
  }
}

In order to complete NA values in the remaining variables, I splitted the 2 classes and I filled NA values with a random sampled value inside each class.

### Splitting classes
dead <- filter(hccdata, hccdata$Class=='1')
notdead <- filter(hccdata, hccdata$Class=='0')

#Other variables
for(i in 1:nrow(dead)){
  for(j in 1:ncol(dead)){
    if(is.na(dead[i,j]==TRUE)){
      dead[i,j] <- sample(na.omit(dead[,j]), 1)
    }  
  }
}

for(i in 1:nrow(notdead)){
    for(j in 1:ncol(notdead)){
      if(is.na(notdead[i,j]==TRUE)){
        notdead[i,j] <- sample(na.omit(notdead[,j]), 1)
    }  
    }
}

hccdata <- rbind(dead, notdead)
rm(dead, notdead)
invisible(gc())

Setting variable types

## Formatting columns to factors function
to.factors <- function(df, variables){
  for (variable in variables){
    df[[variable]] <- as.factor(df[[variable]])
  }
  return(df)
}

## Factor variables
hccdata <- to.factors(df = hccdata, variables = categorical.vars)

One hot encode

The dataset has three ordinal variables. These variables were binarized.

datatabledata <- data.table::as.data.table(hccdata)
hccdata <- mltools::one_hot(datatabledata ,  cols = c('Encephalopathy', 'PS', 'Ascites', 'Nodules'), dropCols = TRUE)
hccdata <- as.data.frame(hccdata)

factorsvar <- c('Nodules_1', 'Nodules_2', 'Nodules_3', 'Nodules_4', 'Nodules_5', 'Ascites_1', 'Ascites_2', 'Ascites_3', 'Encephalopathy_1', 'Encephalopathy_2', 'Encephalopathy_3', 'PS_0', 'PS_1', 'PS_2', 'PS_3', 'PS_4')

hccdata <- to.factors(df = hccdata, variables = factorsvar)
rm(datatabledata)
invisible(gc())

Data Standardization

The numeric variables Age and Nodules were normalized by the maximum value. Remaining numeric variables were scaled.

### Age
hccdata$Age <- hccdata$Age/max(hccdata$Age)

# Normalization function 
scale.features <- function(df, variables){
  for (variable in variables){
    df[[variable]] <- scale(df[[variable]], center=T, scale=T)
  }
  return(df)
}
hccdata_scaled <- scale.features(hccdata, numeric.vars)

Split train & test

The dataset was randomly divided in train and test data sets using a proportion os 80%/20%.

# Spliting train and test datasets
set.seed(100)
intrain<- caret::createDataPartition(hccdata_scaled$Class,p=0.8,list=FALSE)
training<- hccdata_scaled[intrain,]
testing<- hccdata_scaled[-intrain,]
prop.table(table(testing$Class))

## 
##     0     1 
## 0.375 0.625

#class(training)
#training$Class
#View(training)

Colinearity

# Verifying correlation between numeric variables
numeric.var <- sapply(training, is.numeric) 
corr.matrix <- cor(training[,numeric.vars])  
corrplot::corrplot(corr.matrix, main="\n\n Correlation Between Numeric Variables", method="number")

Variables Total_Bil e Dir_Bil seem to be collinear. One of them can be excluded from the model.

# Feature Selection
training$Class <- as.factor(training$Class)
formula <- "Class ~ ."
formula <- as.formula(formula)
control <- caret::trainControl(method = "repeatedcv", number = 10, repeats = 10)
model <- caret::train(formula, data = training, method = "bayesglm", trControl = control)

## Loading required package: lattice

## Loading required package: ggplot2

importance <- caret::varImp(model, scale = FALSE)
#print(model)
# Plot
plot(importance)

Some variables seem to be important to the model: ALP, AFP, Hemoglobin, Ferritin, Albumin, PS_0, Dir_Bil, Symptons.

#Function to select variables using Random Forest 
run.feature.selection <- function(num.iters=30, feature.vars, class.var){
  set.seed(10)
  variable.sizes <- 1:10
  control <- rfeControl(functions = rfFuncs, method = "cv", 
                        verbose = FALSE, returnResamp = "all", 
                        number = num.iters)
  results.rfe <- rfe(x = feature.vars, y = class.var, 
                     sizes = variable.sizes, 
                     rfeControl = control)
  return(results.rfe)
}

rfe.results <- run.feature.selection(feature.vars = training[,-62], 
                                     class.var = training[,62])


#Visualization of results
#rfe.results

varImp((rfe.results))

This analysis showed as important variables to the model: Ferritin, AFP, Iron, and Hemoglobin

Model

Baseline Model

The baseline model showed an accuracy was 83% with the test set.

# Train the Model
model <- "Class ~ ."
LogModel <- arm::bayesglm(model, family=binomial,
                     data=training, drop.unused.levels = FALSE)

training$Class <- as.character(training$Class)
fitted.results <- predict(LogModel,newdata=training,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClasificError <- mean(fitted.results != training$Class)
print(paste('Logistic Regression Accuracy',1-misClasificError))

## [1] "Logistic Regression Accuracy 0.964285714285714"

testing$Class <- as.character(testing$Class)
fitted.results <- predict(LogModel,newdata=testing,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClasificError <- mean(fitted.results != testing$Class)
print(paste('Logistic Regression Accuracy',1-misClasificError))

## [1] "Logistic Regression Accuracy 0.875"

#Confusion Matrix of Logistic Regression
print("Confusion Matrix Para Logistic Regression")

## [1] "Confusion Matrix Para Logistic Regression"

table(testing$Class, fitted.results > 0.5)

##    
##     FALSE TRUE
##   0     7    2
##   1     1   14

#anova(LogModel, test="Chisq")

Selecting Features

Some variables were excluded from the model. It is better to have a model with less variables because it is more generalizable. The Accuracy with test set increased to 87.5%.

# Train the Model
model <- "Class ~  Symptoms + Alcohol +       
HBsAg + HCVAb +            
 Smoking +          
Diabetes +          
AHT + CRI + HIV +             
NASH + Varices + Spleno +           
PHT + PVT +        
 Age +        
Packs_year +  PS_1 +             
 PS_3 + PS_4 +             
 Encephalopathy_2 + Encephalopathy_3 + 
Ascites_1 +  Ascites_3 +        
INR + AFP + Hemoglobin +       
MCV + Leucocytes + Platelets +        
Albumin + ALT +             
AST + ALP +             
TP + Creatinine +         
 Nodules_2 + Nodules_3 +        
  Major_Dim +        
Dir_Bil + Iron + Sat + 
Ferritin"
training$Class <- as.factor(training$Class)
LogModel <- arm::bayesglm(model, family=binomial,
                     data=training, drop.unused.levels = FALSE)

training$Class <- as.character(training$Class)
fitted.results <- predict(LogModel,newdata=training,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClasificError <- mean(fitted.results != training$Class)
print(paste('Logistic Regression Accuracy',1-misClasificError))

## [1] "Logistic Regression Accuracy 0.95"

testing$Class <- as.character(testing$Class)
fitted.results <- predict(LogModel,newdata=testing,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClasificError <- mean(fitted.results != testing$Class)
print(paste('Logistic Regression Accuracy',1-misClasificError))

## [1] "Logistic Regression Accuracy 0.875"

Confusion Matrix

#Confusion Matrix of Logistic Regression
print("Confusion Matrix Para Logistic Regression")

## [1] "Confusion Matrix Para Logistic Regression"

table(testing$Class, fitted.results > 0.5)

##    
##     FALSE TRUE
##   0     7    2
##   1     1   14