Visualization

Importation

chol.dataset <- read.csv("cholangitis.csv", sep = ",")

chol.dataset$status <- as.factor(chol.dataset$status)
chol.dataset$drug <- as.factor(chol.dataset$drug)
chol.dataset$sex <- as.factor(chol.dataset$sex)
chol.dataset$ascites <- as.factor(chol.dataset$ascites)
chol.dataset$hepatomegaly <- as.factor(chol.dataset$hepatomegaly)
chol.dataset$spiders <- as.factor(chol.dataset$spiders)
chol.dataset$edema <- as.factor(chol.dataset$edema)
chol.dataset$stage <- as.factor(chol.dataset$stage)
head(chol.dataset)

##   id n_days status            drug   age sex ascites hepatomegaly spiders edema
## 1  1    400      D D-penicillamine 21464   F       Y            Y       Y     Y
## 2  2   4500      C D-penicillamine 20617   F       N            Y       Y     N
## 3  3   1012      D D-penicillamine 25594   M       N            N       N     S
## 4  4   1925      D D-penicillamine 19994   F       N            Y       Y     S
## 5  5   1504     CL         Placebo 13918   F       N            Y       Y     N
## 6  6   2503      D         Placebo 24201   F       N            Y       N     N
##   bilirubin cholesterol albumin copper alk_phos   sgot tryglicerides platelets
## 1      14.5         261    2.60    156   1718.0 137.95           172       190
## 2       1.1         302    4.14     54   7394.8 113.52            88       221
## 3       1.4         176    3.48    210    516.0  96.10            55       151
## 4       1.8         244    2.54     64   6121.8  60.63            92       183
## 5       3.4         279    3.53    143    671.0 113.15            72       136
## 6       0.8         248    3.98     50    944.0  93.00            63       361
##   prothrombin stage
## 1        12.2     4
## 2        10.6     3
## 3        12.0     4
## 4        10.3     4
## 5        10.9     3
## 6        11.0     3

Basic Exploratory Data Analysis

summary(chol.dataset)

##        id            n_days     status                drug          age       
##  Min.   :  1.0   Min.   :  41   C :232   D-penicillamine:158   Min.   : 9598  
##  1st Qu.:105.2   1st Qu.:1093   CL: 25   Placebo        :154   1st Qu.:15644  
##  Median :209.5   Median :1730   D :161   NA's           :106   Median :18628  
##  Mean   :209.5   Mean   :1918                                  Mean   :18533  
##  3rd Qu.:313.8   3rd Qu.:2614                                  3rd Qu.:21272  
##  Max.   :418.0   Max.   :4795                                  Max.   :28650  
##                                                                               
##  sex     ascites hepatomegaly spiders edema     bilirubin       cholesterol    
##  F:374   N:390   N:203        N:298   N:354   Min.   : 0.300   Min.   : 120.0  
##  M: 44   Y: 28   Y:215        Y:120   S: 44   1st Qu.: 0.800   1st Qu.: 248.0  
##                                       Y: 20   Median : 1.400   Median : 310.0  
##                                               Mean   : 3.221   Mean   : 365.5  
##                                               3rd Qu.: 3.400   3rd Qu.: 400.0  
##                                               Max.   :28.000   Max.   :1775.0  
##                                                                NA's   :5       
##     albumin          copper          alk_phos            sgot       
##  Min.   :1.960   Min.   :  4.00   Min.   :  289.0   Min.   : 26.35  
##  1st Qu.:3.243   1st Qu.: 41.25   1st Qu.:  857.2   1st Qu.: 82.04  
##  Median :3.530   Median : 72.50   Median : 1257.0   Median :114.11  
##  Mean   :3.497   Mean   : 95.71   Mean   : 1937.1   Mean   :121.75  
##  3rd Qu.:3.770   3rd Qu.:123.00   3rd Qu.: 2039.0   3rd Qu.:151.90  
##  Max.   :4.640   Max.   :588.00   Max.   :13862.4   Max.   :457.25  
##                                                                     
##  tryglicerides     platelets      prothrombin    stage  
##  Min.   : 33.0   Min.   : 62.0   Min.   : 9.00   1: 21  
##  1st Qu.: 84.0   1st Qu.:190.0   1st Qu.:10.00   2: 94  
##  Median :109.0   Median :250.0   Median :10.60   3:156  
##  Mean   :122.9   Mean   :257.4   Mean   :10.73   4:147  
##  3rd Qu.:151.0   3rd Qu.:318.0   3rd Qu.:11.10          
##  Max.   :598.0   Max.   :721.0   Max.   :18.00          
##  NA's   :5

There’s a big range in the number of days spent between registration and death/transplant/end of study. Those who spent very few days in the study were likely very sick and close to death, or likely had other factors that drove them away from the study, opposed to the participants who remained active in the study for much longer.

Regarding ‘status,’ the patients who received a liver transplant should be dropped from analysis

chol.dataset <- filter(chol.dataset, status != "CL")
head(chol.dataset)

##   id n_days status            drug   age sex ascites hepatomegaly spiders edema
## 1  1    400      D D-penicillamine 21464   F       Y            Y       Y     Y
## 2  2   4500      C D-penicillamine 20617   F       N            Y       Y     N
## 3  3   1012      D D-penicillamine 25594   M       N            N       N     S
## 4  4   1925      D D-penicillamine 19994   F       N            Y       Y     S
## 5  6   2503      D         Placebo 24201   F       N            Y       N     N
## 6  7   1832      C         Placebo 20284   F       N            Y       N     N
##   bilirubin cholesterol albumin copper alk_phos   sgot tryglicerides platelets
## 1      14.5         261    2.60    156   1718.0 137.95           172       190
## 2       1.1         302    4.14     54   7394.8 113.52            88       221
## 3       1.4         176    3.48    210    516.0  96.10            55       151
## 4       1.8         244    2.54     64   6121.8  60.63            92       183
## 5       0.8         248    3.98     50    944.0  93.00            63       361
## 6       1.0         322    4.09     52    824.0  60.45           213       204
##   prothrombin stage
## 1        12.2     4
## 2        10.6     3
## 3        12.0     4
## 4        10.3     4
## 5        11.0     3
## 6         9.7     3

When analyzing the drug treatment, the double-blinded & placebo-controlled trial suggests comparing the outcomes between the placebo and patients who received the actual treatment drug. Patients who received neither should be excluded for now, but this group could also be compared to the placebo group in order to assess whether a slight ‘placebo effect’ may or may not exist.

chol.data_placebo_effect <- filter(chol.dataset, drug != "D-penicillamine")
chol.dataset <- filter(chol.dataset, drug != "NA")
head(chol.dataset)

##   id n_days status            drug   age sex ascites hepatomegaly spiders edema
## 1  1    400      D D-penicillamine 21464   F       Y            Y       Y     Y
## 2  2   4500      C D-penicillamine 20617   F       N            Y       Y     N
## 3  3   1012      D D-penicillamine 25594   M       N            N       N     S
## 4  4   1925      D D-penicillamine 19994   F       N            Y       Y     S
## 5  6   2503      D         Placebo 24201   F       N            Y       N     N
## 6  7   1832      C         Placebo 20284   F       N            Y       N     N
##   bilirubin cholesterol albumin copper alk_phos   sgot tryglicerides platelets
## 1      14.5         261    2.60    156   1718.0 137.95           172       190
## 2       1.1         302    4.14     54   7394.8 113.52            88       221
## 3       1.4         176    3.48    210    516.0  96.10            55       151
## 4       1.8         244    2.54     64   6121.8  60.63            92       183
## 5       0.8         248    3.98     50    944.0  93.00            63       361
## 6       1.0         322    4.09     52    824.0  60.45           213       204
##   prothrombin stage
## 1        12.2     4
## 2        10.6     3
## 3        12.0     4
## 4        10.3     4
## 5        11.0     3
## 6         9.7     3

The age in days is difficult to interpret, so I decided to transform it into years instead.

chol.dataset$age <- round(chol.dataset$age / 365, 4)
head(chol.dataset)

##   id n_days status            drug     age sex ascites hepatomegaly spiders
## 1  1    400      D D-penicillamine 58.8055   F       Y            Y       Y
## 2  2   4500      C D-penicillamine 56.4849   F       N            Y       Y
## 3  3   1012      D D-penicillamine 70.1205   M       N            N       N
## 4  4   1925      D D-penicillamine 54.7781   F       N            Y       Y
## 5  6   2503      D         Placebo 66.3041   F       N            Y       N
## 6  7   1832      C         Placebo 55.5726   F       N            Y       N
##   edema bilirubin cholesterol albumin copper alk_phos   sgot tryglicerides
## 1     Y      14.5         261    2.60    156   1718.0 137.95           172
## 2     N       1.1         302    4.14     54   7394.8 113.52            88
## 3     S       1.4         176    3.48    210    516.0  96.10            55
## 4     S       1.8         244    2.54     64   6121.8  60.63            92
## 5     N       0.8         248    3.98     50    944.0  93.00            63
## 6     N       1.0         322    4.09     52    824.0  60.45           213
##   platelets prothrombin stage
## 1       190        12.2     4
## 2       221        10.6     3
## 3       151        12.0     4
## 4       183        10.3     4
## 5       361        11.0     3
## 6       204         9.7     3

There are 5 individuals who have NA values each in both the cholesterol and triglyceride variables, which wasn’t measured for some reason. There are two options for handling this missing data: predict the missing values based on different patients who have similar values in the other variables, or omit these individuals completely.

chol.dataset <- na.omit(chol.dataset)

ggplot(chol.dataset, aes(x = n_days, color = status)) + geom_density() + xlab("Number of Days Enrolled in Study") + ggtitle("Days Enrolled in Study and Survival") + geom_vline(xintercept = mean(filter(chol.dataset, status == 'C')$n_days), color = 'red', linetype="dotted") + geom_vline(xintercept = mean(filter(chol.dataset, status == 'D')$n_days), color = 'blue', linetype="dotted")

Since we’ll be using the number of days enrolled in the study somewhat as a proxy continuous variable for survivability, we should assess how the distributions of the living and deceased compare. As expected, those enrolled for less days make up a greater proportion of the deceased, which declines rapidly. Although they overlap beyond an alpha significance level of 0.05, they do appear to be different distributions. There is a bimodal trend for patients who survived, the bulk centered around 2500, with another peak at the maximum around 4000.

st_1 <- filter(chol.dataset, stage == 1)$n_days
st_2 <- filter(chol.dataset, stage == 2)$n_days
st_3 <- filter(chol.dataset, stage == 3)$n_days
st_4 <- filter(chol.dataset, stage == 4)$n_days
ggplot(chol.dataset, aes(x = n_days, color = stage)) + geom_density() + xlab("Number of Days Enrolled in Study") + ggtitle("Days Enrolled in Study and Disease Progression") + geom_vline(xintercept = mean(st_1), color = 'red', linetype="dotted") + geom_vline(xintercept = mean(st_2), color = 'green', linetype="dotted") + geom_vline(xintercept = mean(st_3), color = 'blue', linetype="dotted") + geom_vline(xintercept = mean(st_4), color = 'violet', linetype="dotted")

Exploring deeper into how long someone was enrolled in the study and their likelihood of survival can be latently examined by assessing number of days by disease stage. Below is the graph demonstrating that survivability decreases with disease stage progression, which is supported here, as the median number of days enrolled in the study decreases with disease severity. On average, stage one participants were enrolled for twice as long as stage four participants.

ggplot(chol.dataset, aes(x = n_days, color = drug)) + geom_density() + xlab("Number of Days Enrolled in Study") + ggtitle("Days Enrolled in Study and Drug Treatment") + geom_vline(xintercept = mean(filter(chol.dataset, drug == 'D-penicillamine')$n_days), color = 'red', linetype="dotted") + geom_vline(xintercept = mean(filter(chol.dataset, drug == 'Placebo')$n_days), color = 'blue', linetype="dotted")

There is only a slight difference in the number of days enrolled in the study based on drug treatment, which is a good sign that the double-blind remained in tact, and that the number of days a participant remained in the study wasn’t influenced by the treatment they received, but instead an unforeseeable circumstance, or death may cause patients to exit the study.

ggplot(chol.dataset, aes(x = drug, fill = status)) + geom_bar() + xlab("Treatment Assigned") + ggtitle("Drug Treatment and Survival")

Looking at this bar chart comparing the drug treatments, it would initially appear as if there isn’t a significant difference between the two groups, or that even D-penicillamine has a slightly higher proportion of deceased participants.

ggplot(chol.dataset, aes(x = drug, fill = status)) + geom_bar() + xlab("Treatment Assigned") + ggtitle("Drug Treatment and Survival by Disease Stage") + facet_wrap(~ stage)

This plot demonstrates how survivability varies across disease stages and drug treatment groups. The proportion of deceased participants increased with disease progression, as expected. There doesn’t appear to be much of a significant difference between treatment groups however, except more deaths occurred in the treatment group in stage 1 patients.

ggplot(chol.dataset, aes(x = n_days)) + geom_boxplot(aes(fill = drug)) + xlab("Treatment Assigned") + ggtitle("Drug Treatment and Survival by Disease Stage") + facet_wrap(~ stage)

#chol.dataset$ascites
#status 
ggplot(chol.dataset, aes(x = drug, fill= status)) + geom_bar() + xlab("Treatment Assigned") + ggtitle("Drug Treatment and Survival") + facet_wrap(~ ascites)

#ggplot(chol.dataset, aes(x = n_days)) + geom_boxplot(aes(fill = status)) + xlab("Treatment Assigned") + ggtitle("Drug Treatment and Survival by Disease Stage") + facet_wrap(~ ascites)

ggplot(chol.dataset, aes(x = age, color = status)) + geom_density() + xlab("Age (Years)") + ggtitle("Survivability and Age") + geom_vline(xintercept = mean(filter(chol.dataset, status == 'C')$age), color = 'red', linetype="dotted") + geom_vline(xintercept = mean(filter(chol.dataset, status == 'D')$age), color = 'blue', linetype="dotted")

I would’ve expected the dead distribution to be left shifted when compared to the living participants, but it rather symmetrically distributed around 55 or so, while the distribution for the living is right skewed, and centered lower.

b <- ggplot(chol.dataset, aes(x = n_days, y = bilirubin, color = drug)) + geom_point(alpha = 0.5, size = 0.5) + ggtitle("Bilirubin Over Time") + theme(axis.text.x = element_text(angle = 90)) + xlab("Number of Days")

chl <- ggplot(chol.dataset, aes(x = n_days, y = cholesterol, color = drug)) + geom_point(alpha = 0.5, size = 0.5) + ggtitle("Cholesterol Over Time")+ theme(axis.text.x = element_text(angle = 90))+ xlab("Number of Days")

a <- ggplot(chol.dataset, aes(x = n_days, y = albumin, color = drug)) + geom_point(alpha = 0.5, size = 0.5) + ggtitle("Albumin Over Time") + theme(axis.text.x = element_text(angle = 90)) + xlab("Number of Days")

cu <- ggplot(chol.dataset, aes(x = n_days, y = copper, color = drug)) + geom_point(alpha = 0.5, size = 0.5) + ggtitle("Copper Over Time") + theme(axis.text.x = element_text(angle = 90)) + xlab("Number of Days")

plot_grid(b, chl, a, cu)

ak <- ggplot(chol.dataset, aes(x = n_days, y = alk_phos, color = drug)) + geom_point(alpha = 0.5, size = 0.5) + ggtitle("Alkaline Phosphatase Over Time") + theme(axis.text.x = element_text(angle = 90)) + xlab("Number of Days")

sg <- ggplot(chol.dataset, aes(x = n_days, y = sgot, color = drug)) + geom_point(alpha = 0.5, size = 0.5) + ggtitle("SGOT Over Time") + theme(axis.text.x = element_text(angle = 90)) + xlab("Number of Days")

tri <- ggplot(chol.dataset, aes(x = n_days, y = tryglicerides, color = drug)) + geom_point(alpha = 0.5, size = 0.5) + ggtitle("Triglycerides Over Time") + theme(axis.text.x = element_text(angle = 90)) + xlab("Number of Days")

pl <- ggplot(chol.dataset, aes(x = n_days, y = platelets, color = drug)) + geom_point(alpha = 0.5, size = 0.5) + ggtitle("Platelets Over Time") + theme(axis.text.x = element_text(angle = 90)) + xlab("Number of Days")

plot_grid(ak, sg, tri, pl)

ggplot(chol.dataset, aes(x = n_days, y = prothrombin, color = drug)) + geom_point(alpha = 0.5) + ggtitle("Prothrombin Over Time") + theme(axis.text.x = element_text(angle = 90)) + xlab("Number of Days")

This collection demonstrates the scatterplots of the continuous variables against ‘n_days,’ suggesting how they change over time.

cont_var_chol.dataset <- chol.dataset[, -c(1, 3, 4, 6:10, 20)]

cat_var_chol.dataset <- chol.dataset[, c(3, 4, 6:10, 20)]

pairs(cont_var_chol.dataset)

This pairs chart is just too much to look at!

library(gpairs)
corrgram(cont_var_chol.dataset)

Multivariate Regression

As I begin to think about performing a multivariate regression of the number of days on the continuous variables, these pair plots suggest some relationships between variables, but I should clear out outliers (evident in the scatterplots) and reassess.

quantile_limits <- function(df, lb, ub){
  lower <- c()
  upper <- c()
  for(i in 1:ncol(df)) {
    lower <- c(lower, quantile(df[,i], lb))
    upper <- c(upper, quantile(df[,i], ub))
  }
  frame <- data.frame(names(df), lower, upper)
  return(frame)
}

ci_cont_var <- quantile_limits(cont_var_chol.dataset, 0.001, 0.999)
ci_cont_var

##        names.df.     lower       upper
## 1         n_days  43.87000  4546.52900
## 2            age  27.04445    77.99623
## 3      bilirubin   0.30000    27.28250
## 4    cholesterol 122.00900  1724.77500
## 5        albumin   2.00018     4.60556
## 6         copper   5.43500   579.39000
## 7       alk_phos 295.02700 13402.16680
## 8           sgot  26.93261   423.02525
## 9  tryglicerides  36.15700   550.35800
## 10     platelets  64.29600   556.11200
## 11   prothrombin   9.02870    16.55470

refined_df <- filter(chol.dataset, age > ci_cont_var$lower[2] & age < ci_cont_var$upper[2])
refined_df <- filter(refined_df, bilirubin > ci_cont_var$lower[3] & bilirubin < ci_cont_var$upper[3])
refined_df <- filter(refined_df, cholesterol > ci_cont_var$lower[4] & cholesterol < ci_cont_var$upper[4])
refined_df <- filter(refined_df, albumin > ci_cont_var$lower[5] & albumin < ci_cont_var$upper[5])
refined_df <- filter(refined_df, copper > ci_cont_var$lower[6] & copper < ci_cont_var$upper[6])
refined_df <- filter(refined_df, alk_phos    > ci_cont_var$lower[7] & alk_phos   < ci_cont_var$upper[7])
refined_df <- filter(refined_df, sgot > ci_cont_var$lower[8] & sgot < ci_cont_var$upper[8])
refined_df <- filter(refined_df, tryglicerides > ci_cont_var$lower[9] & tryglicerides < 400)
refined_df <- filter(refined_df, platelets > ci_cont_var$lower[10] & platelets < ci_cont_var$upper[10])
refined_df <- filter(refined_df, prothrombin > ci_cont_var$lower[11] & prothrombin < ci_cont_var$upper[11])
num_outliers_removed <- nrow(chol.dataset) - nrow(refined_df)
num_outliers_removed

## [1] 23

The first part of this chunk of code parsed through each continuous variable and calculated the lower and upper bounds for where 99% of the data lies. However, I analyzed the scatter plots in order to assess trends to subjectively assist me in removing outliers that may skew the regression.

ci_95 <- quantile_limits(cont_var_chol.dataset, 0.025, 0.975)

refined_df_95 <- filter(cont_var_chol.dataset, age > ci_95$lower[2] & age < ci_95$upper[2])
refined_df_95 <- filter(refined_df_95, bilirubin > ci_95$lower[3] & bilirubin < ci_95$upper[3])
refined_df_95 <- filter(refined_df_95, cholesterol > ci_95$lower[4] & cholesterol < ci_95$upper[4])
refined_df_95 <- filter(refined_df_95, albumin > ci_95$lower[5] & albumin < ci_95$upper[5])
refined_df_95 <- filter(refined_df_95, copper > ci_95$lower[6] & copper < ci_95$upper[6])
refined_df_95 <- filter(refined_df_95, alk_phos  > ci_95$lower[7] & alk_phos     < 10000)
refined_df_95 <- filter(refined_df_95, sgot > ci_95$lower[8] & sgot < ci_95$upper[8])
refined_df_95 <- filter(refined_df_95, tryglicerides > ci_95$lower[9] & tryglicerides < ci_95$upper[9])
refined_df_95 <- filter(refined_df_95, platelets > ci_95$lower[10] & platelets < ci_95$upper[10])
refined_df_95 <- filter(refined_df_95, prothrombin > ci_95$lower[11] & prothrombin < ci_95$upper[11])
num_outliers_removed_95 <- nrow(chol.dataset) - nrow(refined_df_95)
num_outliers_removed_95

## [1] 124

This chunk of code does the same as above, except for 95% of the data. Removing outliers outside of the middle 99% results in the removal of 22 data points, while removing outliers outside of the middle 95% resulted in the removal of a much larger 125 values, which diminishes the number of participants, and hence power, too much. Further analysis can be conducted below on whether the 95% restriction is more appropriate, but for now, we will move into the regression with 99% of the data.

b <- ggplot(chol.dataset, aes(x = bilirubin, color = drug)) + geom_density() + ggtitle("Bilirubin Over Time") + theme(axis.text.x = element_text(angle = 90)) 

chl <- ggplot(chol.dataset, aes(x = cholesterol, color = drug)) + geom_density() + ggtitle("Cholesterol Over Time") + theme(axis.text.x = element_text(angle = 90))

a <- ggplot(chol.dataset, aes(x = albumin, color = drug)) + geom_density() + ggtitle("Albumin Over Time") + theme(axis.text.x = element_text(angle = 90))

cu <- ggplot(chol.dataset, aes(x = copper, color = drug)) + geom_density() + ggtitle("Copper Over Time") + theme(axis.text.x = element_text(angle = 90))


plot_grid(b, chl, a, cu)

ak <- ggplot(chol.dataset, aes(x = alk_phos, color = drug)) + geom_density() + ggtitle("Alkaline Phosphatase Over Time") + theme(axis.text.x = element_text(angle = 90))

sg <- ggplot(chol.dataset, aes(x = sgot, color = drug)) + geom_density() + ggtitle("SGOT Over Time") + theme(axis.text.x = element_text(angle = 90))

tri <- ggplot(chol.dataset, aes(x = tryglicerides, color = drug)) + geom_density() + ggtitle("Triglicerides Over Time") + theme(axis.text.x = element_text(angle = 90))

pl <- ggplot(chol.dataset, aes(x = platelets, color = drug)) + geom_density() + ggtitle("Platelets Over Time") + theme(axis.text.x = element_text(angle = 90))

plot_grid(ak, sg, tri, pl)

ggplot(chol.dataset, aes(x = prothrombin, color = drug)) + geom_density() + ggtitle("Probthrombin Over Time") + theme(axis.text.x = element_text(angle = 90))

Based on the skew of these plots, bilirubin, cholesterol, copper, alk_phos, sgot and tryglicerides would benefit from a log transformation in order to normalize the distribution.

refined_df$log_bilirubin <- log(refined_df$bilirubin)
refined_df$log_cholesterol <- log(refined_df$cholesterol)
refined_df$log_copper <- log(refined_df$copper)
refined_df$log_alk_phos <- log(refined_df$alk_phos)
refined_df$log_sgot <- log(refined_df$sgot)
refined_df$log_tryglicerides <- log(refined_df$tryglicerides)


b <- ggplot(refined_df, aes(x = log_bilirubin)) + geom_density() + ggtitle("Bilirubin Over Time") + theme(axis.text.x = element_text(angle = 90)) 

chl <- ggplot(refined_df, aes(x = log_cholesterol)) + geom_density() + ggtitle("Cholesterol Over Time") + theme(axis.text.x = element_text(angle = 90))

cu <- ggplot(refined_df, aes(x = log_copper)) + geom_density() + ggtitle("Copper Over Time") + theme(axis.text.x = element_text(angle = 90))

ak <- ggplot(refined_df, aes(x = log_alk_phos)) + geom_density() + ggtitle("Alkaline Phosphatase Over Time") + theme(axis.text.x = element_text(angle = 90))

sg <- ggplot(refined_df, aes(x = log_sgot)) + geom_density() + ggtitle("SGOT Over Time") + theme(axis.text.x = element_text(angle = 90))

tri <- ggplot(refined_df, aes(x = log_tryglicerides)) + geom_density() + ggtitle("Triglicerides Over Time") + theme(axis.text.x = element_text(angle = 90))

plot_grid(b, chl, cu, ak, sg, tri)

These variables look more normally distributed now!

PCA

chol_pca <- prcomp(refined_df[, -c(1, 3, 4, 6:12, 14:17, 20)], center = TRUE, scale = F)
summary(chol_pca)

## Importance of components:
##                              PC1      PC2      PC3   PC4    PC5    PC6    PC7
## Standard deviation     1144.4224 91.80641 10.22910 1.173 0.8174 0.5773 0.5698
## Proportion of Variance    0.9935  0.00639  0.00008 0.000 0.0000 0.0000 0.0000
## Cumulative Proportion     0.9935  0.99992  1.00000 1.000 1.0000 1.0000 1.0000
##                           PC8    PC9   PC10   PC11
## Standard deviation     0.3911 0.3487 0.3254 0.2908
## Proportion of Variance 0.0000 0.0000 0.0000 0.0000
## Cumulative Proportion  1.0000 1.0000 1.0000 1.0000

ggplot() +
geom_point(
  aes(x = chol_pca$x[,1], y = chol_pca$x[,2], color = refined_df$status)) +
  labs(x = "First PC",
       y = "Second PC") + theme_bw()

This plot of the first two PC’s demonstrates that there does appear to be some clustering based on status, with the living participants falling slightly right of the dead participants.

#text_names = place_rated$city
#text_names[-example_cities] = 'o'
biplot(chol_pca, cex = 0.5)

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

This biplot demonstrates that the variable pulling C away from D is the number of days, which confirms my analysis above regarding n_days as a latent variable for survivability.

no_day_pca <- prcomp(refined_df[, -c(1, 2, 3, 4, 6:12, 14:17, 20)])
ggplot() +
geom_point(
  aes(x = no_day_pca$x[,1], y = no_day_pca$x[,2], color = refined_df$status)) +
  labs(x = "First PC",
       y = "Second PC") + theme_bw()

biplot(no_day_pca, cex = 0.5)

## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped

After removing n_days, we can see a strong influence of platelets on the first PC, meaning the first PC represents platelets. Age has a positive loadings on the second component.

cont_var_trans <- refined_df[, -c(1, 3, 4, 6:12, 14:17, 20)]
corrgram(cont_var_trans)

logs_only <- refined_df[, -c(11, 12, 14:17)]
#non_logs <- 
full_lm <- lm(n_days ~ ., data = logs_only[, -c(1)])
summary(full_lm)

## 
## Call:
## lm(formula = n_days ~ ., data = logs_only[, -c(1)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2410.30  -578.37   -67.15   571.34  2365.63 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -4.254e+03  1.944e+03  -2.189 0.029556 *  
## statusD           -4.648e+02  1.539e+02  -3.020 0.002795 ** 
## drugPlacebo       -1.955e+01  1.151e+02  -0.170 0.865233    
## age               -5.484e+00  6.117e+00  -0.896 0.370919    
## sexM               1.890e+02  1.885e+02   1.003 0.317051    
## ascitesY          -1.432e+02  3.133e+02  -0.457 0.648043    
## hepatomegalyY      2.730e+01  1.344e+02   0.203 0.839231    
## spidersY          -9.896e+00  1.423e+02  -0.070 0.944614    
## edemaS            -2.366e+02  2.091e+02  -1.131 0.258973    
## edemaY            -4.212e+02  3.314e+02  -1.271 0.204988    
## albumin            5.843e+02  1.658e+02   3.525 0.000507 ***
## platelets          8.508e-02  6.695e-01   0.127 0.898990    
## prothrombin        2.342e+02  7.895e+01   2.967 0.003307 ** 
## stage2            -2.816e+02  2.706e+02  -1.041 0.298978    
## stage3            -3.372e+02  2.684e+02  -1.256 0.210242    
## stage4            -5.539e+02  2.901e+02  -1.909 0.057394 .  
## log_bilirubin     -3.399e+02  9.661e+01  -3.518 0.000519 ***
## log_cholesterol   -9.516e+00  1.770e+02  -0.054 0.957161    
## log_copper        -1.766e+02  9.005e+01  -1.961 0.050991 .  
## log_alk_phos       4.538e+02  8.819e+01   5.146 5.49e-07 ***
## log_sgot          -5.046e+01  1.650e+02  -0.306 0.759991    
## log_tryglicerides  1.074e+02  1.554e+02   0.691 0.490054    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 888.7 on 243 degrees of freedom
## Multiple R-squared:  0.4448, Adjusted R-squared:  0.3968 
## F-statistic: 9.269 on 21 and 243 DF,  p-value: < 2.2e-16

#plot(full_lm)

From this regression, the significant variables are status, albumin, prothrombin, log_bilirubin, log_copper and log_alk_phos.

Variable Selection

step(full_lm, direction = "both")

## Start:  AIC=3619.62
## n_days ~ status + drug + age + sex + ascites + hepatomegaly + 
##     spiders + edema + albumin + platelets + prothrombin + stage + 
##     log_bilirubin + log_cholesterol + log_copper + log_alk_phos + 
##     log_sgot + log_tryglicerides
## 
##                     Df Sum of Sq       RSS    AIC
## - log_cholesterol    1      2284 191930410 3617.6
## - spiders            1      3820 191931946 3617.6
## - platelets          1     12753 191940880 3617.6
## - drug               1     22798 191950925 3617.7
## - hepatomegaly       1     32578 191960705 3617.7
## - log_sgot           1     73877 192002003 3617.7
## - ascites            1    164991 192093118 3617.8
## - edema              2   1799002 193727129 3618.1
## - log_tryglicerides  1    377425 192305552 3618.1
## - stage              3   3423699 195351825 3618.3
## - age                1    634673 192562799 3618.5
## - sex                1    793942 192722069 3618.7
## <none>                           191928126 3619.6
## - log_copper         1   3038103 194966230 3621.8
## - prothrombin        1   6953183 198881310 3627.0
## - status             1   7204826 199132953 3627.4
## - log_bilirubin      1   9776039 201704166 3630.8
## - albumin            1   9811464 201739590 3630.8
## - log_alk_phos       1  20914382 212842509 3645.0
## 
## Step:  AIC=3617.62
## n_days ~ status + drug + age + sex + ascites + hepatomegaly + 
##     spiders + edema + albumin + platelets + prothrombin + stage + 
##     log_bilirubin + log_copper + log_alk_phos + log_sgot + log_tryglicerides
## 
##                     Df Sum of Sq       RSS    AIC
## - spiders            1      3533 191933943 3615.6
## - platelets          1     10816 191941226 3615.6
## - drug               1     21404 191951814 3615.7
## - hepatomegaly       1     32668 191963078 3615.7
## - log_sgot           1     76613 192007023 3615.7
## - ascites            1    162786 192093196 3615.8
## - edema              2   1803092 193733502 3616.1
## - log_tryglicerides  1    381502 192311912 3616.1
## - stage              3   3421437 195351847 3616.3
## - age                1    632558 192562968 3616.5
## - sex                1    792256 192722667 3616.7
## <none>                           191930410 3617.6
## + log_cholesterol    1      2284 191928126 3619.6
## - log_copper         1   3038042 194968452 3619.8
## - prothrombin        1   7202587 199132997 3625.4
## - status             1   7293756 199224167 3625.5
## - albumin            1   9825536 201755946 3628.9
## - log_bilirubin      1  10914333 202844743 3630.3
## - log_alk_phos       1  21010950 212941360 3643.2
## 
## Step:  AIC=3615.63
## n_days ~ status + drug + age + sex + ascites + hepatomegaly + 
##     edema + albumin + platelets + prothrombin + stage + log_bilirubin + 
##     log_copper + log_alk_phos + log_sgot + log_tryglicerides
## 
##                     Df Sum of Sq       RSS    AIC
## - platelets          1     11230 191945174 3613.6
## - drug               1     20181 191954125 3613.7
## - hepatomegaly       1     31166 191965110 3613.7
## - log_sgot           1     75009 192008952 3613.7
## - ascites            1    159255 192093198 3613.8
## - log_tryglicerides  1    393299 192327242 3614.2
## - edema              2   1912505 193846448 3614.3
## - stage              3   3522192 195456136 3614.4
## - age                1    630391 192564335 3614.5
## - sex                1    851495 192785438 3614.8
## <none>                           191933943 3615.6
## + spiders            1      3533 191930410 3617.6
## + log_cholesterol    1      1997 191931946 3617.6
## - log_copper         1   3071899 195005842 3617.8
## - prothrombin        1   7199078 199133022 3623.4
## - status             1   7323963 199257907 3623.6
## - albumin            1   9832085 201766029 3626.9
## - log_bilirubin      1  11225118 203159061 3628.7
## - log_alk_phos       1  21054838 212988781 3641.2
## 
## Step:  AIC=3613.64
## n_days ~ status + drug + age + sex + ascites + hepatomegaly + 
##     edema + albumin + prothrombin + stage + log_bilirubin + log_copper + 
##     log_alk_phos + log_sgot + log_tryglicerides
## 
##                     Df Sum of Sq       RSS    AIC
## - drug               1     17726 191962900 3611.7
## - hepatomegaly       1     28913 191974087 3611.7
## - log_sgot           1     76866 192022040 3611.7
## - ascites            1    161860 192107034 3611.9
## - log_tryglicerides  1    399399 192344573 3612.2
## - edema              2   1924399 193869572 3612.3
## - age                1    629896 192575070 3612.5
## - stage              3   3594980 195540154 3612.6
## - sex                1    842018 192787192 3612.8
## <none>                           191945174 3613.6
## + platelets          1     11230 191933943 3615.6
## + spiders            1      3948 191941226 3615.6
## + log_cholesterol    1       224 191944950 3615.6
## - log_copper         1   3068488 195013662 3615.8
## - prothrombin        1   7242035 199187208 3621.5
## - status             1   7322504 199267678 3621.6
## - albumin            1   9900371 201845545 3625.0
## - log_bilirubin      1  11272391 203217565 3626.8
## - log_alk_phos       1  21964730 213909904 3640.4
## 
## Step:  AIC=3611.67
## n_days ~ status + age + sex + ascites + hepatomegaly + edema + 
##     albumin + prothrombin + stage + log_bilirubin + log_copper + 
##     log_alk_phos + log_sgot + log_tryglicerides
## 
##                     Df Sum of Sq       RSS    AIC
## - hepatomegaly       1     26638 191989538 3609.7
## - log_sgot           1     75451 192038351 3609.8
## - ascites            1    160361 192123261 3609.9
## - log_tryglicerides  1    399424 192362323 3610.2
## - edema              2   1913371 193876270 3610.3
## - age                1    612730 192575629 3610.5
## - stage              3   3689242 195652141 3610.7
## - sex                1    837628 192800528 3610.8
## <none>                           191962900 3611.7
## + drug               1     17726 191945174 3613.6
## + platelets          1      8775 191954125 3613.7
## + spiders            1      2675 191960225 3613.7
## + log_cholesterol    1         9 191962891 3613.7
## - log_copper         1   3081114 195044013 3613.9
## - prothrombin        1   7238833 199201733 3619.5
## - status             1   7310258 199273157 3619.6
## - albumin            1   9893420 201856319 3623.0
## - log_bilirubin      1  11298373 203261273 3624.8
## - log_alk_phos       1  22031382 213994282 3638.5
## 
## Step:  AIC=3609.7
## n_days ~ status + age + sex + ascites + edema + albumin + prothrombin + 
##     stage + log_bilirubin + log_copper + log_alk_phos + log_sgot + 
##     log_tryglicerides
## 
##                     Df Sum of Sq       RSS    AIC
## - log_sgot           1     84097 192073634 3607.8
## - ascites            1    168923 192158461 3607.9
## - log_tryglicerides  1    405145 192394683 3608.3
## - edema              2   1913356 193902893 3608.3
## - age                1    624265 192613802 3608.6
## - sex                1    855115 192844652 3608.9
## - stage              3   4059752 196049290 3609.2
## <none>                           191989538 3609.7
## + hepatomegaly       1     26638 191962900 3611.7
## + drug               1     15450 191974087 3611.7
## + platelets          1      7013 191982525 3611.7
## + spiders            1      1507 191988031 3611.7
## + log_cholesterol    1        64 191989474 3611.7
## - log_copper         1   3099343 195088881 3611.9
## - prothrombin        1   7219943 199209481 3617.5
## - status             1   7290652 199280190 3617.6
## - albumin            1   9914715 201904253 3621.0
## - log_bilirubin      1  11381811 203371349 3623.0
## - log_alk_phos       1  22226281 214215819 3636.7
## 
## Step:  AIC=3607.82
## n_days ~ status + age + sex + ascites + edema + albumin + prothrombin + 
##     stage + log_bilirubin + log_copper + log_alk_phos + log_tryglicerides
## 
##                     Df Sum of Sq       RSS    AIC
## - ascites            1    152709 192226344 3606.0
## - edema              2   1882648 193956283 3606.4
## - log_tryglicerides  1    479024 192552658 3606.5
## - age                1    563892 192637527 3606.6
## - sex                1    848333 192921968 3607.0
## - stage              3   4134759 196208393 3607.5
## <none>                           192073634 3607.8
## + log_sgot           1     84097 191989538 3609.7
## + hepatomegaly       1     35284 192038351 3609.8
## + drug               1     13737 192059897 3609.8
## + platelets          1      8383 192065251 3609.8
## + log_cholesterol    1       973 192072661 3609.8
## + spiders            1       451 192073183 3609.8
## - log_copper         1   3148192 195221826 3610.1
## - prothrombin        1   7413944 199487579 3615.9
## - status             1   7553965 199627600 3616.0
## - albumin            1   9971265 202044899 3619.2
## - log_bilirubin      1  14340618 206414253 3624.9
## - log_alk_phos       1  22464065 214537699 3635.1
## 
## Step:  AIC=3606.03
## n_days ~ status + age + sex + edema + albumin + prothrombin + 
##     stage + log_bilirubin + log_copper + log_alk_phos + log_tryglicerides
## 
##                     Df Sum of Sq       RSS    AIC
## - log_tryglicerides  1    450381 192676724 3604.7
## - age                1    587707 192814051 3604.8
## - sex                1    924695 193151039 3605.3
## - stage              3   4310735 196537079 3605.9
## <none>                           192226344 3606.0
## - edema              2   3573738 195800082 3606.9
## + ascites            1    152709 192073634 3607.8
## + log_sgot           1     67883 192158461 3607.9
## + hepatomegaly       1     43488 192182856 3608.0
## + drug               1     12326 192214018 3608.0
## + platelets          1     10239 192216104 3608.0
## + spiders            1      1162 192225181 3608.0
## + log_cholesterol    1        44 192226300 3608.0
## - log_copper         1   3132987 195359331 3608.3
## - prothrombin        1   7590052 199816395 3614.3
## - status             1   8033778 200260122 3614.9
## - albumin            1  10317131 202543475 3617.9
## - log_bilirubin      1  14246202 206472545 3623.0
## - log_alk_phos       1  22512574 214738918 3633.4
## 
## Step:  AIC=3604.65
## n_days ~ status + age + sex + edema + albumin + prothrombin + 
##     stage + log_bilirubin + log_copper + log_alk_phos
## 
##                     Df Sum of Sq       RSS    AIC
## - age                1    512330 193189054 3603.4
## - sex                1   1017139 193693863 3604.0
## - stage              3   4209911 196886635 3604.4
## <none>                           192676724 3604.7
## - edema              2   3542639 196219363 3605.5
## + log_tryglicerides  1    450381 192226344 3606.0
## + log_sgot           1    134993 192541732 3606.5
## + ascites            1    124066 192552658 3606.5
## + hepatomegaly       1     54599 192622125 3606.6
## + log_cholesterol    1     19078 192657646 3606.6
## + platelets          1     16803 192659921 3606.6
## + drug               1     11688 192665037 3606.6
## + spiders            1      1118 192675606 3606.6
## - log_copper         1   3099379 195776104 3606.9
## - prothrombin        1   7141706 199818430 3612.3
## - status             1   8051329 200728054 3613.5
## - albumin            1  10528533 203205258 3616.7
## - log_bilirubin      1  13904469 206581194 3621.1
## - log_alk_phos       1  23795624 216472348 3633.5
## 
## Step:  AIC=3603.35
## n_days ~ status + sex + edema + albumin + prothrombin + stage + 
##     log_bilirubin + log_copper + log_alk_phos
## 
##                     Df Sum of Sq       RSS    AIC
## - sex                1    753337 193942391 3602.4
## <none>                           193189054 3603.4
## - stage              3   4737806 197926861 3603.8
## + age                1    512330 192676724 3604.7
## - edema              2   3971608 197160662 3604.7
## + log_tryglicerides  1    375004 192814051 3604.8
## + ascites            1    146565 193042489 3605.2
## + hepatomegaly       1     63263 193125791 3605.3
## + log_sgot           1     51181 193137873 3605.3
## - log_copper         1   2892124 196081179 3605.3
## + log_cholesterol    1     31571 193157483 3605.3
## + platelets          1     16782 193172272 3605.3
## + spiders            1      2509 193186545 3605.4
## + drug               1        58 193188996 3605.4
## - prothrombin        1   7176551 200365605 3611.0
## - status             1   9123853 202312907 3613.6
## - albumin            1  11385662 204574716 3616.5
## - log_bilirubin      1  13475302 206664356 3619.2
## - log_alk_phos       1  24735568 217924623 3633.3
## 
## Step:  AIC=3602.39
## n_days ~ status + edema + albumin + prothrombin + stage + log_bilirubin + 
##     log_copper + log_alk_phos
## 
##                     Df Sum of Sq       RSS    AIC
## <none>                           193942391 3602.4
## - stage              3   4925200 198867591 3603.0
## + sex                1    753337 193189054 3603.4
## - edema              2   3855380 197797771 3603.6
## - log_copper         1   2406031 196348422 3603.7
## + log_tryglicerides  1    469520 193472872 3603.7
## + age                1    248528 193693863 3604.0
## + ascites            1    204755 193737636 3604.1
## + hepatomegaly       1     88302 193854089 3604.3
## + log_sgot           1     65968 193876423 3604.3
## + log_cholesterol    1     39764 193902627 3604.3
## + spiders            1     21586 193920806 3604.4
## + platelets          1      4938 193937453 3604.4
## + drug               1       228 193942163 3604.4
## - prothrombin        1   7409191 201351582 3610.3
## - status             1   8536385 202478776 3611.8
## - albumin            1  12426898 206369289 3616.8
## - log_bilirubin      1  14324034 208266425 3619.3
## - log_alk_phos       1  24630488 218572879 3632.1

## 
## Call:
## lm(formula = n_days ~ status + edema + albumin + prothrombin + 
##     stage + log_bilirubin + log_copper + log_alk_phos, data = logs_only[, 
##     -c(1)])
## 
## Coefficients:
##   (Intercept)        statusD         edemaS         edemaY        albumin  
##       -4543.9         -477.3         -260.1         -521.2          635.4  
##   prothrombin         stage2         stage3         stage4  log_bilirubin  
##         227.4         -317.4         -362.1         -595.5         -335.2  
##    log_copper   log_alk_phos  
##        -151.9          466.6

A stepwise analysis in both directions (with the lowest AIC of 3602.39) suggests that the best model for this multivariate regression is utilizing status + edema + albumin + prothrombin + stage + log_bilirubin + log_copper + log_alk_phos.

min_aic_mod <-lm(formula = n_days ~ status + edema + albumin + prothrombin + 
    stage + log_bilirubin + log_copper + log_alk_phos, data = logs_only[, 
    -c(1)])
summary(min_aic_mod)

## 
## Call:
## lm(formula = n_days ~ status + edema + albumin + prothrombin + 
##     stage + log_bilirubin + log_copper + log_alk_phos, data = logs_only[, 
##     -c(1)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2472.96  -569.62   -61.99   558.27  2320.09 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -4543.88    1202.84  -3.778 0.000197 ***
## statusD        -477.26     143.02  -3.337 0.000974 ***
## edemaS         -260.11     197.04  -1.320 0.187991    
## edemaY         -521.19     253.58  -2.055 0.040875 *  
## albumin         635.35     157.80   4.026 7.49e-05 ***
## prothrombin     227.42      73.15   3.109 0.002092 ** 
## stage2         -317.40     256.98  -1.235 0.217944    
## stage3         -362.09     246.63  -1.468 0.143312    
## stage4         -595.50     260.07  -2.290 0.022857 *  
## log_bilirubin  -335.20      77.54  -4.323 2.22e-05 ***
## log_copper     -151.92      85.75  -1.772 0.077659 .  
## log_alk_phos    466.65      82.32   5.668 3.92e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 875.5 on 253 degrees of freedom
## Multiple R-squared:  0.4389, Adjusted R-squared:  0.4145 
## F-statistic: 17.99 on 11 and 253 DF,  p-value: < 2.2e-16

More than half of the variance in the data is still not accounted for in this model however, but for all the variables that were removed, the residual standard error was improved from 888 to 875.5, and the multiple r-squared was reduced from 0.4448 to 0.4389.

coplot(n_days ~ log_copper | log_bilirubin, data = cont_var_trans)

The pairs plot above demonstrates a positive relationship between log_copper and log_bilirubin, but covariance can create unstable error. We see there does appear to be some relationship between n_days and log_copper for specific values of log_bilirubin.

Cross Validation

library(leaps)
no_id_refined_logs <- logs_only[, -c(1)]
cholregsub <- regsubsets(n_days ~., no_id_refined_logs, nvmax = 28)

## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found

## Reordering variables and trying again:

k = 10
set.seed(1)   
folds = sample(1:k, nrow(logs_only), replace = TRUE)
predict.regsubsets = function(object, newdata, id, ...) {
    form = as.formula(object$call[[2]])
    mat = model.matrix(form, newdata)
    coefi = coef(object, id = id)
    mat[, names(coefi)] %*% coefi
}
cv_errors = matrix(NA, k, nrow(summary(cholregsub)$which), dimnames = list(NULL, paste(1:nrow(summary(cholregsub)$which))))
for(j in 1:k){
    best_fit = regsubsets(n_days~., data = no_id_refined_logs[folds!=j,], nvmax=28)
    for(i in 1:21){
        pred = predict.regsubsets(best_fit, no_id_refined_logs[folds==j,], id=i)
        cv_errors[j,i] = mean((no_id_refined_logs$n_days[folds==j]-pred)^2)
    }
}

## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found

## Reordering variables and trying again:

## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found

## Reordering variables and trying again:

## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found

## Reordering variables and trying again:

## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found

## Reordering variables and trying again:

## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found

## Reordering variables and trying again:

## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found

## Reordering variables and trying again:

## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found

## Reordering variables and trying again:

## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found

## Reordering variables and trying again:

## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found

## Reordering variables and trying again:

## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found

## Reordering variables and trying again:

mean_cv_err <- apply(cv_errors, 2, mean)
coef(cholregsub, which.min(mean_cv_err))

##     (Intercept)     drugPlacebo        ascitesY          edemaY         albumin 
##   -2676.6396874      14.6330860    -435.2116648    -105.3316982     743.7569452 
##       platelets          stage2   log_bilirubin log_cholesterol    log_alk_phos 
##       0.0954775      92.1568962    -457.3938940     -31.3062432     424.8419554 
##        log_sgot 
##    -125.5885338

Opposed to the stepwise AIC analysis, cross-validation selects a 10 predictor model with drug, ascites, edema, albumin, platelets, stage, log_bilirubin, log_cholesterol, log_alk_phos and log_sgot.

summary(lm(formula = n_days ~ drug + ascites + edema + albumin + platelets + 
    stage + log_bilirubin + log_cholesterol + log_alk_phos + log_sgot, data = logs_only[, 
    -c(1)]))

## 
## Call:
## lm(formula = n_days ~ drug + ascites + edema + albumin + platelets + 
##     stage + log_bilirubin + log_cholesterol + log_alk_phos + 
##     log_sgot, data = logs_only[, -c(1)])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2675.1  -633.7  -123.1   563.0  2306.3 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -1.729e+03  1.360e+03  -1.272   0.2047    
## drugPlacebo      4.037e+01  1.155e+02   0.349   0.7271    
## ascitesY        -3.390e+02  3.114e+02  -1.089   0.2774    
## edemaS          -1.641e+02  2.082e+02  -0.788   0.4314    
## edemaY          -1.993e+02  3.231e+02  -0.617   0.5378    
## albumin          6.717e+02  1.653e+02   4.063 6.48e-05 ***
## platelets       -4.166e-02  6.803e-01  -0.061   0.9512    
## stage2          -3.618e+02  2.713e+02  -1.334   0.1836    
## stage3          -4.364e+02  2.623e+02  -1.664   0.0974 .  
## stage4          -6.199e+02  2.738e+02  -2.264   0.0244 *  
## log_bilirubin   -4.138e+02  8.363e+01  -4.948 1.37e-06 ***
## log_cholesterol -7.678e+01  1.726e+02  -0.445   0.6568    
## log_alk_phos     4.268e+02  8.782e+01   4.860 2.07e-06 ***
## log_sgot        -1.156e+02  1.622e+02  -0.713   0.4767    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 912.1 on 251 degrees of freedom
## Multiple R-squared:  0.3959, Adjusted R-squared:  0.3647 
## F-statistic: 12.66 on 13 and 251 DF,  p-value: < 2.2e-16

The r-squared of this model is much lower than the one calculated with minimum AIC, in addition to a larger residual standard error.

plot(min_aic_mod)

These plots show some outliers that I am going to remove before seeing if they affect the fit, as the residual vs fitted value plot may suggest some non-linearity (the red line does not stay constant), and the scale-location suggests heteroscadastity, as there is an increasing linear relationship between the fitted values and square root of the standardized residuals.

min_aic_mod_minus_outliers <- lm(formula = n_days ~ status + edema + albumin + prothrombin + 
    stage + log_bilirubin + log_copper + log_alk_phos, data = no_id_refined_logs[-c(37, 52, 77, 176),])

plot(min_aic_mod_minus_outliers)

There is perhaps a very weak non-linear relationship in the data, as the line in the residual vs. fitted plot forms a slight parabola, but looking at the scatter cloud, there doesn’t appear to be a trend, and this could also be due to heteroscadastity at the tail ends of the data, where more variability in our residuals is observed for larger fitted values than for smaller ones. This is comfirmed as the scale-location plot does show still some heteroscadastity as the plot line fluctuates a bit. The qq plot follows the linear trend for the most part, with deviations upwards at the left tail and upwards deviations at the right tail of the data. Overall the assumption of normality is upheld.

Logistic Regression

chol_glm <- glm(status ~ ., data= logs_only[, -c(1)], family = binomial)
summary(chol_glm)

## 
## Call:
## glm(formula = status ~ ., family = binomial, data = logs_only[, 
##     -c(1)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7027  -0.5243  -0.2359   0.4209   2.8830  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -3.169e+01  7.093e+00  -4.467 7.94e-06 ***
## n_days            -5.613e-04  2.138e-04  -2.626 0.008645 ** 
## drugPlacebo       -2.690e-01  3.913e-01  -0.687 0.491781    
## age                5.576e-02  2.058e-02   2.710 0.006733 ** 
## sexM               7.946e-01  6.470e-01   1.228 0.219393    
## ascitesY           1.759e+01  1.258e+03   0.014 0.988843    
## hepatomegalyY      5.299e-01  4.433e-01   1.195 0.232037    
## spidersY           3.207e-01  4.611e-01   0.695 0.486811    
## edemaS            -3.759e-01  7.756e-01  -0.485 0.627891    
## edemaY            -8.220e-01  1.611e+00  -0.510 0.609765    
## albumin            3.405e-01  6.029e-01   0.565 0.572217    
## platelets         -6.710e-04  2.328e-03  -0.288 0.773164    
## prothrombin        1.149e+00  2.990e-01   3.843 0.000121 ***
## stage2             1.615e+00  1.529e+00   1.056 0.290775    
## stage3             1.693e+00  1.550e+00   1.093 0.274494    
## stage4             1.243e+00  1.582e+00   0.785 0.432198    
## log_bilirubin      4.198e-01  3.246e-01   1.293 0.195913    
## log_cholesterol    5.549e-01  6.699e-01   0.828 0.407517    
## log_copper         3.938e-01  3.283e-01   1.199 0.230347    
## log_alk_phos       7.148e-01  3.253e-01   2.197 0.028013 *  
## log_sgot           8.177e-01  5.531e-01   1.478 0.139327    
## log_tryglicerides  5.649e-02  5.271e-01   0.107 0.914655    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 361.00  on 264  degrees of freedom
## Residual deviance: 188.07  on 243  degrees of freedom
## AIC: 232.07
## 
## Number of Fisher Scoring iterations: 17

step(chol_glm, direction = "both")

## Start:  AIC=232.07
## status ~ n_days + drug + age + sex + ascites + hepatomegaly + 
##     spiders + edema + albumin + platelets + prothrombin + stage + 
##     log_bilirubin + log_cholesterol + log_copper + log_alk_phos + 
##     log_sgot + log_tryglicerides
## 
##                     Df Deviance    AIC
## - stage              3   190.25 228.25
## - edema              2   188.53 228.53
## - log_tryglicerides  1   188.08 230.08
## - platelets          1   188.16 230.16
## - albumin            1   188.39 230.39
## - drug               1   188.55 230.55
## - spiders            1   188.56 230.56
## - log_cholesterol    1   188.78 230.78
## - hepatomegaly       1   189.50 231.50
## - log_copper         1   189.53 231.53
## - sex                1   189.59 231.59
## - log_bilirubin      1   189.78 231.78
## <none>                   188.07 232.07
## - log_sgot           1   190.27 232.27
## - log_alk_phos       1   193.16 235.16
## - ascites            1   195.21 237.21
## - n_days             1   195.43 237.43
## - age                1   195.92 237.92
## - prothrombin        1   205.37 247.37
## 
## Step:  AIC=228.25
## status ~ n_days + drug + age + sex + ascites + hepatomegaly + 
##     spiders + edema + albumin + platelets + prothrombin + log_bilirubin + 
##     log_cholesterol + log_copper + log_alk_phos + log_sgot + 
##     log_tryglicerides
## 
##                     Df Deviance    AIC
## - edema              2   190.50 224.50
## - platelets          1   190.31 226.31
## - log_tryglicerides  1   190.33 226.33
## - drug               1   190.62 226.62
## - albumin            1   190.65 226.65
## - spiders            1   190.67 226.67
## - log_cholesterol    1   191.19 227.19
## - sex                1   191.40 227.40
## - log_bilirubin      1   191.61 227.61
## - hepatomegaly       1   191.74 227.74
## - log_copper         1   192.01 228.01
## <none>                   190.25 228.25
## - log_sgot           1   193.70 229.70
## - log_alk_phos       1   195.09 231.09
## + stage              3   188.07 232.07
## - n_days             1   197.17 233.17
## - ascites            1   197.18 233.18
## - age                1   198.52 234.52
## - prothrombin        1   206.71 242.71
## 
## Step:  AIC=224.5
## status ~ n_days + drug + age + sex + ascites + hepatomegaly + 
##     spiders + albumin + platelets + prothrombin + log_bilirubin + 
##     log_cholesterol + log_copper + log_alk_phos + log_sgot + 
##     log_tryglicerides
## 
##                     Df Deviance    AIC
## - platelets          1   190.56 222.56
## - log_tryglicerides  1   190.57 222.57
## - spiders            1   190.84 222.84
## - drug               1   190.85 222.85
## - albumin            1   190.96 222.96
## - log_cholesterol    1   191.54 223.54
## - sex                1   191.62 223.62
## - log_bilirubin      1   191.75 223.75
## - hepatomegaly       1   192.04 224.04
## - log_copper         1   192.27 224.27
## <none>                   190.50 224.50
## - log_sgot           1   194.10 226.10
## - log_alk_phos       1   195.47 227.47
## + edema              2   190.25 228.25
## + stage              3   188.53 228.53
## - n_days             1   197.58 229.58
## - ascites            1   198.16 230.16
## - age                1   198.75 230.75
## - prothrombin        1   206.99 238.99
## 
## Step:  AIC=222.56
## status ~ n_days + drug + age + sex + ascites + hepatomegaly + 
##     spiders + albumin + prothrombin + log_bilirubin + log_cholesterol + 
##     log_copper + log_alk_phos + log_sgot + log_tryglicerides
## 
##                     Df Deviance    AIC
## - log_tryglicerides  1   190.63 220.63
## - spiders            1   190.90 220.90
## - drug               1   190.95 220.95
## - albumin            1   190.98 220.98
## - log_cholesterol    1   191.54 221.54
## - sex                1   191.74 221.74
## - log_bilirubin      1   191.92 221.92
## - hepatomegaly       1   192.22 222.22
## - log_copper         1   192.29 222.29
## <none>                   190.56 222.56
## - log_sgot           1   194.23 224.23
## + platelets          1   190.50 224.50
## - log_alk_phos       1   195.50 225.50
## + edema              2   190.31 226.31
## + stage              3   188.62 226.62
## - n_days             1   197.60 227.60
## - ascites            1   198.33 228.33
## - age                1   198.76 228.76
## - prothrombin        1   207.85 237.85
## 
## Step:  AIC=220.63
## status ~ n_days + drug + age + sex + ascites + hepatomegaly + 
##     spiders + albumin + prothrombin + log_bilirubin + log_cholesterol + 
##     log_copper + log_alk_phos + log_sgot
## 
##                     Df Deviance    AIC
## - spiders            1   190.94 218.94
## - drug               1   191.02 219.02
## - albumin            1   191.07 219.07
## - log_cholesterol    1   191.85 219.85
## - sex                1   191.86 219.86
## - log_bilirubin      1   192.17 220.17
## - hepatomegaly       1   192.34 220.34
## - log_copper         1   192.44 220.44
## <none>                   190.63 220.63
## - log_sgot           1   194.23 222.23
## + log_tryglicerides  1   190.56 222.56
## + platelets          1   190.57 222.57
## - log_alk_phos       1   195.81 223.81
## + edema              2   190.39 224.39
## + stage              3   188.63 224.63
## - n_days             1   197.63 225.63
## - ascites            1   198.53 226.53
## - age                1   199.02 227.02
## - prothrombin        1   208.14 236.14
## 
## Step:  AIC=218.94
## status ~ n_days + drug + age + sex + ascites + hepatomegaly + 
##     albumin + prothrombin + log_bilirubin + log_cholesterol + 
##     log_copper + log_alk_phos + log_sgot
## 
##                     Df Deviance    AIC
## - albumin            1   191.33 217.33
## - drug               1   191.47 217.47
## - sex                1   191.97 217.97
## - log_cholesterol    1   192.14 218.14
## - log_bilirubin      1   192.70 218.70
## <none>                   190.94 218.94
## - hepatomegaly       1   192.94 218.94
## - log_copper         1   192.94 218.94
## - log_sgot           1   194.35 220.35
## + spiders            1   190.63 220.63
## + platelets          1   190.88 220.88
## + log_tryglicerides  1   190.90 220.90
## - log_alk_phos       1   196.00 222.00
## + edema              2   190.77 222.77
## + stage              3   188.98 222.98
## - n_days             1   197.97 223.97
## - ascites            1   198.75 224.75
## - age                1   199.09 225.09
## - prothrombin        1   209.38 235.38
## 
## Step:  AIC=217.33
## status ~ n_days + drug + age + sex + ascites + hepatomegaly + 
##     prothrombin + log_bilirubin + log_cholesterol + log_copper + 
##     log_alk_phos + log_sgot
## 
##                     Df Deviance    AIC
## - drug               1   191.86 215.86
## - sex                1   192.76 216.76
## - log_cholesterol    1   192.78 216.78
## - hepatomegaly       1   193.05 217.05
## - log_bilirubin      1   193.06 217.06
## - log_copper         1   193.14 217.14
## <none>                   191.33 217.33
## - log_sgot           1   194.75 218.75
## + albumin            1   190.94 218.94
## + spiders            1   191.07 219.07
## + log_tryglicerides  1   191.28 219.28
## + platelets          1   191.31 219.31
## - log_alk_phos       1   196.10 220.10
## + edema              2   191.11 221.11
## + stage              3   189.31 221.31
## - n_days             1   197.97 221.97
## - ascites            1   198.75 222.75
## - age                1   199.24 223.24
## - prothrombin        1   209.92 233.92
## 
## Step:  AIC=215.86
## status ~ n_days + age + sex + ascites + hepatomegaly + prothrombin + 
##     log_bilirubin + log_cholesterol + log_copper + log_alk_phos + 
##     log_sgot
## 
##                     Df Deviance    AIC
## - sex                1   193.36 215.36
## - log_cholesterol    1   193.39 215.39
## - hepatomegaly       1   193.46 215.46
## - log_copper         1   193.57 215.57
## - log_bilirubin      1   193.61 215.61
## <none>                   191.86 215.86
## - log_sgot           1   195.33 217.33
## + drug               1   191.33 217.33
## + spiders            1   191.47 217.47
## + albumin            1   191.47 217.47
## + platelets          1   191.81 217.81
## + log_tryglicerides  1   191.81 217.81
## - log_alk_phos       1   196.77 218.77
## + edema              2   191.68 219.68
## + stage              3   190.01 220.01
## - n_days             1   198.60 220.60
## - ascites            1   199.43 221.43
## - age                1   200.70 222.70
## - prothrombin        1   210.34 232.34
## 
## Step:  AIC=215.36
## status ~ n_days + age + ascites + hepatomegaly + prothrombin + 
##     log_bilirubin + log_cholesterol + log_copper + log_alk_phos + 
##     log_sgot
## 
##                     Df Deviance    AIC
## - hepatomegaly       1   194.88 214.88
## - log_cholesterol    1   194.88 214.88
## - log_bilirubin      1   195.12 215.12
## <none>                   193.36 215.36
## + sex                1   191.86 215.86
## + albumin            1   192.58 216.58
## - log_copper         1   196.58 216.58
## + drug               1   192.76 216.76
## - log_sgot           1   197.03 217.03
## + platelets          1   193.24 217.24
## + spiders            1   193.24 217.24
## + log_tryglicerides  1   193.24 217.24
## - log_alk_phos       1   197.88 217.88
## + edema              2   193.15 219.15
## - n_days             1   199.28 219.28
## + stage              3   191.88 219.88
## - ascites            1   201.27 221.27
## - age                1   204.08 224.08
## - prothrombin        1   212.14 232.14
## 
## Step:  AIC=214.88
## status ~ n_days + age + ascites + prothrombin + log_bilirubin + 
##     log_cholesterol + log_copper + log_alk_phos + log_sgot
## 
##                     Df Deviance    AIC
## - log_cholesterol    1   196.29 214.29
## <none>                   194.88 214.88
## + hepatomegaly       1   193.36 215.36
## + sex                1   193.46 215.46
## - log_bilirubin      1   197.55 215.55
## - log_copper         1   198.03 216.03
## - log_sgot           1   198.18 216.18
## + drug               1   194.40 216.40
## + albumin            1   194.53 216.53
## + platelets          1   194.56 216.56
## + spiders            1   194.57 216.57
## + log_tryglicerides  1   194.73 216.73
## - log_alk_phos       1   199.82 217.82
## + edema              2   194.68 218.68
## - n_days             1   201.30 219.30
## + stage              3   193.52 219.52
## - ascites            1   202.97 220.97
## - age                1   206.58 224.58
## - prothrombin        1   215.40 233.40
## 
## Step:  AIC=214.3
## status ~ n_days + age + ascites + prothrombin + log_bilirubin + 
##     log_copper + log_alk_phos + log_sgot
## 
##                     Df Deviance    AIC
## <none>                   196.29 214.29
## + log_cholesterol    1   194.88 214.88
## + hepatomegaly       1   194.88 214.88
## + sex                1   194.89 214.89
## - log_copper         1   199.42 215.42
## + drug               1   195.73 215.73
## + albumin            1   195.74 215.74
## + log_tryglicerides  1   195.84 215.84
## - log_sgot           1   199.89 215.89
## + spiders            1   196.02 216.02
## + platelets          1   196.22 216.22
## - log_bilirubin      1   201.51 217.51
## - log_alk_phos       1   201.71 217.71
## + edema              2   195.98 217.98
## + stage              3   194.72 218.72
## - n_days             1   202.75 218.75
## - ascites            1   204.33 220.33
## - age                1   207.25 223.25
## - prothrombin        1   215.45 231.45

## 
## Call:  glm(formula = status ~ n_days + age + ascites + prothrombin + 
##     log_bilirubin + log_copper + log_alk_phos + log_sgot, family = binomial, 
##     data = logs_only[, -c(1)])
## 
## Coefficients:
##   (Intercept)         n_days            age       ascitesY    prothrombin  
##    -2.511e+01     -4.773e-04      5.919e-02      1.707e+01      1.012e+00  
## log_bilirubin     log_copper   log_alk_phos       log_sgot  
##     6.080e-01      5.148e-01      6.585e-01      9.565e-01  
## 
## Degrees of Freedom: 264 Total (i.e. Null);  256 Residual
## Null Deviance:       361 
## Residual Deviance: 196.3     AIC: 214.3

chol_glm_var_sel <- glm(formula = status ~ n_days + age + ascites + prothrombin + 
    log_bilirubin + log_copper + log_alk_phos + log_sgot, family = binomial, 
    data = logs_only[, -c(1)])

summary(chol_glm_var_sel)

## 
## Call:
## glm(formula = status ~ n_days + age + ascites + prothrombin + 
##     log_bilirubin + log_copper + log_alk_phos + log_sgot, family = binomial, 
##     data = logs_only[, -c(1)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6190  -0.5792  -0.2905   0.4414   2.7704  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -2.511e+01  4.592e+00  -5.469 4.52e-08 ***
## n_days        -4.773e-04  1.918e-04  -2.488   0.0128 *  
## age            5.919e-02  1.864e-02   3.175   0.0015 ** 
## ascitesY       1.707e+01  1.184e+03   0.014   0.9885    
## prothrombin    1.012e+00  2.529e-01   4.001 6.31e-05 ***
## log_bilirubin  6.080e-01  2.731e-01   2.226   0.0260 *  
## log_copper     5.148e-01  2.943e-01   1.749   0.0802 .  
## log_alk_phos   6.585e-01  2.877e-01   2.288   0.0221 *  
## log_sgot       9.565e-01  5.087e-01   1.880   0.0601 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 361.0  on 264  degrees of freedom
## Residual deviance: 196.3  on 256  degrees of freedom
## AIC: 214.3
## 
## Number of Fisher Scoring iterations: 17

library(lmtest)

## Warning: package 'lmtest' was built under R version 4.0.5

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

lrtest(chol_glm, chol_glm_var_sel)

## Likelihood ratio test
## 
## Model 1: status ~ n_days + drug + age + sex + ascites + hepatomegaly + 
##     spiders + edema + albumin + platelets + prothrombin + stage + 
##     log_bilirubin + log_cholesterol + log_copper + log_alk_phos + 
##     log_sgot + log_tryglicerides
## Model 2: status ~ n_days + age + ascites + prothrombin + log_bilirubin + 
##     log_copper + log_alk_phos + log_sgot
##   #Df  LogLik  Df  Chisq Pr(>Chisq)
## 1  22 -94.036                      
## 2   9 -98.148 -13 8.2225     0.8288

The fact that there isn’t a significant difference between these two models, and yet the AIC is much lower in the second demonstrates how the other variables are just superfluous.

plot(chol_glm_var_sel)

For the linearity of a logistic regressoin, we expect that there is no trend in the residuals with respect to the fitted values, other than any patterns inherent to the nature of the response. There doesn’t appear to be any trends, but just has a long tail of values to the right that may be highly influential. The qq plot implies a distribution of residuals with long tails and outliers, however, normality of residuals is not an assumption of binary logistic regression (neither is homoscedastity). These plots of raw residuals are actually rather not useful for interpreting logistic regression, and so I will look at a binned residual plot.

library(arm)

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## Loading required package: Matrix

## Warning: package 'Matrix' was built under R version 4.0.5

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loading required package: lme4

## Warning: package 'lme4' was built under R version 4.0.5

## 
## arm (Version 1.12-2, built: 2021-10-15)

## Working directory is /Users/katarinacook/Desktop/Stat 131A/Project

binnedplot(fitted(chol_glm_var_sel), 
           residuals(chol_glm_var_sel, type = "response"), 
           nclass = NULL, 
           xlab = "Expected Values", 
           ylab = "Average residual", 
           main = "Binned residual plot", 
           cex.pts = 0.8, 
           col.pts = 1, 
           col.int = "gray")

The model looks reasonable, but there are more outliers than we would expect due to chance (with an alpha = 0.05), as there was 16 total binned residuals, and 4 of them are outliers = 0.25. The model does well with values above about 0.18 and below 0.05 but falters between them. This means that below 0.2 the model is predicting a higher average rate of survival than is actually the case.

#1 = D
library(caret)

## Warning: package 'caret' was built under R version 4.0.5

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

glm_pred_probs = data.frame(round(probs = predict(chol_glm_var_sel, type="response")))

glm_pred_probs$round.probs...predict.chol_glm_var_sel..type....response... <- as.factor(glm_pred_probs$round.probs...predict.chol_glm_var_sel..type....response...)
levels(glm_pred_probs$round.probs...predict.chol_glm_var_sel..type....response...) <- c(0,1)
levels(logs_only$status) <- c(0, 2, 1)
new_data <- droplevels(logs_only$status)

confusionMatrix(data = glm_pred_probs$round.probs...predict.chol_glm_var_sel..type....response..., reference = new_data)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 137  28
##          1  16  84
##                                           
##                Accuracy : 0.834           
##                  95% CI : (0.7836, 0.8767)
##     No Information Rate : 0.5774          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.6548          
##                                           
##  Mcnemar's Test P-Value : 0.09725         
##                                           
##             Sensitivity : 0.8954          
##             Specificity : 0.7500          
##          Pos Pred Value : 0.8303          
##          Neg Pred Value : 0.8400          
##              Prevalence : 0.5774          
##          Detection Rate : 0.5170          
##    Detection Prevalence : 0.6226          
##       Balanced Accuracy : 0.8227          
##                                           
##        'Positive' Class : 0               
##

This confusion matrix treated the factors D = 1, and C= 0 for the purposes of analyzing how well our model predicted survivability. Rounding the probabilities generated by this model to either 0 or 1, the confusion matrix found that this model was about 83% accurate in predicting survivability.

Katarina Cook’s Stat 131 Project