library(stringr)
library(ggplot2)
library(rsq)

# Import Dataset
data <- read.csv(file='../../../Downloads/Dataset - small records.csv',header=TRUE)

# List variables and create a subset
var_list <- c("death_binary","age", "gender_binary", "respiratory", "fever", "gastrointestinal",
              "nausea", "cardiac", "kidney", "neuro", "diabetes", "hypertension",
              "cancer", "ortho", "prostate", "thyroid")

df<- data[,var_list]

# Create age group
df$group <- (str_extract(df$age, "^[0-9]"))
range <- data.frame(str_split_fixed(as.character(df$age), "-", 2))
colnames(range)<- c("v1","v2")
range$v1 <- as.numeric(range$v1)
range$v2 <- as.numeric(range$v2)
range$v3 <- range$v1 - ifelse(is.na(range$v2),0,range$v2)

df2 <- data.frame(df,range)
df2 <- df2[df2$v3 >= -9,] # Remove age range greater than 10

# Generalized Linear Model: Logistic Regression
# Model 1 using 14 independent variables
model_1 <- glm(death_binary ~ respiratory + fever + gastrointestinal + nausea + cardiac + kidney + 
                 neuro + diabetes + hypertension + cancer + ortho + prostate + thyroid + gender_binary,
               data = df2, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Model 2 using 14 independent variables plus age group
model_2 <- glm(death_binary ~ respiratory + fever + gastrointestinal + nausea + cardiac + kidney + 
                 neuro + diabetes + hypertension + cancer + ortho + prostate + thyroid + gender_binary +
                 group, data = df2, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Model 3 using significant independent variables from previous model
model_3 <- glm(death_binary ~ respiratory + fever + cardiac + diabetes + hypertension + gender_binary + 
                 group, data = df2, family = "binomial")

# Use AIC to find the best model
summary(model_1)$aic
## [1] 413.6983
summary(model_2)$aic # Model 2 has the minimum AIC
## [1] 404.6385
summary(model_3)$aic
## [1] 407.3142
# Predict Y (probability of death) using best fit model and calculate the accuracy rate
pred <- predict(model_2, df2, type="response") # the probability of death
y_pred_num <- ifelse(pred > 0.5, 1, 0) # set cutoff as 0.5, Y > 0.5 means dead, Y < 0.5 means survived
y_pred <- factor(y_pred_num, levels=c(0, 1))
y_act <- df2$death_binary
mean(y_pred == y_act) #accuracy rate of this model
## [1] 0.9649123
# Predict the outcome given new data - Patient A
nd = data.frame(age = 55, respiratory=1, fever=0, gastrointestinal=0, nausea=0, cardiac=1, kidney=0, 
                neuro=0, diabetes=1, hypertension=0, cancer=0, ortho=0, prostate=0, 
                thyroid=0, gender_binary=1, group="5")
predict(model_2, nd, type="response")
##         1 
## 0.7594725
# Predict the outcome for Patient B
nd = data.frame(age = 35, respiratory=0, fever=1, gastrointestinal=0, nausea=0, cardiac=0, kidney=0, 
                neuro=0, diabetes=0, hypertension=0, cancer=0, ortho=0, prostate=0, 
                thyroid=0, gender_binary=0, group="3")
predict(model_2, nd, type="response")
##         1 
## 0.0131863