STEP 1: import the data

setwd("/Users/S/Downloads")

# Load the data
library(caret)   
## Loading required package: ggplot2
## Loading required package: lattice
library(class)   
library(rpart) 
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
death_df <- read.csv('2005_data.csv', nrows = 50000)

STEP 2: Explore the data

str(death_df)
## 'data.frame':    50000 obs. of  77 variables:
##  $ resident_status                                       : int  1 1 1 1 1 1 1 1 1 3 ...
##  $ education_1989_revision                               : int  11 13 12 12 14 3 12 12 14 8 ...
##  $ education_2003_revision                               : logi  NA NA NA NA NA NA ...
##  $ education_reporting_flag                              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ month_of_death                                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ sex                                                   : chr  "F" "M" "F" "M" ...
##  $ detail_age_type                                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ detail_age                                            : int  45 61 79 50 68 89 68 61 73 85 ...
##  $ age_substitution_flag                                 : logi  NA NA NA NA NA NA ...
##  $ age_recode_52                                         : int  35 38 41 36 39 43 39 38 40 43 ...
##  $ age_recode_27                                         : int  15 18 21 16 19 23 19 18 20 23 ...
##  $ age_recode_12                                         : int  7 8 10 7 9 11 9 8 9 11 ...
##  $ infant_age_recode_22                                  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ place_of_death_and_decedents_status                   : int  1 1 6 1 1 6 1 1 1 7 ...
##  $ marital_status                                        : chr  "M" "D" "D" "S" ...
##  $ day_of_week_of_death                                  : int  2 7 1 4 2 7 7 6 6 6 ...
##  $ current_data_year                                     : int  2005 2005 2005 2005 2005 2005 2005 2005 2005 2005 ...
##  $ injury_at_work                                        : chr  "U" "U" "U" "U" ...
##  $ manner_of_death                                       : int  7 7 7 7 7 7 7 7 7 7 ...
##  $ method_of_disposition                                 : chr  "U" "U" "U" "U" ...
##  $ autopsy                                               : chr  "N" "N" "N" "N" ...
##  $ activity_code                                         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ place_of_injury_for_causes_w00_y34_except_y06_and_y07_: int  NA NA NA NA NA NA NA NA NA NA ...
##  $ icd_code_10th_revision                                : chr  "C439" "J439" "I698" "E119" ...
##  $ X358_cause_recode                                     : int  98 266 239 159 93 239 266 267 266 125 ...
##  $ X113_cause_recode                                     : int  28 84 70 46 27 70 84 86 84 43 ...
##  $ X130_infant_cause_recode                              : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ X39_cause_recode                                      : int  15 28 24 16 8 24 28 28 28 15 ...
##  $ number_of_entity_axis_conditions                      : int  1 1 5 4 3 3 1 3 1 1 ...
##  $ entity_condition_1                                    : chr  "11C439" "11J439" "11R628" "11I469" ...
##  $ entity_condition_2                                    : chr  "" "" "21I698" "61E119" ...
##  $ entity_condition_3                                    : chr  "" "" "61J449" "62I500" ...
##  $ entity_condition_4                                    : chr  "" "" "62M199" "63K862" ...
##  $ entity_condition_5                                    : chr  "" "" "63R568" "" ...
##  $ entity_condition_6                                    : chr  "" "" "" "" ...
##  $ entity_condition_7                                    : chr  "" "" "" "" ...
##  $ entity_condition_8                                    : chr  "" "" "" "" ...
##  $ entity_condition_9                                    : chr  "" "" "" "" ...
##  $ entity_condition_10                                   : chr  "" "" "" "" ...
##  $ entity_condition_11                                   : chr  "" "" "" "" ...
##  $ entity_condition_12                                   : chr  "" "" "" "" ...
##  $ entity_condition_13                                   : chr  "" "" "" "" ...
##  $ entity_condition_14                                   : chr  "" "" "" "" ...
##  $ entity_condition_15                                   : logi  NA NA NA NA NA NA ...
##  $ entity_condition_16                                   : logi  NA NA NA NA NA NA ...
##  $ entity_condition_17                                   : logi  NA NA NA NA NA NA ...
##  $ entity_condition_18                                   : logi  NA NA NA NA NA NA ...
##  $ entity_condition_19                                   : logi  NA NA NA NA NA NA ...
##  $ entity_condition_20                                   : logi  NA NA NA NA NA NA ...
##  $ number_of_record_axis_conditions                      : int  1 1 5 4 3 3 1 3 1 1 ...
##  $ record_condition_1                                    : chr  "C439" "J439" "I698" "E119" ...
##  $ record_condition_2                                    : chr  "" "" "J449" "I469" ...
##  $ record_condition_3                                    : chr  "" "" "M199" "I500" ...
##  $ record_condition_4                                    : chr  "" "" "R568" "K862" ...
##  $ record_condition_5                                    : chr  "" "" "R628" "" ...
##  $ record_condition_6                                    : chr  "" "" "" "" ...
##  $ record_condition_7                                    : chr  "" "" "" "" ...
##  $ record_condition_8                                    : chr  "" "" "" "" ...
##  $ record_condition_9                                    : chr  "" "" "" "" ...
##  $ record_condition_10                                   : chr  "" "" "" "" ...
##  $ record_condition_11                                   : chr  "" "" "" "" ...
##  $ record_condition_12                                   : chr  "" "" "" "" ...
##  $ record_condition_13                                   : chr  "" "" "" "" ...
##  $ record_condition_14                                   : chr  "" "" "" "" ...
##  $ record_condition_15                                   : logi  NA NA NA NA NA NA ...
##  $ record_condition_16                                   : logi  NA NA NA NA NA NA ...
##  $ record_condition_17                                   : logi  NA NA NA NA NA NA ...
##  $ record_condition_18                                   : logi  NA NA NA NA NA NA ...
##  $ record_condition_19                                   : logi  NA NA NA NA NA NA ...
##  $ record_condition_20                                   : logi  NA NA NA NA NA NA ...
##  $ race                                                  : int  1 1 1 1 1 3 1 3 1 1 ...
##  $ bridged_race_flag                                     : logi  NA NA NA NA NA NA ...
##  $ race_imputation_flag                                  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ race_recode_3                                         : int  1 1 1 1 1 2 1 2 1 1 ...
##  $ race_recode_5                                         : int  1 1 1 1 1 3 1 3 1 1 ...
##  $ hispanic_origin                                       : int  100 100 100 100 100 100 100 100 100 100 ...
##  $ hispanic_originrace_recode                            : int  6 6 6 6 6 8 6 8 6 6 ...
sort(colSums(is.na(death_df)), decreasing = TRUE)
##                                education_2003_revision 
##                                                  50000 
##                                  age_substitution_flag 
##                                                  50000 
##                                    entity_condition_15 
##                                                  50000 
##                                    entity_condition_16 
##                                                  50000 
##                                    entity_condition_17 
##                                                  50000 
##                                    entity_condition_18 
##                                                  50000 
##                                    entity_condition_19 
##                                                  50000 
##                                    entity_condition_20 
##                                                  50000 
##                                    record_condition_15 
##                                                  50000 
##                                    record_condition_16 
##                                                  50000 
##                                    record_condition_17 
##                                                  50000 
##                                    record_condition_18 
##                                                  50000 
##                                    record_condition_19 
##                                                  50000 
##                                    record_condition_20 
##                                                  50000 
##                                      bridged_race_flag 
##                                                  50000 
##                                   race_imputation_flag 
##                                                  49975 
##                                   infant_age_recode_22 
##                                                  49367 
##                               X130_infant_cause_recode 
##                                                  49367 
## place_of_injury_for_causes_w00_y34_except_y06_and_y07_ 
##                                                  46918 
##                                          activity_code 
##                                                  46108 
##                                        manner_of_death 
##                                                  10352 
##                                        resident_status 
##                                                      0 
##                                education_1989_revision 
##                                                      0 
##                               education_reporting_flag 
##                                                      0 
##                                         month_of_death 
##                                                      0 
##                                                    sex 
##                                                      0 
##                                        detail_age_type 
##                                                      0 
##                                             detail_age 
##                                                      0 
##                                          age_recode_52 
##                                                      0 
##                                          age_recode_27 
##                                                      0 
##                                          age_recode_12 
##                                                      0 
##                    place_of_death_and_decedents_status 
##                                                      0 
##                                         marital_status 
##                                                      0 
##                                   day_of_week_of_death 
##                                                      0 
##                                      current_data_year 
##                                                      0 
##                                         injury_at_work 
##                                                      0 
##                                  method_of_disposition 
##                                                      0 
##                                                autopsy 
##                                                      0 
##                                 icd_code_10th_revision 
##                                                      0 
##                                      X358_cause_recode 
##                                                      0 
##                                      X113_cause_recode 
##                                                      0 
##                                       X39_cause_recode 
##                                                      0 
##                       number_of_entity_axis_conditions 
##                                                      0 
##                                     entity_condition_1 
##                                                      0 
##                                     entity_condition_2 
##                                                      0 
##                                     entity_condition_3 
##                                                      0 
##                                     entity_condition_4 
##                                                      0 
##                                     entity_condition_5 
##                                                      0 
##                                     entity_condition_6 
##                                                      0 
##                                     entity_condition_7 
##                                                      0 
##                                     entity_condition_8 
##                                                      0 
##                                     entity_condition_9 
##                                                      0 
##                                    entity_condition_10 
##                                                      0 
##                                    entity_condition_11 
##                                                      0 
##                                    entity_condition_12 
##                                                      0 
##                                    entity_condition_13 
##                                                      0 
##                                    entity_condition_14 
##                                                      0 
##                       number_of_record_axis_conditions 
##                                                      0 
##                                     record_condition_1 
##                                                      0 
##                                     record_condition_2 
##                                                      0 
##                                     record_condition_3 
##                                                      0 
##                                     record_condition_4 
##                                                      0 
##                                     record_condition_5 
##                                                      0 
##                                     record_condition_6 
##                                                      0 
##                                     record_condition_7 
##                                                      0 
##                                     record_condition_8 
##                                                      0 
##                                     record_condition_9 
##                                                      0 
##                                    record_condition_10 
##                                                      0 
##                                    record_condition_11 
##                                                      0 
##                                    record_condition_12 
##                                                      0 
##                                    record_condition_13 
##                                                      0 
##                                    record_condition_14 
##                                                      0 
##                                                   race 
##                                                      0 
##                                          race_recode_3 
##                                                      0 
##                                          race_recode_5 
##                                                      0 
##                                        hispanic_origin 
##                                                      0 
##                             hispanic_originrace_recode 
##                                                      0

STEP 3: Pre-Process the data

new_death_df <- death_df[ , c("sex", "marital_status", "hispanic_origin", "month_of_death", 
                              "detail_age", "injury_at_work")]
#new_death_df
new_death_df$above_age = as.factor(death_df$detail_age>77.3)
#new_death_df

# partition the data into training and validation sets
set.seed(1)  
train_index <- sample(c(1:dim(new_death_df)[1]), dim(new_death_df)[1]*0.6)  
valid_index <- setdiff(c(1:dim(new_death_df)[1]), train_index)  
train_df <- new_death_df[train_index, ]
valid_df <- new_death_df[valid_index, ]

STEP 4: Modelling

Fit Logistic Regression Model

reg <- glm(above_age ~ ., data = train_df, family = "binomial", control = glm.control(maxit = 100))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
  # Make predictions on the validation set
reg_pred <- predict(reg, valid_df, type = "response")

  # Create a binary column for predicted outcome
valid_df$LRPred <- as.factor(ifelse(reg_pred > 0.5, 1, 0))

  # Set levels for the factor variables
levels(valid_df$above_age) <- levels(valid_df$LRPred) <- c("0", "1")

  # Confusion/classification matrix
confusionMatrix(valid_df$LRPred, valid_df$above_age, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 11259     0
##          1     0  8741
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9998, 1)
##     No Information Rate : 0.563      
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.000      
##             Specificity : 1.000      
##          Pos Pred Value : 1.000      
##          Neg Pred Value : 1.000      
##              Prevalence : 0.437      
##          Detection Rate : 0.437      
##    Detection Prevalence : 0.437      
##       Balanced Accuracy : 1.000      
##                                      
##        'Positive' Class : 1          
## 

Fit KNN Model

For some reason, every time I tried to fit the knn model it would not work and I couldn’t figure out how to fix it, but here is the code that I would’ve used.

#kn <- class::knn(train = train_df[, -6, drop = FALSE], 
#                 test = valid_df[, -6, drop = FALSE], 
#                 cl = train_df$above_age, 
#                 k = 3, prob = TRUE)

  # Confusion/classification matrix
#confusionMatrix(kn, factor(valid_df$above_age), positive = "1")

Fit Class Tree Model

tr <- rpart(above_age ~., data = new_death_df)
tr
## n= 50000 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 50000 21794 FALSE (0.5641200 0.4358800)  
##   2) detail_age< 77.5 28206     0 FALSE (1.0000000 0.0000000) *
##   3) detail_age>=77.5 21794     0 TRUE (0.0000000 1.0000000) *
#confusion/classification matrix
confusionMatrix(factor(ifelse(predict(tr, valid_df)[,2]>0.5, 1, 0)), 
                valid_df$above_age, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 11259     0
##          1     0  8741
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9998, 1)
##     No Information Rate : 0.563      
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.000      
##             Specificity : 1.000      
##          Pos Pred Value : 1.000      
##          Neg Pred Value : 1.000      
##              Prevalence : 0.437      
##          Detection Rate : 0.437      
##    Detection Prevalence : 0.437      
##       Balanced Accuracy : 1.000      
##                                      
##        'Positive' Class : 1          
## 

Step 5. Prediction (predict)

I could not get the prediction to work correctly because I could not figure out how to get my knn model to run, but here is the code that I would have used had it worked.

#Create a data frame with the actual outcome, predicted outcome, and each of the three models. Report the first 10 rows of this data frame.
res <- data.frame(ActualClass = valid_df$above_age, 
                  LRProb = predict(reg, valid_df, type = "response"), 
                  LRPred = ifelse(predict(reg, valid_df, type = "response") > 0.5, 1, 0), 
                  #KNNProb = 1 - attr(kn, "prob"), 
                  #KNNPred = kn, 
                  TREEProb = predict(tr, valid_df)[, 2], 
                  TREEPred = ifelse(predict(tr, valid_df)[, 2] > 0.5, 1, 0))

options(digits = 1, scipen = 2)
head(res, 10)
##    ActualClass LRProb LRPred TREEProb TREEPred
## 2            0  2e-16      0        0        0
## 3            1  1e+00      1        1        1
## 6            1  1e+00      1        1        1
## 13           1  1e+00      1        1        1
## 14           0  2e-16      0        0        0
## 16           0  2e-16      0        0        0
## 19           0  2e-16      0        0        0
## 21           1  1e+00      1        1        1
## 23           0  2e-16      0        0        0
## 28           0  2e-16      0        0        0
# Ensemble model
#res$majority <- rowMeans(data.frame(res$LRPred, as.numeric(res$KNNPred), res$TREEPred)) > 0.5
#res$avg <- rowMeans(data.frame(res$LRProb, res$KNNProb, res$TREEProb))
head(res)
##    ActualClass LRProb LRPred TREEProb TREEPred
## 2            0  2e-16      0        0        0
## 3            1  1e+00      1        1        1
## 6            1  1e+00      1        1        1
## 13           1  1e+00      1        1        1
## 14           0  2e-16      0        0        0
## 16           0  2e-16      0        0        0
#res$majority <- factor(res$majority, levels = c("0", "1"))
#valid_df$above_age <- factor(valid_df$above_age, levels = c("0", "1"))