Import the data

df <- read.csv("2005_data.csv", nrows = 50000)

Explore Data

str(df)
## 'data.frame':    50000 obs. of  77 variables:
##  $ resident_status                                       : int  1 1 1 1 1 1 1 1 1 3 ...
##  $ education_1989_revision                               : int  11 13 12 12 14 3 12 12 14 8 ...
##  $ education_2003_revision                               : logi  NA NA NA NA NA NA ...
##  $ education_reporting_flag                              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ month_of_death                                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ sex                                                   : chr  "F" "M" "F" "M" ...
##  $ detail_age_type                                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ detail_age                                            : int  45 61 79 50 68 89 68 61 73 85 ...
##  $ age_substitution_flag                                 : logi  NA NA NA NA NA NA ...
##  $ age_recode_52                                         : int  35 38 41 36 39 43 39 38 40 43 ...
##  $ age_recode_27                                         : int  15 18 21 16 19 23 19 18 20 23 ...
##  $ age_recode_12                                         : int  7 8 10 7 9 11 9 8 9 11 ...
##  $ infant_age_recode_22                                  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ place_of_death_and_decedents_status                   : int  1 1 6 1 1 6 1 1 1 7 ...
##  $ marital_status                                        : chr  "M" "D" "D" "S" ...
##  $ day_of_week_of_death                                  : int  2 7 1 4 2 7 7 6 6 6 ...
##  $ current_data_year                                     : int  2005 2005 2005 2005 2005 2005 2005 2005 2005 2005 ...
##  $ injury_at_work                                        : chr  "U" "U" "U" "U" ...
##  $ manner_of_death                                       : int  7 7 7 7 7 7 7 7 7 7 ...
##  $ method_of_disposition                                 : chr  "U" "U" "U" "U" ...
##  $ autopsy                                               : chr  "N" "N" "N" "N" ...
##  $ activity_code                                         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ place_of_injury_for_causes_w00_y34_except_y06_and_y07_: int  NA NA NA NA NA NA NA NA NA NA ...
##  $ icd_code_10th_revision                                : chr  "C439" "J439" "I698" "E119" ...
##  $ X358_cause_recode                                     : int  98 266 239 159 93 239 266 267 266 125 ...
##  $ X113_cause_recode                                     : int  28 84 70 46 27 70 84 86 84 43 ...
##  $ X130_infant_cause_recode                              : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ X39_cause_recode                                      : int  15 28 24 16 8 24 28 28 28 15 ...
##  $ number_of_entity_axis_conditions                      : int  1 1 5 4 3 3 1 3 1 1 ...
##  $ entity_condition_1                                    : chr  "11C439" "11J439" "11R628" "11I469" ...
##  $ entity_condition_2                                    : chr  "" "" "21I698" "61E119" ...
##  $ entity_condition_3                                    : chr  "" "" "61J449" "62I500" ...
##  $ entity_condition_4                                    : chr  "" "" "62M199" "63K862" ...
##  $ entity_condition_5                                    : chr  "" "" "63R568" "" ...
##  $ entity_condition_6                                    : chr  "" "" "" "" ...
##  $ entity_condition_7                                    : chr  "" "" "" "" ...
##  $ entity_condition_8                                    : chr  "" "" "" "" ...
##  $ entity_condition_9                                    : chr  "" "" "" "" ...
##  $ entity_condition_10                                   : chr  "" "" "" "" ...
##  $ entity_condition_11                                   : chr  "" "" "" "" ...
##  $ entity_condition_12                                   : chr  "" "" "" "" ...
##  $ entity_condition_13                                   : chr  "" "" "" "" ...
##  $ entity_condition_14                                   : chr  "" "" "" "" ...
##  $ entity_condition_15                                   : logi  NA NA NA NA NA NA ...
##  $ entity_condition_16                                   : logi  NA NA NA NA NA NA ...
##  $ entity_condition_17                                   : logi  NA NA NA NA NA NA ...
##  $ entity_condition_18                                   : logi  NA NA NA NA NA NA ...
##  $ entity_condition_19                                   : logi  NA NA NA NA NA NA ...
##  $ entity_condition_20                                   : logi  NA NA NA NA NA NA ...
##  $ number_of_record_axis_conditions                      : int  1 1 5 4 3 3 1 3 1 1 ...
##  $ record_condition_1                                    : chr  "C439" "J439" "I698" "E119" ...
##  $ record_condition_2                                    : chr  "" "" "J449" "I469" ...
##  $ record_condition_3                                    : chr  "" "" "M199" "I500" ...
##  $ record_condition_4                                    : chr  "" "" "R568" "K862" ...
##  $ record_condition_5                                    : chr  "" "" "R628" "" ...
##  $ record_condition_6                                    : chr  "" "" "" "" ...
##  $ record_condition_7                                    : chr  "" "" "" "" ...
##  $ record_condition_8                                    : chr  "" "" "" "" ...
##  $ record_condition_9                                    : chr  "" "" "" "" ...
##  $ record_condition_10                                   : chr  "" "" "" "" ...
##  $ record_condition_11                                   : chr  "" "" "" "" ...
##  $ record_condition_12                                   : chr  "" "" "" "" ...
##  $ record_condition_13                                   : chr  "" "" "" "" ...
##  $ record_condition_14                                   : chr  "" "" "" "" ...
##  $ record_condition_15                                   : logi  NA NA NA NA NA NA ...
##  $ record_condition_16                                   : logi  NA NA NA NA NA NA ...
##  $ record_condition_17                                   : logi  NA NA NA NA NA NA ...
##  $ record_condition_18                                   : logi  NA NA NA NA NA NA ...
##  $ record_condition_19                                   : logi  NA NA NA NA NA NA ...
##  $ record_condition_20                                   : logi  NA NA NA NA NA NA ...
##  $ race                                                  : int  1 1 1 1 1 3 1 3 1 1 ...
##  $ bridged_race_flag                                     : logi  NA NA NA NA NA NA ...
##  $ race_imputation_flag                                  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ race_recode_3                                         : int  1 1 1 1 1 2 1 2 1 1 ...
##  $ race_recode_5                                         : int  1 1 1 1 1 3 1 3 1 1 ...
##  $ hispanic_origin                                       : int  100 100 100 100 100 100 100 100 100 100 ...
##  $ hispanic_originrace_recode                            : int  6 6 6 6 6 8 6 8 6 6 ...
colSums(is.na(df))
##                                        resident_status 
##                                                      0 
##                                education_1989_revision 
##                                                      0 
##                                education_2003_revision 
##                                                  50000 
##                               education_reporting_flag 
##                                                      0 
##                                         month_of_death 
##                                                      0 
##                                                    sex 
##                                                      0 
##                                        detail_age_type 
##                                                      0 
##                                             detail_age 
##                                                      0 
##                                  age_substitution_flag 
##                                                  50000 
##                                          age_recode_52 
##                                                      0 
##                                          age_recode_27 
##                                                      0 
##                                          age_recode_12 
##                                                      0 
##                                   infant_age_recode_22 
##                                                  49367 
##                    place_of_death_and_decedents_status 
##                                                      0 
##                                         marital_status 
##                                                      0 
##                                   day_of_week_of_death 
##                                                      0 
##                                      current_data_year 
##                                                      0 
##                                         injury_at_work 
##                                                      0 
##                                        manner_of_death 
##                                                  10352 
##                                  method_of_disposition 
##                                                      0 
##                                                autopsy 
##                                                      0 
##                                          activity_code 
##                                                  46108 
## place_of_injury_for_causes_w00_y34_except_y06_and_y07_ 
##                                                  46918 
##                                 icd_code_10th_revision 
##                                                      0 
##                                      X358_cause_recode 
##                                                      0 
##                                      X113_cause_recode 
##                                                      0 
##                               X130_infant_cause_recode 
##                                                  49367 
##                                       X39_cause_recode 
##                                                      0 
##                       number_of_entity_axis_conditions 
##                                                      0 
##                                     entity_condition_1 
##                                                      0 
##                                     entity_condition_2 
##                                                      0 
##                                     entity_condition_3 
##                                                      0 
##                                     entity_condition_4 
##                                                      0 
##                                     entity_condition_5 
##                                                      0 
##                                     entity_condition_6 
##                                                      0 
##                                     entity_condition_7 
##                                                      0 
##                                     entity_condition_8 
##                                                      0 
##                                     entity_condition_9 
##                                                      0 
##                                    entity_condition_10 
##                                                      0 
##                                    entity_condition_11 
##                                                      0 
##                                    entity_condition_12 
##                                                      0 
##                                    entity_condition_13 
##                                                      0 
##                                    entity_condition_14 
##                                                      0 
##                                    entity_condition_15 
##                                                  50000 
##                                    entity_condition_16 
##                                                  50000 
##                                    entity_condition_17 
##                                                  50000 
##                                    entity_condition_18 
##                                                  50000 
##                                    entity_condition_19 
##                                                  50000 
##                                    entity_condition_20 
##                                                  50000 
##                       number_of_record_axis_conditions 
##                                                      0 
##                                     record_condition_1 
##                                                      0 
##                                     record_condition_2 
##                                                      0 
##                                     record_condition_3 
##                                                      0 
##                                     record_condition_4 
##                                                      0 
##                                     record_condition_5 
##                                                      0 
##                                     record_condition_6 
##                                                      0 
##                                     record_condition_7 
##                                                      0 
##                                     record_condition_8 
##                                                      0 
##                                     record_condition_9 
##                                                      0 
##                                    record_condition_10 
##                                                      0 
##                                    record_condition_11 
##                                                      0 
##                                    record_condition_12 
##                                                      0 
##                                    record_condition_13 
##                                                      0 
##                                    record_condition_14 
##                                                      0 
##                                    record_condition_15 
##                                                  50000 
##                                    record_condition_16 
##                                                  50000 
##                                    record_condition_17 
##                                                  50000 
##                                    record_condition_18 
##                                                  50000 
##                                    record_condition_19 
##                                                  50000 
##                                    record_condition_20 
##                                                  50000 
##                                                   race 
##                                                      0 
##                                      bridged_race_flag 
##                                                  50000 
##                                   race_imputation_flag 
##                                                  49975 
##                                          race_recode_3 
##                                                      0 
##                                          race_recode_5 
##                                                      0 
##                                        hispanic_origin 
##                                                      0 
##                             hispanic_originrace_recode 
##                                                      0

Preprocess Data

df_death <- df[,c("autopsy", "marital_status", "sex", 
                        "resident_status" , "injury_at_work" ,"detail_age")]
df_death$Above_Avg_Age <- ifelse(df_death$detail_age > 77.4, 1, 0)
df_death <- df_death[,-6]
df_index <- sample(c(1:dim(df_death)[1]),0.7*dim(df_death)[1])
df_train <- df_death[df_index,]
df_valid <- df_death[-df_index,]

Modeling (logistic regression)

logic_model <- glm(Above_Avg_Age ~ ., data = df_train, family = "binomial")
confusionMatrix(factor(ifelse(predict(logic_model, df_valid, type = "response")>0.5, 1, 0)), factor(df_valid$Above_Avg_Age), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7175 2477
##          1 1325 4023
##                                           
##                Accuracy : 0.7465          
##                  95% CI : (0.7395, 0.7535)
##     No Information Rate : 0.5667          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4729          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.6189          
##             Specificity : 0.8441          
##          Pos Pred Value : 0.7522          
##          Neg Pred Value : 0.7434          
##              Prevalence : 0.4333          
##          Detection Rate : 0.2682          
##    Detection Prevalence : 0.3565          
##       Balanced Accuracy : 0.7315          
##                                           
##        'Positive' Class : 1               
## 
summary(logic_model)
## 
## Call:
## glm(formula = Above_Avg_Age ~ ., family = "binomial", data = df_train)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -2.30073    0.11846 -19.421  < 2e-16 ***
## autopsyU         0.09108    0.02957   3.080 0.002070 ** 
## autopsyY        -1.72202    0.10327 -16.675  < 2e-16 ***
## marital_statusM  0.68809    0.04330  15.893  < 2e-16 ***
## marital_statusS -0.08198    0.06043  -1.357 0.174920    
## marital_statusU  0.34308    0.27570   1.244 0.213363    
## marital_statusW  2.48021    0.04477  55.396  < 2e-16 ***
## sexM            -0.09252    0.02731  -3.388 0.000704 ***
## resident_status -0.31593    0.02732 -11.565  < 2e-16 ***
## injury_at_workU  1.34301    0.10681  12.574  < 2e-16 ***
## injury_at_workY -0.81901    0.62462  -1.311 0.189787    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 47963  on 34999  degrees of freedom
## Residual deviance: 37824  on 34989  degrees of freedom
## AIC: 37846
## 
## Number of Fisher Scoring iterations: 5

Modeling (K-nearest neighbor)

We can only use numerical values for this model

df_death_knn <- df[,c("education_1989_revision", "day_of_week_of_death", "month_of_death", 
                        "race" , "hispanic_origin" ,"detail_age")]
df_death_knn$Above_Avg_Age <- ifelse(df_death_knn$detail_age > 77.4, 1, 0)
df_death_knn <- df_death_knn[,-6]
df_index_knn <- sample(c(1:dim(df_death_knn)[1]),0.7*dim(df_death_knn)[1])
df_train_knn <- df_death_knn[df_index_knn,]
df_valid_knn <- df_death_knn[-df_index_knn,]

kn <- knn(train = df_train_knn[, -6], test = df_valid_knn[,-6], cl = df_train_knn[, 6], k = 5, prob=TRUE)
confusionMatrix(kn, factor(df_valid_knn[,6]), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6273 3879
##          1 2219 2629
##                                           
##                Accuracy : 0.5935          
##                  95% CI : (0.5856, 0.6013)
##     No Information Rate : 0.5661          
##     P-Value [Acc > NIR] : 6.696e-12       
##                                           
##                   Kappa : 0.147           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.4040          
##             Specificity : 0.7387          
##          Pos Pred Value : 0.5423          
##          Neg Pred Value : 0.6179          
##              Prevalence : 0.4339          
##          Detection Rate : 0.1753          
##    Detection Prevalence : 0.3232          
##       Balanced Accuracy : 0.5713          
##                                           
##        'Positive' Class : 1               
## 

Modeling (Tree Model)

tr <- rpart(Above_Avg_Age ~., data = df_death)
confusionMatrix(factor(ifelse(predict(tr, df_valid)>0.5, 1, 0)), 
                factor(df_valid$Above_Avg_Age), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7106 2429
##          1 1394 4071
##                                           
##                Accuracy : 0.7451          
##                  95% CI : (0.7381, 0.7521)
##     No Information Rate : 0.5667          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4711          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.6263          
##             Specificity : 0.8360          
##          Pos Pred Value : 0.7449          
##          Neg Pred Value : 0.7453          
##              Prevalence : 0.4333          
##          Detection Rate : 0.2714          
##    Detection Prevalence : 0.3643          
##       Balanced Accuracy : 0.7312          
##                                           
##        'Positive' Class : 1               
## 

Prediction

res<- data.frame(ActualClass = df_valid$Above_Avg_Age, 
                 LRProb = predict(logic_model, df_valid, type = "response"), 
                 LRPred = ifelse(predict(logic_model, df_valid, type = "response")>0.5, 1, 0), 
                 KNNProb = 1-attr(kn, "prob"), 
                 KNNPred = kn, 
                 TREEProb = predict(tr, df_valid), 
                 TREEPred = ifelse(predict(tr, df_valid)>0.5, 1, 0))

options(digits = 1, scipen = 2)
head(res, 10)
##    ActualClass LRProb LRPred KNNProb KNNPred TREEProb TREEPred
## 3            1    0.2      0     0.4       0      0.2        0
## 12           0    0.3      0     0.4       0      0.3        0
## 26           0    0.3      0     0.2       1      0.3        0
## 30           1    0.4      0     0.2       0      0.3        0
## 32           0    0.3      0     0.5       0      0.3        0
## 37           0    0.1      0     0.5       0      0.3        0
## 40           0    0.2      0     0.5       0      0.2        0
## 41           0    0.8      1     0.2       1      0.7        1
## 43           0    0.3      0     0.3       1      0.3        0
## 48           0    0.3      0     0.5       0      0.3        0
res$majority <- rowMeans(data.frame(res$LRPred, as.numeric(res$KNNPred), 
                                    res$TREEPred))>0.5
res$avg <- rowMeans(data.frame(res$LRProb, res$KNNProb, res$TREEProb))

head(res)
##    ActualClass LRProb LRPred KNNProb KNNPred TREEProb TREEPred majority avg
## 3            1    0.2      0     0.4       0      0.2        0    FALSE 0.3
## 12           0    0.3      0     0.4       0      0.3        0    FALSE 0.4
## 26           0    0.3      0     0.2       1      0.3        0     TRUE 0.3
## 30           1    0.4      0     0.2       0      0.3        0    FALSE 0.3
## 32           0    0.3      0     0.5       0      0.3        0    FALSE 0.4
## 37           0    0.1      0     0.5       0      0.3        0    FALSE 0.3

Evaluation

#confusion matrix using majority vote of predicted outcomes
confusionMatrix(factor(res$majority * 1), factor(df_valid[,6]), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4889 1652
##          1 3611 4848
##                                         
##                Accuracy : 0.649         
##                  95% CI : (0.641, 0.657)
##     No Information Rate : 0.567         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.31          
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.746         
##             Specificity : 0.575         
##          Pos Pred Value : 0.573         
##          Neg Pred Value : 0.747         
##              Prevalence : 0.433         
##          Detection Rate : 0.323         
##    Detection Prevalence : 0.564         
##       Balanced Accuracy : 0.661         
##                                         
##        'Positive' Class : 1             
## 
#confusion matrix using average of predicted probabilities
confusionMatrix(factor((res$avg > 0.5)* 1), factor(df_valid[,6]), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7149 2473
##          1 1351 4027
##                                         
##                Accuracy : 0.745         
##                  95% CI : (0.738, 0.752)
##     No Information Rate : 0.567         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.47          
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.620         
##             Specificity : 0.841         
##          Pos Pred Value : 0.749         
##          Neg Pred Value : 0.743         
##              Prevalence : 0.433         
##          Detection Rate : 0.268         
##    Detection Prevalence : 0.359         
##       Balanced Accuracy : 0.730         
##                                         
##        'Positive' Class : 1             
##