#Step 0. Load required libraries

library(class)
library(rpart)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(gains)

#Step 1. Import data (read.csv/data)

# Load at most the first 50,000 data rows of the 2005 file into death_df.
death_df <- read.csv(file = "2005_data.csv", nrows = 50000)

#Step 2. Explore data (str/is.na)

# Print a compact overview of death_df: one line per column with its type
# and first few values.
utils::str(death_df)
## 'data.frame':    50000 obs. of  77 variables:
##  $ resident_status                                       : int  1 1 1 1 1 1 1 1 1 3 ...
##  $ education_1989_revision                               : int  11 13 12 12 14 3 12 12 14 8 ...
##  $ education_2003_revision                               : logi  NA NA NA NA NA NA ...
##  $ education_reporting_flag                              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ month_of_death                                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ sex                                                   : chr  "F" "M" "F" "M" ...
##  $ detail_age_type                                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ detail_age                                            : int  45 61 79 50 68 89 68 61 73 85 ...
##  $ age_substitution_flag                                 : logi  NA NA NA NA NA NA ...
##  $ age_recode_52                                         : int  35 38 41 36 39 43 39 38 40 43 ...
##  $ age_recode_27                                         : int  15 18 21 16 19 23 19 18 20 23 ...
##  $ age_recode_12                                         : int  7 8 10 7 9 11 9 8 9 11 ...
##  $ infant_age_recode_22                                  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ place_of_death_and_decedents_status                   : int  1 1 6 1 1 6 1 1 1 7 ...
##  $ marital_status                                        : chr  "M" "D" "D" "S" ...
##  $ day_of_week_of_death                                  : int  2 7 1 4 2 7 7 6 6 6 ...
##  $ current_data_year                                     : int  2005 2005 2005 2005 2005 2005 2005 2005 2005 2005 ...
##  $ injury_at_work                                        : chr  "U" "U" "U" "U" ...
##  $ manner_of_death                                       : int  7 7 7 7 7 7 7 7 7 7 ...
##  $ method_of_disposition                                 : chr  "U" "U" "U" "U" ...
##  $ autopsy                                               : chr  "N" "N" "N" "N" ...
##  $ activity_code                                         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ place_of_injury_for_causes_w00_y34_except_y06_and_y07_: int  NA NA NA NA NA NA NA NA NA NA ...
##  $ icd_code_10th_revision                                : chr  "C439" "J439" "I698" "E119" ...
##  $ X358_cause_recode                                     : int  98 266 239 159 93 239 266 267 266 125 ...
##  $ X113_cause_recode                                     : int  28 84 70 46 27 70 84 86 84 43 ...
##  $ X130_infant_cause_recode                              : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ X39_cause_recode                                      : int  15 28 24 16 8 24 28 28 28 15 ...
##  $ number_of_entity_axis_conditions                      : int  1 1 5 4 3 3 1 3 1 1 ...
##  $ entity_condition_1                                    : chr  "11C439" "11J439" "11R628" "11I469" ...
##  $ entity_condition_2                                    : chr  "" "" "21I698" "61E119" ...
##  $ entity_condition_3                                    : chr  "" "" "61J449" "62I500" ...
##  $ entity_condition_4                                    : chr  "" "" "62M199" "63K862" ...
##  $ entity_condition_5                                    : chr  "" "" "63R568" "" ...
##  $ entity_condition_6                                    : chr  "" "" "" "" ...
##  $ entity_condition_7                                    : chr  "" "" "" "" ...
##  $ entity_condition_8                                    : chr  "" "" "" "" ...
##  $ entity_condition_9                                    : chr  "" "" "" "" ...
##  $ entity_condition_10                                   : chr  "" "" "" "" ...
##  $ entity_condition_11                                   : chr  "" "" "" "" ...
##  $ entity_condition_12                                   : chr  "" "" "" "" ...
##  $ entity_condition_13                                   : chr  "" "" "" "" ...
##  $ entity_condition_14                                   : chr  "" "" "" "" ...
##  $ entity_condition_15                                   : logi  NA NA NA NA NA NA ...
##  $ entity_condition_16                                   : logi  NA NA NA NA NA NA ...
##  $ entity_condition_17                                   : logi  NA NA NA NA NA NA ...
##  $ entity_condition_18                                   : logi  NA NA NA NA NA NA ...
##  $ entity_condition_19                                   : logi  NA NA NA NA NA NA ...
##  $ entity_condition_20                                   : logi  NA NA NA NA NA NA ...
##  $ number_of_record_axis_conditions                      : int  1 1 5 4 3 3 1 3 1 1 ...
##  $ record_condition_1                                    : chr  "C439" "J439" "I698" "E119" ...
##  $ record_condition_2                                    : chr  "" "" "J449" "I469" ...
##  $ record_condition_3                                    : chr  "" "" "M199" "I500" ...
##  $ record_condition_4                                    : chr  "" "" "R568" "K862" ...
##  $ record_condition_5                                    : chr  "" "" "R628" "" ...
##  $ record_condition_6                                    : chr  "" "" "" "" ...
##  $ record_condition_7                                    : chr  "" "" "" "" ...
##  $ record_condition_8                                    : chr  "" "" "" "" ...
##  $ record_condition_9                                    : chr  "" "" "" "" ...
##  $ record_condition_10                                   : chr  "" "" "" "" ...
##  $ record_condition_11                                   : chr  "" "" "" "" ...
##  $ record_condition_12                                   : chr  "" "" "" "" ...
##  $ record_condition_13                                   : chr  "" "" "" "" ...
##  $ record_condition_14                                   : chr  "" "" "" "" ...
##  $ record_condition_15                                   : logi  NA NA NA NA NA NA ...
##  $ record_condition_16                                   : logi  NA NA NA NA NA NA ...
##  $ record_condition_17                                   : logi  NA NA NA NA NA NA ...
##  $ record_condition_18                                   : logi  NA NA NA NA NA NA ...
##  $ record_condition_19                                   : logi  NA NA NA NA NA NA ...
##  $ record_condition_20                                   : logi  NA NA NA NA NA NA ...
##  $ race                                                  : int  1 1 1 1 1 3 1 3 1 1 ...
##  $ bridged_race_flag                                     : logi  NA NA NA NA NA NA ...
##  $ race_imputation_flag                                  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ race_recode_3                                         : int  1 1 1 1 1 2 1 2 1 1 ...
##  $ race_recode_5                                         : int  1 1 1 1 1 3 1 3 1 1 ...
##  $ hispanic_origin                                       : int  100 100 100 100 100 100 100 100 100 100 ...
##  $ hispanic_originrace_recode                            : int  6 6 6 6 6 8 6 8 6 6 ...
# Count the missing (NA) entries in each column of death_df.
colSums(
  x = is.na(death_df)
)
##                                        resident_status 
##                                                      0 
##                                education_1989_revision 
##                                                      0 
##                                education_2003_revision 
##                                                  50000 
##                               education_reporting_flag 
##                                                      0 
##                                         month_of_death 
##                                                      0 
##                                                    sex 
##                                                      0 
##                                        detail_age_type 
##                                                      0 
##                                             detail_age 
##                                                      0 
##                                  age_substitution_flag 
##                                                  50000 
##                                          age_recode_52 
##                                                      0 
##                                          age_recode_27 
##                                                      0 
##                                          age_recode_12 
##                                                      0 
##                                   infant_age_recode_22 
##                                                  49367 
##                    place_of_death_and_decedents_status 
##                                                      0 
##                                         marital_status 
##                                                      0 
##                                   day_of_week_of_death 
##                                                      0 
##                                      current_data_year 
##                                                      0 
##                                         injury_at_work 
##                                                      0 
##                                        manner_of_death 
##                                                  10352 
##                                  method_of_disposition 
##                                                      0 
##                                                autopsy 
##                                                      0 
##                                          activity_code 
##                                                  46108 
## place_of_injury_for_causes_w00_y34_except_y06_and_y07_ 
##                                                  46918 
##                                 icd_code_10th_revision 
##                                                      0 
##                                      X358_cause_recode 
##                                                      0 
##                                      X113_cause_recode 
##                                                      0 
##                               X130_infant_cause_recode 
##                                                  49367 
##                                       X39_cause_recode 
##                                                      0 
##                       number_of_entity_axis_conditions 
##                                                      0 
##                                     entity_condition_1 
##                                                      0 
##                                     entity_condition_2 
##                                                      0 
##                                     entity_condition_3 
##                                                      0 
##                                     entity_condition_4 
##                                                      0 
##                                     entity_condition_5 
##                                                      0 
##                                     entity_condition_6 
##                                                      0 
##                                     entity_condition_7 
##                                                      0 
##                                     entity_condition_8 
##                                                      0 
##                                     entity_condition_9 
##                                                      0 
##                                    entity_condition_10 
##                                                      0 
##                                    entity_condition_11 
##                                                      0 
##                                    entity_condition_12 
##                                                      0 
##                                    entity_condition_13 
##                                                      0 
##                                    entity_condition_14 
##                                                      0 
##                                    entity_condition_15 
##                                                  50000 
##                                    entity_condition_16 
##                                                  50000 
##                                    entity_condition_17 
##                                                  50000 
##                                    entity_condition_18 
##                                                  50000 
##                                    entity_condition_19 
##                                                  50000 
##                                    entity_condition_20 
##                                                  50000 
##                       number_of_record_axis_conditions 
##                                                      0 
##                                     record_condition_1 
##                                                      0 
##                                     record_condition_2 
##                                                      0 
##                                     record_condition_3 
##                                                      0 
##                                     record_condition_4 
##                                                      0 
##                                     record_condition_5 
##                                                      0 
##                                     record_condition_6 
##                                                      0 
##                                     record_condition_7 
##                                                      0 
##                                     record_condition_8 
##                                                      0 
##                                     record_condition_9 
##                                                      0 
##                                    record_condition_10 
##                                                      0 
##                                    record_condition_11 
##                                                      0 
##                                    record_condition_12 
##                                                      0 
##                                    record_condition_13 
##                                                      0 
##                                    record_condition_14 
##                                                      0 
##                                    record_condition_15 
##                                                  50000 
##                                    record_condition_16 
##                                                  50000 
##                                    record_condition_17 
##                                                  50000 
##                                    record_condition_18 
##                                                  50000 
##                                    record_condition_19 
##                                                  50000 
##                                    record_condition_20 
##                                                  50000 
##                                                   race 
##                                                      0 
##                                      bridged_race_flag 
##                                                  50000 
##                                   race_imputation_flag 
##                                                  49975 
##                                          race_recode_3 
##                                                      0 
##                                          race_recode_5 
##                                                      0 
##                                        hispanic_origin 
##                                                      0 
##                             hispanic_originrace_recode 
##                                                      0

#Step 3. Pre-process data (na.omit/ifelse/preProcess/sample)

# Build the modelling frame: five predictors taken from death_df plus a
# binary target, Above_age, that is 1 when detail_age exceeds the cutoff and
# 0 otherwise (NA where detail_age is NA).
predictor_cols <- c(
  "education_1989_revision", "marital_status", "hispanic_origin",
  "month_of_death", "injury_at_work"
)
age_cutoff <- 77.3

# detail_age is never copied into the frame, so no positional column drop is
# needed. A positional drop would silently remove the wrong column, and could
# leave the raw age in as a predictor, as soon as the predictor list changed.
new_death_df <- death_df[, predictor_cols, drop = FALSE]
new_death_df$Above_age <- as.numeric(death_df$detail_age > age_cutoff)

# NOTE(review): detail_age is thresholded without consulting detail_age_type.
# Confirm that every row is expressed in the same unit and that no sentinel
# code for an unknown age is present in detail_age.

# Split new_death_df at random into a 60% training partition and a 40%
# validation partition.
set.seed(1)  # fixed seed so the same rows are drawn on every run
n_rows <- nrow(new_death_df)
# seq_len() stays empty for a zero-row frame, whereas 1:n would give c(1, 0).
all_rows <- seq_len(n_rows)
# sample() needs a whole-number size; floor() makes the truncation explicit.
train_index <- sample(all_rows, size = floor(0.6 * n_rows))
# Every row that was not drawn for training goes to validation.
valid_index <- setdiff(all_rows, train_index)
train_df <- new_death_df[train_index, ]
valid_df <- new_death_df[valid_index, ]

#Step 4. Modeling ((1) logistic regression, (2) k-nearest neighbors with k = 3, and (3) classification trees.)

# Fit a logistic regression of Above_age on every other column of train_df.
logic_model <- glm(Above_age ~ ., data = train_df, family = "binomial")

# Score the validation rows and classify them at a 0.5 probability cutoff.
valid_prob <- predict(logic_model, newdata = valid_df, type = "response")
valid_pred <- ifelse(valid_prob > 0.5, 1, 0)

# Give both factors the full 0/1 level set explicitly. Letting factor() infer
# the levels would leave the two factors with different level sets whenever
# only one class is predicted (or present in the validation rows).
class_levels <- c(0, 1)
confusionMatrix(
  factor(valid_pred, levels = class_levels),
  factor(valid_df$Above_age, levels = class_levels),
  positive = "1"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 9474 3292
##          1 1785 5449
##                                           
##                Accuracy : 0.7462          
##                  95% CI : (0.7401, 0.7522)
##     No Information Rate : 0.563           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.474           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.6234          
##             Specificity : 0.8415          
##          Pos Pred Value : 0.7532          
##          Neg Pred Value : 0.7421          
##              Prevalence : 0.4370          
##          Detection Rate : 0.2725          
##    Detection Prevalence : 0.3617          
##       Balanced Accuracy : 0.7324          
##                                           
##        'Positive' Class : 1               
## 
# Print the fitted logistic model's coefficient table and deviance statistics.
summary(object = logic_model)
## 
## Call:
## glm(formula = Above_age ~ ., family = "binomial", data = train_df)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -3.1733947  0.1387563 -22.870   <2e-16 ***
## education_1989_revision  0.0003449  0.0010959   0.315   0.7530    
## marital_statusM          0.7270512  0.0472969  15.372   <2e-16 ***
## marital_statusS         -0.1120922  0.0652631  -1.718   0.0859 .  
## marital_statusU          0.6144575  0.2810492   2.186   0.0288 *  
## marital_statusW          2.5771236  0.0481655  53.506   <2e-16 ***
## hispanic_origin         -0.0008384  0.0006857  -1.223   0.2214    
## month_of_death          -0.0056564  0.0038173  -1.482   0.1384    
## injury_at_workU          1.8228281  0.1094851  16.649   <2e-16 ***
## injury_at_workY         -1.0392959  0.7395366  -1.405   0.1599    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 41082  on 29999  degrees of freedom
## Residual deviance: 32921  on 29990  degrees of freedom
## AIC: 32941
## 
## Number of Fisher Scoring iterations: 5