library(class)
library(rpart)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(gains)
# Step 1. Load data: read at most the first 50,000 rows of the 2005 file.
death_df <- read.csv(file = "2005_data.csv", nrows = 50000)
# Step 2. Explore data (str / is.na): start with column names and types.
str(object = death_df)
## 'data.frame': 50000 obs. of 77 variables:
## $ resident_status : int 1 1 1 1 1 1 1 1 1 3 ...
## $ education_1989_revision : int 11 13 12 12 14 3 12 12 14 8 ...
## $ education_2003_revision : logi NA NA NA NA NA NA ...
## $ education_reporting_flag : int 0 0 0 0 0 0 0 0 0 0 ...
## $ month_of_death : int 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : chr "F" "M" "F" "M" ...
## $ detail_age_type : int 1 1 1 1 1 1 1 1 1 1 ...
## $ detail_age : int 45 61 79 50 68 89 68 61 73 85 ...
## $ age_substitution_flag : logi NA NA NA NA NA NA ...
## $ age_recode_52 : int 35 38 41 36 39 43 39 38 40 43 ...
## $ age_recode_27 : int 15 18 21 16 19 23 19 18 20 23 ...
## $ age_recode_12 : int 7 8 10 7 9 11 9 8 9 11 ...
## $ infant_age_recode_22 : int NA NA NA NA NA NA NA NA NA NA ...
## $ place_of_death_and_decedents_status : int 1 1 6 1 1 6 1 1 1 7 ...
## $ marital_status : chr "M" "D" "D" "S" ...
## $ day_of_week_of_death : int 2 7 1 4 2 7 7 6 6 6 ...
## $ current_data_year : int 2005 2005 2005 2005 2005 2005 2005 2005 2005 2005 ...
## $ injury_at_work : chr "U" "U" "U" "U" ...
## $ manner_of_death : int 7 7 7 7 7 7 7 7 7 7 ...
## $ method_of_disposition : chr "U" "U" "U" "U" ...
## $ autopsy : chr "N" "N" "N" "N" ...
## $ activity_code : int NA NA NA NA NA NA NA NA NA NA ...
## $ place_of_injury_for_causes_w00_y34_except_y06_and_y07_: int NA NA NA NA NA NA NA NA NA NA ...
## $ icd_code_10th_revision : chr "C439" "J439" "I698" "E119" ...
## $ X358_cause_recode : int 98 266 239 159 93 239 266 267 266 125 ...
## $ X113_cause_recode : int 28 84 70 46 27 70 84 86 84 43 ...
## $ X130_infant_cause_recode : int NA NA NA NA NA NA NA NA NA NA ...
## $ X39_cause_recode : int 15 28 24 16 8 24 28 28 28 15 ...
## $ number_of_entity_axis_conditions : int 1 1 5 4 3 3 1 3 1 1 ...
## $ entity_condition_1 : chr "11C439" "11J439" "11R628" "11I469" ...
## $ entity_condition_2 : chr "" "" "21I698" "61E119" ...
## $ entity_condition_3 : chr "" "" "61J449" "62I500" ...
## $ entity_condition_4 : chr "" "" "62M199" "63K862" ...
## $ entity_condition_5 : chr "" "" "63R568" "" ...
## $ entity_condition_6 : chr "" "" "" "" ...
## $ entity_condition_7 : chr "" "" "" "" ...
## $ entity_condition_8 : chr "" "" "" "" ...
## $ entity_condition_9 : chr "" "" "" "" ...
## $ entity_condition_10 : chr "" "" "" "" ...
## $ entity_condition_11 : chr "" "" "" "" ...
## $ entity_condition_12 : chr "" "" "" "" ...
## $ entity_condition_13 : chr "" "" "" "" ...
## $ entity_condition_14 : chr "" "" "" "" ...
## $ entity_condition_15 : logi NA NA NA NA NA NA ...
## $ entity_condition_16 : logi NA NA NA NA NA NA ...
## $ entity_condition_17 : logi NA NA NA NA NA NA ...
## $ entity_condition_18 : logi NA NA NA NA NA NA ...
## $ entity_condition_19 : logi NA NA NA NA NA NA ...
## $ entity_condition_20 : logi NA NA NA NA NA NA ...
## $ number_of_record_axis_conditions : int 1 1 5 4 3 3 1 3 1 1 ...
## $ record_condition_1 : chr "C439" "J439" "I698" "E119" ...
## $ record_condition_2 : chr "" "" "J449" "I469" ...
## $ record_condition_3 : chr "" "" "M199" "I500" ...
## $ record_condition_4 : chr "" "" "R568" "K862" ...
## $ record_condition_5 : chr "" "" "R628" "" ...
## $ record_condition_6 : chr "" "" "" "" ...
## $ record_condition_7 : chr "" "" "" "" ...
## $ record_condition_8 : chr "" "" "" "" ...
## $ record_condition_9 : chr "" "" "" "" ...
## $ record_condition_10 : chr "" "" "" "" ...
## $ record_condition_11 : chr "" "" "" "" ...
## $ record_condition_12 : chr "" "" "" "" ...
## $ record_condition_13 : chr "" "" "" "" ...
## $ record_condition_14 : chr "" "" "" "" ...
## $ record_condition_15 : logi NA NA NA NA NA NA ...
## $ record_condition_16 : logi NA NA NA NA NA NA ...
## $ record_condition_17 : logi NA NA NA NA NA NA ...
## $ record_condition_18 : logi NA NA NA NA NA NA ...
## $ record_condition_19 : logi NA NA NA NA NA NA ...
## $ record_condition_20 : logi NA NA NA NA NA NA ...
## $ race : int 1 1 1 1 1 3 1 3 1 1 ...
## $ bridged_race_flag : logi NA NA NA NA NA NA ...
## $ race_imputation_flag : int NA NA NA NA NA NA NA NA NA NA ...
## $ race_recode_3 : int 1 1 1 1 1 2 1 2 1 1 ...
## $ race_recode_5 : int 1 1 1 1 1 3 1 3 1 1 ...
## $ hispanic_origin : int 100 100 100 100 100 100 100 100 100 100 ...
## $ hispanic_originrace_recode : int 6 6 6 6 6 8 6 8 6 6 ...
# Number of missing (NA) values in each column, reported by column name.
vapply(death_df, function(column) sum(is.na(column)), numeric(1))
## resident_status
## 0
## education_1989_revision
## 0
## education_2003_revision
## 50000
## education_reporting_flag
## 0
## month_of_death
## 0
## sex
## 0
## detail_age_type
## 0
## detail_age
## 0
## age_substitution_flag
## 50000
## age_recode_52
## 0
## age_recode_27
## 0
## age_recode_12
## 0
## infant_age_recode_22
## 49367
## place_of_death_and_decedents_status
## 0
## marital_status
## 0
## day_of_week_of_death
## 0
## current_data_year
## 0
## injury_at_work
## 0
## manner_of_death
## 10352
## method_of_disposition
## 0
## autopsy
## 0
## activity_code
## 46108
## place_of_injury_for_causes_w00_y34_except_y06_and_y07_
## 46918
## icd_code_10th_revision
## 0
## X358_cause_recode
## 0
## X113_cause_recode
## 0
## X130_infant_cause_recode
## 49367
## X39_cause_recode
## 0
## number_of_entity_axis_conditions
## 0
## entity_condition_1
## 0
## entity_condition_2
## 0
## entity_condition_3
## 0
## entity_condition_4
## 0
## entity_condition_5
## 0
## entity_condition_6
## 0
## entity_condition_7
## 0
## entity_condition_8
## 0
## entity_condition_9
## 0
## entity_condition_10
## 0
## entity_condition_11
## 0
## entity_condition_12
## 0
## entity_condition_13
## 0
## entity_condition_14
## 0
## entity_condition_15
## 50000
## entity_condition_16
## 50000
## entity_condition_17
## 50000
## entity_condition_18
## 50000
## entity_condition_19
## 50000
## entity_condition_20
## 50000
## number_of_record_axis_conditions
## 0
## record_condition_1
## 0
## record_condition_2
## 0
## record_condition_3
## 0
## record_condition_4
## 0
## record_condition_5
## 0
## record_condition_6
## 0
## record_condition_7
## 0
## record_condition_8
## 0
## record_condition_9
## 0
## record_condition_10
## 0
## record_condition_11
## 0
## record_condition_12
## 0
## record_condition_13
## 0
## record_condition_14
## 0
## record_condition_15
## 50000
## record_condition_16
## 50000
## record_condition_17
## 50000
## record_condition_18
## 50000
## record_condition_19
## 50000
## record_condition_20
## 50000
## race
## 0
## bridged_race_flag
## 50000
## race_imputation_flag
## 49975
## race_recode_3
## 0
## race_recode_5
## 0
## hispanic_origin
## 0
## hispanic_originrace_recode
## 0
# Step 3. Pre-process data (select columns / ifelse / sample)
# Keep the five predictors plus detail_age; detail_age is only needed to derive
# the binary outcome and is removed again afterwards.
predictor_cols <- c(
  "education_1989_revision", "marital_status", "hispanic_origin",
  "month_of_death", "injury_at_work"
)
new_death_df <- death_df[, c(predictor_cols, "detail_age")]

# Outcome: 1 when detail_age is above the cutoff, 0 otherwise.
# NOTE(review): detail_age is compared as-is and detail_age_type is ignored;
# presumably some rows use other age units or a "not stated" code - confirm
# against the data dictionary before trusting the labels for those rows.
age_cutoff <- 77.3
new_death_df$Above_age <- ifelse(new_death_df$detail_age > age_cutoff, 1, 0)

# Drop the raw age by name rather than by position, so a change to the column
# selection above cannot silently remove the wrong column.
new_death_df$detail_age <- NULL

# 60/40 train/validation split; the seed is fixed so the split is repeatable.
set.seed(1)
n_rows <- nrow(new_death_df)
# seq_len() stays empty for a zero-row frame (1:0 would yield c(1, 0)), and
# floor() makes the whole-number training size explicit.
train_index <- sample(seq_len(n_rows), size = floor(n_rows * 0.6))
valid_index <- setdiff(seq_len(n_rows), train_index)
train_df <- new_death_df[train_index, ]
valid_df <- new_death_df[valid_index, ]
# Step 4. Modeling ((1) logistic regression, (2) k-nearest neighbors with
# k = 3, and (3) classification trees.)
# (1) Logistic regression: model Above_age on every other column of train_df.
logic_model <- glm(Above_age ~ ., data = train_df, family = "binomial")

# Score the validation rows; a row is labelled 1 when its predicted
# probability exceeds 0.5 and 0 otherwise.
valid_prob <- predict(logic_model, newdata = valid_df, type = "response")
valid_pred <- ifelse(valid_prob > 0.5, 1, 0)

# Give both factors the same explicit levels. Without this, a run in which
# every prediction falls on one side of 0.5 yields a one-level prediction
# factor that no longer matches the reference factor.
class_levels <- c(0, 1)
confusionMatrix(
  factor(valid_pred, levels = class_levels),
  factor(valid_df$Above_age, levels = class_levels),
  positive = "1"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 9474 3292
## 1 1785 5449
##
## Accuracy : 0.7462
## 95% CI : (0.7401, 0.7522)
## No Information Rate : 0.563
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.474
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.6234
## Specificity : 0.8415
## Pos Pred Value : 0.7532
## Neg Pred Value : 0.7421
## Prevalence : 0.4370
## Detection Rate : 0.2725
## Detection Prevalence : 0.3617
## Balanced Accuracy : 0.7324
##
## 'Positive' Class : 1
##
# Print the fitted logistic model's coefficient table (estimates, standard
# errors, z values, p-values) together with its deviance and AIC figures.
summary(logic_model)
##
## Call:
## glm(formula = Above_age ~ ., family = "binomial", data = train_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1733947 0.1387563 -22.870 <2e-16 ***
## education_1989_revision 0.0003449 0.0010959 0.315 0.7530
## marital_statusM 0.7270512 0.0472969 15.372 <2e-16 ***
## marital_statusS -0.1120922 0.0652631 -1.718 0.0859 .
## marital_statusU 0.6144575 0.2810492 2.186 0.0288 *
## marital_statusW 2.5771236 0.0481655 53.506 <2e-16 ***
## hispanic_origin -0.0008384 0.0006857 -1.223 0.2214
## month_of_death -0.0056564 0.0038173 -1.482 0.1384
## injury_at_workU 1.8228281 0.1094851 16.649 <2e-16 ***
## injury_at_workY -1.0392959 0.7395366 -1.405 0.1599
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 41082 on 29999 degrees of freedom
## Residual deviance: 32921 on 29990 degrees of freedom
## AIC: 32941
##
## Number of Fisher Scoring iterations: 5