df <- read.csv("2005_data.csv", nrows = 50000)
str(df)
## 'data.frame': 50000 obs. of 77 variables:
## $ resident_status : int 1 1 1 1 1 1 1 1 1 3 ...
## $ education_1989_revision : int 11 13 12 12 14 3 12 12 14 8 ...
## $ education_2003_revision : logi NA NA NA NA NA NA ...
## $ education_reporting_flag : int 0 0 0 0 0 0 0 0 0 0 ...
## $ month_of_death : int 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : chr "F" "M" "F" "M" ...
## $ detail_age_type : int 1 1 1 1 1 1 1 1 1 1 ...
## $ detail_age : int 45 61 79 50 68 89 68 61 73 85 ...
## $ age_substitution_flag : logi NA NA NA NA NA NA ...
## $ age_recode_52 : int 35 38 41 36 39 43 39 38 40 43 ...
## $ age_recode_27 : int 15 18 21 16 19 23 19 18 20 23 ...
## $ age_recode_12 : int 7 8 10 7 9 11 9 8 9 11 ...
## $ infant_age_recode_22 : int NA NA NA NA NA NA NA NA NA NA ...
## $ place_of_death_and_decedents_status : int 1 1 6 1 1 6 1 1 1 7 ...
## $ marital_status : chr "M" "D" "D" "S" ...
## $ day_of_week_of_death : int 2 7 1 4 2 7 7 6 6 6 ...
## $ current_data_year : int 2005 2005 2005 2005 2005 2005 2005 2005 2005 2005 ...
## $ injury_at_work : chr "U" "U" "U" "U" ...
## $ manner_of_death : int 7 7 7 7 7 7 7 7 7 7 ...
## $ method_of_disposition : chr "U" "U" "U" "U" ...
## $ autopsy : chr "N" "N" "N" "N" ...
## $ activity_code : int NA NA NA NA NA NA NA NA NA NA ...
## $ place_of_injury_for_causes_w00_y34_except_y06_and_y07_: int NA NA NA NA NA NA NA NA NA NA ...
## $ icd_code_10th_revision : chr "C439" "J439" "I698" "E119" ...
## $ X358_cause_recode : int 98 266 239 159 93 239 266 267 266 125 ...
## $ X113_cause_recode : int 28 84 70 46 27 70 84 86 84 43 ...
## $ X130_infant_cause_recode : int NA NA NA NA NA NA NA NA NA NA ...
## $ X39_cause_recode : int 15 28 24 16 8 24 28 28 28 15 ...
## $ number_of_entity_axis_conditions : int 1 1 5 4 3 3 1 3 1 1 ...
## $ entity_condition_1 : chr "11C439" "11J439" "11R628" "11I469" ...
## $ entity_condition_2 : chr "" "" "21I698" "61E119" ...
## $ entity_condition_3 : chr "" "" "61J449" "62I500" ...
## $ entity_condition_4 : chr "" "" "62M199" "63K862" ...
## $ entity_condition_5 : chr "" "" "63R568" "" ...
## $ entity_condition_6 : chr "" "" "" "" ...
## $ entity_condition_7 : chr "" "" "" "" ...
## $ entity_condition_8 : chr "" "" "" "" ...
## $ entity_condition_9 : chr "" "" "" "" ...
## $ entity_condition_10 : chr "" "" "" "" ...
## $ entity_condition_11 : chr "" "" "" "" ...
## $ entity_condition_12 : chr "" "" "" "" ...
## $ entity_condition_13 : chr "" "" "" "" ...
## $ entity_condition_14 : chr "" "" "" "" ...
## $ entity_condition_15 : logi NA NA NA NA NA NA ...
## $ entity_condition_16 : logi NA NA NA NA NA NA ...
## $ entity_condition_17 : logi NA NA NA NA NA NA ...
## $ entity_condition_18 : logi NA NA NA NA NA NA ...
## $ entity_condition_19 : logi NA NA NA NA NA NA ...
## $ entity_condition_20 : logi NA NA NA NA NA NA ...
## $ number_of_record_axis_conditions : int 1 1 5 4 3 3 1 3 1 1 ...
## $ record_condition_1 : chr "C439" "J439" "I698" "E119" ...
## $ record_condition_2 : chr "" "" "J449" "I469" ...
## $ record_condition_3 : chr "" "" "M199" "I500" ...
## $ record_condition_4 : chr "" "" "R568" "K862" ...
## $ record_condition_5 : chr "" "" "R628" "" ...
## $ record_condition_6 : chr "" "" "" "" ...
## $ record_condition_7 : chr "" "" "" "" ...
## $ record_condition_8 : chr "" "" "" "" ...
## $ record_condition_9 : chr "" "" "" "" ...
## $ record_condition_10 : chr "" "" "" "" ...
## $ record_condition_11 : chr "" "" "" "" ...
## $ record_condition_12 : chr "" "" "" "" ...
## $ record_condition_13 : chr "" "" "" "" ...
## $ record_condition_14 : chr "" "" "" "" ...
## $ record_condition_15 : logi NA NA NA NA NA NA ...
## $ record_condition_16 : logi NA NA NA NA NA NA ...
## $ record_condition_17 : logi NA NA NA NA NA NA ...
## $ record_condition_18 : logi NA NA NA NA NA NA ...
## $ record_condition_19 : logi NA NA NA NA NA NA ...
## $ record_condition_20 : logi NA NA NA NA NA NA ...
## $ race : int 1 1 1 1 1 3 1 3 1 1 ...
## $ bridged_race_flag : logi NA NA NA NA NA NA ...
## $ race_imputation_flag : int NA NA NA NA NA NA NA NA NA NA ...
## $ race_recode_3 : int 1 1 1 1 1 2 1 2 1 1 ...
## $ race_recode_5 : int 1 1 1 1 1 3 1 3 1 1 ...
## $ hispanic_origin : int 100 100 100 100 100 100 100 100 100 100 ...
## $ hispanic_originrace_recode : int 6 6 6 6 6 8 6 8 6 6 ...
colSums(is.na(df))
## resident_status
## 0
## education_1989_revision
## 0
## education_2003_revision
## 50000
## education_reporting_flag
## 0
## month_of_death
## 0
## sex
## 0
## detail_age_type
## 0
## detail_age
## 0
## age_substitution_flag
## 50000
## age_recode_52
## 0
## age_recode_27
## 0
## age_recode_12
## 0
## infant_age_recode_22
## 49367
## place_of_death_and_decedents_status
## 0
## marital_status
## 0
## day_of_week_of_death
## 0
## current_data_year
## 0
## injury_at_work
## 0
## manner_of_death
## 10352
## method_of_disposition
## 0
## autopsy
## 0
## activity_code
## 46108
## place_of_injury_for_causes_w00_y34_except_y06_and_y07_
## 46918
## icd_code_10th_revision
## 0
## X358_cause_recode
## 0
## X113_cause_recode
## 0
## X130_infant_cause_recode
## 49367
## X39_cause_recode
## 0
## number_of_entity_axis_conditions
## 0
## entity_condition_1
## 0
## entity_condition_2
## 0
## entity_condition_3
## 0
## entity_condition_4
## 0
## entity_condition_5
## 0
## entity_condition_6
## 0
## entity_condition_7
## 0
## entity_condition_8
## 0
## entity_condition_9
## 0
## entity_condition_10
## 0
## entity_condition_11
## 0
## entity_condition_12
## 0
## entity_condition_13
## 0
## entity_condition_14
## 0
## entity_condition_15
## 50000
## entity_condition_16
## 50000
## entity_condition_17
## 50000
## entity_condition_18
## 50000
## entity_condition_19
## 50000
## entity_condition_20
## 50000
## number_of_record_axis_conditions
## 0
## record_condition_1
## 0
## record_condition_2
## 0
## record_condition_3
## 0
## record_condition_4
## 0
## record_condition_5
## 0
## record_condition_6
## 0
## record_condition_7
## 0
## record_condition_8
## 0
## record_condition_9
## 0
## record_condition_10
## 0
## record_condition_11
## 0
## record_condition_12
## 0
## record_condition_13
## 0
## record_condition_14
## 0
## record_condition_15
## 50000
## record_condition_16
## 50000
## record_condition_17
## 50000
## record_condition_18
## 50000
## record_condition_19
## 50000
## record_condition_20
## 50000
## race
## 0
## bridged_race_flag
## 50000
## race_imputation_flag
## 49975
## race_recode_3
## 0
## race_recode_5
## 0
## hispanic_origin
## 0
## hispanic_originrace_recode
## 0
df_death <- df[,c("autopsy", "marital_status", "sex",
"resident_status" , "injury_at_work" ,"detail_age")]
df_death$Above_Avg_Age <- ifelse(df_death$detail_age > 77.4, 1, 0)
df_death <- df_death[,-6]
df_index <- sample(c(1:dim(df_death)[1]),0.7*dim(df_death)[1])
df_train <- df_death[df_index,]
df_valid <- df_death[-df_index,]
logic_model <- glm(Above_Avg_Age ~ ., data = df_train, family = "binomial")
confusionMatrix(factor(ifelse(predict(logic_model, df_valid, type = "response")>0.5, 1, 0)), factor(df_valid$Above_Avg_Age), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7175 2477
## 1 1325 4023
##
## Accuracy : 0.7465
## 95% CI : (0.7395, 0.7535)
## No Information Rate : 0.5667
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4729
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.6189
## Specificity : 0.8441
## Pos Pred Value : 0.7522
## Neg Pred Value : 0.7434
## Prevalence : 0.4333
## Detection Rate : 0.2682
## Detection Prevalence : 0.3565
## Balanced Accuracy : 0.7315
##
## 'Positive' Class : 1
##
summary(logic_model)
##
## Call:
## glm(formula = Above_Avg_Age ~ ., family = "binomial", data = df_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.30073 0.11846 -19.421 < 2e-16 ***
## autopsyU 0.09108 0.02957 3.080 0.002070 **
## autopsyY -1.72202 0.10327 -16.675 < 2e-16 ***
## marital_statusM 0.68809 0.04330 15.893 < 2e-16 ***
## marital_statusS -0.08198 0.06043 -1.357 0.174920
## marital_statusU 0.34308 0.27570 1.244 0.213363
## marital_statusW 2.48021 0.04477 55.396 < 2e-16 ***
## sexM -0.09252 0.02731 -3.388 0.000704 ***
## resident_status -0.31593 0.02732 -11.565 < 2e-16 ***
## injury_at_workU 1.34301 0.10681 12.574 < 2e-16 ***
## injury_at_workY -0.81901 0.62462 -1.311 0.189787
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 47963 on 34999 degrees of freedom
## Residual deviance: 37824 on 34989 degrees of freedom
## AIC: 37846
##
## Number of Fisher Scoring iterations: 5
We can only use numerical values for this model
df_death_knn <- df[,c("education_1989_revision", "day_of_week_of_death", "month_of_death",
"race" , "hispanic_origin" ,"detail_age")]
df_death_knn$Above_Avg_Age <- ifelse(df_death_knn$detail_age > 77.4, 1, 0)
df_death_knn <- df_death_knn[,-6]
df_index_knn <- sample(c(1:dim(df_death_knn)[1]),0.7*dim(df_death_knn)[1])
df_train_knn <- df_death_knn[df_index_knn,]
df_valid_knn <- df_death_knn[-df_index_knn,]
kn <- knn(train = df_train_knn[, -6], test = df_valid_knn[,-6], cl = df_train_knn[, 6], k = 5, prob=TRUE)
confusionMatrix(kn, factor(df_valid_knn[,6]), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6273 3879
## 1 2219 2629
##
## Accuracy : 0.5935
## 95% CI : (0.5856, 0.6013)
## No Information Rate : 0.5661
## P-Value [Acc > NIR] : 6.696e-12
##
## Kappa : 0.147
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.4040
## Specificity : 0.7387
## Pos Pred Value : 0.5423
## Neg Pred Value : 0.6179
## Prevalence : 0.4339
## Detection Rate : 0.1753
## Detection Prevalence : 0.3232
## Balanced Accuracy : 0.5713
##
## 'Positive' Class : 1
##
tr <- rpart(Above_Avg_Age ~., data = df_death)
confusionMatrix(factor(ifelse(predict(tr, df_valid)>0.5, 1, 0)),
factor(df_valid$Above_Avg_Age), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7106 2429
## 1 1394 4071
##
## Accuracy : 0.7451
## 95% CI : (0.7381, 0.7521)
## No Information Rate : 0.5667
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4711
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.6263
## Specificity : 0.8360
## Pos Pred Value : 0.7449
## Neg Pred Value : 0.7453
## Prevalence : 0.4333
## Detection Rate : 0.2714
## Detection Prevalence : 0.3643
## Balanced Accuracy : 0.7312
##
## 'Positive' Class : 1
##
res<- data.frame(ActualClass = df_valid$Above_Avg_Age,
LRProb = predict(logic_model, df_valid, type = "response"),
LRPred = ifelse(predict(logic_model, df_valid, type = "response")>0.5, 1, 0),
KNNProb = 1-attr(kn, "prob"),
KNNPred = kn,
TREEProb = predict(tr, df_valid),
TREEPred = ifelse(predict(tr, df_valid)>0.5, 1, 0))
options(digits = 1, scipen = 2)
head(res, 10)
## ActualClass LRProb LRPred KNNProb KNNPred TREEProb TREEPred
## 3 1 0.2 0 0.4 0 0.2 0
## 12 0 0.3 0 0.4 0 0.3 0
## 26 0 0.3 0 0.2 1 0.3 0
## 30 1 0.4 0 0.2 0 0.3 0
## 32 0 0.3 0 0.5 0 0.3 0
## 37 0 0.1 0 0.5 0 0.3 0
## 40 0 0.2 0 0.5 0 0.2 0
## 41 0 0.8 1 0.2 1 0.7 1
## 43 0 0.3 0 0.3 1 0.3 0
## 48 0 0.3 0 0.5 0 0.3 0
res$majority <- rowMeans(data.frame(res$LRPred, as.numeric(res$KNNPred),
res$TREEPred))>0.5
res$avg <- rowMeans(data.frame(res$LRProb, res$KNNProb, res$TREEProb))
head(res)
## ActualClass LRProb LRPred KNNProb KNNPred TREEProb TREEPred majority avg
## 3 1 0.2 0 0.4 0 0.2 0 FALSE 0.3
## 12 0 0.3 0 0.4 0 0.3 0 FALSE 0.4
## 26 0 0.3 0 0.2 1 0.3 0 TRUE 0.3
## 30 1 0.4 0 0.2 0 0.3 0 FALSE 0.3
## 32 0 0.3 0 0.5 0 0.3 0 FALSE 0.4
## 37 0 0.1 0 0.5 0 0.3 0 FALSE 0.3
#confusion matrix using majority vote of predicted outcomes
confusionMatrix(factor(res$majority * 1), factor(df_valid[,6]), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4889 1652
## 1 3611 4848
##
## Accuracy : 0.649
## 95% CI : (0.641, 0.657)
## No Information Rate : 0.567
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.31
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.746
## Specificity : 0.575
## Pos Pred Value : 0.573
## Neg Pred Value : 0.747
## Prevalence : 0.433
## Detection Rate : 0.323
## Detection Prevalence : 0.564
## Balanced Accuracy : 0.661
##
## 'Positive' Class : 1
##
#confusion matrix using average of predicted probabilities
confusionMatrix(factor((res$avg > 0.5)* 1), factor(df_valid[,6]), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7149 2473
## 1 1351 4027
##
## Accuracy : 0.745
## 95% CI : (0.738, 0.752)
## No Information Rate : 0.567
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.47
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.620
## Specificity : 0.841
## Pos Pred Value : 0.749
## Neg Pred Value : 0.743
## Prevalence : 0.433
## Detection Rate : 0.268
## Detection Prevalence : 0.359
## Balanced Accuracy : 0.730
##
## 'Positive' Class : 1
##