setwd("/Users/S/Downloads")
# Load the data
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(class)
library(rpart)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
death_df <- read.csv('2005_data.csv', nrows = 50000)
str(death_df)
## 'data.frame': 50000 obs. of 77 variables:
## $ resident_status : int 1 1 1 1 1 1 1 1 1 3 ...
## $ education_1989_revision : int 11 13 12 12 14 3 12 12 14 8 ...
## $ education_2003_revision : logi NA NA NA NA NA NA ...
## $ education_reporting_flag : int 0 0 0 0 0 0 0 0 0 0 ...
## $ month_of_death : int 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : chr "F" "M" "F" "M" ...
## $ detail_age_type : int 1 1 1 1 1 1 1 1 1 1 ...
## $ detail_age : int 45 61 79 50 68 89 68 61 73 85 ...
## $ age_substitution_flag : logi NA NA NA NA NA NA ...
## $ age_recode_52 : int 35 38 41 36 39 43 39 38 40 43 ...
## $ age_recode_27 : int 15 18 21 16 19 23 19 18 20 23 ...
## $ age_recode_12 : int 7 8 10 7 9 11 9 8 9 11 ...
## $ infant_age_recode_22 : int NA NA NA NA NA NA NA NA NA NA ...
## $ place_of_death_and_decedents_status : int 1 1 6 1 1 6 1 1 1 7 ...
## $ marital_status : chr "M" "D" "D" "S" ...
## $ day_of_week_of_death : int 2 7 1 4 2 7 7 6 6 6 ...
## $ current_data_year : int 2005 2005 2005 2005 2005 2005 2005 2005 2005 2005 ...
## $ injury_at_work : chr "U" "U" "U" "U" ...
## $ manner_of_death : int 7 7 7 7 7 7 7 7 7 7 ...
## $ method_of_disposition : chr "U" "U" "U" "U" ...
## $ autopsy : chr "N" "N" "N" "N" ...
## $ activity_code : int NA NA NA NA NA NA NA NA NA NA ...
## $ place_of_injury_for_causes_w00_y34_except_y06_and_y07_: int NA NA NA NA NA NA NA NA NA NA ...
## $ icd_code_10th_revision : chr "C439" "J439" "I698" "E119" ...
## $ X358_cause_recode : int 98 266 239 159 93 239 266 267 266 125 ...
## $ X113_cause_recode : int 28 84 70 46 27 70 84 86 84 43 ...
## $ X130_infant_cause_recode : int NA NA NA NA NA NA NA NA NA NA ...
## $ X39_cause_recode : int 15 28 24 16 8 24 28 28 28 15 ...
## $ number_of_entity_axis_conditions : int 1 1 5 4 3 3 1 3 1 1 ...
## $ entity_condition_1 : chr "11C439" "11J439" "11R628" "11I469" ...
## $ entity_condition_2 : chr "" "" "21I698" "61E119" ...
## $ entity_condition_3 : chr "" "" "61J449" "62I500" ...
## $ entity_condition_4 : chr "" "" "62M199" "63K862" ...
## $ entity_condition_5 : chr "" "" "63R568" "" ...
## $ entity_condition_6 : chr "" "" "" "" ...
## $ entity_condition_7 : chr "" "" "" "" ...
## $ entity_condition_8 : chr "" "" "" "" ...
## $ entity_condition_9 : chr "" "" "" "" ...
## $ entity_condition_10 : chr "" "" "" "" ...
## $ entity_condition_11 : chr "" "" "" "" ...
## $ entity_condition_12 : chr "" "" "" "" ...
## $ entity_condition_13 : chr "" "" "" "" ...
## $ entity_condition_14 : chr "" "" "" "" ...
## $ entity_condition_15 : logi NA NA NA NA NA NA ...
## $ entity_condition_16 : logi NA NA NA NA NA NA ...
## $ entity_condition_17 : logi NA NA NA NA NA NA ...
## $ entity_condition_18 : logi NA NA NA NA NA NA ...
## $ entity_condition_19 : logi NA NA NA NA NA NA ...
## $ entity_condition_20 : logi NA NA NA NA NA NA ...
## $ number_of_record_axis_conditions : int 1 1 5 4 3 3 1 3 1 1 ...
## $ record_condition_1 : chr "C439" "J439" "I698" "E119" ...
## $ record_condition_2 : chr "" "" "J449" "I469" ...
## $ record_condition_3 : chr "" "" "M199" "I500" ...
## $ record_condition_4 : chr "" "" "R568" "K862" ...
## $ record_condition_5 : chr "" "" "R628" "" ...
## $ record_condition_6 : chr "" "" "" "" ...
## $ record_condition_7 : chr "" "" "" "" ...
## $ record_condition_8 : chr "" "" "" "" ...
## $ record_condition_9 : chr "" "" "" "" ...
## $ record_condition_10 : chr "" "" "" "" ...
## $ record_condition_11 : chr "" "" "" "" ...
## $ record_condition_12 : chr "" "" "" "" ...
## $ record_condition_13 : chr "" "" "" "" ...
## $ record_condition_14 : chr "" "" "" "" ...
## $ record_condition_15 : logi NA NA NA NA NA NA ...
## $ record_condition_16 : logi NA NA NA NA NA NA ...
## $ record_condition_17 : logi NA NA NA NA NA NA ...
## $ record_condition_18 : logi NA NA NA NA NA NA ...
## $ record_condition_19 : logi NA NA NA NA NA NA ...
## $ record_condition_20 : logi NA NA NA NA NA NA ...
## $ race : int 1 1 1 1 1 3 1 3 1 1 ...
## $ bridged_race_flag : logi NA NA NA NA NA NA ...
## $ race_imputation_flag : int NA NA NA NA NA NA NA NA NA NA ...
## $ race_recode_3 : int 1 1 1 1 1 2 1 2 1 1 ...
## $ race_recode_5 : int 1 1 1 1 1 3 1 3 1 1 ...
## $ hispanic_origin : int 100 100 100 100 100 100 100 100 100 100 ...
## $ hispanic_originrace_recode : int 6 6 6 6 6 8 6 8 6 6 ...
sort(colSums(is.na(death_df)), decreasing = TRUE)
## education_2003_revision
## 50000
## age_substitution_flag
## 50000
## entity_condition_15
## 50000
## entity_condition_16
## 50000
## entity_condition_17
## 50000
## entity_condition_18
## 50000
## entity_condition_19
## 50000
## entity_condition_20
## 50000
## record_condition_15
## 50000
## record_condition_16
## 50000
## record_condition_17
## 50000
## record_condition_18
## 50000
## record_condition_19
## 50000
## record_condition_20
## 50000
## bridged_race_flag
## 50000
## race_imputation_flag
## 49975
## infant_age_recode_22
## 49367
## X130_infant_cause_recode
## 49367
## place_of_injury_for_causes_w00_y34_except_y06_and_y07_
## 46918
## activity_code
## 46108
## manner_of_death
## 10352
## resident_status
## 0
## education_1989_revision
## 0
## education_reporting_flag
## 0
## month_of_death
## 0
## sex
## 0
## detail_age_type
## 0
## detail_age
## 0
## age_recode_52
## 0
## age_recode_27
## 0
## age_recode_12
## 0
## place_of_death_and_decedents_status
## 0
## marital_status
## 0
## day_of_week_of_death
## 0
## current_data_year
## 0
## injury_at_work
## 0
## method_of_disposition
## 0
## autopsy
## 0
## icd_code_10th_revision
## 0
## X358_cause_recode
## 0
## X113_cause_recode
## 0
## X39_cause_recode
## 0
## number_of_entity_axis_conditions
## 0
## entity_condition_1
## 0
## entity_condition_2
## 0
## entity_condition_3
## 0
## entity_condition_4
## 0
## entity_condition_5
## 0
## entity_condition_6
## 0
## entity_condition_7
## 0
## entity_condition_8
## 0
## entity_condition_9
## 0
## entity_condition_10
## 0
## entity_condition_11
## 0
## entity_condition_12
## 0
## entity_condition_13
## 0
## entity_condition_14
## 0
## number_of_record_axis_conditions
## 0
## record_condition_1
## 0
## record_condition_2
## 0
## record_condition_3
## 0
## record_condition_4
## 0
## record_condition_5
## 0
## record_condition_6
## 0
## record_condition_7
## 0
## record_condition_8
## 0
## record_condition_9
## 0
## record_condition_10
## 0
## record_condition_11
## 0
## record_condition_12
## 0
## record_condition_13
## 0
## record_condition_14
## 0
## race
## 0
## race_recode_3
## 0
## race_recode_5
## 0
## hispanic_origin
## 0
## hispanic_originrace_recode
## 0
new_death_df <- death_df[ , c("sex", "marital_status", "hispanic_origin", "month_of_death",
"detail_age", "injury_at_work")]
#new_death_df
new_death_df$above_age = as.factor(death_df$detail_age>77.3)
#new_death_df
# partition the data into training and validation sets
set.seed(1)
train_index <- sample(c(1:dim(new_death_df)[1]), dim(new_death_df)[1]*0.6)
valid_index <- setdiff(c(1:dim(new_death_df)[1]), train_index)
train_df <- new_death_df[train_index, ]
valid_df <- new_death_df[valid_index, ]
reg <- glm(above_age ~ ., data = train_df, family = "binomial", control = glm.control(maxit = 100))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Make predictions on the validation set
reg_pred <- predict(reg, valid_df, type = "response")
# Create a binary column for predicted outcome
valid_df$LRPred <- as.factor(ifelse(reg_pred > 0.5, 1, 0))
# Set levels for the factor variables
levels(valid_df$above_age) <- levels(valid_df$LRPred) <- c("0", "1")
# Confusion/classification matrix
confusionMatrix(valid_df$LRPred, valid_df$above_age, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 11259 0
## 1 0 8741
##
## Accuracy : 1
## 95% CI : (0.9998, 1)
## No Information Rate : 0.563
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.000
## Specificity : 1.000
## Pos Pred Value : 1.000
## Neg Pred Value : 1.000
## Prevalence : 0.437
## Detection Rate : 0.437
## Detection Prevalence : 0.437
## Balanced Accuracy : 1.000
##
## 'Positive' Class : 1
##
For some reason, every time I tried to fit the knn model it would not work and I couldn’t figure out how to fix it, but here is the code that I would’ve used.
#kn <- class::knn(train = train_df[, -6, drop = FALSE],
# test = valid_df[, -6, drop = FALSE],
# cl = train_df$above_age,
# k = 3, prob = TRUE)
# Confusion/classification matrix
#confusionMatrix(kn, factor(valid_df$above_age), positive = "1")
tr <- rpart(above_age ~., data = new_death_df)
tr
## n= 50000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 50000 21794 FALSE (0.5641200 0.4358800)
## 2) detail_age< 77.5 28206 0 FALSE (1.0000000 0.0000000) *
## 3) detail_age>=77.5 21794 0 TRUE (0.0000000 1.0000000) *
#confusion/classification matrix
confusionMatrix(factor(ifelse(predict(tr, valid_df)[,2]>0.5, 1, 0)),
valid_df$above_age, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 11259 0
## 1 0 8741
##
## Accuracy : 1
## 95% CI : (0.9998, 1)
## No Information Rate : 0.563
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.000
## Specificity : 1.000
## Pos Pred Value : 1.000
## Neg Pred Value : 1.000
## Prevalence : 0.437
## Detection Rate : 0.437
## Detection Prevalence : 0.437
## Balanced Accuracy : 1.000
##
## 'Positive' Class : 1
##
I could not get the prediction to work correctly because I could not figure out how to get my knn model to run, but here is the code that I would have used had it worked.
#Create a data frame with the actual outcome, predicted outcome, and each of the three models. Report the first 10 rows of this data frame.
res <- data.frame(ActualClass = valid_df$above_age,
LRProb = predict(reg, valid_df, type = "response"),
LRPred = ifelse(predict(reg, valid_df, type = "response") > 0.5, 1, 0),
#KNNProb = 1 - attr(kn, "prob"),
#KNNPred = kn,
TREEProb = predict(tr, valid_df)[, 2],
TREEPred = ifelse(predict(tr, valid_df)[, 2] > 0.5, 1, 0))
options(digits = 1, scipen = 2)
head(res, 10)
## ActualClass LRProb LRPred TREEProb TREEPred
## 2 0 2e-16 0 0 0
## 3 1 1e+00 1 1 1
## 6 1 1e+00 1 1 1
## 13 1 1e+00 1 1 1
## 14 0 2e-16 0 0 0
## 16 0 2e-16 0 0 0
## 19 0 2e-16 0 0 0
## 21 1 1e+00 1 1 1
## 23 0 2e-16 0 0 0
## 28 0 2e-16 0 0 0
# Ensemble model
#res$majority <- rowMeans(data.frame(res$LRPred, as.numeric(res$KNNPred), res$TREEPred)) > 0.5
#res$avg <- rowMeans(data.frame(res$LRProb, res$KNNProb, res$TREEProb))
head(res)
## ActualClass LRProb LRPred TREEProb TREEPred
## 2 0 2e-16 0 0 0
## 3 1 1e+00 1 1 1
## 6 1 1e+00 1 1 1
## 13 1 1e+00 1 1 1
## 14 0 2e-16 0 0 0
## 16 0 2e-16 0 0 0
#res$majority <- factor(res$majority, levels = c("0", "1"))
#valid_df$above_age <- factor(valid_df$above_age, levels = c("0", "1"))