library("ggplot2")
library("reshape2")
library("data.table")
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
library("DescTools")
##
## Attaching package: 'DescTools'
## The following object is masked from 'package:data.table':
##
## %like%
library("pscl")
## Warning: package 'pscl' was built under R version 3.4.3
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library("PresenceAbsence")
load_data <- function(lab, set, mp){
#Read in data
tmp <- read.csv(paste("~/goals_of_care/regex_v2/",lab,"_",set,"_processed.csv", sep = ''),
header = T, stringsAsFactors = F, quote = "", row.names = NULL)
#Change "filename" to file for consistency
colnames(tmp)[which(colnames(tmp) == "filename")] <- "file"
#Subset unique filenames
tmp <- unique(tmp$file)
#Take substring of names to ensure we can map across both train_/valid_ files
tmp <- substr(tmp, nchar(tmp) - 9, nchar(tmp))
#Subset files in train set from key map
tmp <- mp[(mp$file %in% tmp),]
#Return the mapped data
return(tmp)
}
strict_regex <- function(kwds, texts){
#Create a list to store results
tmpList <- list()
#Loop through all keywords
for (i in 1:length(kwds)){
#Store results as a logical vector in its respective list entry position
tmpList[[i]] <- grepl(kwds[i], texts, ignore.case = TRUE)
}
#Return list and control to environment
return(tmpList)
}
to_df <- function(domain, rule){
#Convert list from grepl to data frame
domain <- as.data.frame(domain)
#Show column names as phrases
colnames(domain) <- rule[rule != '']
#Multiply by 1 for binary numeric
domain <- domain*1
return(domain)
}
clean_text <- function(tokens, printout){
#Create a fake patient note phrase as a representative sample
ex_token <- "Example note:\nThe patient is a 81yo m who was found down in [** location **] on [** date **] by daughter, [** name **].\n Pt was in usual state of health until four days ago, when began to complain to family of heartburn for which the pt was taking tums in addition to his prescribed PPI, without resolution."
if (printout){
print(substr(ex_token, 1, 100))
}
#Remove carriage returns, convert to lower
tokens <- tolower(gsub('\r', ' ', tokens))
tokens <- tolower(gsub('\n', ' ', tokens))
ex_token <- tolower(gsub('\n', ' ', ex_token))
if (printout){
cat("Result after removing carriage returns:\n")
print(substr(ex_token, 1, 100))
}
#https://stackoverflow.com/questions/13529360/replace-text-within-parenthesis-in-r
#Remove obfuscations between '[' and ']'
tokens <- gsub(" *\\[.*?\\] *", ' ', tokens)
ex_token <- gsub(" *\\[.*?\\] *", ' ', ex_token)
if (printout){
cat("Result after leaving [obfuscation]:\n")
print(substr(ex_token, 1, 100))
}
#Keep only words & numeric
tokens <- gsub("[^[:alnum:][:space:]]", '', tokens)
ex_token <- gsub("[^[:alnum:][:space:]]", '', ex_token)
if (printout){
cat("Result after removing all but alphanumeric and spaces:\n")
print(substr(ex_token, 1, 100))
}
#Keep only a single white space
#https://stackoverflow.com/questions/25707647/merge-multiple-spaces-to-single-space-remove-trailing-leading-spaces
tokens <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", '', tokens, perl=TRUE)
ex_token <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", '', ex_token, perl=TRUE)
if (printout){
cat("Result after keeping only single spaces:\n")
print(substr(ex_token, 1, 100))
}
return(tokens)
}
bow <- function(phrases){
tmp <- unlist(strsplit(phrases, ' '))
tmp <- tmp[tmp != '']
return(tmp)
}
plot_bow <- function(bows, n, lab){
tmp_tab <- table(bows)[rev(order(table(bows)))]
par(mai=c(1,2,1,1))
barplot(rev(head(tmp_tab, n)), horiz = T, las = 1,
main = paste("Most Frequent Words in the ",lab ," Dictionary", sep = ''),
xlab = "Frequency")
}
bow_subset <- function(bows, n){
#Make an ordered table
tmp_tab <- table(bows)[rev(order(table(bows)))]
#print(head(tmp_tab))
#Percent calculation
perc <- length(tmp_tab)*(n/100)
#print(perc)
#Return top n represented values
return(attr(head(tmp_tab, perc), "names"))
}
cohort_gen <- function(dat, train_set, valid_set){
dat$COHORT <- rep('', each = nrow(dat))
for (i in 1:nrow(dat)){
if (dat$ROW_ID[i] %in% train_set$row_id){
dat$COHORT[i] <- "train"
} else if (dat$ROW_ID[i] %in% valid_set$row_id){
dat$COHORT[i] <- "validation"
} else {
dat$COHORT[i] <- ""
}
}
return(dat)
}
standardize <- function(dat){
return((dat - mean(dat))/sd(dat))
}
check_variance <- function(dat){
##Remove columns if they have no variance
for (name in colnames(dat)){
if (all(dat[[name]] == 1) | all (dat[[name]] == 0)){
cat(paste("\"",name, "\" has no variance and will be dropped.\n", sep = ''))
dat[[name]] <- NULL
}
}
return(dat)
}
model_info <- function(fit){
#Summary info
model_sum <- summary(fit)
#Odds ratio, confidence interval
odds_ratio <- cbind(OR = exp(fit$coef), exp(confint(fit)))
#Create list for return
my_list <- list(model_sum, odds_ratio)
#names
names(my_list) <- c("Model Summary","OR Summary")
return(my_list)
}
confusion_data <- function(fit){
#Create variables
accuracy <- vector()
#Threshold sequence
threshold <- seq(0.1,0.9, by=.01)
for (i in 1:length(threshold)){
#Accuracy calculation from confusion matrix
accuracy[i] <- Conf(fit, cutoff = threshold[i])$acc
}
#Confusion matrix
cutoff <- threshold[which.max(accuracy)]
conf_mat <- Conf(fit, cutoff = cutoff)
cat("Maximum accuracy is acheived at a cutoff of: ", cutoff, '\n', sep = '')
#Plot
layout(matrix(1:2,ncol = 2))
plot(threshold, accuracy, type = "l", main = "Cutoff Based on Accuracy")
abline(h=max(accuracy), v = cutoff, col="red")
fourfoldplot(conf_mat$table, main = "Confusion Matrix Plot",
color = c("red","green"))
return(conf_mat)
}
roc_plot <- function(observed, fit, lab){
df <- data.frame(ID=1:length(observed),observed = observed,
predicted = predict(fit, type = "response"))
#Maximize PCC (percent correctly classified)
auc.roc.plot(df, opt.methods = 5, color = TRUE, main = paste(lab, "AUC ROC Plot", sep = ' '))
}
stat_gen <- function(dat, lab){
h_lab <- substr(lab, 1, 3)
#True Positives
tp <- 0
#True Negatives
tn <- 0
#False Positives
fp <- 0
#False Negatives
fn <- 0
for (i in 1:nrow(dat)){
#if both human and model mark positive
if (dat[[h_lab]][i] == 1 & dat[[lab]][i] == 1){
tp <- tp + 1
#if human marks negative and model assigns label
} else if (dat[[h_lab]][i] == 0 & dat[[lab]][i] == 1){
fp <- fp + 1
#if human marks positive and model marks negative
} else if (dat[[h_lab]][i] == 1 & dat[[lab]][i] == 0){
#false negative
fn <- fn + 1
} else if (dat[[h_lab]][i] == 0 & dat[[lab]][i] == 0){
tn <- tn + 1
}
}
tmpFrame <- cbind(tp, tn, fp, fn)
colnames(tmpFrame) <- c("tp", "tn", "fp", "fn")
return(as.data.frame(tmpFrame))
}
model_stats <- function(dat){
dat <- as.data.frame(dat)
accuracy <- (dat$tp + dat$tn)/(dat$tp + dat$tn + dat$fp + dat$fn)
precision <- dat$tp/(dat$tp + dat$fp)
sensitivity <- dat$tp/(dat$tp + dat$fn)
specificity <- dat$tn/(dat$tn+dat$fp)
F1 <- 2*(precision*sensitivity)/(precision + sensitivity)
dat$accuracy <- round(accuracy*100, 2)
dat$precision <- round(precision*100, 2)
dat$sensitivity <- round(sensitivity*100, 2)
dat$specificity <- round(specificity*100, 2)
dat$F1 <- round(F1*100, 2)
return(dat)
}
#Read csv
dat <- fread("~/goals_of_care/regex_v3/op_annotations_122017.csv",
header = T, stringsAsFactors = F)
#Remove note w/ missing data
missing_note <- fread("~/goals_of_care/unmatched_note_result.csv", header = T, stringsAsFactors = F)
nrow(missing_note)
## [1] 1
#Remove note with missing data
dat <- dat[!dat$ROW_ID %in% missing_note$ROW_ID, ]
#Make the names easier to deal with:
colnames(dat)[which(colnames(dat) == "Code Status Limitations")] <- "LIM"
colnames(dat)[which(colnames(dat) == "Full Code Status")] <- "COD"
colnames(dat)[which(colnames(dat) == "Patient and Family Care Preferences")] <- "CAR"
colnames(dat)[which(colnames(dat) == "Palliative Care Team Involvement")] <- "PAL"
colnames(dat)[which(colnames(dat) == "Communication with Family")] <- "FAM"
#Dat CIM is already accounted for
#Check the data
colnames(dat)
## [1] "V1"
## [2] "ROW_ID"
## [3] "SUBJECT_ID"
## [4] "HADM_ID"
## [5] "CATEGORY"
## [6] "DESCRIPTION"
## [7] "TEXT"
## [8] "COHORT"
## [9] "CAR"
## [10] "Patient and Family Care Preferences Text"
## [11] "FAM"
## [12] "Communication with Family Text"
## [13] "COD"
## [14] "Full Code Status Text"
## [15] "LIM"
## [16] "Code Status Limitations Text"
## [17] "PAL"
## [18] "Palliative Care Team Involvement Text"
## [19] "Ambiguous"
## [20] "Ambiguous Text"
## [21] "Ambiguous Comments"
## [22] "None"
## [23] "STAMP"
## [24] "operator"
## [25] "original_filename"
nrow(dat)
## [1] 1496
#Contains map of files (BRAT format for neuroNER) to row_id (MIMIC NOTEEVENTS)
map <- fread("~/goals_of_care/summary_stats/row_id_to_file_num.txt", header = F, stringsAsFactors = F, sep = '\t')
head(map)
## V1 V2
## 1: 407635 text_00000
## 2: 345097 text_00001
## 3: 371117 text_00002
## 4: 401545 text_00003
## 5: 393792 text_00004
## 6: 382263 text_00005
#Add column names for ease of merging
colnames(map) <- c("row_id", "file")
#Code Status Limitations
lim_train_set <- load_data("LIM", "train", map)
lim_valid_set <- load_data("LIM", "valid", map)
#Full Code
cod_train_set <- load_data("COD", "train", map)
cod_valid_set <- load_data("COD", "valid", map)
#Patient/Family Care Preferences
car_train_set <- load_data("CAR", "train", map)
car_valid_set <- load_data("CAR", "valid", map)
#Palliative Care Consult
pal_train_set <- load_data("PAL", "train", map)
pal_valid_set <- load_data("PAL", "valid", map)
#Family meeting held
fam_train_set <- load_data("FAM", "train", map)
fam_valid_set <- load_data("FAM", "valid", map)
#dat <- cohort_gen(dat, train_set, valid_set)
lim_dat <- cohort_gen(dat, lim_train_set, lim_valid_set)
cod_dat <- cohort_gen(dat, cod_train_set, cod_valid_set)
car_dat <- cohort_gen(dat, car_train_set, car_valid_set)
pal_dat <- cohort_gen(dat, pal_train_set, pal_valid_set)
fam_dat <- cohort_gen(dat, fam_train_set, fam_valid_set)
#apply train/valid cohort information to dat
if (identical(dat$ROW_ID, lim_dat$ROW_ID)){
dat$COHORT <- lim_dat$COHORT
}
levels(factor(dat$COHORT))
## [1] "train" "validation"
CIM will be defined as including code status limitations and documentation of care preferences.
#Create CIM dat by using any other data source
cim_dat <- lim_dat
cim <- vector()
for (i in 1:nrow(lim_dat)){
if (lim_dat$LIM[i] == 1 | lim_dat$CAR[i] == 1){
cim[length(cim)+1] <- 1
} else {
cim[length(cim)+1] <- 0
}
}
#Populate CIM data
cim_dat$CIM <- cim
#Add to dat
dat$CIM <- cim
Note: Only text and phrases from the training set will be used.
##Only use phrases from training set
lim_phrases <- lim_dat$`Code Status Limitations Text`[lim_dat$COHORT == "train"]
cod_phrases <- cod_dat$`Full Code Status Text`[cod_dat$COHORT == "train"]
car_phrases <- car_dat$`Patient and Family Care Preferences Text`[car_dat$COHORT == "train"]
pal_phrases <- pal_dat$`Palliative Care Team Involvement Text`[pal_dat$COHORT == "train"]
fam_phrases <- fam_dat$`Communication with Family Text`[fam_dat$COHORT == "train"]
##remove empty observations ""
lim_phrases <- lim_phrases[lim_phrases != '']
cod_phrases <- cod_phrases[cod_phrases != '']
car_phrases <- car_phrases[car_phrases != '']
pal_phrases <- pal_phrases[pal_phrases != '']
fam_phrases <- fam_phrases[fam_phrases != '']
#CIM is a combination of care preferences/code status limitations
cim_phrases <- c(car_phrases, lim_phrases)
##Remove duplicate observations (keep first)
dat <- dat[!duplicated(dat$TEXT), ]
#Subset texts from training set
txts <- clean_text(dat$TEXT[dat$COHORT == "train"], TRUE)
## [1] "Example note:\nThe patient is a 81yo m who was found down in [** location **] on [** date **] by daug"
## Result after removing carriage returns:
## [1] "example note: the patient is a 81yo m who was found down in [** location **] on [** date **] by daug"
## Result after leaving [obfuscation]:
## [1] "example note: the patient is a 81yo m who was found down in on by daughter, . pt was in usual sta"
## Result after removing all but alphanumeric and spaces:
## [1] "example note the patient is a 81yo m who was found down in on by daughter pt was in usual state "
## Result after keeping only single spaces:
## [1] "example note the patient is a 81yo m who was found down in on by daughter pt was in usual state of h"
lim_phrases <- clean_text(lim_phrases, FALSE)
cod_phrases <- clean_text(cod_phrases, FALSE)
car_phrases <- clean_text(car_phrases, FALSE)
pal_phrases <- clean_text(pal_phrases, FALSE)
fam_phrases <- clean_text(fam_phrases, FALSE)
#
cim_phrases <- clean_text(cim_phrases, FALSE)
#Create bag of words by splitting phrases on spaces and unlisting result
lim_bow <- bow(lim_phrases)
cod_bow <- bow(cod_phrases)
car_bow <- bow(car_phrases)
pal_bow <- bow(pal_phrases)
fam_bow <- bow(fam_phrases)
#cim
cim_bow <- bow(cim_phrases)
#All texts
txt_bow <- bow(txts)
plot_bow(lim_bow, 20, "LIM Phrase")
plot_bow(cod_bow, 20, "COD Phrase")
plot_bow(car_bow, 20, "CAR Phrase")
plot_bow(pal_bow, 20, "PAL Phrase")
plot_bow(fam_bow, 20, "FAM Phrase")
plot_bow(cim_bow, 20, "CIM Phrase")
plot_bow(txt_bow, 20, "Corpus")
#Top non-specific tokens from all texts
txt_bow <- bow_subset(txt_bow, .1)
#Clean non-specific tokens from phrase dictionaries
lim_bow <- lim_bow[!lim_bow %in% txt_bow]
cod_bow <- cod_bow[!cod_bow %in% txt_bow]
car_bow <- car_bow[!car_bow %in% txt_bow]
pal_bow <- pal_bow[!pal_bow %in% txt_bow]
fam_bow <- fam_bow[!fam_bow %in% txt_bow]
cim_bow <- cim_bow[!cim_bow %in% txt_bow]
#Subset top tokens in each phrase dictionary
(lim_bow <- bow_subset(lim_bow, 10))
## [1] "code" "status" "dnr" "dni"
## [5] "dnrdni" "full" "do" "resuscitate"
## [9] "family" "patient" "cmo" "confirmed"
## [13] "comfort" "care" "only" "but"
## [17] "measures" "made" "be" "her"
## [21] "would" "now" "that" "intubate"
## [25] "hcp" "compressions" "changed" "want"
## [29] "pressors" "if" "his" "a"
## [33] "will" "pt" "per" "patients"
(cod_bow <- bow_subset(cod_bow, 10))
## [1] "code" "full" "status" "presumed" "confirmed"
## [6] "discussed" "patient" "family" "will" "wife"
## [11] "per" "hcp" "discussion"
(car_bow <- bow_subset(car_bow, 10))
## [1] "family" "care" "code" "patient" "would"
## [6] "be" "full" "status" "comfort" "that"
## [11] "cmo" "want" "like" "her" "goals"
## [16] "but" "his" "will" "measures" "dnrdni"
## [21] "she" "confirmed" "meeting" "discussion" "made"
## [26] "dnr" "decision" "he" "focus" "a"
## [31] "they" "son" "or" "daughter" "central"
## [36] "procedures" "patients" "discussed" "aggressive" "pt"
## [41] "pressors" "per" "invasive" "now" "this"
## [46] "after" "including" "hcp" "escalation" "wife"
## [51] "lines" "home" "by" "wishes" "we"
## [56] "states" "prognosis" "plan" "only" "dni"
## [61] "any" "morphine" "avoid" "are" "agreed"
## [66] "yesterday" "pts" "intubation" "have"
(pal_bow <- bow_subset(pal_bow, 10))
## [1] "care" "palliative" "family" "goals" "consult"
## [6] "would" "like" "patient" "hospice" "her"
(fam_bow <- bow_subset(fam_bow, 10))
## [1] "family" "meeting" "code" "patient"
## [5] "care" "discussed" "communication" "held"
## [9] "full" "status" "will" "wife"
## [13] "her" "son" "goals" "discussion"
## [17] "be" "a" "daughter" "confirmed"
## [21] "hcp" "would" "his" "dnrdni"
## [25] "this" "that" "comfort" "patients"
## [29] "husband" "consent" "who" "s"
## [33] "we" "spoke" "have" "plan"
## [37] "icu" "yesterday" "today" "they"
## [41] "met" "comments" "after" "signed"
## [45] "him" "decision" "but" "team"
## [49] "prognosis" "like" "he" "are"
## [53] "w" "pts" "pt" "dnr"
## [57] "discuss" "update" "rounds" "length"
## [61] "aware" "night"
(cim_bow <- bow_subset(cim_bow, 10))
## [1] "code" "status" "dnr" "family"
## [5] "dni" "full" "care" "patient"
## [9] "dnrdni" "would" "cmo" "comfort"
## [13] "be" "do" "that" "confirmed"
## [17] "but" "want" "resuscitate" "her"
## [21] "measures" "made" "goals" "like"
## [25] "his" "only" "will" "meeting"
## [29] "discussion" "a" "she" "now"
## [33] "he" "pressors" "patients" "decision"
## [37] "daughter" "central" "son" "pt"
## [41] "per" "hcp" "focus" "invasive"
## [45] "discussed" "procedures" "aggressive" "they"
## [49] "or" "escalation" "after" "wife"
## [53] "this" "by" "plan" "intubation"
## [57] "compressions" "we" "lines" "including"
## [61] "changed" "wishes" "prognosis" "if"
## [65] "home" "given" "chest" "states"
## [69] "make" "have" "any" "agreed"
## [73] "should" "morphine"
#Strict regex
lim_tmp <- strict_regex(lim_bow, txts)
cod_tmp <- strict_regex(cod_bow, txts)
car_tmp <- strict_regex(car_bow, txts)
pal_tmp <- strict_regex(pal_bow, txts)
fam_tmp <- strict_regex(fam_bow, txts)
#CIM
cim_tmp <- strict_regex(cim_bow, txts)
#Convert list to data.frame
lim_tmp <- to_df(lim_tmp, lim_bow)
cod_tmp <- to_df(cod_tmp, cod_bow)
car_tmp <- to_df(car_tmp, car_bow)
pal_tmp <- to_df(pal_tmp, pal_bow)
fam_tmp <- to_df(fam_tmp, fam_bow)
#CIM
cim_tmp <- to_df(cim_tmp, cim_bow)
lim_tmp <- check_variance(lim_tmp)
## "a" has no variance and will be dropped.
cod_tmp <- check_variance(cod_tmp)
car_tmp <- check_variance(car_tmp)
## "he" has no variance and will be dropped.
## "a" has no variance and will be dropped.
## "or" has no variance and will be dropped.
pal_tmp <- check_variance(pal_tmp)
fam_tmp <- check_variance(fam_tmp)
## "a" has no variance and will be dropped.
## "s" has no variance and will be dropped.
## "he" has no variance and will be dropped.
## "w" has no variance and will be dropped.
cim_tmp <- check_variance(cim_tmp)
## "a" has no variance and will be dropped.
## "he" has no variance and will be dropped.
## "or" has no variance and will be dropped.
#cbind data
LIM <- dat$LIM[dat$COHORT == "train"]
lim_tmp <- cbind(LIM, lim_tmp)
#Note: NA's must be omitted for COD due to missing observations
COD <- dat$COD[dat$COHORT == "train"]
cod_tmp <- na.omit(cbind(COD, cod_tmp))
CAR <- dat$CAR[dat$COHORT == "train"]
car_tmp <- cbind(CAR, car_tmp)
PAL <- dat$PAL[dat$COHORT == "train"]
pal_tmp <- cbind(PAL, pal_tmp)
FAM <- dat$FAM[dat$COHORT == "train"]
fam_tmp <- cbind(FAM, fam_tmp)
CIM <- dat$CIM[dat$COHORT == "train"]
cim_tmp <- cbind(CIM, cim_tmp)
#LIM
lim_reg <- glm(LIM ~ ., family = binomial(link = 'logit'), data = lim_tmp)
#COD
cod_reg <- glm(COD ~ ., family = binomial(link = 'logit'), data = cod_tmp)
#CAR
car_reg <- glm(CAR ~ ., family = binomial(link = 'logit'), data = car_tmp)
#PAL
pal_reg <- glm(PAL ~ ., family = binomial(link = 'logit'), data = pal_tmp)
#FAM
fam_reg <- glm(FAM ~ ., family = binomial(link = 'logit'), data = fam_tmp)
#CIM
cim_reg <- glm(CIM ~ ., family = binomial(link = 'logit'), data = cim_tmp)
val_dat <- fread("~/over_75_cohort_20Jan18.csv", header = T, stringsAsFactors = F)
txts <- clean_text(val_dat$TEXT, FALSE)
#Strict regex
lim_tmp <- strict_regex(lim_bow, txts)
cod_tmp <- strict_regex(cod_bow, txts)
car_tmp <- strict_regex(car_bow, txts)
pal_tmp <- strict_regex(pal_bow, txts)
fam_tmp <- strict_regex(fam_bow, txts)
#CIM
cim_tmp <- strict_regex(cim_bow, txts)
#Convert list to data.frame
lim_tmp <- to_df(lim_tmp, lim_bow)
cod_tmp <- to_df(cod_tmp, cod_bow)
car_tmp <- to_df(car_tmp, car_bow)
pal_tmp <- to_df(pal_tmp, pal_bow)
fam_tmp <- to_df(fam_tmp, fam_bow)
#CIM
cim_tmp <- to_df(cim_tmp, cim_bow)
#Generate probability values
LIM_PRED <- predict(lim_reg, lim_tmp, type="response")
COD_PRED <- predict(cod_reg, cod_tmp, type="response")
CAR_PRED <- predict(car_reg, car_tmp, type="response")
PAL_PRED <- predict(pal_reg, pal_tmp, type="response")
FAM_PRED <- predict(fam_reg, fam_tmp, type="response")
CIM_PRED <- predict(cim_reg, cim_tmp, type="response")
#Show distribution of probabilities
boxplot(LIM_PRED, COD_PRED, CAR_PRED, PAL_PRED, FAM_PRED, CIM_PRED, main = "Note Prediction Probability Distributions\n (LIM) (COD) (CAR) (PAL) (FAM) (CIM)")
#Apply predictions given cutoff probability
LIM_PRED <- ifelse(LIM_PRED > 0.63, 1, 0)
COD_PRED <- ifelse(COD_PRED > 0.59, 1, 0)
CAR_PRED <- ifelse(CAR_PRED > 0.48, 1, 0)
PAL_PRED <- ifelse(PAL_PRED > 0.47, 1, 0)
FAM_PRED <- ifelse(FAM_PRED > 0.40, 1, 0)
CIM_PRED <- ifelse(CIM_PRED > 0.49, 1, 0)
valid <- cbind(val_dat,
LIM_PRED,
COD_PRED,
CAR_PRED,
PAL_PRED,
FAM_PRED,
CIM_PRED)
write.csv(valid, file = "BoW_LR_ex_val.csv", row.names = F)
table(LIM_PRED)
## LIM_PRED
## 0 1
## 6968 3282
table(COD_PRED)
## COD_PRED
## 0 1
## 3644 6606
table(CAR_PRED)
## CAR_PRED
## 0 1
## 8876 1374
table(PAL_PRED)
## PAL_PRED
## 0 1
## 10199 51
table(FAM_PRED)
## FAM_PRED
## 0 1
## 7743 2507
table(CIM_PRED)
## CIM_PRED
## 0 1
## 5847 4403