Step 0: Following the steps in the previous comment, manually tag a subset (say, 40%) of the free-text responses with hierarchical buckets.
X AND Y OR Z, then bucket = some bucket”

Step 2: Repeat Step 1 for all buckets.
## load data set
# input_df <- read.csv("free_text_v7_categories - best_treatment_explain.csv")

#### n_level: total number of sub-levels, i.e. how many sub_level_<i>
#### functions find_sub_level() will call (sub_level_1 ... sub_level_<n_level>).
#### Update this value whenever a sub_level_<i> heuristic is added or removed.
n_level <- 19

## load test set, keeping only the three columns the heuristics work with
test <- read.csv("test_718.csv")[
  , c("best_treatment", "best_treatment_explain", "Specific.Level")
]
## main function
## create a new dataframe
### need edit
heuristic_auto <- function(input_df) {
  # Predict the sub-level(s) of every free-text response in `input_df`.
  #
  # input_df: data frame containing at least the columns "best_treatment",
  #           "best_treatment_explain" and "Specific.Level".
  # Returns:  a data frame with those three columns plus "predicted_levels",
  #           a character column holding the matched levels of each row
  #           joined by " & " ("" when no heuristic matched).
  output_df <- input_df[
    , c("best_treatment", "best_treatment_explain", "Specific.Level")
  ]
  output_df$predicted_levels <- rep(NA_character_, nrow(output_df))
  # seq_len() yields an empty sequence for a zero-row input, whereas
  # 1:nrow() would iterate over c(1, 0) and fail.
  for (i in seq_len(nrow(output_df))) {
    free_text <- output_df[i, ]
    # as.character() keeps the column a plain character vector even if the
    # helper hands back a one-element list.
    output_df$predicted_levels[i] <-
      as.character(combine_levels(find_sub_level(free_text)))
  }
  ## desired output
  output_df
}
## helper function: combine sub levels to a single entry
combine_levels <- function(lst) {
  # Collapse the matched sub-levels into a single string.
  #
  # lst:     list (or character vector) of level labels; may be empty.
  # Returns: a length-one character vector with the labels joined by " & ",
  #          or "" when `lst` is empty.
  #
  # unlist() flattens a list input so that a single match is returned as a
  # character scalar rather than as a one-element list.
  labels <- as.character(unlist(lst, use.names = FALSE))
  paste(labels, collapse = " & ")
}
## best_treatment bank
# Lower-cased best_treatment labels that the sub_level_<i> heuristics compare
# the (lower-cased) selected treatment against.
TRUSTINFO <- "trusted info source"
FAMILYSUPPORT <- "family supports it"
REWARD <- "rewards for vaxxing"
NOTHING <- "nothing"
OTHER <- "other"
SOMETHINGELSE <- "something else"
## helper function: find levels of one free-text response (for one row)
find_sub_level <- function(free_text) {
  # Collect every sub-level whose heuristic matches one response.
  #
  # free_text: one row of the data frame (or a vector/list) whose first
  #            element is the treatment selected in the previous question and
  #            whose second element is the free-text answer.
  # Returns:   a list with the labels returned by sub_level_1 ...
  #            sub_level_<n_level> that are not ""; empty list if none match.
  #
  # `[[` extracts the cell value itself. `[` on a data-frame row returns a
  # one-column data frame, and as.character() on that turns factor cells into
  # their integer codes.
  as_lower_text <- function(value) {
    value <- tolower(as.character(value))
    value[is.na(value)] <- ""  # a missing answer can never match a heuristic
    value
  }
  # best treatment selected in previous question
  best_treatment_selected <- as_lower_text(free_text[[1]])
  # free-text response answered in current question
  response <- as_lower_text(free_text[[2]])

  # sub_level_lst: all sub-levels that satisfy the corresponding heuristics
  sub_level_lst <- list()
  for (i in seq_len(n_level)) {
    output <- do.call(
      paste0("sub_level_", i),
      list(best_treatment_selected, response)
    )
    if (output != "") {
      sub_level_lst <- append(sub_level_lst, output)
    }
  }
  sub_level_lst
}
#### helper function: sub_level_1:
#### level: accompanied by family
#### arg1: best_treatment_selected
#### arg2: response
sub_level_1 <- function(arg1, arg2) {
  # Heuristic for the sub-level "accompanied by family".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns "accompanied by family" on a match, otherwise "".
  has <- function(pattern) grepl(pattern, arg2, fixed = TRUE)
  # %in% is FALSE (not NA) for a missing treatment, so the scalar `if` below
  # can never receive NA.
  treatment_ok <- arg1 %in% c("family supports it", "nothing")
  be_there <- has("be") && has("there")
  went_along <- (has("went") || has("go")) &&
    (has("alone") || has("with me"))
  if (treatment_ok && (be_there || went_along)) {
    "accompanied by family"
  } else {
    ""
  }
}
#### helper function: sub_level_2:
#### level: afraid of the vaccine
#### arg1: best_treatment_selected
#### arg2: response
sub_level_2 <- function(arg1, arg2) {
  # Heuristic for the sub-level "afraid of the vaccine".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns "afraid of the vaccine" on a match, otherwise "".
  has <- function(pattern) grepl(pattern, arg2, fixed = TRUE)
  # %in% is FALSE (not NA) for a missing treatment, keeping the `if` scalar-safe.
  treatment_ok <- arg1 %in% c("family supports it", "other")
  is_afraid <- (has("i") || has("they")) && has("afraid")
  dies_after <- has("i") && has("die") && has("after")
  if (treatment_ok && (is_afraid || dies_after)) {
    "afraid of the vaccine"
  } else {
    ""
  }
}
###################################
#### helper function: sub_level_3:
#### level: info provided by clinic
#### arg1: best_treatment_selected
#### arg2: response
sub_level_3 <- function(arg1, arg2) {
  # Heuristic for the sub-level "info provided by clinic".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns "info provided by clinic" on a match, otherwise "".
  # %in% is FALSE (not NA) for a missing treatment, keeping the `if` scalar-safe.
  treatment_ok <- arg1 %in% c("family supports it", "trusted info source")
  mentions_clinic <- grepl("clinic", arg2, fixed = TRUE)
  if (treatment_ok && mentions_clinic) {
    "info provided by clinic"
  } else {
    ""
  }
}
#### helper function: sub_level_4:
#### level: info provided by internet
#### arg1: best_treatment_selected
#### arg2: response
sub_level_4 <- function(arg1, arg2) {
  # Heuristic for the sub-level "info provided by internet".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns "info provided by internet" on a match, otherwise "".
  has <- function(pattern) grepl(pattern, arg2, fixed = TRUE)
  # %in% is FALSE (not NA) for a missing treatment, keeping the `if` scalar-safe.
  treatment_ok <- arg1 %in% c("trusted info source", "other")
  mentions_internet <- has("google") || has("internet") || has("website")
  if (treatment_ok && mentions_internet) {
    "info provided by internet"
  } else {
    ""
  }
}
#### helper function: sub_level_5:
#### level: info provided by media
#### arg1: best_treatment_selected
#### arg2: response
sub_level_5 <- function(arg1, arg2) {
  # Heuristic for the sub-level "info provided by media".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns "info provided by media" when the whole response is exactly
  # "media" or "the media" and the treatment is eligible, otherwise "".
  #
  # %in% is used for both checks because `==` yields NA for a missing value,
  # which would make the `if` condition fail with an error.
  treatment_ok <- arg1 %in% c("trusted info source", "other")
  is_media <- arg2 %in% c("media", "the media")
  if (treatment_ok && is_media) {
    "info provided by media"
  } else {
    ""
  }
}
#### helper function: sub_level_6:
#### level: info provided by news
#### arg1: best_treatment_selected
#### arg2: response
sub_level_6 <- function(arg1, arg2) {
  # Heuristic for the sub-level "info provided by news".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns "info provided by news" on a match, otherwise "".
  has <- function(pattern) grepl(pattern, arg2, fixed = TRUE)
  # %in% instead of `==`: a missing treatment or response gives FALSE rather
  # than NA, so the `if` condition cannot fail on NA.
  treatment_ok <- arg1 %in% c("trusted info source", "other", "nothing")
  news_in_context <- has("news") &&
    (has("channel") || has("the") || has("world") || has("and"))
  only_news <- arg2 %in% "news"
  if (treatment_ok && (news_in_context || only_news)) {
    "info provided by news"
  } else {
    ""
  }
}
#### helper function: sub_level_7:
#### level: info provided by WHO
#### arg1: best_treatment_selected
#### arg2: response
sub_level_7 <- function(arg1, arg2) {
  # Heuristic for the sub-level "info provided by WHO".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns "info provided by WHO" on a match, otherwise "".
  has <- function(pattern) grepl(pattern, arg2, fixed = TRUE)
  # %in% is FALSE (not NA) for a missing treatment, keeping the `if` scalar-safe.
  treatment_ok <- arg1 %in% c("trusted info source", "nothing")
  # Plain substring search: "who" also matches inside longer words.
  mentions_who <- has("world health org") || has("who") || has("w h o")
  if (treatment_ok && mentions_who) {
    "info provided by WHO"
  } else {
    ""
  }
}
#### helper function: sub_level_8:
#### level: info provided by government
#### arg1: best_treatment_selected
#### arg2: response
sub_level_8 <- function(arg1, arg2) {
  # Heuristic for the sub-level "info provided by government".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns "info provided by government" on a match, otherwise "".
  has <- function(pattern) grepl(pattern, arg2, fixed = TRUE)
  # %in% is FALSE (not NA) for a missing treatment, keeping the `if` scalar-safe.
  treatment_ok <- arg1 %in%
    c("trusted info source", "family supports it", "something else")
  health_department <- (has("department") || has("dept")) && has("health")
  if (treatment_ok &&
      (health_department || has("government") || has("president"))) {
    "info provided by government"
  } else {
    ""
  }
}
#### helper function: sub_level_9:
#### level: no treatment
#### arg1: best_treatment_selected
#### arg2: response
sub_level_9 <- function(arg1, arg2) {
  # Heuristic for the sub-level "no treatment".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns "no treatment" on a match, otherwise "".
  has <- function(pattern) grepl(pattern, arg2, fixed = TRUE)
  # %in% is FALSE (not NA) for a missing treatment, keeping the `if` scalar-safe.
  treatment_ok <- arg1 %in%
    c(TRUSTINFO, FAMILYSUPPORT, REWARD, NOTHING, OTHER)
  # The whole response is one of the short negative answers ...
  plain_no <- arg2 %in% c("nothing", "no", "none", "nope")
  # ... or it contains one of these phrases.
  no_phrase <- has("i don t know") ||
    has("so far none") ||
    has("i can t really say") ||
    has("i have no idea") ||
    has("not one of them") ||
    has("nothing els")
  if (treatment_ok && (plain_no || no_phrase)) {
    "no treatment"
  } else {
    ""
  }
}
#### helper function: sub_level_10:
#### level: info provided by professionals
#### arg1: best_treatment_selected
#### arg2: response
sub_level_10 <- function(arg1, arg2) {
  # Heuristic for the sub-level "info provided by professionals".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns "info provided by professionals" on a match, otherwise "".
  has <- function(pattern) grepl(pattern, arg2, fixed = TRUE)
  has_any <- function(patterns) any(vapply(patterns, has, logical(1)))
  # A rule matches when every entry is found in the response; an entry with
  # several strings is satisfied by any one of them.
  rule_matches <- function(rule) all(vapply(rule, has_any, logical(1)))
  rules <- list(
    list("doctor"),
    list("nurse"),
    list("expert", c("explanation", "advice")),
    list("dockter"),
    list("drs"),
    list("health", "worker"),
    list("health", "desk"),
    list(c("health", "medical"), "expert"),
    list("health industry", "work"),
    list("medical training", "friend"),
    list("info", "hospital"),
    list("scientist"),
    list("medical", "practitioner"),
    list("health professional")
  )
  # %in% is FALSE (not NA) for a missing treatment, keeping the `if` scalar-safe.
  treatment_ok <- arg1 %in% c(TRUSTINFO, SOMETHINGELSE, NOTHING, OTHER)
  if (treatment_ok && any(vapply(rules, rule_matches, logical(1)))) {
    "info provided by professionals"
  } else {
    ""
  }
}
#### helper function: sub_level_11:
#### level: info provided by social media
#### arg1: best_treatment_selected
#### arg2: response
sub_level_11 <- function(arg1, arg2) {
  # Heuristic for the sub-level "info provided by social media".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns "info provided by social media" on a match, otherwise "".
  has <- function(pattern) grepl(pattern, arg2, fixed = TRUE)
  # %in% is FALSE (not NA) for a missing treatment, keeping the `if` scalar-safe.
  treatment_ok <- arg1 %in% c(TRUSTINFO, NOTHING)
  share_voice <- has("share") && has("u") && has("voice")
  if (treatment_ok &&
      (share_voice || has("social media") || has("facebook"))) {
    "info provided by social media"
  } else {
    ""
  }
}
#### helper function: sub_level_12:
#### level: info related to vax safety
#### arg1: best_treatment_selected
#### arg2: response
sub_level_12 <- function(arg1, arg2) {
  # Heuristic for the sub-level "info related to vax safety".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns "info related to vax safety" on a match, otherwise "".
  has <- function(pattern) grepl(pattern, arg2, fixed = TRUE)
  # %in% is FALSE (not NA) for a missing treatment, keeping the `if` scalar-safe.
  treatment_ok <- arg1 %in% c(TRUSTINFO, NOTHING, FAMILYSUPPORT, OTHER)
  more_ill <- has("give") && has("more") && has("ill")
  safe_with_subject <- has("safe") &&
    (has("vaccine") || has("prove") || has("absolute"))
  if (treatment_ok &&
      (more_ill || has("not dangerous") || has("my health") ||
         safe_with_subject || has("safely"))) {
    "info related to vax safety"
  } else {
    ""
  }
}
#### helper function: sub_level_13:
#### level: info provided by radio
#### arg1: best_treatment_selected
#### arg2: response
sub_level_13 <- function(arg1, arg2) {
  # Heuristic for the sub-level "info provided by radio".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns "info provided by radio" on a match, otherwise "".
  # %in% is FALSE (not NA) for a missing treatment, keeping the `if` scalar-safe.
  treatment_ok <- arg1 %in% TRUSTINFO
  mentions_radio <- grepl("radio", arg2, fixed = TRUE)
  if (treatment_ok && mentions_radio) {
    "info provided by radio"
  } else {
    ""
  }
}
#### helper function: sub_level_14:
#### level: trusted info source from news
#### arg1: best_treatment_selected
#### arg2: response
sub_level_14 <- function(arg1, arg2) {
  # Heuristic for the sub-level "trusted info source from news".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns "trusted info source from news" on a match, otherwise "".
  has <- function(pattern) grepl(pattern, arg2, fixed = TRUE)
  # %in% is FALSE (not NA) for a missing treatment, keeping the `if` scalar-safe.
  treatment_ok <- arg1 %in% TRUSTINFO
  named_outlet <- has("bbc") || has("cnn") || has("abc")
  verified_news <- has("verified") && has("news")
  if (treatment_ok && (named_outlet || verified_news)) {
    "trusted info source from news"
  } else {
    ""
  }
}
#### helper function: sub_level_15:
#### level: info provided by TV
#### arg1: best_treatment_selected
#### arg2: response
sub_level_15 <- function(arg1, arg2) {
  # Heuristic for the sub-level "info provided by TV".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns "info provided by TV" on a match, otherwise "".
  has <- function(pattern) grepl(pattern, arg2, fixed = TRUE)
  # %in% is FALSE (not NA) for a missing treatment, keeping the `if` scalar-safe.
  treatment_ok <- arg1 %in% TRUSTINFO
  if (treatment_ok && (has("tv") || has("television"))) {
    "info provided by TV"
  } else {
    ""
  }
}
#### helper function: sub_level_16:
#### level: info provided by friends and family
#### arg1: best_treatment_selected
#### arg2: response
sub_level_16 <- function(arg1, arg2) {
  # Heuristic for the sub-level "info provided by friends and family".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns "info provided by friends and family" on a match, otherwise "".
  has <- function(pattern) grepl(pattern, arg2, fixed = TRUE)
  # %in% is FALSE (not NA) for a missing treatment, keeping the `if` scalar-safe.
  treatment_ok <- arg1 %in% TRUSTINFO
  # The whole response names a parent.
  names_parent <- arg2 %in% c("my father", "my mother")
  if (treatment_ok && (has("friend") || has("family") || names_parent)) {
    "info provided by friends and family"
  } else {
    ""
  }
}
#### helper function: sub_level_17:
#### level: info that shows the vaccine is effective/can protect people
#### arg1: best_treatment_selected
#### arg2: response
sub_level_17 <- function(arg1, arg2) {
  # Heuristic for the sub-level
  # "info that shows the vaccine is effective/can protect people".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns the level label on a match, otherwise "".
  has <- function(pattern) grepl(pattern, arg2, fixed = TRUE)
  has_any <- function(patterns) any(vapply(patterns, has, logical(1)))
  # A rule matches when every entry is found in the response; an entry with
  # several strings is satisfied by any one of them.
  rule_matches <- function(rule) all(vapply(rule, has_any, logical(1)))
  rules <- list(
    list("100", c("effect", "safe", "work", "protect")),
    list("avoid", "hospital"),
    list("vaccine", "sav", "live"),
    list("vacc", "covid", "fatal"),
    list("vacc", "protected", "by"),
    list("immu", "system", c("boost", "protection")),
    list("help", "not", "get", "covid"),
    list("n t", "vacc", "die", "positive"),
    list("wont", "kill", "you"),
    list("bad", "not taking", "vaccine"),
    list("n t", "get", "sick", "vaccin"),
    list("reduce", "chance", "get", c("covid", "virus")),
    list("wont", "get", "covid"),
    list("explain", "people", "die", "get", "jab"),
    list("impact", "get", "covid", "affected"),
    list("no longer", "at risk", "vaccinated"),
    list("prevent corona"),
    list("n t", "contract", "covid"),
    list("n t", "test", "positive", "after", "vax"),
    # "virus" with "against" or "protect"; this also covers the narrower
    # case where both are present, so no separate rule is needed for it.
    list("virus", c("against", "protect")),
    list("chance", "low", "get", "covid"),
    list("reduce", "risk", "infect"),
    list("chance", "get", "infect", "less"),
    list("covid", "rat", "drop"),
    list("reduce", "spread", "corona"),
    list("vacc", "help", "guarantee"),
    list("never", "infected", "vaccine"),
    list("dont", "sick", "take", "vacc"),
    list("help", "not", "court", "virus"),
    list("protect", "against", "covid")
  )
  # %in% is FALSE (not NA) for a missing treatment, keeping the `if` scalar-safe.
  treatment_ok <- arg1 %in% c(TRUSTINFO, FAMILYSUPPORT, NOTHING)
  if (treatment_ok && any(vapply(rules, rule_matches, logical(1)))) {
    "info that shows the vaccine is effective/can protect people"
  } else {
    ""
  }
}
#### helper function: sub_level_18:
#### level: info related to side effects
#### arg1: best_treatment_selected
#### arg2: response
sub_level_18 <- function(arg1, arg2) {
  # Heuristic for the sub-level "info related to side effects".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns "info related to side effects" on a match, otherwise "".
  has <- function(pattern) grepl(pattern, arg2, fixed = TRUE)
  # %in% is FALSE (not NA) for a missing treatment, keeping the `if` scalar-safe.
  treatment_ok <- arg1 %in% c(TRUSTINFO, OTHER)
  sick_from_vacc <- (has("illness") || has("sick")) && has("vacc")
  difficult_after <- has("difficult") && has("after") && has("vacc")
  bad_effects <- has("effects of the vaccine") && has("bad")
  dying_from_vaccine <- has("dying from") && has("vaccine")
  if (treatment_ok &&
      (sick_from_vacc || difficult_after || has("side effect") ||
         bad_effects || dying_from_vaccine)) {
    "info related to side effects"
  } else {
    ""
  }
}
#### helper function: sub_level_19:
#### level: info provided by vaccinated people
#### arg1: best_treatment_selected
#### arg2: response
sub_level_19 <- function(arg1, arg2) {
  # Heuristic for the sub-level "info provided by vaccinated people".
  # arg1: lower-cased treatment selected in the previous question.
  # arg2: lower-cased free-text response.
  # Returns "info provided by vaccinated people" on a match, otherwise "".
  has <- function(pattern) grepl(pattern, arg2, fixed = TRUE)
  # %in% is FALSE (not NA) for a missing treatment, keeping the `if` scalar-safe.
  treatment_ok <- arg1 %in% c(TRUSTINFO, OTHER, SOMETHINGELSE, NOTHING)
  peoples_experience <- has("experience") && has("people")
  vaccinated_context <- has("vaccinated") &&
    (has("after") || has("before") || has("have") || has("had") ||
       has("got") || has("get"))
  already_people <- has("already") && has("people")
  people_say <- (has("vaccination") || has("vax")) &&
    (has("people") || has("they")) &&
    (has("say") || has("tell"))
  if (treatment_ok &&
      (peoples_experience || vaccinated_context || already_people ||
         people_say)) {
    "info provided by vaccinated people"
  } else {
    ""
  }
}

# The heuristics above were trained on the data from Pilot 7. Then, we use 50
# free-text responses from Pilots 5 & 6 to perform a quick test.
# Summary metrics for the quick test.
# NOTE(review): test_result is not defined in this part of the file;
# presumably it is the output of heuristic_auto(test) -- confirm.
# Share of rows whose predicted levels equal the manually tagged level.
accuracy <- mean(test_result$predicted_levels == test_result$Specific.Level)
# NOTE(review): the counts 6 and 4 are hard-coded; presumably they were
# counted by hand from the misclassified rows -- confirm before reusing
# this on another test set.
unassigned_rate <- 6 / nrow(test_result)
misclassified_rate <- 4 / nrow(test_result)
metrics <- c("Accuracy", "Unassigned Rate", "Misassigned Rate")
stats <- c(accuracy, unassigned_rate, misclassified_rate)
df <- data.frame(metrics = metrics, stats = stats)
#datatable(df)

| Metrics | Stats |
|---|---|
| Accuracy | 80% |
| Unassigned Rate | 12% |
| Misassigned Rate | 8% |
Accuracy: number of responses predicted correctly / number of responses

Unassigned Rate: number of responses whose prediction misses one or more levels / number of responses

Misassigned Rate: number of responses that have been assigned one or more wrong levels in the prediction / number of responses

Misclassified data:
# Rows whose predicted levels differ from the manually tagged Specific.Level.
filtered <- test_result[
  with(test_result, Specific.Level != predicted_levels),
]
# NOTE(review): datatable() is not defined in this part of the file;
# presumably it comes from the DT package -- confirm it is loaded.
datatable(filtered, rownames = FALSE)

# Testing dataset: