The title says it all!
Among other things, this exploration is useful for checking whether extreme values of F could unduly influence the downstream analyses:
mean_F <- mean(MP1_data$F, na.rm = TRUE)
sd_F <- sd(MP1_data$F, na.rm = TRUE)
# Define thresholds for outliers
threshold_3sd_upper <- mean_F + 3 * sd_F
threshold_3sd_lower <- mean_F - 3 * sd_F
threshold_2.5sd_upper <- mean_F + 2.5 * sd_F
threshold_2.5sd_lower <- mean_F - 2.5 * sd_F
# Identify outliers using ±3 SD
outliers_3sd <- MP1_data %>%
filter(F > threshold_3sd_upper | F < threshold_3sd_lower)
# outliers_3sd # no outlier identified
# Identify outliers using ±2.5 SD
outliers_2.5sd <- MP1_data %>%
filter(F > threshold_2.5sd_upper | F < threshold_2.5sd_lower)
# outliers_2.5sd # only 1 outlier identified
Preliminary Notes
Reproducibility: We set a seed to ensure that the results are reproducible (currently seed = 123; Xuwen to try out a few different seeds and see if the results vary).
Hyperparameters: The hyperparameters currently used are those recommended as typical in the literature. The caret tuning grid is still running to generate the optimal hyperparameters, which may take 1-2 days. All this is to say that the performance of the machine learning methods, specifically random forest and gradient boosting, may improve beyond the current results.
Packages and R Code:
Random forest is trained with ranger() and gradient boosting with gbm(), both through caret::train(); all of these are packages commonly used in the literature.
For R code, we follow the vignettes from the caret package: caret vignette
### Revert Variables back to their original scale
MP1_data<-MP1_data%>%
mutate(numINCOME=numINCOME.OrgScale,
numEDU = numEDU.OrgScale,
AGE = AGE.OrgScale,
CRTScore=CRTScore.OrgScale,
BNT=BNT.OrgScale)
FAM = grep("FAM", names(MP1_data), value = TRUE)
TIME.lg = grep("lg", names(MP1_data), value = TRUE)
BNT2_3 <- grep("BNT2|BNT3", names(MP1_data), value = TRUE)
IV_DV_data <- grep("TIME_SUNK_|TIME_LW_|TIME_DISEASE_|TIME_DEFAULT_|TIME_LESSMORE_", names(MP1_data), value = TRUE)
MP1_data<-MP1_data%>% # ensure that factors are entered as factors,
mutate(
OS_category = as.factor(OS_category),
ZIP_LW_IV2.recoded = as.factor(ZIP_LW_IV2.recoded),
day_of_week = as.factor(day_of_week),
numGENDER = case_when(numGENDER==1 ~ "Female",
TRUE ~ "Male"),
numGENDER = as.factor(numGENDER),
TimeOfDay=as.factor(TimeOfDay),
TimeZone=as.factor(TimeZone),
META_Browser = as.factor(META_Browser),
ATTN1 = as.factor(case_when(ATTN1==1~"_Correct",
TRUE ~ "_InCorrect")),
ATTN2 = as.factor(case_when(ATTN2=="TRUE"~"_Correct",
TRUE ~ "_InCorrect")), # variable hygiene
CONFUSE = as.factor(CONFUSE),
TEDIOUS = as.factor(TEDIOUS),
mobile = as.numeric(mobile),
mobile=as.factor(case_when(
is.na(mobile) == T ~ "_No",
mobile == 0 ~ "_No", # three students have 0 value, but we already know that they use PC.
TRUE ~ "_YES")),
Country=factor(case_when(
Panel=="Students" ~ "Netherlands",
Panel=="Prolific_UK_Rep" ~ "UK",
TRUE ~ "US"
)))%>%
rename(GENDER = numGENDER) %>%
separate(META_Resolution, into = c("Width", "Height"), sep = "x") %>%
mutate(
Width = as.numeric(Width),
Height = as.numeric(Height),
Resolution_Product = Width * Height
)
columns_to_remove <- c(FAM, TIME.lg,BNT2_3,IV_DV_data)
MP1_data<-MP1_data%>%
select(-all_of(columns_to_remove))#remove all the familiarity and time.lg data
### Variable Selection for M1 ###
TIME = grep("\\.Submit", names(MP1_data), value = TRUE)
First_click = grep("\\_First.Click", names(MP1_data), value = TRUE)
Last_click = grep("\\_Last.Click", names(MP1_data), value = TRUE)
Total_click = grep("\\_Click.Count", names(MP1_data), value = TRUE)
### Calculate the differences between the first and the last click times ###
for (i in 1:length(First_click)) {
first_col <- First_click[i]
last_col <- Last_click[i]
diff_col <- paste0(sub("\\_First.Click", "", first_col), "_Click.Time.Diff")
MP1_data <- MP1_data %>%
mutate(!!diff_col := !!sym(last_col) - !!sym(first_col))
}
Click_time_diff = grep("\\_Click.Time.Diff", names(MP1_data), value = TRUE)
### DONE ###
### Additional wrangling to append missing values ###
MP1_data<-MP1_data %>%
# filter(!is.na(day_tempF) & !is.na(hist_tempF) & !is.na(LW_tempdiff)) %>% # unclear why these are NA values but removed for accurate prediction
# as discussed in 07/19 meeting, we retain these observations and impute them by 0.
group_by(Panel) %>%
mutate(
Avg_RecaptchaScore = mean(Q_RecaptchaScore, na.rm = TRUE),
Avg_RelevantIDFraudScore = mean(Q_RelevantIDFraudScore, na.rm = TRUE)
) %>%
ungroup() %>%
mutate(
Q_RecaptchaScore = ifelse(is.na(Q_RecaptchaScore), Avg_RecaptchaScore, Q_RecaptchaScore),
Q_RelevantIDFraudScore = ifelse(is.na(Q_RelevantIDFraudScore), Avg_RelevantIDFraudScore, Q_RelevantIDFraudScore) # append missing Q_RecaptchaScore and Q_RelevantIDFraudScore with Panel average
) %>%
select(-Avg_RecaptchaScore, -Avg_RelevantIDFraudScore)
# NA Check
### DONE ###
### Select Variables of Interest ###
M1_variable = c("Duration..in.seconds.","Q_RecaptchaScore","LocationLatitude","LocationLongitude","Q_RelevantIDFraudScore","CONFUSE","TEDIOUS","day_of_week","minutes_since_midnight","ATTN1","ATTN2","numINCOME","numEDU","GENDER","AGE","Country","mobile","Resolution_Product","META_Browser","Panel")
# ,"OS_category","ZIP_LW_IV2.recoded"
# 072024: after meeting with Eric and Antonia, we will remove SES, adjusted minutes since midnight, TimeZone, and TimeOfDay
# swap out adjusted minutes for minutes since midnight and let the ML methods pick up the interactions themselves, rather than letting my possibly incorrect coding mess things up; the variable also became less important in the importance plot, indicating potential errors in coding, as Eric suggested (the actual timezone demarcation is very complex)
# 072024: we should also try out how a recoded version of operating system works
FACE=c("F","A","C","E")
IV_DV = c("numSunkCost","numLessMore","numDefault","LW_DV","numDisease","SunkCondition","LessMoreCondition","DiseaseCondition","DefaultCondition","numLW_IV")
Text_analysis = grep("(_TTR|_AvgSentenceLength|_FleschKincaid)", names(MP1_data), value = TRUE)
# is the IV for LW correct?
Data_M1 <- MP1_data %>%
select(all_of(c(FACE, IV_DV, TIME, First_click, Last_click, Total_click, M1_variable, Click_time_diff,Text_analysis))) #,Text_analysis
na_columns <- sapply(Data_M1, function(x) any(is.na(x)))
# Extract the names of columns that contain NA values
# names(na_columns[na_columns])
# "TIME_BNTXX_Click.Count" "TIME_BNTXX_Click.Count" .. "TIME_DEMCHAR_Click.Time.Diff"
# all as expected and documented
# Data_M1%>%filter(is.na(TIME_DEMCHAR_Last.Click)) n=3 seems okay
# dim(Data_M1) # 6438 103
# summary(lm(C~DEMCHAR_TTR,Data_M1))
# Text_analysis
# [1] "DEMCHAR_TTR" "DEMCHAR_AvgSentenceLength"
# [3] "DEMCHAR_FleschKincaid" "CONFUSEOPEN_TTR"
# [5] "CONFUSEOPEN_AvgSentenceLength" "CONFUSEOPEN_FleschKincaid"
# [7] "TEDIOUSOPEN_TTR" "TEDIOUSOPEN_AvgSentenceLength"
# [9] "TEDIOUSOPEN_FleschKincaid"
Data_M1<-Data_M1%>%
mutate(across(everything(), ~ replace_na(., 0)))
F_data <- grep("BNT|CRT", names(Data_M1), value = TRUE)
Data_M1a <- Data_M1 %>%
select(-all_of(F_data)) # Remove all data associated with F for M1a
Data_M1a<-Data_M1a%>%
rowwise() %>%
mutate(
avg_first_click = mean(c_across(all_of(grep("First.Click", names(Data_M1a), value = TRUE))), na.rm = TRUE),
avg_last_click = mean(c_across(all_of(grep("Last.Click", names(Data_M1a), value = TRUE))), na.rm = TRUE),
avg_click_count = mean(c_across(all_of(grep("Click.Count", names(Data_M1a), value = TRUE))), na.rm = TRUE),
avg_click_time_diff = mean(c_across(all_of(grep("Click.Time.Diff", names(Data_M1a), value = TRUE))), na.rm = TRUE),
avg_page_submit = mean(c_across(all_of(grep(".Submit", names(Data_M1a), value = TRUE))), na.rm = TRUE)
) %>%
ungroup()
### DONE ###
# Data_M1[duplicated(Data_M1), ] #; none, good
### Perform Log Transformation and scaling
Data_M1a <- Data_M1a %>%
select(Panel, everything()) # move Panel to the beginning of the dataset so we don't dummy-code it
# Identify numeric columns and their indices
numeric_indices <- which(sapply(Data_M1a, is.numeric))
log_columns <- names(Data_M1a)[numeric_indices[numeric_indices >= 16]]
scale_columns <- names(Data_M1a)[numeric_indices[numeric_indices >= 16]]
# perform log transformation among only click time, click count, duration data, and resolution product (also right skewed)
log.exclude_columns <- c("LocationLatitude", "LocationLongitude", "day_tempF", "LW_tempdiff","hist_tempF","Q_RecaptchaScore")
log_columns <- setdiff(log_columns, c(log.exclude_columns, Text_analysis))
Data_M1a.log <- Data_M1a %>%
mutate(
across(.cols = all_of(log_columns), .fns = ~ log(. + 1)))
Data_M1a.log_z<-Data_M1a.log%>%
mutate(
across(.cols = all_of(scale_columns), .fns = scale) # z-score all numeric columns from the 16th column onward
)
###
### One-hot coding ###
# factor_columns <- sapply(Data_M1a, is.factor)
# factors_in_Data_M1a <- Data_M1a[, factor_columns]
# names(factors_in_Data_M1a)
# install.packages("fastDummies")
library(fastDummies)
process_dataset <- function(data) {
data <- data %>%
mutate(META_Browser = relevel(META_Browser, ref = "Chrome"),
CONFUSE = relevel(CONFUSE, ref = "No"),
TEDIOUS = relevel(TEDIOUS, ref = "No"),
day_of_week = relevel(day_of_week, ref = "Monday"),
ATTN1 = relevel(ATTN1, ref = "_InCorrect"),
ATTN2 = relevel(ATTN2, ref = "_InCorrect"),
Country = relevel(Country, ref = "Netherlands"),
mobile = relevel(mobile, ref = "_No"),
GENDER = relevel(GENDER, ref = "Male"))
day_mapping <- c("Sunday" = 0, "Monday" = 1, "Tuesday" = 2, "Wednesday" = 3,
"Thursday" = 4, "Friday" = 5, "Saturday" = 6)
data$day_of_week_num.from_Sunday <- day_mapping[data$day_of_week]
data <- data %>%
select(-day_of_week)
data_part1 <- as.data.frame(data[, 1:15])
data_part2 <- as.data.frame(data[, 16:ncol(data)])
data_part2_dummy_coded <- dummy_cols(data_part2,
remove_first_dummy = TRUE,
remove_selected_columns = TRUE)
data_dummy_coded <- cbind(data_part1, data_part2_dummy_coded)
colnames(data_dummy_coded) <- gsub(" ", "_", colnames(data_dummy_coded))
return(data_dummy_coded)
}
Data_M1a.log_z_dummy_coded <- process_dataset(Data_M1a.log_z)
Data_M1a.log_dummy_coded <- process_dataset(Data_M1a.log)
write.csv(Data_M1a.log_dummy_coded,"Data_M1a.log_dummy_coded_20240820.csv")
# Set seed for reproducibility
set.seed(123)
# Split the data into training and test sets
train_M1a <- createDataPartition(Data_M1a.log_z_dummy_coded$F, p = 0.8, list = FALSE)
variables_to_remove <- c("Panel","A","C","E","numSunkCost","numLessMore","numDefault","LW_DV","numDisease","SunkCondition","LessMoreCondition","DiseaseCondition","DefaultCondition","numLW_IV")
variables_to_remove <- c(variables_to_remove, Text_analysis)
# We need to remove all the above variables other than F for training and testing; but we'd like to see their relationship with the predicted F
# Create training and testing sets
data.trn_M1a <- as.data.frame(Data_M1a.log_z_dummy_coded[train_M1a, ])
data.tst_M1a <- as.data.frame(Data_M1a.log_z_dummy_coded[-train_M1a, ])
# data.tst_M1a[duplicated(rbind(data.trn_M1a, data.tst_M1a))[-(1:nrow(data.trn_M1a))], ] # none, good, no duplicates, so we won't need to worry about data leakage
# Get copies of the original training and testing sets to retain all variables
data.trn_M1a_full <- data.trn_M1a # we won't be needing this but just in case
data.tst_M1a_full <- data.tst_M1a
# remove the specified variables from the training and testing sets
data.trn_M1a <- data.trn_M1a[, !colnames(data.trn_M1a) %in% variables_to_remove]
data.tst_M1a <- data.tst_M1a[, !colnames(data.tst_M1a) %in% variables_to_remove]
# use this when operating system is in the dataset #
# train_os <- unique(data.trn_M1a$META_Operating.System)
# test_os <- unique(data.tst_M1a$META_Operating.System)
# # Check if any values of operating system is in test_os but not in train_os
# missing_in_train <- setdiff(test_os, train_os)
# # Filter the testing dataframe to remove rows containing these values
# data.tst_M1a_cleaned <- data.tst_M1a %>%
# filter(!(META_Operating.System %in% missing_in_train))
# rows_to_remove <- data.tst_M1a %>%
# filter(META_Operating.System %in% missing_in_train)
# nrow(rows_to_remove) # 9, seems alright
# data.tst_M1a<-data.tst_M1a_cleaned
actual_M1a <- data.tst_M1a$F
# dim(data.trn_M1a) # [1] 5152 80; note that country has 3 levels and browser 11 levels
# dim(data.tst_M1a) # [1] 1286 80
# dim(data.tst_M1a_full) # [1] 1286 103 country has 3 levels
### train Random Forest ###
### Set up cross-validation control; here we are not using CV to select the best model: the hyperparameters have already been selected from previous testing using the grid search feature. We just use CV to get a sense of model performance in the training dataset ###
ctrl <- trainControl(method = "cv", number = 10, summaryFunction = defaultSummary)
tuneGrid <- expand.grid(
mtry = c(62),
min.node.size = c(5),
splitrule = c("extratrees")
)
# Model_M1a_rf <- train(F ~ ., data = data.trn_M1a, method = "ranger",
# tuneLength = 5, importance = 'impurity', num.trees = 1000,
# trControl = ctrl)
# save(Model_M1a_rf, file = "./Saved ML Model and Data/Model_M1a_rf_20240820.RData")
load("./Saved ML Model and Data/Model_M1a_rf_20240820.RData")
pred_M1a_rf<- predict(Model_M1a_rf,data.tst_M1a)
test_perf.M1a_rf <- postResample(pred_M1a_rf, actual_M1a)
data.tst_M1a_full.rf <- cbind(data.tst_M1a_full, Predicted_F = pred_M1a_rf)
### DONE Training RF Model ###
### Train Benchmark Regression Model ###
Model_M1a_reg <- train(F ~ ., data = data.trn_M1a, method = "lm")
pred_M1a_reg<- predict(Model_M1a_reg,data.tst_M1a)
test_perf.M1a_reg <- postResample(pred_M1a_reg, actual_M1a)
data.tst_M1a_full.reg <- cbind(data.tst_M1a_full, Predicted_F = pred_M1a_reg)
### DONE training reg Model ###
### Train Elastic Net ###
Model_M1a_net <- train(F ~ ., data = data.trn_M1a, method = "glmnet", trControl = ctrl,tuneLength=40) # 10 fold cv to select the best model
pred_M1a_net<- predict(Model_M1a_net,data.tst_M1a)
test_perf.M1a_net <- postResample(pred_M1a_net, actual_M1a)
data.tst_M1a_full.net <- cbind(data.tst_M1a_full, Predicted_F = pred_M1a_net)
### Done Training Elastic Net ###
### train Gradient Boosting ###
tuneGrid <- expand.grid(
n.trees = c(1250), # Number of trees (boosting iterations)
interaction.depth = c( 10), # Tree depth
shrinkage = c(0.01), # Learning rate
n.minobsinnode = c(5) # Minimum number of observations in terminal nodes
)
# Model_M1a_gbm <- train(F ~ ., data = data.trn_M1a, method = "gbm",
# tuneGrid = tuneGrid, trControl = ctrl,
# verbose = FALSE)
# save(Model_M1a_gbm, file = "./Saved ML Model and Data/Model_M1a_gbm_20240820.RData") # ensure all continuous variables are in numeric format, not array or matrix; this is needed for iml to calculate the permutation importance scores.
load("./Saved ML Model and Data/Model_M1a_gbm_20240820.RData")
pred_M1a_gbm <- predict(Model_M1a_gbm, data.tst_M1a)
test_perf.M1a_gbm <- postResample(pred_M1a_gbm, actual_M1a)
data.tst_M1a_full.gbm <- cbind(data.tst_M1a_full, Predicted_F = pred_M1a_gbm)
# Retrieve training data stats
# we trained just one model each for reg, rf, and gbm, so the which.min arg is fine.
# for elastic net, this section needs a bit more work, but the results are only slightly off; need to come back to this
train_perf_M1a_rf <- Model_M1a_rf$results[which.min(Model_M1a_rf$results$RMSE), c("RMSE", "Rsquared", "MAE")]
train_perf_M1a_reg <- Model_M1a_reg$results[which.min(Model_M1a_reg$results$RMSE), c("RMSE", "Rsquared", "MAE")]
train_perf_M1a_net <- Model_M1a_net$results[which.min(Model_M1a_net$results$RMSE), c("RMSE", "Rsquared", "MAE")]
train_perf_M1a_gbm <- Model_M1a_gbm$results[which.min(Model_M1a_gbm$results$RMSE), c("RMSE", "Rsquared", "MAE")]
# Combine training performance metrics into a data frame
performance_metrics_trn_M1 <- data.frame(
Model = factor(c("Linear Regression", "Elastic Net", "Random Forest", "Gradient Boosting"),
levels = c("Linear Regression", "Elastic Net", "Random Forest", "Gradient Boosting")),
MAE = c(as.numeric(train_perf_M1a_reg["MAE"]), as.numeric(train_perf_M1a_net["MAE"]), as.numeric(train_perf_M1a_rf["MAE"]), as.numeric(train_perf_M1a_gbm["MAE"])),
RMSE = c(as.numeric(train_perf_M1a_reg["RMSE"]), as.numeric(train_perf_M1a_net["RMSE"]), as.numeric(train_perf_M1a_rf["RMSE"]), as.numeric(train_perf_M1a_gbm["RMSE"])),
Rsquared = c(as.numeric(train_perf_M1a_reg["Rsquared"]), as.numeric(train_perf_M1a_net["Rsquared"]), as.numeric(train_perf_M1a_rf["Rsquared"]), as.numeric(train_perf_M1a_gbm["Rsquared"])))
performance_table_trn <- performance_metrics_trn_M1 %>%
mutate(across(where(is.numeric), round, 4))
# Display the table
kable(performance_table_trn, caption = "Comparison of Model Performance in Training Data using Cross-Validation") %>%
kable_styling(full_width = F, position = "center")
Model | MAE | RMSE | Rsquared |
---|---|---|---|
Linear Regression | 0.4276 | 0.5342 | 0.4638 |
Elastic Net | 0.4246 | 0.5311 | 0.4725 |
Random Forest | 0.3977 | 0.5085 | 0.5181 |
Gradient Boosting | 0.3790 | 0.4925 | 0.5457 |
### Plot ###
performance_metrics_long_trn <- performance_metrics_trn_M1 %>%
pivot_longer(cols = c(RMSE, Rsquared, MAE), names_to = "Metric", values_to = "Value")
trn_M1a.sd_F <- sd(data.trn_M1a$F)
ggplot(performance_metrics_long_trn, aes(x = Metric, y = Value, fill = Model)) +
geom_bar(stat = "identity", position = "dodge") +
geom_segment(data = performance_metrics_long_trn %>% filter(Metric == "RMSE"),
aes(x = 1.6, xend = 2.4, y = trn_M1a.sd_F, yend = trn_M1a.sd_F),
linetype = "dashed", color = "#FF9999") +
geom_text(data = performance_metrics_long_trn %>% filter(Metric == "RMSE"),
aes(x = 2, y = trn_M1a.sd_F, label = paste0("Std.dev of F in Training Data: ", round(trn_M1a.sd_F, 2))),
vjust = -0.5, color = "#FF9999") +
geom_text(aes(label = round(Value, 2)), position = position_dodge(width = 0.9), vjust = -0.5) +
scale_fill_manual(values = c("gray80", "gray50", "gray30", "gray20")) + # Less colorful palette
labs(title = "Comparison of Model Performance in Training Data using Cross-Validation",
x = "Metric",
y = "Value") +
theme_minimal() +
theme(legend.position = "bottom")
Top 20 Important Variables Across Models
The variable importance plot below is based on the final model selected during model training.
Score Interpretation: The most important variable is given a score of 100, and the other variables' scores are scaled relative to it, i.e., each is expressed as a percentage of the top variable's importance.
Linear Regression: Importance is the absolute value of the t-statistic for each variable in the model.
Elastic Net: Importance is the absolute value of each variable's coefficient in the model.
Random Forest and Gradient Boosting Models (Permutation Importance Plot): Importance is based on how much model fit degrades when the values of a variable are permuted. Here, the model fit metric we are using is the mean absolute error (MAE), which gives less weight to error due to outliers than MSE does. For each tree in the random forest, the MAE on the out-of-bag (OOB) portion of the data is recorded. Then, for each predictor variable, its values are permuted, breaking the relationship between that predictor and the DV, and the MAE is recorded again using the permuted data. The difference between the original and permuted MAE, which reflects the impact of the variable, is calculated for each tree, averaged over all trees in the forest, and normalized by the standard error. The same principle applies to gradient boosting.
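To make the permutation logic concrete, below is a minimal sketch of the calculation with MAE as the loss. This is illustrative only (a hypothetical perm_importance() helper, assuming a fitted caret model with a predict() method and a data frame that contains the outcome F); the scores reported below actually come from the iml code further down.
perm_importance <- function(fit, df, outcome = "F", n_rep = 10) {
mae <- function(a, b) mean(abs(a - b))
base_mae <- mae(df[[outcome]], predict(fit, df)) # baseline error on intact data
sapply(setdiff(names(df), outcome), function(v) {
perm_mae <- replicate(n_rep, {
df_perm <- df
df_perm[[v]] <- sample(df_perm[[v]]) # permute v: break its link with the DV
mae(df_perm[[outcome]], predict(fit, df_perm))
})
mean(perm_mae) - base_mae # importance = increase in MAE after permutation
})
}
# e.g., perm_importance(Model_M1a_gbm, data.tst_M1a)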
Reference Level for Factors: as set in process_dataset() above, the reference levels are Chrome (META_Browser), No (CONFUSE, TEDIOUS), Monday (day_of_week, before conversion to numeric), _InCorrect (ATTN1, ATTN2), Netherlands (Country), _No (mobile), and Male (GENDER).
# Compare the imp plot and see if longitude is in there when adjusted vs. unadjusted
# Extract the top 20 important variables for each model
# install.packages("iml")
varImp_reg_M1a <- varImp(Model_M1a_reg)$importance %>%
arrange(desc(Overall)) %>%
head(20)
varImp_net_M1a <- varImp(Model_M1a_net)$importance %>%
arrange(desc(Overall)) %>%
head(20)
varImp_reg_M1a_df <- data.frame(Variable = rownames(varImp_reg_M1a), LinearRegression = varImp_reg_M1a$Overall)
varImp_net_M1a_df <- data.frame(Variable = rownames(varImp_net_M1a), ElasticNet = varImp_net_M1a$Overall)
# library(iml)
# iml will mess some packages up...
# predictor.gbm <- Predictor$new(Model_M1a_gbm, data = data.tst_M1a, y = data.tst_M1a$F)
# imp.gbm <- FeatureImp$new(predictor.gbm, loss = "mae",n.repetitions=10)
#
# varImp_gbm_M1a_perm <- imp.gbm$results %>%
# arrange(desc(importance)) %>%
# head(20) %>%
# select(Overall = importance, Variable = feature)
#
# predictor.rf <- Predictor$new(Model_M1a_rf, data = data.tst_M1a, y = data.tst_M1a$F)
# imp.rf <- FeatureImp$new(predictor.rf, loss = "mae",n.repetitions=10)
#
# varImp_rf_M1a_perm <- imp.rf$results %>%
# arrange(desc(importance)) %>%
# head(20) %>%
# select(Overall = importance, Variable = feature)
# save(varImp_gbm_M1a_perm, varImp_rf_M1a_perm, file = "./Saved ML Model and Data/varImp_results_20240818.RData")
load("./Saved ML Model and Data/varImp_results_20240818.RData")
varImp_gbm_M1a_df <- varImp_gbm_M1a_perm %>%
mutate(GradientBoosting = (Overall / max(Overall)) * 100) %>%
select(Variable, GradientBoosting)
# For Random Forest Model
varImp_rf_M1a_df <- varImp_rf_M1a_perm %>%
mutate(RandomForest = (Overall / max(Overall)) * 100) %>%
select(Variable, RandomForest)
# Combine the variables from all models
all_vars <- unique(c(varImp_gbm_M1a_df$Variable, varImp_rf_M1a_df$Variable, varImp_reg_M1a_df$Variable, varImp_net_M1a_df$Variable))
# Create a data frame to store the importance scores
importance_df.M1a <- data.frame(
Variable = all_vars
)
importance_df.M1a <- importance_df.M1a %>%
left_join(varImp_gbm_M1a_df, by = "Variable") %>%
left_join(varImp_rf_M1a_df, by = "Variable") %>%
left_join(varImp_reg_M1a_df, by = "Variable") %>%
left_join(varImp_net_M1a_df, by = "Variable") %>%
replace(is.na(.), 0)
print("To compare variable importance across models, I extracted the top 20 important variables from each model. These variables were combined into a single data frame: ")
## [1] "To compare variable importance across models, I extracted the top 20 important variables from each model. These variables were combined into a single data frame: "
importance_df.M1a <- importance_df.M1a %>%
mutate(Variable = case_when(
Variable == "minutes_since_midnight" ~ "Survey Taking Time Recorded as EST - minutes since midnight",
TRUE ~ Variable
),
Variable = gsub("DEMCHAR", "Study_Purpose_OpenEnd",Variable),
Variable = gsub("Page.Submit", "Duration", Variable),
Variable = gsub("ZiIP_LW_IV2", "ZipCode", Variable),
Variable = gsub("TIME_", "Page_", Variable),
Variable = gsub("ATTN1", "ATTN_Check.1", Variable),
Variable = gsub("ATTN2", "ATTN_Check.2", Variable),
Variable = gsub("Last.Click", "Last.Click.Time", Variable),
Variable = gsub("First.Click", "First.Click.Time", Variable))
kable(importance_df.M1a, caption = "Top 20 Important Variables Across Models") %>%
kable_styling(full_width = F, position = "center") %>%
column_spec(1, bold = TRUE, border_right = TRUE) %>%
column_spec(2:5, width = "3cm")
Variable | GradientBoosting | RandomForest | LinearRegression | ElasticNet |
---|---|---|---|---|
ATTN_Check.1__Correct | 100.00000 | 98.14349 | 94.52165 | 35.795106 |
LocationLatitude | 99.97605 | 87.64898 | 0.00000 | 0.000000 |
Resolution_Product | 98.75258 | 88.76601 | 44.25614 | 14.743467 |
numEDU | 96.67372 | 89.80047 | 100.00000 | 14.018191 |
Page_ATTN_Check.1_Last.Click.Time | 95.92211 | 89.04308 | 27.60484 | 14.853240 |
Survey Taking Time Recorded as EST - minutes since midnight | 95.33153 | 88.20340 | 0.00000 | 0.000000 |
mobile__YES | 94.95348 | 97.00517 | 33.72902 | 24.447335 |
Page_Study_Purpose_OpenEnd_Duration | 94.95258 | 87.94706 | 61.65453 | 14.333598 |
GENDER_1 | 93.70086 | 90.01515 | 0.00000 | 0.000000 |
AGE | 92.28768 | 88.10571 | 43.23756 | 0.000000 |
Page_ATTN_Check.2_Duration | 92.10377 | 87.73942 | 0.00000 | 9.549017 |
CONFUSE_Yes | 92.09031 | 88.89399 | 61.89447 | 18.083348 |
Country_US | 92.05490 | 100.00000 | 88.22409 | 100.000000 |
Page_ATTN_Check.1_First.Click.Time | 91.98113 | 88.55550 | 0.00000 | 0.000000 |
Page_AGE_Last.Click.Time | 91.90696 | 0.00000 | 0.00000 | 0.000000 |
Page_Study_Purpose_OpenEnd_Last.Click.Time | 91.79630 | 0.00000 | 0.00000 | 0.000000 |
Page_AGE_Duration | 91.73406 | 0.00000 | 0.00000 | 0.000000 |
day_of_week_num.from_Sunday | 91.72256 | 87.80543 | 49.64297 | 0.000000 |
LocationLongitude | 91.71942 | 0.00000 | 0.00000 | 0.000000 |
Page_iNTRO_Last.Click.Time | 91.66663 | 0.00000 | 0.00000 | 0.000000 |
ATTN_Check.2__Correct | 0.00000 | 88.28006 | 0.00000 | 0.000000 |
Country_UK | 0.00000 | 88.25480 | 47.45803 | 42.829407 |
Page_ATTN_Check.1_Duration | 0.00000 | 88.19777 | 0.00000 | 0.000000 |
Page_ATTN_Check.2_Last.Click.Time | 0.00000 | 87.70821 | 0.00000 | 0.000000 |
META_Browser_Firefox | 0.00000 | 87.66050 | 30.28525 | 17.160250 |
GENDER_Female | 0.00000 | 0.00000 | 74.08680 | 18.846550 |
META_Browser_Edge | 0.00000 | 0.00000 | 39.04502 | 21.242459 |
Q_RecaptchaScore | 0.00000 | 0.00000 | 35.01799 | 0.000000 |
Page_ZipCode_Duration | 0.00000 | 0.00000 | 27.47616 | 0.000000 |
META_Browser_Safari | 0.00000 | 0.00000 | 25.43181 | 20.265583 |
Page_AGE_Click.Time.Diff | 0.00000 | 0.00000 | 25.41183 | 11.159723 |
Page_Study_Purpose_OpenEnd_Click.Count | 0.00000 | 0.00000 | 24.58357 | 0.000000 |
Page_ZipCode_Click.Count | 0.00000 | 0.00000 | 23.02877 | 0.000000 |
META_Browser_Firefox_iPhone | 0.00000 | 0.00000 | 0.00000 | 88.621902 |
TEDIOUS_ | 0.00000 | 0.00000 | 0.00000 | 38.110102 |
META_Browser_AppleWebKit | 0.00000 | 0.00000 | 0.00000 | 27.725102 |
META_Browser_Chrome_iPhone | 0.00000 | 0.00000 | 0.00000 | 21.877634 |
META_Browser_Safari_iPad | 0.00000 | 0.00000 | 0.00000 | 10.193386 |
# what are the variables important in ML methods that do not show up in regular regression models?
# min since midnight; Longitude; product resolution; Latitude
This paper shows that both the permutation method (used in the previous section for random forest and gradient boosting) and SHAP-value methods are among the most accurate approaches for interpreting ML models.
For full details of SHAP values, refer to this paper; see this paper for an example of how SHAP values have been reported. Brief summaries below (Xuwen to work on this documentation more…)
SHAP values combine concepts from Local Interpretable Model-Agnostic Explanations (LIME) and Shapley values. The former builds a local linear model to assess how predictions change in the neighborhood of each observation (by perturbing the data around the observation and recording the model's predictions); the latter assigns each variable a fair contribution to the prediction based on principles from game theory. For each data point, SHAP values represent the marginal contribution of each feature to the prediction. For example, for a given observation, we calculate how the specific value of each feature influences its prediction compared to if the feature's value were set to a reference level (the mean in the entire dataset is commonly used).
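For orientation, the Shapley value underlying SHAP has the standard game-theoretic form

$$\phi_i \;=\; \sum_{S \subseteq N \setminus \{i\}} \frac{|S|!\,(|N|-|S|-1)!}{|N|!}\,\bigl[v(S \cup \{i\}) - v(S)\bigr]$$

where N is the set of all features, v(S) is the model's expected prediction when only the features in S are known, and φ_i is feature i's fair share of the difference between the prediction and the reference (average) prediction.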
A SHAP value of 1 for feature X in prediction i means that feature X increases the prediction for observation i by 1 unit compared to what the prediction would have been if feature X had been at its reference level (e.g., the mean of feature X in the entire dataset).
Below, we 1] plot the mean of |SHAP values| to identify the most important variables, and 2] a beeswarm plot of SHAP values, which shows both the magnitude and direction of SHAP values, indicating whether a feature increases or decreases each prediction. Each data point in 2] represents an individual observation, showing the SHAP value of a specific feature for that observation; the color of the points reflects the feature value (e.g., low to high).
There are many different implementations for calculating SHAP values. We calculate SHAP values for gradient boosting and random forest using the treeshap package. TreeSHAP is specifically optimized for tree-based methods and computes SHAP values efficiently: it uses the structure of the decision trees to calculate exact SHAP values in polynomial time.
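To give a sense of what "polynomial time" means here (as reported in the TreeSHAP literature), exact computation runs on the order of

$$O(T\,L\,D^2)$$

per observation, where T is the number of trees, L the maximum number of leaves per tree, and D the maximum tree depth; naive exact Shapley computation is instead exponential in the number of features.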
set.seed(123)
# Model_M1a_gbm.shap <- gbm(
# formula = F ~ .,
# data = data.trn_M1a,
# distribution = "gaussian",
# n.trees = 1250,
# interaction.depth = 10,
# shrinkage = 0.01,
# n.minobsinnode = 5,
# cv.folds = 10,
# keep.data = TRUE,
# verbose = T
# ) # matching MAE, rmse, and rsquare
# unified.gbm <- unify(Model_M1a_gbm.shap, data.trn_M1a)
# treeshap.gbm <- treeshap(unified.gbm, data.trn_M1a, verbose = 1)
# Train the model using ranger directly
# Model_M1a_rf.shap <- ranger(
# formula = F ~ .,
# data = data.trn_M1a,
# num.trees = 1000,
# mtry = 62,
# min.node.size = 5,
# splitrule = "extratrees",
# importance = 'impurity',
# probability = FALSE,
# respect.unordered.factors = "partition",
# verbose = TRUE
# )
# # Matching results with test sample.
#
# unified.rf <- unify(Model_M1a_rf.shap, data.trn_M1a)
# treeshap.rf <- treeshap(unified.rf, data.trn_M1a, verbose = 1)
# save(Model_M1a_gbm.shap,treeshap.gbm, treeshap.rf,file="./Saved ML Model and Data/Model_M1a_gbm.shap_20240819.RData")
load("./Saved ML Model and Data/Model_M1a_gbm.shap_20240819.RData")
data.trn_M1a.shap<-data.trn_M1a%>%
select(GENDER_1=GENDER_Female,everything())
shp.gbm <- shapviz(treeshap.gbm, X = data.trn_M1a.shap)
shp.rf <- shapviz(treeshap.rf, X = data.trn_M1a.shap)
plot_rf<-sv_importance(shp.rf)+
ggtitle("Summary of SHAP values (mean |SHAP values|) - Random Forest ") +
theme(plot.title = element_text(hjust = 0.5))
plot_gbm<-sv_importance(shp.gbm)+
ggtitle("Summary of SHAP values (mean |SHAP values|) - GBM ") +
theme(plot.title = element_text(hjust = 0.5))
combined_plot <- plot_rf / plot_gbm
combined_plot
plot_rf <- sv_importance(shp.rf, kind = "beeswarm") +
ggtitle("SHAP value Beeswarm plot - Random forest ") +
theme(plot.title = element_text(hjust = 0.5))
plot_gbm <- sv_importance(shp.gbm, kind = "beeswarm") +
ggtitle("SHAP value Beeswarm plot - GBM") +
theme(plot.title = element_text(hjust = 0.5))
combined_plot <- plot_rf / plot_gbm
combined_plot
library(GGally)
predicted_f_reg <- data.tst_M1a_full.reg$Predicted_F
predicted_f_rf <- data.tst_M1a_full.rf$Predicted_F
predicted_f_gbm <- data.tst_M1a_full.gbm$Predicted_F
predicted_f_net <- data.tst_M1a_full.net$Predicted_F
actual_F<-data.tst_M1a_full.net$F
# Combine into a data frame
predicted_f <- data.frame(
Pred_F.Reg = predicted_f_reg,
Pred_F.RandomForest = predicted_f_rf,
Pred_F.GradientBoosting = predicted_f_gbm,
Pred_F.ElasticNet = predicted_f_net,
F = actual_F
)
ggpairs(predicted_f, c("Pred_F.Reg","Pred_F.ElasticNet","Pred_F.RandomForest","Pred_F.GradientBoosting","F"),
lower = list(continuous = wrap("points",
position = position_jitter(height = .02, width = .02))),diag = list(continuous = "density"))
# Bar plot indicating overall model performance, plotting RMSE, R-squared, and MAE
# Combine performance metrics into a data frame
performance_metrics <- data.frame(
Model = factor(c("Linear Regression", "Elastic Net", "Random Forest", "Gradient Boosting"),
levels = c("Linear Regression", "Elastic Net", "Random Forest", "Gradient Boosting")),
MAE = c(test_perf.M1a_reg["MAE"], test_perf.M1a_net["MAE"], test_perf.M1a_rf["MAE"], test_perf.M1a_gbm["MAE"]),
RMSE = c(test_perf.M1a_reg["RMSE"], test_perf.M1a_net["RMSE"], test_perf.M1a_rf["RMSE"], test_perf.M1a_gbm["RMSE"]),
Rsquared = c(test_perf.M1a_reg["Rsquared"], test_perf.M1a_net["Rsquared"], test_perf.M1a_rf["Rsquared"], test_perf.M1a_gbm["Rsquared"])
)
performance_metrics_long <- performance_metrics %>%
pivot_longer(cols = c(RMSE, Rsquared, MAE), names_to = "Metric", values_to = "Value")
# Create the performance metrics table
performance_table <- performance_metrics %>%
mutate(across(where(is.numeric), round, 4))
# Display the table
kable(performance_table, caption = "Comparison of Model Performance in Held-Out Test Data") %>%
kable_styling(full_width = F, position = "center")
Model | MAE | RMSE | Rsquared |
---|---|---|---|
Linear Regression | 0.4228 | 0.5341 | 0.4808 |
Elastic Net | 0.4230 | 0.5335 | 0.4819 |
Random Forest | 0.3996 | 0.5097 | 0.5299 |
Gradient Boosting | 0.3773 | 0.4901 | 0.5635 |
tst_M1a.sd_F <- sd(data.tst_M1a$F)
ggplot(performance_metrics_long, aes(x = Metric, y = Value, fill = Model)) +
geom_bar(stat = "identity", position = "dodge") +
geom_segment(data = performance_metrics_long %>% filter(Metric == "RMSE"),
aes(x = 1.6, xend = 2.4, y = tst_M1a.sd_F, yend = tst_M1a.sd_F),
linetype = "dashed", color = "#FF9999") +
geom_text(data = performance_metrics_long %>% filter(Metric == "RMSE"),
aes(x = 2, y = tst_M1a.sd_F, label = paste0("Std.dev of F in Held-Out Test Data: ", round(tst_M1a.sd_F, 2))),
vjust = -0.5, color = "#FF9999") +
geom_text(aes(label = round(Value, 2)), position = position_dodge(width = 0.9), vjust = -0.5) +
scale_fill_manual(values = c("gray80", "gray50", "gray30", "gray20")) + # Less colorful palette
labs(title = "Comparison of Model Performance in Held-Out Testing Data",
x = "Metric",
y = "Value") +
theme_minimal() +
theme(legend.position = "bottom")
# summary(lm(F~Predicted_F+Panel,data.tst_M1a_full.reg)) # potential disadvantage of using panel as a source of FACE heterogeneity?
# summary(lm(F~Predicted_F,data.tst_M1a_full.net))
# summary(lm(F~Predicted_F,data.tst_M1a_full.rf))
# summary(lm(F~Predicted_F,data.tst_M1a_full.gbm))
hist(data.tst_M1a$F)
library(gridExtra)
pred_M1a_rf <- predict(Model_M1a_rf, data.tst_M1a)
pred_M1a_reg <- predict(Model_M1a_reg, data.tst_M1a)
pred_M1a_net <- predict(Model_M1a_net, data.tst_M1a)
pred_M1a_gbm <- predict(Model_M1a_gbm, data.tst_M1a)
# Calculate residuals
residuals_rf <- actual_M1a - pred_M1a_rf
residuals_reg <- actual_M1a - pred_M1a_reg
residuals_net <- actual_M1a - pred_M1a_net
residuals_gbm <- actual_M1a - pred_M1a_gbm
# Create a data frame with residuals and actual values
residuals_data <- data.frame(
Actual = actual_M1a,
Pred_RF = pred_M1a_rf,
Pred_Reg = pred_M1a_reg,
Pred_NET = pred_M1a_net,
Pred_GBM = pred_M1a_gbm,
Residual_RF = residuals_rf,
Residual_Reg = residuals_reg,
Residual_NET = residuals_net,
Residual_GBM = residuals_gbm
)
create_density_plot <- function(predicted, actual, title) {
ggplot() +
geom_density(aes(x = predicted, color = "Predicted F"), alpha = 0.4, size = 1) +
geom_density(aes(x = actual, color = "Actual F"), alpha = 0.4, size = 1) +
scale_y_continuous(limits = c(0, 1)) +
scale_color_manual(values = c("Actual F" = "firebrick", "Predicted F" = "darkorange")) +
labs(title = title, x = "F Score", y = "Density") +
theme_minimal() +
theme(legend.position = "bottom", legend.title = element_blank())
}
# Create density plots for each model
density_rf <- create_density_plot(residuals_data$Pred_RF, residuals_data$Actual, "Density Plot for Random Forest")
density_reg <- create_density_plot(residuals_data$Pred_Reg, residuals_data$Actual, "Density Plot for Regression")
density_net <- create_density_plot(residuals_data$Pred_NET, residuals_data$Actual, "Density Plot for Elastic Net")
density_gbm <- create_density_plot(residuals_data$Pred_GBM, residuals_data$Actual, "Density Plot for Gradient Boosting")
# Arrange all density plots in a grid
grid.arrange(density_reg, density_net, density_rf, density_gbm, ncol = 2, nrow = 2)
create_residual_plot <- function(actual, residuals, title) {
ggplot(residuals_data, aes(x = actual, y = residuals)) +
geom_point(alpha = 0.4) +
geom_rug(sides = "b", alpha = 0.2) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(title = title, x = "Actual F Score", y = "Residuals") +
theme_minimal() +
ylim(-2, 2)
}
# Create residual plots for each model
residual_rf <- create_residual_plot(residuals_data$Actual, residuals_data$Residual_RF, "Residuals Plot for Random Forest")
residual_reg <- create_residual_plot(residuals_data$Actual, residuals_data$Residual_Reg, "Residuals Plot for Regression")
residual_net <- create_residual_plot(residuals_data$Actual, residuals_data$Residual_NET, "Residuals Plot for Elastic Net")
residual_gbm <- create_residual_plot(residuals_data$Actual, residuals_data$Residual_GBM, "Residuals Plot for Gradient Boosting")
# Arrange all residual plots in a grid
grid.arrange( residual_reg, residual_net,residual_rf, residual_gbm, ncol = 2, nrow = 2)
ggpairs(data.tst_M1a_full.rf, c("Predicted_F","F","A","C", "E"),
lower = list(continuous = wrap("points",
position = position_jitter(height = .02, width = .02))),diag = list(continuous = "density"))
ggpairs(data.tst_M1a_full.gbm, c("Predicted_F","F","A","C", "E"),
lower = list(continuous = wrap("points",
position = position_jitter(height = .02, width = .02))),diag = list(continuous = "density"))
# Data_M1$Panel <- factor(Data_M1$Panel)
# Data_M1$Panel <- relevel(Data_M1$Panel, ref = "Students")
# (Default.PAN <- glm(numDefault ~ DefaultCondition * Panel, data = Data_M1, family = binomial)) # good - matches with SI
# (Default.PAN_FACE <- glm(numDefault ~ DefaultCondition * Panel + DefaultCondition * F + DefaultCondition * A+ DefaultCondition * C+ DefaultCondition * E, data = Data_M1, family = binomial)) # good - matches with SI.
# (Disease.PAN <- glm(numDisease ~ DiseaseCondition * Panel, data = Data_M1, family = binomial)) # good - matches with SI
# (Disease.PAN_FACE <- glm(numDisease ~ DiseaseCondition * Panel + DiseaseCondition * F + DiseaseCondition * A+ DiseaseCondition * C+ DiseaseCondition * E, data = Data_M1, family = binomial)) # good - matches with SI.
# (LessMore.PAN <- lm(numLessMore ~ LessMoreCondition * Panel, data = Data_M1)) # good - matches with SI
# summary(LessMore.PAN)# good - matches with SI
# (LessMore.PAN_FACE <- lm(numLessMore ~ LessMoreCondition * Panel + LessMoreCondition * F + LessMoreCondition * A+ LessMoreCondition * C+ LessMoreCondition * E, data = Data_M1)) # good - matches with SI.
# summary(LessMore.PAN_FACE)# good - matches with SI
# (Sunk.PAN <- lm(numSunkCost ~ SunkCondition * Panel, data = Data_M1)) # good - matches with SI
# summary(Sunk.PAN)# good - matches with SI
# (Sunk.PAN_FACE <- lm(numSunkCost ~ SunkCondition * Panel + SunkCondition * F + SunkCondition * A+ SunkCondition * C+ SunkCondition * E, data = Data_M1)) # good - matches with SI.
# summary(Sunk.PAN_FACE)# good - matches with SI
We then examine whether, compared to analysis 1], analysis 2] shows i) a reduction in the average explained variance of the Panel main effect and the Panel × condition interaction, and ii) an improvement in model fit.
Default.PAN <- glm(numDefault ~ DefaultCondition * Panel, data = data.tst_M1a_full.reg, family = binomial)
Default.PAN_FACE <- glm(numDefault ~ DefaultCondition * F + DefaultCondition * A+ DefaultCondition * C+ DefaultCondition * E + DefaultCondition * Panel , data = data.tst_M1a_full.reg, family = binomial)
Default.PAN_Pred_F.reg_ACE <- glm(numDefault ~ DefaultCondition * Predicted_F + DefaultCondition * A+ DefaultCondition * C+ DefaultCondition * E + DefaultCondition * Panel, data = data.tst_M1a_full.reg , family = binomial)
Default.PAN_Pred_F.net_ACE <- glm(numDefault ~ DefaultCondition * Predicted_F + DefaultCondition * A+ DefaultCondition * C+ DefaultCondition * E + DefaultCondition * Panel, data = data.tst_M1a_full.net, family = binomial)
Default.PAN_Pred_F.rf_ACE <- glm(numDefault ~ DefaultCondition * Predicted_F + DefaultCondition * A+ DefaultCondition * C+ DefaultCondition * E + DefaultCondition * Panel, data = data.tst_M1a_full.rf, family = binomial)
Default.PAN_Pred_F.gbm_ACE <- glm(numDefault ~ DefaultCondition * Predicted_F + DefaultCondition * A+ DefaultCondition * C+ DefaultCondition * E + DefaultCondition * Panel, data = data.tst_M1a_full.gbm, family = binomial)
# extract_aic <- function(model_list) {
# model_names <- names(model_list)
# aic_values <- sapply(model_list, function(model) AIC(model))
#
# aic_table <- data.frame(
# Model = model_names,
# AIC = aic_values,
# stringsAsFactors = FALSE
# )
#
# return(aic_table)
# }
Default_model <- list(
"Default.PAN" = Default.PAN,
"Default.PAN_FACE" = Default.PAN_FACE,
"Default.PAN_Pred_F.reg_ACE" = Default.PAN_Pred_F.reg_ACE,
"Default.PAN_Pred_F.net_ACE" = Default.PAN_Pred_F.net_ACE,
"Default.PAN_Pred_F.rf_ACE" = Default.PAN_Pred_F.rf_ACE,
"Default.PAN_Pred_F.gbm_ACE" = Default.PAN_Pred_F.gbm_ACE
)
term_names <- names(coef(Default.PAN))
panel_terms <- term_names[grepl("Panel", term_names)]
Default_result<-tab_model(Default.PAN, Default.PAN_FACE, Default.PAN_Pred_F.reg_ACE,Default.PAN_Pred_F.net_ACE,Default.PAN_Pred_F.rf_ACE,Default.PAN_Pred_F.gbm_ACE,
show.ci = FALSE, show.stat = TRUE, show.se = TRUE, show.re.var = TRUE,
show.icc = FALSE, show.r2=0,show.aic = 1,
show.loglik = 1,
show.fstat = 0,
rm.terms = panel_terms,
dv.labels = c("PAN",
"PAN_FACE",
"PAN_Pred.F(reg)_ACE",
"PAN_Pred.F(net)_ACE",
"PAN_Pred.F(rf)_ACE",
"PAN_Pred.F(gbm)_ACE"), title = "Default Paradigm; ✔ Panel and interaction terms are included in the model but omitted for readability")
Default_result
Models, left to right: PAN, PAN_FACE, PAN_Pred.F(reg)_ACE, PAN_Pred.F(net)_ACE, PAN_Pred.F(rf)_ACE, PAN_Pred.F(gbm)_ACE. Columns per model: Odds Ratios (OR), std. Error (SE), Statistic (Stat), p.

Predictors | OR | SE | Stat | p | OR | SE | Stat | p | OR | SE | Stat | p | OR | SE | Stat | p | OR | SE | Stat | p | OR | SE | Stat | p
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
(Intercept) | 0.66 | 0.19 | -1.43 | 0.152 | 0.66 | 0.20 | -1.40 | 0.162 | 0.66 | 0.20 | -1.37 | 0.171 | 0.67 | 0.20 | -1.37 | 0.172 | 0.69 | 0.21 | -1.25 | 0.210 | 0.67 | 0.20 | -1.36 | 0.175
DefaultCondition [OPTNEUTRAL] | 3.61 | 1.45 | 3.19 | 0.001 | 3.64 | 1.48 | 3.18 | 0.001 | 3.64 | 1.48 | 3.17 | 0.002 | 3.65 | 1.49 | 3.17 | 0.002 | 3.77 | 1.55 | 3.23 | 0.001 | 3.66 | 1.49 | 3.18 | 0.001
DefaultCondition [OPTOUT] | 3.65 | 1.49 | 3.18 | 0.001 | 4.07 | 1.70 | 3.36 | 0.001 | 4.00 | 1.67 | 3.32 | 0.001 | 3.99 | 1.67 | 3.31 | 0.001 | 3.77 | 1.58 | 3.17 | 0.002 | 3.93 | 1.64 | 3.28 | 0.001
F | | | | | 1.29 | 0.28 | 1.16 | 0.245 | | | | | | | | | | | | | | | | 
A | | | | | 1.09 | 0.30 | 0.31 | 0.754 | 1.10 | 0.31 | 0.35 | 0.728 | 1.10 | 0.31 | 0.35 | 0.729 | 1.11 | 0.31 | 0.36 | 0.716 | 1.11 | 0.31 | 0.36 | 0.717
C | | | | | 1.14 | 0.13 | 1.15 | 0.250 | 1.24 | 0.15 | 1.73 | 0.084 | 1.24 | 0.15 | 1.72 | 0.085 | 1.23 | 0.15 | 1.73 | 0.084 | 1.25 | 0.15 | 1.84 | 0.066
E | | | | | 0.92 | 0.17 | -0.48 | 0.632 | 0.83 | 0.15 | -0.99 | 0.322 | 0.84 | 0.15 | -0.98 | 0.328 | 0.83 | 0.15 | -1.03 | 0.304 | 0.82 | 0.15 | -1.07 | 0.284
DefaultCondition [OPTNEUTRAL] × F | | | | | 0.71 | 0.21 | -1.18 | 0.239 | | | | | | | | | | | | | | | | 
DefaultCondition [OPTOUT] × F | | | | | 1.37 | 0.45 | 0.96 | 0.338 | | | | | | | | | | | | | | | | 
DefaultCondition [OPTNEUTRAL] × A | | | | | 0.96 | 0.38 | -0.11 | 0.910 | 0.95 | 0.37 | -0.13 | 0.900 | 0.95 | 0.37 | -0.12 | 0.903 | 0.96 | 0.38 | -0.10 | 0.920 | 0.96 | 0.38 | -0.11 | 0.909
DefaultCondition [OPTOUT] × A | | | | | 0.41 | 0.17 | -2.18 | 0.029 | 0.41 | 0.17 | -2.17 | 0.030 | 0.41 | 0.17 | -2.17 | 0.030 | 0.41 | 0.17 | -2.18 | 0.029 | 0.41 | 0.17 | -2.19 | 0.028
DefaultCondition [OPTNEUTRAL] × C | | | | | 0.90 | 0.14 | -0.68 | 0.495 | 0.83 | 0.14 | -1.13 | 0.260 | 0.83 | 0.14 | -1.10 | 0.270 | 0.87 | 0.14 | -0.88 | 0.378 | 0.83 | 0.14 | -1.12 | 0.262
DefaultCondition [OPTOUT] × C | | | | | 1.09 | 0.20 | 0.45 | 0.650 | 1.10 | 0.22 | 0.51 | 0.612 | 1.11 | 0.22 | 0.52 | 0.606 | 1.05 | 0.20 | 0.26 | 0.798 | 1.01 | 0.19 | 0.07 | 0.945
DefaultCondition [OPTNEUTRAL] × E | | | | | 1.29 | 0.34 | 0.94 | 0.346 | 1.43 | 0.38 | 1.34 | 0.180 | 1.42 | 0.38 | 1.32 | 0.185 | 1.36 | 0.36 | 1.14 | 0.254 | 1.42 | 0.38 | 1.31 | 0.190
DefaultCondition [OPTOUT] × E | | | | | 1.20 | 0.32 | 0.69 | 0.488 | 1.16 | 0.31 | 0.56 | 0.578 | 1.15 | 0.31 | 0.54 | 0.589 | 1.25 | 0.33 | 0.83 | 0.407 | 1.26 | 0.34 | 0.87 | 0.385
Predicted F | | | | | | | | | 0.67 | 0.24 | -1.11 | 0.267 | 0.67 | 0.25 | -1.10 | 0.272 | 0.56 | 0.23 | -1.42 | 0.155 | 0.54 | 0.21 | -1.60 | 0.109
DefaultCondition [OPTNEUTRAL] × Predicted F | | | | | | | | | 1.37 | 0.70 | 0.61 | 0.543 | 1.33 | 0.69 | 0.54 | 0.587 | 0.94 | 0.53 | -0.11 | 0.912 | 1.42 | 0.75 | 0.66 | 0.507
DefaultCondition [OPTOUT] × Predicted F | | | | | | | | | 1.29 | 0.70 | 0.48 | 0.633 | 1.28 | 0.69 | 0.45 | 0.650 | 2.80 | 1.62 | 1.78 | 0.075 | 2.86 | 1.57 | 1.91 | 0.057
Observations | 1286 | | | | 1286 | | | | 1286 | | | | 1286 | | | | 1286 | | | | 1286 | | | 
AIC | 1653.994 | | | | 1652.593 | | | | 1658.206 | | | | 1658.171 | | | | 1653.750 | | | | 1655.325 | | | 
log-Likelihood | -793.997 | | | | -781.296 | | | | -784.103 | | | | -784.085 | | | | -781.875 | | | | -782.662 | | | 
As in the original paper, Type I ANOVA is used. We first attribute variance to condition, then FACE, then the Panel main effect, followed by the condition × FACE interaction, and the condition × Panel interaction last.
As in the original paper, the eta-squared of a variable in the linear regression models (LessMore and Sunk Cost) is calculated as the sum of squares explained by that variable divided by the total sum of squares. The eta-squared of a variable in the logistic regression models (Default and Unusual Disease) is calculated as the deviance attributable to that variable divided by the total (null) deviance.
I validated this pipeline by reproducing the stats reported in the SI using the entire Study 1 dataset.
Would we like to see the full ANOVA stats?
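For the linear paradigms, a minimal sketch of the eta-squared calculation under the same Type I anova() convention (an illustrative helper; the deviance analogue actually used for the logistic paradigms follows below, and a commented draft, create_Anova_df(), appears at the end of this section):
calculate_etasq.LessMore <- function(model) {
anova_res <- anova(model) # Type I: order of terms in the formula matters
ss_total <- sum(anova_res[["Sum Sq"]])
c( # share of total sum of squares attributable to each term of interest
Condition = anova_res["LessMoreCondition", "Sum Sq"] / ss_total,
Panel = anova_res["Panel", "Sum Sq"] / ss_total,
Interaction = anova_res["LessMoreCondition:Panel", "Sum Sq"] / ss_total
)
}
# e.g., sapply(LessMore_model, calculate_etasq.LessMore)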
calculate_deviance_explained.Default <- function(model) {
anova_res <- anova(model)
# Extract deviance for Panel and DefaultCondition:Panel interaction
dev_cond <- anova_res["DefaultCondition", "Deviance"]
dev_panel <- anova_res["Panel", "Deviance"]
dev_interaction <- anova_res["DefaultCondition:Panel", "Deviance"]
p_cond <- anova_res["DefaultCondition", "Pr(>Chi)"]
p_panel <- anova_res["Panel", "Pr(>Chi)"]
p_interaction <- anova_res["DefaultCondition:Panel", "Pr(>Chi)"]
sig_cond <- ifelse(p_cond < 0.001, "***",
ifelse(p_cond < 0.01, "**",
ifelse(p_cond < 0.05, "*",
ifelse(p_cond < 0.1, ".", ""))))
sig_panel <- ifelse(p_panel < 0.001, "***",
ifelse(p_panel < 0.01, "**",
ifelse(p_panel < 0.05, "*",
ifelse(p_panel < 0.1, ".", ""))))
sig_interaction <- ifelse(p_interaction < 0.001, "***",
ifelse(p_interaction < 0.01, "**",
ifelse(p_interaction < 0.05, "*",
ifelse(p_interaction < 0.1, ".", ""))))
# Calculate total deviance
dev_total <- sum(anova_res[1, "Resid. Dev"])
# Calculate percentage of deviance explained
perc_cond <- (dev_cond / dev_total)
perc_panel <- (dev_panel / dev_total)
perc_interaction <- (dev_interaction / dev_total)
res_cond <- paste0(round(perc_cond , 3), sig_cond)
res_panel <- paste0(round(perc_panel, 3), sig_panel)
res_interaction <- paste0(round(perc_interaction, 3), sig_interaction)
return(c(res_cond, res_panel, res_interaction))
}
# Apply the function to each model
deviance_explained <- sapply(Default_model, calculate_deviance_explained.Default)
# Convert to a data frame for easy viewing
deviance_explained_df.Default <- as.data.frame(t(deviance_explained))
colnames(deviance_explained_df.Default) <- c("Condition","Panel", "Default Condition:Panel")
# Display the results
kable(deviance_explained_df.Default,caption = "Etasq Table: Default Paradigm")
Model | Condition | Panel | Default Condition:Panel
---|---|---|---
Default.PAN | 0.036*** | 0.011. | 0.015 |
Default.PAN_FACE | 0.036*** | 0.01. | 0.014 |
Default.PAN_Pred_F.reg_ACE | 0.036*** | 0.011. | 0.014 |
Default.PAN_Pred_F.net_ACE | 0.036*** | 0.011* | 0.014 |
Default.PAN_Pred_F.rf_ACE | 0.036*** | 0.012* | 0.015 |
Default.PAN_Pred_F.gbm_ACE | 0.036*** | 0.011* | 0.015 |
Disease.PAN <- glm(numDisease ~ DiseaseCondition * Panel, data = data.tst_M1a_full.reg, family = binomial)
Disease.PAN_FACE <- glm(numDisease ~ DiseaseCondition * F + DiseaseCondition * A+ DiseaseCondition * C+ DiseaseCondition * E+DiseaseCondition * Panel , data = data.tst_M1a_full.reg, family = binomial)
Disease.PAN_Pred_F.reg_ACE <- glm(numDisease ~ DiseaseCondition * Predicted_F + DiseaseCondition * A+ DiseaseCondition * C+ DiseaseCondition * E + DiseaseCondition * Panel, data = data.tst_M1a_full.reg, family = binomial)
Disease.PAN_Pred_F.net_ACE <- glm(numDisease ~ DiseaseCondition * Predicted_F + DiseaseCondition * A+ DiseaseCondition * C+ DiseaseCondition * E + DiseaseCondition * Panel, data = data.tst_M1a_full.net, family = binomial)
Disease.PAN_Pred_F.rf_ACE <- glm(numDisease ~ DiseaseCondition * Predicted_F + DiseaseCondition * A+ DiseaseCondition * C+ DiseaseCondition * E + DiseaseCondition * Panel, data = data.tst_M1a_full.rf, family = binomial)
Disease.PAN_Pred_F.gbm_ACE <- glm(numDisease ~ DiseaseCondition * Predicted_F + DiseaseCondition * A+ DiseaseCondition * C+ DiseaseCondition * E + DiseaseCondition * Panel , data = data.tst_M1a_full.gbm, family = binomial)
Disease_model <- list(
"Disease.PAN" = Disease.PAN,
"Disease.PAN_FACE" = Disease.PAN_FACE,
"Disease.PAN_Pred_F.reg_ACE" = Disease.PAN_Pred_F.reg_ACE,
"Disease.PAN_Pred_F.net_ACE" = Disease.PAN_Pred_F.net_ACE,
"Disease.PAN_Pred_F.rf_ACE" = Disease.PAN_Pred_F.rf_ACE,
"Disease.PAN_Pred_F.gbm_ACE" = Disease.PAN_Pred_F.gbm_ACE
)
# extract_aic(Disease_model)
term_names <- names(coef(Disease.PAN))
panel_terms <- term_names[grepl("Panel", term_names)]
Disease_result<-tab_model(Disease.PAN, Disease.PAN_FACE, Disease.PAN_Pred_F.reg_ACE,Disease.PAN_Pred_F.net_ACE,Disease.PAN_Pred_F.rf_ACE,Disease.PAN_Pred_F.gbm_ACE,
show.ci = FALSE, show.stat = TRUE, show.se = TRUE, show.re.var = TRUE,
show.icc = FALSE, show.r2=0,show.aic = 1,
show.loglik = 1,
show.fstat = 0,
rm.terms = panel_terms,
dv.labels = c("PAN",
"PAN_FACE",
"PAN_Pred.F(reg)_ACE",
"PAN_Pred.F(net)_ACE",
"PAN_Pred.F(rf)_ACE",
"PAN_Pred.F(gbm)_ACE"), title = "Unusual Disease Paradigm; ✔ Panel and interaction terms are included in the model but omitted for readability")
Disease_result
Models, left to right: PAN, PAN_FACE, PAN_Pred.F(reg)_ACE, PAN_Pred.F(net)_ACE, PAN_Pred.F(rf)_ACE, PAN_Pred.F(gbm)_ACE. Columns per model: Odds Ratios (OR), std. Error (SE), Statistic (Stat), p.

Predictors | OR | SE | Stat | p | OR | SE | Stat | p | OR | SE | Stat | p | OR | SE | Stat | p | OR | SE | Stat | p | OR | SE | Stat | p
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
(Intercept) | 0.46 | 0.12 | -3.08 | 0.002 | 0.44 | 0.11 | -3.19 | 0.001 | 0.44 | 0.11 | -3.19 | 0.001 | 0.44 | 0.11 | -3.18 | 0.001 | 0.44 | 0.11 | -3.15 | 0.002 | 0.44 | 0.11 | -3.16 | 0.002
DiseaseCondition [LOSS] | 2.77 | 0.89 | 3.15 | 0.002 | 2.83 | 0.93 | 3.16 | 0.002 | 2.88 | 0.95 | 3.21 | 0.001 | 2.87 | 0.95 | 3.20 | 0.001 | 2.85 | 0.94 | 3.17 | 0.002 | 2.85 | 0.94 | 3.18 | 0.001
F | | | | | 1.35 | 0.24 | 1.70 | 0.089 | | | | | | | | | | | | | | | | 
A | | | | | 1.85 | 0.43 | 2.63 | 0.009 | 1.84 | 0.43 | 2.61 | 0.009 | 1.85 | 0.43 | 2.62 | 0.009 | 1.86 | 0.43 | 2.65 | 0.008 | 1.86 | 0.43 | 2.65 | 0.008
C | | | | | 0.72 | 0.08 | -3.11 | 0.002 | 0.74 | 0.08 | -2.80 | 0.005 | 0.74 | 0.08 | -2.74 | 0.006 | 0.76 | 0.08 | -2.65 | 0.008 | 0.76 | 0.08 | -2.62 | 0.009
E | | | | | 1.31 | 0.20 | 1.73 | 0.083 | 1.25 | 0.19 | 1.47 | 0.141 | 1.25 | 0.19 | 1.44 | 0.150 | 1.23 | 0.19 | 1.30 | 0.193 | 1.23 | 0.19 | 1.32 | 0.185
DiseaseCondition [LOSS] × F | | | | | 0.74 | 0.18 | -1.24 | 0.215 | | | | | | | | | | | | | | | | 
DiseaseCondition [LOSS] × A | | | | | 0.76 | 0.24 | -0.85 | 0.397 | 0.76 | 0.24 | -0.84 | 0.400 | 0.76 | 0.24 | -0.85 | 0.397 | 0.76 | 0.24 | -0.84 | 0.399 | 0.77 | 0.24 | -0.83 | 0.406
DiseaseCondition [LOSS] × C | | | | | 1.17 | 0.16 | 1.09 | 0.276 | 1.18 | 0.17 | 1.13 | 0.259 | 1.17 | 0.17 | 1.05 | 0.292 | 1.12 | 0.16 | 0.81 | 0.416 | 1.14 | 0.16 | 0.92 | 0.358
DiseaseCondition [LOSS] × E | | | | | 0.62 | 0.13 | -2.26 | 0.024 | 0.63 | 0.13 | -2.20 | 0.028 | 0.64 | 0.13 | -2.16 | 0.031 | 0.66 | 0.14 | -2.01 | 0.044 | 0.64 | 0.14 | -2.09 | 0.037
Predicted F | | | | | | | | | 1.21 | 0.37 | 0.63 | 0.532 | 1.15 | 0.36 | 0.44 | 0.660 | 0.95 | 0.31 | -0.15 | 0.883 | 0.98 | 0.31 | -0.07 | 0.948
DiseaseCondition [LOSS] × Predicted F | | | | | | | | | 0.62 | 0.26 | -1.12 | 0.262 | 0.68 | 0.29 | -0.92 | 0.359 | 0.87 | 0.39 | -0.30 | 0.767 | 0.77 | 0.33 | -0.62 | 0.538
Observations | 1286 | | | | 1286 | | | | 1286 | | | | 1286 | | | | 1286 | | | | 1286 | | | 
AIC | 1687.269 | | | | 1672.317 | | | | 1673.877 | | | | 1674.262 | | | | 1674.854 | | | | 1674.226 | | | 
log-Likelihood | -821.634 | | | | -806.158 | | | | -806.938 | | | | -807.131 | | | | -807.427 | | | | -807.113 | | | 
calculate_deviance_explained.Disease <- function(model) {
anova_res <- anova(model)
# Extract deviance for DiseaseCondition, Panel, and DiseaseCondition:Panel interaction
dev_cond <- anova_res["DiseaseCondition", "Deviance"]
dev_panel <- anova_res["Panel", "Deviance"]
dev_interaction <- anova_res["DiseaseCondition:Panel", "Deviance"]
# Extract p-values
p_cond <- anova_res["DiseaseCondition", "Pr(>Chi)"]
p_panel <- anova_res["Panel", "Pr(>Chi)"]
p_interaction <- anova_res["DiseaseCondition:Panel", "Pr(>Chi)"]
# Determine significance levels
sig_cond <- ifelse(p_cond < 0.001, "***",
ifelse(p_cond < 0.01, "**",
ifelse(p_cond < 0.05, "*",
ifelse(p_cond < 0.1, ".", ""))))
sig_panel <- ifelse(p_panel < 0.001, "***",
ifelse(p_panel < 0.01, "**",
ifelse(p_panel < 0.05, "*",
ifelse(p_panel < 0.1, ".", ""))))
sig_interaction <- ifelse(p_interaction < 0.001, "***",
ifelse(p_interaction < 0.01, "**",
ifelse(p_interaction < 0.05, "*",
ifelse(p_interaction < 0.1, ".", ""))))
# Calculate total deviance
dev_total <- sum(anova_res[1, "Resid. Dev"])
# Calculate percentage of deviance explained
perc_cond <- (dev_cond / dev_total)
perc_panel <- (dev_panel / dev_total)
perc_interaction <- (dev_interaction / dev_total)
# Combine percentage and significance
res_cond <- paste0(round(perc_cond, 3), sig_cond)
res_panel <- paste0(round(perc_panel, 3), sig_panel)
res_interaction <- paste0(round(perc_interaction, 3), sig_interaction)
return(c(res_cond, res_panel, res_interaction))
}
# Apply the function to each model
deviance_explained <- sapply(Disease_model, calculate_deviance_explained.Disease)
# Convert to a data frame for easy viewing
deviance_explained_df.Disease <- as.data.frame(t(deviance_explained))
colnames(deviance_explained_df.Disease) <- c("Condition","Panel", "DiseaseCondition:Panel")
# Display the results
kable(deviance_explained_df.Disease,caption = "Etasq Table: Disease Paradigm")
Model | Condition | Panel | DiseaseCondition:Panel
---|---|---|---
Disease.PAN | 0.045*** | 0.015** | 0.013* |
Disease.PAN_FACE | 0.045*** | 0.007 | 0.008 |
Disease.PAN_Pred_F.reg_ACE | 0.045*** | 0.007 | 0.008 |
Disease.PAN_Pred_F.net_ACE | 0.045*** | 0.007 | 0.008 |
Disease.PAN_Pred_F.rf_ACE | 0.045*** | 0.007 | 0.007 |
Disease.PAN_Pred_F.gbm_ACE | 0.045*** | 0.008 | 0.007 |
LessMore.PAN <- lm(numLessMore ~ LessMoreCondition * Panel, data = data.tst_M1a_full.reg)
LessMore.PAN_FACE <- lm(numLessMore ~ LessMoreCondition * F + LessMoreCondition * A+ LessMoreCondition * C+ LessMoreCondition * E + LessMoreCondition * Panel, data = data.tst_M1a_full.reg)
LessMore.PAN_Pred_F.reg_ACE <- lm(numLessMore ~ LessMoreCondition * Predicted_F + LessMoreCondition * A+ LessMoreCondition * C+ LessMoreCondition * E + LessMoreCondition * Panel, data = data.tst_M1a_full.reg)
LessMore.PAN_Pred_F.net_ACE <- lm(numLessMore ~ LessMoreCondition * Predicted_F + LessMoreCondition * A+ LessMoreCondition * C+ LessMoreCondition * E + LessMoreCondition * Panel , data = data.tst_M1a_full.net)
LessMore.PAN_Pred_F.rf_ACE <- lm(numLessMore ~ LessMoreCondition * Predicted_F + LessMoreCondition * A+ LessMoreCondition * C+ LessMoreCondition * E + LessMoreCondition * Panel, data = data.tst_M1a_full.rf)
LessMore.PAN_Pred_F.gbm_ACE <- lm(numLessMore ~ LessMoreCondition * Predicted_F + LessMoreCondition * A+ LessMoreCondition * C+ LessMoreCondition * E + LessMoreCondition * Panel , data = data.tst_M1a_full.gbm)
extract_adjusted_r_squared <- function(model_list) {
model_names <- names(model_list)
adj_r_squared_values <- sapply(model_list, function(model) summary(model)$adj.r.squared) # Extract Adjusted R-squared
# Create a data frame
adj_r_squared_table <- data.frame(
Model = model_names,
Adjusted_R_Squared = round(adj_r_squared_values, 4),
stringsAsFactors = FALSE
)
return(adj_r_squared_table)
}
LessMore_model <- list(
"LessMore.PAN" = LessMore.PAN,
"LessMore.PAN_FACE" = LessMore.PAN_FACE,
"LessMore.PAN_Pred_F.reg_ACE" = LessMore.PAN_Pred_F.reg_ACE,
"LessMore.PAN_Pred_F.net_ACE" = LessMore.PAN_Pred_F.net_ACE,
"LessMore.PAN_Pred_F.rf_ACE" = LessMore.PAN_Pred_F.rf_ACE,
"LessMore.PAN_Pred_F.gbm_ACE" = LessMore.PAN_Pred_F.gbm_ACE
)
term_names <- names(coef(LessMore.PAN))
panel_terms <- term_names[grepl("Panel", term_names)]
LessMore_result<-tab_model(LessMore.PAN, LessMore.PAN_FACE, LessMore.PAN_Pred_F.reg_ACE,LessMore.PAN_Pred_F.net_ACE,LessMore.PAN_Pred_F.rf_ACE,LessMore.PAN_Pred_F.gbm_ACE,
show.ci = FALSE, show.stat = TRUE, show.se = TRUE, show.re.var = TRUE,
show.icc = FALSE, show.r2 = TRUE,
show.fstat = FALSE,
rm.terms = panel_terms,
dv.labels = c("PAN",
"PAN_FACE",
"PAN_Pred.F(reg)_ACE",
"PAN_Pred.F(net)_ACE",
"PAN_Pred.F(rf)_ACE",
"PAN_Pred.F(gbm)_ACE"), title = "LessMore Paradigm; ✔ Panel and interaction terms are included in the model but omitted for readability")
LessMore_result
Cell format: estimate (SE), t, p.

| Predictors | PAN | PAN_FACE | PAN_Pred.F(reg)_ACE | PAN_Pred.F(net)_ACE | PAN_Pred.F(rf)_ACE | PAN_Pred.F(gbm)_ACE |
|---|---|---|---|---|---|---|
| (Intercept) | 5.36 (0.14), 37.84, <0.001 | 5.37 (0.14), 38.49, <0.001 | 5.36 (0.14), 38.48, <0.001 | 5.36 (0.14), 38.47, <0.001 | 5.37 (0.14), 38.33, <0.001 | 5.37 (0.14), 38.53, <0.001 |
| LessMoreCondition [SCARF] | 1.16 (0.20), 5.79, <0.001 | 1.17 (0.20), 5.97, <0.001 | 1.16 (0.20), 5.90, <0.001 | 1.16 (0.20), 5.91, <0.001 | 1.15 (0.20), 5.82, <0.001 | 1.16 (0.20), 5.92, <0.001 |
| F | | 0.10 (0.11), 0.89, 0.372 | | | | |
| A | | 0.36 (0.13), 2.68, 0.007 | 0.36 (0.13), 2.72, 0.007 | 0.36 (0.13), 2.72, 0.007 | 0.36 (0.13), 2.72, 0.007 | 0.36 (0.13), 2.73, 0.007 |
| C | | -0.02 (0.06), -0.33, 0.738 | -0.02 (0.06), -0.40, 0.687 | -0.03 (0.06), -0.44, 0.661 | -0.01 (0.06), -0.14, 0.885 | -0.00 (0.06), -0.04, 0.965 |
| E | | 0.06 (0.09), 0.69, 0.490 | 0.06 (0.09), 0.67, 0.500 | 0.06 (0.09), 0.70, 0.483 | 0.04 (0.09), 0.46, 0.646 | 0.03 (0.09), 0.36, 0.717 |
| LessMoreCondition [SCARF] × F | | -0.05 (0.15), -0.33, 0.741 | | | | |
| LessMoreCondition [SCARF] × A | | 0.37 (0.19), 1.96, 0.050 | 0.34 (0.19), 1.83, 0.067 | 0.34 (0.19), 1.83, 0.067 | 0.35 (0.19), 1.86, 0.064 | 0.34 (0.19), 1.81, 0.070 |
| LessMoreCondition [SCARF] × C | | 0.03 (0.08), 0.38, 0.707 | -0.03 (0.08), -0.39, 0.696 | -0.03 (0.08), -0.39, 0.698 | -0.01 (0.08), -0.08, 0.939 | -0.02 (0.08), -0.25, 0.799 |
| LessMoreCondition [SCARF] × E | | -0.27 (0.13), -2.15, 0.032 | -0.24 (0.12), -1.93, 0.053 | -0.24 (0.12), -1.95, 0.051 | -0.23 (0.13), -1.83, 0.068 | -0.22 (0.13), -1.76, 0.079 |
| Predicted F | | | 0.12 (0.17), 0.71, 0.477 | 0.15 (0.18), 0.84, 0.401 | -0.04 (0.18), -0.19, 0.846 | -0.09 (0.17), -0.53, 0.594 |
| LessMoreCondition [SCARF] × Predicted F | | | 0.39 (0.25), 1.56, 0.119 | 0.39 (0.25), 1.53, 0.126 | 0.41 (0.27), 1.54, 0.124 | 0.46 (0.25), 1.81, 0.071 |
| Observations | 1286 | 1286 | 1286 | 1286 | 1286 | 1286 |
| R2 / R2 adjusted | 0.204 / 0.191 | 0.245 / 0.228 | 0.250 / 0.233 | 0.250 / 0.233 | 0.247 / 0.230 | 0.247 / 0.230 |
# create_Anova_df <- function(model, model_name) {
# # Type I (sequential) ANOVA via anova()
# anova_df <- as.data.frame(anova(model))
# anova_df$Variable <- rownames(anova_df)
# ss_total <- sum(anova_df$`Sum Sq`)
# # eta squared for Panel and LessMoreCondition:Panel
# eta_square_cond <- anova_df[anova_df$Variable == "LessMoreCondition", "Sum Sq"] / ss_total
# eta_square_panel <- anova_df[anova_df$Variable == "Panel", "Sum Sq"] / ss_total
# eta_square_interaction <- anova_df[anova_df$Variable == "LessMoreCondition:Panel", "Sum Sq"] / ss_total
# # Add eta squared to the data frame
# anova_df$Eta_Squared <- NA
# anova_df$Eta_Squared[anova_df$Variable == "LessMoreCondition"] <- eta_square_cond
# anova_df$Eta_Squared[anova_df$Variable == "Panel"] <- eta_square_panel
# anova_df$Eta_Squared[anova_df$Variable == "LessMoreCondition:Panel"] <- eta_square_interaction
#
# # label each model
# colnames(anova_df)[-which(colnames(anova_df) == "Variable")] <- paste0(colnames(anova_df)[-which(colnames(anova_df) == "Variable")], "_", model_name)
# return(anova_df)
# }
#
# # Apply the function to each model
# Anova_LessMore.PAN_df <- create_Anova_df(LessMore.PAN, "PAN")
# Anova_LessMore.PAN_FACE_df <- create_Anova_df(LessMore.PAN_FACE, "PAN_FACE")
# Anova_LessMore.PAN_Pred_F.reg_ACE_df <- create_Anova_df(LessMore.PAN_Pred_F.reg_ACE, "Pred_F(reg)_ACE")
# Anova_LessMore.PAN_Pred_F.net_ACE_df <- create_Anova_df(LessMore.PAN_Pred_F.net_ACE, "Pred_F(net)_ACE")
# Anova_LessMore.PAN_Pred_F.rf_ACE_df <- create_Anova_df(LessMore.PAN_Pred_F.rf_ACE, "Pred_F(rf)_ACE")
# Anova_LessMore.PAN_Pred_F.gbm_ACE_df <- create_Anova_df(LessMore.PAN_Pred_F.gbm_ACE, "Pred_F(gbm)_ACE")
#
# anova_combined <- reduce(list(Anova_LessMore.PAN_df, Anova_LessMore.PAN_FACE_df, Anova_LessMore.PAN_Pred_F.reg_ACE_df,
# Anova_LessMore.PAN_Pred_F.net_ACE_df, Anova_LessMore.PAN_Pred_F.rf_ACE_df,
# Anova_LessMore.PAN_Pred_F.gbm_ACE_df),
# function(x, y) full_join(x, y, by = "Variable"))
#
#
# anova_combined <- anova_combined %>%
# select(Variable, everything())
#
# anova_combined <- anova_combined %>%
# mutate_if(is.numeric, ~round(., 3))
#
# # Move residuals to the last row
# anova_combined <- anova_combined %>%
# filter(Variable != "Residuals") %>%
# bind_rows(anova_combined %>% filter(Variable == "Residuals"))
#
# kable(anova_combined, caption = "ANOVA Results for Less More Models")
calculate_variance_explained_LessMore <- function(model) {
anova_res <- anova(model)
# Extract sum of squares for LessMoreCondition, Panel, and LessMoreCondition:Panel interaction
ss_condition <- anova_res["LessMoreCondition", "Sum Sq"]
ss_panel <- anova_res["Panel", "Sum Sq"]
ss_interaction <- anova_res["LessMoreCondition:Panel", "Sum Sq"]
# Extract p-values
p_condition <- anova_res["LessMoreCondition", "Pr(>F)"]
p_panel <- anova_res["Panel", "Pr(>F)"]
p_interaction <- anova_res["LessMoreCondition:Panel", "Pr(>F)"]
# Determine significance levels
sig_condition <- ifelse(p_condition < 0.001, "***",
ifelse(p_condition < 0.01, "**",
ifelse(p_condition < 0.05, "*",
ifelse(p_condition < 0.1, ".", ""))))
sig_panel <- ifelse(p_panel < 0.001, "***",
ifelse(p_panel < 0.01, "**",
ifelse(p_panel < 0.05, "*",
ifelse(p_panel < 0.1, ".", ""))))
sig_interaction <- ifelse(p_interaction < 0.001, "***",
ifelse(p_interaction < 0.01, "**",
ifelse(p_interaction < 0.05, "*",
ifelse(p_interaction < 0.1, ".", ""))))
# Calculate total sum of squares
ss_total <- sum(anova_res[, "Sum Sq"])
# Proportion of variance explained by each term
perc_condition <- ss_condition / ss_total
perc_panel <- ss_panel / ss_total
perc_interaction <- ss_interaction / ss_total
# Combine percentage and significance
res_condition <- paste0(round(perc_condition, 3), sig_condition)
res_panel <- paste0(round(perc_panel, 3), sig_panel)
res_interaction <- paste0(round(perc_interaction, 3), sig_interaction)
return(c(res_condition, res_panel, res_interaction))
}
variance_explained_lm <- sapply(LessMore_model, calculate_variance_explained_LessMore)
# Convert to a data frame for easy viewing
variance_explained_lm_df.LessMore <- as.data.frame(t(variance_explained_lm))
colnames(variance_explained_lm_df.LessMore) <- c("Condition","Panel", "LessMoreCondition:Panel")
# Display the results
kable(variance_explained_lm_df.LessMore,caption = "Etasq Table: LessMore Paradigm")
| Model | Condition | Panel | LessMoreCondition:Panel |
|---|---|---|---|
| LessMore.PAN | 0.127*** | 0.069*** | 0.008 |
| LessMore.PAN_FACE | 0.127*** | 0.026*** | 0.005 |
| LessMore.PAN_Pred_F.reg_ACE | 0.127*** | 0.023*** | 0.006 |
| LessMore.PAN_Pred_F.net_ACE | 0.127*** | 0.023*** | 0.006 |
| LessMore.PAN_Pred_F.rf_ACE | 0.127*** | 0.024*** | 0.006 |
| LessMore.PAN_Pred_F.gbm_ACE | 0.127*** | 0.024*** | 0.006 |
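Note: anova() on an lm returns Type I (sequential) sums of squares, so the proportions above depend on the order of terms in the formula. An optional cross-check for the baseline model, sketched here assuming the effectsize package is installed (not part of the pipeline):

# library(effectsize)
# eta_squared(aov(numLessMore ~ LessMoreCondition * Panel, data = data.tst_M1a_full.reg),
#             partial = FALSE) # sequential eta squared, should match the manual SS ratios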
Sunk.PAN <- lm(numSunkCost ~ SunkCondition * Panel, data = data.tst_M1a_full)
Sunk.PAN_FACE <- lm(numSunkCost ~ SunkCondition * F + SunkCondition * A + SunkCondition * C + SunkCondition * E + SunkCondition * Panel, data = data.tst_M1a_full)
Sunk.PAN_Pred_F.reg_ACE <- lm(numSunkCost ~ SunkCondition * Predicted_F + SunkCondition * A + SunkCondition * C + SunkCondition * E + SunkCondition * Panel, data = data.tst_M1a_full.reg)
Sunk.PAN_Pred_F.net_ACE <- lm(numSunkCost ~ SunkCondition * Predicted_F + SunkCondition * A + SunkCondition * C + SunkCondition * E + SunkCondition * Panel, data = data.tst_M1a_full.net)
Sunk.PAN_Pred_F.rf_ACE <- lm(numSunkCost ~ SunkCondition * Predicted_F + SunkCondition * A + SunkCondition * C + SunkCondition * E + SunkCondition * Panel, data = data.tst_M1a_full.rf)
Sunk.PAN_Pred_F.gbm_ACE <- lm(numSunkCost ~ SunkCondition * Predicted_F + SunkCondition * A + SunkCondition * C + SunkCondition * E + SunkCondition * Panel, data = data.tst_M1a_full.gbm)
# extract_adjusted_r_squared() is defined in the LessMore section above and reused here.
Sunk_model <- list(
"Sunk.PAN" = Sunk.PAN,
"Sunk.PAN_FACE" = Sunk.PAN_FACE,
"Sunk.PAN_Pred_F.reg_ACE" = Sunk.PAN_Pred_F.reg_ACE,
"Sunk.PAN_Pred_F.net_ACE" = Sunk.PAN_Pred_F.net_ACE,
"Sunk.PAN_Pred_F.rf_ACE" = Sunk.PAN_Pred_F.rf_ACE, #0.302 with unscaled
"Sunk.PAN_Pred_F.gbm_ACE" = Sunk.PAN_Pred_F.gbm_ACE
)
# extract_adjusted_r_squared(Sunk_model)
term_names <- names(coef(Sunk.PAN))
panel_terms <- term_names[grepl("Panel", term_names)]
Sunk_result<-tab_model(Sunk.PAN, Sunk.PAN_FACE, Sunk.PAN_Pred_F.reg_ACE,Sunk.PAN_Pred_F.net_ACE,Sunk.PAN_Pred_F.rf_ACE,Sunk.PAN_Pred_F.gbm_ACE,
show.ci = FALSE, show.stat = TRUE, show.se = TRUE, show.re.var = TRUE,
show.icc = FALSE, show.r2 = TRUE,
show.fstat = FALSE,
rm.terms = panel_terms,
dv.labels = c("PAN",
"PAN_FACE",
"PAN_Pred_F(reg)_ACE",
"PAN_Pred_F(net)_ACE",
"PAN_Pred_F(rf)_ACE",
"PAN_Pred_F(gbm)_ACE"), title = "Sunk Cost Paradigm; ✔ Panel and interaction terms are included in the model but omitted for readability")
Sunk_result
Cell format: estimate (SE), t, p.

| Predictors | PAN | PAN_FACE | PAN_Pred_F(reg)_ACE | PAN_Pred_F(net)_ACE | PAN_Pred_F(rf)_ACE | PAN_Pred_F(gbm)_ACE |
|---|---|---|---|---|---|---|
| (Intercept) | 5.97 (0.29), 20.66, <0.001 | 6.02 (0.29), 20.96, <0.001 | 6.01 (0.29), 20.84, <0.001 | 6.01 (0.29), 20.84, <0.001 | 6.02 (0.29), 20.83, <0.001 | 6.01 (0.29), 20.86, <0.001 |
| SunkCondition [PAID] | -0.41 (0.42), -0.95, 0.340 | -0.38 (0.42), -0.91, 0.364 | -0.44 (0.43), -1.03, 0.301 | -0.44 (0.43), -1.04, 0.301 | -0.48 (0.43), -1.12, 0.263 | -0.43 (0.43), -1.00, 0.317 |
| F | | 0.59 (0.22), 2.68, 0.007 | | | | |
| A | | 0.45 (0.28), 1.62, 0.106 | 0.42 (0.28), 1.53, 0.126 | 0.42 (0.28), 1.53, 0.126 | 0.42 (0.28), 1.53, 0.127 | 0.42 (0.28), 1.54, 0.125 |
| C | | -0.00 (0.12), -0.02, 0.982 | 0.10 (0.13), 0.75, 0.453 | 0.09 (0.13), 0.72, 0.473 | 0.09 (0.12), 0.74, 0.457 | 0.10 (0.13), 0.77, 0.439 |
| E | | 0.02 (0.19), 0.08, 0.933 | -0.12 (0.19), -0.65, 0.516 | -0.12 (0.19), -0.64, 0.525 | -0.14 (0.19), -0.70, 0.484 | -0.14 (0.19), -0.71, 0.476 |
| SunkCondition [PAID] × F | | 0.04 (0.31), 0.14, 0.889 | | | | |
| SunkCondition [PAID] × A | | 0.00 (0.41), 0.01, 0.994 | 0.01 (0.41), 0.02, 0.987 | 0.01 (0.41), 0.02, 0.988 | -0.01 (0.41), -0.02, 0.988 | -0.02 (0.41), -0.05, 0.962 |
| SunkCondition [PAID] × C | | 0.18 (0.17), 1.03, 0.303 | 0.05 (0.18), 0.28, 0.782 | 0.06 (0.18), 0.31, 0.758 | 0.09 (0.18), 0.54, 0.590 | 0.06 (0.18), 0.31, 0.756 |
| SunkCondition [PAID] × E | | 0.30 (0.27), 1.11, 0.266 | 0.42 (0.27), 1.55, 0.122 | 0.41 (0.27), 1.53, 0.126 | 0.44 (0.27), 1.61, 0.107 | 0.45 (0.27), 1.65, 0.099 |
| Predicted F | | | -0.19 (0.39), -0.50, 0.615 | -0.17 (0.39), -0.43, 0.669 | -0.26 (0.41), -0.62, 0.535 | -0.26 (0.38), -0.67, 0.506 |
| SunkCondition [PAID] × Predicted F | | | 1.11 (0.54), 2.04, 0.041 | 1.09 (0.55), 1.99, 0.047 | 1.29 (0.58), 2.23, 0.026 | 1.31 (0.55), 2.39, 0.017 |
| Observations | 1286 | 1286 | 1286 | 1286 | 1286 | 1286 |
| R2 / R2 adjusted | 0.101 / 0.086 | 0.120 / 0.100 | 0.113 / 0.093 | 0.113 / 0.093 | 0.114 / 0.093 | 0.114 / 0.094 |
# create_Anova_df <- function(model, model_name) {
# # Type II ANOVA instead? (anova() below is Type I / sequential)
# anova_df <- as.data.frame(anova(model))
# anova_df$Variable <- rownames(anova_df)
# ss_total <- sum(anova_df$`Sum Sq`)
# # eta squared for Panel and SunkCondition:Panel
# eta_square_cond <- anova_df[anova_df$Variable == "SunkCondition", "Sum Sq"] / ss_total
# eta_square_panel <- anova_df[anova_df$Variable == "Panel", "Sum Sq"] / ss_total
# eta_square_interaction <- anova_df[anova_df$Variable == "SunkCondition:Panel", "Sum Sq"] / ss_total
# # Add eta squared to the data frame
# anova_df$Eta_Squared <- NA
# anova_df$Eta_Squared[anova_df$Variable == "SunkCondition"] <- eta_square_cond
# anova_df$Eta_Squared[anova_df$Variable == "Panel"] <- eta_square_panel
# anova_df$Eta_Squared[anova_df$Variable == "SunkCondition:Panel"] <- eta_square_interaction
#
# # lavel each model
# colnames(anova_df)[-which(colnames(anova_df) == "Variable")] <- paste0(colnames(anova_df)[-which(colnames(anova_df) == "Variable")], "_", model_name)
# return(anova_df)
# }
#
# # Apply the function to each model
# Anova_Sunk.PAN_df <- create_Anova_df(Sunk.PAN, "PAN")
# Anova_Sunk.PAN_FACE_df <- create_Anova_df(Sunk.PAN_FACE, "PAN_FACE")
# Anova_Sunk.PAN_Pred_F.reg_ACE_df <- create_Anova_df(Sunk.PAN_Pred_F.reg_ACE, "Pred_F(reg)_ACE")
# Anova_Sunk.PAN_Pred_F.net_ACE_df <- create_Anova_df(Sunk.PAN_Pred_F.net_ACE, "Pred_F(net)_ACE")
# Anova_Sunk.PAN_Pred_F.rf_ACE_df <- create_Anova_df(Sunk.PAN_Pred_F.rf_ACE, "Pred_F(rf)_ACE")
# Anova_Sunk.PAN_Pred_F.gbm_ACE_df <- create_Anova_df(Sunk.PAN_Pred_F.gbm_ACE, "Pred_F(gbm)_ACE")
#
# anova_combined <- reduce(list(Anova_Sunk.PAN_df, Anova_Sunk.PAN_FACE_df, Anova_Sunk.PAN_Pred_F.reg_ACE_df,
# Anova_Sunk.PAN_Pred_F.net_ACE_df, Anova_Sunk.PAN_Pred_F.rf_ACE_df,
# Anova_Sunk.PAN_Pred_F.gbm_ACE_df),
# function(x, y) full_join(x, y, by = "Variable"))
#
#
# anova_combined <- anova_combined %>%
# select(Variable, everything())
#
# anova_combined <- anova_combined %>%
# mutate_if(is.numeric, ~round(., 3))
#
# # Move residuals to the last row
# anova_combined <- anova_combined %>%
# filter(Variable != "Residuals") %>%
# bind_rows(anova_combined %>% filter(Variable == "Residuals"))
#
# kable(anova_combined, caption = "ANOVA Results for Sunk Cost Models")
calculate_variance_explained_Sunk <- function(model) {
anova_res <- anova(model)
# Extract sum of squares for SunkCondition, Panel, and SunkCondition:Panel interaction
ss_condition <- anova_res["SunkCondition", "Sum Sq"]
ss_panel <- anova_res["Panel", "Sum Sq"]
ss_interaction <- anova_res["SunkCondition:Panel", "Sum Sq"]
# Extract p-values
p_condition <- anova_res["SunkCondition", "Pr(>F)"]
p_panel <- anova_res["Panel", "Pr(>F)"]
p_interaction <- anova_res["SunkCondition:Panel", "Pr(>F)"]
# Determine significance levels
sig_condition <- ifelse(p_condition < 0.001, "***",
ifelse(p_condition < 0.01, "**",
ifelse(p_condition < 0.05, "*",
ifelse(p_condition < 0.1, ".", ""))))
sig_panel <- ifelse(p_panel < 0.001, "***",
ifelse(p_panel < 0.01, "**",
ifelse(p_panel < 0.05, "*",
ifelse(p_panel < 0.1, ".", ""))))
sig_interaction <- ifelse(p_interaction < 0.001, "***",
ifelse(p_interaction < 0.01, "**",
ifelse(p_interaction < 0.05, "*",
ifelse(p_interaction < 0.1, ".", ""))))
# Calculate total sum of squares
ss_total <- sum(anova_res[, "Sum Sq"])
# Proportion of variance explained by each term
perc_condition <- ss_condition / ss_total
perc_panel <- ss_panel / ss_total
perc_interaction <- ss_interaction / ss_total
# Combine percentage and significance
res_condition <- paste0(round(perc_condition, 3), sig_condition)
res_panel <- paste0(round(perc_panel, 3), sig_panel)
res_interaction <- paste0(round(perc_interaction, 3), sig_interaction)
return(c(res_condition, res_panel, res_interaction))
}
variance_explained_lm <- sapply(Sunk_model, calculate_variance_explained_Sunk)
# Convert to a data frame for easy viewing
variance_explained_lm_df.SunkCost <- as.data.frame(t(variance_explained_lm))
colnames(variance_explained_lm_df.SunkCost) <- c("Condition", "Panel", "SunkCondition:Panel")
# Display the results
kable(variance_explained_lm_df.SunkCost,caption = "Etasq Table: SunkCost Paradigm")
| Model | Condition | Panel | SunkCondition:Panel |
|---|---|---|---|
| Sunk.PAN | 0.007** | 0.086*** | 0.008 |
| Sunk.PAN_FACE | 0.007** | 0.042*** | 0.008 |
| Sunk.PAN_Pred_F.reg_ACE | 0.007** | 0.037*** | 0.01 |
| Sunk.PAN_Pred_F.net_ACE | 0.007** | 0.036*** | 0.01 |
| Sunk.PAN_Pred_F.rf_ACE | 0.007** | 0.039*** | 0.01 |
| Sunk.PAN_Pred_F.gbm_ACE | 0.007** | 0.04*** | 0.01 |
\(R^2\): \(R^2\) for linear regressions, and McFadden's pseudo-\(R^2\), \(1 - \frac{\log L(\text{model})}{\log L(\text{null model})}\), for logistic regressions (as in the paper).
Model order: consistently (1) PAN only; (2) PAN + FACE; (3) PAN + Pred_F(reg)ACE; (4) PAN + Pred_F(net)ACE; (5) PAN + Pred_F(rf)ACE; (6) PAN + Pred_F(gbm)ACE
Note that the y-axes for the different paradigms are not aligned. Would it be more useful to plot the percentage improvement relative to models that include only condition × panel as predictors? (A sketch of this appears after the \(R^2\) plot below.)
# Following Antonia's analysis: McFadden's pseudo-R2 = 1 - logLik(model)/logLik(null)
Default.null<-glm(numDefault~1,data.tst_M1a_full.reg,family=binomial)
R2.Default = as.data.frame(cbind( "R2" = c((1-logLik(Default.PAN)/logLik(Default.null)),
(1-logLik(Default.PAN_FACE)/logLik(Default.null)),
(1-logLik(Default.PAN_Pred_F.reg_ACE)/logLik(Default.null)),
(1-logLik(Default.PAN_Pred_F.net_ACE)/logLik(Default.null)),
(1-logLik(Default.PAN_Pred_F.rf_ACE)/logLik(Default.null)),
(1-logLik(Default.PAN_Pred_F.gbm_ACE)/logLik(Default.null))),
"Model" = c("Pan", "Pan+FACE", "Pan+Pred.F(reg)+ACE", "Pan+Pred.F(net)+ACE" , "Pan+Pred.F(rf)+ACE","Pan+Pred.F(gbm)+ACE"),
"Paradigm" = "Default"))
Disease.null<-glm(numDisease~1,data.tst_M1a_full.reg,family=binomial)
R2.Disease = as.data.frame(cbind( "R2" = c((1-logLik(Disease.PAN)/logLik(Disease.null)),
(1-logLik(Disease.PAN_FACE)/logLik(Disease.null)),
(1-logLik(Disease.PAN_Pred_F.reg_ACE)/logLik(Disease.null)),
(1-logLik(Disease.PAN_Pred_F.net_ACE)/logLik(Disease.null)),
(1-logLik(Disease.PAN_Pred_F.rf_ACE)/logLik(Disease.null)),
(1-logLik(Disease.PAN_Pred_F.gbm_ACE)/logLik(Disease.null))),
"Model" = c("Pan", "Pan+FACE", "Pan+Pred.F(reg)+ACE", "Pan+Pred.F(net)+ACE" , "Pan+Pred.F(rf)+ACE","Pan+Pred.F(gbm)+ACE"),
"Paradigm" = "Disease"))
R2.LessMore = as.data.frame(cbind( "R2" = c(summary(LessMore.PAN)$r.squared,
summary(LessMore.PAN_FACE)$r.squared,
summary(LessMore.PAN_Pred_F.reg_ACE)$r.squared,
summary(LessMore.PAN_Pred_F.net_ACE)$r.squared,
summary(LessMore.PAN_Pred_F.rf_ACE)$r.squared,
summary(LessMore.PAN_Pred_F.gbm_ACE)$r.squared),
"Model" = c("Pan", "Pan+FACE", "Pan+Pred.F(reg)+ACE", "Pan+Pred.F(net)+ACE", "Pan+Pred.F(rf)+ACE", "Pan+Pred.F(gbm)+ACE"),
"Paradigm" = "LessMore"))
R2.Sunk = as.data.frame(cbind( "R2" = c(summary(Sunk.PAN)$r.squared,
summary(Sunk.PAN_FACE)$r.squared,
summary(Sunk.PAN_Pred_F.reg_ACE)$r.squared,
summary(Sunk.PAN_Pred_F.net_ACE)$r.squared,
summary(Sunk.PAN_Pred_F.rf_ACE)$r.squared,
summary(Sunk.PAN_Pred_F.gbm_ACE)$r.squared),
"Model" = c("Pan", "Pan+FACE", "Pan+Pred.F(reg)+ACE", "Pan+Pred.F(net)+ACE", "Pan+Pred.F(rf)+ACE", "Pan+Pred.F(gbm)+ACE"),
"Paradigm" = "SunkCost"))
R2<-rbind(R2.Default,R2.Disease,R2.LessMore,R2.Sunk)%>%
mutate(R2=round(as.numeric(R2),3))
R2$Model <- factor(R2$Model,
levels = c("Pan",
"Pan+FACE",
"Pan+Pred.F(reg)+ACE",
"Pan+Pred.F(net)+ACE",
"Pan+Pred.F(rf)+ACE",
"Pan+Pred.F(gbm)+ACE"),
ordered = TRUE)
library(RColorBrewer)
ggplot(R2, aes(x = Model, y = R2, fill = Model)) +
geom_bar(stat = "identity") +
facet_wrap(.~Paradigm, ncol = 4, scales = "free") +
theme_bw() +
ylab(bquote( R^2)) +
theme(
legend.position = "none",
strip.background = element_rect(fill = "white"),
strip.text = element_text(size = 18),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, face = "bold")
) +
scale_fill_brewer(palette = "Set2")
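One way to address the y-axis comparability question raised above is to plot each model's improvement relative to the PAN-only baseline within each paradigm. A minimal sketch, assuming the R2 data frame built above (dplyr and ggplot2 already loaded):

R2.rel <- R2 %>%
  group_by(Paradigm) %>%
  mutate(Improvement = (R2 - R2[Model == "Pan"]) / R2[Model == "Pan"]) %>% # relative to PAN-only
  ungroup() %>%
  filter(Model != "Pan") # baseline itself is always 0
ggplot(R2.rel, aes(x = Model, y = Improvement, fill = Model)) +
  geom_col() +
  facet_wrap(. ~ Paradigm, ncol = 4) + # fixed scales, so paradigms share a y-axis
  scale_y_continuous(labels = scales::percent) +
  theme_bw() +
  ylab("Improvement over PAN-only") +
  theme(
    legend.position = "none",
    strip.background = element_rect(fill = "white"),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  ) +
  scale_fill_brewer(palette = "Set2")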
# label paradigm
deviance_explained_df.Default$Paradigm <- "Default"
deviance_explained_df.Disease$Paradigm <- "Disease"
variance_explained_lm_df.LessMore$Paradigm <- "LessMore"
variance_explained_lm_df.SunkCost$Paradigm <- "SunkCost"
Model <- c("Pan", "Pan+FACE", "Pan+Pred.F(reg)+ACE", "Pan+Pred.F(net)+ACE", "Pan+Pred.F(rf)+ACE", "Pan+Pred.F(gbm)+ACE")
deviance_explained_df.Default$Model<-Model
deviance_explained_df.Disease$Model<-Model
variance_explained_lm_df.LessMore$Model<-Model
variance_explained_lm_df.SunkCost$Model<-Model
colnames(deviance_explained_df.Default)[3] <- "Condition:Panel"
colnames(deviance_explained_df.Disease)[3] <- "Condition:Panel"
colnames(variance_explained_lm_df.LessMore)[3] <- "Condition:Panel"
colnames(variance_explained_lm_df.SunkCost)[3] <- "Condition:Panel"
etasq <- rbind(deviance_explained_df.Default,
deviance_explained_df.Disease,
variance_explained_lm_df.LessMore,
variance_explained_lm_df.SunkCost)
etasq$Model <- factor(etasq$Model,
levels = c("Pan",
"Pan+FACE",
"Pan+Pred.F(reg)+ACE",
"Pan+Pred.F(net)+ACE",
"Pan+Pred.F(rf)+ACE",
"Pan+Pred.F(gbm)+ACE"),
ordered = TRUE)
split_value_and_sig <- function(df) {
  # Strip the leading number so only the significance symbols remain
  # (sub(), not gsub("[0-9.]", ...), so the marginal "." marker is preserved)
  df$Significance <- sub("^[0-9]+(\\.[0-9]+)?", "", df$Panel)
  # Keep only the leading number as the eta-squared value
  df$Panel <- as.numeric(sub("^([0-9]+(\\.[0-9]+)?).*", "\\1", df$Panel))
  return(df)
}
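# Toy example (hypothetical values) showing that the marginal "." marker survives the split:
# split_value_and_sig(data.frame(Panel = c("0.045***", "0.069", "0.015.")))
# Panel -> 0.045, 0.069, 0.015; Significance -> "***", "", "."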
etasq.pan<-split_value_and_sig(etasq)
ggplot(etasq.pan, aes(x = Model, y = Panel, fill = Model)) +
geom_bar(stat = "identity") +
facet_wrap(.~Paradigm, ncol = 4, scales = "free") +
theme_bw() +
ylab("Etasq_panel") +
theme(
legend.position = "none",
strip.background = element_rect(fill = "white"),
strip.text = element_text(size = 18),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, face = "bold")
) +
scale_fill_brewer(palette = "Set2")
split_value_and_sig <- function(df) {
  # Same split as above, but applied to the Condition:Panel column; the numeric value
  # is written into Panel so the plotting code below can be reused unchanged
  df$Significance <- sub("^[0-9]+(\\.[0-9]+)?", "", df$`Condition:Panel`)
  df$Panel <- as.numeric(sub("^([0-9]+(\\.[0-9]+)?).*", "\\1", df$`Condition:Panel`))
  return(df)
}
etasq.pan_cond<-split_value_and_sig(etasq)
ggplot(etasq.pan_cond, aes(x = Model, y = Panel, fill = Model)) +
geom_bar(stat = "identity") +
facet_wrap(.~Paradigm, ncol = 4, scales = "free") +
theme_bw() +
ylab("Etasq_panel*cond") +
theme(
legend.position = "none",
strip.background = element_rect(fill = "white"),
strip.text = element_text(size = 18),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, face = "bold")
) +
scale_fill_brewer(palette = "Set2")