# Subset to get the complete cases DF
analysis_var <-c("SO_1" , "SO_2" , "SO_3" ,"SO_4" , "SO_5", "SO_6" , "SO_7" , "SO_8" , "SO_9" , "SO_10", "LOTR_1" , "LOTR_2" , "LOTR_3" , "LOTR_4" , "LOTR_5" , "LOTR_6", "years_cons_scaled" , "gender", "WH_scaled", "position_simple" ,"education_simple" , "SO_coun", "GP_1_a","GP_2_a","GP_3_a","GP_4_a","GP_5_a","GP_6_a","GP_7_a","GP_8_a","GP_9_a","GP_10_a") # Data used in the analysis
DF2_variables <- DF2[,c(analysis_var)]
DF2_complete.cases <- DF2_variables[complete.cases(DF2_variables),]
missing_years <- paste0(round(table(is.na(DF2$years_cons))[2]/nrow(DF2)*100,1), "%")
A total of 2323 completed the survey; there are complete observations for 1853 respondents (including missing observations for categorical data, which are included with the level “unknown”). The variable with the highest rate of missing data was ‘years in conservation’, where 13.9% of observations were missing.
Additionally, goal progress questions are answered in a step-wise fashion, meaning there are expected patterns of missingness for the GP_1_b to GP_10_b data. Years in conservation (“years_cons”), work hours (“WH”), and age (“age_year”) may not be missing at random. (Work country is likely to be missing at random, since it was asked later in the survey, and so is probably not answered because respondents did not complete the survey.) The consequences of this missingness will be evaluated later.
This missing data is substituted through multivariate imputation by chained equations (MICE) of ten datasets, using the package ‘mice’. Here, synthetic values are computed based on the variables in the data, using an appropriate model type. Ordered logistic regression was used to impute ordinal data, predictive mean matching is used for numeric variables, polytomous regression for un-ordered factors, and logistic regression for binary data. Additional data, not used in the analysis, was included to help impute the missing values (e.g. “health”).
############################# MI #############################
### Setting up the envionrment and loading data ###
### Subsetting the data used in the analysis ###
# Retain variables used in the analysis
analysis_vars <- c("GP_1_a","GP_2_a", "GP_3_a","GP_4_a", "GP_5_a","GP_6_a", "GP_7_a","GP_8_a", "GP_9_a","GP_10_a","K10_1","K10_2","K10_3","K10_4","K10_5","K10_6","K10_7","K10_8","K10_9","K10_10","LOTR_1","LOTR_3","LOTR_6","LOTR_2","LOTR_4" , "LOTR_5","SO_1", "SO_2", "SO_3", "SO_4","SO_5", "SO_6", "SO_8", "SO_7", "SO_9","SO_10","GP_total_scaled","ERI_n_scaled", "ERI_o_scaled","age_year_scaled","SO_11","health","years_cons_scaled", "gender","K10_total","SS1", "SS2", "SS3", "PS_1", "PS_2","PS_3", "national.nnational" , "WH_scaled" , "CV", "position_simple", "education_simple")
DF3 <- DF2[analysis_vars]
# Convert to ordinal
DF3[,c("K10_1","K10_2","K10_3","K10_4","K10_5","K10_6","K10_7","K10_8","K10_9","K10_10","LOTR_1","LOTR_3","LOTR_6","LOTR_2","LOTR_4" , "LOTR_5","SO_1", "SO_2", "SO_3", "SO_4","SO_5", "SO_6", "SO_8", "SO_7", "SO_9","SO_10","SO_11","health","PS_1","PS_2", "PS_3","SS1", "SS2","SS3")] <- lapply(DF3[,c("K10_1","K10_2","K10_3","K10_4","K10_5","K10_6","K10_7","K10_8","K10_9","K10_10","LOTR_1","LOTR_3","LOTR_6","LOTR_2","LOTR_4" , "LOTR_5","SO_1", "SO_2", "SO_3", "SO_4","SO_5", "SO_6", "SO_8", "SO_7", "SO_9","SO_10","SO_11","health","PS_1","PS_2", "PS_3","SS1", "SS2","SS3")], as.ordered)
# And and converted to numeric
DF3[,c("GP_total_scaled","ERI_n_scaled","ERI_o_scaled","age_year_scaled","years_cons_scaled","K10_total","WH_scaled")] <- lapply(DF3[,c("GP_total_scaled","ERI_n_scaled","ERI_o_scaled","age_year_scaled","years_cons_scaled","K10_total","WH_scaled")], as.numeric)
# Ensuring factors are coded as such
DF3[,c("education_simple", "gender", "position_simple")] <- lapply(DF3[,c("education_simple", "gender","position_simple")], as.factor)
# Drop "other" as a catagory in gender, as there were too few observations to model
DF3<-DF3[!(DF3$gender=="Other"),]
DF3<- droplevels(DF3)
### Conducting the MI ###
# Number of imputed df
N.Imp = 10
# Conduct multiple imputation using the mice package
# Here we have to specify the imputation method for each variable:
str_out <- data.frame(capture.output(str(DF3)))
# Delete the first and last row
str_out <- data.frame(str_output = matrix(str_out[2:(nrow(str_out)-1),]))
# Create a column that contain the appropreate model for each variable - this only works if the variable is of the correct data type in the first place
str_out$type <- ifelse(grepl("Ord.factor", str_out$str_output, fixed = TRUE)==T, "polr",
ifelse(grepl("num", str_out$str_output, fixed = TRUE)==T, "pmm",
ifelse(grepl("Factor", str_out$str_output, fixed = TRUE)==T, "polyreg",
ifelse(grepl("int", str_out$str_output, fixed = TRUE)==T, "logreg", "ERROR"))))
# Conduct the MI - with the number of datasets specified by N.Imp, and the estimation type specified by str_out$type (derived from the above)
DF3_imp <- mice(DF3, m = N.Imp, method = str_out$type )
# GP_b variables
GP_b <- c("GP_1_b","GP_2_b","GP_3_b","GP_4_b","GP_6_b","GP_7_b","GP_8_b" , "GP_5_b" ,"GP_9_b","GP_10_b")
# Drop "other" as a catagory in gender, as there were too few observations to model
DF2_sub<-DF2[!(DF2$gender=="Other"),]
DF2_sub<- droplevels(DF2_sub)
# Extract each DF
mice.imp <- list()
for(i in 1:N.Imp) {
mice.imp[[i]] <- mice::complete(DF3_imp, action= i, inc=FALSE)
# Add data
mice.imp[[i]]$Nation <- DF2_sub$Nation
mice.imp[[i]]$W_coun <- DF2_sub$W_coun
mice.imp[[i]]$SO_coun <- DF2_sub$SO_coun
mice.imp[[i]]$nation_region <- DF2_sub$nation_region
mice.imp[[i]]$W_region <- DF2_sub$W_region
mice.imp[[i]]$SO_region <- DF2_sub$SO_region
# Add Goal progress, and make sure they're all ordered
mice.imp[[i]] <- cbind(mice.imp[[i]], DF2_sub[,GP_b] )
# Add respondent ID
mice.imp[[i]]$ID <- DF2_sub$ID
# Turn factor into series of binary variables
mice.imp[[i]] <- dummy_cols(mice.imp[[i]], select_columns = c("gender", "education_simple", "position_simple", "nation_region", "W_region","SO_region"))
# remove spaces and other charecters on colnames
colnames( mice.imp[[i]]) <- gsub(" ", "", colnames( mice.imp[[i]]), fixed = TRUE)
colnames( mice.imp[[i]]) <- gsub("/", "", colnames( mice.imp[[i]]), fixed = TRUE)
colnames( mice.imp[[i]]) <- gsub("-", "_", colnames( mice.imp[[i]]), fixed = TRUE)
colnames( mice.imp[[i]]) <- gsub("&", "_", colnames( mice.imp[[i]]), fixed = TRUE)
}
# Save the DF
save(mice.imp, file = paste0(path, "mice.imp.Rdata"))
The following steps were taken in the development of the situational optimism latent variable. First, the first imputed dataset was randomly split into training (70%) and test (30%) data.
The polychoric correlation between the ten situational optimism items within the training data were examined, showing good correlation.
The Ordinal Alpha was then examined, showing good internal conssitency between items.
# Ordinal Alpha
round(alpha(poly_cor$rho)$total,2) # the Ordinal Alpha, using polychoric correlation mateix
Parallel analysis of the ten items within the training data, using polychoric correlation and WLS was performed, suggested the extraction of three factors.
## Parallel analysis suggests that the number of factors = 3 and the number of components = NA
The root mean square error of approximation (RMSEA) across exploratory factor analysis with increasing numbers of factors was examined. The three factor model had a very high RMSEA, whereas the five factor model (the number of factors expected to be extracted) had a good RMSEA.
Factors | RMSEA | lower | upper |
---|---|---|---|
1 | 0.167 | 0.160 | 0.174 |
2 | 0.120 | 0.112 | 0.129 |
3 | 0.089 | 0.079 | 0.099 |
4 | 0.056 | 0.044 | 0.070 |
5 | 0.033 | 0.013 | 0.054 |
The factor loadings for the five factor exploratory factor analysis was inspected. When repeating the analysis with differing random seeds, seven items were consistently loaded onto three factors, but three items were unstable, cross-loaded, or were the only item loading on a single factor. Consequently, these three items were removed.
Factor 1 | Factor 2 | Factor 3 | Factor 4 | Factor 5 | |
---|---|---|---|---|---|
Public support for conservation will grow over the next ten years (SO_1) | 0.62 | ||||
Government spending on conservation will grow over the next ten years (SO_2) | 0.80 | ||||
The harmful impact of people on nature will be less in ten years’ time than it is now (SO_3) | 0.66 | ||||
Human society will be more environmentally sustainable in ten years’ time than it is now (SO_4) | 0.89 | ||||
There will be more wildlife in ten years’ time than there is today (SO_5) | 0.92 | ||||
There will be more natural areas and habitats in ten years’ time than there are today (SO_6) | 0.72 | ||||
People will spend more recreational time in nature in ten years’ time than they do now (SO_7) | 1 | ||||
Nature will be able to provide the same benefits to people in ten years’ time as now (SO_8) | 0.47 | ||||
There will be more local participation in conservation in ten years’ time than now (SO_9) | 0.33 | ||||
Conservationists will have better tools and knowledge in ten years’ time than now (SO_10) | 0.83 |
A graded response model was used to explore the association between the assumed latent variable, situational optimism, and each item.
Furthermore, the analysis sought to extract a single latent variable, representing respondents’ optimism about the future of conservation. Consequently, correlated error terms were included for those items loaded on the same factor, using the test data.
# Using the same sub-set of data, but with ordered factors
test_DF2_ord <- mice.imp[[1]][-train_ind, ]
# The model
model_SO_simple <- '
SO =~ SO_1 + SO_2 + SO_3 +SO_4 + SO_5+ SO_6 + SO_8
# Correlated error terms
SO_1 ~~ SO_2
SO_3 ~~ SO_4
SO_5 ~~ SO_6
SO_5 ~~ SO_8
'
# Now lets run the SEM with the same
fit_SO_simple <- lavaan::cfa(model = model_SO_simple, estimator = "WLSMVS", data=test_DF2_ord[,SO_row_name], ordered = c("SO_1" , "SO_2" , "SO_3" , "SO_4", "SO_5", "SO_6", "SO_8"))
The RMSEA of this model is 0.055 (95% CI 0.03 - 0.08), and the Comparative Fit Index (CFI) is 0.996.
Although the structure of the LOTR is established, the polychoric correlation and Ordinal Alpha between the LOTR items were explored.
# Ordinal Alpha
round(alpha(poly_cor$rho)$total,2) # the Ordinal Alpha, using polychoric correlation mateix
Furthermore, the relationship between each item and the assumed latent variable dispositional optimism was also explored using a graded response model.
A SEM corresponding to the structure of the LOTR was fit, using the first multiple imputation dataset.
model_OP_method <- '
###### Dispositional optimism
# Dispositional optimism
OP =~ LOTR_1 + LOTR_2 + LOTR_3 + LOTR_4 + LOTR_5 + LOTR_6
# The method effect
method =~ LOTR_1 + LOTR_3 + LOTR_6
# OP and the method effect are orthognal
OP ~~0*method
'
# Now lets run the SEM with the same
fit_OP_meth <- lavaan::sem(model=model_OP_method, estimator = "WLSMVS", data=mice.imp[[1]] , ordered = c("LOTR_1" , "LOTR_2" , "LOTR_3" , "LOTR_4", "LOTR_5", "LOTR_6") )
The RMSEA of this model is 0.064 (95% CI 0.05 - 0.08), and CFI is 0.995.
After developing the structure of the situational optimism and LOTR parts of the model, these were combined in a full structural equation model, including additional explanatory variables.
# The model
model_SO_1 <- '
# Situational optimism
SO =~ SO_1 + SO_2 + SO_3 +SO_4 + SO_5+ SO_6 + SO_8
# Correlated error terms
SO_1 ~~ SO_2
SO_3 ~~ SO_4
SO_5 ~~ SO_6
SO_5 ~~ SO_8
# Dispositional optimism
OP =~ LOTR_1 + LOTR_2 + LOTR_3 + LOTR_4 + LOTR_5 + LOTR_6
# The method effect
method =~ LOTR_1 + LOTR_3 + LOTR_6
# OP and the method effect are orthogonal
OP ~~0*method
# Regression part
### Regression part
SO ~ OP +
# Years conservation
years_cons_scaled +
# Gender - RL = female
gender_Male + gender_Unknown +
# WH
WH_scaled +
# Academic/practice - RL = academic
position_simple_Practice + position_simple_Unknown.other +
# Education - RL = non-university
education_simple_University + education_simple_Unknown +
# SO region - RL = SO_region_NorthAmeric
SO_region_EastAsia_Pacific + SO_region_Europe_CentralAsia + SO_region_LatinAmerica_Caribbean + SO_region_MiddleEast_NorthAfrica + SO_region_SouthAsia + SO_region_Sub_SaharanAfrica + SO_region_Unknown
'
model_SO_1_MI <- runMI(model_SO_1, data = mice.imp, fun="sem", estimator = "WLSMVS", ordered = c("SO_1" , "SO_2" , "SO_3" , "SO_4", "SO_5", "SO_6", "SO_7", "SO_9","LOTR_1" , "LOTR_2" , "LOTR_3" , "LOTR_4", "LOTR_5", "LOTR_6"), FUN = fitMeasures)
# Extracting the fit measures for each model
# Lists
list_fits <- list()
# Extract
for (i in seq_along(1:length(mice.imp))){
list_fits[[i]] <- data.frame(
MI_DF <- i,
CFI = model_SO_1_MI@funList[[i]]$userFUN1["cfi"],
RMSEA = model_SO_1_MI@funList[[i]]$userFUN1["rmsea"],
RMSEA_lower = model_SO_1_MI@funList[[i]]$userFUN1["rmsea.ci.lower"],
RMSEA_upper = model_SO_1_MI@funList[[i]]$userFUN1["rmsea.ci.upper"])
}
# into a single DF
list_fits_DF <- do.call(rbind,list_fits)
The mean CFI for the models implimented for the ten imputed datasets were 0.949 (SD = 0), and the mean RMSEA is 0.054 (SD = 0).
The composite variable, and stacked datasets, were constructed.
# A function that returns complete cases for each goal, progress towards each goal, the name of that goal, anc converting back from numeric to ordered factor
complete.goals <- function(DF, goal){
DF_goal <- DF[complete.cases(DF[,goal]),]
DF_goal$goal_num <- as.factor(goal)
DF_goal$goal_prog <- DF_goal[,goal]
DF_goal$goal_prog <- as.ordered(DF_goal$goal_prog)
return(DF_goal)
}
# Lists of the stacked DF (stacked by complete case of each goal) for each MI dataset
mice.imp_personal <- list_along(1:length(mice.imp))
mice.imp_people <- list_along(1:length(mice.imp))
mice.imp_nature <- list_along(1:length(mice.imp))
for (i in seq_along(1:10)){
# Personal
mice.imp_personal[[i]] <- rbind(complete.goals(mice.imp[[i]], "GP_2_b" ), complete.goals(mice.imp[[i]], "GP_3_b" ), complete.goals(mice.imp[[i]], "GP_4_b" ))
mice.imp_personal[[i]]$goal_prog <- as.ordered(mice.imp_personal[[i]]$goal_prog)
mice.imp_personal[[i]]$goal_prog <- plyr::mapvalues(mice.imp_personal[[i]]$goal_prog, from = c("1", "2","3", "4", "5"), to = c("Very dissatisfied" , "Dissatisfied" , "Neutral" , "Satisfied" , "Very satisfied"))
# People
mice.imp_people[[i]] <- rbind(complete.goals(mice.imp[[i]], "GP_6_b" ), complete.goals(mice.imp[[i]], "GP_7_b" ), complete.goals(mice.imp[[i]], "GP_8_b" ))
mice.imp_people[[i]]$goal_prog <- as.ordered(mice.imp_people[[i]]$goal_prog)
mice.imp_people[[i]]$goal_prog <- plyr::mapvalues(mice.imp_people[[i]]$goal_prog, from = c("1", "2","3", "4", "5"), to = c("Very dissatisfied" , "Dissatisfied" , "Neutral" , "Satisfied" , "Very satisfied"))
# Nature
mice.imp_nature[[i]] <- rbind(complete.goals(mice.imp[[i]], "GP_5_b" ), complete.goals(mice.imp[[i]], "GP_9_b" ), complete.goals(mice.imp[[i]], "GP_10_b" ))
mice.imp_nature[[i]]$goal_prog <- as.ordered(mice.imp_nature[[i]]$goal_prog)
mice.imp_nature[[i]]$goal_prog <- plyr::mapvalues(mice.imp_nature[[i]]$goal_prog, from = c("1", "2","3", "4", "5"), to = c("Very dissatisfied" , "Dissatisfied" , "Neutral" , "Satisfied" , "Very satisfied"))
}
Reported progress for the goals was expected to be positively correlated with each other, which they were.
# Polychoric correlation between variables
poly_cor = polychoric(mice.imp[[1]][,c("GP_1_b","GP_2_b","GP_3_b","GP_4_b","GP_5_b","GP_6_b","GP_7_b","GP_8_b","GP_9_b","GP_10_b")])
cor.plot(poly_cor$rho, numbers=F, upper=FALSE, main = "Polychoric Correlation", show.legend = T)
The degree to which the proportional odds assumptions was likely to hold was visually inspected, for each goal type model.
The proportional odds assumption generally held, although:
# Counts of regions
table(mice.imp[[1]]$W_region)
##
## EastAsia_Pacific Europe_CentralAsia LatinAmerica_Caribbean
## 329 691 144
## MiddleEast_NorthAfrica NorthAmerica SouthAsia
## 28 495 232
## Sub_SaharanAfrica Unknown
## 272 127
# Combine SSA and NA/ME
for (i in seq_along(1:length(mice.imp))){
mice.imp[[i]]$W_region <- as.factor(mice.imp[[i]]$W_region)
mice.imp[[i]]$W_region <- recode(mice.imp[[i]]$W_region , MiddleEast_NorthAfrica = "Africa_MiddleEast", Sub_SaharanAfrica = "Africa_MiddleEast")
mice.imp[[i]]$W_region <- droplevels(mice.imp[[i]]$W_region)
mice.imp[[i]]$W_region <- relevel(mice.imp[[i]]$W_region, "NorthAmerica")
}
The following checks were made when implementing the Bayesian ordinal regressions using the first imputed dataset. This follows the When to worry and how to Avoid the Misuse of Bayesian Statistics (WAMBS)-Checklist. As uninformative priors were used, the steps involving comparing informative to uninformative priors were skipped.
1) Description of priors
There was limited previous research which could be used to construct informative priors. Consequently, as this was an exploratory study, generic and unformative priors were chosen. These were considered uninformative, when compared to the sample size and explanatory variable scale (all continuous variables were scaled and centered). These are generic, and so the same prior distributional form was supplied for all coefficient and intercept parameters, as there was no prior evidence suggesting a specific form, and the normal distribution was considered to be parsimonious. Consequently, all coefficient and intercepts priors were fixed to
\[ prior \sim \mathcal{N}(0,100)\ \]
where $ $ denotes the normal distribution. Additionally, the default weakly informative priors were used for the standard deviation of random effects. This was a half student-t distribution with 3 degrees of freedom, and a scale parameter of 2.5. This prior was used in all three models.
# Combine SSA and NA/ME
for (i in seq_along(1:length(mice.imp))){
mice.imp[[i]]$W_region <- as.factor(mice.imp[[i]]$W_region)
mice.imp[[i]]$W_region <- recode(mice.imp[[i]]$W_region , MiddleEast_NorthAfrica = "Africa_MiddleEast", Sub_SaharanAfrica = "Africa_MiddleEast")
mice.imp[[i]]$W_region <- droplevels(mice.imp[[i]]$W_region)
mice.imp[[i]]$W_region <- relevel(mice.imp[[i]]$W_region, "NorthAmerica")
}
# A function that returns complete cases for each goal, progress towards each goal, the name of that goal, anc converting back from numeric to ordered factor
complete.goals <- function(DF, goal){
DF_goal <- DF[complete.cases(DF[,goal]),]
DF_goal$goal_num <- as.factor(goal)
DF_goal$goal_prog <- DF_goal[,goal]
DF_goal$goal_prog <- as.ordered(DF_goal$goal_prog)
return(DF_goal)
}
# Lists of the stacked DF (stacked by complete case of each goal) for each MI dataset
mice.imp_personal <- list_along(1:length(mice.imp))
mice.imp_people <- list_along(1:length(mice.imp))
mice.imp_nature <- list_along(1:length(mice.imp))
for (i in seq_along(1:10)){
# Personal
mice.imp_personal[[i]] <- rbind(complete.goals(mice.imp[[i]], "GP_2_b" ), complete.goals(mice.imp[[i]], "GP_3_b" ), complete.goals(mice.imp[[i]], "GP_4_b" ))
mice.imp_personal[[i]]$goal_prog <- as.ordered(mice.imp_personal[[i]]$goal_prog)
mice.imp_personal[[i]]$goal_prog <- plyr::mapvalues(mice.imp_personal[[i]]$goal_prog, from = c("1", "2","3", "4", "5"), to = c("Very dissatisfied" , "Dissatisfied" , "Neutral" , "Satisfied" , "Very satisfied"))
# People
mice.imp_people[[i]] <- rbind(complete.goals(mice.imp[[i]], "GP_6_b" ), complete.goals(mice.imp[[i]], "GP_7_b" ), complete.goals(mice.imp[[i]], "GP_8_b" ))
mice.imp_people[[i]]$goal_prog <- as.ordered(mice.imp_people[[i]]$goal_prog)
mice.imp_people[[i]]$goal_prog <- plyr::mapvalues(mice.imp_people[[i]]$goal_prog, from = c("1", "2","3", "4", "5"), to = c("Very dissatisfied" , "Dissatisfied" , "Neutral" , "Satisfied" , "Very satisfied"))
# Nature
mice.imp_nature[[i]] <- rbind(complete.goals(mice.imp[[i]], "GP_5_b" ), complete.goals(mice.imp[[i]], "GP_9_b" ), complete.goals(mice.imp[[i]], "GP_10_b" ))
mice.imp_nature[[i]]$goal_prog <- as.ordered(mice.imp_nature[[i]]$goal_prog)
mice.imp_nature[[i]]$goal_prog <- plyr::mapvalues(mice.imp_nature[[i]]$goal_prog, from = c("1", "2","3", "4", "5"), to = c("Very dissatisfied" , "Dissatisfied" , "Neutral" , "Satisfied" , "Very satisfied"))
}
# The model formula
brm_form <- formula("goal_prog ~ goal_num + years_cons_scaled + gender + WH_scaled + position_simple + education_simple + W_region + (1|ID)")
The cumulative logit models were run with the above specified priors, for 20,000 iterations with a burn-in of 10000 iterations which were discarded, with four chains.
### Run the models for the first imputed dataset ###
burn_in <- 10000
iterations <- 20000
# Personal
start_time_personal <- Sys.time()
personal_model <- brm(brm_form, family = cumulative(link = "logit", threshold = "flexible"), data = mice.imp_personal[[1]], warmup = burn_in, iter = iterations, chains = 4, seed = 123, prior = prior_all)
save(personal_model, file ="personal_model.RData")
end_time_personal <- Sys.time()
end_time_personal - start_time_personal
# People
start_time_people <- Sys.time()
people_model <- brm(brm_form, family = cumulative(link = "logit", threshold = "flexible"), data = mice.imp_people[[1]], warmup = burn_in, iter = iterations, chains = 4, seed = 123, prior = prior_all)
save(people_model, file ="people_model.RData")
end_time_people <- Sys.time()
end_time_people - start_time_people
# Nature
start_time_nature <- Sys.time()
nature_model <- brm(brm_form, family = cumulative(link = "logit", threshold = "flexible"), data = mice.imp_nature[[1]], warmup = burn_in, iter = iterations, chains = 4, seed = 123, prior = prior_all)
save(nature_model, file ="nature_model.RData")
end_time_nature <- Sys.time()
end_time_nature - start_time_nature
# # load models run elsewhere
path2 <- c("C:/Users/wolf5246/Dropbox/Oxford/PhD/Chapter 4/Analysis/Processing/")
load(paste0(path2, "personal_model.Rdata"))
load(paste0(path2, "people_model.Rdata"))
load(paste0(path2, "nature_model.Rdata"))
# s3load(object = "personal_model.Rdata", bucket = s3BucketName)
# s3load(object = "people_model.Rdata", bucket = s3BucketName)
# s3load(object = "nature_model.Rdata", bucket = s3BucketName)
2) Examining convergence
The trace-plots for each model and chain, and for each parameter, indicate convergence. PSRF values from the Gelman-Rubin Diagnostic were also examined, checking if the upper CI was near 1, which it was for all models and parameters.
The Geweke Diagnostic, testing for equality between the first and last half of the iterations, was performed. The estimates were also plotted, examining if the values exceed the 1.96 or below -1.96 z-score boundaries. Both tests indicated the models had converged.
# Gelman-Rubin Diagnostic
personal_modelposterior <- as.mcmc(personal_model)
#gelman.diag(personal_modelposterior[, 1:param_num])
gelman.plot(personal_modelposterior[, 1:param_num])
# Geweke diagnostic
#geweke.diag(personal_modelposterior[,1:param_num])
geweke.plot(personal_modelposterior[, 1:param_num])
# Gelman-Rubin Diagnostic
people_modelposterior <- as.mcmc(people_model)
#gelman.diag(people_modelposterior[, 1:param_num])
gelman.plot(people_modelposterior[, 1:param_num])
# Geweke diagnostic
#geweke.diag(people_modelposterior[,1:param_num])
geweke.plot(people_modelposterior[, 1:param_num])
# Gelman-Rubin Diagnostic
nature_modelposterior <- as.mcmc(nature_model)
#gelman.diag(nature_modelposterior[, 1:param_num])
gelman.plot(nature_modelposterior[, 1:param_num])
# Geweke diagnostic
#geweke.diag(nature_modelposterior[, 1:param_num])
geweke.plot(nature_modelposterior[, 1:param_num])
3) Examination of convergence after doubling the number of iterations
To check that the models had not simply reached local convergence, the model was re-run with twice the number of iterations, although we had to reduce the number of chains because the object was very large. We also re-examined the Gelman-Rubin and Geweke diagnostics. There appeared to be no indication of local convergence.
### Run the models for the first imputed dataset ###
burn_in <- 10000
iterations_double <- 30000
# Personal
personal_model_double <- brm(brm_form, family = cumulative(link = "logit", threshold = "flexible"), data = mice.imp_personal[[1]], warmup = burn_in, iter = iterations_double, chains = 2, seed = 123, prior = prior_all)
save(personal_model_double, file ="personal_model_double.RData")
# People
people_model_double <- brm(brm_form, family = cumulative(link = "logit", threshold = "flexible"), data = mice.imp_people[[1]], warmup = burn_in, iter = iterations_double, chains = 2, seed = 123, prior = prior_all)
save(people_model_double, file ="people_model_double.RData")
# Nature
nature_model_double <- brm(brm_form, family = cumulative(link = "logit", threshold = "flexible"), data = mice.imp_nature[[1]], warmup = burn_in, iter = iterations_double, chains = 2, seed = 123, prior = prior_all)
save(nature_model_double, file ="nature_model_double.RData")
# I have to clear everything for this chunk to run
rm(list= ls())
# iterations
burn_in <- 10000
iterations_double <- 30000
param_num <- 21
# load models run elsewhere
path2 <- c("C:/Users/wolf5246/Dropbox/Oxford/PhD/Chapter 4/Analysis/Processing/")
load(paste0(path2, "personal_model_double.Rdata"))
# s3load(object = "personal_model_double.Rdata", bucket = s3BucketName)
# s3load(object = "people_model_double.Rdata", bucket = s3BucketName)
# s3load(object = "nature_model_double.Rdata", bucket = s3BucketName)
# Gelman-Rubin Diagnostic
personal_model_posterior_d <- as.mcmc(personal_model_double)
#gelman.diag(personal_model_posterior_d[, 1:param_num])
gelman.plot(personal_model_posterior_d[, 1:param_num])
# Gelman-Rubin Diagnostic
people_model_posterior_d <- as.mcmc(people_model_double)
#gelman.diag(people_model_posterior_d[, 1:param_num])
gelman.plot(people_model_posterior_d[, 1:param_num])
# Gelman-Rubin Diagnostic
nature_model_posterior_d <- as.mcmc(nature_model_double)
#gelman.diag(nature_model_posterior_d[, 1:param_num])
gelman.plot(nature_model_posterior_d[, 1:param_num])
4) Examine the histogram of estimates in each iteration to determine if there were enough iterations to approximate the posterior
Examining a histogram of posterior distribution density indicates if it contains enough information. Regression coefficients do not have to be symetric, but should form a peak with sloping sides.
5) Examine the degree of autocorrelation between the chains
The autocorrelation between chains was examined. High autocorrelation alone is not an issue, but if accompanied by poor convergence, might be indicative of several problems.
6) Evaluate if the posterior distribution is sensible
The posterior distributions were checked to see if they are sensible. This included checking they were smooth, did not have a posterior SD greater than the scale of the original parameter, and the range of the posterior CI was not greater than the scale of the original parameter.
# Personal
mcmc_plot(personal_model, type = "dens", pars = "^b_")
# People
mcmc_plot(people_model, type = "dens", pars = "^b_")
# Nature
mcmc_plot(nature_model, type = "dens", pars = "^b_")