Missing data and multiple imputation

# Subset to get the complete cases DF
analysis_var <-c("SO_1" , "SO_2" , "SO_3" ,"SO_4" , "SO_5", "SO_6" , "SO_7" , "SO_8" , "SO_9" , "SO_10", "LOTR_1" , "LOTR_2" , "LOTR_3" ,  "LOTR_4" , "LOTR_5" , "LOTR_6", "years_cons_scaled" , "gender", "WH_scaled", "position_simple" ,"education_simple" , "SO_coun", "GP_1_a","GP_2_a","GP_3_a","GP_4_a","GP_5_a","GP_6_a","GP_7_a","GP_8_a","GP_9_a","GP_10_a") # Data used in the analysis 
DF2_variables <- DF2[,c(analysis_var)]
DF2_complete.cases <- DF2_variables[complete.cases(DF2_variables),]

missing_years <- paste0(round(table(is.na(DF2$years_cons))[2]/nrow(DF2)*100,1), "%")

A total of 2323 completed the survey; there are complete observations for 1853 respondents (including missing observations for categorical data, which are included with the level “unknown”). The variable with the highest rate of missing data was ‘years in conservation’, where 13.9% of observations were missing.

Additionally, goal progress questions are answered in a step-wise fashion, meaning there are expected patterns of missingness for the GP_1_b to GP_10_b data. Years in conservation (“years_cons”), work hours (“WH”), and age (“age_year”) may not be missing at random. (Work country is likely to be missing at random, since it was asked later in the survey, and so is probably not answered because respondents did not complete the survey.) The consequences of this missingness will be evaluated later.

This missing data is substituted through multivariate imputation by chained equations (MICE) of ten datasets, using the package ‘mice’. Here, synthetic values are computed based on the variables in the data, using an appropriate model type. Ordered logistic regression was used to impute ordinal data, predictive mean matching is used for numeric variables, polytomous regression for un-ordered factors, and logistic regression for binary data. Additional data, not used in the analysis, was included to help impute the missing values (e.g. “health”).

############################# MI ############################# 
### Setting up the envionrment and loading data ###
### Subsetting the data used in the analysis ###
# Retain variables used in the analysis
analysis_vars <- c("GP_1_a","GP_2_a", "GP_3_a","GP_4_a", "GP_5_a","GP_6_a", "GP_7_a","GP_8_a", "GP_9_a","GP_10_a","K10_1","K10_2","K10_3","K10_4","K10_5","K10_6","K10_7","K10_8","K10_9","K10_10","LOTR_1","LOTR_3","LOTR_6","LOTR_2","LOTR_4" , "LOTR_5","SO_1", "SO_2", "SO_3", "SO_4","SO_5", "SO_6", "SO_8", "SO_7", "SO_9","SO_10","GP_total_scaled","ERI_n_scaled", "ERI_o_scaled","age_year_scaled","SO_11","health","years_cons_scaled", "gender","K10_total","SS1",  "SS2", "SS3",  "PS_1", "PS_2","PS_3", "national.nnational" , "WH_scaled" , "CV", "position_simple", "education_simple")
DF3 <- DF2[analysis_vars]

# Convert to ordinal
DF3[,c("K10_1","K10_2","K10_3","K10_4","K10_5","K10_6","K10_7","K10_8","K10_9","K10_10","LOTR_1","LOTR_3","LOTR_6","LOTR_2","LOTR_4" , "LOTR_5","SO_1", "SO_2", "SO_3", "SO_4","SO_5", "SO_6", "SO_8", "SO_7", "SO_9","SO_10","SO_11","health","PS_1","PS_2", "PS_3","SS1", "SS2","SS3")] <- lapply(DF3[,c("K10_1","K10_2","K10_3","K10_4","K10_5","K10_6","K10_7","K10_8","K10_9","K10_10","LOTR_1","LOTR_3","LOTR_6","LOTR_2","LOTR_4" , "LOTR_5","SO_1", "SO_2", "SO_3", "SO_4","SO_5", "SO_6", "SO_8", "SO_7", "SO_9","SO_10","SO_11","health","PS_1","PS_2", "PS_3","SS1", "SS2","SS3")], as.ordered)

# And and converted to numeric
DF3[,c("GP_total_scaled","ERI_n_scaled","ERI_o_scaled","age_year_scaled","years_cons_scaled","K10_total","WH_scaled")] <- lapply(DF3[,c("GP_total_scaled","ERI_n_scaled","ERI_o_scaled","age_year_scaled","years_cons_scaled","K10_total","WH_scaled")], as.numeric)

# Ensuring factors are coded as such 
DF3[,c("education_simple", "gender",  "position_simple")] <- lapply(DF3[,c("education_simple", "gender","position_simple")], as.factor)

# Drop "other" as a catagory in gender, as there were too few observations to model 
DF3<-DF3[!(DF3$gender=="Other"),]
DF3<- droplevels(DF3)

### Conducting the MI ### 

# Number of imputed df 
N.Imp = 10

# Conduct multiple imputation using the mice package
# Here we have to specify the imputation method for each variable: 
str_out <- data.frame(capture.output(str(DF3)))

# Delete the first and last row
str_out <- data.frame(str_output = matrix(str_out[2:(nrow(str_out)-1),]))

# Create a column that contain the appropreate model for each variable - this only works if the variable is of the correct data type in the first place 
str_out$type <- ifelse(grepl("Ord.factor", str_out$str_output, fixed = TRUE)==T, "polr", 
                       ifelse(grepl("num", str_out$str_output, fixed = TRUE)==T, "pmm", 
                              ifelse(grepl("Factor", str_out$str_output, fixed = TRUE)==T, "polyreg",
                                     ifelse(grepl("int", str_out$str_output, fixed = TRUE)==T, "logreg", "ERROR"))))

# Conduct the MI - with the number of datasets specified by N.Imp, and the estimation type specified by str_out$type (derived from the above)
DF3_imp <- mice(DF3, m = N.Imp, method = str_out$type )

# GP_b variables
GP_b <- c("GP_1_b","GP_2_b","GP_3_b","GP_4_b","GP_6_b","GP_7_b","GP_8_b"  , "GP_5_b" ,"GP_9_b","GP_10_b")

# Drop "other" as a catagory in gender, as there were too few observations to model 
DF2_sub<-DF2[!(DF2$gender=="Other"),]
DF2_sub<- droplevels(DF2_sub)

# Extract each DF
mice.imp <- list()
for(i in 1:N.Imp) {
  
  mice.imp[[i]] <- mice::complete(DF3_imp, action= i, inc=FALSE)
  
  # Add data 
  mice.imp[[i]]$Nation <- DF2_sub$Nation
  mice.imp[[i]]$W_coun <- DF2_sub$W_coun
  mice.imp[[i]]$SO_coun <- DF2_sub$SO_coun
  mice.imp[[i]]$nation_region <- DF2_sub$nation_region
  mice.imp[[i]]$W_region <- DF2_sub$W_region
  mice.imp[[i]]$SO_region <- DF2_sub$SO_region
  
  # Add Goal progress, and make sure they're all ordered 
  mice.imp[[i]] <- cbind(mice.imp[[i]], DF2_sub[,GP_b] )
  
  # Add respondent ID 
  mice.imp[[i]]$ID <- DF2_sub$ID
  
  # Turn factor into series of binary variables
  mice.imp[[i]] <- dummy_cols(mice.imp[[i]], select_columns = c("gender", "education_simple", "position_simple", "nation_region", "W_region","SO_region"))
  
  # remove spaces and other charecters on colnames
  colnames( mice.imp[[i]]) <- gsub(" ", "", colnames( mice.imp[[i]]), fixed = TRUE)
  colnames( mice.imp[[i]]) <- gsub("/", "", colnames( mice.imp[[i]]), fixed = TRUE)
  colnames( mice.imp[[i]]) <- gsub("-", "_", colnames( mice.imp[[i]]), fixed = TRUE)
  colnames( mice.imp[[i]]) <- gsub("&", "_", colnames( mice.imp[[i]]), fixed = TRUE)
  }

# Save the DF 
save(mice.imp, file =  paste0(path, "mice.imp.Rdata")) 

Situational optimism

The following steps were taken in the development of the situational optimism latent variable. First, the first imputed dataset was randomly split into training (70%) and test (30%) data.

The polychoric correlation between the ten situational optimism items within the training data were examined, showing good correlation.

The Ordinal Alpha was then examined, showing good internal conssitency between items.

# Ordinal Alpha 
round(alpha(poly_cor$rho)$total,2) # the Ordinal Alpha, using polychoric correlation mateix 

Parallel analysis of the ten items within the training data, using polychoric correlation and WLS was performed, suggested the extraction of three factors.

## Parallel analysis suggests that the number of factors =  3  and the number of components =  NA

The root mean square error of approximation (RMSEA) across exploratory factor analysis with increasing numbers of factors was examined. The three factor model had a very high RMSEA, whereas the five factor model (the number of factors expected to be extracted) had a good RMSEA.

Factors RMSEA lower upper
1 0.167 0.160 0.174
2 0.120 0.112 0.129
3 0.089 0.079 0.099
4 0.056 0.044 0.070
5 0.033 0.013 0.054

The factor loadings for the five factor exploratory factor analysis was inspected. When repeating the analysis with differing random seeds, seven items were consistently loaded onto three factors, but three items were unstable, cross-loaded, or were the only item loading on a single factor. Consequently, these three items were removed.

Factor 1 Factor 2 Factor 3 Factor 4 Factor 5
Public support for conservation will grow over the next ten years (SO_1) 0.62
Government spending on conservation will grow over the next ten years (SO_2) 0.80
The harmful impact of people on nature will be less in ten years’ time than it is now (SO_3) 0.66
Human society will be more environmentally sustainable in ten years’ time than it is now (SO_4) 0.89
There will be more wildlife in ten years’ time than there is today (SO_5) 0.92
There will be more natural areas and habitats in ten years’ time than there are today (SO_6) 0.72
People will spend more recreational time in nature in ten years’ time than they do now (SO_7) 1
Nature will be able to provide the same benefits to people in ten years’ time as now (SO_8) 0.47
There will be more local participation in conservation in ten years’ time than now (SO_9) 0.33
Conservationists will have better tools and knowledge in ten years’ time than now (SO_10) 0.83

A graded response model was used to explore the association between the assumed latent variable, situational optimism, and each item.

Furthermore, the analysis sought to extract a single latent variable, representing respondents’ optimism about the future of conservation. Consequently, correlated error terms were included for those items loaded on the same factor, using the test data.

# Using the same sub-set of data, but with ordered factors  
test_DF2_ord <-  mice.imp[[1]][-train_ind, ]

# The model
model_SO_simple <- '
SO =~ SO_1 + SO_2 + SO_3 +SO_4 + SO_5+ SO_6  + SO_8

# Correlated error terms 
SO_1 ~~ SO_2
SO_3 ~~ SO_4 
SO_5 ~~ SO_6 
SO_5 ~~ SO_8 
'

# Now lets run the SEM with the same
fit_SO_simple <- lavaan::cfa(model = model_SO_simple,  estimator = "WLSMVS", data=test_DF2_ord[,SO_row_name], ordered = c("SO_1" , "SO_2" , "SO_3" , "SO_4", "SO_5", "SO_6",  "SO_8"))

The RMSEA of this model is 0.055 (95% CI 0.03 - 0.08), and the Comparative Fit Index (CFI) is 0.996.

Dispositional optimism

Although the structure of the LOTR is established, the polychoric correlation and Ordinal Alpha between the LOTR items were explored.

# Ordinal Alpha 
round(alpha(poly_cor$rho)$total,2) # the Ordinal Alpha, using polychoric correlation mateix

Furthermore, the relationship between each item and the assumed latent variable dispositional optimism was also explored using a graded response model.

A SEM corresponding to the structure of the LOTR was fit, using the first multiple imputation dataset.

model_OP_method <- '
###### Dispositional optimism 
# Dispositional optimism 
OP =~ LOTR_1 + LOTR_2 + LOTR_3 +  LOTR_4 + LOTR_5 + LOTR_6

# The method effect 
method =~  LOTR_1 + LOTR_3 + LOTR_6

# OP and the method effect are orthognal 
OP ~~0*method
'

# Now lets run the SEM with the same
fit_OP_meth <- lavaan::sem(model=model_OP_method,  estimator = "WLSMVS",  data=mice.imp[[1]] , ordered = c("LOTR_1" , "LOTR_2" , "LOTR_3" , "LOTR_4", "LOTR_5", "LOTR_6") )

The RMSEA of this model is 0.064 (95% CI 0.05 - 0.08), and CFI is 0.995.

Full SEM

After developing the structure of the situational optimism and LOTR parts of the model, these were combined in a full structural equation model, including additional explanatory variables.

# The model
model_SO_1 <- '
# Situational optimism 
SO =~ SO_1 + SO_2 + SO_3 +SO_4 + SO_5+ SO_6  + SO_8

# Correlated error terms 
SO_1 ~~ SO_2
SO_3 ~~ SO_4 
SO_5 ~~ SO_6 
SO_5 ~~ SO_8 

# Dispositional optimism 
OP =~ LOTR_1 + LOTR_2 + LOTR_3 +  LOTR_4 + LOTR_5 + LOTR_6

# The method effect 
method =~  LOTR_1 + LOTR_3 + LOTR_6

# OP and the method effect are orthogonal 
OP ~~0*method

# Regression part
### Regression part
SO ~ OP +

# Years conservation
years_cons_scaled + 

# Gender - RL = female 
gender_Male + gender_Unknown + 

# WH
WH_scaled + 

# Academic/practice - RL = academic 
position_simple_Practice + position_simple_Unknown.other + 

# Education - RL = non-university
education_simple_University  + education_simple_Unknown + 

# SO region - RL = SO_region_NorthAmeric
SO_region_EastAsia_Pacific + SO_region_Europe_CentralAsia + SO_region_LatinAmerica_Caribbean + SO_region_MiddleEast_NorthAfrica + SO_region_SouthAsia + SO_region_Sub_SaharanAfrica + SO_region_Unknown 
'
model_SO_1_MI <- runMI(model_SO_1, data = mice.imp, fun="sem", estimator = "WLSMVS", ordered = c("SO_1" , "SO_2" , "SO_3" , "SO_4", "SO_5", "SO_6", "SO_7",  "SO_9","LOTR_1" , "LOTR_2" , "LOTR_3" , "LOTR_4", "LOTR_5", "LOTR_6"), FUN = fitMeasures)
# Extracting the fit measures for each model 
# Lists
list_fits <- list()
# Extract
for (i in seq_along(1:length(mice.imp))){
  list_fits[[i]] <- data.frame(
    MI_DF <- i,
    CFI = model_SO_1_MI@funList[[i]]$userFUN1["cfi"],
    RMSEA = model_SO_1_MI@funList[[i]]$userFUN1["rmsea"],
    RMSEA_lower = model_SO_1_MI@funList[[i]]$userFUN1["rmsea.ci.lower"],
    RMSEA_upper = model_SO_1_MI@funList[[i]]$userFUN1["rmsea.ci.upper"])
}
# into a single DF 
list_fits_DF <- do.call(rbind,list_fits)

The mean CFI for the models implimented for the ten imputed datasets were 0.949 (SD = 0), and the mean RMSEA is 0.054 (SD = 0).

Goal progress

The composite variable, and stacked datasets, were constructed.

# A function that returns complete cases for each goal, progress towards each goal, the name of that goal, anc converting back from numeric to ordered factor 
complete.goals <- function(DF, goal){
  DF_goal <- DF[complete.cases(DF[,goal]),]
  DF_goal$goal_num <- as.factor(goal) 
  DF_goal$goal_prog <- DF_goal[,goal]
  DF_goal$goal_prog <- as.ordered(DF_goal$goal_prog)
  return(DF_goal)
}

# Lists of the stacked DF (stacked by complete case of each goal) for each MI dataset
mice.imp_personal <- list_along(1:length(mice.imp))
mice.imp_people <- list_along(1:length(mice.imp))
mice.imp_nature <- list_along(1:length(mice.imp))

for (i in seq_along(1:10)){
  # Personal 
  mice.imp_personal[[i]] <- rbind(complete.goals(mice.imp[[i]], "GP_2_b" ), complete.goals(mice.imp[[i]], "GP_3_b" ),  complete.goals(mice.imp[[i]], "GP_4_b" ))
  mice.imp_personal[[i]]$goal_prog <- as.ordered(mice.imp_personal[[i]]$goal_prog)
  mice.imp_personal[[i]]$goal_prog <-  plyr::mapvalues(mice.imp_personal[[i]]$goal_prog, from = c("1", "2","3", "4", "5"), to = c("Very dissatisfied" , "Dissatisfied" , "Neutral" , "Satisfied" , "Very satisfied"))
  # People
    mice.imp_people[[i]] <- rbind(complete.goals(mice.imp[[i]], "GP_6_b"  ), complete.goals(mice.imp[[i]], "GP_7_b" ),  complete.goals(mice.imp[[i]],  "GP_8_b" ))
      mice.imp_people[[i]]$goal_prog <- as.ordered(mice.imp_people[[i]]$goal_prog)
  mice.imp_people[[i]]$goal_prog <-  plyr::mapvalues(mice.imp_people[[i]]$goal_prog, from = c("1", "2","3", "4", "5"), to = c("Very dissatisfied" , "Dissatisfied" , "Neutral" , "Satisfied" , "Very satisfied"))
    
    # Nature
      mice.imp_nature[[i]] <- rbind(complete.goals(mice.imp[[i]], "GP_5_b" ), complete.goals(mice.imp[[i]], "GP_9_b" ),  complete.goals(mice.imp[[i]], "GP_10_b" ))
        mice.imp_nature[[i]]$goal_prog <- as.ordered(mice.imp_nature[[i]]$goal_prog)
  mice.imp_nature[[i]]$goal_prog <-  plyr::mapvalues(mice.imp_nature[[i]]$goal_prog, from = c("1", "2","3", "4", "5"), to = c("Very dissatisfied" , "Dissatisfied" , "Neutral" , "Satisfied" , "Very satisfied"))
}

Reported progress for the goals was expected to be positively correlated with each other, which they were.

# Polychoric correlation between variables
poly_cor = polychoric(mice.imp[[1]][,c("GP_1_b","GP_2_b","GP_3_b","GP_4_b","GP_5_b","GP_6_b","GP_7_b","GP_8_b","GP_9_b","GP_10_b")])
cor.plot(poly_cor$rho, numbers=F, upper=FALSE, main = "Polychoric Correlation", show.legend = T)

The degree to which the proportional odds assumptions was likely to hold was visually inspected, for each goal type model.

The proportional odds assumption generally held, although:

  • “MiddleEast_NorthAfrica” appears to have inconsistent associations compared to other regions. To address this, “MiddleEast_NorthAfrica” was combined with “Sub_SaharanAfrica” into the variable “Africa_MiddleEast”. See the next step, to see if it helped.
  • One of the goal name levels (GP_4_b) was inconsistent with the others. The extent to which this influenced the results will be examined (later).
# Counts of regions 
table(mice.imp[[1]]$W_region)
## 
##       EastAsia_Pacific     Europe_CentralAsia LatinAmerica_Caribbean 
##                    329                    691                    144 
## MiddleEast_NorthAfrica           NorthAmerica              SouthAsia 
##                     28                    495                    232 
##      Sub_SaharanAfrica                Unknown 
##                    272                    127
# Combine SSA and NA/ME
for (i in seq_along(1:length(mice.imp))){
  mice.imp[[i]]$W_region <- as.factor(mice.imp[[i]]$W_region)
  mice.imp[[i]]$W_region <- recode(mice.imp[[i]]$W_region , MiddleEast_NorthAfrica = "Africa_MiddleEast", Sub_SaharanAfrica = "Africa_MiddleEast")
   mice.imp[[i]]$W_region <- droplevels(mice.imp[[i]]$W_region)
   mice.imp[[i]]$W_region <- relevel(mice.imp[[i]]$W_region, "NorthAmerica")
}

Fitting the Bayesian ordinal regression

The following checks were made when implementing the Bayesian ordinal regressions using the first imputed dataset. This follows the When to worry and how to Avoid the Misuse of Bayesian Statistics (WAMBS)-Checklist. As uninformative priors were used, the steps involving comparing informative to uninformative priors were skipped.

  1. Description of priors
  2. Examining convergence
  3. Examination of convergence after doubling the number of iterations
  4. Examine the histogram of estimates in each iteration to determine if there were enough iterations to approximate the posterior
  5. Examine the degree of autocorrelation between the chains
  6. Evaluate if the posterior distribution is sensible
  7. Ensuring the results are correctly reported and interpreted

1) Description of priors

There was limited previous research which could be used to construct informative priors. Consequently, as this was an exploratory study, generic and unformative priors were chosen. These were considered uninformative, when compared to the sample size and explanatory variable scale (all continuous variables were scaled and centered). These are generic, and so the same prior distributional form was supplied for all coefficient and intercept parameters, as there was no prior evidence suggesting a specific form, and the normal distribution was considered to be parsimonious. Consequently, all coefficient and intercepts priors were fixed to

\[ prior \sim \mathcal{N}(0,100)\ \]

where $ $ denotes the normal distribution. Additionally, the default weakly informative priors were used for the standard deviation of random effects. This was a half student-t distribution with 3 degrees of freedom, and a scale parameter of 2.5. This prior was used in all three models.

# Combine SSA and NA/ME
for (i in seq_along(1:length(mice.imp))){
  mice.imp[[i]]$W_region <- as.factor(mice.imp[[i]]$W_region)
  mice.imp[[i]]$W_region <- recode(mice.imp[[i]]$W_region , MiddleEast_NorthAfrica = "Africa_MiddleEast", Sub_SaharanAfrica = "Africa_MiddleEast")
  mice.imp[[i]]$W_region <- droplevels(mice.imp[[i]]$W_region)
  mice.imp[[i]]$W_region <- relevel(mice.imp[[i]]$W_region, "NorthAmerica")
  
}

# A function that returns complete cases for each goal, progress towards each goal, the name of that goal, anc converting back from numeric to ordered factor 
complete.goals <- function(DF, goal){
  DF_goal <- DF[complete.cases(DF[,goal]),]
  DF_goal$goal_num <- as.factor(goal) 
  DF_goal$goal_prog <- DF_goal[,goal]
  DF_goal$goal_prog <- as.ordered(DF_goal$goal_prog)
  return(DF_goal)
}

# Lists of the stacked DF (stacked by complete case of each goal) for each MI dataset
mice.imp_personal <- list_along(1:length(mice.imp))
mice.imp_people <- list_along(1:length(mice.imp))
mice.imp_nature <- list_along(1:length(mice.imp))

for (i in seq_along(1:10)){
  # Personal 
  mice.imp_personal[[i]] <- rbind(complete.goals(mice.imp[[i]], "GP_2_b" ), complete.goals(mice.imp[[i]], "GP_3_b" ),  complete.goals(mice.imp[[i]], "GP_4_b" ))
  mice.imp_personal[[i]]$goal_prog <- as.ordered(mice.imp_personal[[i]]$goal_prog)
  mice.imp_personal[[i]]$goal_prog <-  plyr::mapvalues(mice.imp_personal[[i]]$goal_prog, from = c("1", "2","3", "4", "5"), to = c("Very dissatisfied" , "Dissatisfied" , "Neutral" , "Satisfied" , "Very satisfied"))
  
  # People
  mice.imp_people[[i]] <- rbind(complete.goals(mice.imp[[i]], "GP_6_b"  ), complete.goals(mice.imp[[i]], "GP_7_b" ),  complete.goals(mice.imp[[i]],  "GP_8_b" ))
  mice.imp_people[[i]]$goal_prog <- as.ordered(mice.imp_people[[i]]$goal_prog)
  mice.imp_people[[i]]$goal_prog <-  plyr::mapvalues(mice.imp_people[[i]]$goal_prog, from = c("1", "2","3", "4", "5"), to = c("Very dissatisfied" , "Dissatisfied" , "Neutral" , "Satisfied" , "Very satisfied"))
  
  # Nature
  mice.imp_nature[[i]] <- rbind(complete.goals(mice.imp[[i]], "GP_5_b" ), complete.goals(mice.imp[[i]], "GP_9_b" ),  complete.goals(mice.imp[[i]], "GP_10_b" ))
  mice.imp_nature[[i]]$goal_prog <- as.ordered(mice.imp_nature[[i]]$goal_prog)
  mice.imp_nature[[i]]$goal_prog <-  plyr::mapvalues(mice.imp_nature[[i]]$goal_prog, from = c("1", "2","3", "4", "5"), to = c("Very dissatisfied" , "Dissatisfied" , "Neutral" , "Satisfied" , "Very satisfied"))
}
# The model formula
brm_form <- formula("goal_prog ~ goal_num + years_cons_scaled + gender + WH_scaled + position_simple + education_simple + W_region + (1|ID)")

The cumulative logit models were run with the above specified priors, for 20,000 iterations with a burn-in of 10000 iterations which were discarded, with four chains.

### Run the models for the first imputed dataset ### 
burn_in <- 10000
iterations <- 20000
# Personal 
start_time_personal <- Sys.time()
personal_model <- brm(brm_form, family = cumulative(link = "logit", threshold = "flexible"), data = mice.imp_personal[[1]], warmup = burn_in,  iter  = iterations,  chains = 4, seed = 123,  prior = prior_all)
save(personal_model, file ="personal_model.RData")
end_time_personal <- Sys.time()
end_time_personal - start_time_personal

# People 
start_time_people <- Sys.time()
people_model <- brm(brm_form, family = cumulative(link = "logit", threshold = "flexible"), data = mice.imp_people[[1]], warmup = burn_in,  iter   = iterations,  chains = 4, seed = 123,  prior = prior_all)
save(people_model, file ="people_model.RData")
end_time_people <- Sys.time()
end_time_people - start_time_people

# Nature 
start_time_nature <- Sys.time()
nature_model <- brm(brm_form, family = cumulative(link = "logit", threshold = "flexible"), data = mice.imp_nature[[1]], warmup = burn_in,  iter   = iterations,  chains = 4, seed = 123,  prior = prior_all)
save(nature_model, file ="nature_model.RData")
end_time_nature <- Sys.time()
end_time_nature - start_time_nature
# # load models run elsewhere
path2 <- c("C:/Users/wolf5246/Dropbox/Oxford/PhD/Chapter 4/Analysis/Processing/")
load(paste0(path2, "personal_model.Rdata"))
load(paste0(path2, "people_model.Rdata"))
load(paste0(path2, "nature_model.Rdata"))
# s3load(object = "personal_model.Rdata", bucket = s3BucketName)
# s3load(object = "people_model.Rdata", bucket = s3BucketName)
# s3load(object = "nature_model.Rdata", bucket = s3BucketName)

2) Examining convergence

The trace-plots for each model and chain, and for each parameter, indicate convergence. PSRF values from the Gelman-Rubin Diagnostic were also examined, checking if the upper CI was near 1, which it was for all models and parameters.

The Geweke Diagnostic, testing for equality between the first and last half of the iterations, was performed. The estimates were also plotted, examining if the values exceed the 1.96 or below -1.96 z-score boundaries. Both tests indicated the models had converged.

# Gelman-Rubin Diagnostic 
personal_modelposterior <- as.mcmc(personal_model)
#gelman.diag(personal_modelposterior[, 1:param_num])
gelman.plot(personal_modelposterior[, 1:param_num])

#  Geweke diagnostic
#geweke.diag(personal_modelposterior[,1:param_num])
geweke.plot(personal_modelposterior[, 1:param_num])

# Gelman-Rubin Diagnostic 
people_modelposterior <- as.mcmc(people_model)
#gelman.diag(people_modelposterior[, 1:param_num])
gelman.plot(people_modelposterior[, 1:param_num])

# Geweke diagnostic
#geweke.diag(people_modelposterior[,1:param_num])
geweke.plot(people_modelposterior[, 1:param_num])

# Gelman-Rubin Diagnostic 
nature_modelposterior <- as.mcmc(nature_model)
#gelman.diag(nature_modelposterior[, 1:param_num])
gelman.plot(nature_modelposterior[, 1:param_num])

# Geweke diagnostic
#geweke.diag(nature_modelposterior[, 1:param_num])
geweke.plot(nature_modelposterior[, 1:param_num])

3) Examination of convergence after doubling the number of iterations

To check that the models had not simply reached local convergence, the model was re-run with twice the number of iterations, although we had to reduce the number of chains because the object was very large. We also re-examined the Gelman-Rubin and Geweke diagnostics. There appeared to be no indication of local convergence.

### Run the models for the first imputed dataset ### 
burn_in <- 10000
iterations_double <- 30000
# Personal 
personal_model_double <- brm(brm_form, family = cumulative(link = "logit", threshold = "flexible"), data = mice.imp_personal[[1]], warmup = burn_in,  iter  = iterations_double,  chains = 2, seed = 123,  prior = prior_all)
save(personal_model_double, file ="personal_model_double.RData")

# People 
people_model_double <- brm(brm_form, family = cumulative(link = "logit", threshold = "flexible"), data = mice.imp_people[[1]], warmup = burn_in,  iter   = iterations_double,  chains = 2, seed = 123,  prior = prior_all)
save(people_model_double, file ="people_model_double.RData")

# Nature 
nature_model_double <- brm(brm_form, family = cumulative(link = "logit", threshold = "flexible"), data = mice.imp_nature[[1]], warmup = burn_in,  iter   = iterations_double,  chains = 2, seed = 123,  prior = prior_all)
save(nature_model_double, file ="nature_model_double.RData")
# I have to clear everything for this chunk to run 
rm(list= ls())

# iterations 
burn_in <- 10000
iterations_double <- 30000
param_num <- 21

# load models run elsewhere
path2 <- c("C:/Users/wolf5246/Dropbox/Oxford/PhD/Chapter 4/Analysis/Processing/")
load(paste0(path2, "personal_model_double.Rdata"))
# s3load(object = "personal_model_double.Rdata", bucket = s3BucketName)
# s3load(object = "people_model_double.Rdata", bucket = s3BucketName)
# s3load(object = "nature_model_double.Rdata", bucket = s3BucketName)

# Gelman-Rubin Diagnostic 
personal_model_posterior_d <- as.mcmc(personal_model_double) 
#gelman.diag(personal_model_posterior_d[, 1:param_num])
gelman.plot(personal_model_posterior_d[, 1:param_num])

# Gelman-Rubin Diagnostic 
people_model_posterior_d <- as.mcmc(people_model_double) 
#gelman.diag(people_model_posterior_d[, 1:param_num])
gelman.plot(people_model_posterior_d[, 1:param_num])

# Gelman-Rubin Diagnostic 
nature_model_posterior_d <- as.mcmc(nature_model_double) 
#gelman.diag(nature_model_posterior_d[, 1:param_num])
gelman.plot(nature_model_posterior_d[, 1:param_num])

4) Examine the histogram of estimates in each iteration to determine if there were enough iterations to approximate the posterior

Examining a histogram of posterior distribution density indicates if it contains enough information. Regression coefficients do not have to be symetric, but should form a peak with sloping sides.

5) Examine the degree of autocorrelation between the chains

The autocorrelation between chains was examined. High autocorrelation alone is not an issue, but if accompanied by poor convergence, might be indicative of several problems.

6) Evaluate if the posterior distribution is sensible

The posterior distributions were checked to see if they are sensible. This included checking they were smooth, did not have a posterior SD greater than the scale of the original parameter, and the range of the posterior CI was not greater than the scale of the original parameter.

# Personal 
mcmc_plot(personal_model, type = "dens", pars = "^b_")

# People 
mcmc_plot(people_model, type = "dens", pars = "^b_")

# Nature 
mcmc_plot(nature_model, type = "dens", pars = "^b_")

Next steps