Dissertation Code

Work Summary

Open ended question
Twenty Words
Nine Definitions
Frameworks & Rankings within them
Should there be a definition? Why?
Conservation only Questions

Loading In Code

working_directory <- getwd()
setwd(working_directory)
getwd()

#install.packages("example") #For installing packages you don't have
library(tidyverse)
library(vegan)

# For the PCR analysis on data camp
library(factoextra)

library(ordinal) #Testing For difference in word order 

library(tm) #For text mining 
library(tidytext) 

# For nice looking graph when comparing words
library(ggrepel)

# From advice from CLM
library(ggeffects)

# For the means and CIs in the graphs
library(Hmisc)

starting_data_that_i_read_in_1 <- read.csv("(1) Edited Data.csv")
# view(starting_data_that_i_read_in_1)

starting_data_that_i_read_in_2 <- read.csv("(2) Edited Data.csv")
# view(starting_data_that_i_read_in_2)

changed_open_ended_questions <- read.csv("open_worded_responses.csv")
# view(changed_open_ended_questions)

Data manipulation

# Adding an extra collumn to make it is clear where each answer came from
starting_data_with_a_1 <- starting_data_that_i_read_in_1
starting_data_with_a_1$questionnaire <- "a"

starting_data_with_b_2 <- starting_data_that_i_read_in_2
starting_data_with_b_2$questionnaire <- "b"

first_combined_df_pre_edit <- bind_rows(starting_data_with_a_1, starting_data_with_b_2)
# View(first_dataframe_pre_edit)

# Now changing columns to make it more manageable
first_dataframe_pre_edit <- first_combined_df_pre_edit

# Changing the long definitions to very quick abbreviations
long_defintions <- names(first_dataframe_pre_edit)[24:32]
def_names <- c("def1", "def2", "def3", "def4", "def5", "def6", "def7","def8","def9")
names(first_dataframe_pre_edit)[names(first_dataframe_pre_edit) %in% long_defintions] <- def_names

# Changing the column names of the AW or C questions
what_you_do_for_work <- names(first_dataframe_pre_edit)[45:46]
work_names <- c("work_scale", "work_absolute")
names(first_dataframe_pre_edit)[names(first_dataframe_pre_edit) %in% what_you_do_for_work] <- work_names

first_dataframe_pre_edit <- first_dataframe_pre_edit %>% 
  rename(should.there.be.a.def = Do.you.believe.there.should.be.one.definition.of.animal.welfare.that.everyone.refers.to.)

# Putting the changed responses into the definitions columns (See OneNote for changes)
first_dataframe_pre_edit$define.animal.welfare <- changed_open_ended_questions$define.animal.welfare

first_dataframe <- first_dataframe_pre_edit

# Getting rid of the wording for "7"
first_dataframe[3:22] <- lapply(first_dataframe[3:22], function(x) {
  x <- as.character(x)
  replace(x, x == "7 - SHOULD be included", 7)
})

# Getting rid of the wording for "1"
# Literally no idea why I cannot just use the code above, think it has something to do with the formatting of Microsoft Forms, how peculiar 
first_dataframe[3:22] <- lapply(first_dataframe[3:22], function(y) {
  y <- as.character(y)
  y[grepl("^1\\s*-\\s*Should\\s+NOT\\s+be\\s+included", y)] <- 1
  y
})

# Getting rid of the wording for "1 - entirely conservation" on the work_scale column
# Getting rid of the wording for "9 - entirely animal welfare" on the work_scale column
# Getting rid of the wording for "Not applicable" on the work_scale column
first_dataframe[45] <- lapply(first_dataframe[45], function(y) {
 y <- as.character(y)
  y[grepl("^1\\s*-\\s*entirely\\s+conservation", y)] <- 1
  y
})

first_dataframe[45] <- lapply(first_dataframe[45], function(y) {
 y <- as.character(y)
  y[grepl("^9\\s*-\\s*entirely\\s+animal\\s+welfare", y)] <- 9
  y
})

first_dataframe[45] <- lapply(first_dataframe[45], function(y) {
 y <- as.character(y)
  y[grepl("s*Not\\s+applicable", y)] <- 0
  y
})

#Other biological field (e.g. Ecology, Behaviour, Disease)
first_dataframe["work_absolute"] <- lapply(first_dataframe["work_absolute"], function(y) {
 y <- as.character(y)
  y[grepl("^Other\\sbiological\\sfield \\(e\\.g\\. Ecology, Behaviour, Disease\\)$", y)] <- "Other biological field"
  y
})

#Getting rid of non-breaking spaces in the work_absolute collumn
first_dataframe["work_absolute"] <- lapply(first_dataframe["work_absolute"],function(y) {
  y <- as.character(y)
  y <- gsub("\u00A0", " ", y)   #replaces non-breaking space with normal space
  y <- trimws(y)               #remove leading/trailing spaces
  y
})


# Creating a vector with all the word names
word_col_names <- c("physical", "function.", "sentience", "consciousness", 
                    "wellbeing", "feelings", "stress", "fitness", "suffering", 
                    "happiness", "moral", "cope", "choice", "mental", "want",
                    "ethical", "natural", "health", "need", "distress")

# Turning all of the data withihn the 20 words to factors as ordinal data
first_dataframe[word_col_names] <- lapply(first_dataframe[word_col_names], factor, ordered = TRUE)
# is.factor(first_dataframe[[4]]) #Checking this 

# Turning all of the data withihn the 9 definitions to factors as ordinal data
# Added the "levels = 1:7" as one of the defs only had 6 levels
first_dataframe[def_names] <- lapply(first_dataframe[def_names], factor, levels = 1:7, ordered = TRUE)
# is.factor(first_dataframe[[27]]) #Checking this


# Splitting the 'continuous' data up into four distinct categories 
first_dataframe$work_scale <- lapply(first_dataframe$work_scale, as.numeric)
first_dataframe$size_group <- with(first_dataframe,
  ifelse(work_scale %in% 1:3, "conservation",
  ifelse(work_scale %in% 4:6, "middle",
  ifelse(work_scale %in% 7:9, "animal welfare", "not applicable"
         ))))


# is.character(first_dataframe$experience) #Checking if the data is character
# Merge the three groups
first_dataframe$experience[first_dataframe$experience %in% c("Less than one year", "1-4 years", "5-9 years")] <- "Less than 10 years"

# Convert back to factor
first_dataframe$experience <- factor(first_dataframe$experience)

table(first_dataframe$experience)

first_dataframe$experience <- factor(first_dataframe$experience,
                         levels = c("Less than 10 years",
                                    "10 or more years"),
                         ordered = TRUE) #Use to order the data

# Turning the data into a factor
first_dataframe$work_absolute <- as.factor(first_dataframe$work_absolute)
first_dataframe$experience <- as.factor(first_dataframe$experience)

# Changing the name of the these collumns
names(first_dataframe)[names(first_dataframe) == "If.you.answered.no..why.do.you.believe.there.should.not.be.one.definition.of.animal.welfare.."] <- "no.def.question"

names(first_dataframe)[names(first_dataframe) == "If.you.answered.yes..why.do.you.believe.there.should.be.one.definition.of.animal.welfare."] <- "yes.def.question"

#Changing the names to make it more managable
names(first_dataframe)[names(first_dataframe) == "Do.you.think.animal.welfare.has.helped.or.hindered.your.research.in.conservation."] <- "solo.help.hindered.conservation"
names(first_dataframe)[names(first_dataframe) == "Do.you.think.animal.welfare.has.helped.or.hindered.the.practice.of.conservation."] <- "all.help.hindered.conservation"

Making the MCQs binary

#This involves the frameworks questions and the three demographic questions taxa, continents and animal
#Removing all the non-breaking spaces and then making everything lowercase for the three different MCQs
first_dataframe <- first_dataframe %>%
  mutate(taxa = gsub("\u00A0", "", tolower(taxa))) %>%
  mutate(continent = gsub("\u00A0", "", tolower(continent))) %>%
  mutate(animal = gsub("\u00A0", "", tolower(animal))) %>%
  mutate(frameworks = gsub("\u00A0", "", tolower(frameworks))) %>% 
  mutate(yes.def.question = gsub("\u00A0", "", tolower(yes.def.question))) %>% 
  mutate(no.def.question = gsub("\u00A0", "", tolower(no.def.question)))


# Creating a table where I can put my data in 
continent_df <- as.data.frame(matrix(NA, nrow = 105, ncol = 8))

colnames(continent_df) <- c("africa", 
                     "antarctica", 
                     "asia", 
                     "europe",
                     "north america", 
                     "south america",
                     "australia and oceania", 
                     "prefer not to say")

# make column names syntactically safe
colnames(continent_df) <- make.names(colnames(continent_df))

# loop over each row and fill 1/0 depending on whether option appears
for(i in seq_len(nrow(first_dataframe))) {
  # get this row's response
  response <- first_dataframe$continent[i]
  # split on semicolon and trim spaces
  choices <- str_trim(unlist(strsplit(response, ";")))
  # make syntactically consistent with column names
  choices <- make.names(choices)
  # assign 1 if chosen, 0 otherwise
  continent_df[i, ] <- ifelse(colnames(continent_df) %in% choices, 1, 0)
}

# Adding another collumn adding up the 1s and 0s
continent_df$total_selected <- rowSums(continent_df)



# Creating a table where I can put my data in 
taxa_df <- as.data.frame(matrix(NA, nrow = 105, ncol = 14))

colnames(taxa_df) <- c("invertebrates(excludingthetwolistedabove)", 
                     "mammals", 
                     "birds",
                     "fish", 
                     "reptiles",
                     "amphibians",
                     "arthropods",
                     "cephalopods",
                     "various",
                     "nonespecific",
                     "plants",
                     "workisnottaxaspecific",
                     "prefer not to say")

# make column names syntactically safe
colnames(taxa_df) <- make.names(colnames(taxa_df))

# loop over each row and fill 1/0 depending on whether option appears
for(i in seq_len(nrow(first_dataframe))) {
  # get this row's response
  response <- first_dataframe$taxa[i]
  # split on semicolon and trim spaces
  choices <- str_trim(unlist(strsplit(response, ";")))
  # make syntactically consistent with column names
  choices <- make.names(choices)
  # assign 1 if chosen, 0 otherwise
  taxa_df[i, ] <- ifelse(colnames(taxa_df) %in% choices, 1, 0)
}

# Changing row 53 because of what they put in the "other" section
# Currentlyonlymammals,previouslysevenyearswithmammals,birds,reptilesandamphibians;
# Not sure wether to turn that into an "other" collumn or put it into all four

# create new combined column (row-wise sum of four columns)
taxa_df$other <- rowSums(taxa_df[, c("plants", "workisnottaxaspecific", "nonespecific", "various")], na.rm = TRUE)

# delete the original four columns
taxa_df <- taxa_df[, !(names(taxa_df) %in% c("plants", "workisnottaxaspecific", "nonespecific", "various"))]

#Changing "other" due to row 53, can change this later after talk with supervisor 
taxa_df[53, "other"] <- taxa_df[53, "other"] + 1

# Adding another collumn adding up the 1s and 0s
taxa_df$total_selected <- rowSums(taxa_df)



# # Finding the "other" collumn so I can write it below 
# unique(first_dataframe$animal)
first_dataframe$animal <- gsub(",", "", first_dataframe$animal)

# Creating a table where I can put my data in 
animal_df <- as.data.frame(matrix(NA, nrow = 105, ncol = 7))

colnames(animal_df) <- c("wild animals in the wild", 
                     "farmanimals", 
                     "animals in zoos and aquariums", 
                     "domesticated pets",
                     "animals in a laboratory setting",
                     "prefer not to say",
                     "other")

# make column names syntactically safe
colnames(animal_df) <- make.names(colnames(animal_df))

# loop over each row and fill 1/0 depending on whether option appears
for(i in seq_len(nrow(first_dataframe))) {
  # get this row's response
  response <- first_dataframe$animal[i]
  # split on semicolon and trim spaces
  choices <- str_trim(unlist(strsplit(response, ";")))
  # remove commas
  choices <- gsub(",", "", choices)
  # make syntactically consistent with column names
  choices <- make.names(choices)
  # assign 1 if chosen, 0 otherwise
  animal_df[i, ] <- ifelse(colnames(animal_df) %in% choices, 1, 0)
}

# Row: 9, 13, 18, 21, 34, 43, 50, 53, 68, 71, 73, 89, 104, 105 are "other" (manual)
# Don't know how to do this with code and don't care
# Adding a 1 to all the rows listed above
animal_df[c(9, 13, 18, 21, 34, 43, 50, 53, 68, 71, 73, 89, 104, 105), "other"] <- 
  animal_df[c(9, 13, 18, 21, 34, 43, 50, 53, 68, 71, 73, 89, 104, 105), "other"] + 1

# Adding another collumn adding up the 1s and 0s
animal_df$total_selected <- rowSums(animal_df)


# Creating a table where I can put my data in 
frameworks_binary_df <- as.data.frame(matrix(NA, nrow = 105, ncol = 9))

framework_names <- c("five freedoms",
                      "five.domains",
        "three.orientations..biological.functioning..affective.state..natural.living.",
                      "dynamic.animal.welfare.concept..dawcon.",
                      "X3rs..replacement..reduction..refinement.",
                      "quality.of.life",
                      "life.worth.living",
                      "welfare.quality..project",
                      "i.have.not.heard.of.any.of.these")

colnames(frameworks_binary_df) <- c(framework_names)

# make column names syntactically safe
colnames(frameworks_binary_df) <- make.names(colnames(frameworks_binary_df))

# loop over each row and fill 1/0 depending on whether option appears
for(i in seq_len(nrow(first_dataframe))) {
  # get this row's response
  response <- first_dataframe$frameworks[i]
  # split on semicolon and trim spaces
  choices <- str_trim(unlist(strsplit(response, ";")))
  # make syntactically consistent with column names
  choices <- make.names(choices)
  # assign 1 if chosen, 0 otherwise
  frameworks_binary_df[i, ] <- ifelse(colnames(frameworks_binary_df) %in% choices, 1, 0)
}

# Adding another collumn adding up the 1s and 0s EXCEPT THE 'NONE' collumn
frameworks_binary_df$total_selected <- rowSums(
  frameworks_binary_df[, !names(frameworks_binary_df) %in% c("i.have.not.heard.of.any.of.these")]
)

# Getting rid of the really long name to make future code look nicer
frameworks_binary_df <- rename(frameworks_binary_df, "three.orientations" = "three.orientations..biological.functioning..affective.state..natural.living.")


# Creating a table where I can put my data in 
def_yes_df <- as.data.frame(matrix(NA, nrow = 105, ncol = 6))

colnames(def_yes_df) <- c("not.applicable", 
                     "a definition eliminates ambiguity", 
                     "allows for more effective communication and collaboration", 
                     "informs decision making",
                     "do not know", 
                     "other")

# make column names syntactically safe
colnames(def_yes_df) <- make.names(colnames(def_yes_df))

# loop over each row and fill 1/0 depending on whether option appears
for(i in seq_len(nrow(first_dataframe))) {
  # get this row's response
  response <- first_dataframe$yes.def.question[i]
  # split on semicolon and trim spaces
  choices <- str_trim(unlist(strsplit(response, ";")))
  # make syntactically consistent with column names
  choices <- make.names(choices)
  # assign 1 if chosen, 0 otherwise
  def_yes_df[i, ] <- ifelse(colnames(def_yes_df) %in% choices, 1, 0)
}

# Africa: 20, Europe: 11, N.America: 18, A&O: 13, 1or2: 30, 3 or more: 13
continent_df <- continent_df %>%
  mutate(final_group_continent = case_when(
    prefer.not.to.say == 1  ~ "prefer_not_to_say",
    africa == 1 & total_selected == 1 ~ "africa",
    (europe == 1 & total_selected == 1) | 
    (asia == 1 & total_selected == 1) | 
    (europe == 1 & asia == 1 & total_selected == 2) ~ "eurasia",
    (north.america == 1 & total_selected == 1) | 
    (south.america == 1 & total_selected == 1) | 
    (north.america == 1 & south.america == 1 & total_selected == 2) ~ "the_americas",
    australia.and.oceania == 1 & total_selected == 1 ~ "australia_and_oceania",
    TRUE            ~ "other"
  ))


# 66 just mammals, 25 mammals and other, 14 non-mammals
taxa_df <- taxa_df %>%
  mutate(final_group_taxa = case_when(
    prefer.not.to.say == 1  ~ "prefer_not_to_say",
    mammals == 1 ~ "mammals",
    TRUE            ~ "non_mammals"
  ))

# wild animals in the wild: 37, other singles: 28, 2+: 40
animal_df <- animal_df %>%
  mutate(final_group_animal = case_when(
    prefer.not.to.say == 1  ~ "prefer_not_to_say",
    wild.animals.in.the.wild == 1 & total_selected == 1 ~ "wild_animals",
    wild.animals.in.the.wild == 1 ~ "wild_animals_and_captive",
    TRUE            ~ "just captive"
  ))

# Binding the three above collumns to first dataframe
first_dataframe$final_group_animal <- animal_df$final_group_animal
first_dataframe$final_group_taxa <- taxa_df$final_group_taxa
first_dataframe$final_group_continent <- continent_df$final_group_continent

Open-ended Question

Analysis

# My dictionary of words, it makes the words on the left equivalent to those on the right. This is needed for both the word cloud and scatter plot (and related analysis) as some words have differnt spelling, some are missplet, some are plural etc. There are currenlty 66 words that are substituted.
dictionary <- c(
  "affectives" = "affective",
  "behave" = "behaviour",
  "behavior" = "behaviour",
  "behaviors" = "behaviour",
  "behaviours" = "behaviour",
  "behavioural" = "behaviour",
  "caring" = "care",
  "cared" = "caring",
  "captivity" = "captive",
  "concepts" = "concept",
  "conditions" = "condition",
  "coping" = "cope",
  "dies" = "death",
  "emotionally" = "emotional",
  "ensures" = "ensuring",
  "ensured" = "ensuring",
  "environmental" = "environment",
  "experiences" = "experience",
  "experienced" = "experience",
  "experiencing" = "experience",
  "experiential" = "experience",
  "feeling" = "feelings",
  "freedom" = "freedoms",
  "functioning" = "function",
  "healthy" = "health",
  "humans" = "human",
  "impacts" = "impact",
  "impacted" = "impact",
  "includes" = "including",
  "interacting" = "interaction",
  "interactions" = "interaction",
  "interventions" = "intervention",
  "includes" = "including",
  "includes" = "including",
  "individuals" = "individual",
  "lives" = "live",
  "lived" = "live",
  "living" = "live",
  "mentally" = "mental",
  "minimizing" = "minimising",
  "minimal" = "minimising",
  "minimises" = "minimising",
  "minimization" = "minimising",
  "minimum" = "minimising",
  "naturalness" = "natural",
  "nutritious" = "nutrition",
  "physically" = "physical",
  "phsiological" = "physiological",
  "physiology" = "physiological",
  "relation" = "relates",
  "psychologically" = "psychological",
  "reflections" = "reflection",
  "reflects" = "reflection",
  "related" = "relation",
  "relates" = "relation",
  "respects" = "respect",
  "safe" = "saftey",
  "scientific" = "science",
  "stressors" = "stress",
  "treat" = "treatment",
  "treated" = "treatment",
  "welbeing" = "wellbeing",
  "wellness" = "wellbeing"
)

# Creating a new dataframe with just the open-ended data in it
open_ended_data <- data.frame(full_answer = first_dataframe$define.animal.welfare)

# Grabbing data from first dataframe
content_analysis_df <- first_dataframe[, c(1, 2, 46)]

tokens <- content_analysis_df %>%
  select("id", "work_absolute", "define.animal.welfare") %>%
  unnest_tokens(word, define.animal.welfare) # one word per row, lowercased, punctuation removed

tokens$work_absolute <- make.names(tokens$work_absolute) # special code that will be handy later

# removing empty tokens or pure numbers
tokens <- tokens %>% filter(word != "", !str_detect(word, "^\\d+$"))

# Removing the "filler words"
tokens_basic_words_removed <- tokens %>%
  anti_join(stop_words, by = c("word" = "word"))

# Counting how many times each word appears
word_counts <- tokens_basic_words_removed %>%
  dplyr::count(work_absolute, word, sort = TRUE)

# Removing animal and welfare from the count
word_counts <- word_counts[!grepl("animal|welfare|define|definition", word_counts$word, ignore.case = TRUE), ]

# Removing basic words or names that I do not need for analysis
remove_words <- c("al", "e.g", "i.e", "duncan's", "ian", "fraser","frasers", "fraser's", 
                  "it's", "mellors", "pre", "r's", "that's")
# Removing the above words
word_counts <- word_counts %>%
  filter(!word %in% remove_words)

# Adding the effects of the above dictonary and combining their counts into one
word_counts <- word_counts %>%
  mutate(word = dplyr::recode(word, !!!dictionary)) %>% 
  aggregate(n ~ work_absolute + word, data = ., sum)

# Creating all possible combinations of work_absolute × word
word_counts_complete <- word_counts %>%
  complete(work_absolute, word, fill = list(n = 0))  %>%
  mutate(word = as.factor(word))


# Filtering out all words that appear less than 5 times in the highest group
word_counts_complete <- word_counts_complete %>%
  filter(work_absolute %in% c("Animal.Welfare", "Conservation")) %>%
  group_by(word) %>%
  filter(max(n) >= 5) %>%
  ungroup()

# Trying to make the scatter plot between words that I have envisoned
# Turing it W I D E
word_count_wide <- pivot_wider(
  word_counts_complete,
  names_from = work_absolute,
  values_from = n)

# Making the graph as a percantage of words used by each group
# calculate total words per group
group_totals <- aggregate(n ~ work_absolute, data = word_counts_complete, sum)

# merge totals back into main data
word_percent <- merge(word_counts_complete, group_totals,
                    by = "work_absolute",
                    suffixes = c("", "_total"))

# Calculating percentage
word_percent$percent <- word_percent$n / word_percent$n_total

word_wide_percent <- reshape(word_percent[, c("work_absolute", "word", "percent")],
                           idvar = "word",
                           timevar = "work_absolute",
                           direction = "wide")

# Making a percentage plot but this one is by number of respondants
n_aw <- 36  
n_c <- 45

plot_data <- word_counts_complete %>%
  filter(work_absolute %in% c("Animal.Welfare", "Conservation")) %>%
  pivot_wider(names_from = work_absolute, values_from = n, values_fill = 0) %>%
  mutate(
    freq_welfare = (Animal.Welfare) / n_aw,
    freq_conservation = (Conservation) / n_c
  )

# Sensitive words I maybe want to get rid of
sensitive_words <- c("treatment", "positive", "natural", "freedoms")
# Useless words that I maybe want to get rid of
useless_words <- c("ensuring", "including", "negative", "positive")

plot_data_clean <- plot_data %>%
  filter(!word %in% useless_words)

ggplot(data = plot_data_clean, aes(x = freq_welfare, y = freq_conservation)) +
 geom_point(size = 2, alpha = 0.7) +
  geom_text_repel(aes(label = word),
                  max.overlaps = Inf,
                  box.padding = 0.4,   
                  size = 3,
                  min.segment.length = 0.1,
                  segment.size = 0.3,    
                  segment.color = "grey50") + 
  coord_fixed() +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
  labs(x = "Word frequency (Animal Welfare)",
       y = "Word frequency (Conservation)") +
  theme_minimal() +
  theme(text = element_text(family = "Times New Roman"))

cor.test(plot_data_clean$Animal.Welfare, plot_data_clean$Conservation, method = "spearman")

#tokens_basic_words_removed has id, group and word but does not have the words grouped together
tokens_basic_words_removed <- tokens_basic_words_removed %>%
  mutate(word = dplyr::recode(word, !!!dictionary))


# Filtering to only Animal.Welfare and Conservation 
df <- tokens_basic_words_removed %>%
  filter(work_absolute %in% c("Animal.Welfare", "Conservation"))
  # group_by(word, work_absolute) %>%
  # filter(n() > 2) %>%
  # ungroup()

# Baseline log ratio 
baseline <- df %>%
  group_by(work_absolute, word) %>%
  summarise(n = n(), .groups = "drop") %>%
  pivot_wider(names_from = work_absolute, values_from = n, values_fill = 0) %>%
  mutate(ratio_baseline = log2((Animal.Welfare + 0.5) / (Conservation + 0.5))) %>% 
  group_by(word) %>%
  filter(max(Animal.Welfare, Conservation) >= 5) %>%
  ungroup()

# Leave-one-out loop 
respondents <- unique(df$id)

loo_results <- map_dfr(respondents, function(rid) {
  df %>%
    filter(id != rid) %>%
    group_by(work_absolute, word) %>%
    summarise(n = n(), .groups = "drop") %>%
    pivot_wider(names_from = work_absolute, values_from = n, values_fill = 0) %>%
    mutate(
      ratio_loo  = log2((Animal.Welfare + 0.5) / (Conservation + 0.5)),
      removed_id = rid
    )
})

# Joining and compute influence
influence <- loo_results %>%
  left_join(baseline %>% select(word, ratio_baseline), by = "word") %>%
  mutate(delta = abs(ratio_loo - ratio_baseline)) %>%
  group_by(word) %>%
  summarise(
    max_influence    = max(delta),
    mean_influence   = mean(delta),
    most_influential = removed_id[which.max(delta)],
    .groups = "drop"
  ) %>%
  left_join(baseline, by = "word")

# Flagging fragile words 
# 0.5 on log2 scale ≈ one group using a word ~40% more than the other
# Adjust this threshold up or down to be more/less strict
influence <- influence %>%
  mutate(fragile = max_influence > 0.5)

# Summarising table of fragile words 
influence %>%
  filter(fragile) %>%
  arrange(desc(max_influence)) %>%
  select(word, ratio_baseline, max_influence, most_influential) %>%
  print()

Statistical Tests & Graphs- Words&Definitions

Likert graph for the 20 words

############################# Just AW responses
#Creating a graph shownig just AW responses
likert.20.aw <- first_dataframe[first_dataframe$work_absolute == "Animal Welfare", c("id", word_col_names)]

# Making the data LONG
likert.20.aw.long <- likert.20.aw %>%
  pivot_longer(
    cols = 2:21,
    names_to = "word",
    values_to = "answer")

likert.20.aw.long <- likert.20.aw.long %>%
  count(word, answer) %>%
  group_by(word) %>%
  mutate(prop = n / sum(n)) %>% # This is the percent working out
  ungroup()

############################# Just C responses
#Creating a graph shownig just AW responses
likert.20.c <- first_dataframe[first_dataframe$work_absolute == "Conservation", c("id", word_col_names)]

# Making the data LONG
likert.20.c.long <- likert.20.c %>%
  pivot_longer(
    cols = 2:21,
    names_to = "word",
    values_to = "answer")

likert.20.c.long <- likert.20.c.long %>%
  count(word, answer) %>%
  group_by(word) %>%
  mutate(prop = n / sum(n)) %>% # This is the percent working out
  ungroup()


############################## Just Other responses
likert.20.other <- first_dataframe[first_dataframe$work_absolute == "Other biological field", c("id", word_col_names)]

# Making the data LONG
likert.20.other.long <- likert.20.other %>%
  pivot_longer(
    cols = 2:21,
    names_to = "word",
    values_to = "answer")

likert.20.other.long <- likert.20.other.long %>%
  count(word, answer) %>%
  group_by(word) %>%
  mutate(prop = n / sum(n)) %>% # This is the percent working out
  ungroup()
# Function for making the data long in order to put it into ggplot
prepare_likert_long <- function(df, word_cols) {
  df %>%
    select(id, all_of(word_cols)) %>%
    pivot_longer(
      cols = -id,
      names_to = "word",
      values_to = "answer"
    ) %>%
    count(word, answer) %>%
    group_by(word) %>%
    mutate(prop = n / sum(n)) %>%
    ungroup()
}


# Function for making the ggplot likert graph
plot_likert_gg <- function(df, word_cols, word_order = "positive", y.label = "Word") {
  
  # Working out the proportions
  long_data <- df %>%
    select(id, all_of(word_cols)) %>%
    pivot_longer(cols = -id, names_to = "word", values_to = "answer") %>%
    mutate(answer = as.numeric(as.character(answer))) %>%
    count(word, answer) %>%
    group_by(word) %>%
    mutate(prop = n / sum(n) * 100) %>%
    ungroup()
  
  # Computing the percentage labels 
  label_data <- long_data %>%
    mutate(group = case_when(
      answer %in% 1:3 ~ "negative",
      answer == 4     ~ "neutral",
      answer %in% 5:7 ~ "positive"
    )) %>%
    group_by(word, group) %>%
    summarise(pct = sum(prop), .groups = "drop")
  
  # For the positive/neutral/negative percentages
  plot_data <- long_data %>%
    mutate(answer = factor(answer, levels = 1:7)) %>%
    group_by(word) %>%
   mutate(neutral_half = {
     nh <- prop[answer == 4]
     if (length(nh) == 0) 0 else nh / 2
     }) %>%
    ungroup() %>%
    mutate(
      side = case_when(
        answer %in% 1:3 ~ "negative",
        answer == 4     ~ "neutral",
        answer %in% 5:7 ~ "positive"
      )
    ) %>%
    group_by(word, side) %>%
    mutate(
      answer_ordered = if_else(
        side == "negative",
        factor(answer, levels = c("3","2","1")),
        factor(answer, levels = c("1","2","3","4","5","6","7"))
      )
    ) %>%
    arrange(word, side, answer_ordered) %>%
    mutate(
      cumulative = cumsum(prop),
      cumulative_lag = lag(cumulative, default = 0)
    ) %>%
    ungroup() %>%
    mutate(
      neutral_half = sapply(seq_len(n()), function(i) {
        nh <- neutral_half[word == word[i] & side == "neutral"]
        if(length(nh) == 0) neutral_half[i] else nh[1]
      }),
      xmin = case_when(
        side == "negative" ~ -(cumulative     + neutral_half),
        side == "neutral"  ~ -prop / 2,
        side == "positive" ~ cumulative_lag  + neutral_half
      ),
      xmax = case_when(
        side == "negative" ~ -(cumulative_lag + neutral_half),
        side == "neutral"  ~  prop / 2,
        side == "positive" ~ cumulative      + neutral_half
      )
    )
  
  # Ordering words by positive percentage 
   if (length(word_order) == 1 && word_order == "positive") {
 ordered_words <- label_data %>%
  filter(group == "positive") %>%
  right_join(               
    distinct(label_data, word), 
    by = "word"
  ) %>%
  mutate(pct = replace_na(pct, 0)) %>%  
  arrange(pct) %>%
  pull(word)
 } else {
   ordered_words <- word_order
 }
   
 
  plot_data  <- plot_data  %>% mutate(word = factor(word, levels = ordered_words))
  label_data <- label_data %>% mutate(word = factor(word, levels = ordered_words))
  
  # Colours for the likert 
  likert_colours <- c(
    "1" = "#C0392B",
    "2" = "#E08080",
    "3" = "#F4B8B8",
    "4" = "#D3D3D3",
    "5" = "#B8C8F4",
    "6" = "#6080E0",
    "7" = "#2B3EC0"
  )
  
  # Plotting the graph
  ggplot(plot_data) +
    geom_rect(aes(xmin = xmin, xmax = xmax,
                  ymin = as.numeric(factor(word, levels = ordered_words)) - 0.4,
                  ymax = as.numeric(factor(word, levels = ordered_words)) + 0.4,
                  fill = answer)) +
    # Negative % label (left)
    geom_text(
      data = filter(label_data, group == "negative"),
      aes(x = -105, y = word, label = paste0(round(pct), "%")),
      colour = "#C0392B", fontface = "bold", size = 3, hjust = 1
    ) +
    # Positive % label (right)
    geom_text(
      data = filter(label_data, group == "positive"),
      aes(x = 105, y = word, label = paste0(round(pct), "%")),
      colour = "#2B3EC0", fontface = "bold", size = 3, hjust = 0
    ) +
    geom_vline(xintercept = 0, linetype = "dashed", colour = "grey50") +
    scale_fill_manual(values = likert_colours, name = "Ratings:") +
    scale_x_continuous(
      breaks = c(-100, -50, 0, 50, 100),
      labels = c("100", "50", "0", "50", "100"),
      limits = c(-120, 120)
    ) +
    scale_y_discrete(limits = ordered_words) +
    labs(x = "percentage", y = y.label) +
    theme_minimal() +
    theme(
      plot.title    = element_text(face = "bold", hjust = 0.5),
      legend.position = "bottom",
      legend.margin = margin(t = -8),
      panel.grid.major.y = element_blank(),
      text = element_text(family = "Times New Roman"),
      axis.text.y = element_text(size = 12)) +
    guides(fill = guide_legend(nrow = 1, keywidth = 0.9, keyheight = 0.9))
}

# A list of the order the 20 words go in when they show up on the AW list
aw.word.order.likert <- c("mental", "physical", "health", "need", "wellbeing", "choice", "feelings", "want", "sentience", "suffering", "stress", "natural", "function.", "happiness", "consciousness", "fitness", "distress", "cope", "ethical", "moral")

# Plotting for AW and C for the TWENTY WORDS
plot_likert_gg(likert.20.aw, word_col_names, "positive")
plot_likert_gg(likert.20.c, word_col_names, word_order = rev(aw.word.order.likert))
plot_likert_gg(likert.20.other, word_col_names, word_order = rev(aw.word.order.likert))

Likert graph for the 9 definitions

################ Animal Welfare
likert.9.aw <- first_dataframe[first_dataframe$work_absolute == "Animal Welfare",
                               c("id", def_names)]
likert.9.aw.long <- prepare_likert_long(likert.9.aw, def_names)

################ Conservation
likert.9.c <- first_dataframe[first_dataframe$work_absolute == "Conservation",
                               c("id", def_names)]
likert.9.c.long <- prepare_likert_long(likert.9.all, def_names)

################ Other
likert.9.other <- first_dataframe[first_dataframe$work_absolute == "Other biological field",
                               c("id", def_names)]
likert.9.other.long <- prepare_likert_long(likert.9.other, def_names)

# A list of the order the 9 defs go in when they show up on the AW list
aw.def.order.likert <- c("def6","def5","def2","def9","def3","def8","def4","def7","def1")
# Plotting for AW and C for the NINE DEFINTIONS
plot_likert_gg(likert.9.aw, def_names,"positive", "Definition")
plot_likert_gg(likert.9.c, def_names, word_order = rev(aw.def.order.likert), "Definition")
plot_likert_gg(likert.9.other, def_names, word_order = rev(aw.def.order.likert), "Definition")

Cumulative linear models (CLM)

# Creating a new df with all the information I need for the cumulative linear model
clm.df <- first_dataframe[, c("id", word_col_names, def_names, "work_absolute", 
                              "experience", "final_group_animal", "final_group_taxa", 
                              "final_group_continent", "questionnaire")]

# Removing all the prefer not to say options from the df
clm.df <- clm.df[!grepl("prefer_not_to_say", clm.df$final_group_animal, ignore.case = TRUE), ]
clm.df <- clm.df[!grepl("prefer_not_to_say", clm.df$final_group_taxa, ignore.case = TRUE), ]
clm.df <- clm.df[!grepl("prefer_not_to_say", clm.df$final_group_continent, ignore.case = TRUE), ]

# Turning work_absolute into a factor (maybe should do this at the start of doc?)
clm.df$work_absolute <- as.factor(clm.df$work_absolute)

# Sticking the smallest group (Austrailia) with Other 
clm.df$final_group_continent[clm.df$final_group_continent %in% c("other", "australia_and_oceania")] <- "other+"

# Turning them into factored data as they should be. Did this for work and exp above
clm.df$final_group_animal <- as.factor(clm.df$final_group_animal)
clm.df$final_group_taxa <- as.factor(clm.df$final_group_taxa)
clm.df$final_group_continent <- as.factor(clm.df$final_group_continent)
clm.df$questionnaire <- as.factor(clm.df$questionnaire)

# Making a df with no "Other" group in it
clm.df.no.other <- clm.df
clm.df.no.other <- clm.df.no.other[!grepl("Other biological field", clm.df.no.other$work_absolute, ignore.case = TRUE), ]

word.clm.function <- function(word.or.def) {
  anova.results <- clm(data = clm.df, clm.df[[word.or.def]]~(
      work_absolute + 
      experience +
      final_group_animal +
      final_group_taxa +
      final_group_continent +
      questionnaire
    ))

  x <- anova(anova.results)
}

# All 20 words at once 
clm.func.result.word <- lapply(word_col_names, word.clm.function)
# clm.func.result.word #commented out for ease


# Creating a dummy varible just for GVIF checking
clm.df$dummy <- as.numeric(clm.df$mental)

model_for_vif <- lm(dummy ~ work_absolute + experience + final_group_animal + 
                    final_group_taxa + final_group_continent + 
                    questionnaire, data = clm.df)

car::vif(model_for_vif)



# Running the nine defintion models
def.clm.function <- function(word.or.def) {
  anova.results <- clm(data = clm.df, clm.df[[word.or.def]]~(
        work_absolute +
        experience +
        final_group_animal +  
        final_group_taxa +
        final_group_continent +
        questionnaire

    ))

  x <- anova(anova.results)
}


 clm.func.result.def <- lapply(def_names, def.clm.function)
 # clm.func.result.def # commented out for ease

useful_words <- c("distress", "stress", "ethical", "fitness", "suffering", "mental", "function.")

# Slighlty changing the output of the function
word.clm.function.new <- function(word.or.def) {
  clm(data = clm.df, clm.df[[word.or.def]] ~ (
    work_absolute +
    experience +
    final_group_animal +
    final_group_taxa +
    final_group_continent +
    questionnaire
  ))
  # removed anova() - just returns the model itself
}
# Models without A
word.clm.function.nowork <- function(word.or.def) {
  clm(data = clm.df, clm.df[[word.or.def]] ~ (
    experience +
    final_group_taxa +
    final_group_continent +
    final_group_animal +
    questionnaire
  ))
}
clm.func.result.nowork <- lapply(word_col_names, word.clm.function.nowork)

# Models without B
word.clm.function.noanimal <- function(word.or.def) {
  clm(data = clm.df, clm.df[[word.or.def]] ~ (
    work_absolute +
    experience +
    final_group_taxa +
    final_group_continent +
    questionnaire
  ))
}
clm.func.result.noanimal <- lapply(word_col_names, word.clm.function.noanimal)

clm.func.result.word.full <- lapply(word_col_names, word.clm.function.new)
clm.func.result.nowork <- lapply(word_col_names, word.clm.function.nowork)
clm.func.result.noanimal <- lapply(word_col_names, word.clm.function.noanimal)


aic_comparison <- data.frame(
  word = word_col_names,
  AIC_full = sapply(clm.func.result.word.full, AIC),
  AIC_nowork = sapply(clm.func.result.nowork, AIC),
  AIC_noanimal = sapply(clm.func.result.noanimal, AIC)
)

aic_comparison[aic_comparison$word %in% useful_words, ]

# FOR 20 WORDS
odds_ratio_word <- do.call(rbind, lapply(seq_along(clm.func.result.word.full), function(i) {
  coefs <- exp(coef(clm.func.result.word.full[[i]]))
  # Remove threshold parameters (they contain "|")
  coefs <- coefs[!grepl("\\|", names(coefs))]
  data.frame(word = word_col_names[i], t(coefs))
}))

odds_ratio_word

# FOR 9 DEFINITIONS
def.clm.function.noanova <- function(word.or.def) {
  model <- clm(data = clm.df, clm.df[[word.or.def]] ~ (
    work_absolute +
    experience +
    final_group_animal +
    final_group_taxa +
    final_group_continent +
    questionnaire
  ))
  return(model)
}

clm.func.result.def.noanova <- lapply(def_names, def.clm.function.noanova)

# Calculaiting the logodds ratio
odds_ratio_def <- do.call(rbind, lapply(seq_along(clm.func.result.def.noanova), function(i) {
  coefs <- exp(coef(clm.func.result.def.noanova[[i]]))
  # Remove threshold parameters (they contain "|")
  coefs <- coefs[!grepl("\\|", names(coefs))]
  data.frame(def = def_names[i], t(coefs))
}))

odds_ratio_def

#### For experience
# Extracting experience p-values for all 20 words
work_abs_pvals <- sapply(seq_along(word_col_names), function(i) {
  as.data.frame(clm.func.result.word[[i]])["experience", "Pr(>Chisq)"]
})
names(work_abs_pvals) <- word_col_names

# Finding significant words
significant_words <- names(work_abs_pvals[work_abs_pvals < 0.05])

# LONG data and Dealing with NAs
plot_data <- clm.df %>%
  select(experience, all_of(significant_words)) %>%
  filter(!is.na(experience)) %>%
  pivot_longer(
    cols      = all_of(significant_words),
    names_to  = "word",
    values_to = "rating"
  ) %>%
  mutate(
    rating        = as.numeric(as.character(rating)),
    experience = factor(experience, levels = c("Less than 10 years", "10 or more years"))
  ) %>%
  filter(!is.na(rating))

# Computing the means 
means_data <- plot_data %>%
  group_by(word, experience) %>%
  summarise(mean_rating = mean(rating, na.rm = TRUE), .groups = "drop")

# Plotting the graph
ggplot(plot_data, aes(x = experience, y = rating, colour = experience)) +
  geom_jitter(width = 0.15, height = 0.1, alpha = 0.4, size = 1.5) +
    stat_summary(
  fun.data = mean_cl_normal,
  geom     = "errorbar",
  width    = 0.15,
  colour   = "black",
  linewidth = 0.5
) +
  geom_line(
    data    = means_data,
    aes(x   = experience, y = mean_rating, group = word),
    colour  = "black", linewidth = 0.7
  ) +
  geom_point(
    data = means_data,
    aes(x = experience, y = mean_rating),
    colour = "black", size = 3
  ) +
  facet_wrap(~ word, nrow = 2) +
  scale_y_continuous(breaks = 1:7, limits = c(1, 7)) +
  scale_colour_manual(
    values = c("Less than 10 years" = "steelblue", "10 or more years" = "firebrick"),
        labels = c("Less than 10 years" = "Less than ten years", 
             "10 or more years" = "Ten or more years")) +
    scale_x_discrete(
      labels = c("Less than 10 years" = "<10 years", "10 or more years" = ">=10 years")
  ) +
  labs(
    x      = NULL,
    y      = "Rating (1-7)",
    colour = "Experience"
  ) +
  theme_minimal() +
  theme(
    strip.text       = element_text(face = "bold"),
    legend.position  = "bottom",
    text = element_text(family = "Times New Roman"),
    panel.grid.minor = element_blank()
  )
    

#### For final_group_animal
# Extracting final_group_animal p-values for all 20 words 
work_abs_pvals <- sapply(seq_along(word_col_names), function(i) {
  as.data.frame(clm.func.result.word[[i]])["final_group_animal", "Pr(>Chisq)"]
})
names(work_abs_pvals) <- word_col_names

# Finding significant words
significant_words <- names(work_abs_pvals[work_abs_pvals < 0.05])

# LONG data and Dealing with NAs
plot_data <- clm.df %>%
  select(final_group_animal, all_of(significant_words)) %>%
  filter(!is.na(final_group_animal)) %>%
  pivot_longer(
    cols      = all_of(significant_words),
    names_to  = "word",
    values_to = "rating"
  ) %>%
  mutate(
    rating        = as.numeric(as.character(rating)),
    final_group_animal = factor(final_group_animal, levels = c("just captive", "wild_animals", "wild_animals_and_captive"))
  ) %>%
  filter(!is.na(rating))

# Computing the means
means_data <- plot_data %>%
  group_by(word, final_group_animal) %>%
  summarise(mean_rating = mean(rating, na.rm = TRUE), .groups = "drop")

# Plotting the graph
ggplot(plot_data, aes(x = final_group_animal, y = rating, colour = final_group_animal)) +
  geom_jitter(width = 0.15, height = 0.1, alpha = 0.4, size = 1.5) +
    stat_summary(
  fun.data = mean_cl_normal,
  geom     = "errorbar",
  width    = 0.15,
  colour   = "black",
  linewidth = 0.5
) +
  geom_line(
    data    = means_data,
    aes(x   = final_group_animal, y = mean_rating, group = word),
    colour  = "black", linewidth = 0.7
  ) +
  geom_point(
    data = means_data,
    aes(x = final_group_animal, y = mean_rating),
    colour = "black", size = 3
  ) +
  facet_wrap(~ word, nrow = 2) +
  scale_y_continuous(breaks = 1:7, limits = c(1, 7)) +
  scale_colour_manual(
    values = c("just captive" = "steelblue", "wild_animals" = "firebrick", "wild_animals_and_captive" = "darkgreen"),
    labels = c("just captive" = "Just Captive", 
             "wild_animals" = "Just Wild", 
             "wild_animals_and_captive" = "Wild & Captive")) +
    scale_x_discrete(
  labels = c("just captive" = "Captive", "wild_animals" = "Wild", "wild_animals_and_captive" = "Wild & Captive")
) +
  labs(
    x      = NULL,
    y      = "Rating (1-7)",
    colour = "Animal Context"
  ) +
  theme_minimal() +
  theme(
    strip.text       = element_text(face = "bold"),
    legend.position  = c(0.88, 0.25),
    text = element_text(family = "Times New Roman"),
    panel.grid.minor = element_blank()
  )



#### For work_absolute
# Extracting work_absolute p-values for all 20 words 
work_abs_pvals <- sapply(seq_along(word_col_names), function(i) {
  as.data.frame(clm.func.result.word[[i]])["work_absolute", "Pr(>Chisq)"]
})
names(work_abs_pvals) <- word_col_names

# Finding significant words
significant_words <- names(work_abs_pvals[work_abs_pvals < 0.05])

# LONG data and Dealing with NAs
plot_data <- clm.df %>%
  select(work_absolute, all_of(significant_words)) %>%
  filter(!is.na(work_absolute)) %>%
  pivot_longer(
    cols      = all_of(significant_words),
    names_to  = "word",
    values_to = "rating"
  ) %>%
  mutate(
    rating        = as.numeric(as.character(rating)),
    work_absolute = factor(work_absolute, levels = c("Animal Welfare", "Conservation", "Other biological field"))
  ) %>%
  filter(!is.na(rating))

# Computing means
means_data <- plot_data %>%
  group_by(word, work_absolute) %>%
  summarise(mean_rating = mean(rating, na.rm = TRUE), .groups = "drop")

# Plotting the graph
ggplot(plot_data, aes(x = work_absolute, y = rating, colour = work_absolute)) +
  geom_jitter(width = 0.15, height = 0.1, alpha = 0.6, size = 1.5) +
  stat_summary(
  fun.data = mean_cl_normal,
  geom     = "errorbar",
  width    = 0.15,
  colour   = "black",
  linewidth = 0.5
) +
  geom_line(
    data    = means_data,
    aes(x   = work_absolute, y = mean_rating, group = word),
    colour  = "black", linewidth = 0.7
  ) +
  geom_point(
    data = means_data,
    aes(x = work_absolute, y = mean_rating),
    colour = "black", size = 3
  ) +
  facet_wrap(~ word, nrow = 2) +
  scale_y_continuous(breaks = 1:7, limits = c(1, 7)) +
  scale_colour_manual(
  values = c("Animal Welfare" = "#D55E00", "Conservation" = "#2E8B57", 
             "Other biological field" = "#0072B2"),
  labels = c("Animal Welfare" = "Animal Welfare (AW)", 
             "Conservation" = "Conservation (C)", 
             "Other biological field" = "Other biological field (O)")) +
  scale_x_discrete(
  labels = c("Animal Welfare" = "AW", "Conservation" = "C", "Other biological field" = "O")
) +
  labs(
    x      = NULL,
    y      = "Rating (1-7)",
    colour = "Scientific Field"
  ) +
  theme_minimal() +
  theme(
    text = element_text(family = "Times New Roman"),
    strip.text       = element_text(face = "bold"),
    legend.position  = "bottom",
    legend.margin = margin(t = -10), # This moves the legend closer to the graph
    panel.grid.minor = element_blank()
  )

#### For work_absolute
# Extracting work_absolute p-values for all 20 words
work_abs_pvals <- sapply(seq_along(def_names), function(i) {
  as.data.frame(clm.func.result.def[[i]])["work_absolute", "Pr(>Chisq)"]
})
names(work_abs_pvals) <- def_names

# Finding significant words
significant_words <- names(work_abs_pvals[work_abs_pvals < 0.05])

# LONG data and dealing with NAs
plot_data <- clm.df %>%
  select(work_absolute, all_of(significant_words)) %>%
  filter(!is.na(work_absolute)) %>%
  pivot_longer(
    cols      = all_of(significant_words),
    names_to  = "word",
    values_to = "rating"
  ) %>%
  mutate(
    rating        = as.numeric(as.character(rating)),
    work_absolute = factor(work_absolute, levels = c("Animal Welfare", "Conservation", "Other biological field"))
  ) %>%
  filter(!is.na(rating))

# Computing means
means_data <- plot_data %>%
  group_by(word, work_absolute) %>%
  summarise(mean_rating = mean(rating, na.rm = TRUE), .groups = "drop")

# Plotting the graph
ggplot(plot_data, aes(x = work_absolute, y = rating, colour = work_absolute)) +
  geom_jitter(width = 0.15, height = 0.1, alpha = 0.4, size = 1.5) +
  stat_summary(
  fun.data = mean_cl_normal,
  geom     = "errorbar",
  width    = 0.15,
  colour   = "black",
  linewidth = 0.5
) +
  geom_line(
    data    = means_data,
    aes(x   = work_absolute, y = mean_rating, group = word),
    colour  = "black", linewidth = 0.7
  ) +
  geom_point(
    data = means_data,
    aes(x = work_absolute, y = mean_rating),
    colour = "black", size = 3
  ) +
  facet_wrap(~ word, nrow = 2) +
  scale_y_continuous(breaks = 1:7, limits = c(1, 7)) +
  scale_colour_manual(
  values = c("Animal Welfare" = "#D55E00", "Conservation" = "#2E8B57", 
             "Other biological field" = "#0072B2"),
  labels = c("Animal Welfare" = "Animal Welfare (AW)", 
             "Conservation" = "Conservation (C)", 
             "Other biological field" = "Other biological field (O)")) +
  scale_x_discrete(
  labels = c("Animal Welfare" = "AW", "Conservation" = "C", "Other biological field" = "O")
) +
  labs(
    x      = NULL,
    y      = "Rating (1-7)",
    colour = "Scientific Field"
  ) +
  theme_minimal() +
  theme(
    text = element_text(family = "Times New Roman"),
    strip.text       = element_text(face = "bold"),
    legend.position  = "bottom",
    legend.margin = margin(t = -10), # This moves the legend closer to the graph
    panel.grid.minor = element_blank()
  )





#### For questionnaire 
# Extracting questionnaire p-values for all 20 words
work_abs_pvals <- sapply(seq_along(def_names), function(i) {
  as.data.frame(clm.func.result.def[[i]])["questionnaire", "Pr(>Chisq)"]
})
names(work_abs_pvals) <- def_names

# Finding significant words
significant_words <- names(work_abs_pvals[work_abs_pvals < 0.05])

# LONG data and dealing with NAs
plot_data <- clm.df %>%
  select(questionnaire, all_of(significant_words)) %>%
  filter(!is.na(questionnaire)) %>%
  pivot_longer(
    cols      = all_of(significant_words),
    names_to  = "word",
    values_to = "rating"
  ) %>%
  mutate(
    rating        = as.numeric(as.character(rating)),
    questionnaire = factor(questionnaire, levels = c("a", "b"))
  ) %>%
  filter(!is.na(rating))

# Computing means
means_data <- plot_data %>%
  group_by(word, questionnaire) %>%
  summarise(mean_rating = mean(rating, na.rm = TRUE), .groups = "drop")

# Plotting the graph
ggplot(plot_data, aes(x = questionnaire, y = rating, colour = questionnaire)) +
  geom_jitter(width = 0.15, height = 0.1, alpha = 0.4, size = 1.5) +
    stat_summary(
  fun.data = mean_cl_normal,
  geom     = "errorbar",
  width    = 0.15,
  colour   = "black",
  linewidth = 0.5
) +
  geom_line(
    data    = means_data,
    aes(x   = questionnaire, y = mean_rating, group = word),
    colour  = "black", linewidth = 0.7
  ) +
  geom_point(
    data = means_data,
    aes(x = questionnaire, y = mean_rating),
    colour = "black", size = 3
  ) +
  facet_wrap(~ word, nrow = 2) +
  scale_y_continuous(breaks = 1:7, limits = c(1, 7)) +
  scale_colour_manual(
    values = c("a" = "steelblue", "b" = "firebrick")
  ) +
  labs(
    title  = "Ratings by questionnaire for significant definitions",
    x      = NULL,
    y      = "Rating (1-7)",
    colour = "Questionnaire"
  ) +
  theme_minimal() +
  theme(
    strip.text       = element_text(face = "bold"),
    text = element_text(family = "Times New Roman"),
    panel.grid.minor = element_blank()
  )

PCA tests - Words

#Subsetting the data I want for the words
worddata <- first_dataframe[,c(3:22, 45, 46, 52:55, 57, 58)] 
worddata[word_col_names] <- lapply(worddata[word_col_names], as.numeric)

#Subsetting the data I want for the definitions
defdata <- first_dataframe[,c(24:32, 45, 46, 52:55, 57, 58)] 
defdata[def_names] <- lapply(defdata[def_names], as.numeric)

# Read from https://rpubs.com/kharshita/970636
# All data should be continuous numeric that can take on decimal values, except for the intresting demographic questions (eg taxa, animals etc)

pairs(worddata[,-c(21:28)])

pca.out <- prcomp(worddata[,-c(21:28)], scale = TRUE)
biplot(pca.out)

# Code that I got from the internet
rda.out <- vegan::rda(worddata[,-c(21:28)], scale = TRUE)
rda_scores <- scores(rda.out)
biplot(rda.out,
       display = "sites")

# You can then group them together using colour to see any clear patterns
work_colours <- c("#D55E00","#2E8B57", "#0072B2")

par(family = "Times New Roman")
biplot(rda.out, display = "sites")
ordihull(rda.out,
         group = worddata$work_absolute,
         col = work_colours,
         lty = 1:3,
         lwd = c(3,6,3))
legend("topright",
       legend = levels(as.factor(worddata$work_absolute)),
       col = work_colours,
       lty = 1:3,
       lwd = 3,
       cex = 0.8)

# Link for the website: https://www.datacamp.com/tutorial/pca-analysis-r
# Five main steps for computing PCA: 
# 1) Data normlaization, 
# 2) Covaraiince matrix compuration,
# 3) Eigenvlues and eigenvectors, 
# 4) Selectioin of principal componanets, 
# 5) Data transformation in a new space 

# Normalising the data
numerical_data <- worddata[,1:20]
# head(numerical_data)

data_normalized <- scale(numerical_data)
# head(data_normalized)

# Applying PCA
data.pca <- princomp(data_normalized)
summary(data.pca)

data.pca$loadings[, 1:2] #This just looks at the first 2 comps in relation to the data.


# Visualization of the principal components
# Scree Plot
fviz_eig(data.pca, addlabels = TRUE) + 
  labs(title = NULL) +
  theme(text = element_text(family = "Times New Roman"))

# Contribution of each variable
fviz_cos2(data.pca, choice = "var", axes = 1:2) + 
  labs(title = NULL) +
  theme(text = element_text(family = "Times New Roman"))

# Biplot combined with cos2 
fviz_pca_var(data.pca, col.var = "cos2",
            gradient.cols = c("black", "orange", "green"),
            repel = TRUE) + 
  labs(title = NULL) +
  theme(text = element_text(family = "Times New Roman"))

# Testing suitability of PCA
psych::KMO(data_normalized)
psych::cortest.bartlett(cor(data_normalized), n = nrow(data_normalized))

PCA tests - Definitions

# Read from https://rpubs.com/kharshita/970636
# All data should be continuous numeric that can take on decimal values, except for the intresting demographic questions (eg taxa, animals etc)

# This is just beginning stuff not sure what I am looking at
# Need to check for 'NAs' or '0s' as apparently can effect the PCA
# colSums(is.na(shortworddata)) #This might be the right code to check? 

pairs(defdata[,-c(10:17)])

pca.out <- prcomp(defdata[,-c(10:17)], scale = TRUE)
biplot(pca.out)

# Principal Component Analysis

# Some fancy stuff I copied from the internet 
rda.out <- vegan::rda(defdata[,-c(10:17)], scale = TRUE)
rda_scores <- scores(rda.out)

# You can then group them together using colour to see any clear patterns
# I did not change anyhting here from the web apart from the dataframe name
par(family = "Times New Roman")
biplot(rda.out, display = "sites")

ordihull(rda.out,
         group = defdata$work_absolute,
         col = work_colours,
         lty = 1:3,
         lwd = c(3,6,3)) +
  theme(text = element_text(family = "Times New Roman"))

legend("topright",
       legend = levels(as.factor(defdata$work_absolute)),
       col = work_colours,
       lty = 1:3,
       lwd = 3,
       cex = 0.8) +
  theme(text = element_text(family = "Times New Roman"))

# Link for the website: https://www.datacamp.com/tutorial/pca-analysis-r
# Five main steps for computing PCA: 
# 1) Data normlaization, 
# 2) Covaraiince matrix compuration,
# 3) Eigenvlues and eigenvectors, 
# 4) Selectioin of principal componanets, 
# 5) Data transformation in a new space 
par(family = "Times New Roman")
# Normalising the data
numerical_data <- defdata[,1:9]
# head(numerical_data)

data_normalized <- scale(numerical_data)
# head(data_normalized)

# Applying PCA
data.pca <- princomp(data_normalized)
summary(data.pca)

data.pca$loadings[, 1:4] #This just looks at the first 2 comps in relation to the data.


# Visualization of the principal components
# Scree Plot
fviz_eig(data.pca, addlabels = TRUE) + 
  labs(title = NULL) +
  theme(text = element_text(family = "Times New Roman"))

# Contribution of each variable
fviz_cos2(data.pca, choice = "var", axes = 1:2) + 
  labs(title = NULL) +
  theme(text = element_text(family = "Times New Roman"))

# Biplot combined with cos2 

fviz_pca_var(data.pca, 
             axes = c(1,2),
             col.var = "cos2",
             gradient.cols = c("black", "orange", "green"),
             repel = TRUE) + 
  labs(title = NULL) +
  theme(text = element_text(family = "Times New Roman"))


# Testing if data is appropriate for PCA
psych::KMO(data_normalized)
psych::cortest.bartlett(cor(data_normalized), n = nrow(data_normalized))

Frameworks

#Got this df from the MCQ bit at the top of the code
frameworks_working_df <- frameworks_binary_df
frameworks_working_df <- cbind(frameworks_working_df, 
                               first_dataframe[ ,c("id","work_absolute", "experience",
                                                   "final_group_animal",
                                                   "final_group_taxa",
                                                   "final_group_continent")])


# Need to order the variables
eight_rankings <- c("nutrition", "enviroment", "physical_health", "behavioural_interactions", "mental_state", "biological_functioning","affective_state", "natural_living")

first_dataframe[eight_rankings] <- lapply(first_dataframe[eight_rankings], function(col) {
  factor(col,
         levels = c("1st", "2nd", "3rd", "4th", "5th"),
         ordered = TRUE)
})
# is.ordered(first_dataframe$mental_state) #TRUE
# is.ordered(first_dataframe$affective_state) #TRUE

frameworks_working_df <- cbind(frameworks_working_df, 
                               first_dataframe[, eight_rankings])

# Making graph of which frameworks particpants have heard of
# To make a nice stacked graph need to get the data into long format
frameworks.working.df.long <- frameworks_working_df %>%
  pivot_longer(
    cols = starts_with(framework_names),  # or list them manually
    names_to = "frameworks",
    values_to = "response"
  )

# Counting how many of each framework there is
frameworks.working.df.long <- frameworks.working.df.long %>%
  group_by(work_absolute, frameworks) %>%
  summarise(yes_count = sum(response == 1), .groups = "drop")

# For Order that they appear in the graph
framework_order <- c("five.freedoms",
                     "five.domains",
                     "three.orientations",
                     "dynamic.animal.welfare.concept..dawcon.",
                     "X3rs..replacement..reduction..refinement.",
                     "quality.of.life",
                     "life.worth.living",
                     "welfare.quality..project",
                     "i.have.not.heard.of.any.of.these")
                      

frameworks.working.df.long <- frameworks.working.df.long %>%
  mutate(frameworks = factor(frameworks, levels = framework_order))

#Changing names for the stacked bar chart
framework_labels <- c(
  "five.freedoms" = "Five freedoms",
  "five.domains" = "Five domains",
  "three.orientations" = "Three orientations",
  "dynamic.animal.welfare.concept..dawcon." = "DAWcon",
  "X3rs..replacement..reduction..refinement." = "3Rs",
  "quality.of.life" = "Quality of life",
  "life.worth.living" = "Life worth livng",
  "welfare.quality..project" = "Welfare quality project",
  "i.have.not.heard.of.any.of.these" = "None")

# Another way to show the data, might be better as tried to take on Paul's advice
group_sizes <- c("Animal Welfare" = 36, "Conservation" = 45, "Other" = 24)

framework_props <- frameworks_working_df %>%
  select(-c("id", "total_selected")) %>%
  pivot_longer(cols = where(is.numeric), 
               names_to = "framework", 
               values_to = "known") %>%
  filter(!is.na(known)) %>%
  group_by(work_absolute, framework) %>%
  summarise(n_known = sum(known == 1), .groups = "drop") %>%
  mutate(pct = round(n_known / group_sizes[work_absolute] * 100, 1))

framework_props <- framework_props %>%
  mutate(framework = factor(framework, levels = rev(framework_order)))

ggplot(framework_props, aes(x = work_absolute, y = framework, fill = pct)) +
  geom_tile(colour = "white", linewidth = 0.5) +
  geom_text(aes(label = paste0(round(pct), "%")),
            size = 3, family = "Times New Roman") +
  scale_fill_gradient(low = "#E6F1FB", high = "#14508F",
                      name = "% known", limits = c(0, 100)) +
  scale_y_discrete(labels = framework_labels) +
  labs(x = NULL, y = NULL) +
  theme_minimal() +
  theme(
    text             = element_text(family = "Times New Roman"),
    plot.title       = element_text(face = "bold", hjust = 0.5),
    panel.grid       = element_blank(),
    legend.position  = "bottom",
    legend.key.width = unit(1, "cm"),    
    legend.key.height = unit(0.3, "cm"), 
    legend.text      = element_text(size = 8),
    legend.title     = element_text(size = 8)
  )

# Working out the mean in each group
frameworks_working_df %>%
  group_by(work_absolute) %>%
  summarise(mean_value = mean(total_selected, na.rm = TRUE))

# Using Kruskal-Wallis Test
kruskal.test(work_absolute ~ total_selected, data = frameworks_working_df)

# Checking each combination seperately. AW v C and AW v Other are the main two
pairwise.wilcox.test(frameworks_working_df$total_selected, frameworks_working_df$work_absolute)

#Removing all the prefer not to say options from the df
frameworks_working_df_prefer <- frameworks_working_df

frameworks_working_df_prefer <- frameworks_working_df_prefer[!grepl("prefer_not_to_say", frameworks_working_df_prefer$final_group_animal, ignore.case = TRUE), ]

frameworks_working_df_prefer <- frameworks_working_df_prefer[!grepl("prefer_not_to_say", frameworks_working_df_prefer$final_group_taxa, ignore.case = TRUE), ]

frameworks_working_df_prefer <- frameworks_working_df_prefer[!grepl("prefer_not_to_say", frameworks_working_df_prefer$final_group_continent, ignore.case = TRUE), ]

# Creating a list for the five domains and the three orientations
five.domains.list <- c("nutrition", "enviroment", "physical_health", "behavioural_interactions", "mental_state")

three.or.list <- c("biological_functioning", "affective_state", "natural_living")

############################ Five Domains
# Need to make the data long in order to carry out the lmer
rank_five_long <- frameworks_working_df_prefer %>%
  pivot_longer(
    cols = c(five.domains.list),
    names_to = "domain",
    values_to = "ranking"
  )

rank_five_long$domain <- as.factor(rank_five_long$domain)

# Creating differnt models in order to be able to test them against each other and see if one model better explains the pattern in the data of then the other.
##### Main model
rank_five_model <- clmm(ranking ~ work_absolute * domain +
                          experience +
                          final_group_animal +
                          final_group_taxa +
                          final_group_continent +
                          (1 | id), 
                        data = rank_five_long)
#####

rank_five_model_no_work <- clmm(ranking ~ domain +
                          experience +
                          final_group_animal +
                          final_group_taxa +
                          final_group_continent +
                          (1 | id), 
                        data = rank_five_long)
#### Need to put the more basic model first and the more complex model second
# anova(rank_five_model_no_work, rank_five_model)
# Pr(>Chisq) = 0.08233

rank_five_model_no_work_i <- clmm(ranking ~ domain + work_absolute +
                          experience +
                          final_group_animal +
                          final_group_taxa +
                          final_group_continent +
                          (1 | id), 
                        data = rank_five_long)
# anova(rank_five_model_no_work_i, rank_five_model)
# Pr(>Chisq) = 0.1047

rank_five_model_no_experience <- clmm(ranking ~ work_absolute * domain +
                                        final_group_animal +
                                        final_group_taxa +
                                        final_group_continent + 
                                        (1 | id), 
                                      data = rank_five_long)
# anova(rank_five_model_no_experience, rank_five_model)
# Pr(>Chisq) = 0.9013

rank_five_model_no_animal <- clmm(ranking ~ work_absolute * domain +
                                    experience +
                                    final_group_taxa +
                                    final_group_continent +
                                    (1 | id), 
                                  data = rank_five_long)
# anova(rank_five_model_no_animal, rank_five_model)
# Pr(>Chisq) = 0.6868

rank_five_model_no_taxa <- clmm(ranking ~ work_absolute * domain +
                                  experience +
                                  final_group_animal +
                                  final_group_continent +
                                  (1 | id), 
                        data = rank_five_long)
anova(rank_five_model_no_taxa, rank_five_model)
# Pr(>Chisq) = 0.9908

rank_five_model_no_continent <- clmm(ranking ~ work_absolute * domain +
                                       experience +
                                       final_group_animal +
                                       final_group_taxa +
                                       (1 | id), 
                        data = rank_five_long)
# anova(rank_five_model_no_continent, rank_five_model)
# Pr(>Chisq) = 0.8684


############################ Three oreitnations
# Need to make the data long in order to carry out the lmer
rank_three_long <- frameworks_working_df_prefer %>%
  pivot_longer(
    cols = c(three.or.list),
    names_to = "domain",
    values_to = "ranking"
  )

# Turning domain into a factor
rank_three_long$domain <- as.factor(rank_three_long$domain)

# Making every model and comparing it to the orginal via ANOVA
##### Main Model
rank_three_model <- clmm(ranking ~ work_absolute * domain +
                          experience +
                          final_group_animal +
                          final_group_taxa +
                          final_group_continent +
                          (1 | id), 
                        data = rank_three_long)
#####

rank_three_model_no_work <- clmm(ranking ~ domain +
                          experience +
                          final_group_animal +
                          final_group_taxa +
                          final_group_continent +
                          (1 | id), 
                        data = rank_three_long)
anova(rank_three_model_no_work, rank_three_model)
# Pr(>Chisq) = 0.4331

rank_three_model_no_interaction <- clmm(ranking ~ work_absolute + domain +
                          experience +
                          final_group_animal +
                          final_group_taxa +
                          final_group_continent +
                          (1 | id), 
                        data = rank_three_long)
anova(rank_three_model_no_interaction, rank_three_model)
# Pr(>Chisq) = 0.345

rank_three_model_no_experience <- clmm(ranking ~ work_absolute * domain +
                                        final_group_animal +
                                        final_group_taxa +
                                        final_group_continent + 
                                        (1 | id), 
                                      data = rank_three_long)
# anova(rank_three_model_no_experience, rank_three_model)
# Pr(>Chisq) = 0.6194

rank_three_model_no_animal <- clmm(ranking ~ work_absolute * domain +
                                     experience +
                                     final_group_taxa +
                                     final_group_continent +
                                    (1 | id), 
                                   data = rank_three_long)
# anova(rank_three_model_no_animal, rank_three_model)
# Pr(>Chisq) = 0.9548

rank_three_model_no_taxa <- clmm(ranking ~ work_absolute * domain +
                                   experience +
                                   final_group_animal +
                                   final_group_continent +
                                  (1 | id), 
                        data = rank_three_long)
anova(rank_three_model_no_taxa, rank_three_model)
# Pr(>Chisq) = 0.3373

rank_three_model_no_continent <- clmm(ranking ~ work_absolute * domain +
                                       experience +
                                       final_group_animal +
                                       final_group_taxa +
                                       (1 | id), 
                        data = rank_three_long)
# anova(rank_three_model_no_continent, rank_three_model)
# Pr(>Chisq) = 0.7157


# Just some descripitve statisitcs for FIVE DOMAINS
overall_summary <- rank_five_long %>%
  group_by(domain) %>%
  summarise(
    mean_rank = mean(as.numeric(ranking), na.rm = TRUE),
    sd_rank   = sd(as.numeric(ranking), na.rm = TRUE),
    n         = n()
  ) %>%
  arrange(mean_rank)

print(overall_summary)

rank_five_null <- clmm(ranking ~ 1 + (1 | id),
                       data = rank_five_long)

rank_five_domain_only <- clmm(ranking ~ domain + (1 | id),
                              data = rank_five_long)

anova(rank_five_null, rank_five_domain_only)

# Just some descripitve statisitcs for THREE OREINTATIONS
overall_summary <- rank_three_long %>%
  group_by(domain) %>%
  summarise(
    mean_rank = mean(as.numeric(ranking), na.rm = TRUE),
    sd_rank   = sd(as.numeric(ranking), na.rm = TRUE),
    n         = n()
  ) %>%
  arrange(mean_rank)

print(overall_summary)

rank_five_null <- clmm(ranking ~ 1 + (1 | id),
                       data = rank_three_long)

rank_five_domain_only <- clmm(ranking ~ domain + (1 | id),
                              data = rank_three_long)

anova(rank_five_null, rank_five_domain_only)

# Making a graph function for both the 5D and 3O
plot_likert_ranked <- function(df, word_cols, n_levels,
                               label_pos     = 105,
                               neg_limit     = 70,
                               neg_label_pos = 65,
                               legend_labs   = NULL) {
  
  # Defining positive/neutral/negative by n_levels
  pos_levels <- if (n_levels == 5) 1:2 else 1
  neu_levels <- if (n_levels == 5) 3    else 2
  neg_levels <- if (n_levels == 5) 4:5  else 3
  
  # Colours
  likert_colours <- if (n_levels == 5) {
    c("1" = "#2B3EC0",
      "2" = "#B8C8F4",
      "3" = "#D3D3D3",
      "4" = "#E08080",
      "5" = "#C0392B")
  } else {
    c("1" = "#2B3EC0",
      "2" = "#D3D3D3",
      "3" = "#C0392B")
  }
  
  #Computing proportions
  long_data <- df %>%
  select(all_of(word_cols)) %>%
  mutate(id = row_number()) %>%
  pivot_longer(cols = -id, names_to = "word", values_to = "answer") %>%
  mutate(
    answer = as.numeric(gsub("st|nd|rd|th", "", as.character(answer)))
  ) %>%
  filter(!is.na(answer)) %>%
  count(word, answer) %>%                 
  group_by(word) %>%                             
  mutate(prop = n / sum(n) * 100) %>%            
  ungroup()    
  
  # Working out percentage labels
  label_data <- long_data %>%
  mutate(group = case_when(
    as.integer(answer) %in% as.integer(pos_levels) ~ "positive",
    as.integer(answer) %in% as.integer(neu_levels) ~ "neutral",
    as.integer(answer) %in% as.integer(neg_levels) ~ "negative"
  )) %>%
  group_by(word, group) %>%
  summarise(pct = sum(prop), .groups = "drop") %>%
  complete(word, group = c("negative", "neutral", "positive"),
           fill = list(pct = 0))
  
  # Positive, neutral, negative calculations
  plot_data <- long_data %>%
    mutate(answer = factor(answer, levels = 1:n_levels)) %>%
    group_by(word) %>%
    mutate(neutral_half = prop[answer == neu_levels] / 2) %>%
    ungroup() %>%
    mutate(
  side = case_when(
    as.integer(answer) %in% as.integer(pos_levels) ~ "positive",
    as.integer(answer) %in% as.integer(neu_levels) ~ "neutral",
    as.integer(answer) %in% as.integer(neg_levels) ~ "negative"
    )
  ) %>%
    group_by(word, side) %>%
    mutate(
      answer_ordered = if_else(
        side == "positive",
        factor(answer, levels = rev(as.character(pos_levels))),
        factor(answer, levels = as.character(1:n_levels))
      )
    ) %>%
    arrange(word, side, answer_ordered) %>%
    mutate(
      cumulative     = cumsum(prop),
      cumulative_lag = lag(cumulative, default = 0)
    ) %>%
    ungroup() %>%
    mutate(
      neutral_half = sapply(seq_len(n()), function(i) {
        nh <- neutral_half[word == word[i] & side == "neutral"]
        if (length(nh) == 0) neutral_half[i] else nh[1]
      }),
      xmin = case_when(
        side == "positive" ~ cumulative_lag   + neutral_half,
        side == "neutral"  ~ -prop / 2,
        side == "negative" ~  -(cumulative  + neutral_half)
      ),
      xmax = case_when(
        side == "positive" ~ cumulative + neutral_half,
        side == "neutral"  ~  prop / 2,
        side == "negative" ~  -(cumulative_lag      + neutral_half)
      )
    )
  
  # Ordering words by positive percentage
  ordered_words <- label_data %>%
    filter(group == "positive") %>%
    right_join(distinct(label_data, word), by = "word") %>%
    mutate(pct = replace_na(pct, 0)) %>%
    arrange(pct) %>%
    pull(word)
  
  plot_data  <- plot_data  %>% mutate(word = factor(word, levels = ordered_words))
  label_data <- label_data %>% mutate(word = factor(word, levels = ordered_words))
  
  # Plotting the graph
  ggplot(plot_data) +
  geom_rect(aes(
    xmin = xmin, xmax = xmax,
    ymin = as.numeric(factor(word, levels = ordered_words)) - 0.4,
    ymax = as.numeric(factor(word, levels = ordered_words)) + 0.4,
    fill = answer
  )) +
  geom_text(
    data = filter(label_data, group == "positive"),
    aes(x = label_pos, y = word, label = paste0(round(pct), "%")),
    colour = "#2B3EC0", fontface = "bold", size = 3, hjust = 0
  ) +
  geom_text(
    data = filter(label_data, group == "negative"),
    aes(x = -neg_label_pos, y = word, label = paste0(round(pct), "%")),
    colour = "#C0392B", fontface = "bold", size = 3, hjust = 1
  ) +
  geom_vline(xintercept = 0, linetype = "dashed", colour = "grey50") +
  scale_fill_manual(values = likert_colours, name = "Ranking:",
                    labels = if (!is.null(legend_labs)) legend_labs else waiver()) +
  scale_x_continuous(
    breaks = c(-neg_limit + 20, 0, 50, 100),
    labels = c(paste0(neg_limit - 20), "0", "50", "100"),
    limits = c(-(neg_limit + 5), 115)
  ) +
  scale_y_discrete(
    limits = ordered_words,
    labels = function(x) {
      x <- gsub("_", " ", x)
      paste0(toupper(substr(x, 1, 1)), substr(x, 2, nchar(x)))
    }
  ) +
  labs(x = "Percentage", y = NULL) +
  theme_minimal() +
  theme(
    text               = element_text(family = "Times New Roman"),
    plot.title         = element_text(face = "bold", hjust = 0.5),
    legend.position    = "bottom",
    panel.grid.major.y = element_blank()
  ) +
  guides(fill = guide_legend(nrow = 1, keywidth = 0.9, keyheight = 0.9))
}

# Five Domains graph
plot_likert_ranked(df = frameworks_working_df,
                   word_cols = five.domains.list,
                   n_levels = 5,
                   label_pos = 105,
                   neg_limit = 70,
                   neg_label_pos = 65,
                   legend_labs   = c("1st", "2nd", "3rd", "4th", "5th"))

# Three Orientations graph
plot_likert_ranked(df = frameworks_working_df,
                   word_cols = three.or.list,
                   n_levels = 3,
                   label_pos = 105,
                   neg_limit = 70,
                   neg_label_pos = 65,
                   legend_labs = c("1st", "2nd", "3rd"))

Should there be an AW definition?

# Filling in the empty cells to prevent R from getting confused 
first_dataframe <- first_dataframe %>%
  mutate(yes.def.question = ifelse(yes.def.question == "", "not.applicable", yes.def.question))

first_dataframe <- first_dataframe %>%
  mutate(no.def.question = ifelse(no.def.question == "", "not.applicable", no.def.question))


### Chi squared test
chisq.def.y.n.tbl <- table(first_dataframe$work_absolute, 
                           first_dataframe$should.there.be.a.def)
#From research this one is fine to use use, dont use both this and fisher test
chisq.test(chisq.def.y.n.tbl) #X-squared = 2.2233, df = 4, p-value = 0.6948
# Cochran’s guidelines to show this is a suitable test (ie less than 20% values less than 5)

# bit wierd Cohen (1988) sighted as a guidline for this, look it up if needs to
lsr::cramersV(chisq.def.y.n.tbl) #Cramer's V = 0.1028949

# Percent of yes/no/etc graph
ggplot(first_dataframe, aes(x = should.there.be.a.def, fill = work_absolute)) +
  geom_bar(aes(y = after_stat(prop), group = work_absolute),
           position = "dodge") +
  scale_y_continuous(labels = scales::percent) +
  theme_minimal() +
  theme(text = element_text(family = "Times New Roman")) +
  scale_fill_manual(values = work_colours) +
  labs(x = "Should there be a definiton of animal wefalre?", 
       y = "Percentage of responses", fill = "Scientific Field")

Conservation Answers

table(first_dataframe$work_absolute)

priority_data <- read.csv("priority_data.csv")

priority_data[,c("id", "work_absolute", "experience", "final_group_animal", "final_group_taxa", "final_group_continent")] <- first_dataframe[,c("id", "work_absolute", "experience", "final_group_animal", "final_group_taxa", "final_group_continent")]

priority_data_c <- subset(priority_data, work_absolute == "Conservation")

# "is.na" Removes actual N/A data point (one person who did not answer)
priority_data_c <- priority_data_c[priority_data_c$priority.individual != "" & !is.na(priority_data_c$priority.individual), ]


# "n/a" Removes the n/a I put in myself, three from each collumn, 5 people overall
priority_data_all_clm <- priority_data_c
priority_data_ind_clm <- priority_data_c

priority_data_ind_clm <- priority_data_ind_clm[priority_data_ind_clm$priority.individual != "n/a" & !is.na(priority_data_ind_clm$priority.individual), ]

priority_data_all_clm <- priority_data_all_clm[priority_data_all_clm$priority.all != "n/a" & !is.na(priority_data_all_clm$priority.all), ]

# Turning the data into factored levels
priority_levels <- c("very.high",
                     "high",
                     "medium",
                     "low",
                     "very.low")

# Reverse levels for correct graph orientation
priority_levels <- rev(priority_levels)

# Re-apply factors with reversed levels
priority_data_ind_clm$priority.individual <- factor(priority_data_ind_clm$priority.individual,
                                                    levels = priority_levels,
                                                    ordered = TRUE)
priority_data_all_clm$priority.all <- factor(priority_data_all_clm$priority.all,
                                             levels = priority_levels,
                                             ordered = TRUE)

# Combine directly from the refactored columns
priority.combined <- bind_rows(
  data.frame(bar = "Individual work",       answer = as.character(priority_data_ind_clm$priority.individual)),
  data.frame(bar = "All conservation work", answer = as.character(priority_data_all_clm$priority.all))
) %>%
  mutate(answer = factor(answer, levels = priority_levels, ordered = TRUE))

# Long data
long_data <- priority.combined %>%
  filter(!is.na(answer)) %>%
  count(bar, answer) %>%
  group_by(bar) %>%
  mutate(prop = n / sum(n) * 100) %>%
  ungroup()

# Percentage labels
label_data <- long_data %>%
  mutate(group = case_when(
    answer %in% priority_levels[1:2] ~ "negative",
    answer == priority_levels[3]     ~ "neutral",
    answer %in% priority_levels[4:5] ~ "positive"
  )) %>%
  group_by(bar, group) %>%
  summarise(pct = sum(prop), .groups = "drop")

# Working out the pos/neu/neg
plot_data <- long_data %>%
  mutate(
    side = case_when(
      answer %in% priority_levels[1:2] ~ "negative",
      answer == priority_levels[3]     ~ "neutral",
      answer %in% priority_levels[4:5] ~ "positive"
    )
  ) %>%
  group_by(bar) %>%
  mutate(neutral_half = prop[answer == priority_levels[3]] / 2) %>%
  ungroup() %>%
  group_by(bar, side) %>%
  mutate(
    answer_ordered = if_else(
      side == "negative",
      factor(as.character(answer),
             levels = c(priority_levels[2], priority_levels[1])),
      factor(as.character(answer),
             levels = priority_levels)
    )
  ) %>%
  arrange(bar, side, answer_ordered) %>%
  mutate(
    cumulative     = cumsum(prop),
    cumulative_lag = lag(cumulative, default = 0)
  ) %>%
  ungroup() %>%
  mutate(
    xmin = case_when(
      side == "negative" ~ -(cumulative     + neutral_half),
      side == "neutral"  ~ -prop / 2,
      side == "positive" ~ cumulative_lag  + neutral_half
    ),
    xmax = case_when(
      side == "negative" ~ -(cumulative_lag + neutral_half),
      side == "neutral"  ~  prop / 2,
      side == "positive" ~ cumulative      + neutral_half
    ),
    bar = factor(bar, levels = c("Individual work", "All conservation work"))
  )

label_data <- label_data %>%
  mutate(bar = factor(bar, levels = c("Individual work", "All conservation work")))

# Colours
likert_colours <- c(
  "#C0392B", "#E08080", "#D3D3D3", "#6080E0", "#2B3EC0"
) %>% setNames(priority_levels)

# Plotting the graph
ggplot(plot_data) +
  geom_rect(aes(xmin = xmin, xmax = xmax,
                ymin = as.numeric(bar) - 0.3,
                ymax = as.numeric(bar) + 0.3,
                fill = answer)) +
  geom_text(
    data = filter(label_data, group == "negative"),
    aes(x = -60, y = bar, label = paste0(round(pct), "%")),
    colour = "#C0392B", fontface = "bold", size = 3, hjust = 1
  ) +
  geom_text(
    data = filter(label_data, group == "positive"),
    aes(x = 105, y = bar, label = paste0(round(pct), "%")),
    colour = "#2B3EC0", fontface = "bold", size = 3, hjust = 0
  ) +
  geom_vline(xintercept = 0, linetype = "dashed", colour = "grey50") +
  scale_fill_manual(values = likert_colours, name = "Rating: ",
                    breaks = priority_levels,
                    labels = c("Very low", "Low", "Medium", "High", "Very high")) +
  scale_x_continuous(
    breaks = c(-50, 0, 50, 100),
    labels = c( "50", "0", "50", "100"),
    limits = c(-70, 110)
  ) +
  scale_y_discrete(limits = c("Individual work", "All conservation work")) +
  labs(
    x     = "Percentage",
    y     = NULL
  ) +
  theme_minimal() +
  theme(
    text               = element_text(family = "Times New Roman"),
    plot.title         = element_text(face = "bold", hjust = 0.5),
    legend.position    = "bottom",
    # legend.text = element_text(size = 8),
    panel.grid.major.y = element_blank()
  ) +
  guides(fill = guide_legend(nrow = 1, keywidth = 0.9, keyheight = 0.9))


# Getting rid of the prefer not to say options for the CLMs
priority_data_all_clm <- priority_data_all_clm[!grepl("prefer_not_to_say", priority_data_all_clm$final_group_continent, ignore.case = TRUE), ]
priority_data_ind_clm <- priority_data_ind_clm[!grepl("prefer_not_to_say", priority_data_ind_clm$final_group_continent, ignore.case = TRUE), ]

# Doing a clm with all the other demograhic questions
p.all.clm <- clm(priority.all ~ experience + 
      final_group_animal + 
      final_group_taxa +
      final_group_continent,
    data = priority_data_all_clm)
anova(p.all.clm)
# Nothing was significant


p.ind.clm <- clm(priority.individual ~ experience + 
      final_group_animal + 
      final_group_taxa +
      final_group_continent,
    data = priority_data_ind_clm)
anova(p.ind.clm)
# Animal was a signifcant factor with p=0.04280


# Looking at magnitude, direction and confidence intervals
ci <- exp(confint(p.ind.clm))
ci <- ci[!grepl("\\|", rownames(ci)), ]
odds_ratio_df <- data.frame(
  variable = rownames(ci),
  OR       = exp(coef(p.ind.clm)[!grepl("\\|", names(coef(p.ind.clm)))]),
  lower_CI = ci[, 1],
  upper_CI = ci[, 2]
)
odds_ratio_df[grepl("final_group_animal", odds_ratio_df$variable), ]
print(odds_ratio_df)

# Creating a seperate dataframe for the graph I want
conservation.answers <- first_dataframe[first_dataframe$work_absolute == "Conservation", 
                                        c(1, 49, 50)]
# This brings across the 45 conservation answers I received and the id, and two help/hindererd questions they answered.

#Changing the names to make it more managable
names(conservation.answers)[names(conservation.answers) == "Do.you.think.animal.welfare.has.helped.or.hindered.your.research.in.conservation."] <- "solo.help.hindered.conservation"
names(conservation.answers)[names(conservation.answers) == "Do.you.think.animal.welfare.has.helped.or.hindered.the.practice.of.conservation."] <- "all.help.hindered.conservation"



####### Making sure the data has no wierd spaces at the end
conservation.answers[2] <- lapply(conservation.answers[2], function(y) {
  y <- gsub("\u00A0", " ", y)   #replaces non-breaking space with normal space
  y <- trimws(y)               #remove leading/trailing spaces
  y
})

conservation.answers[3] <- lapply(conservation.answers[3], function(y) {
  y <- gsub("\u00A0", " ", y)   #replaces non-breaking space with normal space
  y <- trimws(y)               #remove leading/trailing spaces
  y
})
#######

help.hinder.data <- first_dataframe[,  c("id", "work_absolute", "experience", "final_group_animal", "final_group_taxa", "final_group_continent", "solo.help.hindered.conservation", "all.help.hindered.conservation")]

help.hinder.data <- subset(help.hinder.data, work_absolute == "Conservation")

####### Making sure the data has no wierd spaces at the end
help.hinder.data[7] <- lapply(help.hinder.data[7], function(y) {
  y <- gsub("\u00A0", " ", y)   #replaces non-breaking space with normal space
  y <- trimws(y)               #remove leading/trailing spaces
  y
})

help.hinder.data[8] <- lapply(help.hinder.data[8], function(y) {
  y <- gsub("\u00A0", " ", y)   #replaces non-breaking space with normal space
  y <- trimws(y)               #remove leading/trailing spaces
  y
})
#######

help.hinder.data.ind <- help.hinder.data
help.hinder.data.all <- help.hinder.data


# Should get rid of the two answers that won't work with a CLM
help.hinder.data.ind <- help.hinder.data.ind[
  !help.hinder.data.ind$solo.help.hindered.conservation %in% 
    c("Context dependant", "Do not know") &
  !is.na(help.hinder.data.ind$solo.help.hindered.conservation),]

help.hinder.data.all <- help.hinder.data.all[
  !help.hinder.data.all$all.help.hindered.conservation %in% 
    c("Context dependant", "Do not know") &
  !is.na(help.hinder.data.all$all.help.hindered.conservation),]

# Need to convert the likert data into ordered factor as that's what it is. However not sure what to do with the "Don't know" and "Context dependant" answers, might need to remove them.
help.hindered.levels.vec <- c("Significantly hindered",
                              "Slightly hindered",
                              "Neutral",
                              "Slightly helped",
                              "Significantly helped")

help.hinder.data.ind$solo.help.hindered.conservation <- factor(help.hinder.data.ind$solo.help.hindered.conservation,
                         levels = c(help.hindered.levels.vec),
                         ordered = TRUE) #Use to order the data

help.hinder.data.all$all.help.hindered.conservation <- factor(help.hinder.data.all$all.help.hindered.conservation,
                         levels = c(help.hindered.levels.vec),
                         ordered = TRUE) #Use to order the data

# Removing the prefer not to say answers
help.hinder.data.ind <- help.hinder.data.ind[!grepl("prefer_not_to_say", help.hinder.data.ind$final_group_continent, ignore.case = TRUE), ]
help.hinder.data.all <- help.hinder.data.all[!grepl("prefer_not_to_say", help.hinder.data.all$final_group_continent, ignore.case = TRUE), ]

# Doing a clm with all the other demograhic questions
# I have taken out continent on both as the group size for Americas are 0 and 1
h.h.ind.clm <- clm(solo.help.hindered.conservation ~ 
                     experience +
                     final_group_animal +
                     final_group_taxa,
                     # final_group_continent,
                   data = help.hinder.data.ind)
anova(h.h.ind.clm)
# Nothing was significant


h.h.all.clm <- clm(all.help.hindered.conservation ~ 
                     experience + 
                     final_group_animal + 
                     final_group_taxa,
                     # final_group_continent,
                     data = help.hinder.data.all)
anova(h.h.all.clm)
# Nothing was significant

# Making a Likert graph without the Context and Do not know answers
# Combining into one df, need to add NAs so they are the same size.

col1 <- help.hinder.data.ind$solo.help.hindered.conservation
col2 <- col2 <- factor(c(help.hinder.data.all$all.help.hindered.conservation, NA, NA),
               levels = levels(help.hinder.data.ind$solo.help.hindered.conservation),
               ordered = TRUE)

help.hinder.combined <- data.frame(col1, col2)

# Making a vector with the levels
help.hindered.levels.vec <- c("Significantly hindered",
                              "Slightly hindered",
                              "Neutral",
                              "Slightly helped",
                              "Significantly helped")

# Combining both the indivual and all parts
df_ind <- data.frame(
  bar    = "Individual work",
  answer = as.character(help.hinder.data.ind$solo.help.hindered.conservation)
)

df_all <- data.frame(
  bar    = "All conservation work",
  answer = as.character(help.hinder.data.all$all.help.hindered.conservation)
)

help.hinder.combined <- bind_rows(df_ind, df_all) %>%
  mutate(answer = factor(answer, levels = help.hindered.levels.vec, ordered = TRUE))

# Long data
long_data <- help.hinder.combined %>%
  filter(!is.na(answer)) %>%
  count(bar, answer) %>%
  group_by(bar) %>%
  mutate(prop = n / sum(n) * 100) %>%
  ungroup()

# Percent
label_data <- long_data %>%
  mutate(group = case_when(
    answer %in% c("Significantly hindered", "Slightly hindered") ~ "negative",
    answer == "Neutral"                                           ~ "neutral",
    answer %in% c("Slightly helped", "Significantly helped")     ~ "positive"
  )) %>%
  group_by(bar, group) %>%
  summarise(pct = sum(prop), .groups = "drop")

# Creating part of the graph
plot_data <- long_data %>%
  mutate(
    side = case_when(
      answer %in% c("Significantly hindered", "Slightly hindered") ~ "negative",
      answer == "Neutral"                                           ~ "neutral",
      answer %in% c("Slightly helped", "Significantly helped")     ~ "positive"
    )
  ) %>%
  group_by(bar) %>%
  mutate(neutral_half = prop[answer == "Neutral"] / 2) %>%
  ungroup() %>%
  group_by(bar, side) %>%
  mutate(
    answer_ordered = if_else(
      side == "negative",
      factor(as.character(answer),
             levels = c("Slightly hindered", "Significantly hindered")),
      factor(as.character(answer),
             levels = help.hindered.levels.vec)
    )
  ) %>%
  arrange(bar, side, answer_ordered) %>%
  mutate(
    cumulative     = cumsum(prop),
    cumulative_lag = lag(cumulative, default = 0)
  ) %>%
  ungroup() %>%
  mutate(
    xmin = case_when(
      side == "negative" ~ -(cumulative     + neutral_half),
      side == "neutral"  ~ -prop / 2,
      side == "positive" ~ cumulative_lag  + neutral_half
    ),
    xmax = case_when(
      side == "negative" ~ -(cumulative_lag + neutral_half),
      side == "neutral"  ~  prop / 2,
      side == "positive" ~ cumulative      + neutral_half
    ),
    bar = factor(bar, levels = c("Individual work", "All conservation work"))
  )

label_data <- label_data %>%
  mutate(bar = factor(bar, levels = c("Individual work", "All conservation work")))

# Colours
likert_colours <- c(
  "Significantly hindered" = "#C0392B",
  "Slightly hindered"      = "#E08080",
  "Neutral"                = "#D3D3D3",
  "Slightly helped"        = "#6080E0",
  "Significantly helped"   = "#2B3EC0"
)

# Final plot
ggplot(plot_data) +
  geom_rect(aes(xmin = xmin, xmax = xmax,
                ymin = as.numeric(bar) - 0.3,
                ymax = as.numeric(bar) + 0.3,
                fill = answer)) +
  geom_text(
    data = filter(label_data, group == "negative"),
    aes(x = -105, y = bar, label = paste0(round(pct), "%")),
    colour = "#C0392B", fontface = "bold", size = 3, hjust = 1
  ) +
  geom_text(
    data = filter(label_data, group == "positive"),
    aes(x = 105, y = bar, label = paste0(round(pct), "%")),
    colour = "#2B3EC0", fontface = "bold", size = 3, hjust = 0
  ) +
  geom_vline(xintercept = 0, linetype = "dashed", colour = "grey50") +
  scale_fill_manual(values = likert_colours, name = NULL,
                    breaks = help.hindered.levels.vec) +
  scale_x_continuous(
    breaks = c(-100, -50, 0, 50, 100),
    labels = c("100", "50", "0", "50", "100"),
    limits = c(-120, 120)
  ) +
  scale_y_discrete(limits = c("Individual work", "All conservation work")) +
  labs(
    x       = "Percentage",
    y       = NULL
  ) +
  theme_minimal() +
  theme(
    text               = element_text(family = "Times New Roman"),
    plot.title         = element_text(face = "bold", hjust = 0.5),
    legend.position    = "bottom",
    panel.grid.major.y = element_blank()
  ) +
  guides(fill = guide_legend(nrow = 2))