library(haven)
library(psych)
library(tidyverse)
library(writexl)
library(tidyverse)
library(ggcorrplot)
library(ggrepel)
# Read the .dta file and assign to eurob2
eurob2 <- read_dta("C:/Users/ricardo.branco/Downloads/ZA7602_v2-0-0.dta")
selected_vars <- c(
  "country", "d11", "qa1",                          # Demographics / Administrative
  "qa2_1", "qa2_2",                                 # QA2 (Environmental Seriousness/Perception)
  "qa3_1", "qa3_2", "qa3_3", "qa3_4", "qa3_5", "qa3_6", "qa3_7", "qa3_8", "qa3_9", "qa3_10",
  "qa4_1", "qa4_2", "qa4_3", "qa4_4", "qa4_5", "qa4_6", "qa4_7", "qa4_8", "qa4_9", "qa4_10", "qa4_11", "qa4_12", "qa4_13",
  "qa6_1", "qa6_2", "qa6_3", "qa6_4", "qa6_5", "qa6_6", "qa6_7", "qa6_8", "qa6_9", "qa6_10", "qa6_11", "qa6_12", "qa6_13", "qa6_14",
  "qa7_1", "qa7_2", "qa7_3", "qa7_4", "qa7_5", "qa7_6",
  "qa12_1", "qa12_2", "qa12_3", "qa12_4", "qa12_5",
  "qa13_1", "qa13_2", "qa13_3", "qa13_4", "qa13_5", "qa13_6",
  "qa14_1", "qa14_2", "qa14_3", "qa14_4"
)

# Reducing the number of variables for Analysis and clean all the answers with DK - Don't Kown
fa_eurob2 <- eurob2 %>%
  select(all_of(selected_vars)) %>% 
  filter(
    if_all(starts_with("qa7"), ~ . != 5),
    if_all(starts_with("qa12"), ~ . != 5),
    if_all(starts_with("qa13"), ~ . != 5),
    if_all(starts_with("qa14"), ~ . != 5)
  )


clean_fa_ebaro <- fa_eurob2 %>%
  select("qa7_1", "qa7_2", "qa7_3", "qa7_4", "qa7_5", "qa7_6",
  "qa12_1", "qa12_2", "qa12_3", "qa12_4", "qa12_5",
  "qa13_1", "qa13_2", "qa13_3", "qa13_4", "qa13_5", "qa13_6",
  "qa14_1", "qa14_2", "qa14_3", "qa14_4")
KMO_result <- KMO(clean_fa_ebaro) # data suitability for factor analysis
print(KMO_result)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = clean_fa_ebaro)
## Overall MSA =  0.91
## MSA for each item = 
##  qa7_1  qa7_2  qa7_3  qa7_4  qa7_5  qa7_6 qa12_1 qa12_2 qa12_3 qa12_4 qa12_5 
##   0.92   0.90   0.92   0.92   0.88   0.89   0.93   0.94   0.92   0.90   0.90 
## qa13_1 qa13_2 qa13_3 qa13_4 qa13_5 qa13_6 qa14_1 qa14_2 qa14_3 qa14_4 
##   0.94   0.93   0.91   0.93   0.94   0.59   0.93   0.91   0.95   0.78
scree(clean_fa_ebaro) # plot eigenvalues: number of factors considered important

# Existing models
models_01 <- list(
  pcf_result4 = principal(clean_fa_ebaro, nfactors = 4, rotate = "oblimin", method = "cov"),
  pcf_result3 = principal(clean_fa_ebaro, nfactors = 3, rotate = "oblimin", method = "cov"),
  pcf_result2 = principal(clean_fa_ebaro, nfactors = 2, rotate = "oblimin", method = "cov"),
  paf_result4 = fa(clean_fa_ebaro, nfactors = 4, rotate = "oblimin", fm = "pa"),
  paf_result3 = fa(clean_fa_ebaro, nfactors = 3, rotate = "oblimin", fm = "pa"),
  paf_result2 = fa(clean_fa_ebaro, nfactors = 2, rotate = "oblimin", fm = "pa")
)


# Extract metrics
comparison_01 <- data.frame(
  Model = names(models_01),
  Factors = sapply(models_01, function(x) ncol(x$loadings)),
  RMSR = sapply(models_01, function(x) if(!is.null(x$rms)) x$rms else NA),
  CumulativeVariance = sapply(models_01, function(x) if(!is.null(x$Vaccounted)) sum(x$Vaccounted["Proportion Var", ]) else NA),
  MeanCommunality = sapply(models_01, function(x) if(!is.null(x$communality)) mean(x$communality) else NA),
  LoadingsAbove0.7 = sapply(models_01, function(x) sum(abs(x$loadings) > 0.7))
)

print(comparison_01)
##                   Model Factors       RMSR CumulativeVariance MeanCommunality
## pcf_result4 pcf_result4       4 0.05990158          0.5092418       0.5092418
## pcf_result3 pcf_result3       3 0.05899353          0.4554137       0.4554137
## pcf_result2 pcf_result2       2 0.06549309          0.3812904       0.3812904
## paf_result4 paf_result4       4 0.02510545          0.3986021       0.3986021
## paf_result3 paf_result3       3 0.03400314          0.3704551       0.3704551
## paf_result2 paf_result2       2 0.05147832          0.3263483       0.3263483
##             LoadingsAbove0.7
## pcf_result4                9
## pcf_result3                7
## pcf_result2                6
## paf_result4                4
## paf_result3                4
## paf_result2                3
# Remove problematic variables
clean_fa_ebaro_v2 <- clean_fa_ebaro[, !names(clean_fa_ebaro) %in% 
  c("qa13_6", "qa14_4", "qa7_2", "qa14_2", "qa14_1", "qa12_3", "qa14_3", "qa13_4")]

#3 factors with oblique rotation (factors likely correlated)
paf_3factor <- fa(clean_fa_ebaro_v2, 
                   nfactors = 3, 
                   rotate = "oblimin",  # allows factors to correlate
                   fm = "pa")
print(paf_3factor, cut = 0.4, sort = TRUE)
## Factor Analysis using method =  pa
## Call: fa(r = clean_fa_ebaro_v2, nfactors = 3, rotate = "oblimin", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
##        item   PA1   PA2   PA3   h2   u2 com
## qa7_6     5  0.83             0.68 0.32 1.0
## qa7_5     4  0.79             0.60 0.40 1.0
## qa7_3     2  0.70             0.55 0.45 1.0
## qa7_4     3  0.67             0.52 0.48 1.1
## qa7_1     1  0.65             0.40 0.60 1.0
## qa12_5    9        0.78       0.58 0.42 1.0
## qa12_4    8        0.75       0.55 0.45 1.0
## qa12_1    6        0.54       0.38 0.62 1.1
## qa12_2    7        0.49       0.40 0.60 1.2
## qa13_3   12              0.61 0.32 0.68 1.0
## qa13_2   11              0.60 0.37 0.63 1.0
## qa13_5   13              0.46 0.37 0.63 1.3
## qa13_1   10              0.43 0.31 0.69 1.5
## 
##                        PA1  PA2  PA3
## SS loadings           2.79 1.94 1.28
## Proportion Var        0.21 0.15 0.10
## Cumulative Var        0.21 0.36 0.46
## Proportion Explained  0.46 0.32 0.21
## Cumulative Proportion 0.46 0.79 1.00
## 
##  With factor correlations of 
##      PA1  PA2  PA3
## PA1 1.00 0.48 0.45
## PA2 0.48 1.00 0.55
## PA3 0.45 0.55 1.00
## 
## Mean item complexity =  1.1
## Test of the hypothesis that 3 factors are sufficient.
## 
## df null model =  78  with the objective function =  4.37 with Chi Square =  89985.71
## df of  the model are 42  and the objective function was  0.16 
## 
## The root mean square of the residuals (RMSR) is  0.02 
## The df corrected root mean square of the residuals is  0.03 
## 
## The harmonic n.obs is  20590 with the empirical chi square  1719.03  with prob <  0 
## The total n.obs was  20590  with Likelihood Chi Square =  3364.7  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.931
## RMSEA index =  0.062  and the 90 % confidence intervals are  0.06 0.064
## BIC =  2947.54
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    PA1  PA2  PA3
## Correlation of (regression) scores with factors   0.93 0.90 0.83
## Multiple R square of scores with factors          0.87 0.81 0.69
## Minimum correlation of possible factor scores     0.74 0.62 0.39
# Compare with varimax
paf_3factor_v <- fa(clean_fa_ebaro_v2, 
                     nfactors = 3, 
                     rotate = "varimax", 
                     fm = "pa")
print(paf_3factor_v, cut = 0.4, sort = TRUE)
## Factor Analysis using method =  pa
## Call: fa(r = clean_fa_ebaro_v2, nfactors = 3, rotate = "varimax", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
##        item  PA1  PA2  PA3   h2   u2 com
## qa7_6     5 0.79           0.68 0.32 1.2
## qa7_5     4 0.75           0.60 0.40 1.1
## qa7_3     2 0.69           0.55 0.45 1.3
## qa7_4     3 0.66           0.52 0.48 1.4
## qa7_1     1 0.61           0.40 0.60 1.2
## qa12_5    9      0.72      0.58 0.42 1.2
## qa12_4    8      0.70      0.55 0.45 1.3
## qa12_1    6      0.54      0.38 0.62 1.6
## qa12_2    7      0.52      0.40 0.60 2.0
## qa13_2   11           0.56 0.37 0.63 1.3
## qa13_3   12           0.55 0.32 0.68 1.1
## qa13_5   13           0.48 0.37 0.63 2.1
## qa13_1   10           0.45 0.31 0.69 1.9
## 
##                        PA1  PA2  PA3
## SS loadings           2.71 1.96 1.35
## Proportion Var        0.21 0.15 0.10
## Cumulative Var        0.21 0.36 0.46
## Proportion Explained  0.45 0.33 0.22
## Cumulative Proportion 0.45 0.78 1.00
## 
## Mean item complexity =  1.4
## Test of the hypothesis that 3 factors are sufficient.
## 
## df null model =  78  with the objective function =  4.37 with Chi Square =  89985.71
## df of  the model are 42  and the objective function was  0.16 
## 
## The root mean square of the residuals (RMSR) is  0.02 
## The df corrected root mean square of the residuals is  0.03 
## 
## The harmonic n.obs is  20590 with the empirical chi square  1719.03  with prob <  0 
## The total n.obs was  20590  with Likelihood Chi Square =  3364.7  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.931
## RMSEA index =  0.062  and the 90 % confidence intervals are  0.06 0.064
## BIC =  2947.54
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    PA1  PA2  PA3
## Correlation of (regression) scores with factors   0.91 0.84 0.75
## Multiple R square of scores with factors          0.82 0.71 0.56
## Minimum correlation of possible factor scores     0.64 0.42 0.12
# Existing models
models_02 <- list(
  pcf_result4 = principal(clean_fa_ebaro, nfactors = 4, rotate = "oblimin", method = "cov"),
  pcf_result3 = principal(clean_fa_ebaro, nfactors = 3, rotate = "oblimin", method = "cov"),
  pcf_result2 = principal(clean_fa_ebaro, nfactors = 2, rotate = "oblimin", method = "cov"),
  paf_result4 = fa(clean_fa_ebaro, nfactors = 4, rotate = "oblimin", fm = "pa"),
  paf_result3 = fa(clean_fa_ebaro, nfactors = 3, rotate = "oblimin", fm = "pa"),
  paf_result2 = fa(clean_fa_ebaro, nfactors = 2, rotate = "oblimin", fm = "pa"),
  paf_3factor = fa(clean_fa_ebaro_v2, nfactors = 3, rotate = "oblimin", fm = "pa")
)


# Extract metrics
comparison2 <- data.frame(
  Model = names(models_02),
  Factors = sapply(models_02, function(x) ncol(x$loadings)),
  RMSR = sapply(models_02, function(x) if(!is.null(x$rms)) x$rms else NA),
  CumulativeVariance = sapply(models_02, function(x) if(!is.null(x$Vaccounted)) sum(x$Vaccounted["Proportion Var", ]) else NA),
  MeanCommunality = sapply(models_02, function(x) if(!is.null(x$communality)) mean(x$communality) else NA),
  LoadingsAbove0.7 = sapply(models_02, function(x) sum(abs(x$loadings) > 0.7))
)

print(comparison2)
##                   Model Factors       RMSR CumulativeVariance MeanCommunality
## pcf_result4 pcf_result4       4 0.05990158          0.5092418       0.5092418
## pcf_result3 pcf_result3       3 0.05899353          0.4554137       0.4554137
## pcf_result2 pcf_result2       2 0.06549309          0.3812904       0.3812904
## paf_result4 paf_result4       4 0.02510545          0.3986021       0.3986021
## paf_result3 paf_result3       3 0.03400314          0.3704551       0.3704551
## paf_result2 paf_result2       2 0.05147832          0.3263483       0.3263483
## paf_3factor paf_3factor       3 0.02313400          0.4632199       0.4632199
##             LoadingsAbove0.7
## pcf_result4                9
## pcf_result3                7
## pcf_result2                6
## paf_result4                4
## paf_result3                4
## paf_result2                3
## paf_3factor                5

Final Model

Factor 1 – Environmental Awareness

  • Variables: qa7_1, qa7_3, qa7_4, qa7_5, qa7_6
  • Description: Worries about plastics, microplastics, chemicals
    Emotional / affective dimension

Factor 2 – Systemic Solution

  • Variables: qa12_1, qa12_2, qa12_4, qa12_5
  • Description: Industry effort, product design, collection facilities, education
    Systemic and structural approaches to environmental problems

Factor 3 – Ethical Consumption Values

  • Variables: qa13_1, qa13_2, qa13_3, qa13_5
  • Description: Clothing durability, recyclability, second-hand, transparency labels
    Values-based purchasing decisions

Factor Correlations

  • Correlations: 0.45–0.55 (moderate, justifying oblique rotation)
  • PA1 ↔︎ PA3 (r = 0.61): People who support systemic solutions also value ethical consumption
  • PA2 ↔︎ PA1 (r = 0.47): Environmental concern drives support for structural solutions
  • PA2 ↔︎ PA3 (r = 0.48): Environmental concern drives ethical values
# Calculate Country Positions on Environmental Attitudes
# Using your final model - 3 dimensions
clean_fa_final <- clean_fa_ebaro %>%
  select(qa7_1, qa7_3, qa7_4, qa7_5, qa7_6,        # PA1: Environmental Concern
         qa12_1, qa12_2, qa12_4, qa12_5,           # PA2: Systemic Solutions
         qa13_1, qa13_2, qa13_3, qa13_5)           # PA3: Ethical Consumption

# Extract factor scores
factor_scores <- factor.scores(clean_fa_final, paf_3factor)
scores_df <- as.data.frame(factor_scores$scores)
names(scores_df) <- c("Env_Concern", "Systemic_Solutions", "Ethical_Consumption")

# Step 2: Add country information
scores_df$country <- fa_eurob2$country

# Remove rows with missing data
scores_df_complete <- scores_df %>%
  filter(complete.cases(.))

# Step 3: Calculate country-level means
country_scores <- scores_df_complete %>%
  group_by(country) %>%
  summarise(
    Env_Concern_mean = mean(Env_Concern, na.rm = TRUE),
    Ethical_Consumption_mean = mean(Ethical_Consumption, na.rm = TRUE),
    n = n()
  ) %>%
  ungroup()

# Step 4: Add country labels
country_labels <- c(
  `1` = "FR", `2` = "BE", `3` = "NL", `4` = "DE-W", `5` = "IT",
  `6` = "LU", `7` = "DK", `8` = "IE", `9` = "GB", `11` = "GR",
  `12` = "ES", `13` = "PT", `14` = "DE-E", `16` = "FI", `17` = "SE",
  `18` = "AT", `19` = "CY", `20` = "CZ", `21` = "EE", `22` = "HU",
  `23` = "LV", `24` = "LT", `25` = "MT", `26` = "PL", `27` = "SK",
  `28` = "SI", `29` = "BG", `30` = "RO", `32` = "HR"
)

country_scores <- country_scores %>%
  mutate(country_label = country_labels[as.character(country)])

# Step 5: Create the visualization
ggplot(country_scores, aes(x = Env_Concern_mean, y = Ethical_Consumption_mean)) +
  # Add quadrant lines at zero
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50", alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray50", alpha = 0.5) +
  
  # Add points
  geom_point(aes(size = n), color = "#2C5F8D", alpha = 0.6) +
  
  # Add country labels with ggrepel to avoid overlap
  geom_text_repel(
    aes(label = country_label),
    size = 3.5,
    box.padding = 0.5,
    point.padding = 0.3,
    segment.color = "gray70",
    segment.size = 0.3,
    max.overlaps = 30
  ) +
  
  # Styling
  scale_size_continuous(range = c(3, 8), name = "Sample Size") +
  labs(
    title = "Country Positions on Environmental Attitudes",
    subtitle = "Factor 1 (Environmental Concern) vs Factor 3 (Ethical Consumption Values)",
    x = "Environmental Concern (PA1)\n← Lower concern | Higher concern →",
    y = "Ethical Consumption Values (PA3)\n← Lower values | Higher values →",
    caption = "Note: Positions based on country mean factor scores"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 11, color = "gray30"),
    panel.grid.minor = element_blank(),
    legend.position = "bottom",
    plot.caption = element_text(hjust = 0, color = "gray50", size = 9)
  )

## 
## === Countries with HIGHEST Environmental Concern ===
## # A tibble: 5 × 2
##   country_label Env_Concern_mean
##   <chr>                    <dbl>
## 1 AT                       0.327
## 2 FI                       0.267
## 3 BE                       0.257
## 4 DE-E                     0.207
## 5 DE-W                     0.206
## 
## === Countries with HIGHEST Ethical Consumption Values ===
## # A tibble: 5 × 2
##   country_label Ethical_Consumption_mean
##   <chr>                            <dbl>
## 1 EE                               0.468
## 2 DE-E                             0.465
## 3 CZ                               0.396
## 4 AT                               0.306
## 5 LV                               0.297
## 
## === Countries with LOWEST Environmental Concern ===
## # A tibble: 5 × 2
##   country_label Env_Concern_mean
##   <chr>                    <dbl>
## 1 CY                      -0.732
## 2 ES                      -0.402
## 3 GR                      -0.385
## 4 MT                      -0.380
## 5 BG                      -0.268
## 
## Country-level correlation between PA1 and PA3: r = 0.568
# Alternative visualization - with regional groupings
# Define regional groups - according Eurostat's standard macro-regions
country_scores <- country_scores %>%
  mutate(
    region = case_when(
      country_label %in% c("DK", "SE", "FI", "EE", "LV", "LT", "IE") ~ "Northern Europe",
      country_label %in% c("BE", "FR", "NL", "LU", "DE", "AT", "GB") ~ "Western Europe",
      country_label %in% c("PT", "ES", "IT", "GR", "CY", "MT") ~ "Southern Europe",
      country_label %in% c("PL", "CZ", "SK", "HU", "RO", "BG") ~ "Eastern Europe",
      TRUE ~ "Other"
)
  )

# Colored by region
ggplot(country_scores, aes(x = Env_Concern_mean, y = Ethical_Consumption_mean, color = region)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50", alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray50", alpha = 0.5) +
  geom_point(aes(size = n), alpha = 0.7) +
  geom_text_repel(
    aes(label = country_label),
    size = 3,
    box.padding = 0.5,
    max.overlaps = 30
  ) +
  scale_color_brewer(palette = "Set2", name = "Region") +
  scale_size_continuous(range = c(3, 8), name = "Sample Size") +
  labs(
    title = "Country Positions on Environmental Attitudes by Region",
    subtitle = "Factor 1 (Environmental Concern) vs Factor 3 (Ethical Consumption Values)",
    x = "Environmental Concern (PA1)\n← Lower concern | Higher concern →",
    y = "Ethical Consumption Values (PA3)\n← Lower values | Higher values →"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    panel.grid.minor = element_blank(),
    legend.position = "right"
  )