library(haven)
library(psych)
library(tidyverse)
library(writexl)
library(tidyverse)
library(ggcorrplot)
library(ggrepel)
# Read the .dta file and assign to eurob2
eurob2 <- read_dta("C:/Users/ricardo.branco/Downloads/ZA7602_v2-0-0.dta")
selected_vars <- c(
"country", "d11", "qa1", # Demographics / Administrative
"qa2_1", "qa2_2", # QA2 (Environmental Seriousness/Perception)
"qa3_1", "qa3_2", "qa3_3", "qa3_4", "qa3_5", "qa3_6", "qa3_7", "qa3_8", "qa3_9", "qa3_10",
"qa4_1", "qa4_2", "qa4_3", "qa4_4", "qa4_5", "qa4_6", "qa4_7", "qa4_8", "qa4_9", "qa4_10", "qa4_11", "qa4_12", "qa4_13",
"qa6_1", "qa6_2", "qa6_3", "qa6_4", "qa6_5", "qa6_6", "qa6_7", "qa6_8", "qa6_9", "qa6_10", "qa6_11", "qa6_12", "qa6_13", "qa6_14",
"qa7_1", "qa7_2", "qa7_3", "qa7_4", "qa7_5", "qa7_6",
"qa12_1", "qa12_2", "qa12_3", "qa12_4", "qa12_5",
"qa13_1", "qa13_2", "qa13_3", "qa13_4", "qa13_5", "qa13_6",
"qa14_1", "qa14_2", "qa14_3", "qa14_4"
)
# Reducing the number of variables for Analysis and clean all the answers with DK - Don't Kown
fa_eurob2 <- eurob2 %>%
select(all_of(selected_vars)) %>%
filter(
if_all(starts_with("qa7"), ~ . != 5),
if_all(starts_with("qa12"), ~ . != 5),
if_all(starts_with("qa13"), ~ . != 5),
if_all(starts_with("qa14"), ~ . != 5)
)
clean_fa_ebaro <- fa_eurob2 %>%
select("qa7_1", "qa7_2", "qa7_3", "qa7_4", "qa7_5", "qa7_6",
"qa12_1", "qa12_2", "qa12_3", "qa12_4", "qa12_5",
"qa13_1", "qa13_2", "qa13_3", "qa13_4", "qa13_5", "qa13_6",
"qa14_1", "qa14_2", "qa14_3", "qa14_4")
KMO_result <- KMO(clean_fa_ebaro) # data suitability for factor analysis
print(KMO_result)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = clean_fa_ebaro)
## Overall MSA = 0.91
## MSA for each item =
## qa7_1 qa7_2 qa7_3 qa7_4 qa7_5 qa7_6 qa12_1 qa12_2 qa12_3 qa12_4 qa12_5
## 0.92 0.90 0.92 0.92 0.88 0.89 0.93 0.94 0.92 0.90 0.90
## qa13_1 qa13_2 qa13_3 qa13_4 qa13_5 qa13_6 qa14_1 qa14_2 qa14_3 qa14_4
## 0.94 0.93 0.91 0.93 0.94 0.59 0.93 0.91 0.95 0.78
scree(clean_fa_ebaro) # plot eigenvalues: number of factors considered important

# Existing models
models_01 <- list(
pcf_result4 = principal(clean_fa_ebaro, nfactors = 4, rotate = "oblimin", method = "cov"),
pcf_result3 = principal(clean_fa_ebaro, nfactors = 3, rotate = "oblimin", method = "cov"),
pcf_result2 = principal(clean_fa_ebaro, nfactors = 2, rotate = "oblimin", method = "cov"),
paf_result4 = fa(clean_fa_ebaro, nfactors = 4, rotate = "oblimin", fm = "pa"),
paf_result3 = fa(clean_fa_ebaro, nfactors = 3, rotate = "oblimin", fm = "pa"),
paf_result2 = fa(clean_fa_ebaro, nfactors = 2, rotate = "oblimin", fm = "pa")
)
# Extract metrics
comparison_01 <- data.frame(
Model = names(models_01),
Factors = sapply(models_01, function(x) ncol(x$loadings)),
RMSR = sapply(models_01, function(x) if(!is.null(x$rms)) x$rms else NA),
CumulativeVariance = sapply(models_01, function(x) if(!is.null(x$Vaccounted)) sum(x$Vaccounted["Proportion Var", ]) else NA),
MeanCommunality = sapply(models_01, function(x) if(!is.null(x$communality)) mean(x$communality) else NA),
LoadingsAbove0.7 = sapply(models_01, function(x) sum(abs(x$loadings) > 0.7))
)
print(comparison_01)
## Model Factors RMSR CumulativeVariance MeanCommunality
## pcf_result4 pcf_result4 4 0.05990158 0.5092418 0.5092418
## pcf_result3 pcf_result3 3 0.05899353 0.4554137 0.4554137
## pcf_result2 pcf_result2 2 0.06549309 0.3812904 0.3812904
## paf_result4 paf_result4 4 0.02510545 0.3986021 0.3986021
## paf_result3 paf_result3 3 0.03400314 0.3704551 0.3704551
## paf_result2 paf_result2 2 0.05147832 0.3263483 0.3263483
## LoadingsAbove0.7
## pcf_result4 9
## pcf_result3 7
## pcf_result2 6
## paf_result4 4
## paf_result3 4
## paf_result2 3
# Remove problematic variables
clean_fa_ebaro_v2 <- clean_fa_ebaro[, !names(clean_fa_ebaro) %in%
c("qa13_6", "qa14_4", "qa7_2", "qa14_2", "qa14_1", "qa12_3", "qa14_3", "qa13_4")]
#3 factors with oblique rotation (factors likely correlated)
paf_3factor <- fa(clean_fa_ebaro_v2,
nfactors = 3,
rotate = "oblimin", # allows factors to correlate
fm = "pa")
print(paf_3factor, cut = 0.4, sort = TRUE)
## Factor Analysis using method = pa
## Call: fa(r = clean_fa_ebaro_v2, nfactors = 3, rotate = "oblimin", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
## item PA1 PA2 PA3 h2 u2 com
## qa7_6 5 0.83 0.68 0.32 1.0
## qa7_5 4 0.79 0.60 0.40 1.0
## qa7_3 2 0.70 0.55 0.45 1.0
## qa7_4 3 0.67 0.52 0.48 1.1
## qa7_1 1 0.65 0.40 0.60 1.0
## qa12_5 9 0.78 0.58 0.42 1.0
## qa12_4 8 0.75 0.55 0.45 1.0
## qa12_1 6 0.54 0.38 0.62 1.1
## qa12_2 7 0.49 0.40 0.60 1.2
## qa13_3 12 0.61 0.32 0.68 1.0
## qa13_2 11 0.60 0.37 0.63 1.0
## qa13_5 13 0.46 0.37 0.63 1.3
## qa13_1 10 0.43 0.31 0.69 1.5
##
## PA1 PA2 PA3
## SS loadings 2.79 1.94 1.28
## Proportion Var 0.21 0.15 0.10
## Cumulative Var 0.21 0.36 0.46
## Proportion Explained 0.46 0.32 0.21
## Cumulative Proportion 0.46 0.79 1.00
##
## With factor correlations of
## PA1 PA2 PA3
## PA1 1.00 0.48 0.45
## PA2 0.48 1.00 0.55
## PA3 0.45 0.55 1.00
##
## Mean item complexity = 1.1
## Test of the hypothesis that 3 factors are sufficient.
##
## df null model = 78 with the objective function = 4.37 with Chi Square = 89985.71
## df of the model are 42 and the objective function was 0.16
##
## The root mean square of the residuals (RMSR) is 0.02
## The df corrected root mean square of the residuals is 0.03
##
## The harmonic n.obs is 20590 with the empirical chi square 1719.03 with prob < 0
## The total n.obs was 20590 with Likelihood Chi Square = 3364.7 with prob < 0
##
## Tucker Lewis Index of factoring reliability = 0.931
## RMSEA index = 0.062 and the 90 % confidence intervals are 0.06 0.064
## BIC = 2947.54
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## PA1 PA2 PA3
## Correlation of (regression) scores with factors 0.93 0.90 0.83
## Multiple R square of scores with factors 0.87 0.81 0.69
## Minimum correlation of possible factor scores 0.74 0.62 0.39
# Compare with varimax
paf_3factor_v <- fa(clean_fa_ebaro_v2,
nfactors = 3,
rotate = "varimax",
fm = "pa")
print(paf_3factor_v, cut = 0.4, sort = TRUE)
## Factor Analysis using method = pa
## Call: fa(r = clean_fa_ebaro_v2, nfactors = 3, rotate = "varimax", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
## item PA1 PA2 PA3 h2 u2 com
## qa7_6 5 0.79 0.68 0.32 1.2
## qa7_5 4 0.75 0.60 0.40 1.1
## qa7_3 2 0.69 0.55 0.45 1.3
## qa7_4 3 0.66 0.52 0.48 1.4
## qa7_1 1 0.61 0.40 0.60 1.2
## qa12_5 9 0.72 0.58 0.42 1.2
## qa12_4 8 0.70 0.55 0.45 1.3
## qa12_1 6 0.54 0.38 0.62 1.6
## qa12_2 7 0.52 0.40 0.60 2.0
## qa13_2 11 0.56 0.37 0.63 1.3
## qa13_3 12 0.55 0.32 0.68 1.1
## qa13_5 13 0.48 0.37 0.63 2.1
## qa13_1 10 0.45 0.31 0.69 1.9
##
## PA1 PA2 PA3
## SS loadings 2.71 1.96 1.35
## Proportion Var 0.21 0.15 0.10
## Cumulative Var 0.21 0.36 0.46
## Proportion Explained 0.45 0.33 0.22
## Cumulative Proportion 0.45 0.78 1.00
##
## Mean item complexity = 1.4
## Test of the hypothesis that 3 factors are sufficient.
##
## df null model = 78 with the objective function = 4.37 with Chi Square = 89985.71
## df of the model are 42 and the objective function was 0.16
##
## The root mean square of the residuals (RMSR) is 0.02
## The df corrected root mean square of the residuals is 0.03
##
## The harmonic n.obs is 20590 with the empirical chi square 1719.03 with prob < 0
## The total n.obs was 20590 with Likelihood Chi Square = 3364.7 with prob < 0
##
## Tucker Lewis Index of factoring reliability = 0.931
## RMSEA index = 0.062 and the 90 % confidence intervals are 0.06 0.064
## BIC = 2947.54
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## PA1 PA2 PA3
## Correlation of (regression) scores with factors 0.91 0.84 0.75
## Multiple R square of scores with factors 0.82 0.71 0.56
## Minimum correlation of possible factor scores 0.64 0.42 0.12
# Existing models
models_02 <- list(
pcf_result4 = principal(clean_fa_ebaro, nfactors = 4, rotate = "oblimin", method = "cov"),
pcf_result3 = principal(clean_fa_ebaro, nfactors = 3, rotate = "oblimin", method = "cov"),
pcf_result2 = principal(clean_fa_ebaro, nfactors = 2, rotate = "oblimin", method = "cov"),
paf_result4 = fa(clean_fa_ebaro, nfactors = 4, rotate = "oblimin", fm = "pa"),
paf_result3 = fa(clean_fa_ebaro, nfactors = 3, rotate = "oblimin", fm = "pa"),
paf_result2 = fa(clean_fa_ebaro, nfactors = 2, rotate = "oblimin", fm = "pa"),
paf_3factor = fa(clean_fa_ebaro_v2, nfactors = 3, rotate = "oblimin", fm = "pa")
)
# Extract metrics
comparison2 <- data.frame(
Model = names(models_02),
Factors = sapply(models_02, function(x) ncol(x$loadings)),
RMSR = sapply(models_02, function(x) if(!is.null(x$rms)) x$rms else NA),
CumulativeVariance = sapply(models_02, function(x) if(!is.null(x$Vaccounted)) sum(x$Vaccounted["Proportion Var", ]) else NA),
MeanCommunality = sapply(models_02, function(x) if(!is.null(x$communality)) mean(x$communality) else NA),
LoadingsAbove0.7 = sapply(models_02, function(x) sum(abs(x$loadings) > 0.7))
)
print(comparison2)
## Model Factors RMSR CumulativeVariance MeanCommunality
## pcf_result4 pcf_result4 4 0.05990158 0.5092418 0.5092418
## pcf_result3 pcf_result3 3 0.05899353 0.4554137 0.4554137
## pcf_result2 pcf_result2 2 0.06549309 0.3812904 0.3812904
## paf_result4 paf_result4 4 0.02510545 0.3986021 0.3986021
## paf_result3 paf_result3 3 0.03400314 0.3704551 0.3704551
## paf_result2 paf_result2 2 0.05147832 0.3263483 0.3263483
## paf_3factor paf_3factor 3 0.02313400 0.4632199 0.4632199
## LoadingsAbove0.7
## pcf_result4 9
## pcf_result3 7
## pcf_result2 6
## paf_result4 4
## paf_result3 4
## paf_result2 3
## paf_3factor 5
Final Model
- 13 variables, 3 factors
- Principal Axis Factoring with oblimin
rotation
- Removed 8 problematic variables from original
21
- Fit: RMSR = 0.02
Factor 1 – Environmental Awareness
- Variables: qa7_1, qa7_3, qa7_4, qa7_5, qa7_6
- Description: Worries about plastics, microplastics,
chemicals
Emotional / affective dimension
Factor 2 – Systemic Solution
- Variables: qa12_1, qa12_2, qa12_4, qa12_5
- Description: Industry effort, product design,
collection facilities, education
Systemic and structural approaches to environmental problems
Factor 3 – Ethical Consumption Values
- Variables: qa13_1, qa13_2, qa13_3, qa13_5
- Description: Clothing durability, recyclability,
second-hand, transparency labels
Values-based purchasing decisions
Factor Correlations
- Correlations: 0.45–0.55 (moderate, justifying
oblique rotation)
- PA1 ↔︎ PA3 (r = 0.61): People who support systemic solutions also
value ethical consumption
- PA2 ↔︎ PA1 (r = 0.47): Environmental concern drives support for
structural solutions
- PA2 ↔︎ PA3 (r = 0.48): Environmental concern drives ethical
values
# Calculate Country Positions on Environmental Attitudes
# Using your final model - 3 dimensions
clean_fa_final <- clean_fa_ebaro %>%
select(qa7_1, qa7_3, qa7_4, qa7_5, qa7_6, # PA1: Environmental Concern
qa12_1, qa12_2, qa12_4, qa12_5, # PA2: Systemic Solutions
qa13_1, qa13_2, qa13_3, qa13_5) # PA3: Ethical Consumption
# Extract factor scores
factor_scores <- factor.scores(clean_fa_final, paf_3factor)
scores_df <- as.data.frame(factor_scores$scores)
names(scores_df) <- c("Env_Concern", "Systemic_Solutions", "Ethical_Consumption")
# Step 2: Add country information
scores_df$country <- fa_eurob2$country
# Remove rows with missing data
scores_df_complete <- scores_df %>%
filter(complete.cases(.))
# Step 3: Calculate country-level means
country_scores <- scores_df_complete %>%
group_by(country) %>%
summarise(
Env_Concern_mean = mean(Env_Concern, na.rm = TRUE),
Ethical_Consumption_mean = mean(Ethical_Consumption, na.rm = TRUE),
n = n()
) %>%
ungroup()
# Step 4: Add country labels
country_labels <- c(
`1` = "FR", `2` = "BE", `3` = "NL", `4` = "DE-W", `5` = "IT",
`6` = "LU", `7` = "DK", `8` = "IE", `9` = "GB", `11` = "GR",
`12` = "ES", `13` = "PT", `14` = "DE-E", `16` = "FI", `17` = "SE",
`18` = "AT", `19` = "CY", `20` = "CZ", `21` = "EE", `22` = "HU",
`23` = "LV", `24` = "LT", `25` = "MT", `26` = "PL", `27` = "SK",
`28` = "SI", `29` = "BG", `30` = "RO", `32` = "HR"
)
country_scores <- country_scores %>%
mutate(country_label = country_labels[as.character(country)])
# Step 5: Create the visualization
ggplot(country_scores, aes(x = Env_Concern_mean, y = Ethical_Consumption_mean)) +
# Add quadrant lines at zero
geom_hline(yintercept = 0, linetype = "dashed", color = "gray50", alpha = 0.5) +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray50", alpha = 0.5) +
# Add points
geom_point(aes(size = n), color = "#2C5F8D", alpha = 0.6) +
# Add country labels with ggrepel to avoid overlap
geom_text_repel(
aes(label = country_label),
size = 3.5,
box.padding = 0.5,
point.padding = 0.3,
segment.color = "gray70",
segment.size = 0.3,
max.overlaps = 30
) +
# Styling
scale_size_continuous(range = c(3, 8), name = "Sample Size") +
labs(
title = "Country Positions on Environmental Attitudes",
subtitle = "Factor 1 (Environmental Concern) vs Factor 3 (Ethical Consumption Values)",
x = "Environmental Concern (PA1)\n← Lower concern | Higher concern →",
y = "Ethical Consumption Values (PA3)\n← Lower values | Higher values →",
caption = "Note: Positions based on country mean factor scores"
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 11, color = "gray30"),
panel.grid.minor = element_blank(),
legend.position = "bottom",
plot.caption = element_text(hjust = 0, color = "gray50", size = 9)
)

##
## === Countries with HIGHEST Environmental Concern ===
## # A tibble: 5 × 2
## country_label Env_Concern_mean
## <chr> <dbl>
## 1 AT 0.327
## 2 FI 0.267
## 3 BE 0.257
## 4 DE-E 0.207
## 5 DE-W 0.206
##
## === Countries with HIGHEST Ethical Consumption Values ===
## # A tibble: 5 × 2
## country_label Ethical_Consumption_mean
## <chr> <dbl>
## 1 EE 0.468
## 2 DE-E 0.465
## 3 CZ 0.396
## 4 AT 0.306
## 5 LV 0.297
##
## === Countries with LOWEST Environmental Concern ===
## # A tibble: 5 × 2
## country_label Env_Concern_mean
## <chr> <dbl>
## 1 CY -0.732
## 2 ES -0.402
## 3 GR -0.385
## 4 MT -0.380
## 5 BG -0.268
##
## Country-level correlation between PA1 and PA3: r = 0.568
# Alternative visualization - with regional groupings
# Define regional groups - according Eurostat's standard macro-regions
country_scores <- country_scores %>%
mutate(
region = case_when(
country_label %in% c("DK", "SE", "FI", "EE", "LV", "LT", "IE") ~ "Northern Europe",
country_label %in% c("BE", "FR", "NL", "LU", "DE", "AT", "GB") ~ "Western Europe",
country_label %in% c("PT", "ES", "IT", "GR", "CY", "MT") ~ "Southern Europe",
country_label %in% c("PL", "CZ", "SK", "HU", "RO", "BG") ~ "Eastern Europe",
TRUE ~ "Other"
)
)
# Colored by region
ggplot(country_scores, aes(x = Env_Concern_mean, y = Ethical_Consumption_mean, color = region)) +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray50", alpha = 0.5) +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray50", alpha = 0.5) +
geom_point(aes(size = n), alpha = 0.7) +
geom_text_repel(
aes(label = country_label),
size = 3,
box.padding = 0.5,
max.overlaps = 30
) +
scale_color_brewer(palette = "Set2", name = "Region") +
scale_size_continuous(range = c(3, 8), name = "Sample Size") +
labs(
title = "Country Positions on Environmental Attitudes by Region",
subtitle = "Factor 1 (Environmental Concern) vs Factor 3 (Ethical Consumption Values)",
x = "Environmental Concern (PA1)\n← Lower concern | Higher concern →",
y = "Ethical Consumption Values (PA3)\n← Lower values | Higher values →"
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 14),
panel.grid.minor = element_blank(),
legend.position = "right"
)
