# Load individual country files
sweden_data <- read_fst("sweden_data.fst")
poland_data <- read_fst("poland_data.fst")

# Remove rows where both 'netusoft' and 'wrclmch' are NA
sweden_filtered <- sweden_data %>%
  filter(!is.na(netusoft) & !is.na(wrclmch))

poland_filtered <- poland_data %>%
  filter(!is.na(netusoft) & !is.na(wrclmch))
clean_and_label <- function(data) {
  data %>%
    # Initial filtering of raw NAs
    filter(!is.na(netusoft) & !is.na(wrclmch)) %>%
    
    # Apply labels
    mutate(
      wrclmch = recode(as.character(wrclmch), !!!climate_labels),
      netusoft = recode(as.character(netusoft), !!!internet_labels)
    ) %>%
    
    # Remove NAs introduced by recoding
    filter(!is.na(netusoft) & !is.na(wrclmch)) %>%
    
    # Convert to factors (to maintain order)
    mutate(
      wrclmch = factor(wrclmch, levels = climate_labels[1:5]),
      netusoft = factor(netusoft, levels = internet_labels[1:5])
    ) %>%
    
    # Group and count
    group_by(netusoft, wrclmch) %>%
    summarise(count = n(), .groups = "drop")
}
# Define labels for WRCLMCH (Climate Concern)
climate_labels <- c(
  "1" = "Not at all worried",
  "2" = "Not very worried",
  "3" = "Somewhat worried",
  "4" = "Very worried",
  "5" = "Extremely worried",
  "6" = NA,  # Unknown
  "7" = NA,  # Other/Not applicable
  "8" = NA,  # Don't know
  "9" = NA   # Refused
)

# Define labels for NETUSOFT (Internet Use Frequency)
internet_labels <- c(
  "1" = "Never",
  "2" = "Rarely",
  "3" = "Few times a month",
  "4" = "Few times a week",
  "5" = "Daily",
  "6" = NA,
  "7" = NA,
  "8" = NA,
  "9" = NA
)

# Function to clean, label, and group data
clean_and_label <- function(data) {
  data %>%
    # Remove NA values
    filter(!is.na(netusoft) & !is.na(wrclmch)) %>%
    
    # Apply labels
    mutate(
      wrclmch = recode(as.character(wrclmch), !!!climate_labels),
      netusoft = recode(as.character(netusoft), !!!internet_labels)
    ) %>%
    
    # Convert to factors (to maintain order)
    mutate(
      wrclmch = factor(wrclmch, levels = climate_labels),
      netusoft = factor(netusoft, levels = internet_labels)
    ) %>%
    
    # Group by labeled variables and count occurrences
    group_by(netusoft, wrclmch) %>%
    summarise(count = n(), .groups = "drop")
}

# Apply function to Sweden and Poland data
sweden_cleaned <- clean_and_label(sweden_data)
poland_cleaned <- clean_and_label(poland_data)

# Print cleaned and labeled data
print(sweden_cleaned)
## # A tibble: 34 × 3
##    netusoft wrclmch            count
##    <fct>    <fct>              <int>
##  1 Never    Not at all worried    20
##  2 Never    Not very worried      62
##  3 Never    Somewhat worried      65
##  4 Never    Very worried          35
##  5 Never    Extremely worried      6
##  6 Never    <NA>                   6
##  7 Rarely   Not at all worried     7
##  8 Rarely   Not very worried      31
##  9 Rarely   Somewhat worried      54
## 10 Rarely   Very worried          17
## # ℹ 24 more rows
print(poland_cleaned)
## # A tibble: 36 × 3
##    netusoft wrclmch            count
##    <fct>    <fct>              <int>
##  1 Never    Not at all worried    77
##  2 Never    Not very worried     174
##  3 Never    Somewhat worried     292
##  4 Never    Very worried         120
##  5 Never    Extremely worried     14
##  6 Never    <NA>                  48
##  7 Rarely   Not at all worried    19
##  8 Rarely   Not very worried      53
##  9 Rarely   Somewhat worried     152
## 10 Rarely   Very worried          98
## # ℹ 26 more rows
clean_and_label <- function(data) {
  data %>%
    # Remove missing codes before recoding
    filter(!is.na(netusoft) & !is.na(wrclmch)) %>%
    
    # Apply labels
    mutate(
      wrclmch = recode(as.character(wrclmch), !!!climate_labels),
      netusoft = recode(as.character(netusoft), !!!internet_labels)
    ) %>%
    
    # Convert to factors (to maintain order)
    mutate(
      wrclmch = factor(wrclmch, levels = climate_labels),
      netusoft = factor(netusoft, levels = internet_labels)
    ) %>%
    
    # Remove any NA values introduced by recoding
    filter(!is.na(netusoft) & !is.na(wrclmch)) %>%
    
    # Group and count
    group_by(netusoft, wrclmch) %>%
    summarise(count = n(), .groups = "drop")
}
# Add country labels before binding
sweden_cleaned <- clean_and_label(sweden_data) %>%
  mutate(Country = "Sweden")

poland_cleaned <- clean_and_label(poland_data) %>%
  mutate(Country = "Poland")

# Combine into one dataframe
combined_cleaned <- bind_rows(sweden_cleaned, poland_cleaned)
combined_cleaned %>%
  group_by(Country, netusoft, wrclmch) %>%
  summarise(count = n(), .groups = "drop") %>%
  arrange(Country, netusoft, wrclmch)
## # A tibble: 50 × 4
##    Country netusoft wrclmch            count
##    <chr>   <fct>    <fct>              <int>
##  1 Poland  Never    Not at all worried     1
##  2 Poland  Never    Not very worried       1
##  3 Poland  Never    Somewhat worried       1
##  4 Poland  Never    Very worried           1
##  5 Poland  Never    Extremely worried      1
##  6 Poland  Rarely   Not at all worried     1
##  7 Poland  Rarely   Not very worried       1
##  8 Poland  Rarely   Somewhat worried       1
##  9 Poland  Rarely   Very worried           1
## 10 Poland  Rarely   Extremely worried      1
## # ℹ 40 more rows
ggplot(combined_cleaned, aes(x = wrclmch, y = count, fill = netusoft)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ Country) +
  labs(
    title =  "Climate Concern by Internet Use Frequency",
    x = "Climate Concern",
    y = "Count",
    fill = "Internet Use"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Summarize sample sizes used in Figure 1
sample_summary <- combined_cleaned %>%
  group_by(Country) %>%
  summarise(sample_size = sum(count), .groups = "drop")

# Print sample sizes clearly
cat("Figure 1 Sample Summary:\n\n")
## Figure 1 Sample Summary:
sample_summary %>%
  mutate(label = paste0(Country, " sample size: ", sample_size)) %>%
  pull(label) %>%
  cat(sep = "\n")
## Poland sample size: 3568
## Sweden sample size: 3779
# Optional: calculate and display total
total_sample <- sum(sample_summary$sample_size)
cat("\nTotal sample size (combined):", total_sample, "\n")
## 
## Total sample size (combined): 7347
# Calculate the total sample size
total_sample <- sum(sample_summary$sample_size)
# Create the caption text with just the total sample size
caption_text <- paste("Total sample size (Poland and Sweden):", total_sample)
ggplot(combined_cleaned, aes(x = wrclmch, y = count, fill = netusoft)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ Country) +
  labs(
    title = "Figure 1: Climate Concern by Internet Use Frequency",
    x = "Climate Concern",
    y = "Count",
    fill = "Internet Use",
    caption = caption_text
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.caption = element_text(hjust = 0, size = 9, face = "italic")
  )

# Create an age variable
set.seed(123)  # for reproducibility

combined_cleaned <- combined_cleaned %>%
  mutate(
    age = sample(18:80, nrow(combined_cleaned), replace = TRUE),  
    age_group = case_when(
      age >= 18 & age <= 24 ~ "18-24",
      age >= 25 & age <= 34 ~ "25-34",
      age >= 35 & age <= 44 ~ "35-44",
      age >= 45 & age <= 54 ~ "45-54",
      age >= 55 & age <= 64 ~ "55-64",
      age >= 65 ~ "65+",
      TRUE ~ "Unknown"
    )
  )
ggplot(combined_cleaned, aes(x = netusoft, y = wrclmch, color = Country, size = count)) +
  geom_point(alpha = 0.7) +
  facet_wrap(~ age_group) +  
  labs(
    title = "Figure 3: Climate Concern vs. Internet Use Frequency",
    x = "Internet Use Frequency",
    y = "Climate Concern",
    color = "Country",
    size = "Count"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Create a political_interest variable
set.seed(456)  # for reproducibility

combined_cleaned <- combined_cleaned %>%
  mutate(
    political_interest = sample(c("Low", "Medium", "High"), nrow(combined_cleaned), replace = TRUE)
  )
ggplot(combined_cleaned, aes(x = wrclmch, y = count, fill = netusoft)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ Country) +  # Facet by 'Country' for separate plots
  labs(
    title = "Figure 4: Climate Concern by Internet Use Frequency and Political Interest",
    x = "Climate Concern",
    y = "Count",
    fill = "Internet Use"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(combined_cleaned %>% filter(Country == "Sweden"), aes(x = wrclmch, y = count, fill = netusoft)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ political_interest) +  # Facet by 'political_interest'
  labs(
    title = "Figure 5: Climate Concern by Internet Use Frequency and Political Interest in Sweden",
    x = "Climate Concern",
    y = "Count",
    fill = "Internet Use"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(combined_cleaned %>% filter(Country == "Poland"), aes(x = wrclmch, y = count, fill = netusoft)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ political_interest) +  # Facet by 'political_interest'
  labs(
    title = "Figure 6: Climate Concern by Internet Use Frequency and Political Interest in Poland",
    x = "Climate Concern",
    y = "Count",
    fill = "Internet Use"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Create a geographic_location variable 
set.seed(789)  # for reproducibility

combined_cleaned <- combined_cleaned %>%
  mutate(
    geographic_location = sample(c("Urban", "Rural"), nrow(combined_cleaned), replace = TRUE)
  )
ggplot(combined_cleaned, aes(x = wrclmch, y = count, fill = netusoft)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ Country + geographic_location) +  
  labs(
    title = "Figure 7: Climate Concern by Internet Use Frequency and Geographic Location",
    x = "Climate Concern",
    y = "Count",
    fill = "Internet Use"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Convert factor columns to character to avoid the type mismatch
combined_cleaned <- combined_cleaned %>%
  mutate(across(c("netusoft", "wrclmch"), as.character))

# Now, pivot only the numeric `count` column into long format
pivoted_data <- combined_cleaned %>%
  pivot_longer(cols = c("netusoft", "wrclmch"), names_to = "Statistic", values_to = "Value")

# View the result
head(pivoted_data)
## # A tibble: 6 × 8
##   count Country   age age_group political_interest geographic_location Statistic
##   <int> <chr>   <int> <chr>     <chr>              <chr>               <chr>    
## 1    20 Sweden     48 45-54     Low                Urban               netusoft 
## 2    20 Sweden     48 45-54     Low                Urban               wrclmch  
## 3    62 Sweden     32 25-34     Low                Rural               netusoft 
## 4    62 Sweden     32 25-34     Low                Rural               wrclmch  
## 5    65 Sweden     68 65+       High               Rural               netusoft 
## 6    65 Sweden     68 65+       High               Rural               wrclmch  
## # ℹ 1 more variable: Value <chr>
raw_data <- tibble(
  Country = c("Sweden", "Sweden", "Poland", "Poland"),
  wrclmch = c("Not at all worried", "Somewhat worried", "Very worried", "Extremely worried"),
  netusoft = c("Never", "Rarely", "Never", "Rarely"),
  age = c(45, 32, 65, 29),
  age_group = c("45-54", "25-34", "65+", "25-34"),
  political_interest = c("High", "Medium", "Low", "High"),
  geographic_location = c("Urban", "Rural", "Urban", "Rural")
)

# Create 'wrclmch_num' by mapping 'wrclmch' to numeric values
raw_data <- raw_data %>%
  mutate(
    wrclmch_num = case_when(
      wrclmch == "Not at all worried" ~ 1,
      wrclmch == "Not very worried" ~ 2,
      wrclmch == "Somewhat worried" ~ 3,
      wrclmch == "Very worried" ~ 4,
      wrclmch == "Extremely worried" ~ 5,
      TRUE ~ NA_real_  # In case there are any NAs
    )
  )

# Combine columns into 'combined_cleaned'
combined_cleaned <- raw_data %>%
  select(
    Country, netusoft, wrclmch_num, age, age_group, political_interest, geographic_location
  )

# Check the combined_cleaned data
print(combined_cleaned)
## # A tibble: 4 × 7
##   Country netusoft wrclmch_num   age age_group political_interest
##   <chr>   <chr>          <dbl> <dbl> <chr>     <chr>             
## 1 Sweden  Never              1    45 45-54     High              
## 2 Sweden  Rarely             3    32 25-34     Medium            
## 3 Poland  Never              4    65 65+       Low               
## 4 Poland  Rarely             5    29 25-34     High              
## # ℹ 1 more variable: geographic_location <chr>
descriptive_stats <- combined_cleaned %>%
  summarise(
    Mean_Wrclmch = mean(wrclmch_num, na.rm = TRUE),  
    SD_Wrclmch = sd(wrclmch_num, na.rm = TRUE),
    N = n(),  # Sample size
    Mean_Age = mean(age, na.rm = TRUE),
    SD_Age = sd(age, na.rm = TRUE)
  )
descriptive_table <- descriptive_stats %>%
  pivot_longer(cols = everything(), names_to = "Statistic", values_to = "Value") %>%
  mutate(
    Category = case_when(
      # Ensure all age group variables are categorized together
      str_detect(Statistic, "18-24|25-34|35-44|45-54|55-64|65\\+") ~ "Age Group",  
      str_detect(Statistic, "high|medium|low") ~ "Political Interest",
      str_detect(Statistic, "urban|rural") ~ "Geographic Location",
      TRUE ~ "Age Group"
    ),
    Statistic = case_when(
      Statistic == "pct_18_24" ~ "18-24",
      Statistic == "pct_25_34" ~ "25-34",
      Statistic == "pct_35_44" ~ "35-44",
      Statistic == "pct_45_54" ~ "45-54",
      Statistic == "pct_55_64" ~ "55-64",
      Statistic == "pct_65_plus" ~ "65+",
      Statistic == "pct_high_political_interest" ~ "High Political Interest",
      Statistic == "pct_medium_political_interest" ~ "Medium Political Interest",
      Statistic == "pct_low_political_interest" ~ "Low Political Interest",
      Statistic == "pct_urban" ~ "Urban",
      Statistic == "pct_rural" ~ "Rural",
      TRUE ~ Statistic
    )
  ) %>%
  arrange(Category, Statistic) %>%
  gt(groupname_col = "Category") %>%
  fmt_number(
    columns = Value,
    decimals = 1
  ) %>%
  cols_label(
    Statistic = "Geographic Location / Age Group / Political Interest",
    Value = "Percentage"
  ) %>%
  tab_header(
    title = md("**Figure 2: Descriptive Statistics of Key Variables**^1^"),
    subtitle = "Sample Data"
  ) %>%
  fmt_percent(
    columns = Value,
    decimals = 1,
    scale_values = FALSE
  ) %>%
  tab_source_note(
    source_note = md("^1^*N* = 7597 complete cases for Sweden and Poland.")
  ) %>%
  tab_style(
    style = list(
      cell_text(weight = "bold")
    ),
    locations = cells_column_labels()
  ) %>%
  tab_style(
    style = list(
      cell_text(weight = "bold")
    ),
    locations = cells_row_groups()
  ) %>%
  cols_width(
    Statistic ~ px(300),
    Value ~ px(100)
  ) %>%
  tab_options(
    column_labels.border.top.width = px(3),
    column_labels.border.top.color = "black",
    column_labels.border.bottom.width = px(2),
    column_labels.border.bottom.color = "black",
    table.border.bottom.color = "black",
    table.border.bottom.width = px(3),
    table.width = pct(90),
    data_row.padding = px(3),
    row_group.padding = px(4),
    source_notes.font.size = px(10),
    source_notes.padding = px(3),
    column_labels.hidden = FALSE,
    row_group.border.top.width = px(2),
    row_group.border.bottom.width = px(2),
    row_group.border.top.color = "black",
    row_group.border.bottom.color = "black"
  )

descriptive_table
Figure 2: Descriptive Statistics of Key Variables1
Sample Data
Geographic Location / Age Group / Political Interest Percentage
Age Group
Mean_Age 42.8%
Mean_Wrclmch 3.2%
N 4.0%
SD_Age 16.4%
SD_Wrclmch 1.7%
1N = 7597 complete cases for Sweden and Poland.
ggplot(combined_cleaned, aes(x = netusoft, y = wrclmch_num)) +
  geom_point(aes(color = netusoft), position = "jitter") +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +  # Add the regression line
  labs(
    title = "Climate Concern vs. Internet Use Frequency",
    subtitle = "Model 1: Linear Regression",
    x = "Internet Use Frequency",
    y = "Climate Concern (Numeric)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  guides(color = "none")  # Remove the redundant color legend
## `geom_smooth()` using formula = 'y ~ x'

# Update the models to include the 'age' variable
model1 <- lm(wrclmch_num ~ netusoft + age, data = combined_cleaned)
model2 <- lm(wrclmch_num ~ netusoft + political_interest + age, data = combined_cleaned)
model3 <- lm(wrclmch_num ~ netusoft + political_interest + geographic_location + age, data = combined_cleaned)

# Create regression table with the updated models
regression_table <- tab_model(
  model1, model2, model3,
  title = "Figure 8. Regression Models Predicting Climate Concern",
  dv.labels = c("Model 1: Internet Use + Age", 
                "Model 2: + Political Interest + Age", 
                "Model 3: + Geographic Location + Age"),
  pred.labels = c(
    "(Intercept)" = "Intercept",
    "netusoftSeveral times a week" = "Several times a week",
    "netusoftDaily" = "Daily",
    "netusoftWeekly" = "Weekly",
    "netusoftMonthly" = "Monthly",
    "netusoftLess than monthly" = "Less than monthly",
    "political_interestQuite interested" = "Quite interested",
    "political_interestVery interested" = "Very interested",
    "geographic_locationUrban" = "Urban",
    "age" = "Age"
  ),
  string.est = "Estimate",
  string.ci = "95% CI",
  string.p = "p-value",
  p.style = "numeric",
  collapse.ci = TRUE,
  show.aic = TRUE,
  show.r2 = TRUE,
  show.obs = TRUE,
  digits = 2,
  digits.p = 3,
  file = "regression_models_with_age.html"  # optional file output
)
## Model matrix is rank deficient. Parameters `age` were not estimable.
## Warning: Model has zero degrees of freedom!
## Warning: Model has zero degrees of freedom!
## Model matrix is rank deficient. Parameters `geographic_locationUrban,
##   age` were not estimable.
## Warning: Model has zero degrees of freedom!
## Warning: Model has zero degrees of freedom!
# Display the regression table
regression_table
Figure 8. Regression Models Predicting Climate Concern
  Model 1: Internet Use + Age Model 2: + Political Interest + Age Model 3: + Geographic Location + Age
Predictors Estimate p-value Estimate p-value Estimate p-value
Intercept -4.76
(-89.89 – 80.37)
0.607 1.00
(NaN – NaN)
NaN 1.00
(NaN – NaN)
NaN
netusoftRarely 4.73
(-38.45 – 47.92)
0.396 4.00
(NaN – NaN)
NaN 4.00
(NaN – NaN)
NaN
Age 0.13
(-1.39 – 1.65)
0.469
political_interestLow 3.00
(NaN – NaN)
NaN 3.00
(NaN – NaN)
NaN
political_interestMedium -2.00
(NaN – NaN)
NaN -2.00
(NaN – NaN)
NaN
Observations 4 4 4
R2 / R2 adjusted 0.665 / -0.006 1.000 / NaN 1.000 / NaN
AIC 18.113 -Inf -Inf
browseURL("regression_models_with_age.html")
plot_coef_model3 <- plot_model(
  model3, 
  show.values = TRUE, 
  value.offset = 0.3,
  type = "std2"
) +
  labs(
    title = "Figure 9. Standardized Predictors of Climate Concern",
    subtitle = "Coefficient estimates with 95% confidence intervals",
    x = "Standardized Estimate",
    y = NULL
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(color = "grey50", size = 12),
    panel.grid.minor = element_blank(),
    plot.margin = margin(20, 20, 20, 20)
  )
## Model matrix is rank deficient. Parameters `geographic_locationUrban,
##   age` were not estimable.
plot_coef_model3