packages <- c("tidyverse", "fst", "modelsummary", "viridis")
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "fst" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "viridis" "viridisLite" "modelsummary" "fst" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
rm(list=ls()); gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 1081218 57.8 2195905 117.3 NA 1533104 81.9
## Vcells 1870295 14.3 8388608 64.0 16384 2397355 18.3
ess <- read_fst("All-ESS-Data.fst")
spain_data <- read.fst("spain_data.fst")
italy_data <- read.fst("italy_data.fst")
Provide code and answer.
Prompt: in the tutorial, we calculated the average trust in others for France and visualized it. Using instead the variable ‘Trust in Parliament’ (trstplt) and the country of Spain (country file provided on course website), visualize the average trust by survey year. You can truncate the y-axis if you wish. Provide appropriate titles and labels given the changes. What are your main takeaways based on the visual (e.g., signs of increase, decrease, or stall)?
spain_data <- spain_data %>%
mutate(
trstplt = ifelse(trstplt %in% c(77, 88, 99), NA, trstplt),
)
table(spain_data$trstplt)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 5165 1830 2329 2441 2085 2890 1154 639 355 80 71
spain_data$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
spain_data$year[spain_data$essround == i] <- replacements[i]
}
table(spain_data$year)
##
## 2002 2004 2006 2008 2010 2012 2014 2016 2018 2020
## 1729 1663 1876 2576 1885 1889 1925 1958 1668 2283
trust_by_year <- spain_data %>%
group_by(year) %>%
summarize(mean_trust_in_parliament = mean(trstplt, na.rm = TRUE))
trust_by_year
## # A tibble: 10 × 2
## year mean_trust_in_parliament
## <dbl> <dbl>
## 1 2002 3.41
## 2 2004 3.66
## 3 2006 3.49
## 4 2008 3.32
## 5 2010 2.72
## 6 2012 1.91
## 7 2014 2.23
## 8 2016 2.40
## 9 2018 2.55
## 10 2020 1.94
ggplot(trust_by_year, aes(x = year, y = mean_trust_in_parliament)) +
geom_line(color = "blue", size = 1) +
geom_point(color = "red", size = 3) +
labs(title = "Trust in Parliament in Spain (2002-2020)",
x = "Survey Year",
y = "Average Trust in Parliament (0-10 scale)") +
ylim(0, 10) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(trust_by_year, aes(x = year, y = mean_trust_in_parliament)) +
geom_line(aes(group = 1), color = "blue", size = 1, linetype = "longdash") +
labs(title = "Trust in Parliament in Spain (2002-2020)",
x = "Survey Year",
y = "Average Trust in Parliament (0-10 scale)") +
ylim(0, 10) +
theme_minimal() +
theme(legend.position = "none")
There was the highest level of trust in 2003. From then on the level of
rust decreased until it hit a low of around 2.0 in 2012. After that
there was a slight increase but in 2016, the rate decreased again.
Provide answer only.
Prompt and question: Based on the figure we produced above called task2_plot, tell us: what are your main takeaways regarding France relative to Italy and Norway? Make sure to be concrete and highlight at least two important comparative trends visualized in the graph.
The task2_plot illustrates that France’s data is relatively average compared to Italy and Norway. In 1990, France’s proportion is only about 0.05 higher than Italy. However, in 1940, France’s data was the most similmar to Norway’s, being about 0.15 lower than Norway.
Provide code and answer.
Question: What is the marginal percentage of Italian men who feel close to a particular political party?
italy_data <- read.fst("italy_data.fst")
italy_data <- italy_data %>%
mutate(
ppltrst = ifelse(ppltrst %in% c(77, 88, 99), NA, ppltrst), # set values 77, 88, and 99 to NA.
)
italy_data <- italy_data %>%
mutate(
gndr = case_when(
gndr == 1 ~ "Male",
gndr == 2 ~ "Female",
TRUE ~ NA_character_ # Set anything that is not 1 or 2 to NA
),
lrscale = case_when(
lrscale %in% 0:3 ~ "Not Close", # Not Close (0 to 3)
lrscale %in% 7:10 ~ "Close", # Close (7 to 10)
TRUE ~ NA_character_ # Moderate (4, 5, 6) and special codes (77, 88, 99) set to NA
)
)
lrscale_percentages <- italy_data %>% # Begin with the dataset 'france_data'
filter(!is.na(lrscale), !is.na(gndr)) %>% # Filter out rows where 'lrscale' or 'gender' is NA (missing data)
group_by(gndr, lrscale) %>% # Group the data by 'gender' and 'lrscale' categories
summarise(count = n(), .groups = 'drop') %>% # Summarise each group to get counts, and then drop groupings
mutate(percentage = count / sum(count) * 100) # Calculate percentage for each group by dividing count by total count and multiplying by 100
lrscale_percentages # The resulting dataframe
## # A tibble: 4 × 4
## gndr lrscale count percentage
## <chr> <chr> <int> <dbl>
## 1 Female Close 955 24.5
## 2 Female Not Close 930 23.9
## 3 Male Close 1084 27.8
## 4 Male Not Close 924 23.7
27.8% of Italian men feel particularly close to a political party.
Provide code and output only.
Prompt: In the tutorial, we calculated then visualized the percentage distribution for left vs. right by gender for France. Your task is to replicate the second version of the visualization but for the country of Sweden instead.
sweden_data <- read.fst("sweden_data.fst")
sweden_data <- sweden_data %>%
mutate(
ppltrst = ifelse(ppltrst %in% c(77, 88, 99), NA, ppltrst), # set values 77, 88, and 99 to NA.
)
sweden_data <- sweden_data %>%
mutate(
gndr = case_when(
gndr == 1 ~ "Male",
gndr == 2 ~ "Female",
TRUE ~ NA_character_ # Set anything that is not 1 or 2 to NA
),
lrscale = case_when(
lrscale %in% 0:3 ~ "Left", # Left-wing (0 to 3)
lrscale %in% 7:10 ~ "Right", # Right-wing (7 to 10)
TRUE ~ NA_character_ # Moderate (4, 5, 6) and special codes (77, 88, 99) set to NA
)
)
lrscale_percentages <- sweden_data %>%
filter(!is.na(lrscale), !is.na(gndr)) %>% # Filter out rows where 'lrscale' or 'gender' is NA (missing data)
group_by(gndr, lrscale) %>% # Group the data by 'gender' and 'lrscale' categories
summarise(count = n(), .groups = 'drop') %>% # Summarise each group to get counts, and then drop groupings
mutate(percentage = count / sum(count) * 100) # Calculate percentage for each group by dividing count by total count and multiplying by 100
lrscale_percentages # The resulting dataframe
## # A tibble: 4 × 4
## gndr lrscale count percentage
## <chr> <chr> <int> <dbl>
## 1 Female Left 2296 23.0
## 2 Female Right 2530 25.3
## 3 Male Left 2062 20.6
## 4 Male Right 3107 31.1
sweden_data <- sweden_data %>%
mutate(
geo = recode(as.character(domicil),
'1' = "Urban",
'2' = "Urban",
'3' = "Rural",
'4' = "Rural",
'5' = "Rural",
'7' = NA_character_,
'8' = NA_character_,
'9' = NA_character_)
) %>%
filter(!is.na(lrscale), !is.na(geo)) # Removing rows with NA in clsprty or geo
# Calculate conditional probabilities, excluding NAs
cond <- sweden_data %>%
count(lrscale, geo) %>%
group_by(geo) %>%
mutate(prob = n / sum(n))
cond
## # A tibble: 4 × 4
## # Groups: geo [2]
## lrscale geo n prob
## <chr> <chr> <int> <dbl>
## 1 Left Rural 2784 0.444
## 2 Left Urban 1570 0.423
## 3 Right Rural 3487 0.556
## 4 Right Urban 2140 0.577
plot <- ggplot(cond, aes(x = lrscale, y = prob, fill = lrscale)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_viridis_d(name = "Political Orientation") +
labs(y = "Conditional Probability",
x = "Feels Close to a Party",
title = "Conditional Probability of Political Orientation in Sweden",
subtitle = "by Geographical Area: Urban vs. Rural") +
facet_wrap(~ geo, nrow = 1) +
theme(legend.position = "bottom",
legend.title = element_blank(),
plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = element_text(size = 12),
axis.text.x = element_text(angle = 45, hjust = 1))
# Display the plot
plot
Provide code and answer: In Hungary, what is the conditional probability of NOT feeling close to any particular party given that the person lives in a rural area?
hungary_data <- read.fst("hungary_data.fst")
hungary_data <- hungary_data %>%
mutate(
ppltrst = ifelse(ppltrst %in% c(77, 88, 99), NA, ppltrst), # set values 77, 88, and 99 to NA.
)
hungary_data <- hungary_data %>%
mutate(
gndr = case_when(
gndr == 1 ~ "Male",
gndr == 2 ~ "Female",
TRUE ~ NA_character_ # Set anything that is not 1 or 2 to NA
),
lrscale = case_when(
lrscale %in% 0:3 ~ "Not Close", # Not Close (0 to 3)
lrscale %in% 7:10 ~ "Close", # Close (7 to 10)
TRUE ~ NA_character_ # Moderate (4, 5, 6) and special codes (77, 88, 99) set to NA
)
)
lrscale_percentages <- hungary_data %>%
filter(!is.na(lrscale), !is.na(gndr)) %>% # Filter out rows where 'lrscale' or 'gender' is NA (missing data)
group_by(gndr, lrscale) %>% # Group the data by 'gender' and 'lrscale' categories
summarise(count = n(), .groups = 'drop') %>% # Summarise each group to get counts, and then drop groupings
mutate(percentage = count / sum(count) * 100) # Calculate percentage for each group by dividing count by total count and multiplying by 100
lrscale_percentages # The resulting dataframe
## # A tibble: 4 × 4
## gndr lrscale count percentage
## <chr> <chr> <int> <dbl>
## 1 Female Close 2368 34.2
## 2 Female Not Close 1450 20.9
## 3 Male Close 2018 29.1
## 4 Male Not Close 1096 15.8
hungary_data <- hungary_data %>%
mutate(
geo = recode(as.character(domicil),
'1' = "Urban",
'2' = "Urban",
'3' = "Rural",
'4' = "Rural",
'5' = "Rural",
'7' = NA_character_,
'8' = NA_character_,
'9' = NA_character_)
) %>%
filter(!is.na(lrscale), !is.na(geo)) # Removing rows with NA in clsprty or geo
cond <- hungary_data %>%
count(lrscale, geo) %>%
group_by(geo) %>%
mutate(prob = n / sum(n) *100)
cond
## # A tibble: 4 × 4
## # Groups: geo [2]
## lrscale geo n prob
## <chr> <chr> <int> <dbl>
## 1 Close Rural 3087 65.1
## 2 Close Urban 1297 59.2
## 3 Not Close Rural 1653 34.9
## 4 Not Close Urban 893 40.8
The conditional probability of someone not feeling close to any political party in an rural area is 34.87%.