# Installing & applying packages
# Anything in `packages` that is not yet installed gets installed first;
# afterwards every package is attached with library().
packages <- c("tidyverse", "fst", "modelsummary", "viridis")
new_packages <- setdiff(packages, installed.packages()[, "Package"])
if (length(new_packages) > 0) {
  install.packages(new_packages)
}
# lapply() returns (and, at top level, prints) the search-path snapshot that
# library() yields after each attach.
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "fst" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "fst" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "viridis" "viridisLite" "modelsummary" "fst" "lubridate"
## [6] "forcats" "stringr" "dplyr" "purrr" "readr"
## [11] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
# Loading data into R
# Show the directory the session is currently running in.
getwd()
## [1] "/Users/owner/Downloads"
# Point the session at the folder the .fst files are read from.
# NOTE(review): this absolute path is machine-specific; confirm before reuse.
setwd("/Users/owner/Downloads")
# Empty the global environment, then run the garbage collector, whose
# memory-usage summary is printed.
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 1074684 57.4 2200161 117.6 NA 1483800 79.3
## Vcells 1863612 14.3 8388608 64.0 16384 2375777 18.2
# Read one .fst file per country from the working directory.
hungary_data <- read_fst(path = "hungary_data.fst")
spain_data <- read_fst(path = "spain_data.fst")
sweden_data <- read_fst(path = "sweden_data.fst")
italy_data <- read_fst(path = "italy_data.fst")
# Viewing & cleaning Spain trstplt data
# Codes 77, 88 and 99 are turned into NA; all other values are kept.
spain_data <- mutate(
  spain_data,
  trstplt = ifelse(trstplt %in% c(77, 88, 99), NA, trstplt)
)
# Frequency table of the remaining (non-missing) responses.
table(spain_data$trstplt)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 5165 1830 2329 2441 2085 2890 1154 639 355 80 71
# Creating a year variable
# Survey year for each round: element i is the year assigned to rows with
# essround == i.
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
# Vectorised lookup instead of a loop over a hard-coded 1:10. match() yields
# NA for a missing or unlisted round, so those rows get year NA (as before)
# and the round range always follows the length of `replacements`.
spain_data$year <- replacements[
  match(spain_data$essround, seq_along(replacements))
]
# Number of respondents per survey year.
table(spain_data$year)
##
## 2002 2004 2006 2008 2010 2012 2014 2016 2018 2020
## 1729 1663 1876 2576 1885 1889 1925 1958 1668 2283
# Calculating average trust by year
# One row per value of `year`; NA responses are excluded from each mean.
trust_by_year <- summarise(
  group_by(spain_data, year),
  mean_trust = mean(trstplt, na.rm = TRUE)
)
print(trust_by_year)
## # A tibble: 10 × 2
## year mean_trust
## <dbl> <dbl>
## 1 2002 3.41
## 2 2004 3.66
## 3 2006 3.49
## 4 2008 3.32
## 5 2010 2.72
## 6 2012 1.91
## 7 2014 2.23
## 8 2016 2.40
## 9 2018 2.55
## 10 2020 1.94
# Visualizing trust by year
# Line-and-point chart of mean_trust against year, with the y-axis fixed to
# the full 0-10 response range.
# NOTE(review): the title says "Parliament" but the series plotted is built
# from trstplt; confirm the label matches that survey item.
ggplot(trust_by_year, aes(x = year, y = mean_trust)) +
  # `linewidth` sets line thickness; using `size` for lines has been
  # deprecated since ggplot2 3.4.0 and raised a warning here.
  geom_line(color = "blue", linewidth = 1) +
  geom_point(color = "red", size = 3) +
  labs(
    title = "Trust in Parliament in Spain (2002-2020)",
    x = "Survey Year",
    y = "Average Trust (0-10 scale)"
  ) +
  ylim(0, 10) +
  theme_minimal()
For this variable, a survey response of 0 means ‘No trust at all’ and a
response of 10 means ‘Complete trust’. The graph shows that average trust
in parliament in Spain rose slightly between 2002 and 2004 and then
declined until 2012. After 2012, trust increased steadily until 2018,
before falling in 2020 to a level similar to that recorded in
2012.
Based on the figure task2_plot, the following takeaways can be made: On average, Norway has a greater proportion of survey respondents who feel close to a party than either France or Italy. This can be seen because the trend line for Norway lies above those of Italy and France on the y-axis, which represents the proportion of respondents who answered “yes” to the survey question. Furthermore, the proportion of the population in Norway, Italy, and France that feels close to a party has decreased over time. Finally, the proportion of survey respondents who feel close to a party has declined rapidly in France and Italy since 1990, as seen in the steep negative slope. This differs from Norway, where the proportion appears to have leveled off since 1990, representing relatively little change in the share of respondents who feel close to a party.
# Viewing and cleaning up data
# clsprty: codes 7, 8 and 9 become NA and code 2 becomes 0; other values are
# kept. gndr: code 9 becomes NA.
italy_data <- italy_data %>%
  mutate(
    clsprty = ifelse(
      clsprty %in% c(7, 8, 9),
      NA,
      ifelse(clsprty == 2, 0, clsprty)
    ),
    gndr = ifelse(gndr %in% 9, NA, gndr)
  )
table(italy_data$clsprty)
##
## 0 1
## 5828 3626
table(italy_data$gndr)
##
## 1 2
## 4836 5329
# Adding names to gndr and clsprty
# Both columns are converted from numeric codes to text labels in one step;
# any value without a label becomes NA.
italy_data <- italy_data %>%
  mutate(
    gndr = case_when(
      gndr == 2 ~ "Female",
      gndr == 1 ~ "Male",
      TRUE ~ NA_character_
    ),
    clsprty = case_when(
      clsprty == 1 ~ "Yes",
      clsprty == 0 ~ "No",
      TRUE ~ NA_character_
    )
  )
# Checking it worked
table(italy_data$clsprty)
##
## No Yes
## 5828 3626
table(italy_data$gndr)
##
## Female Male
## 5329 4836
# Calculating the marginal percentage
# Rows missing either variable are dropped. Each gndr x clsprty cell is then
# counted and, because grouping is dropped before the final step, expressed
# as a percentage of all remaining rows.
clsprty_percentages <- italy_data %>%
  filter(!(is.na(clsprty) | is.na(gndr))) %>%
  group_by(gndr, clsprty) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(percentage = count / sum(count) * 100)
print(clsprty_percentages)
## # A tibble: 4 × 4
## gndr clsprty count percentage
## <chr> <chr> <int> <dbl>
## 1 Female No 3228 34.2
## 2 Female Yes 1686 17.9
## 3 Male No 2593 27.5
## 4 Male Yes 1936 20.5
The response “yes” for the variable clsprty indicates that the respondent feels closer to a particular party than to all other parties. Therefore, the marginal percentage of Italian men who feel close to a particular political party is 20.5%; that is, 20.5% of all respondents with valid answers on both variables are men who answered “yes”.
# Cleaning & naming data
# lrscale values 0-3 are labelled "Left" and 7-10 "Right". Every other value
# (the 77/88/99 codes as well as 4, 5 and 6) falls through to NA.
sweden_data <- sweden_data %>%
  mutate(
    lrscale = case_when(
      lrscale %in% 7:10 ~ "Right",
      lrscale %in% 0:3 ~ "Left",
      TRUE ~ NA_character_
    )
  )
table(sweden_data$lrscale)
##
## Left Right
## 4362 5641
## Note: Responses of 4,5,6 to lrscale are removed as they represent 'moderate' or 'centrist' self-placement on the scale.
# gndr: 1 is labelled "Male" and 2 "Female"; any other value (including
# code 9) falls through to NA.
sweden_data <- sweden_data %>%
  mutate(
    gndr = case_when(
      gndr == 2 ~ "Female",
      gndr == 1 ~ "Male",
      TRUE ~ NA_character_
    )
  )
table(sweden_data$gndr)
##
## Female Male
## 9127 9076
# Calculations
# Rows missing lrscale or gndr are dropped. Each gndr x lrscale cell is then
# counted and expressed as a percentage of all remaining rows.
lrscale_percentages <- sweden_data %>%
  drop_na(lrscale, gndr) %>%
  group_by(gndr, lrscale) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(percentage = count / sum(count) * 100)
print(lrscale_percentages)
## # A tibble: 4 × 4
## gndr lrscale count percentage
## <chr> <chr> <int> <dbl>
## 1 Female Left 2296 23.0
## 2 Female Right 2530 25.3
## 3 Male Left 2062 20.6
## 4 Male Right 3107 31.1
# Visualization
# Vertical bars of `percentage` by gender, one facet per lrscale category.
# Gender is mapped straight to x and percentage to y, so no coord_flip() is
# needed. In the flipped version the "Percentage of Respondents" title was
# drawn on the vertical axis and then blanked by axis.title.y.
lrscale_plot <- ggplot(
  lrscale_percentages,
  # reorder() puts the gender with the larger mean percentage first.
  aes(x = reorder(gndr, -percentage), y = percentage, fill = gndr)
) +
  geom_col() +
  # Fill colour only repeats the x-axis categories, so no legend is drawn.
  guides(fill = "none") +
  facet_wrap(~ lrscale, nrow = 1) +
  labs(
    x = NULL,
    y = "Percentage of Respondents",
    title = "Political Orientation by Gender",
    subtitle = "Comparing the percentage distribution of left vs. right for Sweden (2002-2020) "
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12)
  )
lrscale_plot
# Cleaning up data
# clsprty: 1 is labelled "Yes" and 2 "No"; any other value (including codes
# 7, 8 and 9) falls through to NA.
hungary_data <- hungary_data %>%
  mutate(
    clsprty = case_when(
      clsprty == 1 ~ "Yes",
      clsprty == 2 ~ "No",
      TRUE ~ NA_character_
    )
  )
table(hungary_data$clsprty)
##
## No Yes
## 8679 7342
# Cleaning and recoding domicil as a new variable "geo" with named responses
# Codes 1-2 map to "Urban", 3-5 to "Rural" and 7-9 to NA. recode() leaves any
# code not listed here unchanged (as its character form). Rows lacking
# either clsprty or geo are then removed.
hungary_data <- hungary_data %>%
  mutate(
    geo = recode(
      as.character(domicil),
      `1` = "Urban",
      `2` = "Urban",
      `3` = "Rural",
      `4` = "Rural",
      `5` = "Rural",
      `7` = NA_character_,
      `8` = NA_character_,
      `9` = NA_character_
    )
  ) %>%
  filter(!is.na(clsprty) & !is.na(geo))
# Calculate conditional probability
# Count each clsprty x geo combination, then divide every count by the total
# for its geo group, so `prob` sums to 1 within "Rural" and within "Urban".
# The result is left grouped by geo.
cond <- hungary_data %>%
  count(clsprty, geo) %>%
  group_by(geo) %>%
  mutate(prob = prop.table(n))
print(cond)
## # A tibble: 4 × 4
## # Groups: geo [2]
## clsprty geo n prob
## <chr> <chr> <int> <dbl>
## 1 No Rural 6275 0.554
## 2 No Urban 2395 0.512
## 3 Yes Rural 5055 0.446
## 4 Yes Urban 2283 0.488
Therefore, in Hungary, given that someone resides in a rural area, the probability of that individual not feeling close to any particular party is 55.4%.