packages <- c("tidyr", "ggplot2", "dplyr", "modelsummary", "forcats", "RColorBrewer",
"fst", "viridis", "knitr", "rmarkdown", "ggridges", "viridis", "questionr", "flextable")
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
lapply(packages, library, character.only = TRUE)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: viridisLite
## [[1]]
## [1] "tidyr" "stats" "graphics" "grDevices" "utils" "datasets"
## [7] "methods" "base"
##
## [[2]]
## [1] "ggplot2" "tidyr" "stats" "graphics" "grDevices" "utils"
## [7] "datasets" "methods" "base"
##
## [[3]]
## [1] "dplyr" "ggplot2" "tidyr" "stats" "graphics" "grDevices"
## [7] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "modelsummary" "dplyr" "ggplot2" "tidyr" "stats"
## [6] "graphics" "grDevices" "utils" "datasets" "methods"
## [11] "base"
##
## [[5]]
## [1] "forcats" "modelsummary" "dplyr" "ggplot2" "tidyr"
## [6] "stats" "graphics" "grDevices" "utils" "datasets"
## [11] "methods" "base"
##
## [[6]]
## [1] "RColorBrewer" "forcats" "modelsummary" "dplyr" "ggplot2"
## [6] "tidyr" "stats" "graphics" "grDevices" "utils"
## [11] "datasets" "methods" "base"
##
## [[7]]
## [1] "fst" "RColorBrewer" "forcats" "modelsummary" "dplyr"
## [6] "ggplot2" "tidyr" "stats" "graphics" "grDevices"
## [11] "utils" "datasets" "methods" "base"
##
## [[8]]
## [1] "viridis" "viridisLite" "fst" "RColorBrewer" "forcats"
## [6] "modelsummary" "dplyr" "ggplot2" "tidyr" "stats"
## [11] "graphics" "grDevices" "utils" "datasets" "methods"
## [16] "base"
##
## [[9]]
## [1] "knitr" "viridis" "viridisLite" "fst" "RColorBrewer"
## [6] "forcats" "modelsummary" "dplyr" "ggplot2" "tidyr"
## [11] "stats" "graphics" "grDevices" "utils" "datasets"
## [16] "methods" "base"
##
## [[10]]
## [1] "rmarkdown" "knitr" "viridis" "viridisLite" "fst"
## [6] "RColorBrewer" "forcats" "modelsummary" "dplyr" "ggplot2"
## [11] "tidyr" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[11]]
## [1] "ggridges" "rmarkdown" "knitr" "viridis" "viridisLite"
## [6] "fst" "RColorBrewer" "forcats" "modelsummary" "dplyr"
## [11] "ggplot2" "tidyr" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[12]]
## [1] "ggridges" "rmarkdown" "knitr" "viridis" "viridisLite"
## [6] "fst" "RColorBrewer" "forcats" "modelsummary" "dplyr"
## [11] "ggplot2" "tidyr" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[13]]
## [1] "questionr" "ggridges" "rmarkdown" "knitr" "viridis"
## [6] "viridisLite" "fst" "RColorBrewer" "forcats" "modelsummary"
## [11] "dplyr" "ggplot2" "tidyr" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[14]]
## [1] "flextable" "questionr" "ggridges" "rmarkdown" "knitr"
## [6] "viridis" "viridisLite" "fst" "RColorBrewer" "forcats"
## [11] "modelsummary" "dplyr" "ggplot2" "tidyr" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
ess <- read_fst("/Users/jocelyn/Desktop/SOC252/Tutorial 1/All-ESS-Data.fst")
table(ess$essround)
##
## 1 2 3 4 5 6 7 8 9 10
## 42359 47537 43000 56752 52458 54673 40185 44387 49519 59685
ess$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
ess$year[ess$essround == i] <- replacements[i]
}
table(ess$year)
##
## 2002 2004 2006 2008 2010 2012 2014 2016 2018 2020
## 42359 47537 43000 56752 52458 54673 40185 44387 49519 59685
uk_data <- ess[ess$cntry == "GB", ]
uk_data_clean <- uk_data %>%
select(yrbrn, agea, hinctnta, polintr, fairelc, clsprty, gndr, vote, essround, year) %>%
mutate(
polintr = ifelse(polintr %in% c(7, 8, 9), NA, polintr),
fairelc = ifelse(fairelc %in% c(77, 88, 99), NA, fairelc),
gndr = ifelse(gndr %in% c(9), NA, gndr),
clsprty = ifelse(clsprty %in% c(7, 8, 9), NA, clsprty),
hinctnta = ifelse(hinctnta %in% c(77, 88, 99), NA, hinctnta),
yrbrn = ifelse(yrbrn %in% c(7777, 8888, 9999), NA, yrbrn),
vote = ifelse(vote == 2, 0, ifelse(vote %in% c(3, 7, 8, 9), NA, vote))
) %>%
filter(agea > 18 & agea < 90)
table(uk_data$vote)
##
## 1 2 3 7 8
## 14576 5281 1058 9 55
table(uk_data$vote)
##
## 1 2 3 7 8
## 14576 5281 1058 9 55
table(uk_data_clean$vote)
##
## 0 1
## 5088 14330
table(uk_data_clean$vote_binary)
## < table of extent 0 >
table(uk_data_clean$vote_yes)
## < table of extent 0 >
vote_by_year <- uk_data_clean %>%
group_by(year) %>%
summarize(mean_vote = mean(vote, na.rm = TRUE))
vote_by_year
## # A tibble: 10 × 2
## year mean_vote
## <dbl> <dbl>
## 1 2002 0.731
## 2 2004 0.695
## 3 2006 0.730
## 4 2008 0.715
## 5 2010 0.723
## 6 2012 0.722
## 7 2014 0.714
## 8 2016 0.774
## 9 2018 0.802
## 10 2020 0.809
ggplot(vote_by_year, aes(x = year, y = mean_vote)) +
geom_line(color = "blue", size = 1) + # Line to show the trend
geom_point(color = "red", size = 3) + # Points to highlight each year's value
labs(title = "Likelihood to Vote in the UK (2002-2020)",
x = "Year",
y = "Mean Vote (No-Yes scale)") +
ylim(0, 1) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ess$vote[ess$vote %in% c(3, 7, 8, 9)] <- NA
ess_data_clean <- ess %>%
select(yrbrn, vote, essround, year) %>%
mutate(
vote = ifelse(vote == 2, 0, ifelse(vote %in% c(3, 7, 8, 9), NA, vote))
)
avg_vote_uk <- uk_data_clean %>%
group_by(year) %>%
summarize(mean_vote_uk = mean(vote, na.rm = TRUE))
avg_vote_all_countries <- ess_data_clean %>%
group_by(year) %>%
summarize(mean_vote_all = mean(vote, na.rm = TRUE))
avg_vote_all_countries
## # A tibble: 10 × 2
## year mean_vote_all
## <dbl> <dbl>
## 1 2002 0.802
## 2 2004 0.770
## 3 2006 0.779
## 4 2008 0.776
## 5 2010 0.763
## 6 2012 0.760
## 7 2014 0.766
## 8 2016 0.766
## 9 2018 0.785
## 10 2020 0.817
combined_data <- left_join(avg_vote_uk, avg_vote_all_countries, by = "year")
long_data <- combined_data %>%
pivot_longer(
cols = starts_with("mean_vote"),
names_to = "Vote_Type",
values_to = "Mean_Vote"
)
ggplot(long_data) +
geom_line(aes(x = year, y = Mean_Vote, color = Vote_Type), size = 1) +
geom_point(aes(x = year, y = Mean_Vote, color = Vote_Type), size = 3) +
labs(title = "Likelihood to Vote in the UK vs Europe (2002-2020)",
x = "Year",
y = "Mean Vote (No-Yes scale)") +
ylim(0, 1) +
scale_color_manual(values = c("blue", "red"),
name = "Dataset",
labels = c("UK", "All ESS"),
breaks = c("mean_vote_uk", "mean_vote_all")) +
theme_minimal()
datasummary_skim(uk_data_clean)
Unique (#)
Missing (%)
Mean
SD
Min
Median
Max
yrbrn
91
0
1959.3
18.2
1913.0
1960.0
2003.0
agea
71
0
51.5
17.8
19.0
51.0
89.0
hinctnta
11
41
5.1
3.0
1.0
5.0
10.0
polintr
5
0
2.5
0.9
1.0
2.0
4.0
fairelc
12
84
8.9
1.8
0.0
10.0
10.0
clsprty
3
1
1.5
0.5
1.0
1.0
2.0
gndr
2
0
1.6
0.5
1.0
2.0
2.0
vote
3
3
0.7
0.4
0.0
1.0
1.0
essround
10
0
5.3
2.7
1.0
5.0
10.0
year
10
0
2010.6
5.4
2002.0
2010.0
2020.0
#Column Percentages for clsprty and Vote
table(uk_data_clean$clsprty, uk_data_clean$vote) %>%
cprop()
##
## 0 1 All
## 1 28.3 60.3 51.9
## 2 71.7 39.7 48.1
## Total 100.0 100.0 100.0
#Column Percentages for polintr and Vote
table(uk_data_clean$polintr, uk_data_clean$vote) %>%
cprop()
##
## 0 1 All
## 1 5.7 16.6 13.7
## 2 25.8 47.8 42.0
## 3 33.2 24.3 26.6
## 4 35.3 11.3 17.6
## Total 100.0 100.0 100.0
#Column Percentages for gndr and Vote
table(uk_data_clean$gndr, uk_data_clean$vote) %>%
cprop()
##
## 0 1 All
## 1 44.3 45.0 44.8
## 2 55.7 55.0 55.2
## Total 100.0 100.0 100.0
#Column Percentages for hinctnta and Vote
table(uk_data_clean$hinctnta, uk_data_clean$vote) %>%
cprop()
##
## 0 1 All
## 1 20.8 11.6 13.9
## 2 14.3 11.9 12.5
## 3 12.6 10.2 10.8
## 4 8.5 8.9 8.8
## 5 8.5 8.7 8.6
## 6 7.5 9.2 8.8
## 7 8.1 9.9 9.5
## 8 7.1 9.7 9.1
## 9 6.2 8.8 8.2
## 10 6.4 11.0 9.8
## Total 100.0 100.0 100.0
#Column Percentages for fairelc and Vote
table(uk_data_clean$fairelc, uk_data_clean$vote) %>%
cprop()
##
## 0 1 All
## 0 1.1 0.3 0.5
## 1 0.1 0.1 0.1
## 2 0.8 0.1 0.3
## 3 1.1 0.5 0.6
## 4 2.2 0.8 1.2
## 5 10.0 3.0 4.7
## 6 6.3 3.2 3.9
## 7 9.3 4.9 5.9
## 8 15.2 11.7 12.5
## 9 14.8 12.2 12.8
## 10 39.2 63.3 57.6
## Total 100.0 100.0 100.0
table(uk_data_clean$agea, uk_data_clean$vote) %>%
cprop()
##
## 0 1 All
## 19 1.4 0.2 0.5
## 20 1.9 0.4 0.8
## 21 2.1 0.5 0.9
## 22 2.0 0.5 0.9
## 23 2.0 0.6 1.0
## 24 2.5 0.8 1.2
## 25 2.3 0.7 1.1
## 26 2.5 0.9 1.3
## 27 2.5 0.7 1.2
## 28 2.4 0.9 1.3
## 29 2.3 1.1 1.4
## 30 2.8 1.3 1.7
## 31 2.7 1.2 1.6
## 32 2.4 1.4 1.6
## 33 2.6 1.3 1.7
## 34 2.1 1.4 1.6
## 35 2.5 1.6 1.8
## 36 2.7 1.7 2.0
## 37 2.2 1.6 1.8
## 38 2.4 1.5 1.7
## 39 2.3 1.5 1.7
## 40 1.8 1.9 1.9
## 41 2.2 1.8 1.9
## 42 1.7 1.7 1.7
## 43 1.5 1.6 1.6
## 44 1.7 1.8 1.8
## 45 1.7 1.7 1.7
## 46 1.5 1.6 1.6
## 47 1.7 1.7 1.7
## 48 1.8 1.8 1.8
## 49 1.7 1.8 1.8
## 50 1.7 1.6 1.7
## 51 1.4 1.8 1.7
## 52 1.5 1.7 1.7
## 53 1.5 1.6 1.6
## 54 1.7 1.8 1.8
## 55 1.5 1.7 1.6
## 56 1.3 2.0 1.8
## 57 1.4 1.9 1.8
## 58 0.9 1.9 1.6
## 59 1.1 2.0 1.8
## 60 1.1 2.0 1.8
## 61 1.3 2.0 1.8
## 62 1.1 1.9 1.7
## 63 1.0 1.8 1.6
## 64 1.0 2.1 1.8
## 65 0.9 1.9 1.6
## 66 0.9 2.0 1.7
## 67 1.1 2.0 1.8
## 68 1.1 1.8 1.6
## 69 0.8 2.0 1.7
## 70 0.7 2.0 1.6
## 71 0.8 1.7 1.5
## 72 0.7 1.8 1.5
## 73 0.6 1.6 1.4
## 74 0.6 1.6 1.3
## 75 0.6 1.6 1.4
## 76 0.6 1.4 1.1
## 77 0.7 1.4 1.3
## 78 0.6 1.2 1.1
## 79 0.3 1.1 0.9
## 80 0.5 1.1 0.9
## 81 0.5 1.1 1.0
## 82 0.5 1.1 0.9
## 83 0.4 1.0 0.9
## 84 0.3 0.7 0.6
## 85 0.3 0.6 0.5
## 86 0.3 0.6 0.5
## 87 0.3 0.5 0.4
## 88 0.2 0.4 0.3
## 89 0.1 0.4 0.3
## Total 100.0 100.0 100.0
#Mean and Standard Deviation for Vote by Gender
summary_gender_vote <- uk_data_clean %>%
group_by(gndr) %>%
summarize(mean = mean(vote, na.rm = TRUE),
std_dev = sd(vote, na.rm = TRUE))
summary_gender_vote
## # A tibble: 2 × 3
## gndr mean std_dev
## <dbl> <dbl> <dbl>
## 1 1 0.741 0.438
## 2 2 0.735 0.441
#Mean and Standard Deviation for Vote by Incomr
summary_income_vote <- uk_data_clean %>%
group_by(hinctnta) %>%
summarize(mean = mean(vote, na.rm = TRUE),
std_dev = sd(vote, na.rm = TRUE))
summary_income_vote
## # A tibble: 11 × 3
## hinctnta mean std_dev
## <dbl> <dbl> <dbl>
## 1 1 0.628 0.483
## 2 2 0.715 0.452
## 3 3 0.711 0.454
## 4 4 0.760 0.427
## 5 5 0.756 0.430
## 6 6 0.788 0.409
## 7 7 0.788 0.409
## 8 8 0.805 0.396
## 9 9 0.812 0.391
## 10 10 0.839 0.367
## 11 NA 0.718 0.450
#Mean and Standard Deviation for Vote by Fair Election
summary_fair_election_vote <- uk_data_clean %>%
group_by(fairelc) %>%
summarize(mean = mean(vote, na.rm = TRUE),
std_dev = sd(vote, na.rm = TRUE))
summary_fair_election_vote
## # A tibble: 12 × 3
## fairelc mean std_dev
## <dbl> <dbl> <dbl>
## 1 0 0.429 0.514
## 2 1 0.75 0.5
## 3 2 0.25 0.463
## 4 3 0.579 0.507
## 5 4 0.556 0.504
## 6 5 0.497 0.502
## 7 6 0.620 0.487
## 8 7 0.630 0.484
## 9 8 0.715 0.452
## 10 9 0.728 0.446
## 11 10 0.840 0.367
## 12 NA 0.733 0.442
#Mean and Standard Deviation for Vote by Age
summary_age_vote <- uk_data_clean %>%
group_by(agea) %>%
summarize(mean = mean(vote, na.rm = TRUE),
std_dev = sd(vote, na.rm = TRUE))
summary_age_vote
## # A tibble: 71 × 3
## agea mean std_dev
## <dbl> <dbl> <dbl>
## 1 19 0.311 0.465
## 2 20 0.376 0.486
## 3 21 0.413 0.494
## 4 22 0.416 0.494
## 5 23 0.482 0.501
## 6 24 0.466 0.500
## 7 25 0.480 0.501
## 8 26 0.498 0.501
## 9 27 0.451 0.499
## 10 28 0.529 0.500
## # ℹ 61 more rows
#Mean and Standard Deviation for Vote by Political Interest
summary_political_interest_vote <- uk_data_clean %>%
group_by(polintr) %>%
summarize(mean = mean(vote, na.rm = TRUE),
std_dev = sd(vote, na.rm = TRUE))
summary_political_interest_vote
## # A tibble: 5 × 3
## polintr mean std_dev
## <dbl> <dbl> <dbl>
## 1 1 0.891 0.312
## 2 2 0.839 0.367
## 3 3 0.674 0.469
## 4 4 0.474 0.499
## 5 NA 0.286 0.488
#Mean and Standard Deviation for Vote by Closeness to Party
summary_close_to_party_vote <- uk_data_clean %>%
group_by(clsprty) %>%
summarize(mean = mean(vote, na.rm = TRUE),
std_dev = sd(vote, na.rm = TRUE))
summary_close_to_party_vote
## # A tibble: 3 × 3
## clsprty mean std_dev
## <dbl> <dbl> <dbl>
## 1 1 0.857 0.350
## 2 2 0.609 0.488
## 3 NA 0.742 0.439
vote_by_income <- uk_data_clean %>%
group_by(hinctnta) %>%
summarize(mean_vote = mean(vote, na.rm = TRUE))
vote_by_income
## # A tibble: 11 × 2
## hinctnta mean_vote
## <dbl> <dbl>
## 1 1 0.628
## 2 2 0.715
## 3 3 0.711
## 4 4 0.760
## 5 5 0.756
## 6 6 0.788
## 7 7 0.788
## 8 8 0.805
## 9 9 0.812
## 10 10 0.839
## 11 NA 0.718
ggplot (data = vote_by_income,
mapping = aes(x = hinctnta,
y = mean_vote,
color = "red")) +
geom_smooth() +
labs(title = "Likelihood to Vote by Houshold Income Decile", x = "Household Income Decile", y = "Likelihood to Vote") +
ylim (0,1) +
geom_point()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).