packages <- c("tidyr", "ggplot2", "dplyr", "modelsummary", "forcats", "RColorBrewer", 
              "fst", "viridis", "knitr", "rmarkdown", "ggridges", "viridis", "questionr", "flextable") 

new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)


lapply(packages, library, character.only = TRUE)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: viridisLite
## [[1]]
## [1] "tidyr"     "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [7] "methods"   "base"     
## 
## [[2]]
## [1] "ggplot2"   "tidyr"     "stats"     "graphics"  "grDevices" "utils"    
## [7] "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "dplyr"     "ggplot2"   "tidyr"     "stats"     "graphics"  "grDevices"
##  [7] "utils"     "datasets"  "methods"   "base"     
## 
## [[4]]
##  [1] "modelsummary" "dplyr"        "ggplot2"      "tidyr"        "stats"       
##  [6] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [11] "base"        
## 
## [[5]]
##  [1] "forcats"      "modelsummary" "dplyr"        "ggplot2"      "tidyr"       
##  [6] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [11] "methods"      "base"        
## 
## [[6]]
##  [1] "RColorBrewer" "forcats"      "modelsummary" "dplyr"        "ggplot2"     
##  [6] "tidyr"        "stats"        "graphics"     "grDevices"    "utils"       
## [11] "datasets"     "methods"      "base"        
## 
## [[7]]
##  [1] "fst"          "RColorBrewer" "forcats"      "modelsummary" "dplyr"       
##  [6] "ggplot2"      "tidyr"        "stats"        "graphics"     "grDevices"   
## [11] "utils"        "datasets"     "methods"      "base"        
## 
## [[8]]
##  [1] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "forcats"     
##  [6] "modelsummary" "dplyr"        "ggplot2"      "tidyr"        "stats"       
## [11] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [16] "base"        
## 
## [[9]]
##  [1] "knitr"        "viridis"      "viridisLite"  "fst"          "RColorBrewer"
##  [6] "forcats"      "modelsummary" "dplyr"        "ggplot2"      "tidyr"       
## [11] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [16] "methods"      "base"        
## 
## [[10]]
##  [1] "rmarkdown"    "knitr"        "viridis"      "viridisLite"  "fst"         
##  [6] "RColorBrewer" "forcats"      "modelsummary" "dplyr"        "ggplot2"     
## [11] "tidyr"        "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[11]]
##  [1] "ggridges"     "rmarkdown"    "knitr"        "viridis"      "viridisLite" 
##  [6] "fst"          "RColorBrewer" "forcats"      "modelsummary" "dplyr"       
## [11] "ggplot2"      "tidyr"        "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[12]]
##  [1] "ggridges"     "rmarkdown"    "knitr"        "viridis"      "viridisLite" 
##  [6] "fst"          "RColorBrewer" "forcats"      "modelsummary" "dplyr"       
## [11] "ggplot2"      "tidyr"        "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[13]]
##  [1] "questionr"    "ggridges"     "rmarkdown"    "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "forcats"      "modelsummary"
## [11] "dplyr"        "ggplot2"      "tidyr"        "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[14]]
##  [1] "flextable"    "questionr"    "ggridges"     "rmarkdown"    "knitr"       
##  [6] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "forcats"     
## [11] "modelsummary" "dplyr"        "ggplot2"      "tidyr"        "stats"       
## [16] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [21] "base"
ess <- read_fst("/Users/jocelyn/Desktop/SOC252/Tutorial 1/All-ESS-Data.fst")
table(ess$essround)
## 
##     1     2     3     4     5     6     7     8     9    10 
## 42359 47537 43000 56752 52458 54673 40185 44387 49519 59685
ess$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
  ess$year[ess$essround == i] <- replacements[i]
}
table(ess$year)
## 
##  2002  2004  2006  2008  2010  2012  2014  2016  2018  2020 
## 42359 47537 43000 56752 52458 54673 40185 44387 49519 59685
uk_data <- ess[ess$cntry == "GB", ]
uk_data_clean <- uk_data %>%
  select(yrbrn, agea, hinctnta, polintr, fairelc, clsprty, gndr, vote, essround, year)  %>%
  mutate(
     polintr = ifelse(polintr %in% c(7, 8, 9), NA, polintr),
     fairelc = ifelse(fairelc %in% c(77, 88, 99), NA, fairelc),
     gndr = ifelse(gndr %in% c(9), NA, gndr),
     clsprty = ifelse(clsprty %in% c(7, 8, 9), NA, clsprty),
     hinctnta = ifelse(hinctnta %in% c(77, 88, 99), NA, hinctnta),
     yrbrn = ifelse(yrbrn %in% c(7777, 8888, 9999), NA, yrbrn), 
     vote = ifelse(vote == 2, 0, ifelse(vote %in% c(3, 7, 8, 9), NA, vote))
  ) %>%
filter(agea > 18 & agea < 90)
  
table(uk_data$vote)
## 
##     1     2     3     7     8 
## 14576  5281  1058     9    55
table(uk_data$vote)
## 
##     1     2     3     7     8 
## 14576  5281  1058     9    55
table(uk_data_clean$vote)
## 
##     0     1 
##  5088 14330
table(uk_data_clean$vote_binary)
## < table of extent 0 >
table(uk_data_clean$vote_yes)
## < table of extent 0 >
vote_by_year <- uk_data_clean %>%
  group_by(year) %>%
 summarize(mean_vote = mean(vote, na.rm = TRUE))
vote_by_year
## # A tibble: 10 × 2
##     year mean_vote
##    <dbl>     <dbl>
##  1  2002     0.731
##  2  2004     0.695
##  3  2006     0.730
##  4  2008     0.715
##  5  2010     0.723
##  6  2012     0.722
##  7  2014     0.714
##  8  2016     0.774
##  9  2018     0.802
## 10  2020     0.809
ggplot(vote_by_year, aes(x = year, y = mean_vote)) +
  geom_line(color = "blue", size = 1) +  # Line to show the trend
  geom_point(color = "red", size = 3) +  # Points to highlight each year's value
  labs(title = "Likelihood to Vote in the UK (2002-2020)", 
       x = "Year", 
       y = "Mean Vote (No-Yes scale)") +
  ylim(0, 1) + 
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ess$vote[ess$vote %in% c(3, 7, 8, 9)] <- NA
ess_data_clean <- ess %>%
  select(yrbrn, vote, essround, year)  %>%
  mutate(
     vote = ifelse(vote == 2, 0, ifelse(vote %in% c(3, 7, 8, 9), NA, vote))
  )

avg_vote_uk <- uk_data_clean %>%
  group_by(year) %>%
  summarize(mean_vote_uk = mean(vote, na.rm = TRUE))
avg_vote_all_countries <- ess_data_clean %>%
  group_by(year) %>%
  summarize(mean_vote_all = mean(vote, na.rm = TRUE))
avg_vote_all_countries
## # A tibble: 10 × 2
##     year mean_vote_all
##    <dbl>         <dbl>
##  1  2002         0.802
##  2  2004         0.770
##  3  2006         0.779
##  4  2008         0.776
##  5  2010         0.763
##  6  2012         0.760
##  7  2014         0.766
##  8  2016         0.766
##  9  2018         0.785
## 10  2020         0.817
combined_data <- left_join(avg_vote_uk, avg_vote_all_countries, by = "year")
long_data <- combined_data %>%
  pivot_longer(
    cols = starts_with("mean_vote"), 
    names_to = "Vote_Type",  
    values_to = "Mean_Vote"  
  )

ggplot(long_data) +
  geom_line(aes(x = year, y = Mean_Vote, color = Vote_Type), size = 1) +
  geom_point(aes(x = year, y = Mean_Vote, color = Vote_Type), size = 3) +
  labs(title = "Likelihood to Vote in the UK vs Europe (2002-2020)", 
       x = "Year", 
       y = "Mean Vote (No-Yes scale)") +
  ylim(0, 1) +
  scale_color_manual(values = c("blue", "red"), 
                     name = "Dataset", 
                     labels = c("UK", "All ESS"), 
                     breaks = c("mean_vote_uk", "mean_vote_all")) + 
  theme_minimal()

datasummary_skim(uk_data_clean)
Unique (#) Missing (%) Mean SD Min Median Max
yrbrn 91 0 1959.3 18.2 1913.0 1960.0 2003.0
agea 71 0 51.5 17.8 19.0 51.0 89.0
hinctnta 11 41 5.1 3.0 1.0 5.0 10.0
polintr 5 0 2.5 0.9 1.0 2.0 4.0
fairelc 12 84 8.9 1.8 0.0 10.0 10.0
clsprty 3 1 1.5 0.5 1.0 1.0 2.0
gndr 2 0 1.6 0.5 1.0 2.0 2.0
vote 3 3 0.7 0.4 0.0 1.0 1.0
essround 10 0 5.3 2.7 1.0 5.0 10.0
year 10 0 2010.6 5.4 2002.0 2010.0 2020.0
#Column Percentages for clsprty and Vote
table(uk_data_clean$clsprty, uk_data_clean$vote) %>%
  cprop()
##        
##         0     1     All  
##   1      28.3  60.3  51.9
##   2      71.7  39.7  48.1
##   Total 100.0 100.0 100.0
#Column Percentages for polintr and Vote
table(uk_data_clean$polintr, uk_data_clean$vote) %>%
  cprop()
##        
##         0     1     All  
##   1       5.7  16.6  13.7
##   2      25.8  47.8  42.0
##   3      33.2  24.3  26.6
##   4      35.3  11.3  17.6
##   Total 100.0 100.0 100.0
#Column Percentages for gndr and Vote
table(uk_data_clean$gndr, uk_data_clean$vote) %>%
  cprop()
##        
##         0     1     All  
##   1      44.3  45.0  44.8
##   2      55.7  55.0  55.2
##   Total 100.0 100.0 100.0
#Column Percentages for hinctnta and Vote
table(uk_data_clean$hinctnta, uk_data_clean$vote) %>%
  cprop()
##        
##         0     1     All  
##   1      20.8  11.6  13.9
##   2      14.3  11.9  12.5
##   3      12.6  10.2  10.8
##   4       8.5   8.9   8.8
##   5       8.5   8.7   8.6
##   6       7.5   9.2   8.8
##   7       8.1   9.9   9.5
##   8       7.1   9.7   9.1
##   9       6.2   8.8   8.2
##   10      6.4  11.0   9.8
##   Total 100.0 100.0 100.0
#Column Percentages for fairelc and Vote
table(uk_data_clean$fairelc, uk_data_clean$vote) %>%
  cprop()
##        
##         0     1     All  
##   0       1.1   0.3   0.5
##   1       0.1   0.1   0.1
##   2       0.8   0.1   0.3
##   3       1.1   0.5   0.6
##   4       2.2   0.8   1.2
##   5      10.0   3.0   4.7
##   6       6.3   3.2   3.9
##   7       9.3   4.9   5.9
##   8      15.2  11.7  12.5
##   9      14.8  12.2  12.8
##   10     39.2  63.3  57.6
##   Total 100.0 100.0 100.0
table(uk_data_clean$agea, uk_data_clean$vote) %>%
  cprop()
##        
##         0     1     All  
##   19      1.4   0.2   0.5
##   20      1.9   0.4   0.8
##   21      2.1   0.5   0.9
##   22      2.0   0.5   0.9
##   23      2.0   0.6   1.0
##   24      2.5   0.8   1.2
##   25      2.3   0.7   1.1
##   26      2.5   0.9   1.3
##   27      2.5   0.7   1.2
##   28      2.4   0.9   1.3
##   29      2.3   1.1   1.4
##   30      2.8   1.3   1.7
##   31      2.7   1.2   1.6
##   32      2.4   1.4   1.6
##   33      2.6   1.3   1.7
##   34      2.1   1.4   1.6
##   35      2.5   1.6   1.8
##   36      2.7   1.7   2.0
##   37      2.2   1.6   1.8
##   38      2.4   1.5   1.7
##   39      2.3   1.5   1.7
##   40      1.8   1.9   1.9
##   41      2.2   1.8   1.9
##   42      1.7   1.7   1.7
##   43      1.5   1.6   1.6
##   44      1.7   1.8   1.8
##   45      1.7   1.7   1.7
##   46      1.5   1.6   1.6
##   47      1.7   1.7   1.7
##   48      1.8   1.8   1.8
##   49      1.7   1.8   1.8
##   50      1.7   1.6   1.7
##   51      1.4   1.8   1.7
##   52      1.5   1.7   1.7
##   53      1.5   1.6   1.6
##   54      1.7   1.8   1.8
##   55      1.5   1.7   1.6
##   56      1.3   2.0   1.8
##   57      1.4   1.9   1.8
##   58      0.9   1.9   1.6
##   59      1.1   2.0   1.8
##   60      1.1   2.0   1.8
##   61      1.3   2.0   1.8
##   62      1.1   1.9   1.7
##   63      1.0   1.8   1.6
##   64      1.0   2.1   1.8
##   65      0.9   1.9   1.6
##   66      0.9   2.0   1.7
##   67      1.1   2.0   1.8
##   68      1.1   1.8   1.6
##   69      0.8   2.0   1.7
##   70      0.7   2.0   1.6
##   71      0.8   1.7   1.5
##   72      0.7   1.8   1.5
##   73      0.6   1.6   1.4
##   74      0.6   1.6   1.3
##   75      0.6   1.6   1.4
##   76      0.6   1.4   1.1
##   77      0.7   1.4   1.3
##   78      0.6   1.2   1.1
##   79      0.3   1.1   0.9
##   80      0.5   1.1   0.9
##   81      0.5   1.1   1.0
##   82      0.5   1.1   0.9
##   83      0.4   1.0   0.9
##   84      0.3   0.7   0.6
##   85      0.3   0.6   0.5
##   86      0.3   0.6   0.5
##   87      0.3   0.5   0.4
##   88      0.2   0.4   0.3
##   89      0.1   0.4   0.3
##   Total 100.0 100.0 100.0
#Mean and Standard Deviation for Vote by Gender  
summary_gender_vote <- uk_data_clean %>% 
  group_by(gndr) %>% 
  summarize(mean = mean(vote, na.rm = TRUE),
            std_dev = sd(vote, na.rm = TRUE)) 
summary_gender_vote
## # A tibble: 2 × 3
##    gndr  mean std_dev
##   <dbl> <dbl>   <dbl>
## 1     1 0.741   0.438
## 2     2 0.735   0.441
#Mean and Standard Deviation for Vote by Incomr  
summary_income_vote <- uk_data_clean %>% 
  group_by(hinctnta) %>% 
  summarize(mean = mean(vote, na.rm = TRUE),
            std_dev = sd(vote, na.rm = TRUE)) 
summary_income_vote
## # A tibble: 11 × 3
##    hinctnta  mean std_dev
##       <dbl> <dbl>   <dbl>
##  1        1 0.628   0.483
##  2        2 0.715   0.452
##  3        3 0.711   0.454
##  4        4 0.760   0.427
##  5        5 0.756   0.430
##  6        6 0.788   0.409
##  7        7 0.788   0.409
##  8        8 0.805   0.396
##  9        9 0.812   0.391
## 10       10 0.839   0.367
## 11       NA 0.718   0.450
#Mean and Standard Deviation for Vote by Fair Election  
summary_fair_election_vote <- uk_data_clean %>% 
  group_by(fairelc) %>% 
  summarize(mean = mean(vote, na.rm = TRUE),
            std_dev = sd(vote, na.rm = TRUE)) 
summary_fair_election_vote
## # A tibble: 12 × 3
##    fairelc  mean std_dev
##      <dbl> <dbl>   <dbl>
##  1       0 0.429   0.514
##  2       1 0.75    0.5  
##  3       2 0.25    0.463
##  4       3 0.579   0.507
##  5       4 0.556   0.504
##  6       5 0.497   0.502
##  7       6 0.620   0.487
##  8       7 0.630   0.484
##  9       8 0.715   0.452
## 10       9 0.728   0.446
## 11      10 0.840   0.367
## 12      NA 0.733   0.442
#Mean and Standard Deviation for Vote by Age 
summary_age_vote <- uk_data_clean %>% 
  group_by(agea) %>% 
  summarize(mean = mean(vote, na.rm = TRUE),
            std_dev = sd(vote, na.rm = TRUE)) 
summary_age_vote
## # A tibble: 71 × 3
##     agea  mean std_dev
##    <dbl> <dbl>   <dbl>
##  1    19 0.311   0.465
##  2    20 0.376   0.486
##  3    21 0.413   0.494
##  4    22 0.416   0.494
##  5    23 0.482   0.501
##  6    24 0.466   0.500
##  7    25 0.480   0.501
##  8    26 0.498   0.501
##  9    27 0.451   0.499
## 10    28 0.529   0.500
## # ℹ 61 more rows
#Mean and Standard Deviation for Vote by Political Interest  
summary_political_interest_vote <- uk_data_clean %>% 
  group_by(polintr) %>% 
  summarize(mean = mean(vote, na.rm = TRUE),
            std_dev = sd(vote, na.rm = TRUE)) 
summary_political_interest_vote
## # A tibble: 5 × 3
##   polintr  mean std_dev
##     <dbl> <dbl>   <dbl>
## 1       1 0.891   0.312
## 2       2 0.839   0.367
## 3       3 0.674   0.469
## 4       4 0.474   0.499
## 5      NA 0.286   0.488
#Mean and Standard Deviation for Vote by Closeness to Party   
summary_close_to_party_vote <- uk_data_clean %>% 
  group_by(clsprty) %>% 
  summarize(mean = mean(vote, na.rm = TRUE),
            std_dev = sd(vote, na.rm = TRUE)) 
summary_close_to_party_vote
## # A tibble: 3 × 3
##   clsprty  mean std_dev
##     <dbl> <dbl>   <dbl>
## 1       1 0.857   0.350
## 2       2 0.609   0.488
## 3      NA 0.742   0.439
vote_by_income <- uk_data_clean %>%
  group_by(hinctnta) %>%
 summarize(mean_vote = mean(vote, na.rm = TRUE))
vote_by_income
## # A tibble: 11 × 2
##    hinctnta mean_vote
##       <dbl>     <dbl>
##  1        1     0.628
##  2        2     0.715
##  3        3     0.711
##  4        4     0.760
##  5        5     0.756
##  6        6     0.788
##  7        7     0.788
##  8        8     0.805
##  9        9     0.812
## 10       10     0.839
## 11       NA     0.718
ggplot (data = vote_by_income,
        mapping = aes(x = hinctnta,
y = mean_vote,
color = "red")) +
  geom_smooth() +
  labs(title = "Likelihood to Vote by Houshold Income Decile", x = "Household Income Decile", y = "Likelihood to Vote") + 
  ylim (0,1) +
  geom_point()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).