R Markdown

# List of packages
packages <- c("tidyverse", "modelsummary", "forcats", "RColorBrewer", 
              "fst", "viridis", "knitr", "kableExtra", "rmarkdown", "ggridges", "viridis", "questionr") # add any you need here

# Install packages if they aren't installed already
#new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
#if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[3]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[7]]
##  [1] "knitr"        "viridis"      "viridisLite"  "fst"          "RColorBrewer"
##  [6] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[8]]
##  [1] "kableExtra"   "knitr"        "viridis"      "viridisLite"  "fst"         
##  [6] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "rmarkdown"    "kableExtra"   "knitr"        "viridis"      "viridisLite" 
##  [6] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[11]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[12]]
##  [1] "questionr"    "ggridges"     "rmarkdown"    "kableExtra"   "knitr"       
##  [6] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"
ess <- read_fst("/Users/jocelyn/Desktop/SOC252/Tutorial 1/All-ESS-Data.fst")

Mission #1: Tables for Variables of Interest

ess_data_clean <- ess %>%
  mutate(
    vote = ifelse(vote %in% c(3, 7, 8, 9), NA, vote),
    mainact = ifelse(mainact %in% c(66, 77, 88, 99), NA, mainact)
  ) %>%
  select(yrbrn, vote, mainact)

table(ess_data_clean$vote)
## 
##      1      2 
## 350196  99401
table(ess_data_clean$yrbrn)
## 
## 1885 1893 1900 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 
##    1    1    5    1    1    3    5    3    7    9   33   41   53   91  109  140 
## 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 
##  157  218  267  332  445  695  811 1093 1169 1323 1660 1815 1929 2439 2604 3071 
## 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 
## 3280 3445 3625 3869 4280 4620 4802 5206 5634 6190 5848 5884 6055 6366 6642 7347 
## 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 
## 7797 7961 8017 8586 7963 8293 8008 8206 8401 8449 8303 8423 8250 9020 8483 8652 
## 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 
## 8347 8705 8836 8543 8313 8407 8273 8627 8054 8153 7656 7820 7879 7691 7463 7482 
## 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 
## 7388 8020 7173 7272 6902 6947 7059 6963 6616 6364 6020 5651 4970 4550 4372 3567 
## 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 7777 8888 9999 
## 3454 3062 2764 2318 2095 1884 1476 1243 1059  698  574  294   10  962  245 1898

Mission #2: Look into an Additional Socio-Demographic Variable (mainact)

table(ess_data_clean$mainact)
## 
##     1     2     3     4     5     6     7     8     9 
## 38877  6654  2388   876  2540 15339   163 11818   949

Mission #3: Filter Variables for Country of Interest (Russia)

Russia_data <- ess[ess$cntry == "RU", ]
Variables_of_Interest <- Russia_data %>%
  filter(cntry == "RU", yrbrn > 1920 & yrbrn < 2005) %>%
  mutate(
    vote = ifelse(vote %in% c(3, 7, 8, 9), NA, vote),
    mainact = ifelse(mainact %in% c(66, 77, 88, 99), NA, mainact)
  ) %>%
  select(yrbrn, vote, mainact)

Mission #4: Created a Data_Summary Skim for Variables of Interest

Before Data Filtering/Cleaning

datasummary_skim(Russia_data %>% select(yrbrn, vote, mainact))
Unique (#) Missing (%) Mean SD Min Median Max
yrbrn 91 0 1983.8 391.4 1912.0 1965.0 9999.0
vote 5 0 1.5 1.0 1.0 1.0 8.0
mainact 9 0 57.4 21.6 1.0 66.0 99.0

After Data Cleaning

datasummary_skim(Variables_of_Interest %>% select(yrbrn, vote, mainact))
Unique (#) Missing (%) Mean SD Min Median Max
yrbrn 81 0 1964.4 18.8 1921.0 1965.0 2001.0
vote 3 7 1.3 0.5 1.0 1.0 2.0
mainact 8 86 3.2 2.7 1.0 1.0 8.0

Mission #5: Frequency Check for “mainact”

freq(Variables_of_Interest$mainact)
##        n    % val%
## 1    905  7.3 53.4
## 2    125  1.0  7.4
## 3     17  0.1  1.0
## 4     18  0.1  1.1
## 5     50  0.4  2.9
## 6    398  3.2 23.5
## 8    183  1.5 10.8
## NA 10707 86.3   NA
Variables_of_Interest %>%
drop_na(mainact) %>%
select(mainact) %>%
freq() %>%
  as.data.frame() %>%
  ggplot(aes(x=factor(rownames(.),
                      levels= c("Paid Work","Education","Unemployed Looking",
                                      "Sick/Disabled", "Retired", "Community or Military Service", 
                                      "Housework, looking
                                       After Children", "Other")), 
             y=`%`)) +
  geom_col() +
  labs(title = "Main Activity",
       x = "mainact")

Mission #6: Conditional Probabilities

table(Variables_of_Interest$yrbrn, Variables_of_Interest$vote) %>%
  rprop()
##       
##        1     2     Total
##   1921 100.0   0.0 100.0
##   1922  77.8  22.2 100.0
##   1923  66.7  33.3 100.0
##   1924  89.7  10.3 100.0
##   1925  89.2  10.8 100.0
##   1926  77.8  22.2 100.0
##   1927  76.9  23.1 100.0
##   1928  84.9  15.1 100.0
##   1929  80.8  19.2 100.0
##   1930  80.5  19.5 100.0
##   1931  72.8  27.2 100.0
##   1932  75.6  24.4 100.0
##   1933  81.6  18.4 100.0
##   1934  69.8  30.2 100.0
##   1935  79.8  20.2 100.0
##   1936  85.3  14.7 100.0
##   1937  81.1  18.9 100.0
##   1938  79.5  20.5 100.0
##   1939  88.8  11.2 100.0
##   1940  81.6  18.4 100.0
##   1941  79.0  21.0 100.0
##   1942  76.2  23.8 100.0
##   1943  77.6  22.4 100.0
##   1944  82.0  18.0 100.0
##   1945  80.0  20.0 100.0
##   1946  77.2  22.8 100.0
##   1947  82.2  17.8 100.0
##   1948  80.6  19.4 100.0
##   1949  84.0  16.0 100.0
##   1950  76.6  23.4 100.0
##   1951  80.0  20.0 100.0
##   1952  73.4  26.6 100.0
##   1953  72.4  27.6 100.0
##   1954  69.7  30.3 100.0
##   1955  78.4  21.6 100.0
##   1956  76.9  23.1 100.0
##   1957  73.3  26.7 100.0
##   1958  69.1  30.9 100.0
##   1959  67.7  32.3 100.0
##   1960  69.0  31.0 100.0
##   1961  72.7  27.3 100.0
##   1962  73.7  26.3 100.0
##   1963  70.7  29.3 100.0
##   1964  72.3  27.7 100.0
##   1965  69.4  30.6 100.0
##   1966  70.6  29.4 100.0
##   1967  61.5  38.5 100.0
##   1968  57.9  42.1 100.0
##   1969  65.4  34.6 100.0
##   1970  67.2  32.8 100.0
##   1971  64.5  35.5 100.0
##   1972  61.6  38.4 100.0
##   1973  65.6  34.4 100.0
##   1974  64.5  35.5 100.0
##   1975  63.7  36.3 100.0
##   1976  60.4  39.6 100.0
##   1977  53.9  46.1 100.0
##   1978  59.3  40.7 100.0
##   1979  60.7  39.3 100.0
##   1980  57.3  42.7 100.0
##   1981  47.8  52.2 100.0
##   1982  53.7  46.3 100.0
##   1983  54.7  45.3 100.0
##   1984  56.1  43.9 100.0
##   1985  58.7  41.3 100.0
##   1986  49.6  50.4 100.0
##   1987  43.7  56.3 100.0
##   1988  44.8  55.2 100.0
##   1989  41.7  58.3 100.0
##   1990  38.4  61.6 100.0
##   1991  31.8  68.2 100.0
##   1992  43.2  56.8 100.0
##   1993  50.7  49.3 100.0
##   1994  40.5  59.5 100.0
##   1995  42.4  57.6 100.0
##   1996  36.4  63.6 100.0
##   1997  38.1  61.9 100.0
##   All   66.8  33.2 100.0

Mission #7: Crosstabs

yrbrnvote <- datasummary_crosstab(yrbrn ~ vote, data = Variables_of_Interest)
yrbrnvote
yrbrn 1  2 All
1921 N 4 0 4
% row 100.0 0.0 100.0
1922 N 7 2 9
% row 77.8 22.2 100.0
1923 N 20 10 31
% row 64.5 32.3 100.0
1924 N 26 3 32
% row 81.2 9.4 100.0
1925 N 33 4 37
% row 89.2 10.8 100.0
1926 N 35 10 46
% row 76.1 21.7 100.0
1927 N 30 9 41
% row 73.2 22.0 100.0
1928 N 62 11 76
% row 81.6 14.5 100.0
1929 N 59 14 74
% row 79.7 18.9 100.0
1930 N 70 17 87
% row 80.5 19.5 100.0
1931 N 59 22 83
% row 71.1 26.5 100.0
1932 N 68 22 92
% row 73.9 23.9 100.0
1933 N 62 14 76
% row 81.6 18.4 100.0
1934 N 60 26 86
% row 69.8 30.2 100.0
1935 N 91 23 115
% row 79.1 20.0 100.0
1936 N 116 20 137
% row 84.7 14.6 100.0
1937 N 129 30 160
% row 80.6 18.8 100.0
1938 N 132 34 169
% row 78.1 20.1 100.0
1939 N 135 17 152
% row 88.8 11.2 100.0
1940 N 146 33 180
% row 81.1 18.3 100.0
1941 N 128 34 164
% row 78.0 20.7 100.0
1942 N 77 24 102
% row 75.5 23.5 100.0
1943 N 76 22 99
% row 76.8 22.2 100.0
1944 N 82 18 103
% row 79.6 17.5 100.0
1945 N 92 23 117
% row 78.6 19.7 100.0
1946 N 122 36 163
% row 74.8 22.1 100.0
1947 N 171 37 210
% row 81.4 17.6 100.0
1948 N 141 34 179
% row 78.8 19.0 100.0
1949 N 168 32 202
% row 83.2 15.8 100.0
1950 N 157 48 206
% row 76.2 23.3 100.0
1951 N 156 39 195
% row 80.0 20.0 100.0
1952 N 152 55 209
% row 72.7 26.3 100.0
1953 N 126 48 174
% row 72.4 27.6 100.0
1954 N 161 70 231
% row 69.7 30.3 100.0
1955 N 149 41 193
% row 77.2 21.2 100.0
1956 N 150 45 201
% row 74.6 22.4 100.0
1957 N 159 58 219
% row 72.6 26.5 100.0
1958 N 141 63 205
% row 68.8 30.7 100.0
1959 N 130 62 194
% row 67.0 32.0 100.0
1960 N 156 70 229
% row 68.1 30.6 100.0
1961 N 152 57 214
% row 71.0 26.6 100.0
1962 N 137 49 193
% row 71.0 25.4 100.0
1963 N 116 48 170
% row 68.2 28.2 100.0
1964 N 146 56 205
% row 71.2 27.3 100.0
1965 N 127 56 188
% row 67.6 29.8 100.0
1966 N 144 60 211
% row 68.2 28.4 100.0
1967 N 112 70 187
% row 59.9 37.4 100.0
1968 N 114 83 201
% row 56.7 41.3 100.0
1969 N 102 54 161
% row 63.4 33.5 100.0
1970 N 131 64 198
% row 66.2 32.3 100.0
1971 N 131 72 207
% row 63.3 34.8 100.0
1972 N 125 78 208
% row 60.1 37.5 100.0
1973 N 124 65 196
% row 63.3 33.2 100.0
1974 N 129 71 206
% row 62.6 34.5 100.0
1975 N 116 66 187
% row 62.0 35.3 100.0
1976 N 134 88 227
% row 59.0 38.8 100.0
1977 N 104 89 199
% row 52.3 44.7 100.0
1978 N 112 77 195
% row 57.4 39.5 100.0
1979 N 111 72 190
% row 58.4 37.9 100.0
1980 N 133 99 238
% row 55.9 41.6 100.0
1981 N 96 105 210
% row 45.7 50.0 100.0
1982 N 110 95 210
% row 52.4 45.2 100.0
1983 N 122 101 225
% row 54.2 44.9 100.0
1984 N 128 100 234
% row 54.7 42.7 100.0
1985 N 122 86 216
% row 56.5 39.8 100.0
1986 N 130 132 273
% row 47.6 48.4 100.0
1987 N 101 130 249
% row 40.6 52.2 100.0
1988 N 91 112 224
% row 40.6 50.0 100.0
1989 N 88 123 221
% row 39.8 55.7 100.0
1990 N 43 69 235
% row 18.3 29.4 100.0
1991 N 34 73 195
% row 17.4 37.4 100.0
1992 N 38 50 141
% row 27.0 35.5 100.0
1993 N 34 33 130
% row 26.2 25.4 100.0
1994 N 15 22 97
% row 15.5 22.7 100.0
1995 N 14 19 87
% row 16.1 21.8 100.0
1996 N 12 21 67
% row 17.9 31.3 100.0
1997 N 8 13 44
% row 18.2 29.5 100.0
1998 N 0 0 31
% row 0.0 0.0 100.0
1999 N 0 0 17
% row 0.0 0.0 100.0
2000 N 0 0 19
% row 0.0 0.0 100.0
2001 N 0 0 15
% row 0.0 0.0 100.0
All N 7724 3838 12403
% row 62.3 30.9 100.0

Mission #8: Produce a Visual of a Categorical Variable

table(Variables_of_Interest$mainact, Variables_of_Interest$yrbrn) %>%
  cprop() %>%
  as.data.frame() %>%
  filter(Var1 != "Total",
         Var2 != "All") %>%
  ggplot(aes(x=Var2 %>% as.character() %>% as.integer(), 
             y=Freq, 
             color=Var1)) +
  geom_line() +
  labs(title="Main Act vs. Yearborn",
       x = "Survey",
       color = "Main Act")

Mission #9: Create a Visualization for a Cetegorical Value and Outcome of Interest

table(Variables_of_Interest$mainact, Variables_of_Interest$vote) %>%
  cprop() %>%
  as.data.frame() %>%
  filter(Var1 != "Total",
         Var2 != "1, 2") %>%
  ggplot(aes(x=Var2 %>% as.character() %>% as.integer(1, 2), 
             y=Freq, 
             color=Var1)) +
  geom_line() +
  labs(title="Main Act vs. Yearborn",
       x = "Survey",
       color = "Main Act")
## Warning in Var2 %>% as.character() %>% as.integer(1, 2): NAs introduced by
## coercion

## Warning in Var2 %>% as.character() %>% as.integer(1, 2): NAs introduced by
## coercion
## Warning: Removed 7 rows containing missing values (`geom_line()`).

Mission #10: Do a Second Visualization with Conditional Probabilities

# double check clean
Variables_of_Interest <- Variables_of_Interest %>%
  filter(!is.na(mainact) & !is.na(vote))

# visualize
table(Variables_of_Interest$mainact, Variables_of_Interest$vote) %>%
  cprop() %>%
  as.data.frame() %>%
  filter(Var1 != "Total",
         Var2 != "All") %>%
  ggplot(aes(x=Var1, y=Freq, fill=Var2)) +
  geom_col(position = "dodge") +
  labs(title="Main Act vs. Probability of Voting",
       x = "Main Act",
       fill = "Voted vs. Did not Vote")