Participation_3

R Markdown

# List of packages
packages <- c("tidyverse", "modelsummary", "forcats", "RColorBrewer", 
              "fst", "viridis", "knitr", "kableExtra", "rmarkdown", "ggridges", "viridis", "questionr") # add any you need here

# Install packages if they aren't installed already
#new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
#if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[3]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[7]]
##  [1] "knitr"        "viridis"      "viridisLite"  "fst"          "RColorBrewer"
##  [6] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[8]]
##  [1] "kableExtra"   "knitr"        "viridis"      "viridisLite"  "fst"         
##  [6] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "rmarkdown"    "kableExtra"   "knitr"        "viridis"      "viridisLite" 
##  [6] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[11]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[12]]
##  [1] "questionr"    "ggridges"     "rmarkdown"    "kableExtra"   "knitr"       
##  [6] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"

ess <- read_fst("/Users/jocelyn/Desktop/SOC252/Tutorial 1/All-ESS-Data.fst")

Mission #1: Tables for Variables of Interest

ess_data_clean <- ess %>%
  mutate(
    vote = ifelse(vote %in% c(3, 7, 8, 9), NA, vote),
    mainact = ifelse(mainact %in% c(66, 77, 88, 99), NA, mainact)
  ) %>%
  select(yrbrn, vote, mainact)

table(ess_data_clean$vote)

## 
##      1      2 
## 350196  99401

table(ess_data_clean$yrbrn)

## 
## 1885 1893 1900 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 
##    1    1    5    1    1    3    5    3    7    9   33   41   53   91  109  140 
## 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 
##  157  218  267  332  445  695  811 1093 1169 1323 1660 1815 1929 2439 2604 3071 
## 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 
## 3280 3445 3625 3869 4280 4620 4802 5206 5634 6190 5848 5884 6055 6366 6642 7347 
## 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 
## 7797 7961 8017 8586 7963 8293 8008 8206 8401 8449 8303 8423 8250 9020 8483 8652 
## 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 
## 8347 8705 8836 8543 8313 8407 8273 8627 8054 8153 7656 7820 7879 7691 7463 7482 
## 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 
## 7388 8020 7173 7272 6902 6947 7059 6963 6616 6364 6020 5651 4970 4550 4372 3567 
## 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 7777 8888 9999 
## 3454 3062 2764 2318 2095 1884 1476 1243 1059  698  574  294   10  962  245 1898

Mission #2: Look into an Additional Socio-Demographic Variable (mainact)

table(ess_data_clean$mainact)

## 
##     1     2     3     4     5     6     7     8     9 
## 38877  6654  2388   876  2540 15339   163 11818   949

Mission #3: Filter Variables for Country of Interest (Russia)

Russia_data <- ess[ess$cntry == "RU", ]

Variables_of_Interest <- Russia_data %>%
  filter(cntry == "RU", yrbrn > 1920 & yrbrn < 2005) %>%
  mutate(
    vote = ifelse(vote %in% c(3, 7, 8, 9), NA, vote),
    mainact = ifelse(mainact %in% c(66, 77, 88, 99), NA, mainact)
  ) %>%
  select(yrbrn, vote, mainact)

Mission #4: Created a Data_Summary Skim for Variables of Interest

Before Data Filtering/Cleaning

datasummary_skim(Russia_data %>% select(yrbrn, vote, mainact))

	Unique (#)	Mean	SD	Min	Median	Max
yrbrn	91	1983.8	391.4	1912.0	1965.0	9999.0
vote	5	1.5	1.0	1.0	1.0	8.0
mainact	9	57.4	21.6	1.0	66.0	99.0

After Data Cleaning

datasummary_skim(Variables_of_Interest %>% select(yrbrn, vote, mainact))

	Unique (#)	Missing (%)	Mean	SD	Min	Median	Max
yrbrn	81	0	1964.4	18.8	1921.0	1965.0	2001.0
vote	3	7	1.3	0.5	1.0	1.0	2.0
mainact	8	86	3.2	2.7	1.0	1.0	8.0

Mission #5: Frequency Check for “mainact”

freq(Variables_of_Interest$mainact)

##        n    % val%
## 1    905  7.3 53.4
## 2    125  1.0  7.4
## 3     17  0.1  1.0
## 4     18  0.1  1.1
## 5     50  0.4  2.9
## 6    398  3.2 23.5
## 8    183  1.5 10.8
## NA 10707 86.3   NA

Variables_of_Interest %>%
drop_na(mainact) %>%
select(mainact) %>%
freq() %>%
  as.data.frame() %>%
  ggplot(aes(x=factor(rownames(.),
                      levels= c("Paid Work","Education","Unemployed Looking",
                                      "Sick/Disabled", "Retired", "Community or Military Service", 
                                      "Housework, looking
                                       After Children", "Other")), 
             y=`%`)) +
  geom_col() +
  labs(title = "Main Activity",
       x = "mainact")

Mission #6: Conditional Probabilities

table(Variables_of_Interest$yrbrn, Variables_of_Interest$vote) %>%
  rprop()

##       
##        1     2     Total
##   1921 100.0   0.0 100.0
##   1922  77.8  22.2 100.0
##   1923  66.7  33.3 100.0
##   1924  89.7  10.3 100.0
##   1925  89.2  10.8 100.0
##   1926  77.8  22.2 100.0
##   1927  76.9  23.1 100.0
##   1928  84.9  15.1 100.0
##   1929  80.8  19.2 100.0
##   1930  80.5  19.5 100.0
##   1931  72.8  27.2 100.0
##   1932  75.6  24.4 100.0
##   1933  81.6  18.4 100.0
##   1934  69.8  30.2 100.0
##   1935  79.8  20.2 100.0
##   1936  85.3  14.7 100.0
##   1937  81.1  18.9 100.0
##   1938  79.5  20.5 100.0
##   1939  88.8  11.2 100.0
##   1940  81.6  18.4 100.0
##   1941  79.0  21.0 100.0
##   1942  76.2  23.8 100.0
##   1943  77.6  22.4 100.0
##   1944  82.0  18.0 100.0
##   1945  80.0  20.0 100.0
##   1946  77.2  22.8 100.0
##   1947  82.2  17.8 100.0
##   1948  80.6  19.4 100.0
##   1949  84.0  16.0 100.0
##   1950  76.6  23.4 100.0
##   1951  80.0  20.0 100.0
##   1952  73.4  26.6 100.0
##   1953  72.4  27.6 100.0
##   1954  69.7  30.3 100.0
##   1955  78.4  21.6 100.0
##   1956  76.9  23.1 100.0
##   1957  73.3  26.7 100.0
##   1958  69.1  30.9 100.0
##   1959  67.7  32.3 100.0
##   1960  69.0  31.0 100.0
##   1961  72.7  27.3 100.0
##   1962  73.7  26.3 100.0
##   1963  70.7  29.3 100.0
##   1964  72.3  27.7 100.0
##   1965  69.4  30.6 100.0
##   1966  70.6  29.4 100.0
##   1967  61.5  38.5 100.0
##   1968  57.9  42.1 100.0
##   1969  65.4  34.6 100.0
##   1970  67.2  32.8 100.0
##   1971  64.5  35.5 100.0
##   1972  61.6  38.4 100.0
##   1973  65.6  34.4 100.0
##   1974  64.5  35.5 100.0
##   1975  63.7  36.3 100.0
##   1976  60.4  39.6 100.0
##   1977  53.9  46.1 100.0
##   1978  59.3  40.7 100.0
##   1979  60.7  39.3 100.0
##   1980  57.3  42.7 100.0
##   1981  47.8  52.2 100.0
##   1982  53.7  46.3 100.0
##   1983  54.7  45.3 100.0
##   1984  56.1  43.9 100.0
##   1985  58.7  41.3 100.0
##   1986  49.6  50.4 100.0
##   1987  43.7  56.3 100.0
##   1988  44.8  55.2 100.0
##   1989  41.7  58.3 100.0
##   1990  38.4  61.6 100.0
##   1991  31.8  68.2 100.0
##   1992  43.2  56.8 100.0
##   1993  50.7  49.3 100.0
##   1994  40.5  59.5 100.0
##   1995  42.4  57.6 100.0
##   1996  36.4  63.6 100.0
##   1997  38.1  61.9 100.0
##   All   66.8  33.2 100.0

Mission #7: Crosstabs

yrbrnvote <- datasummary_crosstab(yrbrn ~ vote, data = Variables_of_Interest)
yrbrnvote

yrbrn		1	2	All
1921	N	4	0	4
	% row	100.0	0.0	100.0
1922	N	7	2	9
	% row	77.8	22.2	100.0
1923	N	20	10	31
	% row	64.5	32.3	100.0
1924	N	26	3	32
	% row	81.2	9.4	100.0
1925	N	33	4	37
	% row	89.2	10.8	100.0
1926	N	35	10	46
	% row	76.1	21.7	100.0
1927	N	30	9	41
	% row	73.2	22.0	100.0
1928	N	62	11	76
	% row	81.6	14.5	100.0
1929	N	59	14	74
	% row	79.7	18.9	100.0
1930	N	70	17	87
	% row	80.5	19.5	100.0
1931	N	59	22	83
	% row	71.1	26.5	100.0
1932	N	68	22	92
	% row	73.9	23.9	100.0
1933	N	62	14	76
	% row	81.6	18.4	100.0
1934	N	60	26	86
	% row	69.8	30.2	100.0
1935	N	91	23	115
	% row	79.1	20.0	100.0
1936	N	116	20	137
	% row	84.7	14.6	100.0
1937	N	129	30	160
	% row	80.6	18.8	100.0
1938	N	132	34	169
	% row	78.1	20.1	100.0
1939	N	135	17	152
	% row	88.8	11.2	100.0
1940	N	146	33	180
	% row	81.1	18.3	100.0
1941	N	128	34	164
	% row	78.0	20.7	100.0
1942	N	77	24	102
	% row	75.5	23.5	100.0
1943	N	76	22	99
	% row	76.8	22.2	100.0
1944	N	82	18	103
	% row	79.6	17.5	100.0
1945	N	92	23	117
	% row	78.6	19.7	100.0
1946	N	122	36	163
	% row	74.8	22.1	100.0
1947	N	171	37	210
	% row	81.4	17.6	100.0
1948	N	141	34	179
	% row	78.8	19.0	100.0
1949	N	168	32	202
	% row	83.2	15.8	100.0
1950	N	157	48	206
	% row	76.2	23.3	100.0
1951	N	156	39	195
	% row	80.0	20.0	100.0
1952	N	152	55	209
	% row	72.7	26.3	100.0
1953	N	126	48	174
	% row	72.4	27.6	100.0
1954	N	161	70	231
	% row	69.7	30.3	100.0
1955	N	149	41	193
	% row	77.2	21.2	100.0
1956	N	150	45	201
	% row	74.6	22.4	100.0
1957	N	159	58	219
	% row	72.6	26.5	100.0
1958	N	141	63	205
	% row	68.8	30.7	100.0
1959	N	130	62	194
	% row	67.0	32.0	100.0
1960	N	156	70	229
	% row	68.1	30.6	100.0
1961	N	152	57	214
	% row	71.0	26.6	100.0
1962	N	137	49	193
	% row	71.0	25.4	100.0
1963	N	116	48	170
	% row	68.2	28.2	100.0
1964	N	146	56	205
	% row	71.2	27.3	100.0
1965	N	127	56	188
	% row	67.6	29.8	100.0
1966	N	144	60	211
	% row	68.2	28.4	100.0
1967	N	112	70	187
	% row	59.9	37.4	100.0
1968	N	114	83	201
	% row	56.7	41.3	100.0
1969	N	102	54	161
	% row	63.4	33.5	100.0
1970	N	131	64	198
	% row	66.2	32.3	100.0
1971	N	131	72	207
	% row	63.3	34.8	100.0
1972	N	125	78	208
	% row	60.1	37.5	100.0
1973	N	124	65	196
	% row	63.3	33.2	100.0
1974	N	129	71	206
	% row	62.6	34.5	100.0
1975	N	116	66	187
	% row	62.0	35.3	100.0
1976	N	134	88	227
	% row	59.0	38.8	100.0
1977	N	104	89	199
	% row	52.3	44.7	100.0
1978	N	112	77	195
	% row	57.4	39.5	100.0
1979	N	111	72	190
	% row	58.4	37.9	100.0
1980	N	133	99	238
	% row	55.9	41.6	100.0
1981	N	96	105	210
	% row	45.7	50.0	100.0
1982	N	110	95	210
	% row	52.4	45.2	100.0
1983	N	122	101	225
	% row	54.2	44.9	100.0
1984	N	128	100	234
	% row	54.7	42.7	100.0
1985	N	122	86	216
	% row	56.5	39.8	100.0
1986	N	130	132	273
	% row	47.6	48.4	100.0
1987	N	101	130	249
	% row	40.6	52.2	100.0
1988	N	91	112	224
	% row	40.6	50.0	100.0
1989	N	88	123	221
	% row	39.8	55.7	100.0
1990	N	43	69	235
	% row	18.3	29.4	100.0
1991	N	34	73	195
	% row	17.4	37.4	100.0
1992	N	38	50	141
	% row	27.0	35.5	100.0
1993	N	34	33	130
	% row	26.2	25.4	100.0
1994	N	15	22	97
	% row	15.5	22.7	100.0
1995	N	14	19	87
	% row	16.1	21.8	100.0
1996	N	12	21	67
	% row	17.9	31.3	100.0
1997	N	8	13	44
	% row	18.2	29.5	100.0
1998	N	0	0	31
	% row	0.0	0.0	100.0
1999	N	0	0	17
	% row	0.0	0.0	100.0
2000	N	0	0	19
	% row	0.0	0.0	100.0
2001	N	0	0	15
	% row	0.0	0.0	100.0
All	N	7724	3838	12403
	% row	62.3	30.9	100.0

Mission #8: Produce a Visual of a Categorical Variable

table(Variables_of_Interest$mainact, Variables_of_Interest$yrbrn) %>%
  cprop() %>%
  as.data.frame() %>%
  filter(Var1 != "Total",
         Var2 != "All") %>%
  ggplot(aes(x=Var2 %>% as.character() %>% as.integer(), 
             y=Freq, 
             color=Var1)) +
  geom_line() +
  labs(title="Main Act vs. Yearborn",
       x = "Survey",
       color = "Main Act")

Mission #9: Create a Visualization for a Cetegorical Value and Outcome of Interest

table(Variables_of_Interest$mainact, Variables_of_Interest$vote) %>%
  cprop() %>%
  as.data.frame() %>%
  filter(Var1 != "Total",
         Var2 != "1, 2") %>%
  ggplot(aes(x=Var2 %>% as.character() %>% as.integer(1, 2), 
             y=Freq, 
             color=Var1)) +
  geom_line() +
  labs(title="Main Act vs. Yearborn",
       x = "Survey",
       color = "Main Act")

## Warning in Var2 %>% as.character() %>% as.integer(1, 2): NAs introduced by
## coercion

## Warning in Var2 %>% as.character() %>% as.integer(1, 2): NAs introduced by
## coercion

## Warning: Removed 7 rows containing missing values (`geom_line()`).

Mission #10: Do a Second Visualization with Conditional Probabilities

# double check clean
Variables_of_Interest <- Variables_of_Interest %>%
  filter(!is.na(mainact) & !is.na(vote))

# visualize
table(Variables_of_Interest$mainact, Variables_of_Interest$vote) %>%
  cprop() %>%
  as.data.frame() %>%
  filter(Var1 != "Total",
         Var2 != "All") %>%
  ggplot(aes(x=Var1, y=Freq, fill=Var2)) +
  geom_col(position = "dodge") +
  labs(title="Main Act vs. Probability of Voting",
       x = "Main Act",
       fill = "Voted vs. Did not Vote")

Participation_3_MATTKA

Jocelyn Mattka

2023-10-09

R Markdown

Mission #1: Tables for Variables of Interest

Mission #2: Look into an Additional Socio-Demographic Variable (mainact)

Mission #3: Filter Variables for Country of Interest (Russia)

Mission #4: Created a Data_Summary Skim for Variables of Interest

Mission #5: Frequency Check for “mainact”

Mission #6: Conditional Probabilities

Mission #7: Crosstabs

Mission #8: Produce a Visual of a Categorical Variable

Mission #9: Create a Visualization for a Cetegorical Value and Outcome of Interest

Mission #10: Do a Second Visualization with Conditional Probabilities