R Markdown
# List of packages
packages <- c("tidyverse", "modelsummary", "forcats", "RColorBrewer",
"fst", "viridis", "knitr", "kableExtra", "rmarkdown", "ggridges", "viridis", "questionr") # add any you need here
# Install packages if they aren't installed already
#new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
#if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
##
##
## Attaching package: 'kableExtra'
##
##
## The following object is masked from 'package:dplyr':
##
## group_rows
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [6] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [11] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [6] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [11] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[4]]
## [1] "RColorBrewer" "modelsummary" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "fst" "RColorBrewer" "modelsummary" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[6]]
## [1] "viridis" "viridisLite" "fst" "RColorBrewer" "modelsummary"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[7]]
## [1] "knitr" "viridis" "viridisLite" "fst" "RColorBrewer"
## [6] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[8]]
## [1] "kableExtra" "knitr" "viridis" "viridisLite" "fst"
## [6] "RColorBrewer" "modelsummary" "lubridate" "forcats" "stringr"
## [11] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [21] "utils" "datasets" "methods" "base"
##
## [[9]]
## [1] "rmarkdown" "kableExtra" "knitr" "viridis" "viridisLite"
## [6] "fst" "RColorBrewer" "modelsummary" "lubridate" "forcats"
## [11] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [16] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [21] "grDevices" "utils" "datasets" "methods" "base"
##
## [[10]]
## [1] "ggridges" "rmarkdown" "kableExtra" "knitr" "viridis"
## [6] "viridisLite" "fst" "RColorBrewer" "modelsummary" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[11]]
## [1] "ggridges" "rmarkdown" "kableExtra" "knitr" "viridis"
## [6] "viridisLite" "fst" "RColorBrewer" "modelsummary" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[12]]
## [1] "questionr" "ggridges" "rmarkdown" "kableExtra" "knitr"
## [6] "viridis" "viridisLite" "fst" "RColorBrewer" "modelsummary"
## [11] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [16] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [21] "stats" "graphics" "grDevices" "utils" "datasets"
## [26] "methods" "base"
ess <- read_fst("/Users/jocelyn/Desktop/SOC252/Tutorial 1/All-ESS-Data.fst")
Mission #1: Tables for Variables of Interest
ess_data_clean <- ess %>%
mutate(
vote = ifelse(vote %in% c(3, 7, 8, 9), NA, vote),
mainact = ifelse(mainact %in% c(66, 77, 88, 99), NA, mainact)
) %>%
select(yrbrn, vote, mainact)
table(ess_data_clean$vote)
##
## 1 2
## 350196 99401
table(ess_data_clean$yrbrn)
##
## 1885 1893 1900 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914
## 1 1 5 1 1 3 5 3 7 9 33 41 53 91 109 140
## 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930
## 157 218 267 332 445 695 811 1093 1169 1323 1660 1815 1929 2439 2604 3071
## 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946
## 3280 3445 3625 3869 4280 4620 4802 5206 5634 6190 5848 5884 6055 6366 6642 7347
## 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962
## 7797 7961 8017 8586 7963 8293 8008 8206 8401 8449 8303 8423 8250 9020 8483 8652
## 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978
## 8347 8705 8836 8543 8313 8407 8273 8627 8054 8153 7656 7820 7879 7691 7463 7482
## 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994
## 7388 8020 7173 7272 6902 6947 7059 6963 6616 6364 6020 5651 4970 4550 4372 3567
## 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 7777 8888 9999
## 3454 3062 2764 2318 2095 1884 1476 1243 1059 698 574 294 10 962 245 1898
Mission #2: Look into an Additional Socio-Demographic Variable
(mainact)
table(ess_data_clean$mainact)
##
## 1 2 3 4 5 6 7 8 9
## 38877 6654 2388 876 2540 15339 163 11818 949
Mission #3: Filter Variables for Country of Interest (Russia)
Russia_data <- ess[ess$cntry == "RU", ]
Variables_of_Interest <- Russia_data %>%
filter(cntry == "RU", yrbrn > 1920 & yrbrn < 2005) %>%
mutate(
vote = ifelse(vote %in% c(3, 7, 8, 9), NA, vote),
mainact = ifelse(mainact %in% c(66, 77, 88, 99), NA, mainact)
) %>%
select(yrbrn, vote, mainact)
Mission #4: Created a Data_Summary Skim for Variables of
Interest
Before Data Filtering/Cleaning
datasummary_skim(Russia_data %>% select(yrbrn, vote, mainact))
Unique (#)
Missing (%)
Mean
SD
Min
Median
Max
yrbrn
91
0
1983.8
391.4
1912.0
1965.0
9999.0
vote
5
0
1.5
1.0
1.0
1.0
8.0
mainact
9
0
57.4
21.6
1.0
66.0
99.0
After Data Cleaning
datasummary_skim(Variables_of_Interest %>% select(yrbrn, vote, mainact))
Unique (#)
Missing (%)
Mean
SD
Min
Median
Max
yrbrn
81
0
1964.4
18.8
1921.0
1965.0
2001.0
vote
3
7
1.3
0.5
1.0
1.0
2.0
mainact
8
86
3.2
2.7
1.0
1.0
8.0
Mission #5: Frequency Check for “mainact”
freq(Variables_of_Interest$mainact)
## n % val%
## 1 905 7.3 53.4
## 2 125 1.0 7.4
## 3 17 0.1 1.0
## 4 18 0.1 1.1
## 5 50 0.4 2.9
## 6 398 3.2 23.5
## 8 183 1.5 10.8
## NA 10707 86.3 NA
Variables_of_Interest %>%
drop_na(mainact) %>%
select(mainact) %>%
freq() %>%
as.data.frame() %>%
ggplot(aes(x=factor(rownames(.),
levels= c("Paid Work","Education","Unemployed Looking",
"Sick/Disabled", "Retired", "Community or Military Service",
"Housework, looking
After Children", "Other")),
y=`%`)) +
geom_col() +
labs(title = "Main Activity",
x = "mainact")
Mission #6: Conditional Probabilities
table(Variables_of_Interest$yrbrn, Variables_of_Interest$vote) %>%
rprop()
##
## 1 2 Total
## 1921 100.0 0.0 100.0
## 1922 77.8 22.2 100.0
## 1923 66.7 33.3 100.0
## 1924 89.7 10.3 100.0
## 1925 89.2 10.8 100.0
## 1926 77.8 22.2 100.0
## 1927 76.9 23.1 100.0
## 1928 84.9 15.1 100.0
## 1929 80.8 19.2 100.0
## 1930 80.5 19.5 100.0
## 1931 72.8 27.2 100.0
## 1932 75.6 24.4 100.0
## 1933 81.6 18.4 100.0
## 1934 69.8 30.2 100.0
## 1935 79.8 20.2 100.0
## 1936 85.3 14.7 100.0
## 1937 81.1 18.9 100.0
## 1938 79.5 20.5 100.0
## 1939 88.8 11.2 100.0
## 1940 81.6 18.4 100.0
## 1941 79.0 21.0 100.0
## 1942 76.2 23.8 100.0
## 1943 77.6 22.4 100.0
## 1944 82.0 18.0 100.0
## 1945 80.0 20.0 100.0
## 1946 77.2 22.8 100.0
## 1947 82.2 17.8 100.0
## 1948 80.6 19.4 100.0
## 1949 84.0 16.0 100.0
## 1950 76.6 23.4 100.0
## 1951 80.0 20.0 100.0
## 1952 73.4 26.6 100.0
## 1953 72.4 27.6 100.0
## 1954 69.7 30.3 100.0
## 1955 78.4 21.6 100.0
## 1956 76.9 23.1 100.0
## 1957 73.3 26.7 100.0
## 1958 69.1 30.9 100.0
## 1959 67.7 32.3 100.0
## 1960 69.0 31.0 100.0
## 1961 72.7 27.3 100.0
## 1962 73.7 26.3 100.0
## 1963 70.7 29.3 100.0
## 1964 72.3 27.7 100.0
## 1965 69.4 30.6 100.0
## 1966 70.6 29.4 100.0
## 1967 61.5 38.5 100.0
## 1968 57.9 42.1 100.0
## 1969 65.4 34.6 100.0
## 1970 67.2 32.8 100.0
## 1971 64.5 35.5 100.0
## 1972 61.6 38.4 100.0
## 1973 65.6 34.4 100.0
## 1974 64.5 35.5 100.0
## 1975 63.7 36.3 100.0
## 1976 60.4 39.6 100.0
## 1977 53.9 46.1 100.0
## 1978 59.3 40.7 100.0
## 1979 60.7 39.3 100.0
## 1980 57.3 42.7 100.0
## 1981 47.8 52.2 100.0
## 1982 53.7 46.3 100.0
## 1983 54.7 45.3 100.0
## 1984 56.1 43.9 100.0
## 1985 58.7 41.3 100.0
## 1986 49.6 50.4 100.0
## 1987 43.7 56.3 100.0
## 1988 44.8 55.2 100.0
## 1989 41.7 58.3 100.0
## 1990 38.4 61.6 100.0
## 1991 31.8 68.2 100.0
## 1992 43.2 56.8 100.0
## 1993 50.7 49.3 100.0
## 1994 40.5 59.5 100.0
## 1995 42.4 57.6 100.0
## 1996 36.4 63.6 100.0
## 1997 38.1 61.9 100.0
## All 66.8 33.2 100.0
Mission #7: Crosstabs
yrbrnvote <- datasummary_crosstab(yrbrn ~ vote, data = Variables_of_Interest)
yrbrnvote
yrbrn
1
2
All
1921
N
4
0
4
% row
100.0
0.0
100.0
1922
N
7
2
9
% row
77.8
22.2
100.0
1923
N
20
10
31
% row
64.5
32.3
100.0
1924
N
26
3
32
% row
81.2
9.4
100.0
1925
N
33
4
37
% row
89.2
10.8
100.0
1926
N
35
10
46
% row
76.1
21.7
100.0
1927
N
30
9
41
% row
73.2
22.0
100.0
1928
N
62
11
76
% row
81.6
14.5
100.0
1929
N
59
14
74
% row
79.7
18.9
100.0
1930
N
70
17
87
% row
80.5
19.5
100.0
1931
N
59
22
83
% row
71.1
26.5
100.0
1932
N
68
22
92
% row
73.9
23.9
100.0
1933
N
62
14
76
% row
81.6
18.4
100.0
1934
N
60
26
86
% row
69.8
30.2
100.0
1935
N
91
23
115
% row
79.1
20.0
100.0
1936
N
116
20
137
% row
84.7
14.6
100.0
1937
N
129
30
160
% row
80.6
18.8
100.0
1938
N
132
34
169
% row
78.1
20.1
100.0
1939
N
135
17
152
% row
88.8
11.2
100.0
1940
N
146
33
180
% row
81.1
18.3
100.0
1941
N
128
34
164
% row
78.0
20.7
100.0
1942
N
77
24
102
% row
75.5
23.5
100.0
1943
N
76
22
99
% row
76.8
22.2
100.0
1944
N
82
18
103
% row
79.6
17.5
100.0
1945
N
92
23
117
% row
78.6
19.7
100.0
1946
N
122
36
163
% row
74.8
22.1
100.0
1947
N
171
37
210
% row
81.4
17.6
100.0
1948
N
141
34
179
% row
78.8
19.0
100.0
1949
N
168
32
202
% row
83.2
15.8
100.0
1950
N
157
48
206
% row
76.2
23.3
100.0
1951
N
156
39
195
% row
80.0
20.0
100.0
1952
N
152
55
209
% row
72.7
26.3
100.0
1953
N
126
48
174
% row
72.4
27.6
100.0
1954
N
161
70
231
% row
69.7
30.3
100.0
1955
N
149
41
193
% row
77.2
21.2
100.0
1956
N
150
45
201
% row
74.6
22.4
100.0
1957
N
159
58
219
% row
72.6
26.5
100.0
1958
N
141
63
205
% row
68.8
30.7
100.0
1959
N
130
62
194
% row
67.0
32.0
100.0
1960
N
156
70
229
% row
68.1
30.6
100.0
1961
N
152
57
214
% row
71.0
26.6
100.0
1962
N
137
49
193
% row
71.0
25.4
100.0
1963
N
116
48
170
% row
68.2
28.2
100.0
1964
N
146
56
205
% row
71.2
27.3
100.0
1965
N
127
56
188
% row
67.6
29.8
100.0
1966
N
144
60
211
% row
68.2
28.4
100.0
1967
N
112
70
187
% row
59.9
37.4
100.0
1968
N
114
83
201
% row
56.7
41.3
100.0
1969
N
102
54
161
% row
63.4
33.5
100.0
1970
N
131
64
198
% row
66.2
32.3
100.0
1971
N
131
72
207
% row
63.3
34.8
100.0
1972
N
125
78
208
% row
60.1
37.5
100.0
1973
N
124
65
196
% row
63.3
33.2
100.0
1974
N
129
71
206
% row
62.6
34.5
100.0
1975
N
116
66
187
% row
62.0
35.3
100.0
1976
N
134
88
227
% row
59.0
38.8
100.0
1977
N
104
89
199
% row
52.3
44.7
100.0
1978
N
112
77
195
% row
57.4
39.5
100.0
1979
N
111
72
190
% row
58.4
37.9
100.0
1980
N
133
99
238
% row
55.9
41.6
100.0
1981
N
96
105
210
% row
45.7
50.0
100.0
1982
N
110
95
210
% row
52.4
45.2
100.0
1983
N
122
101
225
% row
54.2
44.9
100.0
1984
N
128
100
234
% row
54.7
42.7
100.0
1985
N
122
86
216
% row
56.5
39.8
100.0
1986
N
130
132
273
% row
47.6
48.4
100.0
1987
N
101
130
249
% row
40.6
52.2
100.0
1988
N
91
112
224
% row
40.6
50.0
100.0
1989
N
88
123
221
% row
39.8
55.7
100.0
1990
N
43
69
235
% row
18.3
29.4
100.0
1991
N
34
73
195
% row
17.4
37.4
100.0
1992
N
38
50
141
% row
27.0
35.5
100.0
1993
N
34
33
130
% row
26.2
25.4
100.0
1994
N
15
22
97
% row
15.5
22.7
100.0
1995
N
14
19
87
% row
16.1
21.8
100.0
1996
N
12
21
67
% row
17.9
31.3
100.0
1997
N
8
13
44
% row
18.2
29.5
100.0
1998
N
0
0
31
% row
0.0
0.0
100.0
1999
N
0
0
17
% row
0.0
0.0
100.0
2000
N
0
0
19
% row
0.0
0.0
100.0
2001
N
0
0
15
% row
0.0
0.0
100.0
All
N
7724
3838
12403
% row
62.3
30.9
100.0
Mission #8: Produce a Visual of a Categorical Variable
table(Variables_of_Interest$mainact, Variables_of_Interest$yrbrn) %>%
cprop() %>%
as.data.frame() %>%
filter(Var1 != "Total",
Var2 != "All") %>%
ggplot(aes(x=Var2 %>% as.character() %>% as.integer(),
y=Freq,
color=Var1)) +
geom_line() +
labs(title="Main Act vs. Yearborn",
x = "Survey",
color = "Main Act")
Mission #9: Create a Visualization for a Cetegorical Value and
Outcome of Interest
table(Variables_of_Interest$mainact, Variables_of_Interest$vote) %>%
cprop() %>%
as.data.frame() %>%
filter(Var1 != "Total",
Var2 != "1, 2") %>%
ggplot(aes(x=Var2 %>% as.character() %>% as.integer(1, 2),
y=Freq,
color=Var1)) +
geom_line() +
labs(title="Main Act vs. Yearborn",
x = "Survey",
color = "Main Act")
## Warning in Var2 %>% as.character() %>% as.integer(1, 2): NAs introduced by
## coercion
## Warning in Var2 %>% as.character() %>% as.integer(1, 2): NAs introduced by
## coercion
## Warning: Removed 7 rows containing missing values (`geom_line()`).
Mission #10: Do a Second Visualization with Conditional
Probabilities
# double check clean
Variables_of_Interest <- Variables_of_Interest %>%
filter(!is.na(mainact) & !is.na(vote))
# visualize
table(Variables_of_Interest$mainact, Variables_of_Interest$vote) %>%
cprop() %>%
as.data.frame() %>%
filter(Var1 != "Total",
Var2 != "All") %>%
ggplot(aes(x=Var1, y=Freq, fill=Var2)) +
geom_col(position = "dodge") +
labs(title="Main Act vs. Probability of Voting",
x = "Main Act",
fill = "Voted vs. Did not Vote")