# List of packages
packages <- c("tidyverse", "modelsummary", "forcats", "RColorBrewer",
"fst", "viridis", "knitr", "kableExtra", "rmarkdown", "ggridges", "viridis", "questionr")
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
##
##
## Attaching package: 'kableExtra'
##
##
## The following object is masked from 'package:dplyr':
##
## group_rows
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [6] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [11] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[3]]
## [1] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [6] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [11] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[4]]
## [1] "RColorBrewer" "modelsummary" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "fst" "RColorBrewer" "modelsummary" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[6]]
## [1] "viridis" "viridisLite" "fst" "RColorBrewer" "modelsummary"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[7]]
## [1] "knitr" "viridis" "viridisLite" "fst" "RColorBrewer"
## [6] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[8]]
## [1] "kableExtra" "knitr" "viridis" "viridisLite" "fst"
## [6] "RColorBrewer" "modelsummary" "lubridate" "forcats" "stringr"
## [11] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [21] "utils" "datasets" "methods" "base"
##
## [[9]]
## [1] "rmarkdown" "kableExtra" "knitr" "viridis" "viridisLite"
## [6] "fst" "RColorBrewer" "modelsummary" "lubridate" "forcats"
## [11] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [16] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [21] "grDevices" "utils" "datasets" "methods" "base"
##
## [[10]]
## [1] "ggridges" "rmarkdown" "kableExtra" "knitr" "viridis"
## [6] "viridisLite" "fst" "RColorBrewer" "modelsummary" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[11]]
## [1] "ggridges" "rmarkdown" "kableExtra" "knitr" "viridis"
## [6] "viridisLite" "fst" "RColorBrewer" "modelsummary" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[12]]
## [1] "questionr" "ggridges" "rmarkdown" "kableExtra" "knitr"
## [6] "viridis" "viridisLite" "fst" "RColorBrewer" "modelsummary"
## [11] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [16] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [21] "stats" "graphics" "grDevices" "utils" "datasets"
## [26] "methods" "base"
#install.packages("fst")
library(fst)
setwd("~/Desktop/Homework_5_project")
ess <- read_fst("All-ESS-Data.fst")
ess$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
ess$year[ess$essround == i] <- replacements[i]
}
table(ess$flttrd)
##
## 1 2 3 4 7 8 9
## 8989 24458 7219 2034 24 211 65
table(ess$imwbcnt)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 26607 18119 31951 44795 46898 136926 46688 48299 37746 12610 15653
## 77 88 99
## 748 22278 1237
table(ess$netusoft)
##
## 1 2 3 4 5 7 8 9
## 23872 9013 8739 14124 97494 49 101 199
france_data <- ess %>% # Start with the original 'ess' dataset
filter(cntry == "FR") %>% # Filter the data to only include rows where country (cntry) is France ("FR")
mutate(
flttrd = ifelse(flttrd %in% c(7, 8, 9), NA, flttrd), # If 'clsprty' is 2, set it to 0. If it's 7, 8, or 9, set it to NA.
imwbcnt = ifelse(imwbcnt %in% c(77, 88, 99), NA, imwbcnt), # For 'stfdem', set values 77, 88, and 99 to NA.
netusoft = ifelse(netusoft %in% c(7, 8, 9), NA, netusoft), # For 'trstplt', set values 77, 88, and 99 to NA.
)
table(france_data$flttrd)
##
## 1 2 3 4
## 375 1183 284 144
table(france_data$imwbcnt)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1135 631 1342 1685 1861 6781 1693 1629 1164 342 490
table(france_data$netusoft)
##
## 1 2 3 4 5
## 919 342 301 550 3944
#Do a data summary table of three variables of interest. Discuss briefly what you note (i.e., add text in your markdown after the Task 1 code).
datasummary_skim(france_data %>% select(flttrd,imwbcnt,netusoft))
| Unique (#) | Missing (%) | Mean | SD | Min | Median | Max | ||
|---|---|---|---|---|---|---|---|---|
| flttrd | 5 | 90 | 2.1 | 0.8 | 1.0 | 2.0 | 4.0 | |
| imwbcnt | 12 | 1 | 4.7 | 2.2 | 0.0 | 5.0 | 10.0 | |
| netusoft | 6 | 68 | 4.0 | 1.5 | 1.0 | 5.0 | 5.0 |
#Some information can be obtained from the charts of the above three variables. The value of unique in imwbcnt is the largest, its missing value is also the smallest, and its standard deviation is also the largest, so it can be inferred that its data is the most unstable and has the largest change, and the value of outlier is also more than other variables. The other two variables are not very noteworthy, but you can see that flttrd has the most missing values.
#Choose one of the three variables you just summarized in the table. This will be your current main outcome of interest.
#Produce a visual that showcases the mean (average) for your outcome of interest by survey year (can be, e.g., point + line plot or ridge plot, depending on your variable). Discuss briefly what you note (i.e., add text in your markdown after the Task 2 code).
trust_by_year <- france_data %>%
group_by(year) %>%
summarize(mean_trust = mean(imwbcnt, na.rm = TRUE))
trust_by_year
## # A tibble: 10 × 2
## year mean_trust
## <dbl> <dbl>
## 1 2002 4.52
## 2 2004 4.43
## 3 2006 4.46
## 4 2008 4.71
## 5 2010 4.59
## 6 2012 4.54
## 7 2014 4.85
## 8 2016 4.82
## 9 2018 5.03
## 10 2020 5.20
ggplot(trust_by_year, aes(x = year, y = mean_trust)) +
geom_line(color = "pink", size = 1) +
geom_point(color = "purple", size = 3) +
labs(title = "Immigrants make country worse or better place to live
(2000-2020)",
x = "Survey Year",
y = "worse place to live - best place to live (0-10)") +
ylim(0, 10) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#As can be seen from the chart above, from 2000 to 2020, French people
have maintained a neutral attitude toward whether immigrants make the
country better or worse, and there is no specific anti-immigration
situation, but there is also no specific pro-immigration data, and the
line on the data has maintained a very steady and slight upward trend.
However, it can also be seen that since 2000, French people’s influence
on the country by immigrants has gradually shown an upward trend, but
the curve has a very small change curve.
Provide a comparison visual of your outcome of interest with two other countries. You can choose the geom() you prefer. Discuss briefly what you note (i.e., add text in your markdown after the Task 3 code).
ess_selected <- ess %>%
filter(cntry %in% c("FR", "IS", "IE")) %>%
mutate(imwbcnt = ifelse(imwbcnt %in% c(77, 88, 99), NA, imwbcnt))
task3plot <- ggplot(ess_selected, aes(x = reorder(cntry, -imwbcnt, FUN=median), y = imwbcnt, fill = cntry)) +
geom_boxplot() +
theme_minimal() +
theme(legend.position = "none") +
labs(title = " Immigrants make country worse or better place to live (France, Iceland, Ireland)",
x = "Country",
y = "worse place to live - best place to live (0-10)")
task3plot
## Warning: Removed 890 rows containing non-finite values (`stat_boxplot()`).
#As can be seen from the figure above, France’s value is the lowest, its
Q1 and Q3 are the lowest values among the three countries, so it can be
concluded that compared with Ireland and iceland, France is the least
favorable to immigration. Among the three countries, Ireland has the
largest range of IQR, which indicates that the people who think that
immigrants are better or worse for the country to live in are both large
and dispersed in a large range, which may be the most unstable data in
the three countries. While iceland has the highest value, although its
highest value Q3 is the same as Ireland, its average IQR and Q1 are the
highest minimum values, and it is also the only country with a lower
outlier among the three countries.
Produce a cross-tab between your outcome of interest and a socio-demographic variable (use datasummary_crosstab). Then, calculate column percentages using cprop(), making sure to pick a second socio-demographic variable. Discuss briefly what you note (i.e., add text in your markdown after the Task 4 code).
france_data <- france_data %>%
mutate(
edulvla = case_when(
essround < 5 & edulvla == 55 ~ NA_real_,
TRUE ~ edulvla
),
edulvlb = case_when(
essround >= 5 & edulvlb == 5555 ~ NA_real_,
TRUE ~ edulvlb
),
educ_level = case_when(
essround < 5 & edulvla == 5 ~ "BA",
essround >= 5 & edulvlb > 600 ~ "BA",
TRUE ~ "No BA"
)
)
table(france_data$educ_level)
##
## BA No BA
## 4235 14803
imwbcntedu <- datasummary_crosstab(imwbcnt ~ educ_level, data = france_data)
imwbcntedu
| imwbcnt | BA | No BA | All | |
|---|---|---|---|---|
| 0 | N | 70 | 1065 | 1135 |
| % row | 6.2 | 93.8 | 100.0 | |
| 1 | N | 41 | 590 | 631 |
| % row | 6.5 | 93.5 | 100.0 | |
| 2 | N | 140 | 1202 | 1342 |
| % row | 10.4 | 89.6 | 100.0 | |
| 3 | N | 205 | 1480 | 1685 |
| % row | 12.2 | 87.8 | 100.0 | |
| 4 | N | 300 | 1561 | 1861 |
| % row | 16.1 | 83.9 | 100.0 | |
| 5 | N | 1680 | 5101 | 6781 |
| % row | 24.8 | 75.2 | 100.0 | |
| 6 | N | 429 | 1264 | 1693 |
| % row | 25.3 | 74.7 | 100.0 | |
| 7 | N | 563 | 1066 | 1629 |
| % row | 34.6 | 65.4 | 100.0 | |
| 8 | N | 441 | 723 | 1164 |
| % row | 37.9 | 62.1 | 100.0 | |
| 9 | N | 135 | 207 | 342 |
| % row | 39.5 | 60.5 | 100.0 | |
| 10 | N | 159 | 331 | 490 |
| % row | 32.4 | 67.6 | 100.0 | |
| All | N | 4235 | 14803 | 19038 |
| % row | 22.2 | 77.8 | 100.0 |
france_data <- france_data %>%
mutate(paidjob = case_when(
pdjobev == 2 ~ "No",
pdjobev == 1 ~ "Yes",
pdjobev %in% c(7, 8, 9) ~ NA_character_,
TRUE ~ as.character(pdjobev)
))
table(france_data$paidjob)
##
## 6 No Yes
## 9771 1440 7786
imwbcntpaidjob <- datasummary_crosstab(imwbcnt ~ paidjob, data = france_data)
imwbcntpaidjob
| imwbcnt | Â 6 | No | Yes | All | |
|---|---|---|---|---|---|
| 0 | N | 444 | 69 | 619 | 1135 |
| % row | 39.1 | 6.1 | 54.5 | 100.0 | |
| 1 | N | 266 | 36 | 328 | 631 |
| % row | 42.2 | 5.7 | 52.0 | 100.0 | |
| 2 | N | 543 | 89 | 708 | 1342 |
| % row | 40.5 | 6.6 | 52.8 | 100.0 | |
| 3 | N | 780 | 117 | 788 | 1685 |
| % row | 46.3 | 6.9 | 46.8 | 100.0 | |
| 4 | N | 858 | 151 | 847 | 1861 |
| % row | 46.1 | 8.1 | 45.5 | 100.0 | |
| 5 | N | 3752 | 502 | 2512 | 6781 |
| % row | 55.3 | 7.4 | 37.0 | 100.0 | |
| 6 | N | 913 | 144 | 635 | 1693 |
| % row | 53.9 | 8.5 | 37.5 | 100.0 | |
| 7 | N | 941 | 123 | 561 | 1629 |
| % row | 57.8 | 7.6 | 34.4 | 100.0 | |
| 8 | N | 671 | 92 | 400 | 1164 |
| % row | 57.6 | 7.9 | 34.4 | 100.0 | |
| 9 | N | 204 | 36 | 101 | 342 |
| % row | 59.6 | 10.5 | 29.5 | 100.0 | |
| 10 | N | 294 | 38 | 154 | 490 |
| % row | 60.0 | 7.8 | 31.4 | 100.0 | |
| All | N | 9771 | 1440 | 7786 | 19038 |
| % row | 51.3 | 7.6 | 40.9 | 100.0 |
table(france_data$imwbcnt, france_data$paidjob) %>% cprop()
##
## 6 No Yes All
## 0 4.6 4.9 8.1 6.0
## 1 2.8 2.6 4.3 3.4
## 2 5.6 6.4 9.3 7.2
## 3 8.1 8.4 10.3 9.0
## 4 8.9 10.8 11.1 9.9
## 5 38.8 35.9 32.8 36.2
## 6 9.4 10.3 8.3 9.0
## 7 9.7 8.8 7.3 8.7
## 8 6.9 6.6 5.2 6.2
## 9 2.1 2.6 1.3 1.8
## 10 3.0 2.7 2.0 2.6
## Total 100.0 100.0 100.0 100.0
#From the cross chart of imbcnt and paidjob, it can be seen that the more people think that immigration can have a positive impact on the country, almost all of them have jobs, while the majority of people who do not have jobs or have jobs have a moderate attitude towards this phenomenon. As can be seen from the cross plot of imbcnt and education, the closer the index is, the higher the no ba value is that immigrants make a country better to live in.
Choose one of the two socio-demographic variables you just worked with. Visualize the conditional probability (or column percentages) of your outcome given your selected socio-dem variable. Discuss briefly what you note (i.e., add text in your markdown after the Task 5 code).
df <- france_data %>%
filter(!is.na(educ_level) & !is.na(imwbcnt))
df <- df %>%
mutate(imwbcnt = case_when(
imwbcnt == 0 ~ "worse place to live",
imwbcnt == 10 ~ "better place to live",
TRUE ~ as.character(imwbcnt)
))
table(df$imwbcnt)
##
## 1 2 3
## 631 1342 1685
## 4 5 6
## 1861 6781 1693
## 7 8 9
## 1629 1164 342
## better place to live worse place to live
## 490 1135
# visualize
table(df$imwbcnt, df$educ_level) %>%
cprop() %>%
as.data.frame() %>%
filter(Var1 != "Total",
Var2 != "All") %>%
ggplot(aes(x=Var1, y=Freq, fill=Var2)) +
geom_col(position = "dodge") +
labs(title="Immigrants make country worse or better place to live in france",
y = "Conditional Percentage",
x = "worse place to live - best place to live (0-10)",
fill = "At least BA vs. Not")
#As can be seen from the chart, people without a degree are more likely
to choose a lower value, and they are more inclined to believe that
immigration will make the country worse to live in. However, starting
from 5 and above, more people with a degree believe that immigration
will bring better impact on the living environment of the country. But
surprisingly, the highest percentage of people who think immigration
makes the country a better place to live is those without degrees, by a
wide margin.