R Markdown

# List of packages
packages <- c("tidyverse", "modelsummary", "forcats", "RColorBrewer", 
              "fst", "viridis", "knitr", "kableExtra", "rmarkdown", "ggridges", "viridis", "questionr")

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[3]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[7]]
##  [1] "knitr"        "viridis"      "viridisLite"  "fst"          "RColorBrewer"
##  [6] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[8]]
##  [1] "kableExtra"   "knitr"        "viridis"      "viridisLite"  "fst"         
##  [6] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "rmarkdown"    "kableExtra"   "knitr"        "viridis"      "viridisLite" 
##  [6] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[11]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[12]]
##  [1] "questionr"    "ggridges"     "rmarkdown"    "kableExtra"   "knitr"       
##  [6] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"
#install.packages("fst")
library(fst)
setwd("~/Desktop/Homework_5_project")
ess <- read_fst("All-ESS-Data.fst")
ess$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
  ess$year[ess$essround == i] <- replacements[i]
}
table(ess$flttrd)
## 
##     1     2     3     4     7     8     9 
##  8989 24458  7219  2034    24   211    65
table(ess$imwbcnt)
## 
##      0      1      2      3      4      5      6      7      8      9     10 
##  26607  18119  31951  44795  46898 136926  46688  48299  37746  12610  15653 
##     77     88     99 
##    748  22278   1237
table(ess$netusoft)
## 
##     1     2     3     4     5     7     8     9 
## 23872  9013  8739 14124 97494    49   101   199
france_data <- ess %>% # Start with the original 'ess' dataset
  filter(cntry == "FR") %>% # Filter the data to only include rows where country (cntry) is France ("FR")
  mutate(
    flttrd = ifelse(flttrd %in% c(7, 8, 9), NA, flttrd), # If 'clsprty' is 2, set it to 0. If it's 7, 8, or 9, set it to NA.
    imwbcnt = ifelse(imwbcnt %in% c(77, 88, 99), NA, imwbcnt), # For 'stfdem', set values 77, 88, and 99 to NA.
    netusoft = ifelse(netusoft %in% c(7, 8, 9), NA, netusoft), # For 'trstplt', set values 77, 88, and 99 to NA.
  )
table(france_data$flttrd)
## 
##    1    2    3    4 
##  375 1183  284  144
table(france_data$imwbcnt)
## 
##    0    1    2    3    4    5    6    7    8    9   10 
## 1135  631 1342 1685 1861 6781 1693 1629 1164  342  490
table(france_data$netusoft)
## 
##    1    2    3    4    5 
##  919  342  301  550 3944

Task 1

#Do a data summary table of three variables of interest. Discuss briefly what you note (i.e., add text in your markdown after the Task 1 code).

datasummary_skim(france_data %>% select(flttrd,imwbcnt,netusoft))
Unique (#) Missing (%) Mean SD Min Median Max
flttrd 5 90 2.1 0.8 1.0 2.0 4.0
imwbcnt 12 1 4.7 2.2 0.0 5.0 10.0
netusoft 6 68 4.0 1.5 1.0 5.0 5.0

#Some information can be obtained from the charts of the above three variables. The value of unique in imwbcnt is the largest, its missing value is also the smallest, and its standard deviation is also the largest, so it can be inferred that its data is the most unstable and has the largest change, and the value of outlier is also more than other variables. The other two variables are not very noteworthy, but you can see that flttrd has the most missing values.

Task 2

#Choose one of the three variables you just summarized in the table. This will be your current main outcome of interest.

#Produce a visual that showcases the mean (average) for your outcome of interest by survey year (can be, e.g., point + line plot or ridge plot, depending on your variable). Discuss briefly what you note (i.e., add text in your markdown after the Task 2 code).

trust_by_year <- france_data %>%
  group_by(year) %>%
  summarize(mean_trust = mean(imwbcnt, na.rm = TRUE))
trust_by_year
## # A tibble: 10 × 2
##     year mean_trust
##    <dbl>      <dbl>
##  1  2002       4.52
##  2  2004       4.43
##  3  2006       4.46
##  4  2008       4.71
##  5  2010       4.59
##  6  2012       4.54
##  7  2014       4.85
##  8  2016       4.82
##  9  2018       5.03
## 10  2020       5.20
ggplot(trust_by_year, aes(x = year, y = mean_trust)) +
  geom_line(color = "pink", size = 1) +  
  geom_point(color = "purple", size = 3) +  
  labs(title = "Immigrants make country worse or better place to live
 (2000-2020)", 
       x = "Survey Year", 
       y = "worse place to live - best place to live (0-10)") +
  ylim(0, 10) +  
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#As can be seen from the chart above, from 2000 to 2020, French people have maintained a neutral attitude toward whether immigrants make the country better or worse, and there is no specific anti-immigration situation, but there is also no specific pro-immigration data, and the line on the data has maintained a very steady and slight upward trend. However, it can also be seen that since 2000, French people’s influence on the country by immigrants has gradually shown an upward trend, but the curve has a very small change curve.

Task 3

Provide a comparison visual of your outcome of interest with two other countries. You can choose the geom() you prefer. Discuss briefly what you note (i.e., add text in your markdown after the Task 3 code).

ess_selected <- ess %>%
  filter(cntry %in% c("FR", "IS", "IE")) %>%
  mutate(imwbcnt = ifelse(imwbcnt %in% c(77, 88, 99), NA, imwbcnt))


task3plot <- ggplot(ess_selected, aes(x = reorder(cntry, -imwbcnt, FUN=median), y = imwbcnt, fill = cntry)) +
  geom_boxplot() +
  theme_minimal() + 
  theme(legend.position = "none") + 
  labs(title = " Immigrants make country worse or better place to live (France, Iceland, Ireland)", 
       x = "Country", 
       y = "worse place to live - best place to live (0-10)")

task3plot
## Warning: Removed 890 rows containing non-finite values (`stat_boxplot()`).

#As can be seen from the figure above, France’s value is the lowest, its Q1 and Q3 are the lowest values among the three countries, so it can be concluded that compared with Ireland and iceland, France is the least favorable to immigration. Among the three countries, Ireland has the largest range of IQR, which indicates that the people who think that immigrants are better or worse for the country to live in are both large and dispersed in a large range, which may be the most unstable data in the three countries. While iceland has the highest value, although its highest value Q3 is the same as Ireland, its average IQR and Q1 are the highest minimum values, and it is also the only country with a lower outlier among the three countries.

Task 4

Produce a cross-tab between your outcome of interest and a socio-demographic variable (use datasummary_crosstab). Then, calculate column percentages using cprop(), making sure to pick a second socio-demographic variable. Discuss briefly what you note (i.e., add text in your markdown after the Task 4 code).

france_data <- france_data %>%
  mutate(
    edulvla = case_when(
      essround < 5 & edulvla == 55 ~ NA_real_,
      TRUE ~ edulvla
    ),
    edulvlb = case_when(
      essround >= 5 & edulvlb == 5555 ~ NA_real_,
      TRUE ~ edulvlb
    ),

    educ_level = case_when(
      essround < 5 & edulvla == 5 ~ "BA",
      essround >= 5 & edulvlb > 600 ~ "BA",
      TRUE ~ "No BA"
    )
  )

table(france_data$educ_level)
## 
##    BA No BA 
##  4235 14803
imwbcntedu <- datasummary_crosstab(imwbcnt ~ educ_level, data = france_data)
imwbcntedu
imwbcnt BA No BA All
0 N 70 1065 1135
% row 6.2 93.8 100.0
1 N 41 590 631
% row 6.5 93.5 100.0
2 N 140 1202 1342
% row 10.4 89.6 100.0
3 N 205 1480 1685
% row 12.2 87.8 100.0
4 N 300 1561 1861
% row 16.1 83.9 100.0
5 N 1680 5101 6781
% row 24.8 75.2 100.0
6 N 429 1264 1693
% row 25.3 74.7 100.0
7 N 563 1066 1629
% row 34.6 65.4 100.0
8 N 441 723 1164
% row 37.9 62.1 100.0
9 N 135 207 342
% row 39.5 60.5 100.0
10 N 159 331 490
% row 32.4 67.6 100.0
All N 4235 14803 19038
% row 22.2 77.8 100.0
france_data <- france_data %>%
  mutate(paidjob = case_when(
    pdjobev == 2 ~ "No",
    pdjobev == 1 ~ "Yes",
    pdjobev %in% c(7, 8, 9) ~ NA_character_,
    TRUE ~ as.character(pdjobev)
  ))

table(france_data$paidjob)
## 
##    6   No  Yes 
## 9771 1440 7786
imwbcntpaidjob <- datasummary_crosstab(imwbcnt ~ paidjob, data = france_data)
imwbcntpaidjob
imwbcnt  6 No Yes All
0 N 444 69 619 1135
% row 39.1 6.1 54.5 100.0
1 N 266 36 328 631
% row 42.2 5.7 52.0 100.0
2 N 543 89 708 1342
% row 40.5 6.6 52.8 100.0
3 N 780 117 788 1685
% row 46.3 6.9 46.8 100.0
4 N 858 151 847 1861
% row 46.1 8.1 45.5 100.0
5 N 3752 502 2512 6781
% row 55.3 7.4 37.0 100.0
6 N 913 144 635 1693
% row 53.9 8.5 37.5 100.0
7 N 941 123 561 1629
% row 57.8 7.6 34.4 100.0
8 N 671 92 400 1164
% row 57.6 7.9 34.4 100.0
9 N 204 36 101 342
% row 59.6 10.5 29.5 100.0
10 N 294 38 154 490
% row 60.0 7.8 31.4 100.0
All N 9771 1440 7786 19038
% row 51.3 7.6 40.9 100.0
table(france_data$imwbcnt, france_data$paidjob) %>% cprop()
##        
##         6     No    Yes   All  
##   0       4.6   4.9   8.1   6.0
##   1       2.8   2.6   4.3   3.4
##   2       5.6   6.4   9.3   7.2
##   3       8.1   8.4  10.3   9.0
##   4       8.9  10.8  11.1   9.9
##   5      38.8  35.9  32.8  36.2
##   6       9.4  10.3   8.3   9.0
##   7       9.7   8.8   7.3   8.7
##   8       6.9   6.6   5.2   6.2
##   9       2.1   2.6   1.3   1.8
##   10      3.0   2.7   2.0   2.6
##   Total 100.0 100.0 100.0 100.0

#From the cross chart of imbcnt and paidjob, it can be seen that the more people think that immigration can have a positive impact on the country, almost all of them have jobs, while the majority of people who do not have jobs or have jobs have a moderate attitude towards this phenomenon. As can be seen from the cross plot of imbcnt and education, the closer the index is, the higher the no ba value is that immigrants make a country better to live in.

Task 5

Choose one of the two socio-demographic variables you just worked with. Visualize the conditional probability (or column percentages) of your outcome given your selected socio-dem variable. Discuss briefly what you note (i.e., add text in your markdown after the Task 5 code).

df <- france_data %>%
  filter(!is.na(educ_level) & !is.na(imwbcnt))

df <- df %>%
  mutate(imwbcnt = case_when(
    imwbcnt == 0 ~ "worse place to live",
    imwbcnt == 10 ~ "better place to live",
    TRUE ~ as.character(imwbcnt)  
  ))

table(df$imwbcnt)
## 
##                    1                    2                    3 
##                  631                 1342                 1685 
##                    4                    5                    6 
##                 1861                 6781                 1693 
##                    7                    8                    9 
##                 1629                 1164                  342 
## better place to live  worse place to live 
##                  490                 1135
# visualize
table(df$imwbcnt, df$educ_level) %>%
  cprop() %>%
  as.data.frame() %>%
  filter(Var1 != "Total",
         Var2 != "All") %>%
  ggplot(aes(x=Var1, y=Freq, fill=Var2)) +
  geom_col(position = "dodge") +
  labs(title="Immigrants make country worse or better place to live in france",
       y = "Conditional Percentage",
       x = "worse place to live - best place to live (0-10)",
       fill = "At least BA vs. Not")

#As can be seen from the chart, people without a degree are more likely to choose a lower value, and they are more inclined to believe that immigration will make the country worse to live in. However, starting from 5 and above, more people with a degree believe that immigration will bring better impact on the living environment of the country. But surprisingly, the highest percentage of people who think immigration makes the country a better place to live is those without degrees, by a wide margin.