Homework_5_Yilin

R Markdown

# List of packages
packages <- c("tidyverse", "modelsummary", "forcats", "RColorBrewer", 
              "fst", "viridis", "knitr", "kableExtra", "rmarkdown", "ggridges", "viridis", "questionr")

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[3]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[7]]
##  [1] "knitr"        "viridis"      "viridisLite"  "fst"          "RColorBrewer"
##  [6] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[8]]
##  [1] "kableExtra"   "knitr"        "viridis"      "viridisLite"  "fst"         
##  [6] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "rmarkdown"    "kableExtra"   "knitr"        "viridis"      "viridisLite" 
##  [6] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[11]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[12]]
##  [1] "questionr"    "ggridges"     "rmarkdown"    "kableExtra"   "knitr"       
##  [6] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"

#install.packages("fst")
library(fst)
setwd("~/Desktop/Homework_5_project")
ess <- read_fst("All-ESS-Data.fst")

ess$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
  ess$year[ess$essround == i] <- replacements[i]
}

table(ess$flttrd)

## 
##     1     2     3     4     7     8     9 
##  8989 24458  7219  2034    24   211    65

table(ess$imwbcnt)

## 
##      0      1      2      3      4      5      6      7      8      9     10 
##  26607  18119  31951  44795  46898 136926  46688  48299  37746  12610  15653 
##     77     88     99 
##    748  22278   1237

table(ess$netusoft)

## 
##     1     2     3     4     5     7     8     9 
## 23872  9013  8739 14124 97494    49   101   199

france_data <- ess %>% # Start with the original 'ess' dataset
  filter(cntry == "FR") %>% # Filter the data to only include rows where country (cntry) is France ("FR")
  mutate(
    flttrd = ifelse(flttrd %in% c(7, 8, 9), NA, flttrd), # If 'clsprty' is 2, set it to 0. If it's 7, 8, or 9, set it to NA.
    imwbcnt = ifelse(imwbcnt %in% c(77, 88, 99), NA, imwbcnt), # For 'stfdem', set values 77, 88, and 99 to NA.
    netusoft = ifelse(netusoft %in% c(7, 8, 9), NA, netusoft), # For 'trstplt', set values 77, 88, and 99 to NA.
  )

table(france_data$flttrd)

## 
##    1    2    3    4 
##  375 1183  284  144

table(france_data$imwbcnt)

## 
##    0    1    2    3    4    5    6    7    8    9   10 
## 1135  631 1342 1685 1861 6781 1693 1629 1164  342  490

table(france_data$netusoft)

## 
##    1    2    3    4    5 
##  919  342  301  550 3944

Task 1

#Do a data summary table of three variables of interest. Discuss briefly what you note (i.e., add text in your markdown after the Task 1 code).

datasummary_skim(france_data %>% select(flttrd,imwbcnt,netusoft))

	Unique (#)	Missing (%)	Mean	SD	Min	Median	Max
flttrd	5	90	2.1	0.8	1.0	2.0	4.0
imwbcnt	12	1	4.7	2.2	0.0	5.0	10.0
netusoft	6	68	4.0	1.5	1.0	5.0	5.0

#Some information can be obtained from the charts of the above three variables. The value of unique in imwbcnt is the largest, its missing value is also the smallest, and its standard deviation is also the largest, so it can be inferred that its data is the most unstable and has the largest change, and the value of outlier is also more than other variables. The other two variables are not very noteworthy, but you can see that flttrd has the most missing values.

Task 2

#Choose one of the three variables you just summarized in the table. This will be your current main outcome of interest.

#Produce a visual that showcases the mean (average) for your outcome of interest by survey year (can be, e.g., point + line plot or ridge plot, depending on your variable). Discuss briefly what you note (i.e., add text in your markdown after the Task 2 code).

trust_by_year <- france_data %>%
  group_by(year) %>%
  summarize(mean_trust = mean(imwbcnt, na.rm = TRUE))
trust_by_year

## # A tibble: 10 × 2
##     year mean_trust
##    <dbl>      <dbl>
##  1  2002       4.52
##  2  2004       4.43
##  3  2006       4.46
##  4  2008       4.71
##  5  2010       4.59
##  6  2012       4.54
##  7  2014       4.85
##  8  2016       4.82
##  9  2018       5.03
## 10  2020       5.20

ggplot(trust_by_year, aes(x = year, y = mean_trust)) +
  geom_line(color = "pink", size = 1) +  
  geom_point(color = "purple", size = 3) +  
  labs(title = "Immigrants make country worse or better place to live
 (2000-2020)", 
       x = "Survey Year", 
       y = "worse place to live - best place to live (0-10)") +
  ylim(0, 10) +  
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#As can be seen from the chart above, from 2000 to 2020, French people have maintained a neutral attitude toward whether immigrants make the country better or worse, and there is no specific anti-immigration situation, but there is also no specific pro-immigration data, and the line on the data has maintained a very steady and slight upward trend. However, it can also be seen that since 2000, French people’s influence on the country by immigrants has gradually shown an upward trend, but the curve has a very small change curve.

Task 3

Provide a comparison visual of your outcome of interest with two other countries. You can choose the geom() you prefer. Discuss briefly what you note (i.e., add text in your markdown after the Task 3 code).

ess_selected <- ess %>%
  filter(cntry %in% c("FR", "IS", "IE")) %>%
  mutate(imwbcnt = ifelse(imwbcnt %in% c(77, 88, 99), NA, imwbcnt))


task3plot <- ggplot(ess_selected, aes(x = reorder(cntry, -imwbcnt, FUN=median), y = imwbcnt, fill = cntry)) +
  geom_boxplot() +
  theme_minimal() + 
  theme(legend.position = "none") + 
  labs(title = " Immigrants make country worse or better place to live (France, Iceland, Ireland)", 
       x = "Country", 
       y = "worse place to live - best place to live (0-10)")

task3plot

## Warning: Removed 890 rows containing non-finite values (`stat_boxplot()`).

#As can be seen from the figure above, France’s value is the lowest, its Q1 and Q3 are the lowest values among the three countries, so it can be concluded that compared with Ireland and iceland, France is the least favorable to immigration. Among the three countries, Ireland has the largest range of IQR, which indicates that the people who think that immigrants are better or worse for the country to live in are both large and dispersed in a large range, which may be the most unstable data in the three countries. While iceland has the highest value, although its highest value Q3 is the same as Ireland, its average IQR and Q1 are the highest minimum values, and it is also the only country with a lower outlier among the three countries.

Task 4

Produce a cross-tab between your outcome of interest and a socio-demographic variable (use datasummary_crosstab). Then, calculate column percentages using cprop(), making sure to pick a second socio-demographic variable. Discuss briefly what you note (i.e., add text in your markdown after the Task 4 code).

france_data <- france_data %>%
  mutate(
    edulvla = case_when(
      essround < 5 & edulvla == 55 ~ NA_real_,
      TRUE ~ edulvla
    ),
    edulvlb = case_when(
      essround >= 5 & edulvlb == 5555 ~ NA_real_,
      TRUE ~ edulvlb
    ),

    educ_level = case_when(
      essround < 5 & edulvla == 5 ~ "BA",
      essround >= 5 & edulvlb > 600 ~ "BA",
      TRUE ~ "No BA"
    )
  )

table(france_data$educ_level)

## 
##    BA No BA 
##  4235 14803

imwbcntedu <- datasummary_crosstab(imwbcnt ~ educ_level, data = france_data)
imwbcntedu

imwbcnt		BA	No BA	All
0	N	70	1065	1135
	% row	6.2	93.8	100.0
1	N	41	590	631
	% row	6.5	93.5	100.0
2	N	140	1202	1342
	% row	10.4	89.6	100.0
3	N	205	1480	1685
	% row	12.2	87.8	100.0
4	N	300	1561	1861
	% row	16.1	83.9	100.0
5	N	1680	5101	6781
	% row	24.8	75.2	100.0
6	N	429	1264	1693
	% row	25.3	74.7	100.0
7	N	563	1066	1629
	% row	34.6	65.4	100.0
8	N	441	723	1164
	% row	37.9	62.1	100.0
9	N	135	207	342
	% row	39.5	60.5	100.0
10	N	159	331	490
	% row	32.4	67.6	100.0
All	N	4235	14803	19038
	% row	22.2	77.8	100.0

france_data <- france_data %>%
  mutate(paidjob = case_when(
    pdjobev == 2 ~ "No",
    pdjobev == 1 ~ "Yes",
    pdjobev %in% c(7, 8, 9) ~ NA_character_,
    TRUE ~ as.character(pdjobev)
  ))

table(france_data$paidjob)

## 
##    6   No  Yes 
## 9771 1440 7786

imwbcntpaidjob <- datasummary_crosstab(imwbcnt ~ paidjob, data = france_data)
imwbcntpaidjob

imwbcnt		6	No	Yes	All
0	N	444	69	619	1135
	% row	39.1	6.1	54.5	100.0
1	N	266	36	328	631
	% row	42.2	5.7	52.0	100.0
2	N	543	89	708	1342
	% row	40.5	6.6	52.8	100.0
3	N	780	117	788	1685
	% row	46.3	6.9	46.8	100.0
4	N	858	151	847	1861
	% row	46.1	8.1	45.5	100.0
5	N	3752	502	2512	6781
	% row	55.3	7.4	37.0	100.0
6	N	913	144	635	1693
	% row	53.9	8.5	37.5	100.0
7	N	941	123	561	1629
	% row	57.8	7.6	34.4	100.0
8	N	671	92	400	1164
	% row	57.6	7.9	34.4	100.0
9	N	204	36	101	342
	% row	59.6	10.5	29.5	100.0
10	N	294	38	154	490
	% row	60.0	7.8	31.4	100.0
All	N	9771	1440	7786	19038
	% row	51.3	7.6	40.9	100.0

table(france_data$imwbcnt, france_data$paidjob) %>% cprop()

##        
##         6     No    Yes   All  
##   0       4.6   4.9   8.1   6.0
##   1       2.8   2.6   4.3   3.4
##   2       5.6   6.4   9.3   7.2
##   3       8.1   8.4  10.3   9.0
##   4       8.9  10.8  11.1   9.9
##   5      38.8  35.9  32.8  36.2
##   6       9.4  10.3   8.3   9.0
##   7       9.7   8.8   7.3   8.7
##   8       6.9   6.6   5.2   6.2
##   9       2.1   2.6   1.3   1.8
##   10      3.0   2.7   2.0   2.6
##   Total 100.0 100.0 100.0 100.0

#From the cross chart of imbcnt and paidjob, it can be seen that the more people think that immigration can have a positive impact on the country, almost all of them have jobs, while the majority of people who do not have jobs or have jobs have a moderate attitude towards this phenomenon. As can be seen from the cross plot of imbcnt and education, the closer the index is, the higher the no ba value is that immigrants make a country better to live in.

Task 5

Choose one of the two socio-demographic variables you just worked with. Visualize the conditional probability (or column percentages) of your outcome given your selected socio-dem variable. Discuss briefly what you note (i.e., add text in your markdown after the Task 5 code).

df <- france_data %>%
  filter(!is.na(educ_level) & !is.na(imwbcnt))

df <- df %>%
  mutate(imwbcnt = case_when(
    imwbcnt == 0 ~ "worse place to live",
    imwbcnt == 10 ~ "better place to live",
    TRUE ~ as.character(imwbcnt)  
  ))

table(df$imwbcnt)

## 
##                    1                    2                    3 
##                  631                 1342                 1685 
##                    4                    5                    6 
##                 1861                 6781                 1693 
##                    7                    8                    9 
##                 1629                 1164                  342 
## better place to live  worse place to live 
##                  490                 1135

# visualize
table(df$imwbcnt, df$educ_level) %>%
  cprop() %>%
  as.data.frame() %>%
  filter(Var1 != "Total",
         Var2 != "All") %>%
  ggplot(aes(x=Var1, y=Freq, fill=Var2)) +
  geom_col(position = "dodge") +
  labs(title="Immigrants make country worse or better place to live in france",
       y = "Conditional Percentage",
       x = "worse place to live - best place to live (0-10)",
       fill = "At least BA vs. Not")

#As can be seen from the chart, people without a degree are more likely to choose a lower value, and they are more inclined to believe that immigration will make the country worse to live in. However, starting from 5 and above, more people with a degree believe that immigration will bring better impact on the living environment of the country. But surprisingly, the highest percentage of people who think immigration makes the country a better place to live is those without degrees, by a wide margin.

Homework_5_Yilin_Zhou

2023-10-16