Task 1

# Packages required by this analysis; each name is listed exactly once.
packages <- c(
  "tidyverse", "modelsummary", "forcats", "RColorBrewer",
  "fst", "viridis", "knitr", "kableExtra", "rmarkdown",
  "ggridges", "questionr"
)

# Install only the packages that are not already in the local library.
new_packages <- packages[!(packages %in% installed.packages()[, "Package"])]
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Attach every package. library() returns the list of attached packages,
# which lapply() would collect and print once per package; invisible()
# keeps that dump out of the rendered document.
invisible(lapply(packages, library, character.only = TRUE))

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[3]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[7]]
##  [1] "knitr"        "viridis"      "viridisLite"  "fst"          "RColorBrewer"
##  [6] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[8]]
##  [1] "kableExtra"   "knitr"        "viridis"      "viridisLite"  "fst"         
##  [6] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "rmarkdown"    "kableExtra"   "knitr"        "viridis"      "viridisLite" 
##  [6] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[11]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[12]]
##  [1] "questionr"    "ggridges"     "rmarkdown"    "kableExtra"   "knitr"       
##  [6] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"

# Point the session at the course folder so that the relative data path
# passed to read_fst() resolves.
# NOTE(review): this directory exists only on the author's machine, and
# setwd() in a script makes the analysis non-portable -- consider a
# project-relative path instead.
setwd("~/Desktop/SOC_202_YAY/")

# Print the working directory to confirm the change took effect.
getwd()

## [1] "/Users/apple/Desktop/SOC_202_YAY"

# Load the ESS data file from the working directory into `ess`.
ess <- read_fst("All-ESS-Data.fst")

# Number of rows per ESS round.
table(ess$essround)

## 
##     1     2     3     4     5     6     7     8     9    10 
## 42359 47537 43000 56752 52458 54673 40185 44387 49519 59685

# Survey year of each ESS round: position k holds the year of round k.
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)

# Vectorised lookup instead of a loop over a hard-coded 1:10. match() gives
# NA for a missing round or one outside 1..length(replacements), so such rows
# get an NA year, exactly as they did with the loop.
ess$year <- replacements[match(ess$essround, seq_along(replacements))]

Next, create tables for the three variables.

# Raw frequencies of eisced across the whole ESS file. The codes 55, 77, 88
# and 99 are still present here; they are set to NA when uk_data is built.
table(ess$eisced)

## 
##     0     1     2     3     4     5     6     7    55    77    88    99 
## 73306 38823 71917 74258 87348 49268 42651 49558  1052   483   497  1394

# Raw frequencies of stfeco (0-10 scale) across the whole ESS file. The codes
# 77, 88 and 99 are still present here; they are set to NA when uk_data is
# built.
table(ess$stfeco)

## 
##     0     1     2     3     4     5     6     7     8     9    10    77    88 
## 39574 25671 45330 59142 55182 75639 57084 58459 41634 13226  8012   424 10562 
##    99 
##   616

# Raw frequencies of happy (0-10 scale) across the whole ESS file. The codes
# 77, 88 and 99 are still present here; they are set to NA when uk_data is
# built.
table(ess$happy)

## 
##      0      1      2      3      4      5      6      7      8      9     10 
##   3933   3540   7343  13759  17016  52117  43707  87869 130802  76229  51006 
##     77     88     99 
##    330   2163    741

# British respondents only, with the special codes of the three Task 1
# variables replaced by NA.
uk_data <- ess %>%
  filter(cntry == "GB") %>%
  mutate(
    # stfeco and happy use the same special codes.
    across(c(stfeco, happy), ~ ifelse(.x %in% c(77, 88, 99), NA, .x)),
    # eisced additionally uses 55.
    eisced = ifelse(eisced %in% c(55, 77, 88, 99), NA, eisced)
  )

# Descriptive statistics for the three cleaned variables.
uk_data %>%
  select(stfeco, eisced, happy) %>%
  datasummary_skim()

	Unique (#)	Missing (%)	Mean	SD	Median	Max
stfeco	12	2	4.4	2.3	5.0	10.0
eisced	9	2	2.2	2.4	1.0	7.0
happy	12	0	7.5	1.9	8.0	10.0

As can be seen from this table, the average happiness of British respondents is high (mean 7.5 on a 0–10 scale). Satisfaction with the current state of the British economy sits near the middle of the scale (mean 4.4): overall, the British public is neither very satisfied nor very dissatisfied. The table also shows that the average level of education in Britain is 2.2, which corresponds to the secondary level; this is somewhat unexpected.

Task 2

# Mean happiness of UK respondents per survey year. The summary column is
# named after the variable it averages (`happy`).
happy_by_year <- uk_data %>%
  group_by(year) %>%
  summarize(mean_happy = mean(happy, na.rm = TRUE))
happy_by_year

## # A tibble: 10 × 2
##     year mean_happy
##    <dbl>      <dbl>
##  1  2002       7.54
##  2  2004       7.37
##  3  2006       7.43
##  4  2008       7.44
##  5  2010       7.41
##  6  2012       7.50
##  7  2014       7.47
##  8  2016       7.64
##  9  2018       7.57
## 10  2020       7.29

# Line chart of the yearly means on the full 0-10 scale. Line thickness is
# set with `linewidth`; `size` for lines is deprecated since ggplot2 3.4.0
# and is kept only for the points.
ggplot(happy_by_year, aes(x = year, y = mean_happy)) +
  geom_line(color = "blue", linewidth = 1) +
  geom_point(color = "red", size = 3) +
  labs(title = "Happy Level in UK (2002-2020)",
       x = "Survey Year",
       y = "Happy (0-10)") +
  ylim(0, 10) +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

It can be seen that the average happiness of the British people is high and has remained stable at around 7.5 from 2002 to 2020. Over the whole period the series has been stable, without major fluctuations or changes. However, there is a slight decline in average happiness from 2016 through 2020.

Task 3

# Respondents from the UK, Germany and France, with the special codes
# 77/88/99 of `happy` replaced by NA.
ess_selected <- ess %>%
  filter(cntry %in% c("GB", "DE", "FR")) %>%
  mutate(happy = ifelse(happy %in% c(77, 88, 99), NA, happy))

# Boxplot of happiness by country, ordered by descending median happiness.
# na.rm = TRUE is forwarded by reorder() to median(): `happy` contains NA
# after the recode, and without it the median of any country with a missing
# value is NA, so the ordering would not reflect the data.
task3plot <- ggplot(
  ess_selected,
  aes(
    x = reorder(cntry, -happy, FUN = median, na.rm = TRUE),
    y = happy,
    fill = cntry
  )
) +
  geom_boxplot() +
  theme_minimal() +
  theme(legend.position = "none") +
  # The plotted variable is self-reported happiness, not an attitude
  # towards politicians.
  labs(title = "Boxplot comparison of happiness (UK, Germany, France)",
       x = "Country",
       y = "Scale (0-10)")

task3plot

## Warning: Removed 253 rows containing non-finite values (`stat_boxplot()`).

The boxplot shows how the distribution of happiness differs between German, French and British respondents. The median happiness score is similar in the three countries, at around 7.7. Germany has the lowest minimum happiness value of the three countries. It is worth noting that the French data are negatively skewed.

Task 4

# Build a two-level education indicator ("BA" / "No BA") for UK respondents.
# Rounds 1-4 are classified from edulvla, rounds 5+ from edulvlb.
uk_data <- uk_data %>%
  mutate(
    # Blank the special codes of edulvla for the rounds that use it.
    # NOTE(review): 77/88/99 are assumed to be special codes here, by analogy
    # with the codes this script already sets to NA for eisced -- confirm
    # against the ESS codebook.
    edulvla = case_when(
      essround < 5 & edulvla %in% c(55, 77, 88, 99) ~ NA_real_,
      TRUE ~ edulvla
    ),

    # Same for edulvlb. Any special code left in place would satisfy
    # `edulvlb > 600` below and be counted as a degree.
    # NOTE(review): 7777/8888/9999 assumed to mirror 5555 -- confirm against
    # the ESS codebook.
    edulvlb = case_when(
      essround >= 5 & edulvlb %in% c(5555, 7777, 8888, 9999) ~ NA_real_,
      TRUE ~ edulvlb
    ),

    # Respondents whose education is unknown stay NA instead of falling
    # through to "No BA".
    educ_level = case_when(
      essround < 5 & is.na(edulvla) ~ NA_character_,
      essround >= 5 & is.na(edulvlb) ~ NA_character_,
      essround < 5 & edulvla == 5 ~ "BA",
      essround >= 5 & edulvlb > 600 ~ "BA",
      TRUE ~ "No BA"
    )
  )

# Group sizes (table() omits NA by default).
table(uk_data$educ_level)

## 
##    BA No BA 
##  6116 14863

# Cross-tabulate happiness (rows) against degree status (columns); the
# rendered table shows counts and row percentages.
happyedu <- datasummary_crosstab(happy ~ educ_level, data = uk_data)
happyedu

happy		BA	No BA	All
0	N	14	99	113
	% row	12.4	87.6	100.0
1	N	23	89	112
	% row	20.5	79.5	100.0
2	N	33	207	240
	% row	13.8	86.2	100.0
3	N	113	389	502
	% row	22.5	77.5	100.0
4	N	153	517	670
	% row	22.8	77.2	100.0
5	N	322	1246	1568
	% row	20.5	79.5	100.0
6	N	448	1093	1541
	% row	29.1	70.9	100.0
7	N	1252	2552	3804
	% row	32.9	67.1	100.0
8	N	1913	4186	6099
	% row	31.4	68.6	100.0
9	N	1282	2458	3740
	% row	34.3	65.7	100.0
10	N	552	2005	2557
	% row	21.6	78.4	100.0
All	N	6116	14863	20979
	% row	29.2	70.8	100.0

# Column percentages: the distribution of happiness scores within each
# education group, so groups of different size can be compared.
cprop(table(uk_data$happy, uk_data$educ_level))

##        
##         BA    No BA All  
##   0       0.2   0.7   0.5
##   1       0.4   0.6   0.5
##   2       0.5   1.4   1.1
##   3       1.9   2.6   2.4
##   4       2.5   3.5   3.2
##   5       5.3   8.4   7.5
##   6       7.3   7.4   7.4
##   7      20.5  17.2  18.2
##   8      31.3  28.2  29.1
##   9      21.0  16.6  17.9
##   10      9.0  13.5  12.2
##   Total 100.0 100.0 100.0

What you can see here is that the majority of people in the UK are happy with their lives, regardless of whether they have a degree or not. Since there are more than twice as many non-BA holders as there are BA holders, it’s hard to see from the raw counts alone whether education has any effect on a person’s happiness; the column percentages give the fairer comparison.

# Text label for the paid-work indicator: 1 -> "yes", 0 -> "no"; any other
# value (including NA) becomes NA.
uk_data <- uk_data %>%
  mutate(
    pdwrk_recode = c("no", "yes")[match(pdwrk, c(0, 1))]
  )

# Group sizes after the recode.
table(uk_data$pdwrk_recode)

## 
##    no   yes 
## 10164 10815

# Counts on the original 0/1 coding, for comparison with the recoded labels.
table(uk_data$pdwrk)

## 
##     0     1 
## 10164 10815

# Cross-tabulate happiness (rows) against paid-work status (columns); the
# rendered table shows counts and row percentages.
happypdwrk <- datasummary_crosstab(happy ~ pdwrk_recode, data = uk_data)
happypdwrk

happy		no	yes	All
0	N	89	24	113
	% row	78.8	21.2	100.0
1	N	75	37	112
	% row	67.0	33.0	100.0
2	N	149	91	240
	% row	62.1	37.9	100.0
3	N	288	214	502
	% row	57.4	42.6	100.0
4	N	370	300	670
	% row	55.2	44.8	100.0
5	N	866	702	1568
	% row	55.2	44.8	100.0
6	N	733	808	1541
	% row	47.6	52.4	100.0
7	N	1589	2215	3804
	% row	41.8	58.2	100.0
8	N	2749	3350	6099
	% row	45.1	54.9	100.0
9	N	1708	2032	3740
	% row	45.7	54.3	100.0
10	N	1523	1034	2557
	% row	59.6	40.4	100.0
All	N	10164	10815	20979
	% row	48.4	51.6	100.0

# Column percentages: the distribution of happiness scores within each
# paid-work group (0 and 1).
cprop(table(uk_data$happy, uk_data$pdwrk))

##        
##         0     1     All  
##   0       0.9   0.2   0.5
##   1       0.7   0.3   0.5
##   2       1.5   0.8   1.1
##   3       2.8   2.0   2.4
##   4       3.6   2.8   3.2
##   5       8.5   6.5   7.5
##   6       7.2   7.5   7.4
##   7      15.7  20.5  18.2
##   8      27.1  31.0  29.1
##   9      16.8  18.8  17.9
##   10     15.0   9.6  12.2
##   Total 100.0 100.0 100.0

Here you can see how having a paid job relates to happiness. A larger share of people without paid work say they are unhappy with their lives than of those with paid work. Overall happiness in Britain is high in both groups, but a larger share of people in paid work rate their own happiness highly (scores 7 to 9).

Task 5

# Complete cases on happiness and paid work, with the paid-work indicator
# relabelled as text.
df <- uk_data %>%
  filter(!(is.na(happy) | is.na(pdwrk))) %>%
  mutate(
    pdwrk = case_when(
      pdwrk == 0 ~ "No",
      pdwrk == 1 ~ "Yes",
      # Any other non-missing value keeps its original code, as text.
      TRUE ~ as.character(pdwrk)
    )
  )

# Group sizes among the complete cases.
table(df$pdwrk)

## 
##    No   Yes 
## 10139 10807

# Respondents with both a happiness score and a paid-work answer.
uk_clean <- uk_data %>%
  filter(!is.na(pdwrk) & !is.na(happy))

# Share of each happiness score within each paid-work group, i.e.
# P(happy = h | pdwrk). The result is ungrouped so later verbs do not
# silently operate per group, and the 0/1 indicator gets a labelled factor
# so it is plotted as two discrete groups rather than a continuous gradient.
uk_probs <- uk_clean %>%
  count(happy, pdwrk) %>%
  group_by(pdwrk) %>%
  mutate(prob = n / sum(n)) %>%
  ungroup() %>%
  mutate(paid_work = factor(pdwrk, levels = c(0, 1), labels = c("No", "Yes")))

# One line per paid-work group. The manual colours are the end points of
# ggplot2's default continuous blue gradient, so "No" stays dark blue and
# "Yes" stays light blue, as when the numeric 0/1 column was mapped to colour.
ggplot(uk_probs, aes(x = as.factor(happy), y = prob, color = paid_work)) +
  geom_point() +
  geom_line(aes(group = paid_work)) +
  scale_color_manual(values = c("No" = "#132B43", "Yes" = "#56B1F7")) +
  labs(title = "Conditional Probabilities of happy level",
       subtitle = "with the paid job",
       x = "Happiness Scale",
       y = "Probability",
       color = "Paid work") +
  theme_minimal()

After visualization, the happiness levels of the two groups become clearer and easier to compare. Light blue is those with paid work, dark blue is those without paid work. It can be seen that in the happiness range from 0 to 4, that is, feeling very unhappy or somewhat unhappy, the proportion of people who do not have paid work is higher. In the range from 6 to 9, the proportion of people who have paid work is higher than in the other group, that is, they tend to be happier. So whether or not you have a job appears to be related to how happy people are. Both groups peak at 8. The overall shape of the two curves is very similar. Finally, it’s worth noting that a larger share of people who don’t have paid work report being extremely happy (a score of 10) than of those who do.

Homework_5_Di_Wu

2023-10-16