Task 1

packages <- c("tidyverse", "modelsummary", "forcats", "RColorBrewer", 
              "fst", "viridis", "knitr", "kableExtra", "rmarkdown", "ggridges", "viridis", "questionr")

new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: viridisLite
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[3]]
##  [1] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
##  [6] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [11] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [16] "datasets"     "methods"      "base"        
## 
## [[4]]
##  [1] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
##  [6] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [11] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [16] "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[6]]
##  [1] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[7]]
##  [1] "knitr"        "viridis"      "viridisLite"  "fst"          "RColorBrewer"
##  [6] "modelsummary" "lubridate"    "forcats"      "stringr"      "dplyr"       
## [11] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [16] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [21] "datasets"     "methods"      "base"        
## 
## [[8]]
##  [1] "kableExtra"   "knitr"        "viridis"      "viridisLite"  "fst"         
##  [6] "RColorBrewer" "modelsummary" "lubridate"    "forcats"      "stringr"     
## [11] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [16] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [21] "utils"        "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "rmarkdown"    "kableExtra"   "knitr"        "viridis"      "viridisLite" 
##  [6] "fst"          "RColorBrewer" "modelsummary" "lubridate"    "forcats"     
## [11] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [16] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [21] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[11]]
##  [1] "ggridges"     "rmarkdown"    "kableExtra"   "knitr"        "viridis"     
##  [6] "viridisLite"  "fst"          "RColorBrewer" "modelsummary" "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[12]]
##  [1] "questionr"    "ggridges"     "rmarkdown"    "kableExtra"   "knitr"       
##  [6] "viridis"      "viridisLite"  "fst"          "RColorBrewer" "modelsummary"
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"
setwd("~/Desktop/SOC_202_YAY/")

getwd()  
## [1] "/Users/apple/Desktop/SOC_202_YAY"
ess <- read_fst("All-ESS-Data.fst")
table(ess$essround)
## 
##     1     2     3     4     5     6     7     8     9    10 
## 42359 47537 43000 56752 52458 54673 40185 44387 49519 59685
ess$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
  ess$year[ess$essround == i] <- replacements[i]
}

Next, create tables the three variables.

table(ess$eisced)
## 
##     0     1     2     3     4     5     6     7    55    77    88    99 
## 73306 38823 71917 74258 87348 49268 42651 49558  1052   483   497  1394
table(ess$stfeco)
## 
##     0     1     2     3     4     5     6     7     8     9    10    77    88 
## 39574 25671 45330 59142 55182 75639 57084 58459 41634 13226  8012   424 10562 
##    99 
##   616
table(ess$happy)
## 
##      0      1      2      3      4      5      6      7      8      9     10 
##   3933   3540   7343  13759  17016  52117  43707  87869 130802  76229  51006 
##     77     88     99 
##    330   2163    741
uk_data <- ess %>% 
  filter(cntry == "GB") %>% 
  mutate(
   stfeco = ifelse(stfeco %in% c( 77, 88, 99), NA, stfeco),
    eisced = ifelse(eisced %in% c(55,77, 88, 99), NA, eisced), 
   happy = ifelse(happy %in% c(77, 88, 99), NA, happy), 
  )

datasummary_skim(uk_data %>% select(stfeco, eisced, happy))
Unique (#) Missing (%) Mean SD Min Median Max
stfeco 12 2 4.4 2.3 0.0 5.0 10.0
eisced 9 2 2.2 2.4 0.0 1.0 7.0
happy 12 0 7.5 1.9 0.0 8.0 10.0

As can be seen from this chart, the average happiness index of British people is high. The British public is halfway between satisfied with the current state of the British economy, not very satisfied and not very dissatisfied overall. It can also be seen from the table that the highest average level of education in Britain is 2.2, which is the secondary level, which is somewhat unexpected.

Task 2

happy_by_year <- uk_data %>%
  group_by(year) %>%
  summarize(mean_trust = mean(happy, na.rm = TRUE))
happy_by_year
## # A tibble: 10 × 2
##     year mean_trust
##    <dbl>      <dbl>
##  1  2002       7.54
##  2  2004       7.37
##  3  2006       7.43
##  4  2008       7.44
##  5  2010       7.41
##  6  2012       7.50
##  7  2014       7.47
##  8  2016       7.64
##  9  2018       7.57
## 10  2020       7.29
ggplot(happy_by_year, aes(x = year, y = mean_trust)) +
  geom_line(color = "blue", size = 1) + 
  geom_point(color = "red", size = 3) +  
  labs(title = "Happy Level in UK (2002-2020)", 
       x = "Survey Year", 
       y = "Happy (0-10)") +
  ylim(0, 10) +  
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

It can be seen that the average happiness of the British people is high, and has remained stable at around 7.5 from 2002 to 2020. The past ten years have been stable, without major fluctuations and changes. But you can see a slight decline in average happiness from around 2017 through 2020.

Task 3

ess_selected <- ess %>%
  filter(cntry %in% c("GB", "DE", "FR")) %>%
  mutate(happy = ifelse(happy %in% c(77, 88, 99), NA, happy))


task3plot <- ggplot(ess_selected, aes(x = reorder(cntry, -happy, FUN=median), y = happy, fill = cntry)) +
  geom_boxplot() +
  theme_minimal() + 
  theme(legend.position = "none") + 
  labs(title = "Boxplot comparison about happiness in politicians (UK, Germany, France)", 
       x = "Country", 
       y = "Scale (0-10)")

task3plot
## Warning: Removed 253 rows containing non-finite values (`stat_boxplot()`).

The chart directly illustrates the differences in various happiness values between German, French and British people. The median happiness score in these countries is similar, at around 7.7. Germany has the lowest minimum value of happiness of the three countries. It is worth noting that the French data are negatively skewed.

Task 4

uk_data <- uk_data %>%
  mutate(
   
    edulvla = case_when(
      essround < 5 & edulvla == 55 ~ NA_real_,
      TRUE ~ edulvla
    ),
    
    edulvlb = case_when(
      essround >= 5 & edulvlb == 5555 ~ NA_real_,
      TRUE ~ edulvlb
    ),
    
    educ_level = case_when(
      essround < 5 & edulvla == 5 ~ "BA",
      essround >= 5 & edulvlb > 600 ~ "BA",
      TRUE ~ "No BA"
    )
  )

table(uk_data$educ_level)
## 
##    BA No BA 
##  6116 14863
happyedu <- datasummary_crosstab(happy ~ educ_level, data = uk_data)
happyedu
happy BA No BA All
0 N 14 99 113
% row 12.4 87.6 100.0
1 N 23 89 112
% row 20.5 79.5 100.0
2 N 33 207 240
% row 13.8 86.2 100.0
3 N 113 389 502
% row 22.5 77.5 100.0
4 N 153 517 670
% row 22.8 77.2 100.0
5 N 322 1246 1568
% row 20.5 79.5 100.0
6 N 448 1093 1541
% row 29.1 70.9 100.0
7 N 1252 2552 3804
% row 32.9 67.1 100.0
8 N 1913 4186 6099
% row 31.4 68.6 100.0
9 N 1282 2458 3740
% row 34.3 65.7 100.0
10 N 552 2005 2557
% row 21.6 78.4 100.0
All N 6116 14863 20979
% row 29.2 70.8 100.0
table(uk_data$happy, uk_data$educ_level) %>%
  cprop()
##        
##         BA    No BA All  
##   0       0.2   0.7   0.5
##   1       0.4   0.6   0.5
##   2       0.5   1.4   1.1
##   3       1.9   2.6   2.4
##   4       2.5   3.5   3.2
##   5       5.3   8.4   7.5
##   6       7.3   7.4   7.4
##   7      20.5  17.2  18.2
##   8      31.3  28.2  29.1
##   9      21.0  16.6  17.9
##   10      9.0  13.5  12.2
##   Total 100.0 100.0 100.0

What you can see here is that the majority of people in the UK are happy with their lives, regardless of whether they have a degree or not. Since there are more than twice as many non-BA holders as there are ba holders, it’s hard to see whether education has any effect on a person’s happiness.

uk_data <- uk_data %>%
  mutate(pdwrk_recode = case_when(
    pdwrk == 1 ~ 'yes',
    pdwrk == 0 ~ 'no', 
  ))


table(uk_data$pdwrk_recode)
## 
##    no   yes 
## 10164 10815
table(uk_data$pdwrk)
## 
##     0     1 
## 10164 10815
happypdwrk <- datasummary_crosstab(happy ~ pdwrk_recode, data = uk_data)
happypdwrk
happy no yes All
0 N 89 24 113
% row 78.8 21.2 100.0
1 N 75 37 112
% row 67.0 33.0 100.0
2 N 149 91 240
% row 62.1 37.9 100.0
3 N 288 214 502
% row 57.4 42.6 100.0
4 N 370 300 670
% row 55.2 44.8 100.0
5 N 866 702 1568
% row 55.2 44.8 100.0
6 N 733 808 1541
% row 47.6 52.4 100.0
7 N 1589 2215 3804
% row 41.8 58.2 100.0
8 N 2749 3350 6099
% row 45.1 54.9 100.0
9 N 1708 2032 3740
% row 45.7 54.3 100.0
10 N 1523 1034 2557
% row 59.6 40.4 100.0
All N 10164 10815 20979
% row 48.4 51.6 100.0
table(uk_data$happy, uk_data$pdwrk) %>%
 
   cprop()
##        
##         0     1     All  
##   0       0.9   0.2   0.5
##   1       0.7   0.3   0.5
##   2       1.5   0.8   1.1
##   3       2.8   2.0   2.4
##   4       3.6   2.8   3.2
##   5       8.5   6.5   7.5
##   6       7.2   7.5   7.4
##   7      15.7  20.5  18.2
##   8      27.1  31.0  29.1
##   9      16.8  18.8  17.9
##   10     15.0   9.6  12.2
##   Total 100.0 100.0 100.0

Here you can see how having a gainful job affects happiness. More people with unpaid jobs say they are unhappy with their lives than those with paid jobs. In Britain, though, overall happiness is high. But it can be seen that more people with jobs and wages rate their own happiness as higher.

Task 5

df <- uk_data %>%
  filter(!is.na(happy) & !is.na(pdwrk))

df <- df %>%
  mutate(pdwrk = case_when(
    pdwrk == 1 ~ "Yes",
    pdwrk == 0 ~ "No",
    TRUE ~ as.character(pdwrk)  # or "Other", or NA_character_, depending on your needs
  ))

table(df$pdwrk)
## 
##    No   Yes 
## 10139 10807
uk_clean <- uk_data %>%
  filter(!is.na(pdwrk) & !is.na(happy))

uk_probs <- uk_clean %>%
  count(happy, pdwrk) %>%
  group_by(pdwrk) %>%
  mutate(prob = n / sum(n))

ggplot(uk_probs, aes(x = as.factor(happy), y = prob, color = pdwrk)) +
  geom_point() +
  geom_line(aes(group = pdwrk)) +
  labs(title = "Conditional Probabilities of happy level",
       subtitle = "with the paid job",
       x = "Happiness Scale", 
       y = "Probability") +
  theme_minimal()

After visualization, the happiness levels of the two groups become clearer and easier to compare. Light blue is those with paid work, dark blue is those without paid work. It can be seen that in the happiness index range from 0 to 4, that is, feeling very unhappy or somewhat unhappy, the proportion of people who do not pay work is higher. And the people who have paid work are 6 to 10 points happier than the other group, that is, they are happier. So whether or not you have a job can affect how happy people are. Both groups peak at 8 at the same time. The trend of the two data points is very similar. Finally, it’s worth noting that more people who don’t have paid work are extremely happy than those who do.