Overall, there are six big tasks in this exercise. Some of the tasks may look very complicated to you, but if you follow the small steps within each big task, you will find your way through! Just give a bit of patience to the tasks, a bit of confidence to yourself, and your fingers to the keyboard!

1. Rewrite the following code chunks using the %>% operator.

# Question: rewrite the following nested calls using the %>% operator.
#   log(sd(c(5, 13, 89)))
#   as.numeric(scale(c(100, 32, 45)))
library(tidyverse)

# Read left to right: take the vector, compute its standard deviation,
# then take the natural logarithm of that value.
c(5, 13, 89) %>%
  sd() %>%
  log()
# [1] 3.8

# Standardise the vector with scale(), then convert the result back into a
# plain numeric vector.
c(100, 32, 45) %>%
  scale() %>%
  as.numeric()
# [1]  1.14 -0.75 -0.39

2. Use the Wave 1 data to see which cohort has the highest proportion of respondents who agree or agree completely with the statement "A child under age 6 will suffer from having a working mother" (i.e. val1i5).

library(tidyverse) 
library(haven)
library(janitor)
library(ggplot2)
# Load the Wave 1 data set (Stata .dta file, path relative to the working directory).
wave1 <- haven::read_dta(file = "anchor1_50percent_Eng.dta")
Keep only the following six variables from the Wave 1 data:

  1. val1i5,
  2. cohort,
  3. sex_gen,
  4. age,
  5. sd3,
  6. sat1i1.

Please find out what the three variables "val1i5", "sd3" and "sat1i1" measure, and how they are measured, in the questionnaire. You can also rename these variables so that their names are more understandable to you, and recode some of the categorical variables into factors.

# Build the working data set from wave1: keep only the six variables of
# interest, give the cryptic ones readable names, and convert the categorical
# variables (cohort, gender, partner) to factors.
wave1a <- wave1 %>%
  select(
    val1i5,
    cohort,
    gender = sex_gen,
    age,
    partner = sd3,
    fam_sat = sat1i1
  ) %>%
  mutate(across(c(cohort, gender, partner), as_factor))

# Frequency table of val1i5, used to look for missing-value codes.
tabyl(wave1a$val1i5)
#  wave1a$val1i5    n percent
#             -2    5 0.00081
#             -1   92 0.01484
#              1 1201 0.19368
#              2 1173 0.18916
#              3 1660 0.26770
#              4 1117 0.18013
#              5  953 0.15368
#there are some observations with values of -2 and -1.

# Clean val1i5 and derive a binary "agrees" indicator from it.
wave1b <- wave1a %>%
  # Step 1: negative codes become missing; all other values are kept as plain
  # numbers.
  mutate(val1i5 = if_else(val1i5 < 0, NA_real_, as.numeric(val1i5))) %>%
  # Step 2: values above 3 are coded 1 (agree); values 1, 2 and 3 are coded 0;
  # everything else stays missing. `%in% 1:3` tests membership in the set
  # {1, 2, 3}.
  mutate(
    agree_val1i5 = case_when(
      val1i5 > 3 ~ 1,
      val1i5 %in% 1:3 ~ 0,
      TRUE ~ NA_real_
    )
  ) %>%
  # Step 3: drop rows where the indicator or the cohort is missing.
  drop_na(agree_val1i5, cohort)

# Row proportions: within each cohort, the share coded 0 and the share coded 1.
prop.table(table(wave1b$cohort, wave1b$agree_val1i5), margin = 1)
#                                   
#                                       0    1
#   -7 Incomplete data                        
#   0 former capikid first interview          
#   1 1991-1993                      0.62 0.38
#   2 1981-1983                      0.68 0.32
#   3 1971-1973                      0.68 0.32
#   4 2001-2003                               
#   9 former capikid re-interview
#cohort 1991-1993 has the highest proportion.
# Of course, you can also use tabyl().

3. Please do a chi-square test to see whether respondents' gender is associated with their view on working mothers (i.e. "val1i5").

# Count respondents per gender level to check for missing-value categories.
table(wave1a[["gender"]])
# 
#               -10 not in demodiff                -7 Incomplete data -4 Filter error / Incorrect entry 
#                                 0                                 0                                 0 
#                 -3 Does not apply                            1 Male                          2 Female 
#                                 0                              3029                              3172
#no missing on gender

# Count respondents per val1i5 value to check for negative (missing) codes.
table(wave1a[["val1i5"]])
# 
#   -2   -1    1    2    3    4    5 
#    5   92 1201 1173 1660 1117  953
#some missing on val1i5, so needs cleaning

# Two-way table of attitude (val1i5) by gender, built from cleaned data.
tab <- wave1a %>%
  mutate(
    # Negative codes become missing; all other values are kept as plain numbers.
    val1i5 = if_else(val1i5 < 0, NA_real_, as.numeric(val1i5)),
    # Remove the gender levels that never occur.
    gender = fct_drop(gender)
  ) %>%
  filter(!is.na(val1i5)) %>% # keep only rows with a valid val1i5
  tabyl(val1i5, gender)
tab
#  val1i5 1 Male 2 Female
#       1    480      721
#       2    573      600
#       3    781      879
#       4    609      508
#       5    524      429

# Chi-square test on the two-way table.
chisq.test(tab)
# 
#   Pearson's Chi-squared test
# 
# data:  tab
# X-squared = 69, df = 4, p-value = 4e-14

4. Calculate the correlation coefficient between age and satisfaction with family life for the overall sample. Then estimate the correlation coefficient separately for the male and female samples to see whether the correlation differs by gender.

# Frequency table of family-life satisfaction, used to spot negative codes
# that need to be treated as missing before any calculation.
tabyl(wave1a$fam_sat)
#  wave1a$fam_sat    n percent
#              -2   13  0.0021
#              -1   10  0.0016
#               0  103  0.0166
#               1   50  0.0081
#               2   89  0.0144
#               3  208  0.0335
#               4  219  0.0353
#               5  593  0.0956
#               6  494  0.0797
#               7 1154  0.1861
#               8 1490  0.2403
#               9  773  0.1247
#              10 1005  0.1621
#13 answered -2 and 10 answered -1.

# Frequency table of age, used to check whether any values need cleaning.
tabyl(wave1a$age)
#  wave1a$age   n percent
#          14  41  0.0066
#          15 708  0.1142
#          16 722  0.1164
#          17 667  0.1076
#          18  35  0.0056
#          24  24  0.0039
#          25 577  0.0930
#          26 678  0.1093
#          27 647  0.1043
#          28  87  0.0140
#          34  22  0.0035
#          35 502  0.0810
#          36 618  0.0997
#          37 772  0.1245
#          38 101  0.0163
#no missing

# Way 1: clean the data and compute the correlation in a single pipeline.
correlation <- wave1a %>%
  # Negative codes become missing; valid scores are kept as plain numbers.
  mutate(fam_sat = if_else(fam_sat < 0, NA_real_, as.numeric(fam_sat))) %>%
  # Keep only respondents with a valid satisfaction score.
  filter(!is.na(fam_sat)) %>%
  # Correlation coefficient between age and family-life satisfaction.
  dplyr::summarise(cor(x = age, y = fam_sat))
correlation
# # A tibble: 1 × 1
#   `cor(x = age, y = fam_sat)`
#                         <dbl>
# 1                     -0.0308
# Way 2: clean the data once, store the result as wave1c, and reuse it later.
wave1c <- wave1a %>%
  # Negative codes become missing; valid scores are kept as plain numbers.
  mutate(fam_sat = if_else(fam_sat < 0, NA_real_, as.numeric(fam_sat))) %>%
  # Keep only respondents with a valid satisfaction score.
  filter(!is.na(fam_sat))

# Then use wave1c to calculate the correlation coefficient between age and
# family-life satisfaction.
correlation <- dplyr::summarise(wave1c, cor(x = age, y = fam_sat))
correlation
# # A tibble: 1 × 1
#   `cor(x = age, y = fam_sat)`
#                         <dbl>
# 1                     -0.0308
# If you used Way 1 for the first step, the wave1a data must be cleaned again
# before the correlation can be computed by gender.
gender_correlation <- wave1a %>%
  # Negative codes become missing; valid scores are kept as plain numbers.
  mutate(fam_sat = if_else(fam_sat < 0, NA_real_, as.numeric(fam_sat))) %>%
  # Keep only respondents with a valid satisfaction score.
  filter(!is.na(fam_sat)) %>%
  # One correlation coefficient per gender.
  group_by(gender) %>%
  dplyr::summarise(cor(x = age, y = fam_sat))
gender_correlation
# # A tibble: 2 × 2
#   gender   `cor(x = age, y = fam_sat)`
#   <fct>                          <dbl>
# 1 1 Male                       -0.0148
# 2 2 Female                     -0.0453
# If you used Way 2 for the first step, the already-cleaned wave1c can be
# grouped by gender directly to get one correlation coefficient per group.
gender_correlation <- dplyr::summarise(
  group_by(wave1c, gender),
  cor(x = age, y = fam_sat)
)
gender_correlation
# # A tibble: 2 × 2
#   gender   `cor(x = age, y = fam_sat)`
#   <fct>                          <dbl>
# 1 1 Male                       -0.0148
# 2 2 Female                     -0.0453

5. Calculate the average family life satisfaction for the overall sample. Then calculate the average family life satisfaction separately for those who have a partner and those who have no partner.

# If you used Way 1 in Question 4, clean wave1a once more and then take the
# mean of the valid satisfaction scores.
wave1a %>%
  # Negative codes become missing; valid scores are kept as plain numbers.
  mutate(fam_sat = if_else(fam_sat < 0, NA_real_, as.numeric(fam_sat))) %>%
  # Keep only respondents with a valid satisfaction score.
  filter(!is.na(fam_sat)) %>%
  dplyr::summarise(mean_famsat = mean(fam_sat))
# # A tibble: 1 × 1
#   mean_famsat
#         <dbl>
# 1        7.23
# If you used Way 2 in Question 4, the cleaned wave1c can be used directly.
dplyr::summarise(wave1c, mean_famsat = mean(fam_sat))
# # A tibble: 1 × 1
#   mean_famsat
#         <dbl>
# 1        7.23
# The same mean can be computed from wave1c without "%>%" by extracting the
# column and passing it to mean().
mean(wave1c[["fam_sat"]])
# [1] 7.2
#now, for simplicity, I use wave1c directly

# First, check whether the partner variable has any missing values.
# Step 1: inspect its class.
class(wave1c$partner)
# [1] "factor"
# It is a factor.
# Step 2: tabulate its levels to see which categories actually occur.
tabyl(wave1c$partner)
#                     wave1c$partner    n percent
#              -5 Inconsistent value    0  0.0000
#  -4 Filter error / Incorrect entry    0  0.0000
#                  -3 Does not apply    0  0.0000
#                       -2 No answer   12  0.0019
#                      -1 Don't know   14  0.0023
#                              1 Yes 3554  0.5753
#                               2 No 2598  0.4205
# There are some observations with "No answer" and "Don't know".
# this means we need to clean the data
#now, for simplicity, I use wave1c directly

# Mean family-life satisfaction by partnership status.
#
# Respondents whose partner answer is "-2 No answer" or "-1 Don't know" are
# excluded, together with any genuinely missing values, so that only valid
# answers are compared. The rows are filtered directly instead of recoding the
# factor inside case_when(): mixing a bare logical NA with a factor on the
# right-hand side is rejected by older dplyr versions.
part_famsat <- wave1c %>%
  filter(
    !is.na(partner),
    !partner %in% c("-2 No answer", "-1 Don't know")
  ) %>%
  # partner stays a factor; levels that no longer occur are removed.
  mutate(partner = fct_drop(partner)) %>%
  group_by(partner) %>%
  dplyr::summarise(mean(fam_sat))

part_famsat
# # A tibble: 2 × 2
#   partner `mean(fam_sat)`
#   <fct>             <dbl>
# 1 1 Yes              7.27
# 2 2 No               7.18

6. Please generate two bar plots: figure 1 shows the distribution of family life satisfaction for the whole sample; figure 2 shows the distribution of family life satisfaction by cohort.

# Way 1: simply plot it.
# Negative values of fam_sat (-2, -1) are missing-value codes rather than
# satisfaction scores, so they are removed before plotting; otherwise they
# would show up as extra bars in the distribution.
figure1 <- ggplot(data = wave1a %>% filter(fam_sat >= 0),
                  mapping = aes(x = fam_sat)) +
           geom_bar()
figure1

# Way 2: treat fam_sat as a factor and swap the coordinate system to make the
# plot nicer looking.
# Negative values of fam_sat (-2, -1) are missing-value codes rather than
# satisfaction scores, so they are removed before plotting.
figure1 <- ggplot(data = wave1a %>% filter(fam_sat >= 0),
                  mapping = aes(x = as_factor(fam_sat))) + # as_factor() makes fam_sat a factor
           geom_bar() +
           coord_flip() # swap the x and y axes
figure1
# It looks nicer!

# Figure 2: the same bar plot of family-life satisfaction, drawn separately
# for each cohort.
# Negative values of fam_sat (-2, -1) are missing-value codes rather than
# satisfaction scores, so they are removed before plotting.
figure2 <- ggplot(data = wave1a %>% filter(fam_sat >= 0),
                  mapping = aes(x = as_factor(fam_sat))) + # as_factor() makes fam_sat a factor
           geom_bar() +
           facet_wrap(~cohort) + # one panel per cohort
           coord_flip() # swap the x and y axes
figure2