Overall there are six big tasks in this exercise. Some of the tasks may look very complicated to you, but if you follow the small steps within each big task, you will find your way through! Just give the tasks a bit of patience, give yourself a bit of confidence, and put your fingers on the keyboard!
1. Rewrite the following code chunks using the %>% operator.
#Question: Rewrite the following code chunks using the %>% operator.
#log(sd(c(5, 13, 89)))
#as.numeric(scale(c(100, 32, 45)))
library(tidyverse)
# Take the standard deviation first and the logarithm second: the pipe hands
# each result to the next call as its first argument.
c(5, 13, 89) %>%
  sd() %>%
  log()
# [1] 3.8

# scale() returns a one-column matrix; as.numeric() flattens it to a vector.
c(100, 32, 45) %>%
  scale() %>%
  as.numeric()
# [1] 1.14 -0.75 -0.39
2. Use Wave 1 data to see which cohort has the highest proportion of respondents who agree or agree completely with the statement "A child under age 6 will suffer from having a working mother" (i.e. val1i5).
library(tidyverse)
library(haven)
library(janitor)
library(ggplot2)
# Read the Wave 1 Stata file (path is relative to the working directory).
wave1 <- haven::read_dta(file = "anchor1_50percent_Eng.dta")
Please find out what the three variables "val1i5", "sd3" and "sat1i1" measure in the questionnaire and how they are measured. You can also rename these variables so that their names are easier to understand, and recode some categorical variables into factors.
# Build the working data set: transmute() creates the listed variables and
# drops every column of wave1 that is not listed.
wave1a <- wave1 %>%
  transmute(
    val1i5,                        # kept under its original name
    cohort  = as_factor(cohort),   # converted to a factor
    gender  = as_factor(sex_gen),  # sex_gen, renamed and converted to a factor
    age,                           # kept under its original name
    partner = as_factor(sd3),      # sd3, renamed and converted to a factor
    fam_sat = sat1i1               # sat1i1, renamed only
  )
Use mutate() to create a new variable "agree_val1i5", where 1 means that
val1i5 is 4 or 5 and 0 means that val1i5 is 1, 2 or 3. Then do a two-way
tabulation. You can refer to the exercise in Session 3 to review the use
of prop.table().
# Tabulate val1i5 to look for missing-value codes before recoding it.
tabyl(wave1a$val1i5)
# wave1a$val1i5 n percent
# -2 5 0.00081
# -1 92 0.01484
# 1 1201 0.19368
# 2 1173 0.18916
# 3 1660 0.26770
# 4 1117 0.18013
# 5 953 0.15368
# Some observations carry the negative codes -2 and -1; these are treated as
# missing in the next step.
wave1b <- wave1a %>%
  mutate(
    # Negative codes become NA; every other value is kept as a plain number.
    val1i5 = case_when(
      val1i5 < 0 ~ NA_real_,
      TRUE ~ as.numeric(val1i5)
    ),
    # agree_val1i5 is 1 when val1i5 is above 3 and 0 when val1i5 is one of
    # 1, 2, 3 (%in% tests membership in the vector 1:3); anything else is NA.
    agree_val1i5 = case_when(
      val1i5 > 3 ~ 1,
      val1i5 %in% 1:3 ~ 0,
      TRUE ~ NA_real_
    )
  ) %>%
  # Remove rows with a missing value in agree_val1i5 or cohort.
  drop_na(agree_val1i5, cohort)
# Row proportions (margin = 1): the share of each answer within every cohort.
table(wave1b$cohort, wave1b$agree_val1i5) %>%
  prop.table(margin = 1)
#
# 0 1
# -7 Incomplete data
# 0 former capikid first interview
# 1 1991-1993 0.62 0.38
# 2 1981-1983 0.68 0.32
# 3 1971-1973 0.68 0.32
# 4 2001-2003
# 9 former capikid re-interview
# Cohort 1991-1993 has the highest proportion.
# Of course, you can also use tabyl().
3. Please do a chi-square test to see whether respondents' gender is associated with their view on working mothers (i.e. "val1i5").
# Check gender for missing-value codes.
table(wave1a$gender)
#
# -10 not in demodiff -7 Incomplete data -4 Filter error / Incorrect entry
# 0 0 0
# -3 Does not apply 1 Male 2 Female
# 0 3029 3172
# Every observation is either Male or Female, so gender has no missing values.
table(wave1a$val1i5)
#
# -2 -1 1 2 3 4 5
# 5 92 1201 1173 1660 1117 953
# val1i5 contains the negative codes -2 and -1, so it needs cleaning.
tab <- wave1a %>%
  mutate(
    # Negative codes become NA; every other value is kept as a plain number.
    val1i5 = case_when(
      val1i5 < 0 ~ NA_real_,
      TRUE ~ as.numeric(val1i5)
    ),
    # Discard the levels of gender that have no observations.
    gender = fct_drop(gender)
  ) %>%
  drop_na(val1i5) %>%
  # Two-way table: val1i5 in the rows, gender in the columns.
  tabyl(val1i5, gender)
tab
# val1i5 1 Male 2 Female
# 1 480 721
# 2 573 600
# 3 781 879
# 4 609 508
# 5 524 429
chisq.test(tab)
#
# Pearson's Chi-squared test
#
# data: tab
# X-squared = 69, df = 4, p-value = 4e-14
4. Calculate the correlation coefficient between age and satisfaction with family life for the overall sample. Then estimate the correlation coefficient separately for the male and the female sample to see whether the correlation differs by gender.
Use the summarise() function. Note: you should first check age and
satisfaction with family life for missing values.
tabyl(wave1a$fam_sat)
# wave1a$fam_sat n percent
# -2 13 0.0021
# -1 10 0.0016
# 0 103 0.0166
# 1 50 0.0081
# 2 89 0.0144
# 3 208 0.0335
# 4 219 0.0353
# 5 593 0.0956
# 6 494 0.0797
# 7 1154 0.1861
# 8 1490 0.2403
# 9 773 0.1247
# 10 1005 0.1621
#13 answered -2 and 10 answered -1.
# Tabulate age to look for missing-value codes (negative values) before it is
# used in the correlation.
tabyl(wave1a$age)
# wave1a$age n percent
# 14 41 0.0066
# 15 708 0.1142
# 16 722 0.1164
# 17 667 0.1076
# 18 35 0.0056
# 24 24 0.0039
# 25 577 0.0930
# 26 678 0.1093
# 27 647 0.1043
# 28 87 0.0140
# 34 22 0.0035
# 35 502 0.0810
# 36 618 0.0997
# 37 772 0.1245
# 38 101 0.0163
# No negative codes appear in the table, so age has no missing values.
# Way 1: clean fam_sat and compute the correlation in a single pipeline.
correlation <- wave1a %>%
  mutate(
    # Negative codes become NA; every other value is kept as a plain number.
    fam_sat = case_when(
      fam_sat < 0 ~ NA_real_,
      TRUE ~ as.numeric(fam_sat)
    )
  ) %>%
  # Remove the rows whose fam_sat is now missing.
  drop_na(fam_sat) %>%
  # Correlation coefficient between age and fam_sat.
  dplyr::summarise(cor(x = age, y = fam_sat))
correlation
# # A tibble: 1 × 1
# `cor(x = age, y = fam_sat)`
# <dbl>
# 1 -0.0308
# Way 2: clean the data first and store the result as wave1c.
wave1c <- wave1a %>%
  mutate(
    # Negative codes become NA; every other value is kept as a plain number.
    fam_sat = case_when(
      fam_sat < 0 ~ NA_real_,
      TRUE ~ as.numeric(fam_sat)
    )
  ) %>%
  drop_na(fam_sat)
# Then use wave1c to calculate the correlation coefficient between age and
# fam_sat.
correlation <- wave1c %>%
  dplyr::summarise(cor(x = age, y = fam_sat))
correlation
# # A tibble: 1 × 1
# `cor(x = age, y = fam_sat)`
# <dbl>
# 1 -0.0308
Use the group_by() function.
#If you used Way 1 for the first step, you now need to clean the wave1a data again
gender_correlation <- wave1a %>%
  mutate(
    # Negative codes become NA; every other value is kept as a plain number.
    fam_sat = case_when(
      fam_sat < 0 ~ NA_real_,
      TRUE ~ as.numeric(fam_sat)
    )
  ) %>%
  # Remove the rows whose fam_sat is now missing.
  drop_na(fam_sat) %>%
  # One correlation coefficient between age and fam_sat per gender.
  group_by(gender) %>%
  dplyr::summarise(cor(x = age, y = fam_sat))
gender_correlation
# # A tibble: 2 × 2
# gender `cor(x = age, y = fam_sat)`
# <fct> <dbl>
# 1 1 Male -0.0148
# 2 2 Female -0.0453
# If you used Way 2 for the first step, wave1c is already clean and can be
# used directly to calculate the correlation coefficient by gender.
gender_correlation <- wave1c %>%
  # One correlation coefficient between age and fam_sat per gender.
  group_by(gender) %>%
  dplyr::summarise(cor(x = age, y = fam_sat))
gender_correlation
# # A tibble: 2 × 2
# gender `cor(x = age, y = fam_sat)`
# <fct> <dbl>
# 1 1 Male -0.0148
# 2 2 Female -0.0453
5. Calculate the average family life satisfaction for the overall sample. Then calculate the average family life satisfaction for those who have a partner and those who have no partner.
# If you used Way 1 in Question 4, clean wave1a again and then take the mean.
wave1a %>%
  mutate(
    # Negative codes become NA; every other value is kept as a plain number.
    fam_sat = case_when(
      fam_sat < 0 ~ NA_real_,
      TRUE ~ as.numeric(fam_sat)
    )
  ) %>%
  # Remove the rows whose fam_sat is now missing.
  drop_na(fam_sat) %>%
  dplyr::summarise(mean_famsat = mean(fam_sat))
# # A tibble: 1 × 1
# mean_famsat
# <dbl>
# 1 7.23
# If you used Way 2 in Question 4, wave1c can be used directly.
wave1c %>%
  dplyr::summarise(mean_famsat = mean(fam_sat))
# # A tibble: 1 × 1
# mean_famsat
# <dbl>
# 1 7.23
# With wave1c the mean can also be computed without "%>%".
mean(wave1c$fam_sat)
# [1] 7.2
# From here on wave1c is used directly, for simplicity.
# First check whether the partner variable has any missing-value codes.
class(wave1c$partner)
# [1] "factor"
# partner is a factor.
tabyl(wave1c$partner)
# wave1c$partner n percent
# -5 Inconsistent value 0 0.0000
# -4 Filter error / Incorrect entry 0 0.0000
# -3 Does not apply 0 0.0000
# -2 No answer 12 0.0019
# -1 Don't know 14 0.0023
# 1 Yes 3554 0.5753
# 2 No 2598 0.4205
# A few observations are coded "No answer" or "Don't know", so partner needs
# cleaning before the group means are computed.
part_famsat <- wave1c %>%
  mutate(
    # The two missing-value levels become NA; all other values are kept.
    partner = case_when(
      partner %in% c("-2 No answer", "-1 Don't know") ~ NA,
      TRUE ~ partner
    )
  ) %>%
  # Remove the rows whose partner is now missing.
  drop_na(partner) %>%
  # Mean of fam_sat within each remaining partner category.
  group_by(partner) %>%
  dplyr::summarise(mean(fam_sat))
part_famsat
# # A tibble: 2 × 2
# partner `mean(fam_sat)`
# <fct> <dbl>
# 1 1 Yes 7.27
# 2 2 No 7.18
6. Please generate two bar plots: figure 1 shows the distribution of family life satisfaction for the whole sample; figure 2 shows the distribution of family life satisfaction by cohort.
Use ggplot() to generate the first plot.
#way1: simply plot it
# fam_sat in wave1a still contains the negative missing-value codes (-2, -1).
# Rows with those codes are filtered out before plotting so that they are not
# drawn as if they were satisfaction levels.
figure1 <- ggplot(
  data = dplyr::filter(wave1a, fam_sat >= 0),
  mapping = aes(x = fam_sat)
) +
  geom_bar()
figure1
# Way 2: treat fam_sat as a factor and swap the coordinate system so that the
# bars run horizontally.
figure1 <- ggplot(
  data = dplyr::filter(wave1a, fam_sat >= 0),
  mapping = aes(x = as_factor(fam_sat)) # as_factor() makes fam_sat a factor
) +
  geom_bar() +
  coord_flip() # swap the coordinate system
figure1
# It looks nicer!
Use facet_wrap(~) to generate the second plot.
#way1: simply plot it
# fam_sat in wave1a still contains the negative missing-value codes (-2, -1).
# Rows with those codes are filtered out before plotting so that they are not
# drawn as if they were satisfaction levels.
figure2 <- ggplot(
  data = dplyr::filter(wave1a, fam_sat >= 0),
  mapping = aes(x = as_factor(fam_sat)) # as_factor() makes fam_sat a factor
) +
  geom_bar() +
  facet_wrap(~cohort) + # one panel per cohort
  coord_flip() # swap the coordinate system
figure2