class: center, middle, inverse, title-slide .title[ # Advanced quantitative data analysis ] .subtitle[ ## Managing and exploring longitudinal data ] .author[ ### Mengni Chen ] .institute[ ### Department of Sociology, University of Copenhagen ] --- <style type="text/css"> .remark-slide-content { font-size: 20px; padding: 20px 80px 20px 80px; } .remark-code, .remark-inline-code { background: #f0f0f0; } .remark-code { font-size: 14px; } </style> #Let's get ready ```r library(tidyverse) # Add the tidyverse package to my current library. library(haven) # Handle labelled data. library(janitor) #cleaning data library(splitstackshape) #transform wide data (with stacked variables) to long data library(ggplot2) ``` --- #Prepare the data - Import data ```r wave1 <- read_dta("anchor1_50percent_Eng.dta") wave2 <- read_dta("anchor2_50percent_Eng.dta") wave3 <- read_dta("anchor3_50percent_Eng.dta") wave4 <- read_dta("anchor4_50percent_Eng.dta") wave5 <- read_dta("anchor5_50percent_Eng.dta") wave6 <- read_dta("anchor6_50percent_Eng.dta") ``` --- #Clean data - Example case: does getting a partner make you happy? 
```r
# Clean one wave of the anchor data.
#
# Keeps id, age and wave, and derives:
#   sex      factor built from the labelled variable sex_gen
#   relstat  factor of relationship status; "-7 Incomplete data" becomes NA
#   health   hlt1 as a number; negative codes become NA
#   sat      sat6 as a number; negative codes become NA
#   partner  factor with levels "No", "Yes", derived from relstat
# Rows with a missing value in any of these columns are dropped.
#
# df: a data frame with the columns id, age, wave, sex_gen, relstat, hlt1
#     and sat6.
# Returns: a data frame with the eight columns listed above.
clean_fun <- function(df) {
  single_status <- c(
    "1 Never married single",
    "6 Divorced/separated single",
    "9 Widowed single"
  )
  partnered_status <- c(
    "2 Never married LAT", "3 Never married COHAB",
    "4 Married COHAB", "5 Married noncohabiting",
    "7 Divorced/separated LAT", "8 Divorced/separated COHAB",
    "10 Widowed LAT", "11 Widowed COHAB"
  )

  df %>%
    transmute(
      id,
      age,
      wave,
      sex = as_factor(sex_gen), # make sex_gen a factor
      relstat = as_factor(relstat), # make relstat a factor
      # Set the missing code to NA on the factor itself. Going through
      # character and back with as_factor() would order the levels by first
      # appearance, so the order could differ from wave to wave.
      relstat = droplevels(
        replace(relstat, relstat %in% "-7 Incomplete data", NA)
      ),
      health = case_when(
        hlt1 < 0 ~ NA_real_, # specify when hlt1 is missing
        TRUE ~ as.numeric(hlt1)
      ),
      sat = case_when(
        sat6 < 0 ~ NA_real_, # specify when sat6 is missing
        TRUE ~ as.numeric(sat6)
      ),
      # Statuses in neither list give NA and are removed by drop_na() below.
      # Explicit levels keep "No" before "Yes" in every wave.
      partner = case_when(
        relstat %in% single_status ~ "No",
        relstat %in% partnered_status ~ "Yes"
      ) %>%
        factor(levels = c("No", "Yes"))
    ) %>%
    drop_na()
}

wave1a <- clean_fun(wave1)
wave2a <- clean_fun(wave2)
wave3a <- clean_fun(wave3)
wave4a <- clean_fun(wave4)
wave5a <- clean_fun(wave5)
wave6a <- clean_fun(wave6)
```

---
#Explore panel data

- Example case: Does getting a partner make you happier?
- Keep those who are single initially and follow them across 6 waves

```r
## Sample selection
# Every column except id gets the wave number as a suffix, so the six waves
# can sit side by side in one wide data set.
# Only wave 1 is filtered: the sample is those without a partner in wave 1.
wave1b <- wave1a %>%
  filter(partner == "No") %>%
  rename_with(~ paste0(.x, "_1"), .cols = -id)
wave2b <- rename_with(wave2a, ~ paste0(.x, "_2"), .cols = -id)
wave3b <- rename_with(wave3a, ~ paste0(.x, "_3"), .cols = -id)
wave4b <- rename_with(wave4a, ~ paste0(.x, "_4"), .cols = -id)
wave5b <- rename_with(wave5a, ~ paste0(.x, "_5"), .cols = -id)
wave6b <- rename_with(wave6a, ~ paste0(.x, "_6"), .cols = -id)

# Left-join the later waves onto wave 1 one after another, matching on id.
# A left join keeps every row of wave1b, i.e. those single in wave 1.
panel_wide <- reduce(
  list(wave1b, wave2b, wave3b, wave4b, wave5b, wave6b),
  left_join,
  by = "id"
)
```

---
#Let us look at the data

<img src="https://github.com/fancycmn/2024-Session9/blob/main/Figure1.JPG?raw=true" width="100%" style="display: block; margin: 10px;">

---
#Explore the panel data

- Check the participation over time: how many people participate in the survey over time?
```r
# How many wave-1 respondents also appear in wave 2?
table(panel_wide$wave_1, panel_wide$wave_2)
```

```
##    
##        2
##   1 1971
```

```r
# check glues the six wave indicators together with "-"; a wave without
# participation shows up as NA in the pattern.
panel_wide <- panel_wide %>%
  mutate(
    check = paste(wave_1, wave_2, wave_3, wave_4, wave_5, wave_6, sep = "-")
  )
table(panel_wide$check)
```

```
## 
##      1-2-3-4-5-6     1-2-3-4-5-NA     1-2-3-4-NA-6    1-2-3-4-NA-NA
##             1045              142               48              154
##     1-2-3-NA-5-6    1-2-3-NA-5-NA   1-2-3-NA-NA-NA     1-2-NA-4-5-6
##               45               16              202               27
##    1-2-NA-4-5-NA    1-2-NA-4-NA-6   1-2-NA-4-NA-NA  1-2-NA-NA-NA-NA
##               10                1               14              267
##     1-NA-3-4-5-6    1-NA-3-4-5-NA    1-NA-3-4-NA-6   1-NA-3-4-NA-NA
##               30               10                6               12
##    1-NA-3-NA-5-6   1-NA-3-NA-5-NA  1-NA-3-NA-NA-NA    1-NA-NA-4-5-6
##                4                1               30                9
##  1-NA-NA-NA-5-NA 1-NA-NA-NA-NA-NA
##                1              517
```

---
#Explore the panel data

```r
# The same frequency table with tabyl(), plus a total row
panel_wide %>%
  tabyl(check) %>%
  adorn_totals(where = "row")
```

```
##             check    n      percent
##       1-2-3-4-5-6 1045 0.4033191818
##      1-2-3-4-5-NA  142 0.0548050946
##      1-2-3-4-NA-6   48 0.0185256658
##     1-2-3-4-NA-NA  154 0.0594365110
##      1-2-3-NA-5-6   45 0.0173678117
##     1-2-3-NA-5-NA   16 0.0061752219
##    1-2-3-NA-NA-NA  202 0.0779621768
##      1-2-NA-4-5-6   27 0.0104206870
##     1-2-NA-4-5-NA   10 0.0038595137
##     1-2-NA-4-NA-6    1 0.0003859514
##    1-2-NA-4-NA-NA   14 0.0054033192
##   1-2-NA-NA-NA-NA  267 0.1030490158
##      1-NA-3-4-5-6   30 0.0115785411
##     1-NA-3-4-5-NA   10 0.0038595137
##     1-NA-3-4-NA-6    6 0.0023157082
##    1-NA-3-4-NA-NA   12 0.0046314164
##     1-NA-3-NA-5-6    4 0.0015438055
##    1-NA-3-NA-5-NA    1 0.0003859514
##   1-NA-3-NA-NA-NA   30 0.0115785411
##     1-NA-NA-4-5-6    9 0.0034735623
##   1-NA-NA-NA-5-NA    1 0.0003859514
##  1-NA-NA-NA-NA-NA  517 0.1995368584
##             Total 2591 1.0000000000
```

---
#Explore the panel data

- Check the transition over time: how many individuals get a partner over time?
- by using `table()`

```r
# Partner status in wave 1 (rows) against wave 2 (columns)
table(panel_wide$partner_1, panel_wide$partner_2)
```

```
##      
##         No  Yes
##   No  1518  453
##   Yes    0    0
```

```r
panel_wide %>% tabyl(partner_1, partner_2)
```

```
##  partner_1   No Yes NA_
##         No 1518 453 620
##        Yes    0   0   0
```

```r
panel_wide %>% tabyl(partner_2, partner_3)
panel_wide %>% tabyl(partner_3, partner_4)
panel_wide %>% tabyl(partner_4, partner_5)
panel_wide %>% tabyl(partner_5, partner_6)
```

---
#Explore the panel data

- Check the transition over time: how many individuals get a partner over time?
- first, transform the format of the data from wide to long

```r
#Step 1: make a panel dataset (person-period dataset)
# panel_wide is the data set to transform.
# var.stubs gives the prefixes of the variable groups.
# sep is the character that separates the variable name from the time.
panel_long <- merged.stack(
  panel_wide,
  var.stubs = c("age", "wave", "sex", "relstat", "health", "sat", "partner"),
  sep = "_"
)
# Drop the person-wave rows of waves the person did not take part in.
panel_long <- drop_na(panel_long, wave)
```

---
#Explore the panel data

- Check the transition over time: how many individuals get a partner over time?
- second, define transition or event over time

I make a simplified data for demonstration.

```r
example <- panel_long %>%
  select(id, check, .time_1, wave, relstat, partner)
```

<img src="https://github.com/fancycmn/2024-Session9/blob/main/Figure4.JPG?raw=true" width="100%" style="display: block; margin: 10px;">

---
#Let us look at some cases

<img src="https://github.com/fancycmn/2024-Session9/blob/main/Figure2.JPG?raw=true" width="100%" style="display: block; margin: 10px;">

---
#Let us look at some cases

<img src="https://github.com/fancycmn/2024-Session9/blob/main/Figure3.JPG?raw=true" width="100%" style="display: block; margin: 10px;">

---
#Explore the panel data

- Check the transition over time: how many individuals get a partner over time?
- second, define transition or event over time
- For a respondent , if partner=Yes at time t and partner=No at time t-1 (i.e. the previous wave), we can say that at time t, this person gets a partner
- For a respondent , if partner=No at time t and partner=Yes at time t-1 (i.e. the previous wave), we can say that at time t, this person just break the partnership
- To find the partnership stats at t-1, we can use `dplyr::lag(x=?, n=?)`
- here to specify the variable for x, specify how many wave you want to lag
- `dplyr::lag(partner, n=1)`, lag partner by 1 wave

---
#Explore the panel data

- Check the transition over time: how many individuals get a partner over time?
- second, define transition or event over time

```r
#Step 2: generate transition variables
# dplyr::lag() takes the value from the row above within each id, so the rows
# are sorted by wave first instead of relying on the incoming row order.
# NOTE: the row above is the previous wave the person took part in. Rows of
# waves without participation were dropped in Step 1, so for someone who
# skipped a wave this is not wave t-1.
panel_long <- panel_long %>%
  arrange(id, wave) %>%
  group_by(id) %>%
  mutate(
    # 1 in the wave where partner switches from "No" to "Yes", else 0.
    # A person's first row has no previous value (NA) and therefore gets 0.
    getpartner = case_when(
      partner == "Yes" & dplyr::lag(partner, 1) == "No" ~ 1,
      TRUE ~ 0
    ),
    # 1 in the wave where partner switches from "Yes" to "No", else 0.
    breakpartner = case_when(
      partner == "No" & dplyr::lag(partner, 1) == "Yes" ~ 1,
      TRUE ~ 0
    )
  ) %>%
  ungroup() # do not leave the grouping on the data for later steps
```

---
#Case 1

<img src="https://github.com/fancycmn/2024-Session9/blob/main/Figure5a.JPG?raw=true" width="100%" style="display: block; margin: 10px;">

---
#Case 2

<img src="https://github.com/fancycmn/2024-Session9/blob/main/Figure5b.JPG?raw=true" width="100%" style="display: block; margin: 10px;">

---
#Case 3

<img src="https://github.com/fancycmn/2024-Session9/blob/main/Figure5c.JPG?raw=true" width="100%" style="display: block; margin: 10px;">

---
#Know the panel data

- Number of transitions

```r
table(panel_long$getpartner)
```

```
## 
##    0    1
## 9104 1266
```

```r
table(panel_long$breakpartner)
```

```
## 
##    0    1
## 9921  449
```

---
#Know the panel data

- Number of repeated transitions

```r
# Count, per person, how often each transition happens
transitiontimes <- panel_long %>%
  group_by(id) %>%
  summarize(
    times_partner = sum(getpartner),
    times_departner = sum(breakpartner)
  )
table(transitiontimes$times_partner)
```

```
## 
##    0    1    2    3
## 1470  979  139    3
```

```r
table(transitiontimes$times_departner)
```

```
## 
##    0    1    2
## 2168  397   26
```

---
#Know the panel data

- When transitions happen

```r
# partnerwave holds the wave number in the rows where a person gets a partner
# and is NA in all other rows; departnerwave does the same for break-ups.
# This is a row-by-row rule, so no grouping by id is needed.
panel_long <- panel_long %>%
  ungroup() %>%
  mutate(
    partnerwave = case_when(getpartner == 1 ~ wave),
    departnerwave = case_when(breakpartner == 1 ~ wave)
  )
```

<img src="https://github.com/fancycmn/2024-Session9/blob/main/Figure8.JPG?raw=true" width="100%" style="display: block; margin: 10px;">

---
#Know the panel data

- Look at how life satisfaction change over time

```r
# Fix the random seed so the same 12 individuals are drawn on every run
# (the value itself is arbitrary).
set.seed(2024)
sample_id <- sample(unique(panel_long$id), size = 12) # randomly select 12 individuals
```

```r
# Find the 12 individuals in the panel_long data set by matching id
panel_sample <- filter(panel_long, id %in% sample_id)

# Use ggplot to see changes of sat over time
ggplot(data = panel_sample, mapping = aes(x = wave, y = sat)) +
  geom_point() +
  facet_wrap(~ id, ncol = 6) # this is new, to plot sat by id
```

---
#Know the panel data

<img src="https://github.com/fancycmn/2024-Session9/blob/main/Figure6.JPG?raw=true" width="100%" style="display: block; margin: 10px;">

---
#Know the panel data

- Look at how life satisfaction change when individuals get a partner

```r
# partnerwave and departnerwave are NA in every row without a transition;
# na.rm = TRUE drops those rows without a warning.
ggplot(data = panel_sample) +
  geom_point(mapping = aes(x = wave, y = sat)) +
  # red line: the wave of getting a partner
  geom_vline(
    mapping = aes(xintercept = partnerwave),
    colour = "red",
    na.rm = TRUE
  ) +
  # blue line: the wave of breakup
  geom_vline(
    mapping = aes(xintercept = departnerwave),
    colour = "blue",
    na.rm = TRUE
  ) +
  facet_wrap(~ id, ncol = 6) # this is new, to plot sat by id
```

---
#Know the panel data

<img src="https://github.com/fancycmn/2024-Session9/blob/main/Figure7.JPG?raw=true" width="100%" style="display: block; margin: 10px;">

---
#Take home

- Transform the data to wide or long format
- Learn how to check participation over time
- Learn how to check transition in status over time
- use `table()`
- create
transition variable - Learn how to check the changes of outcome variable over time - randomly select some individuals, using `sample()` - use `ggplot()` to get some visual impression - Important code - `left_join()` - `paste()` - `dplyr::lag()` - `facet_wrap()` in `ggplot()` - `geom_vline()` --- class: center, middle #[Exercise](https://rpubs.com/fancycmn/1238990)