class: center, middle, inverse, title-slide

.title[
# Advanced quantitative data analysis
]
.subtitle[
## Managing and exploring longitudinal data
]
.author[
### Mengni Chen
]
.institute[
### Department of Sociology, University of Copenhagen
]

---

<style type="text/css">
.remark-slide-content {
  font-size: 20px;
  padding: 20px 80px 20px 80px;
}
.remark-code, .remark-inline-code {
  background: #f0f0f0;
}
.remark-code {
  font-size: 14px;
}
</style>

#Let's get ready

```r
library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Handle labelled data.
library(splitstackshape) # Transform wide data (with stacked variables) to long data.
library(ggplot2)
```

---

#Prepare the data

- Import data

```r
# Or use a loop to import the files and avoid repetitive coding,
# similar to forvalues in Stata.
for (i in 1:6) {
  # assign() works like `<-` but takes the target name as a string;
  # paste0() glues "wave" and i together, with i running from 1 to 6,
  # so this creates the objects wave1, wave2, ..., wave6.
  assign(
    paste0("wave", i),
    read_dta(paste0("anchor", i, "_50percent_Eng.dta"))
  )
}
```

---

#Clean data

- Example case: does getting a partner make you happy?
```r
# Build a dataset of id, age, wave, sex, relstat (relationship status),
# hlt (health), sat (life satisfaction) and ptner (has a partner or not).
# Rows with a missing value in any of these variables are dropped.
clean_fun <- function(df) {
  df %>%
    transmute(
      id = zap_label(id), # remove label of id
      age = zap_label(age), # remove label of age
      wave = as.numeric(wave),
      sex = as_factor(sex_gen), # make sex_gen a factor
      relstat = as_factor(relstat), # make relstat a factor
      # treat "-7 Incomplete data" as missing, then make relstat a factor again
      relstat = case_when(
        relstat == "-7 Incomplete data" ~ NA_character_,
        TRUE ~ as.character(relstat)
      ) %>%
        as_factor(),
      # negative values of hlt1 are treated as missing
      hlt = case_when(
        hlt1 < 0 ~ NA_real_,
        TRUE ~ as.numeric(hlt1)
      ),
      # negative values of sat6 are treated as missing
      sat = case_when(
        sat6 < 0 ~ NA_real_,
        TRUE ~ as.numeric(sat6)
      ),
      # ptner is "No" for the three single statuses and "Yes" for the
      # statuses with a partner; any other relstat value matches neither
      # branch and becomes NA
      ptner = case_when(
        relstat %in% c(
          "1 Never married single",
          "6 Divorced/separated single",
          "9 Widowed single"
        ) ~ "No",
        relstat %in% c(
          "2 Never married LAT", "3 Never married COHAB",
          "4 Married COHAB", "5 Married noncohabiting",
          "7 Divorced/separated LAT", "8 Divorced/separated COHAB",
          "10 Widowed LAT", "11 Widowed COHAB"
        ) ~ "Yes"
      ) %>%
        as_factor()
    ) %>%
    drop_na() # keep complete cases only
}

wave1a <- clean_fun(wave1)
wave2a <- clean_fun(wave2)
wave3a <- clean_fun(wave3)
wave4a <- clean_fun(wave4)
wave5a <- clean_fun(wave5)
wave6a <- clean_fun(wave6)
```

---

#Explore panel data

- Example case: Does getting a partner make you happier?
- Keep those who are single initially and follow them across 6 waves

```r
## Sample selection

# Append ".<wave_no>" to every column except id (e.g. age -> age.1), so
# that each wave's variables stay distinguishable after joining by id.
add_wave_suffix <- function(df, wave_no) {
  rename_with(df, function(nm) paste0(nm, ".", wave_no), .cols = -id)
}

wave1b <- wave1a %>%
  filter(ptner == "No") %>% # keep those without a partner in wave 1
  add_wave_suffix(1) # rename variables
wave2b <- add_wave_suffix(wave2a, 2)
wave3b <- add_wave_suffix(wave3a, 3)
wave4b <- add_wave_suffix(wave4a, 4)
wave5b <- add_wave_suffix(wave5a, 5)
wave6b <- add_wave_suffix(wave6a, 6)

# By using left_join I keep those single in wave 1 and follow them;
# waves in which a person has no row are filled with NA.
panel_wide <- left_join(wave1b, wave2b, by = "id") %>% # left join wave1b and wave2b
  left_join(wave3b, by = "id") %>% # left join with wave3b
  left_join(wave4b, by = "id") %>% # left join with wave4b
  left_join(wave5b, by = "id") %>% # left join with wave5b
  left_join(wave6b, by = "id") # left join with wave6b
```

---

#Let us look at the data

<img src="https://github.com/fancycmn/slide8/blob/main/S8_Pic6.PNG?raw=true" width="100%" style="display: block; margin: 10px;">

---

#Explore the panel data

- Check the participation over time: how many people participate in the survey over time?
```r
# check how many participate in wave1 and wave2
table(panel_wide$wave.1, panel_wide$wave.2)
```

```
## 
##        2
##   1 1971
```

```r
# one participation pattern per person: the six wave indicators glued with "-"
panel_wide$check <- with(
  panel_wide,
  paste(wave.1, wave.2, wave.3, wave.4, wave.5, wave.6, sep = "-")
)
table(panel_wide$check)
```

```
## 
##      1-2-3-4-5-6     1-2-3-4-5-NA     1-2-3-4-NA-6    1-2-3-4-NA-NA
##             1045              142               48              154
##     1-2-3-NA-5-6    1-2-3-NA-5-NA   1-2-3-NA-NA-NA     1-2-NA-4-5-6
##               45               16              202               27
##    1-2-NA-4-5-NA    1-2-NA-4-NA-6   1-2-NA-4-NA-NA  1-2-NA-NA-NA-NA
##               10                1               14              267
##     1-NA-3-4-5-6    1-NA-3-4-5-NA    1-NA-3-4-NA-6   1-NA-3-4-NA-NA
##               30               10                6               12
##    1-NA-3-NA-5-6   1-NA-3-NA-5-NA  1-NA-3-NA-NA-NA    1-NA-NA-4-5-6
##                4                1               30                9
##  1-NA-NA-NA-5-NA 1-NA-NA-NA-NA-NA
##                1              517
```

---

#Explore the panel data

- Check the transition over time: how many individuals get a partner over time?
- by using `table()`

```r
# check how many get a partner from wave1 to wave2
table(panel_wide$ptner.1, panel_wide$ptner.2)
```

```
## 
##         No  Yes
##   No  1518  453
##   Yes    0    0
```

```r
# same cross-tabulation for each later pair of consecutive waves
table(panel_wide$ptner.2, panel_wide$ptner.3)
table(panel_wide$ptner.3, panel_wide$ptner.4)
table(panel_wide$ptner.4, panel_wide$ptner.5)
table(panel_wide$ptner.5, panel_wide$ptner.6)
```

---

#Explore the panel data

- Check the transition over time: how many individuals get a partner over time?
- by creating a transition variable

```r
# Step 1: make a panel dataset (person-period dataset)
panel_long <- merged.stack(
  panel_wide, # dataset to transform
  # var.stubs specifies the prefixes of the variable groups
  var.stubs = c("age", "wave", "sex", "relstat", "hlt", "sat", "ptner"),
  # sep specifies the character that separates the "variable name" from
  # the "times" in the source
  sep = "."
) %>%
  drop_na(wave) %>% # drop the observations which did not join the wave
  arrange(id, wave) # lag() below compares each row with the row before it

# Step 2: generate transition variables
# Within each person, compare ptner with its value in the previous observed
# row. The first row of a person has no previous value, so it gets 0.
panel_long <- panel_long %>%
  group_by(id) %>%
  mutate(
    havepartner = case_when(
      dplyr::lag(ptner, 1) == "No" & ptner == "Yes" ~ 1,
      TRUE ~ 0
    ),
    breakpartner = case_when(
      dplyr::lag(ptner, 1) == "Yes" & ptner == "No" ~ 1,
      TRUE ~ 0
    )
  ) %>%
  ungroup() # do not leave the grouping attached to panel_long
```

---

#Case 1

<img src="https://github.com/fancycmn/slide8/blob/main/S8_Pic7.PNG?raw=true" width="100%" style="display: block; margin: 10px;">

---

#Case 2

<img src="https://github.com/fancycmn/slide8/blob/main/S8_Pic8.PNG?raw=true" width="100%" style="display: block; margin: 10px;">

---

#Case 3

<img src="https://github.com/fancycmn/slide8/blob/main/S8_Pic9.PNG?raw=true" width="100%" style="display: block; margin: 10px;">

---

#Know the panel data

- Number of transitions

```r
table(panel_long$havepartner)
```

```
## 
##    0    1
## 9104 1266
```

```r
table(panel_long$breakpartner)
```

```
## 
##    0    1
## 9921  449
```

---

#Know the panel data

- Number of repeated transitions

```r
# count, per person, how often each transition happens
transitiontimes <- panel_long %>%
  group_by(id) %>%
  summarize(
    times_partner = sum(havepartner),
    times_departner = sum(breakpartner)
  )
table(transitiontimes$times_partner)
```

```
## 
##    0    1    2    3
## 1470  979  139    3
```

```r
table(transitiontimes$times_departner)
```

```
## 
##    0    1    2
## 2168  397   26
```

---

#Know the panel data

- Look at how life satisfaction changes over time

```r
set.seed(2022) # fix the seed so the same individuals are drawn on every run
sample_id <- sample(unique(panel_long$id), size = 12) # randomly select 12 individuals
```

```r
# find the 12 individuals in the panel_long data set by matching id
panel_sample <- filter(panel_long, id %in% sample_id)
# use ggplot to see changes of sat over time
ggplot(data = panel_sample, mapping = aes(x = wave, y = sat)) +
  geom_point() +
  facet_wrap(~ id, ncol = 6) # this is new, to plot sat by id
```

<img src="https://github.com/fancycmn/slide8/blob/main/S8_Pic10.PNG?raw=true" width="60%" style="display: block; margin: 10px;">

---

#Know the panel data

- Look at how life satisfaction changes when individuals get a partner

```r
# Identify in which wave individuals get a partner: "partnerwave" holds the
# wave number in rows where havepartner is 1 and is NA in all other rows.
panel_long <- panel_long %>%
  mutate(
    partnerwave = case_when(havepartner == 1 ~ wave)
  )

# find the 12 individuals in the panel_long data set by matching id
panel_sample <- filter(panel_long, id %in% sample_id)
# use ggplot to see changes of sat over time
ggplot(data = panel_sample) +
  geom_point(mapping = aes(x = wave, y = sat)) +
  # rows with NA partnerwave draw no line; na.rm = TRUE silences the warning
  geom_vline(mapping = aes(xintercept = partnerwave), na.rm = TRUE) +
  facet_wrap(~ id, ncol = 6) # this is new, to plot sat by id
```

<img src="https://github.com/fancycmn/slide8/blob/main/S8_Pic11.PNG?raw=true" width="60%" style="display: block; margin: 10px;">

---

#Take home

- Transform the data to wide or long format
- Learn how to check participation over time
- Learn how to check transition in status over time
  - use `table()`
  - create transition variable
- Learn how to check the changes of outcome variable over time
  - randomly select some individuals, using `sample()`
  - use `ggplot()` to get some visual impression
- Important code
  - `left_join()`
  - `paste()`
  - `dplyr::lag()`
  - `facet_wrap()` in `ggplot()`
  - `geom_vline()`

---
class: center, middle

#[Exercise](https://rpubs.com/fancycmn/964161)