class: center, middle, inverse, title-slide .title[ # Advanced quantitative data analysis ] .subtitle[ ## Visualization and Introduction to Panel Data ] .author[ ### Mengni Chen ] .institute[ ### Department of Sociology, University of Copenhagen ] --- <style type="text/css"> .remark-slide-content { font-size: 20px; padding: 20px 80px 20px 80px; } .remark-code, .remark-inline-code { background: #f0f0f0; } .remark-code { font-size: 12px; } </style> #Let's get ready ```r #install a new package install.packages("broom") #Allows us to turn model objects into tibbles ``` ```r #install.packages("broom") # Add packages to library library(tidyverse) # Add the tidyverse package to my current library. library(haven) # Import data. library(janitor)# Cleaning data library(ggplot2) # Allows us to create nice figures. library(estimatr) # Allows us to estimate (cluster-)robust standard errors. library(texreg) # Allows us to make nicely-formatted HTML & LaTeX regression tables. library(broom) # Allows us to turn model objects into tibbles. ``` --- #Prepare data We use a dataset of age, sex, relationship status, and life satisfaction - DV: life satisfaction - IV: age, sex, relationship status - [Click here to copy and run the code to get a cleaned dataset](https://rpubs.com/fancycmn/1232910) --- #Prepare data ```r ols1 <- lm_robust(formula = sat6 ~ age, data = wave1c) ols2 <- lm_robust(formula = sat6 ~ age + relstat_new2, data = wave1c) ols3 <- lm_robust(formula = sat6 ~ age + relstat_new2*sex_gen, data = wave1c) texreg::htmlreg(list(ols1,ols2,ols3), include.ci = FALSE, file = "MyOLSModels(2024).html") ``` ``` ## The table was written to the file 'MyOLSModels(2024).html'. 
``` <img src="https://github.com/fancycmn/24-Session7/blob/main/F7.JPG?raw=true" width="40%" style="display: block; margin: auto;" > --- #Make your output nicer ```r htmlreg(list(ols1,ols2,ols3), include.ci = FALSE, custom.coef.names = c("Intercept", "Age", "Partnered (Ref.=Single)", "Female(Ref.=Male)", "Partnered x Female" ), caption = "My regression table", caption.above = TRUE, single.row = TRUE, digits = 3, file = "MyOLSModels(2024)_v2.html") ``` ``` ## The table was written to the file 'MyOLSModels(2024)_v2.html'. ``` <img src="https://github.com/fancycmn/24-Session7/blob/main/F8.JPG?raw=true" width="50%" style="display: block; margin: auto;" > --- #Visualize the regression coefficient `broom::tidy()` turns a model object into a tibble containing coefficients and inference stats. ```r result_ols3 <- broom::tidy(ols3) ``` - Make your result a dataset to plot ```r plotdata <- result_ols3 %>% # Use the tidied ols3 results, then mutate( term1=as_factor(term), variable = fct_recode(term1, # Recode predictor names, then "Intercept" = "(Intercept)", "Age" = "age", "Partnered" = "relstat_new2partnered", "Female" = "sex_gen2 Female", "Partnered x Female"= "relstat_new2partnered:sex_gen2 Female" )) %>% select(variable, estimate, conf.low, conf.high) plotdata ``` ``` ## variable estimate conf.low conf.high ## 1 Intercept 8.25419018 8.10577567 8.40260469 ## 2 Age -0.03706469 -0.04330089 -0.03082849 ## 3 Partnered 0.57741762 0.44073349 0.71410176 ## 4 Female 0.00034688 -0.13977482 0.14046858 ## 5 Partnered x Female 0.04393919 -0.13461431 0.22249269 ``` --- #Now you can plot the regression coefficient - specify the layer: what are x and y, in bar, point, or other forms - plot a point chart ```r ggplot(data = plotdata, #specify your dataset by "data=" aes(x = variable, y = estimate)) + #specify your x and y in the plot geom_point() #plot the result as points ``` <img src="https://github.com/fancycmn/24-Session7/blob/main/F1.JPG?raw=true" width="45%" 
style="display: block; margin: auto;" > --- #Now you can plot the regression coefficient - specify the layer: what are x and y, in bar, point, or other forms - plot a point chart ```r ggplot(data = plotdata, #specify your dataset by "data=" aes(x = variable, y = estimate)) + #specify your x and y in the plot geom_point() + #plot the result as points coord_flip() ``` <img src="https://github.com/fancycmn/24-Session7/blob/main/F2.JPG?raw=true" width="45%" style="display: block; margin: auto;" > --- #Now you can plot the regression coefficient - Modify your point chart, size and color ```r ggplot(data = plotdata, #specify your dataset by "data=" aes(x = variable, y = estimate)) + #specify your x and y in the plot geom_point(size=5, color="blue" )+ #change the size and color of the point coord_flip() ``` <img src="https://github.com/fancycmn/24-Session7/blob/main/F3.JPG?raw=true" width="45%" style="display: block; margin: auto;" > --- #Plot it with confidence interval - plot a point chart with confidence interval ```r ggplot(data = plotdata, aes(x = variable, y = estimate, ymin = conf.low, ymax = conf.high)) + geom_pointrange() + #this is the code to plot point with confidence interval, but you have to specify ymin and ymax for the CI coord_flip() ``` <img src="https://github.com/fancycmn/24-Session7/blob/main/F4.JPG?raw=true" width="45%" style="display: block; margin: auto;" > --- #Plot it with confidence interval - add a reference line ```r ggplot(data = plotdata, aes(x = variable, y = estimate, ymin = conf.low, ymax = conf.high)) + geom_pointrange() + #this is the code to plot point with confidence interval, but you have to specify ymin and ymax for the CI coord_flip() + geom_hline(yintercept = 0, color = "red", lty = "dashed") # yintercept is to make y=0 as the ref. 
line, lty is to specify that the line is a dashed line ``` <img src="https://github.com/fancycmn/24-Session7/blob/main/F5.JPG?raw=true" width="45%" style="display: block; margin: auto;" > --- #Plot it with confidence interval - remove intercept ```r ggplot(data = plotdata[-1,], #remove intercept using [] where [-1,] removes Row 1 aes(x = variable, y = estimate, ymin = conf.low, ymax = conf.high)) + geom_pointrange() + #this is the code to plot point with confidence interval, but you have to specify ymin and ymax for the CI coord_flip() + geom_hline(yintercept = 0, color = "red", lty = "dashed") # yintercept is to make y=0 as the ref. line, lty is to specify that the line is a dashed line ``` <img src="https://github.com/fancycmn/24-Session7/blob/main/F6.JPG?raw=true" width="45%" style="display: block; margin: auto;" > --- #Questions for Portfolio 1 --- #Data in long format and wide format Data in **wide format**: contains values that do not repeat in the first column <img src="https://github.com/fancycmn/24-Session7/blob/main/F9.JPG?raw=true" width="120%" style="display: block; margin: auto;" > --- #Data in long format and wide format Data in **long format**: contains values that do repeat in the first column <img src="https://www.theanalysisfactor.com/wp-content/uploads/2013/10/image002.jpg" width="80%" style="display: block; margin: 10px;"> --- #What is panel data .pull-left[ **Micro panel** - N(persons) >>> T(time points) - PAIRFAM, SOEP, BHPS, SHARE, HILDA, GGP, etc. <img src="https://github.com/fancycmn/slide-7/blob/main/S7_Pic2.PNG?raw=true" width="50%" style="display: block; margin: 10px;"> ] .pull-right[ **Macro panel** - N countries >>> T(time points) - OECD, World Bank, UNPD, etc. 
<img src="https://github.com/fancycmn/slide-7/blob/main/S7_Pic3.PNG?raw=true" width="50%" style="display: block; margin: 50px 30px;"> ] --- #Unbalanced and balanced panel data .pull-left[ **Unbalanced panel** - Units observed at different points in time - The usual case in micro surveys - Selection problem if being observed is systematic <img src="https://github.com/fancycmn/slide-7/blob/main/S7_Pic4.PNG?raw=true" width="50%" style="display: block; margin: 30px;"> ] .pull-right[ **Balanced panel** - Every unit observed at all times - Ideal-typical case, more often in macro panel data - Never realized in surveys, selection problem if forced <img src="https://github.com/fancycmn/slide-7/blob/main/S7_Pic5.PNG?raw=true" width="50%" style="display: block; margin: 10px 30px;"> ] --- #Benefits and problems of panel data - Benefit - Temporal order of events: panel data > cross-sectional data - Causal inference: within-person comparison > between-person comparison - Identification of causal effects: compare the same person P at t0 to t1 - Both benefit and problem - Cost of data collection: 1) low sampling costs; 2) high costs of panel maintenance; 3) overall, lower costs compared to repeated cross-sections - Reliability and validity of constructs: higher reliability; assessment of stable and variable constructs (IQ, personality); - Respondents learn to deal with the questionnaires - Questions may change over time - Problem - At start: similar to a cross-sectional survey - Over time: becomes more selective due to attrition - Solution: weighting (design weights + longitudinal weights) - Refreshment samples --- #What does a micro panel dataset often contain? - A micro panel dataset (a person-period dataset) has four types of variables - A subject identifier (e.g. an ID for the person) - A time indicator (e.g. 
the year of the survey) - Outcome variables - Predictor variables --- #Import data ```r wave1 <- read_dta("anchor1_50percent_Eng.dta") wave2 <- read_dta("anchor2_50percent_Eng.dta") wave3 <- read_dta("anchor3_50percent_Eng.dta") wave4 <- read_dta("anchor4_50percent_Eng.dta") wave5 <- read_dta("anchor5_50percent_Eng.dta") wave6 <- read_dta("anchor6_50percent_Eng.dta") ``` --- #First, check data - Think about what variables you want for analysis - See whether the variables are coded and labelled in the same way across waves - Some variables that are often used - ID (`id`) - Gender - Age - Marital status - Labor force status - Health - Education - No. of children - Income - Life satisfaction: the outcome variable --- #First, check data - In a simple case, I consider the variables: id, age, sex_gen, relstat, hlt1, sat6 ```r wave1$sex_gen %>% as_factor() %>% table() wave2$sex_gen %>% as_factor() %>% table() wave3$sex_gen %>% as_factor() %>% table() wave4$sex_gen %>% as_factor() %>% table() wave5$sex_gen %>% as_factor() %>% table() wave6$sex_gen %>% as_factor() %>% table() ``` Write similar code for the other variables to see the distribution and levels across different datasets - Or you could write a function to run the repeated code on different datasets. 
```r
# Define a function that tabulates the factor variable "sex_gen" of a dataset
sex_fun <- function(df) {
  table(as_factor(df$sex_gen))
}

sex_fun(wave1) # just enter your dataset in the function "sex_fun()"
```

```
## 
##               -10 not in demodiff                -7 Incomplete data 
##                                 0                                 0 
## -4 Filter error / Incorrect entry                 -3 Does not apply 
##                                 0                                 0 
##                            1 Male                          2 Female 
##                              3029                              3172
```

---

#First, check data

- use [`sapply`](https://www.youtube.com/watch?v=ejVWRKidi9M) to run the repeated code for six waves

```r
# sapply: loop over a list and evaluate a function on each element;
# unlike lapply, the result is shown in a table
sapply(mget(paste0("wave", 1:6)), sex_fun)
```

<img src="https://github.com/fancycmn/slide-7/blob/main/S7_Pic9.PNG?raw=true" width="90%" style="display: block; margin: 30px;">

---

#First, check data

```r
# what does paste0() do?
paste0("wave", 1)
```

```
## [1] "wave1"
```

```r
paste0("wave", 1:6)
```

```
## [1] "wave1" "wave2" "wave3" "wave4" "wave5" "wave6"
```

```r
# mget() collects the objects named wave1, ..., wave6 into one named list
whatisthis <- mget(paste0("wave", 1:6))
sapply(whatisthis, sex_fun)
```

```
##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -10 not in demodiff                   0     0     0     0     0     0
## -7 Incomplete data                    0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## 1 Male                             3029  2197  1905  1668  1493  1342
## 2 Female                           3172  2339  2050  1813  1626  1477
```

```r
# sapply: loop over a list, evaluate a function on each element,
# and show the result in a table
```

---

#First, check data

- you can write the following functions

```r
# Tabulate relationship status (relstat) in every wave
relstat_fun <- function(df) {
  table(as_factor(df$relstat))
}
sapply(mget(paste0("wave", 1:6)), relstat_fun)

# Tabulate self-rated health (hlt1) in every wave
health_fun <- function(df) {
  table(as_factor(df$hlt1))
}
sapply(mget(paste0("wave", 1:6)), health_fun)

# Tabulate life satisfaction (sat6) in every wave
sat_fun <- function(df) {
  table(as_factor(df$sat6))
}
sapply(mget(paste0("wave", 1:6)), sat_fun)
```

sex_gen, relstat and sat6 are coded in the same way across waves, while hlt1 is coded differently, particularly for negative values.
---

#Second, clean data

- you can repeat the following code for six waves

```r
wave1a <- wave1 %>%
  transmute(
    id, # keep id as it is
    age, # keep age as it is
    wave = as.numeric(wave),
    sex = as_factor(sex_gen), # make sex_gen a factor
    relstat = as_factor(relstat), # make relstat a factor
    relstat = case_when(
      relstat == "-7 Incomplete data" ~ NA_character_, # specify when relstat is missing
      TRUE ~ as.character(relstat)
    ) %>%
      as_factor(), # make relstat a factor again
    hlt = case_when(
      hlt1 < 0 ~ NA_real_, # specify when hlt1 is missing
      TRUE ~ as.numeric(hlt1)
    ),
    sat = case_when(
      sat6 < 0 ~ NA_real_, # specify when sat6 is missing
      TRUE ~ as.numeric(sat6)
    )
  )
```

---

#Second, clean data

- or use a function

```r
#' Clean one wave of the panel data
#'
#' @param df A wave dataset containing id, age, wave, sex_gen, relstat,
#'   hlt1 and sat6.
#' @return The cleaned wave with the columns id, age, wave, sex, relstat,
#'   hlt and sat; transmute() drops every other column.
clean_fun <- function(df) {
  df %>%
    transmute(
      id, # keep id as it is
      age, # keep age as it is
      wave = as.numeric(wave),
      sex = as_factor(sex_gen), # make sex_gen a factor
      relstat = as_factor(relstat), # make relstat a factor
      relstat = case_when(
        relstat == "-7 Incomplete data" ~ NA_character_, # specify when relstat is missing
        TRUE ~ as.character(relstat)
      ) %>%
        as_factor(), # a factor again; levels follow their order of first appearance
      hlt = case_when(
        hlt1 < 0 ~ NA_real_, # specify when hlt1 is missing
        TRUE ~ as.numeric(hlt1)
      ),
      sat = case_when(
        sat6 < 0 ~ NA_real_, # specify when sat6 is missing
        TRUE ~ as.numeric(sat6)
      )
    )
}

wave1a <- clean_fun(wave1)
wave2a <- clean_fun(wave2)
wave3a <- clean_fun(wave3)
wave4a <- clean_fun(wave4)
wave5a <- clean_fun(wave5)
wave6a <- clean_fun(wave6)
```

---

#Take home

1. use the function `htmlreg` under the "texreg" package to make your regression table nicer
2. use the function `tidy()` under the "broom" package to turn your regression results into a dataset
3. use ggplot to plot your result
  - point chart: `geom_point()`
  - point with confidence interval: `geom_pointrange()`
4. clean multiple datasets:
  - define your functions
  - use `sapply()`

---
class: center, middle

#[Exercise](https://rpubs.com/fancycmn/1233012)