class: center, middle, inverse, title-slide .title[ # Advanced quantitative data analysis ] .subtitle[ ## Visualize your regression result ] .author[ ### Mengni Chen ] .institute[ ### Department of Sociology, University of Copenhagen ] --- <style type="text/css"> .remark-slide-content { font-size: 20px; padding: 20px 80px 20px 80px; } .remark-code, .remark-inline-code { background: #f0f0f0; } .remark-code { font-size: 12px; } </style> #Let's get ready ```r #install.packages("broom") # Add packages to library library(tidyverse) # Add the tidyverse package to my current library. library(haven) # Import data. library(Hmisc) # Weighting library(ggplot2) # Allows us to create nice figures. library(estimatr) # Allows us to estimate (cluster-)robust standard errors. library(texreg) # Allows us to make nicely-formatted Html & Latex regression tables. library(broom) # Allows us to turn model objects into tibbles. ``` --- #Prepare data a dataset of age, sex, relationship status, life satisfaction, weight - DV: life satisfaction - IV: age, sex, relationship status ```r wave1 <- read_dta("anchor1_50percent_Eng.dta") # sample size =6201 # wave1b <- wave1 %>% transmute( age=zap_labels(age), #Independent variable sat6=case_when(sat6<0 ~ as.numeric(NA), #specify when sat should be considered missing TRUE ~ as.numeric(sat6)) %>% zap_label(), #remove labels of sat6 cdweight=zap_label(cdweight), #cdweight is the variable telling the weight sex_gen=as_factor(sex_gen) %>% fct_drop(), #treat sex_gen as categorical, and drop unused level relstat=as_factor(relstat), #treat relationship status as categorical relstat=case_when(relstat=="-7 Incomplete data" ~ as.character(NA), TRUE ~ as.character(relstat)) %>% #specify when relstat should be considered missing as_factor() %>% #make relstat as a factor fct_drop() #drop unused levels in relstat ) %>% drop_na() #drop all observations with missing values in the sample # sample size change from 6201 to 6162 ``` --- #Prepare data ```r wave1c <- wave1b %>% mutate( marital1=case_when( relstat %in% c("1 Never married single","2 Never married LAT","3 Never married COHAB") ~ "Nevermarried", # when relstat has any of the three situations, I assign "Nevermarried" to new variable "marital1" relstat %in% c("4 Married COHAB","5 Married noncohabiting") ~ 'Married', # when relstat has any of the two situations, I assign "Married" to new variable "marital1" relstat %in% c("6 Divorced/separated single","7 Divorced/separated LAT","8 Divorced/separated COHAB") ~ 'Divorced', # when relstat has any of the three situations, I assign "Divorced" to new variable "marital1" relstat %in% c("9 Widowed single","10 Widowed LAT") ~ 'Widow' # when relstat has any of the two situations, I assign "Widow" to new variable "marital1" ) %>% as_factor()# I treat marital1 as a categorical variable ) %>% filter(marital1!= "Widow") #I drop observations when he/she is widowed. # sample size change to 6158 after dropping those widowed ``` --- #Prepare data ```r ols1 <- lm_robust(formula = sat6 ~ age + marital1*sex_gen, data = wave1c, weights = cdweight) summary(ols1) ``` ``` ## ## Call: ## lm_robust(formula = sat6 ~ age + marital1 * sex_gen, data = wave1c, ## weights = cdweight) ## ## Weighted, Standard error type: HC2 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 8.51700 0.096867 87.9243 0.000e+00 ## age -0.04414 0.004291 -10.2876 1.267e-24 ## marital1Married 0.77335 0.101589 7.6126 3.091e-14 ## marital1Divorced -0.67740 0.292832 -2.3133 2.074e-02 ## sex_gen2 Female 0.05380 0.061767 0.8711 3.837e-01 ## marital1Married:sex_gen2 Female -0.07577 0.111197 -0.6814 4.956e-01 ## marital1Divorced:sex_gen2 Female 0.24337 0.336766 0.7227 4.699e-01 ## CI Lower CI Upper DF ## (Intercept) 8.32710 8.70689 6151 ## age -0.05255 -0.03573 6151 ## marital1Married 0.57420 0.97250 6151 ## marital1Divorced -1.25146 -0.10335 6151 ## sex_gen2 Female -0.06728 0.17489 6151 ## marital1Married:sex_gen2 Female -0.29376 0.14221 6151 ## marital1Divorced:sex_gen2 Female -0.41681 0.90355 6151 ## ## Multiple R-squared: 0.04672 , Adjusted R-squared: 0.04579 ## F-statistic: 29.78 on 6 and 6151 DF, p-value: < 2.2e-16 ``` --- #Make your output nicer ```r htmlreg(list(ols1), include.ci = FALSE, custom.coef.names = c("Intercept", "Age", "Married", "Divorced", "Female", "Female x married", "Female x divorced"), caption = "My regression table", caption.above = TRUE, single.row = TRUE, digits = 3, file = "MyOLSModels1.html") ``` ``` ## The table was written to the file 'MyOLSModels1.html'. ``` --- #Further processing of model results `broom::tidy()` turns a model object into a tibble containing coefficients and inference stats. ```r tidy(ols1) ``` ``` ## term estimate std.error statistic ## 1 (Intercept) 8.51699721 0.096867364 87.9243206 ## 2 age -0.04414123 0.004290704 -10.2876439 ## 3 marital1Married 0.77335402 0.101588634 7.6126038 ## 4 marital1Divorced -0.67740334 0.292831651 -2.3132859 ## 5 sex_gen2 Female 0.05380405 0.061766576 0.8710869 ## 6 marital1Married:sex_gen2 Female -0.07577283 0.111197220 -0.6814274 ## 7 marital1Divorced:sex_gen2 Female 0.24337293 0.336765716 0.7226773 ## p.value conf.low conf.high df outcome ## 1 0.000000e+00 8.32710330 8.70689112 6151 sat6 ## 2 1.267262e-24 -0.05255251 -0.03572995 6151 sat6 ## 3 3.090888e-14 0.57420477 0.97250328 6151 sat6 ## 4 2.073974e-02 -1.25145579 -0.10335089 6151 sat6 ## 5 3.837407e-01 -0.06728004 0.17488814 6151 sat6 ## 6 4.956268e-01 -0.29375827 0.14221261 6151 sat6 ## 7 4.699056e-01 -0.41680565 0.90355152 6151 sat6 ``` ```r data_ols1 <- tidy(ols1) ``` --- #Visualize your result - Make your result a dataset to plot ```r plotdata <- ols1 %>% tidy() %>% # Use the ols3a model object, then tidy it, then mutate(term = fct_recode(as_factor(term), # Recode predictor names, then "Intercept" = "(Intercept)", "Age" = "age", "Married" = "marital1Married", "Divorced" = "marital1Divorced", "Female" = "sex_gen2 Female", "Female x married"= "marital1Married:sex_gen2 Female", "Female x divorced" = "marital1Divorced:sex_gen2 Female" )) %>% select(term, estimate, conf.low, conf.high) plotdata ``` ``` ## term estimate conf.low conf.high ## 1 Intercept 8.51699721 8.32710330 8.70689112 ## 2 Age -0.04414123 -0.05255251 -0.03572995 ## 3 Married 0.77335402 0.57420477 0.97250328 ## 4 Divorced -0.67740334 -1.25145579 -0.10335089 ## 5 Female 0.05380405 -0.06728004 0.17488814 ## 6 Female x married -0.07577283 -0.29375827 0.14221261 ## 7 Female x divorced 0.24337293 -0.41680565 0.90355152 ``` --- #use ggplot2 to plot your result ```r ggplot(data = <DATA>, mapping = aes(x=, y=) + # specify dataset, x, and y to ggplot <GEOM_FUNCTION>()+ # specify types of your chart, e.g. bar, point, line chart <COORDINATE_FUNCTION> # Change the default coordinate system, swap x and y axis #note: + is the symbol to connect different section of code ``` ggplot2 contains many geom functions, which put layers of different types of geometric objects (e.g., points, bars, lines) over a coordinate system. - All geom functions depend on the `mapping` argument. It is paired with `aes()`, which stands for "aesthetic". Aesthetics are the visual properties of your plot. - The most important aesthetics of any graph are the y-axis and the x-axis. Therefore,`aes()`depends on `x` and `y`, because these specify which variable to map to the y-axis and which one to map to the x-axis. - But of course, aesthetics also means, among others, color, shape, size, and so on. --- #Now you can plot your data ```r ggplot(data = plotdata) ## Create an empty coordinate system for the dataset "plotdata". ``` <img src="https://github.com/fancycmn/slide6/blob/main/S6_Pic3.png?raw=true" width="60%" style="display: block; margin: auto;" > --- #spefiy the layer: what are x and y, in bar, point, or other forms - plot a point chart ```r ggplot(data = plotdata, #specify your dataset by "data=" aes(x = term, y = estimate)) + #specifiy your x and y in the plot geom_point() #specify plot result in point ``` <img src="https://github.com/fancycmn/slide6/blob/main/S6_Pic4.png?raw=true" width="60%" style="display: block; margin: auto;" > --- #spefiy the layer: what are x and y, in bar, point, or other forms - Modify your point chart, size and color ```r ggplot(data = plotdata, #specify your dataset by "data=" aes(x = term, y = estimate)) + #specifiy your x and y in the plot geom_point(size=5, color="blue" ) #change the size and color of the point ``` <img src="https://github.com/fancycmn/slide6/blob/main/S6_Pic5.png?raw=true" width="60%" style="display: block; margin: auto;" > --- #spefiy the layer: what are x and y, in bar, point, or other forms - plot a bar chart ```r ggplot(data = plotdata, #specify your dataset by "data=" aes(x = term, y = estimate)) + #specifiy your x and y in the plot geom_col() #specify plot result in column ``` <img src="https://github.com/fancycmn/slide6/blob/main/S6_Pic6.png?raw=true" width="60%" style="display: block; margin: auto;" > --- #spefiy the layer: what are x and y, in bar, point, or other forms - Modify your bar chart ```r ggplot(data = plotdata, #specify your dataset by "data=" aes(x = term, y = estimate)) + #specify your x and y in the plot geom_col(color="red", fill="blue") + #change the outline color and fill color in the bar coord_flip() #swap x and y axis ``` <img src="https://github.com/fancycmn/slide6/blob/main/S6_Pic7.png?raw=true" width="60%" style="display: block; margin: auto;" > --- #Plot it with confidence interval - plot a point chart with confidence interval ```r ggplot(data = plotdata, aes(x = term, y = estimate, ymin = conf.low, ymax = conf.high)) + geom_pointrange() + #this is the code to plot point with confidence interval, but you have to specify ymin and ymax for the CI coord_flip() ``` <img src="https://github.com/fancycmn/slide6/blob/main/S6_Pic8.png?raw=true" width="60%" style="display: block; margin: auto;" > --- #Plot it with confidence interval - add a reference line ```r ggplot(data = plotdata, aes(x = term, y = estimate, ymin = conf.low, ymax = conf.high)) + geom_pointrange() + #this is the code to ploty point with confidence interval, but you have to specify ymin and ymax for the CI coord_flip() + geom_hline(yintercept = 0, color = "red", lty = "dashed") # yintercept is to make y=0 as the ref. line, lty is to specify the line is a dahed line ``` <img src="https://github.com/fancycmn/slide6/blob/main/S6_Pic10.png?raw=true" width="60%" style="display: block; margin: auto;" > --- #Plot it with confidence interval - plot a bar chart with confidence interval ```r ggplot(data = plotdata, mapping = aes(x = term, y = estimate, ymin = conf.low, ymax = conf.high)) + geom_col() + geom_errorbar(width = 0.55) # this is the code to plot bar with CI, width is to specify the size of errorbar ``` <img src="https://github.com/fancycmn/slide6/blob/main/S6_Pic9.png?raw=true" width="60%" style="display: block; margin: auto;" > --- #Take home 1. use the function `htmlreg` under the "texreg" package to make your regression table nicer 2. use the function `tidy()` under the "broom" package to make your regression as a dataset 3. use ggplot to plot your result - point chart: `geom_point()` - bar chart: `geom_col()` - point with confidence interval: `geom_pointrange()` - bar with condfidence interval: `geom_errorbar()` --- class: center, middle #[Exercise](https://rpubs.com/fancycmn/1103889)