class: center, middle, inverse, title-slide .title[ # Advanced quantitative data analysis ] .subtitle[ ## OLS ] .author[ ### Mengni Chen ] .institute[ ### Department of Sociology, University of Copenhagen ] --- <style type="text/css"> .remark-slide-content { font-size: 20px; padding: 20px 80px 20px 80px; } .remark-code, .remark-inline-code { background: #f0f0f0; } .remark-code { font-size: 14px; } </style> #Let's get ready ```r #install two new packages install.packages("Rmisc") #provide mean with confidence interval install.packages("estimatr") #for weighted estimation of mean, sd, variance install.packages("texreg") #for weighted correlation coefficient ``` ```r library(tidyverse) # see session 3. library(haven) # Import data, see session 3. library(Hmisc) # Weighting, see session 4. library(estimatr) # Allows us to estimate (cluster-)robust standard errors. library(texreg) # Allows us to make nicely-formatted Html & Latex regression tables. library(Rmisc) #Provide mean with confidence interval library(ggplot2) # Allows us to create nice figures, will teachin in Week 43 ``` --- #Prepare data a dataset of age, sex, relationship status, life satisfaction, weight - DV: life satisfaction - IV: age, sex, relationship status ```r wave1 <- read_dta("anchor1_50percent_Eng.dta") # sample size =6201 ``` ```r #have a initial look at the variables table(wave1$sat6) wave1$sat6 %>% as_factor() %>% table() table(wave1$sex_gen) wave1$sex_gen %>% as_factor() %>% table() table(wave1$relstat) wave1$relstat %>% as_factor() %>% table() ``` --- #Prepare data a dataset of age, sex, relationship status, life satisfaction, weight - DV: life satisfaction - IV: age, sex, relationship status ```r wave1b <- wave1 %>% transmute( age=zap_labels(age), #Independent variable sat6=case_when(sat6<0 ~ as.numeric(NA), #specify when sat should be considered missing TRUE ~ as.numeric(sat6)) %>% zap_label(), #remove labels of sat6 cdweight=zap_label(cdweight), #cdweight is the variable telling the weight sex_gen=as_factor(sex_gen) %>% fct_drop(), #treat sex_gen as categorical, and drop unused level relstat=as_factor(relstat), #treat relationship status as categorical relstat=case_when(relstat=="-7 Incomplete data" ~ as.character(NA), TRUE ~ as.character(relstat)) %>% #specify when relstat should be considered missing as_factor() %>% #make relstat as a factor fct_drop() #drop unused levels in relstat ) %>% drop_na() #drop all observations with missing values in the sample # sample size change from 6201 to 6162 ``` --- #The relationship between marital status and life satisfaction - Generate a new variable for marital status ```r wave1c <- wave1b %>% mutate( marital1=case_when( relstat %in% c("1 Never married single","2 Never married LAT","3 Never married COHAB") ~ "Nevermarried", # when relstat has any of the three situations, I assign "Nevermarried" to new variable "marital1" relstat %in% c("4 Married COHAB","5 Married noncohabiting") ~ 'Married', # when relstat has any of the two situations, I assign "Married" to new variable "marital1" relstat %in% c("6 Divorced/separated single","7 Divorced/separated LAT","8 Divorced/separated COHAB") ~ 'Divorced', # when relstat has any of the three situations, I assign "Divorced" to new variable "marital1" relstat %in% c("9 Widowed single","10 Widowed LAT") ~ 'Widow' # when relstat has any of the two situations, I assign "Widow" to new variable "marital1" ) %>% as_factor()# I treat marital1 as a categorical variable ) %>% filter(marital1!= "Widow") #only 4 cases are widowed, dropping. # sample size change to 6158 after dropping those widowed ``` --- #Exploring the relationship between life satisfaction and marital status First, we can estimate the average sat6 across marital status to have a first look .pull-left[ ```r table(wave1c$marital1) #check the frequency of each category ``` ``` ## ## Nevermarried Married Divorced Widow ## 4117 1757 284 0 ``` ```r wave1c$marital1 <- fct_drop(wave1c$marital1) #drop unused level (i.e. widow) mar_sat<- wave1c %>% group_by(marital1) %>% dplyr::summarise( mean_sat6=mean(sat6), #calculate the mean of sat6 by marital1 ) mar_sat ``` ``` ## # A tibble: 3 × 2 ## marital1 mean_sat6 ## <fct> <dbl> ## 1 Nevermarried 7.60 ## 2 Married 7.80 ## 3 Divorced 6.68 ``` ] .pull-right[ ```r ave_sat_ci<- CI(wave1c$sat6) #CI() is a function under "Rmisc" package ave_sat_ci ``` ``` ## upper mean lower ## 7.661064 7.617408 7.573753 ``` ```r mar_sat<- wave1c %>% group_by(marital1) %>% dplyr::summarise( mean_sat6=mean(sat6), uci=CI(sat6)[1], #select the 1st element of uci which indicates upper CI lci=CI(sat6)[3]) #select the 3rd element of uci which indicates lower CI mar_sat ``` ``` ## # A tibble: 3 × 4 ## marital1 mean_sat6 uci lci ## <fct> <dbl> <dbl> <dbl> ## 1 Nevermarried 7.60 7.66 7.55 ## 2 Married 7.80 7.88 7.73 ## 3 Divorced 6.68 6.93 6.42 ``` ] --- #Exploring the relationship between life satisfaction and marital status .pull-left[ <img src="https://github.com/fancycmn/Slide5/blob/main/S5_Pic1.png?raw=true" width="90%" style="display: block; margin: auto;" > ] .pull-right[ ```r #In Week 43, we will talk about the visualization. ggplot(data = mar_sat_ci, mapping = aes(y = mean_sat6, x = marital1)) + geom_bar(stat = "identity") + geom_errorbar(mapping = aes(ymin = lci, ymax = uci), width = 0.55) + labs(y = "Life satisfaction", x = "Marital1")+ scale_y_continuous(limits=c(0, 10),breaks=seq(0,10,2)) ``` ] --- #Model the relationship between marital status and life satisfaction Coming up with a model of life satisfaction: Can life satisfaction (i.e., `\(y\)`) be expressed as a function (i.e., `\(f()\)`) of another variable (i.e., `\(x\)`)? `$$\hat{y} = f(x)$$` The two critical questions are thus: 1. What is `\(f()\)`? 2. What is `\(x\)`? --- # A linear model of `\(y\)` <img src="https://merlin-intro-r.netlify.app/9-ols-i/img/LinearModel.png" width="60%" style="display: block; margin: auto;"> x could be age, health conditions, marital status, income --- # Ordinary Least Squares .pull-left[ We find `\(\alpha\)` and `\(\beta\)` by choosing the one line that minimizes the *residual sum of squares*: `$$\begin{align*} \ & min \text{RSS} = \min \sum_{i=1}^{n} e_{i}^{2} \\ \ & = \min \sum_{i=1}^{n} y_{i} - \hat{y_{i}} \\ \ & = \min \sum_{i=1}^{n} (y_{i} - (\alpha + \beta x_{i}))^{2} \end{align*}$$` ] .pull-right[ <img src="https://miro.medium.com/max/625/0*gglavDlTUWKn4Loe" width="100%" style="display: block; margin: auto;"> ] --- #Assumptions of OLS -- - About **the sample** - Assumption 1: The sample taken for the linear regression model must be drawn randomly from the population. -- - About **the relationship between Y and X** - Assumption 2: The dependent variable (Y) is a linear function of independent variables (X) and the error term. -- - About **the independent variables** - Assumption 3: There is no multi-collinearity (or perfect collinearity). In other words, there should be no linear relationship between the independent variables. -- - About **the error term** - Assumption 4: The distribution of error terms has zero mean and doesn’t depend on the independent variables(Xs). There must be no relationship between the Xs and the error term. - Assumption 5: There is homoscedasticity and no autocorrelation. - Assumption 6: Error terms should be normally distributed. --- # The `lm()` funcion ```r # Estimate a linear OLS model and assign the results to object ols. ols1 <- lm(formula = sat6 ~ age, data = wave1c) # sat6 is DV, should be written on the left side of ~ # age is IV, should be written on the right side of ~ summary(ols1) ``` ``` ## ## Call: ## lm(formula = sat6 ~ age, data = wave1c) ## ## Residuals: ## Min 1Q Median 3Q Max ## -7.8676 -0.8215 0.2016 1.1785 2.6630 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 8.213634 0.071954 114.151 <2e-16 *** ## age -0.023069 0.002649 -8.708 <2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 1.737 on 6156 degrees of freedom ## Multiple R-squared: 0.01217, Adjusted R-squared: 0.01201 ## F-statistic: 75.84 on 1 and 6156 DF, p-value: < 2.2e-16 ``` --- #Weighted linear OLS - The weights argument allows you to use weights. ```r # Estimate a weighted linear OLS model. ols2_weight <- lm(formula = sat6 ~ age, data = wave1c, weights = cdweight)#specify weight here summary(ols2_weight) ``` ``` ## ## Call: ## lm(formula = sat6 ~ age, data = wave1c, weights = cdweight) ## ## Weighted Residuals: ## Min 1Q Median 3Q Max ## -12.7388 -0.6955 0.2364 1.1134 4.9252 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 8.227414 0.077310 106.42 <2e-16 *** ## age -0.025337 0.002766 -9.16 <2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 1.787 on 6156 degrees of freedom ## Multiple R-squared: 0.01345, Adjusted R-squared: 0.01329 ## F-statistic: 83.9 on 1 and 6156 DF, p-value: < 2.2e-16 ``` --- #Weighted linear OLS with `lm_robust` - OLS based on a cross-sectional dataset often has issue of [heteroscedasticity](https://rpubs.com/cyobero/187387). Therefore,inference from the standard errors, t-values, p-values, and confidence intervals is likely to be wrong. - We thus need to estimate heteroskedasticity-consistent standard errors or simply robust standard errors, by using `estimatr::lm_robust()`. ```r ols2_weight_robust <- lm_robust(formula = sat6 ~ age, data = wave1c, weights = cdweight) summary(ols2_weight_robust) ``` ``` ## ## Call: ## lm_robust(formula = sat6 ~ age, data = wave1c, weights = cdweight) ## ## Weighted, Standard error type: HC2 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF ## (Intercept) 8.22741 0.076592 107.419 0.000e+00 8.07727 8.37756 6156 ## age -0.02534 0.003011 -8.415 4.833e-17 -0.03124 -0.01943 6156 ## ## Multiple R-squared: 0.01345 , Adjusted R-squared: 0.01329 ## F-statistic: 70.81 on 1 and 6156 DF, p-value: < 2.2e-16 ``` --- #Categorical predictors Factors are automatically treated as dummies, with the first level taken as reference. Thus if you want to change the reference, you need to recode the levels of the factor using the `forcats` package. ```r #as marital1 is a factor in wave1c, it is automatically treated as dummies. ols3_cat <- lm_robust(formula = sat6 ~ age + marital1, data = wave1c, weights = cdweight) summary(ols3_cat) ``` ``` ## ## Call: ## lm_robust(formula = sat6 ~ age + marital1, data = wave1c, weights = cdweight) ## ## Weighted, Standard error type: HC2 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF ## (Intercept) 8.54409 0.092680 92.189 0.000e+00 8.36241 8.72578 6154 ## age -0.04426 0.004283 -10.334 7.863e-25 -0.05266 -0.03587 6154 ## marital1Married 0.73801 0.077592 9.511 2.635e-21 0.58590 0.89012 6154 ## marital1Divorced -0.52564 0.168952 -3.111 1.872e-03 -0.85684 -0.19443 6154 ## ## Multiple R-squared: 0.04629 , Adjusted R-squared: 0.04583 ## F-statistic: 58.52 on 3 and 6154 DF, p-value: < 2.2e-16 ``` --- #Categorical predictors recode the levels of marital1 ```r wave1c$marital1 <- fct_relevel(wave1c$marital1, "Divorced", "Nevermarried", "Married") #use fct_level function to re-level marital1, now divorced is the first level and will be treated as the reference category ols3_cat_relevel <- lm_robust(formula = sat6 ~ age + marital1, data = wave1c, weights = cdweight) summary(ols3_cat_relevel) ``` ``` ## ## Call: ## lm_robust(formula = sat6 ~ age + marital1, data = wave1c, weights = cdweight) ## ## Weighted, Standard error type: HC2 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper ## (Intercept) 8.01846 0.215844 37.149 9.858e-273 7.59532 8.44159 ## age -0.04426 0.004283 -10.334 7.863e-25 -0.05266 -0.03587 ## marital1Nevermarried 0.52564 0.168952 3.111 1.872e-03 0.19443 0.85684 ## marital1Married 1.26364 0.161271 7.836 5.462e-15 0.94750 1.57979 ## DF ## (Intercept) 6154 ## age 6154 ## marital1Nevermarried 6154 ## marital1Married 6154 ## ## Multiple R-squared: 0.04629 , Adjusted R-squared: 0.04583 ## F-statistic: 58.52 on 3 and 6154 DF, p-value: < 2.2e-16 ``` --- #Regression tables Present sevearl regresion together; report standard errors, not confidence intervals. ```r texreg::screenreg(list(ols1,ols2_weight, ols2_weight_robust, ols3_cat), include.ci = FALSE, digits = 3) # include.ci=FALSE means don't include CI, digit=3 to specify the digits of your result. ``` ``` ## ## ======================================================================== ## Model 1 Model 2 Model 3 Model 4 ## ------------------------------------------------------------------------ ## (Intercept) 8.214 *** 8.227 *** 8.227 *** 8.544 *** ## (0.072) (0.077) (0.077) (0.093) ## age -0.023 *** -0.025 *** -0.025 *** -0.044 *** ## (0.003) (0.003) (0.003) (0.004) ## marital1Married 0.738 *** ## (0.078) ## marital1Divorced -0.526 ** ## (0.169) ## ------------------------------------------------------------------------ ## R^2 0.012 0.013 0.013 0.046 ## Adj. R^2 0.012 0.013 0.013 0.046 ## Num. obs. 6158 6158 6158 6158 ## RMSE 1.787 1.757 ## ======================================================================== ## *** p < 0.001; ** p < 0.01; * p < 0.05 ``` --- #Export regression tables ```r #Export to word texreg::htmlreg(list(ols1,ols2_weight, ols2_weight_robust, ols3_cat), include.ci = FALSE, digits = 3, file = "MyOLSModels.doc") # Word table ``` ``` ## The table was written to the file 'MyOLSModels.doc'. ``` ```r #Export to html texreg::htmlreg(list(ols1,ols2_weight, ols2_weight_robust, ols3_cat), include.ci = FALSE, file = "MyOLSModels.html") #html ``` ``` ## The table was written to the file 'MyOLSModels.html'. ``` ```r #Export to excel texreg::htmlreg(list(ols1,ols2_weight, ols2_weight_robust, ols3_cat), include.ci = FALSE, file = "MyOLSModels.xls") #excel ``` ``` ## The table was written to the file 'MyOLSModels.xls'. ``` --- #Take home 1. Exploratory analysis: frequency, mean, ci 2. Linear regression - With weight - Robust standard errors 3. Generate regression tables 4. Important code - lm(formula = your DV ~ your IV, data = your dataset) - lm_robust(formula = your DV ~ your IV, data = your dataset, weights= your weight) - texreg::screenreg() --- class: center, middle #[Exercise](https://rpubs.com/fancycmn/1090534)