library(haven)
## Warning: package 'haven' was built under R version 4.2.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(rstatix)
## Warning: package 'rstatix' was built under R version 4.2.3
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks rstatix::filter(), stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(stargazer)
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(jtools)
## Warning: package 'jtools' was built under R version 4.2.3
library(descr)
## Warning: package 'descr' was built under R version 4.2.3
library(ggeffects)
## Warning: package 'ggeffects' was built under R version 4.2.3
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.2.3
# Specify the URL of the Stata file
url <- "https://github.com/drCES/winter_603/raw/main/California%20Real%20Estate%20Data%20-%202015%20Ahmed.dta"

# Import the data into R
data <- read_dta(url)

# View the first few rows of the data
head(data)
## # A tibble: 6 × 7
##   bedrooms bathrooms  sqft   zip  price bedrooms_5 bathrooms_rnd
##      <dbl>     <dbl> <dbl> <dbl>  <dbl> <dbl+lbl>          <dbl>
## 1        2       2    1600 92276 147000 2 [1/2]                2
## 2        2       2    1440 92276  99900 2 [1/2]                2
## 3        2       2    1344 92276  89500 2 [1/2]                2
## 4        2       1.5  1056 92276  79900 2 [1/2]                2
## 5        2       2    1440 92276  94900 2 [1/2]                2
## 6        2       1    1016 94501 679000 2 [1/2]                1

Linear Regression

In lecture, we have been discussing bivariate and multivariate regression. In a previous R tutorial, we reviewed how to estimate a bivariate linear regression. Here, we expand our model to include more than one independent variable. The basic procedure is identical to the bivariate regression except we simply add additional IVs to the model using + sign in our code.

We also look at three ways to review the regression output. First, we use the Base R function summary, which shows the results from the regression in a basic format. We also use stargazer and export_summs from the jtools package to review the results. Stargazer and Jtools help to create more user friendly regression tables. In Week 5, we will more closely review how to customize these tables but for now we will simply review the code on how to use them.

bi_ols<-lm(price ~ sqft, data=data)
multi_ols<-lm(price ~ sqft + bathrooms_rnd, data=data)
summary(multi_ols)
## 
## Call:
## lm(formula = price ~ sqft + bathrooms_rnd, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1019350  -226467   -94665   133561  5222842 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -131713.15   51800.40  -2.543   0.0113 *  
## sqft               93.96      21.45   4.380 1.43e-05 ***
## bathrooms_rnd  178763.95   24854.74   7.192 2.17e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 423900 on 532 degrees of freedom
## Multiple R-squared:  0.309,  Adjusted R-squared:  0.3064 
## F-statistic: 118.9 on 2 and 532 DF,  p-value: < 2.2e-16
export_summs(bi_ols, multi_ols) #Jtools Table
Model 1Model 2
(Intercept)105974.09 *  -131713.15 *  
(41744.52)   (51800.40)   
sqft204.40 ***93.96 ***
(15.68)   (21.45)   
bathrooms_rnd       178763.95 ***
       (24854.74)   
N535       535       
R20.24    0.31    
*** p < 0.001; ** p < 0.01; * p < 0.05.
stargazer (bi_ols, multi_ols, type="text", round=3) #Stargazer table
## 
## =====================================================================
##                                    Dependent variable:               
##                     -------------------------------------------------
##                                           price                      
##                               (1)                      (2)           
## ---------------------------------------------------------------------
## sqft                       204.401***               93.960***        
##                             (15.678)                 (21.453)        
##                                                                      
## bathrooms_rnd                                     178,763.900***     
##                                                    (24,854.740)      
##                                                                      
## Constant                 105,974.100**            -131,713.200**     
##                           (41,744.520)             (51,800.400)      
##                                                                      
## ---------------------------------------------------------------------
## Observations                  535                      535           
## R2                           0.242                    0.309          
## Adjusted R2                  0.240                    0.306          
## Residual Std. Error  443,650.500 (df = 533)   423,934.300 (df = 532) 
## F Statistic         169.974*** (df = 1; 533) 118.941*** (df = 2; 532)
## =====================================================================
## Note:                                     *p<0.1; **p<0.05; ***p<0.01
## 
## =
## 3
## -

Changing Independent Variable Types

Above, we input the numeric variable bathrooms_rnd as a continuous variable. However, with discrete numeric variables that are not purely continuous, using the variable as a continuous variable makes assumptions that might not reflect reality. Specifically, and as discussed in the model building tutorial, inputting an IV as a continuous variable assumes that a 1-unit increase in X has the same effect on Y at all points on the x-axis. With discrete numeric variables like the number of bathrooms in a home, the influence of moving from 1 to 2 bathrooms might not be the same as moving from 4 to 5 bathrooms. In this case, we can input the discrete numeric variable as a factor by directly specifying that in the code, e.g., factor(bathrooms_rnd).

By changing the numeric variable to a factor, we effectively include a-1 new variables in our output, where a = the number of discrete options in the factor variable. With the bathrooms_rnd variable, we have 7 levels ranging from 1 to 7. This means our new model, including bathrooms_rnd as a factor, would have 6 independent variables in the model rather than 1.

From an analytic point of view, the inclusion of numeric variables as factors can take up a lot of degrees of freedom, causing a loss of power in your analysis. Because of this, you should not default to using this approach but rather evaluate the regression first using the variable as a continuous one and then compare those results to the model using the variable as a factor. If the factor results show roughly the same linear increase between each 1-unit increase along the x variable, then you should use the variable as a continuous one.

freq(data$bathrooms_rnd, plot=F)
## data$bathrooms_rnd 
##       Frequency  Percent
## 1            24   4.4860
## 2           231  43.1776
## 3           162  30.2804
## 4            80  14.9533
## 5            30   5.6075
## 6             5   0.9346
## 7             3   0.5607
## Total       535 100.0000
multi_ols2<-lm(price ~ sqft + factor(bathrooms_rnd), data=data)

stargazer (multi_ols, multi_ols2, type="text")
## 
## =======================================================================
##                                      Dependent variable:               
##                        ------------------------------------------------
##                                             price                      
##                                  (1)                      (2)          
## -----------------------------------------------------------------------
## sqft                          93.960***                86.987***       
##                                (21.453)                (21.548)        
##                                                                        
## bathrooms_rnd               178,763.900***                             
##                              (24,854.740)                              
##                                                                        
## factor(bathrooms_rnd)2                                -37,288.430      
##                                                      (90,355.450)      
##                                                                        
## factor(bathrooms_rnd)3                               181,202.000*      
##                                                      (95,495.170)      
##                                                                        
## factor(bathrooms_rnd)4                              396,429.900***     
##                                                      (108,793.900)     
##                                                                        
## factor(bathrooms_rnd)5                              473,971.000***     
##                                                      (130,832.400)     
##                                                                        
## factor(bathrooms_rnd)6                               526,037.100**     
##                                                      (218,118.600)     
##                                                                        
## factor(bathrooms_rnd)7                             1,498,756.000***    
##                                                      (271,963.900)     
##                                                                        
## Constant                    -131,713.200**          245,700.000***     
##                              (51,800.400)            (90,392.150)      
##                                                                        
## -----------------------------------------------------------------------
## Observations                     535                      535          
## R2                              0.309                    0.327         
## Adjusted R2                     0.306                    0.319         
## Residual Std. Error     423,934.300 (df = 532)  420,204.000 (df = 527) 
## F Statistic            118.941*** (df = 2; 532) 36.659*** (df = 7; 527)
## =======================================================================
## Note:                                       *p<0.1; **p<0.05; ***p<0.01

Here, we see the results for the two separate OLS models. Model 1 includes the number of bathrooms as a continuous variable hence the one coefficient to interpret. Model 2 includes the number of bathrooms as a factor variable, which creates 6 coefficients in the new model. It created 6 coefficients because there are 7 unique values in the bathrooms_rnd variable.

The interpretation of the coefficient on bathrooms_rnd in Model 1 is more straightforward. Holding all other variables constant, the home selling price is expected to increase by approximately $179,000 for each additional bathroom. For Model 2, we have to evaluate each individual coefficient, which is a comparison for that level of the factor to the reference level, typically the first option in the variable. Here, we would interpret the coefficient for each number of bathrooms from 2 to 7 compared to homes with 1 bathroom. For instance, the coefficient for a 2-bathroom home is not significant and negative. This indicates that, holding the size of the home constant, there is not a significant difference in average home selling price for a 1 or 2-bathroom home. However, as the number of bathrooms increases to 3, the coefficient is positive and significant, indicating that homes of the same size with 3 bathrooms compared to 1 sell on average for roughly $181,000 more. Each individual coefficient needs to be separately interpreted when included in the model like this.

You can also calculate and graph the predicted home price for both models using the ggeffects package along with the ggpredict function.

continuous<-ggpredict(multi_ols, terms=c("bathrooms_rnd [1:7, by=1]")) #Manually set break points

p1 <- ggplot(continuous, aes(x = x, y = predicted)) +
  geom_line(color = "black") +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.05) +
  labs(title = "Predicted Home Selling Price by Bathroom",
       x = "# of Bathrooms",
       y = "Selling Price") +
  theme_minimal() +
  scale_x_continuous(breaks = 1:7)


p1

factor<-ggpredict(multi_ols2, terms=c("bathrooms_rnd [1:7, by=1]")) #Manually set break points

p2 <- ggplot(factor, aes(x = x, y = predicted)) +
  geom_line(color = "black") +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.05) +
  labs(title = "Predicted Home Selling Price by Bathroom",
       x = "# of Bathrooms",
       y = "Selling Price") +
  theme_minimal() +
  scale_x_continuous(breaks = 1:7)


p2

combined_plot <- p1 + p2 + plot_layout(ncol = 2)
combined_plot

When we put the two predicted home selling price plots side-by-side, we see how our conclusions would slightly change based on how we include the number of bathrooms. On the left, OLS assumes for a continuous IV that a 1-unit increase has the same impact on Y regardless of where you are on the x-axis. On the right, because we included the number of bathrooms as a factor, we do not have that same underlying assumption. The relationship between bathrooms and home selling price, holding square footage the same, is flat at first before a step increase in selling price from 2 to 4 bathrooms before more slowly increasing after that. This pattern we see on the right is indicative of an independent variable that does not have a linear relationship with the DV. In this model, since we have a sufficient sample size and a simple model, we would want to use the results from the model with the variable included as a factor rather than a continuous variable, since that better reflects the true relationship between the two variables.