This notebook shows the computer related work created for Assignment 1 in EconStats tutorial.

Setup

# load libraries
require("ggplot2")
## Loading required package: ggplot2
require("ggthemes")
## Loading required package: ggthemes
# unloadNamespace("cowplot")
# disable scientific notation
options(scipen = 999)

Problem 2 Plots

https://rpsychologist.com/d3/tdist/

2.c

# Drawing Student's T distribution with rejection areas and critical t and p values
ggplot(data.frame(x = c(-4, 4)), aes(x = x)) +
  stat_function(fun = dt,args = list(df = 38), geom = "area", fill="gray", alpha=0.5) +
  geom_segment(aes(x =  1.921, y = 0, xend =  1.921, yend = 0.064), color="skyblue3", size=1) +
  stat_function(fun = dt, args = list(df = 38), size=1) +
  labs(title="Student's t Distribution, dof=38", x="t", y="density") +
  stat_function(fun = dt,args = list(df = 38), 
                xlim = c(-4,-2.02), geom = "area", fill="red", alpha=0.5) +
  stat_function(fun = dt,args = list(df = 38), 
                xlim = c(2.02,4), geom = "area", fill="red", alpha=0.5) +
  theme_stata()

2.d

# Drawing Student's T distribution with rejection areas and critical t and p values
ggplot(data.frame(x = c(-4, 4)), aes(x = x)) +
  stat_function(fun = dt,args = list(df = 38), geom = "area", fill="gray", alpha=0.5) +
  stat_function(fun = dt,args = list(df = 38), 
                xlim = c(1.69,4), geom = "area", fill="red", alpha=0.35) +
  geom_segment(aes(x =  1.921, y = 0, xend =  1.921, yend = 0.064), color="skyblue1", size=1) +
  labs(title="Student's t Distribution, dof=38", x="t", y="density") +
  stat_function(fun = dt, args = list(df = 38), size=1) + 
  theme_stata()

Problem 3

# downloaded stockton2 dataset from: https://github.com/ccolonescu/PoEdata/blob/master/data/stockton2.rda
load("./data/stockton2.rda")
dfs <- stockton2
rm(stockton2)
summary(dfs)
##      price             sqft           beds          baths      
##  Min.   : 50000   Min.   : 704   Min.   :1.00   Min.   :1.000  
##  1st Qu.: 79900   1st Qu.:1201   1st Qu.:3.00   1st Qu.:2.000  
##  Median : 99950   Median :1516   Median :3.00   Median :2.000  
##  Mean   :112811   Mean   :1612   Mean   :3.17   Mean   :2.055  
##  3rd Qu.:128000   3rd Qu.:1874   3rd Qu.:4.00   3rd Qu.:2.000  
##  Max.   :500000   Max.   :4300   Max.   :6.00   Max.   :5.000  
##       age           stories          vacant      
##  Min.   : 0.00   Min.   :1.000   Min.   :0.0000  
##  1st Qu.: 8.00   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :18.00   Median :1.000   Median :1.0000  
##  Mean   :24.56   Mean   :1.207   Mean   :0.5284  
##  3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:1.0000  
##  Max.   :96.00   Max.   :2.000   Max.   :1.0000
head(dfs)

3.a

Estimate the log-linear model ln(PRICE)=B1+B2SQFT+e. Interpret the estimated model parameters. Calculate the slope and elasticity at the sample means, if necessary.

# log-linear
lm1 <- lm(I(log(price)) ~ sqft, data=dfs)
summary(lm1)
## 
## Call:
## lm(formula = I(log(price)) ~ sqft, data = dfs)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.71030 -0.11982 -0.01392  0.11576  0.90866 
## 
## Coefficients:
##                Estimate  Std. Error t value            Pr(>|t|)    
## (Intercept) 10.59378787  0.02185000   484.8 <0.0000000000000002 ***
## sqft         0.00059596  0.00001287    46.3 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.203 on 878 degrees of freedom
## Multiple R-squared:  0.7094, Adjusted R-squared:  0.7091 
## F-statistic:  2143 on 1 and 878 DF,  p-value: < 0.00000000000000022

a: slope and elasticity

For a log-linear model, the slope is \(b_2*y\), and Elasticity is \(b_2*x\); about the sample means.

# sample means:
y_mean = mean(log(dfs$price))
x_mean = mean(dfs$sqft)
# save b2
b2_lm1 <- lm1$coefficients[2]

# log-linear slope: b_2*y
print(paste("Slope:",b2_lm1*y_mean))
## [1] "Slope: 0.00688603001597391"
# log-linear elasticity: b_2*x
print(paste("Elasticity:",b2_lm1*x_mean))
## [1] "Elasticity: 0.960673215726386"

dPRICE/dSQFT = 67.23 The slope coefficient b2=0.000596 suggests that an increase of a unit 1 sqft is associated with a 0.06% increase in the price of the house.

3.b

Estimate the log-log model (PRICE)=1+2(SQFT)+e. Interpret the estimated parameters. Calculate the slope and elasticity at the sample means, if necessary.

# log-log
lm2 <- lm(I(log(price)) ~ I(log(sqft)), data=dfs)
summary(lm2)
## 
## Call:
## lm(formula = I(log(price)) ~ I(log(sqft)), data = dfs)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.75190 -0.13022 -0.01824  0.11806  0.86244 
## 
## Coefficients:
##              Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   4.17068    0.16551   25.20 <0.0000000000000002 ***
## I(log(sqft))  1.00658    0.02254   44.65 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2083 on 878 degrees of freedom
## Multiple R-squared:  0.6943, Adjusted R-squared:  0.6939 
## F-statistic:  1994 on 1 and 878 DF,  p-value: < 0.00000000000000022

The coefficient 1.00658 says that an increase in living area of 1% is associated with a 1increase in house price.

b: slope and elasticity

For a log-log model, the slope is \(b_2*y/x\), and Elasticity is \(b_2\); about the sample means.

# sample means:
y_mean = mean(log(dfs$price))
x_mean = mean(log(dfs$sqft))
b2_lm1 <- lm1$coefficients[2] # save b2
# log-linear slope: b_2*y
print(paste("Slope:",b2_lm1/x_mean))
## [1] "Slope: 0.0000812436216765968"
# log-linear elasticity: b_2*x
print(paste("Elasticity:",b2_lm1))
## [1] "Elasticity: 0.000595962889691047"

dPRICE/dSQFT = 70.444 The coefficient b2 in a log-log model, 1.0066, is the elasticity. Slope at x_mean is 0.0000812436216765968.

3.c

Compare the R2-value from the linear model PRICE=B1+B2*SQFT+e to the ‘‘generalized’’ R2 measure for the models in (a) and (b).

# linear model
lm_lin <- lm(price ~ sqft, data=dfs)
summary(lm_lin)
## 
## Call:
## lm(formula = price ~ sqft, data = dfs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -101224  -16117   -2668   12089  204280 
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) -18385.650   3256.424  -5.646         0.0000000222 ***
## sqft            81.389      1.918  42.423 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30260 on 878 degrees of freedom
## Multiple R-squared:  0.6721, Adjusted R-squared:  0.6717 
## F-statistic:  1800 on 1 and 878 DF,  p-value: < 0.00000000000000022

R-squared: 0.6721

Generalized R^2: correlation between predicted and true y values, squared: [corr(y*y^)]^2 Predicted values need to be corrected to y_c: by multiplying by e raised to the power of error_variance/2.

# model in A - log-linear:
generalized_log_r2 <- function(lm) {
  pred_y <- exp(predict(lm)) # get prediction and transform back from log by taking antilog
  pred_y <- pred_y*exp(var(lm$residuals)/2) # correct it: multiply by exp(error_variance/2)
  return((cor(dfs$price, pred_y))^2)
}
print(paste("Model A - linear-log:", generalized_log_r2(lm1))) 
## [1] "Model A - linear-log: 0.714787705863015"
print(paste("Model B - log-log:", generalized_log_r2(lm2))) 
## [1] "Model B - log-log: 0.672551299065629"

The Linear model’s R-squared was lower (worse) than the log-linear or the log-log model (the linear model had a Multiple R-squared: 0.6721, Adjusted R-squared: 0.6717). The log-linear model had the best generalized R-squared, but only marginally so. That might mean that (1) from these models, we might want to choose the log-linear models, and that (2) it is not a very good fit, and we should try to make a better model, for example using polynomials or other features.

3.D

Construct histograms of the least squares residuals from each of the models in (a), (b), and (c) and obtain the Jarque–Bera statistics. Based on your observations, do you consider the distributions of the residuals to be compatible with an assumption of normality?

# load the Jarque-Bera Test
#install.packages("tseries")
require("tseries")
## Loading required package: tseries
#install.packages("DescTools")
require(DescTools)
## Loading required package: DescTools
hist(lm1$residuals, breaks = 30) 

jarque.bera.test(lm1$residuals)
## 
##  Jarque Bera Test
## 
## data:  lm1$residuals
## X-squared = 78.854, df = 2, p-value < 0.00000000000000022
Jarque Bera Test

data: lm1$residuals X-squared = 78.854, df = 2, p-value < 2.2e-16

hist(lm2$residuals, breaks = 30)

jarque.bera.test(lm2$residuals)
## 
##  Jarque Bera Test
## 
## data:  lm2$residuals
## X-squared = 52.744, df = 2, p-value = 0.000000000003523
hist(lm_lin$residuals, breaks = 30)

jarque.bera.test(lm_lin$residuals)
## 
##  Jarque Bera Test
## 
## data:  lm_lin$residuals
## X-squared = 2455.9, df = 2, p-value < 0.00000000000000022

However, the JB test is often too conservative. We can see that by seeing the many of the historgrams look relatively like a normal distribution, yet the JB test rejects that hypothesis of normality. There are other more general tests to use in this case, such as the Shapiro-Wilk normality test. The Shapiro-Wilk test tends to have high power under a broad range of useful alternatives. It usually performs well and was highly regarded in other studies of power, but it’s not always the best test since it depends on the context. I will use it since it proves powerful often and is familiar to many readers.

See: Chen, L. and Shapiro, S. (1995) “An Alternative test for normality based on normalized spacings.” Journal of Statistical Computation and Simulation 53, 269-287.

Let’s see what will it say:

library(stats)
stats::shapiro.test(lm2$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm2$residuals
## W = 0.9876, p-value = 0.0000008734

The null hypothesis for this test is that the data are normally distributed. Since the p-value is below 0.05, we reject the null hypothesis and deduct, also by this test, that the residuals do NOT come from a normally distributed population.

JB test: https://davegiles.blogspot.com/2014/02/some-things-you-should-know-about.html

3.E - plotting residuals against sqft

# first, let's predict values for each observation, using each model, and insert to the database

# we need to transform back from log to exp.
dfs[["a_ypred"]] <- exp(predict(lm1, dfs)) 
dfs[["b_ypred"]] <- exp(predict(lm2, dfs))
dfs[["c_ypred"]] <- predict(lm_lin, dfs)
## model a
require(ggplot2, ggthemes)
require(ggthemes)

qplot(x=dfs$sqft, y=lm1$residuals, main = "Residuals Plot of Model A: Log-Linear") + theme_stata()

qplot(x=dfs$sqft, y=lm2$residuals, main = "Residuals Plot of Model B: Log-Log") + theme_stata()

qplot(x=dfs$sqft, y=lm_lin$residuals, main = "Residuals Plot of Model C: Linear") + theme_stata()

3.F

y_correct <- function(lm, sqftval) {
  # get prediction and transform back from log by taking antilog
  pred_y <- predict(lm, newdata = data.frame("sqft"=sqftval)) # predict 
  pred_y <- exp(pred_y) # antilog
  # correct it: multiply by exp(error_variance/2)
  pred_y <- pred_y*exp(var(lm$residuals)/2)
  return(pred_y)
}
sqftval=2700
predA_2700 <- y_correct(lm=lm1, sqftval=2700)
predB_2700 <- y_correct(lm=lm2, sqftval=2700)
predC_2700 <- predict(lm_lin, newdata = data.frame("sqft"=2700))
print(paste("Model A predicted for x = 2700, price =", predA_2700))
## [1] "Model A predicted for x = 2700, price = 203511.038783717"
print(paste("Model B predicted for x = 2700, price =", predB_2700))
## [1] "Model B predicted for x = 2700, price = 188216.082373304"
print(paste("Model C predicted for x = 2700, price =", predC_2700))
## [1] "Model C predicted for x = 2700, price = 201364.619818514"

3.h

For each model in (a)–(c), construct a 95% prediction interval for the value of a house with 2700 square feet.

print("Model A (log-linear) prediction interval for sqft (x)=2700, price (y):")
## [1] "Model A (log-linear) prediction interval for sqft (x)=2700, price (y):"
ci_corrected <- function(lm, sqftval) {
  # get prediction and transform back from log by taking antilog
  ci <- exp(predict(lm, newdata = data.frame("sqft"=sqftval), interval="confidence"))  
  # correct it: multiply by exp(error_variance/2)
  #ci <- ci*exp(var(lm$residuals)/2)
  width = ci[3] - ci[2]
  print(ci)
  print(paste("width of CI = ", width))
  print("--------------------------------")
}

sqftval = 2770
ci_corrected(lm1, sqftval)
##        fit      lwr      upr
## 1 207856.9 201271.7 214657.5
## [1] "width of CI =  13385.763821732"
## [1] "--------------------------------"
ci_corrected(lm2, sqftval)
##        fit      lwr      upr
## 1 188990.2 183485.5 194660.1
## [1] "width of CI =  11174.5815406788"
## [1] "--------------------------------"
nrow(dfs)
## [1] 880
print("Model C (linear) prediction interval for sqft (x)=2700, price (y):")
## [1] "Model C (linear) prediction interval for sqft (x)=2700, price (y):"
ci <- predict(lm_lin, newdata=data.frame(sqft=sqftval), interval="confidence")
width = ci[3] - ci[2]
print(ci)
##        fit      lwr      upr
## 1 207061.8 202263.8 211859.9
print(paste("width of CI = ", width))
## [1] "width of CI =  9596.06878955109"
#print(paste("[",exp(11.7246728327), ",", exp(12.5227271673), "]"))
var(lm1$residuals)
## [1] 0.04117571
# require(cowplot)
print(mean(dfs$sqft)) 
## [1] 1611.968
qplot(dfs$sqft, main="Histogram of SQFT Values", binwidth=100)

Plotting models

# first, let's predict values for each observation, using each model, and insert to the database

# we need to transform back from log to exp.
dfs[["a_ypred"]] <- exp(predict(lm1, dfs)) 
dfs[["b_ypred"]] <- exp(predict(lm2, dfs))
dfs[["c_ypred"]] <- predict(lm_lin, dfs)
## model a
require(ggplot2)
require(ggthemes)

ggplot(data=dfs, aes(x=sqft, y=a_ypred)) +
  geom_point(aes(y=price), color="orange", alpha=0.2) +
  geom_point(color="blue") +
  labs(title="Model A: Log-Linear") +
  theme_stata()

ggplot(data=dfs, aes(x=sqft, y=b_ypred)) +
  geom_point(aes(y=price), color="orange", alpha=0.2) +
  geom_point(color="blue") +
  labs(title="Model B: Log-Log") +
  theme_stata()

ggplot(data=dfs, aes(x=sqft, y=c_ypred)) +
  geom_point(aes(y=price), color="orange", alpha=0.2) +
  geom_point(color="blue") +
  labs(title="Model C: Linear") +
  theme_stata()