# load useful libraries
library(dplyr)
library(AER)
library(readxl)

Linear Regressions

Read in Data

Ritvars <- read_excel("Ritvars.xls")

head(Ritvars)

## # A tibble: 6 × 9
##      ID TEST1 TEST2 IMPROVE DOSAGE DRUGDUM FEMALE   AGE INTERVAL
##   <dbl> <dbl> <dbl>   <dbl>  <dbl>   <dbl>  <dbl> <dbl>    <dbl>
## 1     1    75   100      25  0.452       1      0   108    0.592
## 2     2    80    80       0  0.550       1      1    90    0.329
## 3     3    80    70     -10  0.508       1      1   108    0.362
## 4     4    80    90      10  0.478       1      0   138    0.592
## 5     5    75    75       0  0.423       1      0    87    0.822
## 6     6    90   100      10  0.452       1      0   132    0.690

Regression Funciton for estimating treatment effect with interaction on gender

\[Improvement = (Dosage + Female + Dosage*Female + 1)* \theta\]

Matrix calculation for \(\theta\)

\[Y = \theta X + e\]

\[\hat{\theta_{OLS}} = (X'X)^{-1}(X'Y)\]

Matrix calculation for standard error of \(\theta\)

\[\hat{e} = Y - X\hat{\theta_{OLS}}\]

\[s^2=\frac{\hat{e}'\hat{e}}{(N-K)}\]

\[s.e(\hat{\theta_{OLS}}) = diag(s^2((X'X)^{-1}))^{1/2}\]

Step 1: construct data matrix X & Y

N = nrow(Ritvars) # number of observations
X = matrix(0,nrow=N,ncol=4) #innitialize matrix X, N rows, 4 cols
Y = matrix(0,nrow=N,ncol=1) #innitialize matrix Y
X[,1] = Ritvars$DOSAGE # first col: dosage
X[,2] = Ritvars$FEMALE # second col: female
X[,3] = Ritvars$DOSAGE*Ritvars$FEMALE # third col: interaction of dosage*female
X[,4] = 1 # fourth col: intercept 1
Y = Ritvars$IMPROVE # dependent variable: Improve

Step 2: Calculate \(\theta\)
- t(): transpose of matrix
- solve(): inverse of matrix
- %*% : matrix multiplication

theta = solve(t(X)%*%X)%*%(t(X)%*%Y)
theta

##              [,1]
## [1,]  16.50885317
## [2,]   0.62127747
## [3,] -15.90416349
## [4,]   0.09649317

Step 3: Calculate s.e(\(\theta\))

error = Y-X%*%theta
head(error)

##            [,1]
## [1,]  17.441505
## [2,]  -1.050350
## [3,] -11.024953
## [4,]   2.012275
## [5,]  -7.079738
## [6,]   2.441505

sample_error = sum(error^2)/(nrow(Ritvars)-2)
sd_error = sqrt(diag(sample_error*(solve(t(X)%*%X))))
sd_error

## [1]  6.510093  4.756656 12.790536  2.366407

Step 4: Analysis of significance

t = theta/sd_error  # t value
p = 2*pt(-abs(t),df=N-1) # p value
p[3]

## [1] 0.218316

Conclusion The interaction effect is not significant.

Instrumental Variables

Data: Wage and Education

data("CollegeDistance")
cd<-CollegeDistance
head(cd)

##   gender ethnicity score fcollege mcollege home urban unemp wage distance
## 1   male     other 39.15      yes       no  yes   yes   6.2 8.09      0.2
## 2 female     other 48.87       no       no  yes   yes   6.2 8.09      0.2
## 3   male     other 48.74       no       no  yes   yes   6.2 8.09      0.2
## 4   male      afam 40.40       no       no  yes   yes   6.2 8.09      0.2
## 5 female     other 40.48       no       no   no   yes   5.6 8.09      0.4
## 6   male     other 54.71       no       no  yes   yes   5.6 8.09      0.4
##   tuition education income region
## 1 0.88915        12   high  other
## 2 0.88915        12    low  other
## 3 0.88915        12    low  other
## 4 0.88915        12    low  other
## 5 0.88915        13    low  other
## 6 0.88915        12    low  other

OLS regression \[wage = urban + gender + unemp + education +1\]

N_cd = nrow(cd)
X_cd = data.matrix(select(cd,urban ,gender , unemp, education))
X_cd = cbind(X_cd,intercept=1)
Y_cd = data.matrix(select(cd,wage))
ols_cd = solve(t(X_cd)%*%X_cd)%*%(t(X_cd)%*%Y_cd)
error_cd = Y_cd-X_cd%*%ols_cd
sample_error_cd = sum(error_cd^2)/(N_cd-4)
sd_error_cd = sqrt(diag(sample_error_cd*(solve(t(X_cd)%*%X_cd))))

Result from OLS: no effect of education on wage

ols_cd

##                  wage
## urban     -0.05897659
## gender    -0.09366804
## unemp      0.12985075
## education  0.02040418
## intercept  8.45004015

sd_error_cd

##       urban      gender       unemp   education   intercept 
## 0.044507264 0.037768018 0.006810907 0.010503906 0.176982413

Instrumental Variables: Two stage least square

Instrument for education: distance from college
X: independent Variables
W: intruments
E: endogenous variable
Y: dependent variables

\[First Stage:E \rightarrow W\] \[ Compute \hat{E}: fitted value of E\]

\[ Compute X\_W = X-E+\hat{E}\]

\[Second Stage: Y \rightarrow X\_W\]

First Stage: regress E on W

E_cd = data.matrix(select(cd,education))
W_cd = data.matrix(select(cd,urban ,gender , unemp, distance))
W_cd = cbind(W_cd,intercept=1)
# first stage: regress Education on Distance
iv_1 = solve(t(W_cd)%*%W_cd)%*%(t(W_cd)%*%E_cd)
# fitted value of education

Insert fitted value of E to X, replace orginal E

fitted_E =  W_cd %*% iv_1
X_W_cd = cbind(X_cd[,c(1:3)],fitted_E)
X_W_cd = cbind(X_W_cd,intercept=1)

Example of how replacement works

X_cd[1,]

##     urban    gender     unemp education intercept 
##       2.0       1.0       6.2      12.0       1.0

fitted_E[1,]

## [1] 13.80189

X_W_cd[1,]

##     urban    gender     unemp education intercept 
##   2.00000   1.00000   6.20000  13.80189   1.00000

Second stage: regress Y on X_W

iv_2 = solve(t(X_W_cd)%*%X_W_cd)%*%(t(X_W_cd)%*%Y_cd)

Result from 2-stage iv

iv_2

##                  wage
## urban     -0.01490041
## gender    -0.07132907
## unemp      0.13634311
## education  0.67563584
## intercept -0.73550785

Instrumental Variables: IV method

\[\beta = (X'W(W'W)^{-1}W'X)^{-1}X'W(W'W)^{-1}W'Y\]

XWWWW = t(X_cd)%*%W_cd%*%solve(t(W_cd)%*%W_cd)%*%t(W_cd) #part of the calculation

iv_ivm = solve(XWWWW%*%X_cd)%*%XWWWW%*%Y_cd

Result using IV method

iv_ivm

##                  wage
## urban     -0.01490041
## gender    -0.07132907
## unemp      0.13634311
## education  0.67563584
## intercept -0.73550785

Linear Regression and IV methods in R

Sally Chen

Linear Regressions

Instrumental Variables

Instrumental Variables: Two stage least square

Second stage: regress Y on X_W

Instrumental Variables: IV method