# load useful libraries
library(dplyr)
library(AER)
library(readxl)

Linear Regressions

Ritvars <- read_excel("Ritvars.xls")
head(Ritvars)
## # A tibble: 6 × 9
##      ID TEST1 TEST2 IMPROVE DOSAGE DRUGDUM FEMALE   AGE INTERVAL
##   <dbl> <dbl> <dbl>   <dbl>  <dbl>   <dbl>  <dbl> <dbl>    <dbl>
## 1     1    75   100      25  0.452       1      0   108    0.592
## 2     2    80    80       0  0.550       1      1    90    0.329
## 3     3    80    70     -10  0.508       1      1   108    0.362
## 4     4    80    90      10  0.478       1      0   138    0.592
## 5     5    75    75       0  0.423       1      0    87    0.822
## 6     6    90   100      10  0.452       1      0   132    0.690

\[Improvement = (Dosage + Female + Dosage*Female + 1)* \theta\]

\[Y = \theta X + e\]

\[\hat{\theta_{OLS}} = (X'X)^{-1}(X'Y)\]

\[\hat{e} = Y - X\hat{\theta_{OLS}}\]

\[s^2=\frac{\hat{e}'\hat{e}}{(N-K)}\]

\[s.e(\hat{\theta_{OLS}}) = diag(s^2((X'X)^{-1}))^{1/2}\]

N = nrow(Ritvars) # number of observations
X = matrix(0,nrow=N,ncol=4) #innitialize matrix X, N rows, 4 cols
Y = matrix(0,nrow=N,ncol=1) #innitialize matrix Y
X[,1] = Ritvars$DOSAGE # first col: dosage
X[,2] = Ritvars$FEMALE # second col: female
X[,3] = Ritvars$DOSAGE*Ritvars$FEMALE # third col: interaction of dosage*female
X[,4] = 1 # fourth col: intercept 1
Y = Ritvars$IMPROVE # dependent variable: Improve
theta = solve(t(X)%*%X)%*%(t(X)%*%Y)
theta
##              [,1]
## [1,]  16.50885317
## [2,]   0.62127747
## [3,] -15.90416349
## [4,]   0.09649317
error = Y-X%*%theta
head(error)
##            [,1]
## [1,]  17.441505
## [2,]  -1.050350
## [3,] -11.024953
## [4,]   2.012275
## [5,]  -7.079738
## [6,]   2.441505
sample_error = sum(error^2)/(nrow(Ritvars)-2)
sd_error = sqrt(diag(sample_error*(solve(t(X)%*%X))))
sd_error
## [1]  6.510093  4.756656 12.790536  2.366407
t = theta/sd_error  # t value
p = 2*pt(-abs(t),df=N-1) # p value
p[3]
## [1] 0.218316

Instrumental Variables

data("CollegeDistance")
cd<-CollegeDistance
head(cd)
##   gender ethnicity score fcollege mcollege home urban unemp wage distance
## 1   male     other 39.15      yes       no  yes   yes   6.2 8.09      0.2
## 2 female     other 48.87       no       no  yes   yes   6.2 8.09      0.2
## 3   male     other 48.74       no       no  yes   yes   6.2 8.09      0.2
## 4   male      afam 40.40       no       no  yes   yes   6.2 8.09      0.2
## 5 female     other 40.48       no       no   no   yes   5.6 8.09      0.4
## 6   male     other 54.71       no       no  yes   yes   5.6 8.09      0.4
##   tuition education income region
## 1 0.88915        12   high  other
## 2 0.88915        12    low  other
## 3 0.88915        12    low  other
## 4 0.88915        12    low  other
## 5 0.88915        13    low  other
## 6 0.88915        12    low  other
N_cd = nrow(cd)
X_cd = data.matrix(select(cd,urban ,gender , unemp, education))
X_cd = cbind(X_cd,intercept=1)
Y_cd = data.matrix(select(cd,wage))
ols_cd = solve(t(X_cd)%*%X_cd)%*%(t(X_cd)%*%Y_cd)
error_cd = Y_cd-X_cd%*%ols_cd
sample_error_cd = sum(error_cd^2)/(N_cd-4)
sd_error_cd = sqrt(diag(sample_error_cd*(solve(t(X_cd)%*%X_cd))))
ols_cd
##                  wage
## urban     -0.05897659
## gender    -0.09366804
## unemp      0.12985075
## education  0.02040418
## intercept  8.45004015
sd_error_cd 
##       urban      gender       unemp   education   intercept 
## 0.044507264 0.037768018 0.006810907 0.010503906 0.176982413

Instrumental Variables: Two stage least square

\[First Stage:E \rightarrow W\] \[ Compute \hat{E}: fitted value of E\]

\[ Compute X\_W = X-E+\hat{E}\]

\[Second Stage: Y \rightarrow X\_W\]

E_cd = data.matrix(select(cd,education))
W_cd = data.matrix(select(cd,urban ,gender , unemp, distance))
W_cd = cbind(W_cd,intercept=1)
# first stage: regress Education on Distance
iv_1 = solve(t(W_cd)%*%W_cd)%*%(t(W_cd)%*%E_cd)
# fitted value of education
fitted_E =  W_cd %*% iv_1
X_W_cd = cbind(X_cd[,c(1:3)],fitted_E)
X_W_cd = cbind(X_W_cd,intercept=1)
X_cd[1,]
##     urban    gender     unemp education intercept 
##       2.0       1.0       6.2      12.0       1.0
fitted_E[1,]
## [1] 13.80189
X_W_cd[1,]
##     urban    gender     unemp education intercept 
##   2.00000   1.00000   6.20000  13.80189   1.00000

Second stage: regress Y on X_W

iv_2 = solve(t(X_W_cd)%*%X_W_cd)%*%(t(X_W_cd)%*%Y_cd)
iv_2
##                  wage
## urban     -0.01490041
## gender    -0.07132907
## unemp      0.13634311
## education  0.67563584
## intercept -0.73550785

Instrumental Variables: IV method

\[\beta = (X'W(W'W)^{-1}W'X)^{-1}X'W(W'W)^{-1}W'Y\]

XWWWW = t(X_cd)%*%W_cd%*%solve(t(W_cd)%*%W_cd)%*%t(W_cd) #part of the calculation

iv_ivm = solve(XWWWW%*%X_cd)%*%XWWWW%*%Y_cd
  • Result using IV method
iv_ivm
##                  wage
## urban     -0.01490041
## gender    -0.07132907
## unemp      0.13634311
## education  0.67563584
## intercept -0.73550785