# load useful libraries
library(dplyr)
library(AER)
library(readxl)
Ritvars <- read_excel("Ritvars.xls")
head(Ritvars)
## # A tibble: 6 × 9
## ID TEST1 TEST2 IMPROVE DOSAGE DRUGDUM FEMALE AGE INTERVAL
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 75 100 25 0.452 1 0 108 0.592
## 2 2 80 80 0 0.550 1 1 90 0.329
## 3 3 80 70 -10 0.508 1 1 108 0.362
## 4 4 80 90 10 0.478 1 0 138 0.592
## 5 5 75 75 0 0.423 1 0 87 0.822
## 6 6 90 100 10 0.452 1 0 132 0.690
\[Improvement = (Dosage + Female + Dosage*Female + 1)* \theta\]
\[Y = \theta X + e\]
\[\hat{\theta_{OLS}} = (X'X)^{-1}(X'Y)\]
\[\hat{e} = Y - X\hat{\theta_{OLS}}\]
\[s^2=\frac{\hat{e}'\hat{e}}{(N-K)}\]
\[s.e(\hat{\theta_{OLS}}) = diag(s^2((X'X)^{-1}))^{1/2}\]
N = nrow(Ritvars) # number of observations
X = matrix(0,nrow=N,ncol=4) #innitialize matrix X, N rows, 4 cols
Y = matrix(0,nrow=N,ncol=1) #innitialize matrix Y
X[,1] = Ritvars$DOSAGE # first col: dosage
X[,2] = Ritvars$FEMALE # second col: female
X[,3] = Ritvars$DOSAGE*Ritvars$FEMALE # third col: interaction of dosage*female
X[,4] = 1 # fourth col: intercept 1
Y = Ritvars$IMPROVE # dependent variable: Improve
theta = solve(t(X)%*%X)%*%(t(X)%*%Y)
theta
## [,1]
## [1,] 16.50885317
## [2,] 0.62127747
## [3,] -15.90416349
## [4,] 0.09649317
error = Y-X%*%theta
head(error)
## [,1]
## [1,] 17.441505
## [2,] -1.050350
## [3,] -11.024953
## [4,] 2.012275
## [5,] -7.079738
## [6,] 2.441505
sample_error = sum(error^2)/(nrow(Ritvars)-2)
sd_error = sqrt(diag(sample_error*(solve(t(X)%*%X))))
sd_error
## [1] 6.510093 4.756656 12.790536 2.366407
t = theta/sd_error # t value
p = 2*pt(-abs(t),df=N-1) # p value
p[3]
## [1] 0.218316
data("CollegeDistance")
cd<-CollegeDistance
head(cd)
## gender ethnicity score fcollege mcollege home urban unemp wage distance
## 1 male other 39.15 yes no yes yes 6.2 8.09 0.2
## 2 female other 48.87 no no yes yes 6.2 8.09 0.2
## 3 male other 48.74 no no yes yes 6.2 8.09 0.2
## 4 male afam 40.40 no no yes yes 6.2 8.09 0.2
## 5 female other 40.48 no no no yes 5.6 8.09 0.4
## 6 male other 54.71 no no yes yes 5.6 8.09 0.4
## tuition education income region
## 1 0.88915 12 high other
## 2 0.88915 12 low other
## 3 0.88915 12 low other
## 4 0.88915 12 low other
## 5 0.88915 13 low other
## 6 0.88915 12 low other
N_cd = nrow(cd)
X_cd = data.matrix(select(cd,urban ,gender , unemp, education))
X_cd = cbind(X_cd,intercept=1)
Y_cd = data.matrix(select(cd,wage))
ols_cd = solve(t(X_cd)%*%X_cd)%*%(t(X_cd)%*%Y_cd)
error_cd = Y_cd-X_cd%*%ols_cd
sample_error_cd = sum(error_cd^2)/(N_cd-4)
sd_error_cd = sqrt(diag(sample_error_cd*(solve(t(X_cd)%*%X_cd))))
ols_cd
## wage
## urban -0.05897659
## gender -0.09366804
## unemp 0.12985075
## education 0.02040418
## intercept 8.45004015
sd_error_cd
## urban gender unemp education intercept
## 0.044507264 0.037768018 0.006810907 0.010503906 0.176982413
\[First Stage:E \rightarrow W\] \[ Compute \hat{E}: fitted value of E\]
\[ Compute X\_W = X-E+\hat{E}\]
\[Second Stage: Y \rightarrow X\_W\]
E_cd = data.matrix(select(cd,education))
W_cd = data.matrix(select(cd,urban ,gender , unemp, distance))
W_cd = cbind(W_cd,intercept=1)
# first stage: regress Education on Distance
iv_1 = solve(t(W_cd)%*%W_cd)%*%(t(W_cd)%*%E_cd)
# fitted value of education
fitted_E = W_cd %*% iv_1
X_W_cd = cbind(X_cd[,c(1:3)],fitted_E)
X_W_cd = cbind(X_W_cd,intercept=1)
X_cd[1,]
## urban gender unemp education intercept
## 2.0 1.0 6.2 12.0 1.0
fitted_E[1,]
## [1] 13.80189
X_W_cd[1,]
## urban gender unemp education intercept
## 2.00000 1.00000 6.20000 13.80189 1.00000
iv_2 = solve(t(X_W_cd)%*%X_W_cd)%*%(t(X_W_cd)%*%Y_cd)
iv_2
## wage
## urban -0.01490041
## gender -0.07132907
## unemp 0.13634311
## education 0.67563584
## intercept -0.73550785
\[\beta = (X'W(W'W)^{-1}W'X)^{-1}X'W(W'W)^{-1}W'Y\]
XWWWW = t(X_cd)%*%W_cd%*%solve(t(W_cd)%*%W_cd)%*%t(W_cd) #part of the calculation
iv_ivm = solve(XWWWW%*%X_cd)%*%XWWWW%*%Y_cd
iv_ivm
## wage
## urban -0.01490041
## gender -0.07132907
## unemp 0.13634311
## education 0.67563584
## intercept -0.73550785