R Introduction for Econometrics

Jon Duan

February 9, 2017

Introduction

Try R in the Browser

To become familiar with R, you may want to work through a short tutorial; several can be run directly in the browser.

Learn Econometrics in R

Motivation

Motivation: Load Data and Plot for Phillips Curve

library(foreign)
phillips <- read.dta('http://www.urfie.net/downloads/phillips.dta')
plot(phillips$year,phillips$inf,type="l",
     main="Inflation",xlab="Time",ylab="Inflation")

Motivation: Multiple Regression Analysis (Chapter 3)

gpa1 <- read.dta('http://fmwww.bc.edu/ec-p/data/wooldridge/gpa1.dta')
lm.1 <- lm(colGPA ~ hsGPA + ACT, data=gpa1)
summary(lm.1)
## 
## Call:
## lm(formula = colGPA ~ hsGPA + ACT, data = gpa1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.85442 -0.24666 -0.02614  0.28127  0.85357 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.286328   0.340822   3.774 0.000238 ***
## hsGPA       0.453456   0.095813   4.733 5.42e-06 ***
## ACT         0.009426   0.010777   0.875 0.383297    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3403 on 138 degrees of freedom
## Multiple R-squared:  0.1764, Adjusted R-squared:  0.1645 
## F-statistic: 14.78 on 2 and 138 DF,  p-value: 1.526e-06

Basic R

Basic R: R as a Calculator

1+1
## [1] 2
5*(4-1)^2
## [1] 45
sqrt( log(10) )
## [1] 1.517427

Basic R: Install Packages

This R script downloads and installs all packages used at some point (see http://www.urfie.net/read/mobile/index.html#p=20).

It only needs to be run once per computer/user.

install.packages("AER")
install.packages("car")
install.packages("censReg")
install.packages("dummies")
install.packages("dynlm")
install.packages("bbmle")
install.packages("ggplot2")

Objects in R

Objects in R: Show an Object

s.1 <- summary(lm.1); names(s.1)
##  [1] "call"          "terms"         "residuals"     "coefficients" 
##  [5] "aliased"       "sigma"         "df"            "r.squared"    
##  [9] "adj.r.squared" "fstatistic"    "cov.unscaled"
s.1$coefficients
##                Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 1.286327767 0.34082212 3.7741910 2.375872e-04
## hsGPA       0.453455885 0.09581292 4.7327219 5.421580e-06
## ACT         0.009426012 0.01077719 0.8746263 3.832969e-01
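
Other components of the summary object can be extracted the same way, using the names listed above; for example:

s.1$r.squared  # coefficient of determination
s.1$sigma      # residual standard error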

Data Structures in R

Data in R: Dataframe

year  <- c(2008,2009,2010,2011,2012,2013) # Define one x vector for all:
product1<-c(0,3,6,9,7,8); product2<-c(1,2,3,5,9,6); product3<-c(2,4,4,2,3,2) # vector
sales_mat <- cbind(product1,product2,product3) # Define a matrix
rownames(sales_mat) <- year 
(sales <- as.data.frame(sales_mat)) # Create a data frame and display it:
##      product1 product2 product3
## 2008        0        1        2
## 2009        3        2        4
## 2010        6        3        4
## 2011        9        5        2
## 2012        7        9        3
## 2013        8        6        2

Data in R: Subsetting a Data Frame

# Full data frame (from Data-frames.R, has to be run first)
sales[3,] # row
##      product1 product2 product3
## 2010        6        3        4
sales[,3] # column
## [1] 2 4 4 2 3 2
# or
sales$product3
## [1] 2 4 4 2 3 2
# Subset: all years in which sales of product 3 were >=3
subset(sales, product3>=3)
##      product1 product2 product3
## 2009        3        2        4
## 2010        6        3        4
## 2012        7        9        3
# or
sales[sales$product3>=3,]
##      product1 product2 product3
## 2009        3        2        4
## 2010        6        3        4
## 2012        7        9        3

Data in R: Information about a Data Frame

head(sales,2)
##      product1 product2 product3
## 2008        0        1        2
## 2009        3        2        4
tail(sales,2)
##      product1 product2 product3
## 2012        7        9        3
## 2013        8        6        2
str(sales)
## 'data.frame':    6 obs. of  3 variables:
##  $ product1: num  0 3 6 9 7 8
##  $ product2: num  1 2 3 5 9 6
##  $ product3: num  2 4 4 2 3 2
length(sales)
## [1] 3
dim(sales)
## [1] 6 3
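
For per-column descriptive statistics, summary() is handy; a minimal example:

summary(sales)  # min, quartiles, mean, and max for each product column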

Plot in R

Plot in R: Plot With Detailed Annotation

http://www.cyclismo.org/tutorial/R/plotting.html

x<-c(1,3,4,7,8,9); y<-c(0,3,6,9,7,8);
plot(x,y, main="Example for an Outlier") # base plot
points(8,1, pch = 19, col = "red",  cex = 1.5) # extra object
abline(a=0.31,b=0.97,lty=2,lwd=2) # extra line
text(7,2,"outlier",pos=3) # add text annotation
arrows(7,2,8,1,length=0.15)

Statistics in R

Statistical Inference in R: Regression with Results

data(women) # Load the built-in dataset "women"
fit = lm(weight ~ height, women) # Run a regression analysis
summary(fit)
## 
## Call:
## lm(formula = weight ~ height, data = women)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7333 -1.1333 -0.3833  0.7417  3.1167 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -87.51667    5.93694  -14.74 1.71e-09 ***
## height        3.45000    0.09114   37.85 1.09e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.525 on 13 degrees of freedom
## Multiple R-squared:  0.991,  Adjusted R-squared:  0.9903 
## F-statistic:  1433 on 1 and 13 DF,  p-value: 1.091e-14

Statistical Inference in R: Diagnostic Plots

par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(fit)

par(mfrow=c(1,1)) # Change back to 1 x 1

Statistical Inference in R: Hypothesis Test

library(AER)
coeftest(fit)
## 
## t test of coefficients:
## 
##               Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) -87.516667   5.936944 -14.741 1.711e-09 ***
## height        3.450000   0.091136  37.855 1.091e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# A z test (instead of a t test) can be performed with df = Inf;
# a different covariance matrix can also be used, supplied as a function:
coeftest(fit, df = Inf, vcov = vcovHC)  # heteroskedasticity-robust z test
## 
## z test of coefficients:
## 
##             Estimate Std. Error z value  Pr(>|z|)    
## (Intercept) -87.5167     9.0723 -9.6466 < 2.2e-16 ***
## height        3.4500     0.1412 24.4329 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
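
The robust standard errors used above can also be computed directly from the covariance matrix estimate; a sketch using vcovHC (from the sandwich package, loaded with AER):

sqrt(diag(vcovHC(fit)))  # heteroskedasticity-robust standard errors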

Statistical Inference in R: Hypothesis Test

linearHypothesis(fit, "height=1", white.adjust="hc0")
## Linear hypothesis test
## 
## Hypothesis:
## height = 1
## 
## Model 1: restricted model
## Model 2: weight ~ height
## 
## Note: Coefficient covariance matrix supplied.
## 
##   Res.Df Df     F    Pr(>F)    
## 1     14                       
## 2     13  1 509.4 8.222e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Alternatives: anova(fit) or car::Anova(fit)

Statistical Inference in R: Hypothesis Test

mrwdata <- read.dta("https://github.com/davidrpugh/econometrics-labs/raw/master/lab-6/mrw1992.dta")

mrwdata$invest  <- mrwdata$igdp       # give variables more descriptive names
mrwdata$workpop <- mrwdata$popgrowth

mrw_model_1 <- I(log(gdp85) - log(gdp60)) ~ log(gdp60) +
  log(invest/100) + log(workpop/100 + 0.05) + log(school/100)

mrw_lm <- lm(mrw_model_1, data = mrwdata)
#coeftest(mrw_lm) # need library(AER)

linearHypothesis(mrw_lm, "log(invest/100)+log(workpop/100 + 0.05)=1")
## Linear hypothesis test
## 
## Hypothesis:
## log(invest/100)  + log(workpop/100  + 0.05) = 1
## 
## Model 1: restricted model
## Model 2: I(log(gdp85) - log(gdp60)) ~ log(gdp60) + log(invest/100) + log(workpop/100 + 
##     0.05) + log(school/100)
## 
##   Res.Df    RSS Df Sum of Sq      F   Pr(>F)   
## 1    100 12.630                                
## 2     99 11.364  1    1.2662 11.031 0.001256 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Statistical Inference in R: Confidence Interval

library("AER") #install.packages(c("AER", "car", "lme4"))
library(foreign); 
lm.1 <- lm(colGPA ~ hsGPA + ACT, data=gpa1)
confint(lm.1,level=.90) # 90% confidence interval
##                      5 %       95 %
## (Intercept)  0.721936430 1.85071910
## hsGPA        0.294792534 0.61211924
## ACT         -0.008420691 0.02727272
confint(lm.1,level=.95) # 95% confidence interval
##                   2.5 %     97.5 %
## (Intercept)  0.61241898 1.96023655
## hsGPA        0.26400467 0.64290710
## ACT         -0.01188376 0.03073578
#coefficients(lm.1)[2]
#vcov(lm.1)[2,2]
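
The commented lines above point to the ingredients for building an interval by hand; a sketch for the hsGPA coefficient that reproduces the 95% interval from confint():

b  <- coefficients(lm.1)[2]   # point estimate for hsGPA
se <- sqrt(vcov(lm.1)[2,2])   # its standard error
b + c(-1,1) * qt(0.975, df.residual(lm.1)) * se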

Statistical Inference in R: Confidence Interval

library(TeachingDemos)
conf.level <- 0.95
set.seed(100)  # make the simulated intervals reproducible
# sd expects a standard deviation, so take the square root of the variance
ci.examp(mean.sim = lm.1$coefficients[2], sd = sqrt(vcov(lm.1)[2,2]),
         n = 25, reps = 50, conf.level = conf.level,
         method = "z", lower.conf = (1 - conf.level)/2,
         upper.conf = 1 - (1 - conf.level)/2)

Statistical Inference in R: F test for restricted model

crime1 <- read.dta("http://fmwww.bc.edu/ec-p/data/wooldridge/crime1.dta")
lm.1 <- lm(narr86 ~ pcnv + ptime86 + qemp86,data=crime1)
# F test
lm.2 <- lm(narr86 ~ pcnv + ptime86 + qemp86 + avgsen + tottime,data=crime1)
anova(lm.1,lm.2)
## Analysis of Variance Table
## 
## Model 1: narr86 ~ pcnv + ptime86 + qemp86
## Model 2: narr86 ~ pcnv + ptime86 + qemp86 + avgsen + tottime
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1   2721 1927.3                           
## 2   2719 1924.4  2     2.879 2.0339  0.131
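
The F statistic reported by anova() can be reproduced from the residual sums of squares of the restricted and unrestricted models; a sketch:

rss.r  <- sum(resid(lm.1)^2)   # restricted model RSS
rss.ur <- sum(resid(lm.2)^2)   # unrestricted model RSS
q <- 2                         # number of restrictions (avgsen, tottime)
((rss.r - rss.ur)/q) / (rss.ur/df.residual(lm.2))  # about 2.03, as above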

Statistical Inference in R: F test for restricted model

linearHypothesis(lm.2, c("avgsen=0","tottime=0"))
## Linear hypothesis test
## 
## Hypothesis:
## avgsen = 0
## tottime = 0
## 
## Model 1: restricted model
## Model 2: narr86 ~ pcnv + ptime86 + qemp86 + avgsen + tottime
## 
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1   2721 1927.3                           
## 2   2719 1924.4  2     2.879 2.0339  0.131

Statistical Inference in R: F test for restricted model

confidenceEllipse(lm.2, which.coef=c("avgsen","tottime"), col="red",
                    xlim=c(-.05,0.05), ylim=c(-.05,0.05))
## we can add an additional point on the graph using the points() function

points(x=0, y=0, col="blue", pch=19, cex=1.5)

Programming in R

Programming in R: Writing your own functions

fun1 = function(arg1, arg2){
  w = arg1 ^ 2
  return(arg2 + w)
}
fun1(arg1 = 3, arg2 = 5)
## [1] 14
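
Function arguments can also have default values, which callers may omit; a minimal sketch:

fun2 = function(arg1, arg2 = 5){
  arg1^2 + arg2
}
fun2(3)      # uses the default arg2 = 5; returns 14
fun2(3, 10)  # overrides the default; returns 19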

Control flow in R: For-loop

h = seq(from=1, to=8)
s = c()
for(i in 2:10){    # note: h has only 8 elements, so h[9] and h[10] are NA
  s[i] = h[i] * 10 # s[1] is never assigned and also stays NA
}
s
##  [1] NA 20 30 40 50 60 70 80 NA NA
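
Explicit loops like this can usually be replaced by a vectorized operation, which is shorter, faster, and avoids the stray NA entries caused by indexing past the end of h:

s2 = h * 10   # vectorized: 10 20 30 40 50 60 70 80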

Control flow in R: If-statement

w = 3
if( w < 5 ){
  d=2
  }else if( w  < 8){
  d=10
  }else{
  d = 15
}
d
## [1] 2
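
For whole vectors, the vectorized ifelse() is often more convenient than an if/else chain; a minimal sketch:

w.vec = c(3, 6, 9)
ifelse(w.vec < 5, 2, ifelse(w.vec < 8, 10, 15))  # returns 2 10 15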

MLE in R

MLE by Hand in R

https://www.r-bloggers.com/doing-maximum-likelihood-estimation-by-hand-in-r/

p.parameter <- 0.8 # Bernoulli distribution parameter p
sequence <- rbinom(n=10,size =  1, p.parameter) #sample from Bernoulli distribution by setting size = 1 in binomial distribution 
likelihood <- function(sequence, p.parameter)
{
  likelihood <- 1
 
  for (i in 1:length(sequence))
  {
    if (sequence[i] == 1)
    {
      likelihood <- likelihood * p.parameter
    }
    else
    {
      likelihood <- likelihood * (1 - p.parameter)
    }
  }
 
  return(likelihood)
}
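
Multiplying many probabilities can underflow to zero for long sequences; working with the log-likelihood avoids this. A vectorized sketch using dbinom():

log.likelihood <- function(sequence, p.parameter) {
  # sum of Bernoulli log densities; exp() of this equals the product above
  sum(dbinom(sequence, size = 1, prob = p.parameter, log = TRUE))
}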

MLE by Hand in R: Likelihood curve

possible.p <- seq(0, 1, by = 0.001)

library('ggplot2')
qplot(possible.p,
      sapply(possible.p, function (p) {likelihood(sequence, p)}),
      geom = 'line',
      main = 'Likelihood as a Function of P',
      xlab = 'P',
      ylab = 'Likelihood')

MLE by Hand in R: solve the optimization problem

(mle.results <- optimize(function(p) {likelihood(sequence, p)},
                        interval = c(0, 1),
                        maximum = TRUE))
## $maximum
## [1] 0.600002
## 
## $objective
## [1] 0.001194394
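
For Bernoulli data the MLE has a closed form, the sample mean, so the numerical result can be verified directly:

mean(sequence)  # analytic MLE of p; optimize() found about 0.6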

MLE by Hand in R: Normal Distribution via nlm() and optim()

xvec <- c(2,5,3,7,-3,-2,0) # or some other numbers

# then define the negative log-likelihood of N(theta[1], theta[2]), up to a constant

fn <- function(theta) { sum( 0.5*(xvec - theta[1])^2/theta[2] + 0.5*log(theta[2]) ) }

# c(0,1) is the vector of starting values
nlm(fn, c(0,1), hessian=TRUE)
## $minimum
## [1] 12.00132
## 
## $estimate
## [1]  1.714285 11.346934
## 
## $gradient
## [1] -4.144835e-08  2.254313e-08
## 
## $hessian
##               [,1]          [,2]
## [1,]  6.169068e-01 -4.602558e-06
## [2,] -4.602558e-06  2.717300e-02
## 
## $code
## [1] 1
## 
## $iterations
## [1] 19
#or 
optim(c(0,1), fn, hessian=TRUE)
## $par
## [1]  1.714538 11.354415
## 
## $value
## [1] 12.00132
## 
## $counts
## function gradient 
##       55       NA 
## 
## $convergence
## [1] 0
## 
## $message
## NULL
## 
## $hessian
##               [,1]          [,2]
## [1,]  6.165003e-01 -1.372058e-05
## [2,] -1.372058e-05  2.711229e-02
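
Because fn is a negative log-likelihood, the inverse Hessian at the optimum approximates the covariance matrix of the estimates, so standard errors can be read off its diagonal; a sketch:

res <- nlm(fn, c(0,1), hessian=TRUE)
sqrt(diag(solve(res$hessian)))  # approximate standard errors of (mean, variance)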

MLE in R: the bbmle package

https://cran.r-project.org/web/packages/bbmle/index.html

https://www.r-bloggers.com/fitting-a-model-by-maximum-likelihood/

# DGP: a linear regression with normal errors
set.seed(1001)
N <- 100
x <- runif(N)
y <- 5 * x + 3 + rnorm(N)

Plot of Data

plot(x, y)
abline(fit, col = "red")  # 'fit' is the OLS fit estimated on the next slide

OLS: regression with normal error

fit <- lm(y ~ x)
summary(fit)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.91541 -0.75171  0.02844  0.75250  2.17175 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.0332     0.1808   16.78   <2e-16 ***
## x             4.8772     0.3117   15.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9501 on 98 degrees of freedom
## Multiple R-squared:  0.7142, Adjusted R-squared:  0.7112 
## F-statistic: 244.9 on 1 and 98 DF,  p-value: < 2.2e-16

Likelihood function example for normal error

LL <- function(beta0, beta1, mu, sigma) {
    # Find residuals
    #
    R = y - x * beta1 - beta0
    #
    # Calculate the likelihood for the residuals (with mu and sigma as parameters);
    # suppressWarnings() silences warnings when the optimizer tries sigma <= 0
    #
    R = suppressWarnings(dnorm(R, mu, sigma))
    #
    # Sum the log likelihoods for all of the data points
    #
    -sum(log(R))
}

MLE for regression

library(bbmle)
 
fit <- mle2(LL, start = list(beta0 = 3, beta1 = 1, mu = 0, sigma = 1))
summary(fit)
## Maximum likelihood estimation
## 
## Call:
## mle2(minuslogl = LL, start = list(beta0 = 3, beta1 = 1, mu = 0, 
##     sigma = 1))
## 
## Coefficients:
##       Estimate Std. Error z value  Pr(z)    
## beta0 3.016569   0.089493 33.7071 <2e-16 ***
## beta1 4.877402   0.308568 15.8066 <2e-16 ***
## mu    0.016569   0.089493  0.1851 0.8531    
## sigma 0.940612   0.066516 14.1412 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## -2 log L: 271.5342
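
Note that beta0 and mu are not separately identified: only their sum enters the mean of y, and indeed their estimates add up to the OLS intercept (3.0166 + 0.0166 = 3.033). One remedy is to pin mu at zero using the fixed argument of mle2(); a sketch:

fit2 <- mle2(LL, start = list(beta0 = 3, beta1 = 1, mu = 0, sigma = 1),
             fixed = list(mu = 0))  # estimate only beta0, beta1, sigma
summary(fit2)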

Simulation in R

Simulation in R: Probability Distributions

name      description
dname()   density or probability mass function
pname()   cumulative distribution function (CDF)
qname()   quantile function
rname()   random deviates
pnorm(27.4, mean=50, sd=20)
## [1] 0.1292381
qnorm(0.95, mean=100, sd=15)
## [1] 124.6728
dbinom(27, size=100, prob=0.25)
## [1] 0.08064075

Simulation in R: random number generator

# Sample from a standard normal RV with sample size n=5:
rnorm(5)
## [1]  0.05697841  0.05790674 -0.75461803 -1.10774250 -0.18851273
# Set the seed of the random number generator and take a sample:
set.seed(6254137)
rnorm(5)
## [1]  0.6601307  0.5123161 -0.4616180 -1.3161982  0.1811945
# Sample from a binomial RV with sample size n=10:
sample <- rbinom(10,1,0.5) # note: this masks base R's sample() function
sample
##  [1] 0 0 0 1 1 1 1 0 0 0
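
Re-running with the same seed reproduces exactly the same draws, which is what makes simulation studies replicable:

set.seed(6254137)
rnorm(5)  # identical to the five numbers shown above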

Simulation in R: Probability Plots

# Display the Student's t distributions with various degrees of freedom and compare to the normal distribution
x <- seq(-4, 4, length=100);hx <- dnorm(x);degf <- c(1, 3, 8, 30)
colors <- c("red", "blue", "darkgreen", "gold", "black");labels <- c("df=1", "df=3", "df=8", "df=30", "normal")

plot(x, hx, type="l", lty=2, xlab="x value",
  ylab="Density", main="Comparison of t Distributions")

for (i in 1:4){
  lines(x, dt(x,degf[i]), lwd=2, col=colors[i])
}

legend("topright", inset=.05, title="Distributions", labels, lwd=2, lty=c(1, 1, 1, 1, 2), col=colors)

Simulation in R: Probability Plots

curve( dnorm(x,0,1), -10, 10, lwd=1, lty=1)
curve( dnorm(x,0,2),add=TRUE, lwd=2, lty=2)
curve( dnorm(x,0,3),add=TRUE, lwd=3, lty=3)
# Add the legend with greek sigma
legend("topleft",expression(sigma==1,sigma==2,sigma==3),lwd=1:3,lty=1:3)
# Add the text with the formula, centered at x=6 and y=0.3
text(6,.3,expression(f(x)==frac(1,sqrt(2*pi)*sigma)*e^{-frac(x^2,2*sigma^2)}))

Simulation in R: Monte Carlo Simulation

# Set the random seed
set.seed(123456)

# initialize ybar to a vector of length r=10000 to later store results:
ybar <- numeric(10000)

# repeat 10000 times:
for(j in 1:10000) {
  # Draw a sample and store the sample mean in pos. j=1,2,... of ybar: 
  sample <- rnorm(100,10,2)
  ybar[j] <- mean(sample)
}

Simulation in R: Monte Carlo Results

ybar[1:20] # The first 20 of 10000 estimates:
##  [1] 10.033640  9.913197 10.217455 10.121745  9.837282 10.375066 10.026097
##  [8]  9.777042  9.903131 10.012415  9.930439 10.394639  9.642143 10.196132
## [15]  9.804443 10.203723  9.962646  9.620169  9.757859 10.328590
mean(ybar) # Simulated mean:
## [1] 9.998861
var(ybar) # Simulated variance:
## [1] 0.04034146
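
These simulated values agree with theory: E(ybar) = 10 and Var(ybar) = sigma^2/n = 4/100 = 0.04, which is why the dashed normal curve on the next slide uses sd = sqrt(.04).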

Simulation in R: Simulated Density

plot(density(ybar)) # Simulated density:
curve( dnorm(x,10,sqrt(.04)), add=TRUE,lty=2)

Bayesian in R

Prior beliefs about a proportion p:

P(p < 0.310) = 0.5; P(p < 0.350) = 0.8

beta.select() finds a beta prior density matching these beliefs.

#install.packages("LearnBayes")
library(LearnBayes)
beta.select(list(p=.5,x=.31), list(p=.8,x=.35))
## [1] 30.89 68.35
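
The returned values are the shape parameters a and b of the matching beta prior. The fit to the stated beliefs can be checked with qbeta():

qbeta(c(0.5, 0.8), 30.89, 68.35)  # should be close to 0.31 and 0.35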