Jon Duan
February 9, 2017
To become familiar with R, you may want to work through a short tutorial. Recommended options are:
- Introduction to R: runs in the web browser.
- swirl: works through the R console.
- Applied Econometrics with R: the AER authors teach econometrics with R.
- Using R for Introductory Econometrics: an R companion to Wooldridge's textbook.
- An Introduction to R, by W. N. Venables, D. M. Smith and the R Core Team, 2015.
- Econometrics Using R: free for students at UVic.
library(foreign)
phillips <- read.dta('http://www.urfie.net/downloads/phillips.dta')
plot(phillips$year, phillips$inf, type="l",
     main="Inflation", xlab="Time", ylab="Inflation")
gpa1 <- read.dta('http://fmwww.bc.edu/ec-p/data/wooldridge/gpa1.dta')
lm.1 <- lm(colGPA ~ hsGPA + ACT, data=gpa1)
summary(lm.1)
##
## Call:
## lm(formula = colGPA ~ hsGPA + ACT, data = gpa1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.85442 -0.24666 -0.02614 0.28127 0.85357
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.286328 0.340822 3.774 0.000238 ***
## hsGPA 0.453456 0.095813 4.733 5.42e-06 ***
## ACT 0.009426 0.010777 0.875 0.383297
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3403 on 138 degrees of freedom
## Multiple R-squared: 0.1764, Adjusted R-squared: 0.1645
## F-statistic: 14.78 on 2 and 138 DF, p-value: 1.526e-06
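The reported t values are just the coefficient estimates divided by their standard errors. A quick sanity check for hsGPA, using the numbers from the summary output above:
# t value for hsGPA: estimate / standard error
0.453456 / 0.095813 # approximately 4.733, as reported in the table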
Basic R
1+1
## [1] 2
5*(4-1)^2
## [1] 45
sqrt( log(10) )
## [1] 1.517427
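A few more basics worth knowing before moving on: assignment and vectorized arithmetic (a minimal sketch; the vector x here is just an illustration):
x <- c(1, 2, 3) # assignment with <- creates a vector
x^2            # arithmetic is element-wise: 1 4 9
mean(x)        # built-in functions work on whole vectors: 2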
This R script downloads and installs all packages used at some point (see http://www.urfie.net/read/mobile/index.html#p=20). It only needs to be run once per computer and user.
install.packages("AER")
install.packages("car")
install.packages("censReg")
install.packages("dummies")
install.packages("dynlm")
install.packages("bbmle")
install.packages("ggplot2")s.1 <- summary(lm.1); names(s.1)## [1] "call" "terms" "residuals" "coefficients"
## [5] "aliased" "sigma" "df" "r.squared"
## [9] "adj.r.squared" "fstatistic" "cov.unscaled"
s.1$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.286327767 0.34082212 3.7741910 2.375872e-04
## hsGPA 0.453455885 0.09581292 4.7327219 5.421580e-06
## ACT 0.009426012 0.01077719 0.8746263 3.832969e-01
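Any other component listed by names(s.1) can be extracted the same way, for example the R-squared and the residual standard error:
s.1$r.squared # 0.1764, as in the summary output
s.1$sigma     # 0.3403, the residual standard error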
year <- c(2008,2009,2010,2011,2012,2013) # Define one x vector for all:
product1<-c(0,3,6,9,7,8); product2<-c(1,2,3,5,9,6); product3<-c(2,4,4,2,3,2) # vector
sales_mat <- cbind(product1,product2,product3) # Define a matrix
rownames(sales_mat) <- year
(sales <- as.data.frame(sales_mat)) # Create a data frame and display it:
## product1 product2 product3
## 2008 0 1 2
## 2009 3 2 4
## 2010 6 3 4
## 2011 9 5 2
## 2012 7 9 3
## 2013 8 6 2
# Full data frame (from Data-frames.R, has to be run first)
sales[3,] # row
## product1 product2 product3
## 2010 6 3 4
sales[,3] # column
## [1] 2 4 4 2 3 2
# or
sales$product3
## [1] 2 4 4 2 3 2
# Subset: all years in which sales of product 3 were >=3
subset(sales, product3>=3)
## product1 product2 product3
## 2009 3 2 4
## 2010 6 3 4
## 2012 7 9 3
# or
sales[sales$product3>=3,]
## product1 product2 product3
## 2009 3 2 4
## 2010 6 3 4
## 2012 7 9 3
head(sales,2)
## product1 product2 product3
## 2008 0 1 2
## 2009 3 2 4
tail(sales,2)
## product1 product2 product3
## 2012 7 9 3
## 2013 8 6 2
str(sales)
## 'data.frame': 6 obs. of 3 variables:
## $ product1: num 0 3 6 9 7 8
## $ product2: num 1 2 3 5 9 6
## $ product3: num 2 4 4 2 3 2
length(sales)
## [1] 3
dim(sales)
## [1] 6 3
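New columns can be added with the $ operator. For example, total sales per year (a small sketch building on the sales data frame above; "total" is just an illustrative column name):
sales$total <- sales$product1 + sales$product2 + sales$product3
sales$total # 3 9 13 16 19 16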
For more on base R plotting, see http://www.cyclismo.org/tutorial/R/plotting.html.
x<-c(1,3,4,7,8,9); y<-c(0,3,6,9,7,8);
plot(x,y, main="Example for an Outlier") # base plot
points(8,1, pch = 19, col = "red", cex = 1.5) # extra object
abline(a=0.31,b=0.97,lty=2,lwd=2) # extra line
text(7,2,"outlier",pos=3) # add text annotation
arrows(7,2,8,1,length=0.15)
data(women) # Load the built-in dataset 'women'
fit = lm(weight ~ height, women) # Run a regression analysis
summary(fit)
##
## Call:
## lm(formula = weight ~ height, data = women)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7333 -1.1333 -0.3833 0.7417 3.1167
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -87.51667 5.93694 -14.74 1.71e-09 ***
## height 3.45000 0.09114 37.85 1.09e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.525 on 13 degrees of freedom
## Multiple R-squared: 0.991, Adjusted R-squared: 0.9903
## F-statistic: 1433 on 1 and 13 DF, p-value: 1.091e-14
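The fitted model can also be used for prediction with predict(); a short sketch for two hypothetical heights:
predict(fit, newdata = data.frame(height = c(60, 70)))
# e.g. for height 60: -87.51667 + 3.45*60 = 119.48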
par(mfrow=c(2,2)) # Change the panel layout to 2 x 2
plot(fit)
par(mfrow=c(1,1)) # Change back to 1 x 1
library(AER)
coeftest(fit)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -87.516667 5.936944 -14.741 1.711e-09 ***
## height 3.450000 0.091136 37.855 1.091e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# A z test (instead of a t test) can be performed by setting df = Inf.
# A different covariance matrix can also be used, supplied as a function:
coeftest(fit, df = Inf, vcov = vcovHC)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -87.5167 9.0723 -9.6466 < 2.2e-16 ***
## height 3.4500 0.1412 24.4329 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
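The robust standard errors in this output can also be computed directly from the estimated covariance matrix; a minimal sketch using vcovHC (from the sandwich package, loaded with AER):
sqrt(diag(vcovHC(fit))) # should match the Std. Error column above (about 9.07 and 0.141)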
linearHypothesis(fit, "height=1", white.adjust="hc0")## Linear hypothesis test
##
## Hypothesis:
## height = 1
##
## Model 1: restricted model
## Model 2: weight ~ height
##
## Note: Coefficient covariance matrix supplied.
##
## Res.Df Df F Pr(>F)
## 1 14
## 2 13 1 509.4 8.222e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# anova(fit)
# Anova(fit) # from the car package
mrwdata <- read.dta("https://github.com/davidrpugh/econometrics-labs/raw/master/lab-6/mrw1992.dta")
# rename variables for readability
mrwdata$invest <- mrwdata$igdp
mrwdata$workpop <- mrwdata$popgrowth
mrw_model_1 <- I(log(gdp85) - log(gdp60)) ~ log(gdp60) +
log(invest/100) + log(workpop/100 + 0.05) + log(school/100)
mrw_lm <- lm(mrw_model_1, data = mrwdata)
#coeftest(mrw_lm) # need library(AER)
linearHypothesis(mrw_lm, "log(invest/100)+log(workpop/100 + 0.05)=1")
## Linear hypothesis test
##
## Hypothesis:
## log(invest/100) + log(workpop/100 + 0.05) = 1
##
## Model 1: restricted model
## Model 2: I(log(gdp85) - log(gdp60)) ~ log(gdp60) + log(invest/100) + log(workpop/100 +
## 0.05) + log(school/100)
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 100 12.630
## 2 99 11.364 1 1.2662 11.031 0.001256 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
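The same restriction can be checked by estimating the restricted model directly: imposing that the invest and workpop coefficients sum to one means the workpop term enters with a fixed unit coefficient, which an offset can handle. A sketch (mrw_restr is a hypothetical name; the F statistic should match the linearHypothesis result):
mrw_restr <- lm(I(log(gdp85) - log(gdp60)) ~ log(gdp60) +
                  I(log(invest/100) - log(workpop/100 + 0.05)) + log(school/100),
                offset = log(workpop/100 + 0.05), data = mrwdata)
anova(mrw_restr, mrw_lm) # restricted vs. unrestricted model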
library("AER") #install.packages(c("AER", "car", "lme4"))
library(foreign);
lm.1 <- lm(colGPA ~ hsGPA + ACT, data=gpa1)
confint(lm.1, level=.90) # 90% confidence interval (10% significance level)
## 5 % 95 %
## (Intercept) 0.721936430 1.85071910
## hsGPA 0.294792534 0.61211924
## ACT -0.008420691 0.02727272
confint(lm.1, level=.95) # 95% confidence interval (5% significance level)
## 2.5 % 97.5 %
## (Intercept) 0.61241898 1.96023655
## hsGPA 0.26400467 0.64290710
## ACT -0.01188376 0.03073578
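These intervals are nothing more than the estimate plus or minus a t critical value times the standard error. Reproducing the 95% interval for hsGPA by hand (df = 138, as in the regression output):
b <- coef(lm.1)["hsGPA"]
se <- sqrt(vcov(lm.1)["hsGPA", "hsGPA"])
b + c(-1, 1) * qt(0.975, df = 138) * se # about (0.264, 0.643), as above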
# coefficients(lm.1)[2]
# vcov(lm.1)[2,2]
library(TeachingDemos)
conf.level <- 0.95
set.seed(100)
ci.examp(mean.sim = lm.1$coefficients[2], sd = sqrt(vcov(lm.1)[2,2]), # sd, not variance
         n = 25, reps = 50, conf.level = conf.level,
         method = "z", lower.conf = (1 - conf.level)/2,
         upper.conf = 1 - (1 - conf.level)/2)
crime1 <- read.dta("http://fmwww.bc.edu/ec-p/data/wooldridge/crime1.dta")
lm.1 <- lm(narr86 ~ pcnv + ptime86 + qemp86,data=crime1)
# F test
lm.2 <- lm(narr86 ~ pcnv + ptime86 + qemp86 + avgsen + tottime,data=crime1)
anova(lm.1, lm.2)
## Analysis of Variance Table
##
## Model 1: narr86 ~ pcnv + ptime86 + qemp86
## Model 2: narr86 ~ pcnv + ptime86 + qemp86 + avgsen + tottime
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2721 1927.3
## 2 2719 1924.4 2 2.879 2.0339 0.131
linearHypothesis(lm.2, c("avgsen=0","tottime=0"))
## Linear hypothesis test
##
## Hypothesis:
## avgsen = 0
## tottime = 0
##
## Model 1: restricted model
## Model 2: narr86 ~ pcnv + ptime86 + qemp86 + avgsen + tottime
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2721 1927.3
## 2 2719 1924.4 2 2.879 2.0339 0.131
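The F statistic itself can be reproduced from the residual sums of squares in the table, F = ((RSS_r - RSS_ur)/q) / (RSS_ur/df_ur):
(2.879 / 2) / (1924.4 / 2719) # about 2.034, matching the table up to rounding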
confidenceEllipse(lm.2, which.coef=c("avgsen","tottime"), col="red",
xlim=c(-.05,0.05), ylim=c(-.05,0.05))
# We can add an additional point to the graph using the points() function
points(x=0, y=0, col="blue", pch=19, cex=1.5)
fun1 <- function(arg1, arg2){
w = arg1 ^ 2
return(arg2 + w)
}
fun1(arg1 = 3, arg2 = 5)
## [1] 14
h = seq(from=1, to=8)
s = c()
for(i in 2:10){
s[i] = h[i] * 10
}
s
## [1] NA 20 30 40 50 60 70 80 NA NA
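For element-wise operations like this, an explicit loop is unnecessary because R is vectorized (s2 is an illustrative name):
s2 <- h * 10 # 10 20 30 40 50 60 70 80
# (the loop above starts at i = 2 and runs past length(h), hence the NAs in s)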
w = 3
if( w < 5 ){
d=2
}else if( w < 8){
d=10
}else{
d = 15
}
d
## [1] 2
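For element-wise conditions there is also the vectorized ifelse(); a small sketch:
ifelse(c(3, 6, 9) < 5, 2, 10) # returns 2 10 10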
The following example is adapted from https://www.r-bloggers.com/doing-maximum-likelihood-estimation-by-hand-in-r/.
p.parameter <- 0.8 # Bernoulli distribution parameter p
# sample from a Bernoulli distribution by setting size = 1 in the binomial
sequence <- rbinom(n=10, size=1, p.parameter)
likelihood <- function(sequence, p.parameter)
{
likelihood <- 1
for (i in 1:length(sequence))
{
if (sequence[i] == 1)
{
likelihood <- likelihood * p.parameter
}
else
{
likelihood <- likelihood * (1 - p.parameter)
}
}
return(likelihood)
}
possible.p <- seq(0, 1, by = 0.001)
library('ggplot2')
qplot(possible.p,
sapply(possible.p, function (p) {likelihood(sequence, p)}),
geom = 'line',
main = 'Likelihood as a Function of P',
xlab = 'P',
ylab = 'Likelihood')
(mle.results <- optimize(function(p) {likelihood(sequence, p)},
                         interval = c(0, 1), maximum = TRUE))
## $maximum
## [1] 0.600002
##
## $objective
## [1] 0.001194394
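Multiplying many probabilities together underflows to zero quickly for longer sequences; working with the log-likelihood is numerically safer. A sketch of the same estimation on the log scale (log.likelihood is a hypothetical name), using dbinom() with log = TRUE:
log.likelihood <- function(sequence, p) {
  # sum of Bernoulli log-densities (binomial with size = 1)
  sum(dbinom(sequence, size = 1, prob = p, log = TRUE))
}
optimize(function(p) log.likelihood(sequence, p),
         interval = c(0, 1), maximum = TRUE) # the maximum should again be near 0.6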
xvec <- c(2,5,3,7,-3,-2,0) # or some other numbers
#then define a function (which is negative of the log lik)
fn <- function(theta) { sum ( 0.5*(xvec - theta[1])^2/theta[2] + 0.5* log(theta[2]) ) }
# c(0,1) is the initial value, starting points
nlm(fn, c(0,1), hessian=TRUE)
## $minimum
## [1] 12.00132
##
## $estimate
## [1] 1.714285 11.346934
##
## $gradient
## [1] -4.144835e-08 2.254313e-08
##
## $hessian
## [,1] [,2]
## [1,] 6.169068e-01 -4.602558e-06
## [2,] -4.602558e-06 2.717300e-02
##
## $code
## [1] 1
##
## $iterations
## [1] 19
#or
optim(c(0,1), fn, hessian=TRUE)
## $par
## [1] 1.714538 11.354415
##
## $value
## [1] 12.00132
##
## $counts
## function gradient
## 55 NA
##
## $convergence
## [1] 0
##
## $message
## NULL
##
## $hessian
## [,1] [,2]
## [1,] 6.165003e-01 -1.372058e-05
## [2,] -1.372058e-05 2.711229e-02
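For the normal likelihood the MLE has a closed form: the sample mean and the variance with denominator n (not n - 1). These should match the numerical estimates above:
mean(xvec) # 1.714286
sum((xvec - mean(xvec))^2) / length(xvec) # about 11.35, the MLE of the variance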
The next example is adapted from https://www.r-bloggers.com/fitting-a-model-by-maximum-likelihood/; see https://cran.r-project.org/web/packages/bbmle/index.html for the bbmle package.
# DGP regression with normal error
set.seed(1001)
N <- 100
x <- runif(N)
y <- 5 * x + 3 + rnorm(N)
fit <- lm(y ~ x) # fit must exist before abline() can draw it
plot(x, y)
abline(fit, col = "red")
summary(fit)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.91541 -0.75171 0.02844 0.75250 2.17175
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0332 0.1808 16.78 <2e-16 ***
## x 4.8772 0.3117 15.65 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9501 on 98 degrees of freedom
## Multiple R-squared: 0.7142, Adjusted R-squared: 0.7112
## F-statistic: 244.9 on 1 and 98 DF, p-value: < 2.2e-16
LL <- function(beta0, beta1, mu, sigma) {
# Find residuals
#
R = y - x * beta1 - beta0
#
# Calculate the likelihood for the residuals (with mu and sigma as parameters)
#
R = suppressWarnings(dnorm(R, mu, sigma))
#
# Sum the log likelihoods for all of the data points
#
-sum(log(R))
}
library(bbmle)
fit <- mle2(LL, start = list(beta0 = 3, beta1 = 1, mu = 0, sigma = 1))
summary(fit)
## Maximum likelihood estimation
##
## Call:
## mle2(minuslogl = LL, start = list(beta0 = 3, beta1 = 1, mu = 0,
## sigma = 1))
##
## Coefficients:
## Estimate Std. Error z value Pr(z)
## beta0 3.016569 0.089493 33.7071 <2e-16 ***
## beta1 4.877402 0.308568 15.8066 <2e-16 ***
## mu 0.016569 0.089493 0.1851 0.8531
## sigma 0.940612 0.066516 14.1412 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## -2 log L: 271.5342
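Note that beta0 and mu both shift the location of the residuals, so only their sum is identified; it should agree with the OLS intercept from lm():
coef(fit)["beta0"] + coef(fit)["mu"] # about 3.033, matching the earlier lm() intercept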
R's distribution functions follow a common naming scheme, where *name* stands for the distribution (e.g. norm, binom, t):

| name | description |
|---|---|
| dname( ) | density or probability mass function |
| pname( ) | cumulative distribution function |
| qname( ) | quantile function |
| rname( ) | random deviates |
pnorm(27.4, mean=50, sd=20)
## [1] 0.1292381
qnorm(0.95, mean=100, sd=15)
## [1] 124.6728
dbinom(27, size=100, prob=0.25)
## [1] 0.08064075
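These functions combine naturally. For example, the probability that a N(50, 20^2) variable falls between 30 and 70 is a difference of two CDF values:
pnorm(70, mean=50, sd=20) - pnorm(30, mean=50, sd=20) # about 0.683 (within one sd of the mean)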
# Sample from a standard normal RV with sample size n=5:
rnorm(5)
## [1] 0.05697841 0.05790674 -0.75461803 -1.10774250 -0.18851273
# Set the seed of the random number generator and draw a sample:
set.seed(6254137)
rnorm(5)
## [1] 0.6601307 0.5123161 -0.4616180 -1.3161982 0.1811945
# Sample from a binomial RV with sample size n=10:
sample <- rbinom(10,1,0.5)
sample
## [1] 0 0 0 1 1 1 1 0 0 0
# Display Student's t distributions with various degrees of freedom
# and compare to the normal distribution
x <- seq(-4, 4, length=100)
hx <- dnorm(x)
degf <- c(1, 3, 8, 30)
colors <- c("red", "blue", "darkgreen", "gold", "black")
labels <- c("df=1", "df=3", "df=8", "df=30", "normal")
plot(x, hx, type="l", lty=2, xlab="x value",
ylab="Density", main="Comparison of t Distributions")
for (i in 1:4){
lines(x, dt(x,degf[i]), lwd=2, col=colors[i])
}
legend("topright", inset=.05, title="Distributions", labels, lwd=2, lty=c(1, 1, 1, 1, 2), col=colors)curve( dnorm(x,0,1), -10, 10, lwd=1, lty=1)
curve( dnorm(x,0,2),add=TRUE, lwd=2, lty=2)
curve( dnorm(x,0,3),add=TRUE, lwd=3, lty=3)
# Add the legend with greek sigma
legend("topleft",expression(sigma==1,sigma==2,sigma==3),lwd=1:3,lty=1:3)
# Add the text with the formula, centered at x=6 and y=0.3
text(6,.3,expression(f(x)==frac(1,sqrt(2*pi)*sigma)*e^{-frac(x^2,2*sigma^2)}))
# Set the random seed
set.seed(123456)
# initialize ybar to a vector of length r=10000 to later store results:
ybar <- numeric(10000)
# repeat 10000 times:
for(j in 1:10000) {
# Draw a sample and store the sample mean in pos. j=1,2,... of ybar:
sample <- rnorm(100,10,2)
ybar[j] <- mean(sample)
}
ybar[1:20] # The first 20 of 10000 estimates:
## [1] 10.033640 9.913197 10.217455 10.121745 9.837282 10.375066 10.026097
## [8] 9.777042 9.903131 10.012415 9.930439 10.394639 9.642143 10.196132
## [15] 9.804443 10.203723 9.962646 9.620169 9.757859 10.328590
mean(ybar) # Simulated mean:
## [1] 9.998861
var(ybar) # Simulated variance:
## [1] 0.04034146
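Both simulated moments are close to their theoretical counterparts, E(ybar) = 10 and Var(ybar) = sigma^2/n:
2^2 / 100 # theoretical variance of the sample mean: 0.04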
plot(density(ybar)) # Simulated density:
curve( dnorm(x,10,sqrt(.04)), add=TRUE, lty=2)
Suppose prior beliefs about a proportion p are P(p < 0.31) = 0.5 and P(p < 0.35) = 0.8. We can find a beta prior density that matches these beliefs:
#install.packages("LearnBayes")
library(LearnBayes)
beta.select(list(p=.5,x=.31), list(p=.8,x=.35))
## [1] 30.89 68.35
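The two returned numbers are the beta shape parameters a and b. A quick check that this prior reproduces the stated beliefs:
qbeta(c(0.5, 0.8), 30.89, 68.35) # should be close to 0.31 and 0.35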