This is an R Markdown document.
We are analysing the returns of schooling based on 3,010 US men.
# Load packages
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.1 v stringr 1.4.0
## v tidyr 1.1.0 v forcats 0.5.0
## v readr 1.3.1
## -- Conflicts ------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
# Set working directory
setwd("C:\\Users\\65961\\Desktop\\Coursera\\Econometrics\\Week4")
# Load dataset
data <- read_excel("_46d7b6e393ed4cd5a42ba1ec49adede2_TestExer4_Wage-round1.xlsx", sheet = 1)
Let us find out how the data looks like first.
head(data)
## # A tibble: 6 x 9
## logw educ age exper smsa south nearc daded momed
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6.31 7 29 16 1 0 0 9.94 10.2
## 2 6.18 12 27 9 1 0 0 8 8
## 3 6.58 12 34 16 1 0 0 14 12
## 4 5.52 11 27 10 1 0 1 11 12
## 5 6.59 12 34 16 1 0 1 8 7
## 6 6.21 12 26 8 1 0 1 9 12
Next, we visually inspect the two variables to determine if a positive or negative relationship exists.
data %>%
ggplot(aes(x = educ, y = logw)) +
geom_point() +
ggtitle("Scatter Plot between log(Wage) and Education") +
theme(title = element_text(size = 10, colour = "black"),
axis.title = element_text(size = 10, colour = "black"),
axis.text = element_text(size = 10, colour = "black"))
For the wage data, the regression model will be \(logw\) = \(\beta\)1 + \(\beta\)2educ + \(\epsilon\)
model1 <- lm(logw ~ educ, data = data)
beta1 <- model1$coef[1]
beta2 <- model1$coef[2]
model1_summary <- summary(model1)
model1_summary_df <- data.frame("Predictors" = names(coef(model1)), "Estimates" =
coef(model1), "T_Value" = summary(model1)$coefficients[,3], "P_Value" = summary(model1)$coefficients[,4])
model1_summary_df
## Predictors Estimates T_Value P_Value
## (Intercept) (Intercept) 5.57088235 143.47049 0.00000e+00
## educ educ 0.05209424 18.15315 5.77145e-70
The estimated value of \(\beta\)2 suggests that, on average, an increase of education by 1 unit, in this case, 1 year, increases the log(Wage) by 0.05209424. Let us add the regression line to the previously plotted scatter plot.
base_plot <- ggplot(aes(x = educ, y = logw), data = data) +
geom_point()
base_plot +
geom_abline(slope = beta2, intercept = beta1, linetype = "solid") +
ggtitle("Scatter Plot Between log(Wage) and Education, including Regression") +
theme(title = element_text(size = 10, colour = "black"),
axis.title = element_text(size = 10, colour = "black"),
axis.text = element_text(size = 10, colour = "black"))
Let us use the predict() function in R to estimate log(Wage) given a number of education values.
new_data_model1 <- data.frame(educ = c(12, 16, 8, 4, 20))
logw_hat <- predict(model1, new_data_model1)
names(logw_hat) <- c("education = 12 years", "16 years", "8 years", "4 years", "20 years")
logw_hat
## education = 12 years 16 years 8 years
## 6.196013 6.404390 5.987636
## 4 years 20 years
## 5.779259 6.612767
We draw several samples from the wage data to calculate the average of \(\beta\)2 since it is a random variable. (\(\beta\)1 is also a random variable).
N <- nrow(data) # Number of observations
C <- 200 # Number of samples
S <- 100 # Size of each sample
sum_beta2 <- 0
for (i in 1:C) {
set.seed(5*i)
sample_set <- data[sample(1:N, size = S, replace = TRUE),]
model1_sample <- lm(logw ~ educ, data = sample_set)
sum_beta2 <- sum_beta2 + coef(model1_sample)[[2]]
}
print(sum_beta2/C, digits = 3)
## [1] 0.051
The result, b = 0.051, is the average of 200 estimates of \(\beta\)2.
The quadratic model is as follows: yi = \(\beta\)1 + \(\beta\)2x2i + ei
model2 <- lm(logw ~ I(educ^2), data = data)
b1 <- coef(model2)[[1]]
b2 <- coef(model2)[[2]]
educx <- c(5, 12, 8, 14, 10) # given values for educ
logwx <- b1 + b2*(educx^2) # logw corresponding to given educ
DlogwDeduc <- 2*b2*educx # marginal effect of educ on logw
elasticity <- DlogwDeduc*educx/logwx
b1; b2; DlogwDeduc; elasticity
## [1] 5.907874
## [1] 0.00193332
## [1] 0.01933320 0.04639967 0.03093312 0.05413295 0.03866639
## [1] 0.01622945 0.09000510 0.04102803 0.12054793 0.06337500
The next chunk of code will visualise how the quadratic model fits the data.
model2_function <- function(x){
b1 <- coef(model2)[[1]]
b2 <- coef(model2)[[2]]
b1 + b2*(x^2)
}
ggplot(aes(x = educ, y = logw), data = data) +
geom_point() +
geom_function(fun = model2_function, colour = "blue") +
ggtitle("Fitting a quadratic model to data") +
xlab("Education in years") +
ylab("Log of Wages")
Let’s see the log-linear model log(yi) = \(\beta\)1 + \(\beta\)2xi + ei .
One of the reasons to use the log of an independent variable is to make its distribution closer to the normal distribution. Let us draw the histogram of logw and log(logw) to compare them.
hist(data$logw, col = 'grey')
hist(log(data$logw), col = 'grey')
We are interested to find out the estimates of the coefficients and their interpretation, in the fitted values of logw, and in the marginal effect of an increase in educ on logw.
model3 <- lm(log(logw) ~ educ, data = data)
beta1 <- coef(model3)[[1]]
beta2 <- coef(model3)[[2]]
DyDx <- beta2/data$logw # Marginal effect of an increase in x on y
elasticity <- beta2*data$educ # Elasticity
beta1; beta2;
## [1] 1.720675
## [1] 0.008387558
The coefficients are b1 = 1.720675 and b2 = 0.008387558, showing that an increase in education (educ) of an individual by one unit (1 year) increases the log(wage) by 0.84 percent. Thus, for a log(wage) of 5, an increase of 3 years in education will increase the log(wage) by approximately 3 * 0.84 percent, which is equal to 0.126 log(wage). The next lines of code show the plot of the fitted values of the log-linear model and how to calculate the marginal effect and elasticity for the median log(wage) in the data. The fitted values are calculated using the formula y = eb1 + b2x .
ordat <- data[order(data$educ), ] # Order the dataset
model4 <- lm(log(logw) ~ educ, data = ordat)
ggplot() +
geom_point(aes(x = educ, y = logw), data = ordat) +
geom_line(aes(x = ordat$educ, y = exp(fitted(model4))), colour = "blue") +
ggtitle("The fitted value curve in the log-linear model")
logwx <- median(data$logw)
educx <- (log(logwx) - coef(model4)[[1]])/coef(model4)[[2]]
(DyDx <- logwx*coef(model4)[[2]])
## [1] 0.05273198
(elasticity <- educx*coef(model4)[[2]])
## [1] 0.1177977
R allows us to calculate the same quantities for several (logw, educ) pairs at a time, as shown in the following sequence:
b1 <- coef(model4)[[1]]
b2 <- coef(model4)[[2]]
# Pick a few values for educ
educx <- c(8, 11, 17)
# Estimates log(wage) for those and add one more
logwx <- c(10, exp(b1+b2*educx))
# Re-calculate educ for all log(wage)
educx <- (log(logwx) - b1)/b2
# Calculate and print elasticities
(elasticities <- b2*educx)
## [1] 0.58191023 0.06710047 0.09226314 0.14258849
A Monte Carlo simulation generates random values for the dependent variable when the regression coefficients and the distribution of the random term are given. We are going to use the values of the predictor variable (educ) in the dataset, and generate the response variable (logw) using the linear model with b1 = 100 and b2 = 10.
N <- nrow(data)
sde <- 50
x <- data$educ
nrsim <- 1000
b1 <- 100
b2 <- 10
vb2 <- numeric(nrsim) # Stores the estimates of b2
for(i in 1:nrsim){
set.seed(12345+10*i)
y <- b1+b2*x+rnorm(N, mean = 0, sd = sde)
model5 <- lm(y~x)
vb2[i] <- coef(model5)[[2]]
}
mb2 <- mean(vb2)
seb2 <- sd(vb2)
The mean and standard deviation of the estimated 3010 values of b2 are 9.996734 and 0.3437239 respectively. The figure below shows the simulated distribution of b2 and the theoretical one.
plot(density(vb2))
curve(dnorm(x, mb2, seb2), col = "red", add = TRUE)
legend("topright", legend = c("true", "simulated"),
lty = 1, col = c("red", "black"))
hist(vb2, prob = TRUE, ylim = c(0, 1.5))
curve(dnorm(x, mean = mb2, sd = seb2), col = "red", add = TRUE)
An interval estimate, which is also known as a confidence interval, is an interval centered on an estimated value, which includes the true parameter with a given probability, say 95%. The given probability follows the formula 100(1-\(\alpha\))%. An interval estimate of b2 based on the t-ratio is calculated as b2 +/- tc * se(b2). Something to be aware is that R has four types of functions related to distributions. They are: p for the cumulative distribution function; d for the density; r for a draw of a random number from the respective distribution and; q for quantile. This first letter is followed by a few letters suggesting what distribution we refer to, such as norm, t, f and chisq.
alpha <- 0.05 # Choose significance level
model6 <- lm(logw ~ educ, data = data)
b2 <- coef(model6)[[2]]
df <- df.residual(model6) # Degrees of freedom
smodel6 <- summary(model6)
seb2 <- coef(smodel6)[2,2] # se(b2)
tc <- qt(1-alpha/2, df)
lowb <- b2 - tc*seb2 # Lower bound
upb <- b2 + tc*seb2 # Upper bound
lowb; upb
## [1] 0.04646745
## [1] 0.05772103
The resulting confidence interval for the coefficient b2 in the model above is (0.04646745, 0.05772103). We can use confint(model) in R to calculate the confidence intervals.
ci <- confint(model6)
print(ci)
## 2.5 % 97.5 %
## (Intercept) 5.49474736 5.64701734
## educ 0.04646745 0.05772103
Hypothesis testing seeks to establish whether the data sample at hand provides sufficient evidence to support a certain conjecture (hypothesis) about a population parameter such as the intercept in a regression model, the slope, or some combination of them. The procedure requires three elements: the hypotheses (the null and the alternative), a test statistic, which in the case of a simple linear regression parameters is the t-ratio, and a significance level, \(\alpha\).
Suppose we believe that \(\beta\)2, the (population) parameters is different from zero. Our hypotheses will then be: H0 : \(\beta\)2 = 0, HA : \(\beta\)2 != 0 .
Let us perform the hypotheses testing.
alpha <- 0.05
model7 <- lm(logw ~ educ, data = data)
smodel7 <- summary(model7)
smodel7
##
## Call:
## lm(formula = logw ~ educ, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.73799 -0.27764 0.02373 0.28839 1.46080
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.57088 0.03883 143.47 <2e-16 ***
## educ 0.05209 0.00287 18.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4214 on 3008 degrees of freedom
## Multiple R-squared: 0.09874, Adjusted R-squared: 0.09844
## F-statistic: 329.5 on 1 and 3008 DF, p-value: < 2.2e-16
b2 <- coef(model7)[["educ"]] # The coefficient on educ
seb2 <- sqrt(vcov(model7)[2,2]) # Standard error of b2
df <- df.residual(model7) # Degrees of freedom
t <- b2/seb2
tcr <- qt(1-alpha/2, df) # Critical value
The results t = 18.15315 and tcr = 1.960753 show that t > tcr, and therefore t falls in the rejection region. Hence, we reject the null hypothesis, H0, that \(\beta\)2.
# Plot the density function and the values of t:
curve(dt(x, df), -10000*seb2, 10000*seb2, ylab=" ", xlab="t")
abline(v=c(-tcr, tcr, t), col=c("red", "red", "blue"),
lty=c(2,2,3))
legend("topleft", legend=c("-tcr", "tcr", "t"), col=
c("red", "red", "blue"), lty=c(2, 2, 3))
Suupose we are interested to determine if \(\beta\)2 is greater than 5.5. This conjecture will go into the alternative hypothesis: H0 <= 5.5, HA > 5.5. The procedure is the same as for the two-tail test, but now the whole rejection region is to the right of the critical value tcr.
c <- 5.5
alpha <- 0.05
t <- (b2-c)/seb2
tcr <- qt(1-alpha, df) # Note: alpha is not divided by 2
t; tcr
## [1] -1898.418
## [1] 1.64536
The above shows that tcr = 1.64536, t = -1898.418. Since t does not fall in the rejection region this time, we have insufficient evidence to reject the null hypothesis H0: $<sub>2 <= 5.5.
A left-tail test is not different from the right-tail test (above), but of course the rejection region is to the left of tcr.
Say we are interested to determine if \(\beta\)2 is less than 15. Hence our hypotheses are as follow: H0 >= 15, HA < 15.
c <- 15
alpha <- 0.05
t <- (b2-c)/seb2
tcr <- qt(alpha, df) # Note: alpha is not divided by 2 and not subtract alpha from 1
t; tcr
## [1] -5208.859
## [1] -1.64536
The above results show that t falls into the rejection region, which is to the left of tcr, and hence, we can reject the null hypothesis H0: \(\beta\)2 >= 15.
In the context of a hypothesis test, the p-value is the area outside the calculated t-statistic; it is the probability that the t-ratio takes a value that is more extreme than the calculated one, under the assumption that the null hypothesis is true. We reject the null hypothesis if the p-value is less than a chosen significance level.
For a right-tail test, the p-value is the area to the right of the calculated t; for a left-tail test it is the area to the left of the calculated t; for a two-tail test the p-value is split in two equal amounts: p/2 to the left and p/2 to the right.
p-values are calculated in R by the function pt(t, df), where t is the calculated t-ratio and df is the number of degrees of freedom in the estimated model.
Right-tail test, H0: \(\beta\)2 <= c, HA: \(\beta\)2 > c.
c <- 5.5
t <- (b2-c)/seb2
p <- 1-pt(t, df) # pt() returns p-values
p
## [1] 1
The right-tail test above gives the p-value p = 1.
Left-tail test, H0: \(\beta\)2 >= c, HA: \(\beta\)2 < c.
c <- 15
t <- (b2-c)/seb2
p <- pt(t, df)
p
## [1] 0
The left-tail test above gives p-value p = 0.
Two-tail test, H0: \(\beta\)2 = c, HA: \(\beta\)2 != c.
c <- 0
t <- (b2-c)/seb2
p <- 2*(1-pt(abs(t), df))
t; p
## [1] 18.15315
## [1] 0
The two-tail test above shows p-value p = 0 for a t-ratio t = 18.15315.
R gives the p-values in a standard regression output by calling the summary() function, with a model as an argument, and indexing with $coefficients.
summary(model7)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.57088235 0.038829466 143.47049 0.00000e+00
## educ 0.05209424 0.002869708 18.15315 5.77145e-70
Let us determine the standard error for the wage equation for someone with 12 years in education. Hence, we need to retrieve var(b2) and \(\sigma\), the standard error of regression from the regression output.
alpha <- 0.05
x <- 12
xbar <- mean(data$educ, na.rm = TRUE)
model8 <- lm(logw ~ educ, data = data)
b1 <- coef(model8)[[1]]
b2 <- coef(model8)[[2]]
yhatx <- b1 + b2*x
smodel8 <- summary(model8)
df <- df.residual(model8)
tcr <- qt(1-alpha/2, df)
N <- nobs(model8) # Number of observations
N <- NROW(data)
varb2 <- vcov(model8)[2,2]
sighat2 <- smodel8$sigma^2 # Estimated variance
varf <- sighat2 + sighat2/N + (x - xbar)*varb2 # Forecast variance
sef <- sqrt(varf) # Standard error for forecaast
lb <- yhatx - tcr*sef
ub <- yhatx + tcr*sef
lb; ub; confint(model8)
## [1] 5.369661
## [1] 7.022366
## 2.5 % 97.5 %
## (Intercept) 5.49474736 5.64701734
## educ 0.04646745 0.05772103
The result is that the confidence interval is (5.369661, 7.022366), which is as expected, larger than the confidence interval of the estimated expected value of y.
Let us calculate the confidence intervals of the forecast for all the observations, and plot the upper and lower limits together with the regression line.
sef <- sqrt(sighat2 + sighat2/N + (data$educ - xbar)^2*varb2)
yhatv <- fitted.values(model8)
lbv <- yhatv - tcr*sef
ubv <- yhatv + tcr*sef
xeduc <- data$educ
dplot <- data.frame(xeduc, yhatv, lbv, ubv)
dplotord <- dplot[order(xeduc),]
xmax <- max(dplotord$xeduc, na.rm = TRUE)
xmin <- min(dplotord$xeduc, na.rm = TRUE)
ymax <- max(dplotord$ubv, na.rm = TRUE)
ymin <- min(dplotord$lbv, na.rm = TRUE)
plot(dplotord$xeduc, dplotord$yhatv,
xlim = c(xmin, xmax),
ylim = c(ymin, ymax),
xlab = "educ", ylab = "logw",
type = "l")
lines(dplotord$ubv ~ dplotord$xeduc, lty = 2)
lines(dplotord$lbv ~ dplotord$xeduc, lty = 2)
A different way of finding point and interval estimates for the predicted E(y|x) and forecasted y is to use the predict() function in R. The point estimate is the same for both prediction and forecast, but the interval estimates are very different, as seen below with educ = 12.
educx <- data.frame(educ = 12)
predict(model8, newdata = educx, interval = "confidence", level = 0.95)
## fit lwr upr
## 1 6.196013 6.17936 6.212667
predict(model8, newdata = educx, interval = "prediction", level = 0.95)
## fit lwr upr
## 1 6.196013 5.369606 7.02242
Let us now use the predict() function to plot the confidence and prediction intervals, the regression line and the observations in the dataset.
xmin <- min(data$educ, na.rm = TRUE)
xmax <- max(data$educ, na.rm = TRUE)
educ <- seq(from = xmin, to = xmax)
ypredict <- predict(model8, newdata = data.frame(educ), interval = "confidence")
yforecast <- predict(model8, newdata = data.frame(educ), interval = "predict")
matplot(educ, cbind(ypredict[,1], ypredict[,2], ypredict[,3], yforecast[,2], yforecast[,3]),
type = "l", lty = c(1, 2, 2, 3, 3),
col = c("black", "red", "red", "blue", "blue"),
ylab = "logw", xlab = "educ")
points(data$educ, data$logw)
legend("topleft",
legend = c("E[y|x]", "lwr_pred", "upr_pred", "lwr_forcst", "upr_forcst"),
lty = c(1, 2, 2, 3, 3),
col = c("black", "red", "red", "blue", "blue"))
The figure above presents the predicted and forecasted bands on the same graph, showing they have the same point estimates, and that the forecasted band is much larger than the predicted one. The prediction interval includes the regression line, E[y|x], with a 95% probability. The forecasted interval, on the other hand, should include any true point with a 95% probability.
The total variation of y about its sample mean, SST, can be defined by the equation: SST = SSR + SSE.
The coefficient of determination, R2, is defined by the equation: R2 = SSR/SST = 1 - SSE/SST.
R2 takes values between 0 and 1, with higher values showing a closer fit of the regression line to the data. Let us retrieve the R2 by using either r.squared or summary(), as shown below.
(rsq = smodel8$r.squared)
## [1] 0.09873652
smodel8
##
## Call:
## lm(formula = logw ~ educ, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.73799 -0.27764 0.02373 0.28839 1.46080
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.57088 0.03883 143.47 <2e-16 ***
## educ 0.05209 0.00287 18.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4214 on 3008 degrees of freedom
## Multiple R-squared: 0.09874, Adjusted R-squared: 0.09844
## F-statistic: 329.5 on 1 and 3008 DF, p-value: < 2.2e-16
If you need the SSE, or SSR, use the anova() function, which has the structure shown below.
anov <- anova(model8)
dfr <- data.frame(anov)
dfr
## Df Sum.Sq Mean.Sq F.value Pr..F.
## educ 1 58.51538 58.5153774 329.5368 5.77145e-70
## Residuals 3008 534.12630 0.1775686 NA NA
We can retrieve the SSE, SSR and SST from the anova() function as shown below.
sse <- anov[2,2]
ssr <- anov[1,2]
sst = sum(anov[1,2], anov[2,2])
sse; ssr; sst
## [1] 534.1263
## [1] 58.51538
## [1] 592.6417
The anova() results have other useful information like degrees of freedom and the estimated variance, \(\sigma\)2.
df <- anov[2,1]
estimated_var <- anov[2,3]
df; estimated_var
## [1] 3008
## [1] 0.1775686
Non-linear functional forms of regression models are useful when the relationship between two variables seems to be more complex than the linear one. It has the marginal effect, semi-elasticity and elasticity.
One can decide to use a non-linear functional form based on a mathematical model, reasoning or simply inspecting a scatter plot of the data.
In the food expenditure model, it is reasonable to believe that the amount spent on food increases faster at lower incomes than at higher incomes.
In the wage model, it is reasonable to believe that the amount of wage earned increases faster for individuals with lower education than those with higher education. In other words, it increases at an decreasing rate, which makes the regression curve flatten out at higher education.
The logarithmic function fits this profile and is relatively easy to interpret. The general form of a linear-log econometric model is as follows: yi = \(\beta\)1 + \(\beta\)2log(xi) + \(\epsilon\) .
The marginal effect of a change in x on y is the slope of the regression curve as follows: \(\frac{dy}{dx}\) = \(\frac{\beta2}{x}\).
Another measure of interest is the semi-elasticity of y with respect to x, which is given by the following equation: dy = \(\frac{\beta2}{100}\)(%\(\delta\)x). This suggests that a change in x by 1%, changes y by \(\frac{\beta2}{100}\) units of y. This is only determined for small changes in x.
Another quantity of interest is the elasticity of y with respect to x, as shown in the following equation: %\(\delta\)y = \(\frac{\beta2}{y}\)(%\(\delta\)x). This shows that a 1% increase in x produces a \(\frac{\beta2}{y}\) percent change in y.
Let us find these measures from the wage model.
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(xtable)
model9 <- lm(logw ~ log(educ), data = data)
tbl <- data.frame(xtable(model9))
kable(tbl, digits = 5, caption = "Linear-log model output for the *wage* example")
| Estimate | Std..Error | t.value | Pr…t.. | |
|---|---|---|---|---|
| (Intercept) | 4.72203 | 0.08644 | 54.63066 | 0 |
| log(educ) | 0.60111 | 0.03361 | 17.88539 | 0 |
b1 <- coef(model9)[[1]]
b2 <- coef(model9)[[2]]
pmodel9 <- predict(model9, newdata = data.frame(educ), interval = "confidence")
plot(data$educ, data$logw, xlab = "educ", ylab = "logw")
lines(pmodel9[,1]~educ, lty = 1, col = "black")
lines(pmodel9[,2]~educ, lty = 2, col = "red")
lines(pmodel9[,3]~educ, lty = 2, col = "red")
x <- 14 # For an individual with 14 years of education
y <- b1 + b2*log(x)
DyDx <- b2/x # Marginal effect
DyPDx <- b2/100 # Semi-elasticity
PDyPDx <- b2/y # Elasticity
DyDx; DyPDx; PDyPDx
## [1] 0.04293655
## [1] 0.006011116
## [1] 0.0952875
The results for an education of 14 years are as follows: \(\frac{dy}{dx}\) = 0.0429, which indicates that an increase in education of 1 year (one unit of x), increases logw by $0.0429; for a 1% increase in education, that is an increase of 1.4 years,logw increases by $0.00601; and finally, for a 1% increase in education, increases logw by 0.0953%.
Check if regression assumptions were violated.
ehat <- model9$residuals
plot(data$educ, ehat, xlab = "educ", ylab = "residuals")
From the above graph, the spread of residuals of the linear-log model, seems to be higher at higher education, which may indicate that the heteroskedasticity assumption is violated.
Performing exploratory data analysis below:
# First 6 rows of data
data %>%
head()
## # A tibble: 6 x 9
## logw educ age exper smsa south nearc daded momed
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6.31 7 29 16 1 0 0 9.94 10.2
## 2 6.18 12 27 9 1 0 0 8 8
## 3 6.58 12 34 16 1 0 0 14 12
## 4 5.52 11 27 10 1 0 1 11 12
## 5 6.59 12 34 16 1 0 1 8 7
## 6 6.21 12 26 8 1 0 1 9 12
# Finding correlations
data_cor <- cor(data)
data_cor %>%
as.data.frame() %>%
mutate(variables1 = rownames(data_cor)) %>%
gather(key = 'variables2', value = 'correlation', -variables1) %>%
filter(correlation != 1) %>%
ggplot(aes(x = variables2, y = variables1, fill = correlation)) +
geom_tile() +
geom_text(aes(label = round(correlation,2)), size = 5, colour = "white") +
ggtitle("Correlation Heatmap") +
theme(title = element_text(size = 10),
axis.title = element_blank(),
axis.text.x = element_text(size = 8, colour = "black", angle = 45),
axis.text.y = element_text(size = 8, colour = "black"))
data %>%
ggplot(aes(x = educ, y = exp(logw))) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Let’s try it out with exper.
data %>%
ggplot(aes(x = exper, y = exp(logw))) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
How about with age?
data %>%
ggplot(aes(x = age, y = exp(logw))) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Let’s try with age-squared.
data %>%
ggplot(aes(x = (age^2), y = exp(logw))) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Performing OLS, with logw (log wage) as dependent variable x2:
model_ols <- lm(logw ~ educ + exper + (exper^2) + smsa + south, data = data)
Summary of model above:
model_ols_df <- data.frame(Variables = names(model_ols$coefficients), Estimates =
model_ols$coefficients, PValues = summary(model_ols)$coef[,4])
model_ols_df
## Variables Estimates PValues
## (Intercept) (Intercept) 4.7884910 0.000000e+00
## educ educ 0.0813259 1.458153e-108
## exper exper 0.0403120 3.308617e-69
## smsa smsa 0.1541393 8.724563e-22
## south south -0.1789280 4.177788e-33
Variables south and smsa have the strongest influence on logw. Analysis to be continued for 2SLS.