# Let's load some libraries
packages <- c("lubridate","fpp2", "dplyr", "ggplot2", "ggthemes", "ggrepel", "zoo", "xts","plyr", "tidyr", "tidyverse", "Tmisc", "data.table", "reshape2", "ggpubr")
lapply(packages, library, character.only = T)
# Pathing data
data.path <-"C:/Users/lawso/OneDrive/Desktop/CWU Quarterly Schedule/2018-19/Fall Quarter 18/ECON 424 - Metrics/Data/"
. C8 (page 99) . C10 parts (i) through (iii) (page 100) o Parts (iv) through (vi) are optional, but I’ll give a bit of extra credit if you do them.
discrim <- paste(data.path, "Discrim.RData", sep="")
load(discrim)
discrim <- data
data.frame(
"Mean" = c(
format((mean(discrim$prpblck, na.rm = T)), scientific = F),
as.integer(mean(discrim$income, na.rm = T))),
"SD" = c(
format((sd(discrim$prpblck, na.rm = T)), scientific = F),
as.integer(sd(discrim$income, na.rm = T))),
row.names = c("prpblck", "income")
)
## Mean SD
## prpblck 0.1134864 0.1824165
## income 47053 13179
The averages for “prpblck” and “income” are 0.113 and 47,053, respectively. The standard deviations are likewise, 0.1824 and 13,179.29, respectively. It is apparent that prpblck represents a proportion of the black population, while income is represented in dollar terms.
aapl <- lm(psoda~prpblck+income, data = discrim)
summary(aapl)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
The resulting regression is psoda.hat = (0.956) + (0.115)prpblck + (0.0000016). The optimal sample size is 399 observations (indicated by the 398 degrees of freedom and 9 missing observations) and the adjusted R^2 is 0.595. The coefficient on prpblck indicates that, all things being equal, if prpblck increases by 10% the price of soda will increase by approximately 1.2 cents, which is not economically significant.
v <- lm(psoda~prpblck, data = discrim)
v
##
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
##
## Coefficients:
## (Intercept) prpblck
## 1.03740 0.06493
summary(v)
##
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30884 -0.05963 0.01135 0.03206 0.44840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03740 0.00519 199.87 < 2e-16 ***
## prpblck 0.06493 0.02396 2.71 0.00702 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0881 on 399 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.01808, Adjusted R-squared: 0.01561
## F-statistic: 7.345 on 1 and 399 DF, p-value: 0.007015
The estimate of the coefficient on prpblack with the simple regression is 0.065. This is lower than the prior estimate, and therefore shows that the discrimination effect decreases when income is excluded.
lpsoda <- lm(log(psoda)~prpblck+log(income), data = discrim)
lpsoda
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
##
## Coefficients:
## (Intercept) prpblck log(income)
## -0.79377 0.12158 0.07651
summary(lpsoda)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33563 -0.04695 0.00658 0.04334 0.35413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.79377 0.17943 -4.424 1.25e-05 ***
## prpblck 0.12158 0.02575 4.722 3.24e-06 ***
## log(income) 0.07651 0.01660 4.610 5.43e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0821 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06809, Adjusted R-squared: 0.06341
## F-statistic: 14.54 on 2 and 398 DF, p-value: 8.039e-07
paste( (0.2*100)*0.122, "percent increase")
## [1] "2.44 percent increase"
If “prpblck” increases by 20 percentage points, estimated psoda will increase by 2.44%
sq <- lm(log(psoda)~prpblck+log(income)+prppov, data = discrim)
sq
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
##
## Coefficients:
## (Intercept) prpblck log(income) prppov
## -1.46333 0.07281 0.13696 0.38036
Adding prppov causes the prpblck coefficient to fall to 0.0738.
cor(log(discrim$income), discrim$prppov, method = "pearson", use = "complete.obs")
## [1] -0.838467
The correlation is approximately -0.838. This makes sense, because one would expect that declines in income would result in higher poverty rates.
htv <- paste(data.path, "htv.RData", sep="")
load(htv)
htv <- data
santa <- range(htv$educ) #What is the range of the educ variable in the sample?
print(santa[2]-santa[1])
## [1] 14
claus <- count(htv$educ <= 12) # What percentage of men completed twelfth grade but no higher grade?
is <- claus[1,2]
coming <- claus[2,2]
to <- is + coming
town <- coming/to
town
## [1] 0.5674797
mean(htv$educ) #Do the men or their parents have, on average, higher levels of education?
## [1] 13.0374
(mean(htv$motheduc) + mean(htv$fatheduc))/2
## [1] 12.3126
The range of the education variable is 14 and the percentage of men who completed twelfth grade, but no higher, is 56.74%. The average education of the men in the sample is 13.04 years, whereas the average education of their parents is 12.31; therefore, the men on average have higher levels of education.
aobc <- lm(educ~motheduc+fatheduc, data = htv)
aobc
##
## Call:
## lm(formula = educ ~ motheduc + fatheduc, data = htv)
##
## Coefficients:
## (Intercept) motheduc fatheduc
## 6.9644 0.3042 0.1903
summary(aobc)
##
## Call:
## lm(formula = educ ~ motheduc + fatheduc, data = htv)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2898 -0.9400 -0.4037 1.1239 8.1672
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.96435 0.31982 21.776 <2e-16 ***
## motheduc 0.30420 0.03193 9.528 <2e-16 ***
## fatheduc 0.19029 0.02228 8.539 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.042 on 1227 degrees of freedom
## Multiple R-squared: 0.2493, Adjusted R-squared: 0.248
## F-statistic: 203.7 on 2 and 1227 DF, p-value: < 2.2e-16
Approximately 24.8% of the variation in education is explained by parent’s education. The coefficient on motheduc explains that for every annual increase in a mother’s education, her son’s education will tend to similarly increase by 0.3042 years.
nflx <- lm(educ~motheduc+fatheduc+abil, data = htv)
nflx
##
## Call:
## lm(formula = educ ~ motheduc + fatheduc + abil, data = htv)
##
## Coefficients:
## (Intercept) motheduc fatheduc abil
## 8.4487 0.1891 0.1111 0.5025
summary(nflx)
##
## Call:
## lm(formula = educ ~ motheduc + fatheduc + abil, data = htv)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.407 -1.195 -0.199 1.076 7.012
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.44869 0.28954 29.180 < 2e-16 ***
## motheduc 0.18913 0.02851 6.635 4.87e-11 ***
## fatheduc 0.11109 0.01988 5.586 2.85e-08 ***
## abil 0.50248 0.02572 19.538 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.784 on 1226 degrees of freedom
## Multiple R-squared: 0.4275, Adjusted R-squared: 0.4261
## F-statistic: 305.2 on 3 and 1226 DF, p-value: < 2.2e-16
Incorporating “ability” into the model does help to explain the variation of “educ” after controlling for parents’ education, and it increases the explanatory significance of the model to 42.61%.
abil.sq <- htv$abil^2
amd <- lm(educ~motheduc+fatheduc+abil+abil.sq, data = htv)
amd
##
## Call:
## lm(formula = educ ~ motheduc + fatheduc + abil + abil.sq, data = htv)
##
## Coefficients:
## (Intercept) motheduc fatheduc abil abil.sq
## 8.2402 0.1901 0.1089 0.4015 0.0506
amd.df <- as.data.frame(amd[1])
amd.df
## coefficients
## (Intercept) 8.24022643
## motheduc 0.19012613
## fatheduc 0.10893866
## abil 0.40146240
## abil.sq 0.05059901
abil.co <- amd.df[4,]
abil.co.sq <- amd.df[5,]
min.abil <- -abil.co / (abil.co.sq*2)
min.abil
## [1] -3.967098
The predicted education is minimized at an ability level of -3.9671
count(htv$abil <= min.abil)
## x freq
## 1 FALSE 1215
## 2 TRUE 15
Only 15 out of the 1230 men have ability levels of less than -3.967. This is important because it shows that only a small fraction of men have an ability level of less than this value while also having education that is >=0.
educ.est <- amd.df[1,] + (amd.df[2, ]*mean(htv$motheduc)) + (amd.df[3,]*mean(htv$fatheduc)) + (amd.df[4,]*htv$abil) + amd.df[5,]*abil.sq
htv %>% ggplot(aes(x = abil, y = educ.est))+
geom_point(size = 1, color = "blue" )+
theme_pubr()+
ylab("Predicted Education")+
xlab("Ability Level")