library(ggplot2) library(tidyr) library(dplyr) ## EXERCISE 4
library(foreign)
data <- read.dta("https://www.dropbox.com/sh/rqmo1hvij1veff0/AADZ4MZhDDk9R8sFSjBvmcRma/WAGE1.DTA?dl=1")
## regressions of wages on education.
mod <- lm(wage~educ,data)
summary(mod)
##
## Call:
## lm(formula = wage ~ educ, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3396 -2.1501 -0.9674 1.1921 16.6085
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.90485 0.68497 -1.321 0.187
## educ 0.54136 0.05325 10.167 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.378 on 524 degrees of freedom
## Multiple R-squared: 0.1648, Adjusted R-squared: 0.1632
## F-statistic: 103.4 on 1 and 524 DF, p-value: < 2.2e-16
## regression of tenure on education
mod2 <- lm(tenure~educ,data)
summary(mod2)
##
## Call:
## lm(formula = tenure ~ educ, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.946 -4.894 -2.601 1.520 38.813
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.9457 1.4638 4.745 2.69e-06 ***
## educ -0.1466 0.1138 -1.288 0.198
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.22 on 524 degrees of freedom
## Multiple R-squared: 0.003155, Adjusted R-squared: 0.001253
## F-statistic: 1.659 on 1 and 524 DF, p-value: 0.1984
## this is not significant
library(foreign)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
auto <- read.dta("https://www.dropbox.com/sh/rqmo1hvij1veff0/AACNkMy47ilXAMh3nmiIs_Bqa/auto.dta?dl=1")
p = ggplot(auto, aes(x=price, y=weight)) + geom_point(colour="blue")+
geom_text(aes(label=auto$make), hjust=0, vjust=0)
print(p)
## or - Too much data so looks ugly but good to know for future
library(ggrepel)
t <- ggplot(auto, aes(x=price, y=weight)) + geom_point(colour="blue", size = 3)
t + geom_label_repel(aes(label=auto$make),
box.padding=0.005,
point.padding =0.005,
segment.color ='grey50') + theme_classic()
print(t)
## regression on price on weight
reg <- lm(price~weight,auto)
summary(reg)
##
## Call:
## lm(formula = price ~ weight, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3341.9 -1828.3 -624.1 1232.1 7143.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.7074 1174.4296 -0.006 0.995
## weight 2.0441 0.3768 5.424 7.42e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2502 on 72 degrees of freedom
## Multiple R-squared: 0.2901, Adjusted R-squared: 0.2802
## F-statistic: 29.42 on 1 and 72 DF, p-value: 7.416e-07
## this is how to predict prices from new weight values
##create a data frame with the new weights
new.weight <- data.frame(
weight =c(2500, 3500, 3000,4102.3110684,500)
)
##then use predict(model,newdata = new df)
new.cost <- predict(reg, newdata = new.weight)
new.cost[4]-new.cost[3]
## 4
## 2253.193
##this is the price for a 500lb car but the lightest on in the model was 1760lb so this is unlikely to be accurate
new.cost[5]
## 5
## 1015.324
##Monte carlo experiment
beta <- double(1000)
obs <- 100
for (i in 1:1000) {
## rnorm tells r that this is normally distributed - data.frame make variable eps into dataframe
df <- data.frame(eps=rnorm(obs))
df <- df %>% mutate(X=runif(obs))
head(df,10)
##Now we have added Y according to the formula Y=2+0.5X+ϵ
df <- df%>% mutate(Y= 2+X/2+eps)
beta[i] <- lm(Y~X, df)$coefficients[[2]]
}
mean(beta)
## [1] 0.50406
results = data.frame(beta)
histo <- ggplot(results, aes(x=beta)) +
geom_histogram(color="blue", fill="white",bins=40) +
theme_minimal()
print(histo)
##Monte carlo pt 2
obs <- 100
beta2 <- double(1000)
for (i in 1:1000){
df2 <- data.frame(eps=rnorm(obs))
df2 <- df2 %>% mutate(X=runif(obs)*eps)
df2 <- df2 %>% mutate (Y=2+0.5*X+eps)
beta2[i] <- lm(Y~X,df)$coefficients[[2]] }
mean(beta2)
## [1] 0.6339214
hist(beta2, 50)