library(ggplot2) library(tidyr) library(dplyr) ## EXERCISE 4

library(foreign)
data <- read.dta("https://www.dropbox.com/sh/rqmo1hvij1veff0/AADZ4MZhDDk9R8sFSjBvmcRma/WAGE1.DTA?dl=1")
## regressions of wages on education. 
mod <- lm(wage~educ,data)
summary(mod)
## 
## Call:
## lm(formula = wage ~ educ, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3396 -2.1501 -0.9674  1.1921 16.6085 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.90485    0.68497  -1.321    0.187    
## educ         0.54136    0.05325  10.167   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.378 on 524 degrees of freedom
## Multiple R-squared:  0.1648, Adjusted R-squared:  0.1632 
## F-statistic: 103.4 on 1 and 524 DF,  p-value: < 2.2e-16
## regression of tenure on education 
mod2 <- lm(tenure~educ,data)
summary(mod2)
## 
## Call:
## lm(formula = tenure ~ educ, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.946 -4.894 -2.601  1.520 38.813 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.9457     1.4638   4.745 2.69e-06 ***
## educ         -0.1466     0.1138  -1.288    0.198    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.22 on 524 degrees of freedom
## Multiple R-squared:  0.003155,   Adjusted R-squared:  0.001253 
## F-statistic: 1.659 on 1 and 524 DF,  p-value: 0.1984
## this is not significant 
library(foreign)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
auto <- read.dta("https://www.dropbox.com/sh/rqmo1hvij1veff0/AACNkMy47ilXAMh3nmiIs_Bqa/auto.dta?dl=1")

p = ggplot(auto, aes(x=price, y=weight)) + geom_point(colour="blue")+ 
  geom_text(aes(label=auto$make), hjust=0, vjust=0)
print(p)

## or - Too much data so looks ugly but good to know for future 
library(ggrepel)
t <- ggplot(auto, aes(x=price, y=weight)) + geom_point(colour="blue", size = 3)
t + geom_label_repel(aes(label=auto$make),
                     box.padding=0.005,
                     point.padding =0.005,
                     segment.color ='grey50') + theme_classic()

print(t)

## regression on price on weight 
reg <- lm(price~weight,auto)
summary(reg)
## 
## Call:
## lm(formula = price ~ weight, data = auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3341.9 -1828.3  -624.1  1232.1  7143.7 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -6.7074  1174.4296  -0.006    0.995    
## weight         2.0441     0.3768   5.424 7.42e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2502 on 72 degrees of freedom
## Multiple R-squared:  0.2901, Adjusted R-squared:  0.2802 
## F-statistic: 29.42 on 1 and 72 DF,  p-value: 7.416e-07
## this is how to predict prices from new weight values 
##create a data frame with the new weights 
new.weight <- data.frame(
  weight =c(2500, 3500, 3000,4102.3110684,500)
)
##then use predict(model,newdata = new df)
new.cost <- predict(reg, newdata = new.weight)

new.cost[4]-new.cost[3]
##        4 
## 2253.193
##this is the price for a 500lb car but the lightest on in the model was 1760lb so this is unlikely to be accurate
new.cost[5]
##        5 
## 1015.324

##Monte carlo experiment

beta <- double(1000)
obs <- 100 
for (i in 1:1000) {
## rnorm tells r that this is normally distributed - data.frame make variable eps into dataframe
df <- data.frame(eps=rnorm(obs))
df <-  df %>% mutate(X=runif(obs))
head(df,10)
##Now we have added Y according to the formula Y=2+0.5X+ϵ
df <- df%>% mutate(Y= 2+X/2+eps)
beta[i] <- lm(Y~X, df)$coefficients[[2]]
}

mean(beta)
## [1] 0.50406
results = data.frame(beta)
histo <- ggplot(results, aes(x=beta)) + 
  geom_histogram(color="blue", fill="white",bins=40) +
  theme_minimal()
print(histo)

##Monte carlo pt 2

obs <- 100
beta2 <- double(1000)
for (i in 1:1000){
df2 <- data.frame(eps=rnorm(obs))
df2 <- df2 %>% mutate(X=runif(obs)*eps)
df2 <- df2 %>% mutate (Y=2+0.5*X+eps)
beta2[i] <- lm(Y~X,df)$coefficients[[2]] }
mean(beta2)
## [1] 0.6339214
hist(beta2, 50)