Original model

#load data 
meteorite_data <-read.csv("Data/meteorite_cleaned.csv")
meteorite_data <- as.data.frame(meteorite_data)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#groupby year and count total landings and avg mass for each year 
yearly_counts <- meteorite_data %>%
                 group_by(year) %>%
                 summarise(count=n(), mean = mean(mass))
yearly_counts
## # A tibble: 252 x 3
##     year count      mean
##    <int> <int>     <dbl>
##  1   601     1      376 
##  2   860     1      472 
##  3  1399     1   107000 
##  4  1490     1      103.
##  5  1491     1   127000 
##  6  1575     1 50000000 
##  7  1583     1    15000 
##  8  1600     1 10100000 
##  9  1621     1     1967 
## 10  1623     1    10400 
## # ... with 242 more rows
#remove year 601 and 860 observations 
(yearly_counts <- yearly_counts[-c(1,2), ])
## # A tibble: 250 x 3
##     year count      mean
##    <int> <int>     <dbl>
##  1  1399     1   107000 
##  2  1490     1      103.
##  3  1491     1   127000 
##  4  1575     1 50000000 
##  5  1583     1    15000 
##  6  1600     1 10100000 
##  7  1621     1     1967 
##  8  1623     1    10400 
##  9  1628     1    29000 
## 10  1632     1     1040 
## # ... with 240 more rows
library("ggpubr")
## Loading required package: ggplot2
cor(yearly_counts$year, yearly_counts$count, method = c("pearson"))
## [1] 0.3627341
#sd(yearly_counts$mean)
lambda = (-0.2) #optimal lamda using box cox 

yearly_counts$count_transformed = (yearly_counts$count^(lambda)-1)/lambda

lm_model <- lm(count_transformed ~ year+mean, data = yearly_counts)
summary(lm_model)
## 
## Call:
## lm(formula = count_transformed ~ year + mean, data = yearly_counts)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.05153 -0.21881 -0.02928  0.27272  2.83204 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.624e+01  7.040e-01 -23.070   <2e-16 ***
## year         9.583e-03  3.741e-04  25.615   <2e-16 ***
## mean         2.286e-08  1.153e-08   1.983   0.0484 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.579 on 247 degrees of freedom
## Multiple R-squared:  0.731,  Adjusted R-squared:  0.7289 
## F-statistic: 335.7 on 2 and 247 DF,  p-value: < 2.2e-16
(summary(lm_model)$sigma)
## [1] 0.57895

Simulation

library(purrr) # v. 0.3.4
library(broom) # v. 0.5.6
library(dplyr) # v. 1.0.0
library(ggplot2) # v. 3.3.1

set.seed(16)

#set true vals 
nrep = 252 #number of observations 
b0 = -1.624e+01
b1 = 9.583e-03
b2 = 2.286e-08
sd =summary(lm_model)$sigma
sd_year = sd(yearly_counts$year) 
sd_mass = sd(yearly_counts$mean)

mean_year = mean(yearly_counts$year) 
mean_mass = mean(yearly_counts$mean)


#simulate the random errors
eps = rnorm(n = nrep, mean = 0, sd = sd) 

#generate the years
year = rnorm(n = nrep,mean = mean_year, sd = sd_year)

#generate average mass 
avgmass = rnorm(n = nrep,mean=mean_mass, sd= sd_mass)
  
#simulate response 
yearlycount  = b0 + (b1*year) + (b2*avgmass) + eps
# check correlations 
pairs(cbind(yearlycount,year,avgmass), col = 4)

#create the model
m1=lm(yearlycount~year+avgmass)
summary(m1)
## 
## Call:
## lm(formula = yearlycount ~ year + avgmass)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.34927 -0.35448 -0.02374  0.39278  1.87698 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.575e+01  7.162e-01 -21.992   <2e-16 ***
## year         9.336e-03  3.793e-04  24.612   <2e-16 ***
## avgmass      2.074e-08  1.161e-08   1.787   0.0752 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5748 on 249 degrees of freedom
## Multiple R-squared:  0.7094, Adjusted R-squared:  0.707 
## F-statistic: 303.9 on 2 and 249 DF,  p-value: < 2.2e-16
#ggplot2::autoplot(m1)
#Normal Q-Q Plot
par(mfrow=c(2,2))
plot(m1,pch =20, cex = 2, col = "aquamarine2")

#create function
sim_fun = function(nrep = 252, b0 = -1.624e+01, b1 = 9.583e-03,b2 = 2.286e-08, sigma =0.57895) {
  
     eps = rnorm(n = nrep, mean = 0, sd = sigma)
     year = rnorm(n = nrep,mean = 1706, sd = 177.6795)
     avgmass = rnorm(n = nrep,mean=423335.7, sd= 3267472)
     yearlycount  = b0 + (b1*year) + (b2*avgmass) + eps
     
     simdat = data.frame(yearlycount,year, avgmass)
     fit = lm(yearlycount~year+avgmass, data = simdat)
     fit
}
set.seed(11)
sim_fun()
## 
## Call:
## lm(formula = yearlycount ~ year + avgmass, data = simdat)
## 
## Coefficients:
## (Intercept)         year      avgmass  
##  -1.564e+01    9.236e-03    8.194e-09

Repeat the simulation many times

#simulate many times
sims = replicate(n = 1000, sim_fun(), simplify = FALSE )
sims[[1]]
## 
## Call:
## lm(formula = yearlycount ~ year + avgmass, data = simdat)
## 
## Coefficients:
## (Intercept)         year      avgmass  
##  -1.614e+01    9.505e-03    2.102e-08

Estimated regression coefficients

Distribution of \(\beta_1\):

sims %>%
     map_df(tidy) %>%
     filter(term == "year") %>%
     ggplot( aes(x = estimate) ) +
          geom_density(fill = "darkgrey", alpha = .5, color = "darkgrey") +
          geom_vline( aes(xintercept = b1, color ="true"))+ #true b1
          geom_vline( aes(xintercept = mean(estimate), color = "estimate"))

Distribution of \(\beta_2\):

sims %>%
     map_df(tidy) %>%
     filter(term == "avgmass") %>%
     ggplot( aes(x = estimate) ) +
          geom_density(fill = "darkgrey", alpha = .5, color = "darkgrey") +
          geom_vline( aes(xintercept = b2, color ="true"))+ #true b2
          geom_vline( aes(xintercept = mean(estimate), color = "estimate"))

#original model
tidy(m1)
## # A tibble: 3 x 5
##   term        estimate    std.error statistic  p.value
##   <chr>          <dbl>        <dbl>     <dbl>    <dbl>
## 1 (Intercept) -1.58e+1 0.716           -22.0  2.76e-60
## 2 year         9.34e-3 0.000379         24.6  1.23e-68
## 3 avgmass      2.07e-8 0.0000000116      1.79 7.52e- 2
summary(m1)$sigma
## [1] 0.5747692

Estimated standard deviation

sims %>%
     map_dbl(~summary(.x)$sigma) %>%
     data.frame(sigma = .) %>%
     ggplot(aes(x = sigma) ) +
          geom_density(fill = "darkgrey", alpha = .75, color = "darkgrey") +
          geom_vline( aes(xintercept = sd, color ="true"))+ #true b2
          geom_vline( aes(xintercept = mean(sigma), color = "estimate"))

The standard deviation is underestimated a bit more than 50% of the time:

sims %>%
     map_dbl(~summary(.x)$sigma) %>%
     {. < sd} %>%
     mean()
## [1] 0.54

Extract hypothesis test results

sims %>%
     map_df(tidy) %>%
     filter(term == "year") %>%
     pull(p.value) %>%
     {. <  0.05} %>%
     mean()
## [1] 1
sims %>%
     map_df(tidy) %>%
     filter(term == "avgmass") %>%
     pull(p.value) %>%
     {. <  0.05} %>%
     mean()
## [1] 0.555