#load data
meteorite_data <-read.csv("Data/meteorite_cleaned.csv")
meteorite_data <- as.data.frame(meteorite_data)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#groupby year and count total landings and avg mass for each year
yearly_counts <- meteorite_data %>%
group_by(year) %>%
summarise(count=n(), mean = mean(mass))
yearly_counts
## # A tibble: 252 x 3
## year count mean
## <int> <int> <dbl>
## 1 601 1 376
## 2 860 1 472
## 3 1399 1 107000
## 4 1490 1 103.
## 5 1491 1 127000
## 6 1575 1 50000000
## 7 1583 1 15000
## 8 1600 1 10100000
## 9 1621 1 1967
## 10 1623 1 10400
## # ... with 242 more rows
#remove year 601 and 860 observations
(yearly_counts <- yearly_counts[-c(1,2), ])
## # A tibble: 250 x 3
## year count mean
## <int> <int> <dbl>
## 1 1399 1 107000
## 2 1490 1 103.
## 3 1491 1 127000
## 4 1575 1 50000000
## 5 1583 1 15000
## 6 1600 1 10100000
## 7 1621 1 1967
## 8 1623 1 10400
## 9 1628 1 29000
## 10 1632 1 1040
## # ... with 240 more rows
library("ggpubr")
## Loading required package: ggplot2
cor(yearly_counts$year, yearly_counts$count, method = c("pearson"))
## [1] 0.3627341
#sd(yearly_counts$mean)
lambda = (-0.2) #optimal lamda using box cox
yearly_counts$count_transformed = (yearly_counts$count^(lambda)-1)/lambda
lm_model <- lm(count_transformed ~ year+mean, data = yearly_counts)
summary(lm_model)
##
## Call:
## lm(formula = count_transformed ~ year + mean, data = yearly_counts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.05153 -0.21881 -0.02928 0.27272 2.83204
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.624e+01 7.040e-01 -23.070 <2e-16 ***
## year 9.583e-03 3.741e-04 25.615 <2e-16 ***
## mean 2.286e-08 1.153e-08 1.983 0.0484 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.579 on 247 degrees of freedom
## Multiple R-squared: 0.731, Adjusted R-squared: 0.7289
## F-statistic: 335.7 on 2 and 247 DF, p-value: < 2.2e-16
(summary(lm_model)$sigma)
## [1] 0.57895
library(purrr) # v. 0.3.4
library(broom) # v. 0.5.6
library(dplyr) # v. 1.0.0
library(ggplot2) # v. 3.3.1
set.seed(16)
#set true vals
nrep = 252 #number of observations
b0 = -1.624e+01
b1 = 9.583e-03
b2 = 2.286e-08
sd =summary(lm_model)$sigma
sd_year = sd(yearly_counts$year)
sd_mass = sd(yearly_counts$mean)
mean_year = mean(yearly_counts$year)
mean_mass = mean(yearly_counts$mean)
#simulate the random errors
eps = rnorm(n = nrep, mean = 0, sd = sd)
#generate the years
year = rnorm(n = nrep,mean = mean_year, sd = sd_year)
#generate average mass
avgmass = rnorm(n = nrep,mean=mean_mass, sd= sd_mass)
#simulate response
yearlycount = b0 + (b1*year) + (b2*avgmass) + eps
# check correlations
pairs(cbind(yearlycount,year,avgmass), col = 4)
#create the model
m1=lm(yearlycount~year+avgmass)
summary(m1)
##
## Call:
## lm(formula = yearlycount ~ year + avgmass)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.34927 -0.35448 -0.02374 0.39278 1.87698
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.575e+01 7.162e-01 -21.992 <2e-16 ***
## year 9.336e-03 3.793e-04 24.612 <2e-16 ***
## avgmass 2.074e-08 1.161e-08 1.787 0.0752 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5748 on 249 degrees of freedom
## Multiple R-squared: 0.7094, Adjusted R-squared: 0.707
## F-statistic: 303.9 on 2 and 249 DF, p-value: < 2.2e-16
#ggplot2::autoplot(m1)
#Normal Q-Q Plot
par(mfrow=c(2,2))
plot(m1,pch =20, cex = 2, col = "aquamarine2")
#create function
sim_fun = function(nrep = 252, b0 = -1.624e+01, b1 = 9.583e-03,b2 = 2.286e-08, sigma =0.57895) {
eps = rnorm(n = nrep, mean = 0, sd = sigma)
year = rnorm(n = nrep,mean = 1706, sd = 177.6795)
avgmass = rnorm(n = nrep,mean=423335.7, sd= 3267472)
yearlycount = b0 + (b1*year) + (b2*avgmass) + eps
simdat = data.frame(yearlycount,year, avgmass)
fit = lm(yearlycount~year+avgmass, data = simdat)
fit
}
set.seed(11)
sim_fun()
##
## Call:
## lm(formula = yearlycount ~ year + avgmass, data = simdat)
##
## Coefficients:
## (Intercept) year avgmass
## -1.564e+01 9.236e-03 8.194e-09
#simulate many times
sims = replicate(n = 1000, sim_fun(), simplify = FALSE )
sims[[1]]
##
## Call:
## lm(formula = yearlycount ~ year + avgmass, data = simdat)
##
## Coefficients:
## (Intercept) year avgmass
## -1.614e+01 9.505e-03 2.102e-08
Distribution of \(\beta_1\):
sims %>%
map_df(tidy) %>%
filter(term == "year") %>%
ggplot( aes(x = estimate) ) +
geom_density(fill = "darkgrey", alpha = .5, color = "darkgrey") +
geom_vline( aes(xintercept = b1, color ="true"))+ #true b1
geom_vline( aes(xintercept = mean(estimate), color = "estimate"))
Distribution of \(\beta_2\):
sims %>%
map_df(tidy) %>%
filter(term == "avgmass") %>%
ggplot( aes(x = estimate) ) +
geom_density(fill = "darkgrey", alpha = .5, color = "darkgrey") +
geom_vline( aes(xintercept = b2, color ="true"))+ #true b2
geom_vline( aes(xintercept = mean(estimate), color = "estimate"))
#original model
tidy(m1)
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.58e+1 0.716 -22.0 2.76e-60
## 2 year 9.34e-3 0.000379 24.6 1.23e-68
## 3 avgmass 2.07e-8 0.0000000116 1.79 7.52e- 2
summary(m1)$sigma
## [1] 0.5747692
sims %>%
map_dbl(~summary(.x)$sigma) %>%
data.frame(sigma = .) %>%
ggplot(aes(x = sigma) ) +
geom_density(fill = "darkgrey", alpha = .75, color = "darkgrey") +
geom_vline( aes(xintercept = sd, color ="true"))+ #true b2
geom_vline( aes(xintercept = mean(sigma), color = "estimate"))
The standard deviation is underestimated a bit more than 50% of the time:
sims %>%
map_dbl(~summary(.x)$sigma) %>%
{. < sd} %>%
mean()
## [1] 0.54
sims %>%
map_df(tidy) %>%
filter(term == "year") %>%
pull(p.value) %>%
{. < 0.05} %>%
mean()
## [1] 1
sims %>%
map_df(tidy) %>%
filter(term == "avgmass") %>%
pull(p.value) %>%
{. < 0.05} %>%
mean()
## [1] 0.555