# Problem 1
library(gamair)
data(hubble)
# Figure 1.1 from Wood 2006
plot(hubble$x,hubble$y,xlab="Distance (Mpc)",
     ylab=expression("Velocity (km"*s^{-1}*")"))
HubbleLM = lm(hubble$y~hubble$x)
summary(HubbleLM)
##
## Call:
## lm(formula = hubble$y ~ hubble$x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -735.14 -132.92 -22.57 171.74 558.61
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.696 126.557 0.053 0.958
## hubble$x 76.127 9.493 8.019 5.68e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 264.7 on 22 degrees of freedom
## Multiple R-squared: 0.7451, Adjusted R-squared: 0.7335
## F-statistic: 64.3 on 1 and 22 DF, p-value: 5.677e-08
coef(HubbleLM)
## (Intercept) hubble$x
## 6.696255 76.126957
abline(HubbleLM)

confint(HubbleLM)
## 2.5 % 97.5 %
## (Intercept) -255.7670 269.15950
## hubble$x 56.4387 95.81521
HubbleLM2 = glm(hubble$y~hubble$x)
summary(HubbleLM2)
##
## Call:
## glm(formula = hubble$y ~ hubble$x)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -735.14 -132.92 -22.57 171.74 558.61
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.696 126.557 0.053 0.958
## hubble$x 76.127 9.493 8.019 5.68e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 70084.97)
##
## Null deviance: 6048498 on 23 degrees of freedom
## Residual deviance: 1541869 on 22 degrees of freedom
## AIC: 339.8
##
## Number of Fisher Scoring iterations: 2
coef(HubbleLM2)
## (Intercept) hubble$x
## 6.696255 76.126957
# Set up a response vector and design matrix by hand for a maximum-likelihood
# fit. Note: HubX is hubble$x, and hubble$x is also the first column of
# MAHubble, which is why lm() below reports an essentially perfect fit.
HubX = c(hubble$x)
MAHubble = matrix(c(hubble$x,hubble$y),nrow=24,ncol=2, byrow = FALSE)
N = length(hubble$x)
# Normal equations for beta, then the ML estimate of the variance.
beta <- solve(t(MAHubble)%*%MAHubble)%*%t(MAHubble)%*%HubX
sigma2 <- (1/N)*t(HubX-MAHubble%*%beta)%*%(HubX-MAHubble%*%beta)
# Negative log-likelihood of the Gaussian linear model in (beta, sigma2).
negloglik <- function(par){
  beta <- par[1:2]
  sigma2 <- par[3]
  # sqrt(sigma2) produces NaN whenever the optimizer proposes sigma2 < 0
  -sum(dnorm(HubX,MAHubble%*%beta,sqrt(sigma2),log=TRUE))
}
optim(par=c(0,0,1),fn=negloglik,method = "Nelder-Mead")
## Warning in sqrt(sigma2): NaNs produced (repeated 85 times)
## $par
## [1] 1.000000e+00 2.889785e-10 1.311469e-14
##
## $value
## [1] -356.1973
##
## $counts
## function gradient
## 502 NA
##
## $convergence
## [1] 1
##
## $message
## NULL
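# The convergence code of 1 and the run of NaN warnings point to two problems:
# Nelder-Mead is free to propose negative values of sigma2, and the response
# being fit is hubble$x rather than hubble$y. A minimal corrected sketch (not
# run), assuming the goal is the ML fit of velocity on distance; the names y2,
# X2, and negloglik2 are mine:
y2 <- hubble$y
X2 <- cbind(1, hubble$x)   # intercept column plus distance
negloglik2 <- function(par){
  beta <- par[1:2]
  sigma <- exp(par[3])     # optimize over log(sigma) so the sd stays positive
  -sum(dnorm(y2, X2 %*% beta, sigma, log = TRUE))
}
fit <- optim(par = c(0, 70, log(sd(y2))), fn = negloglik2, method = "Nelder-Mead")
fit$par[1:2]               # should land close to coef(HubbleLM)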
M2 = lm(HubX~MAHubble-1)
summary(M2)
## Warning in summary.lm(M2): essentially perfect fit: summary may be unreliable
##
## Call:
## lm(formula = HubX ~ MAHubble - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.415e-14 -5.116e-16 -6.200e-18 5.252e-16 2.383e-15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## MAHubble1 1.000e+00 2.013e-16 4.967e+15 <2e-16 ***
## MAHubble2 -3.672e-19 2.552e-18 -1.440e-01 0.887
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.169e-15 on 22 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 2.124e+32 on 2 and 22 DF, p-value: < 2.2e-16
coef(M2)
## MAHubble1 MAHubble2
## 1.000000e+00 -3.672151e-19
summary(M2)$sigma^2
## Warning in summary.lm(M2): essentially perfect fit: summary may be unreliable
## [1] 1.003956e-29
# Velocity = 2.889785e-10 * 60*60*24*365 = 9.113229e-3
# Distance = 1.311469e-14 Mpc = 3.09e19 km/Mpc * 1.311469e-14 = 405243.9 km
# B = 9.113229e-3 / 405243.9 = 2.248826e-8
# 1 / 2.248826e-8 = 44,467,647, i.e. about 44 million
# I have attempted at least five variations on the code above. 44 million
# years is the most reasonable answer I have gotten so far. I have also tried
# to solve the problem using linear models. The other answers have ranged from
# 14 quadrillion years to just over a month. I do not understand this. I'm not
# even sure what I'd run a confidence interval on. That said, for the sake of
# answering the question, I will run a confidence interval on the linear model
# "HubbleLM".
confint(HubbleLM)
## 2.5 % 97.5 %
## (Intercept) -255.7670 269.15950
## hubble$x 56.4387 95.81521
# This indicates that a reasonable starting assumption for the velocity of a
# galaxy in the exact same position as our own is somewhere between about -255
# and about 269 kilometers per second. From there, every additional megaparsec
# of distance drives up velocity by between 56.4 and 95.8 kilometers per
# second.
# While there do exist a small number of galaxies believed to be on a
# trajectory towards our own, they are all already extremely close. This
# indicates that the model might break down within certain ranges of distance.
# Using the linear model I was able to get a different answer:
# 5 Mpc = 5 * 3.09e19 km = 1.545e20 km
# Velocity per year = 6.696255 + 76.126957 * 1.545e20
# Convert to seconds: (6.696255 + 76.126957 * 1.545e20) / 365 / 24 / 60 / 60
# This leads me to 372,958,400,000,000 (about 3.7e14)
# This is also wrong, and doesn't fit with the data, at least as far as I can understand. I'm not sure how else to pursue this question.
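# For comparison, a sketch (not run) of the textbook approach from Wood 2006:
# Hubble's law has no intercept, so fit velocity on distance through the
# origin, convert the slope from km/s/Mpc to 1/s using 1 Mpc ~ 3.09e19 km,
# and invert to get an age.
hub.fit <- lm(y ~ x - 1, data = hubble)
beta.hat <- coef(hub.fit)                 # km/s per Mpc
hub.sec <- beta.hat / 3.09e19             # Hubble constant in units of 1/s
age.years <- (1 / hub.sec) / (60^2 * 24 * 365)
age.years                                 # roughly 13 billion years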
#######
# Problem 2
df.challenger <- read.csv("https://www.dropbox.com/s/ezxj8d48uh7lzhr/challenger.csv?dl=1")
plot(df.challenger$Temp,df.challenger$O.ring,xlab="Temperature",ylab="Number of incidents")
# Model: N = B0 + B1*T
# N  = average number of O-ring failures on a launch day with the given conditions
# T  = outside temperature at time of launch
# B0 = the intercept of the model
# B1 = the slope on T: how much each additional degree of temperature changes
#      the average number of O-ring failures on the given day
ChallengerML = lm(df.challenger$O.ring ~ df.challenger$Temp)
summary(ChallengerML)
##
## Call:
## lm(formula = df.challenger$O.ring ~ df.challenger$Temp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6582 -0.4389 -0.1594 0.1551 1.9058
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.79365 1.40926 3.402 0.00269 **
## df.challenger$Temp -0.06266 0.02016 -3.108 0.00532 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6673 on 21 degrees of freedom
## Multiple R-squared: 0.3151, Adjusted R-squared: 0.2825
## F-statistic: 9.661 on 1 and 21 DF, p-value: 0.005321
coef(ChallengerML)
## (Intercept) df.challenger$Temp
## 4.79365079 -0.06265873
abline(ChallengerML)

confint(ChallengerML)
## 2.5 % 97.5 %
## (Intercept) 1.8629349 7.72436672
## df.challenger$Temp -0.1045819 -0.02073552
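# To translate these intervals into expected failures at a specific
# temperature, a sketch (not run; refit with a data argument so predict()
# accepts newdata, and 53 F is just an illustrative cold value):
ChallengerML2 <- lm(O.ring ~ Temp, data = df.challenger)
predict(ChallengerML2, newdata = data.frame(Temp = 53), interval = "confidence")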
t.test(x = df.challenger$O.ring, y = df.challenger$Temp, paired = TRUE)
##
## Paired t-test
##
## data: df.challenger$O.ring and df.challenger$Temp
## t = -44.043, df = 22, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -72.38559 -65.87528
## sample estimates:
## mean of the differences
## -69.13043
# We reject the null hypothesis. Strictly, though, this paired test compares
# the mean of O.ring with the mean of Temp, two variables on entirely
# different scales, so the rejection does not by itself show that temperature
# affects per-launch O-ring failures; the slope t-test in the regression above
# (p = 0.0053) is the more direct evidence. It also does not tell us whether
# that effect is a positive or a negative correlation.
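# A direct test of association would be a correlation test rather than a
# paired t-test on the two means; a sketch (not run):
cor.test(df.challenger$O.ring, df.challenger$Temp)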
t.test(x = df.challenger$O.ring, y = df.challenger$Temp, alternative = "less")
##
## Welch Two Sample t-test
##
## data: df.challenger$O.ring and df.challenger$Temp
## t = -46.689, df = 22.548, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -66.59067
## sample estimates:
## mean of x mean of y
## 0.4347826 69.5652174
# We again reject the null hypothesis, but this one-sided test only says that
# the mean of O.ring is less than the mean of Temp, which is unsurprising
# given their scales; it is not direct evidence about the sign of the
# correlation. The sign comes from the regression: the slope estimate is
# negative, so the colder the temperature, the more expected average O-ring
# failures.
# According to the regression, launch-day temperature has a statistically
# significant effect on average O-ring failures. As temperature decreases,
# O-ring failures increase. Specifically, for each degree lower, average
# failures per launch increase by about 0.0627, and a reasonable range for the
# effect of temperature is between about 0.0207 and about 0.1046 failures per
# degree lost.
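# Since O.ring is a count, a Poisson GLM would be a natural cross-check; a
# sketch (not run). The 31 F below, roughly the temperature on the morning of
# the Challenger launch, is an extrapolation well outside the observed range,
# so the prediction should be treated cautiously.
ChallengerPois <- glm(O.ring ~ Temp, data = df.challenger, family = poisson)
coef(ChallengerPois)
predict(ChallengerPois, newdata = data.frame(Temp = 31), type = "response")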