#Problem 1

library(gamair)
## Warning: package 'gamair' was built under R version 4.1.3
data(hubble)

# Figure 1.1 from Wood 2006
plot(hubble$x,hubble$y,xlab="Distance (Mpc)",
     ylab=expression("Velocity (km"*s^{-1}*")"))

HubbleLM = lm(hubble$y~hubble$x)
summary(HubbleLM)
## 
## Call:
## lm(formula = hubble$y ~ hubble$x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -735.14 -132.92  -22.57  171.74  558.61 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.696    126.557   0.053    0.958    
## hubble$x      76.127      9.493   8.019 5.68e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 264.7 on 22 degrees of freedom
## Multiple R-squared:  0.7451, Adjusted R-squared:  0.7335 
## F-statistic:  64.3 on 1 and 22 DF,  p-value: 5.677e-08
coef(HubbleLM)
## (Intercept)    hubble$x 
##    6.696255   76.126957
abline(HubbleLM)

confint(HubbleLM)
##                 2.5 %    97.5 %
## (Intercept) -255.7670 269.15950
## hubble$x      56.4387  95.81521
HubbleLM2 = glm(hubble$y~hubble$x)
summary(HubbleLM2)
## 
## Call:
## glm(formula = hubble$y ~ hubble$x)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -735.14  -132.92   -22.57   171.74   558.61  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.696    126.557   0.053    0.958    
## hubble$x      76.127      9.493   8.019 5.68e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 70084.97)
## 
##     Null deviance: 6048498  on 23  degrees of freedom
## Residual deviance: 1541869  on 22  degrees of freedom
## AIC: 339.8
## 
## Number of Fisher Scoring iterations: 2
coef(HubbleLM2)
## (Intercept)    hubble$x 
##    6.696255   76.126957
HubX = c(hubble$x)
MAHubble = matrix(c(hubble$x,hubble$y),nrow=24,ncol=2, byrow = FALSE)
N = length(hubble$x)

beta <- solve(t(MAHubble)%*%MAHubble)%*%t(MAHubble)%*%HubX 
sigma2 <- (1/N)*t(HubX-MAHubble%*%beta)%*%(HubX-MAHubble%*%beta)


negloglik <- function(par){
  beta <- par[1:2]
  sigma2 <- par[3]
  -sum(dnorm(HubX,MAHubble%*%beta,sqrt(sigma2),log=TRUE))
}
optim(par=c(0,0,1),fn=negloglik,method = "Nelder-Mead")
## Warning in sqrt(sigma2): NaNs produced
## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced

## Warning in sqrt(sigma2): NaNs produced
## $par
## [1] 1.000000e+00 2.889785e-10 1.311469e-14
## 
## $value
## [1] -356.1973
## 
## $counts
## function gradient 
##      502       NA 
## 
## $convergence
## [1] 1
## 
## $message
## NULL
M2 = lm(HubX~MAHubble-1)
summary(M2)
## Warning in summary.lm(M2): essentially perfect fit: summary may be unreliable
## 
## Call:
## lm(formula = HubX ~ MAHubble - 1)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.415e-14 -5.116e-16 -6.200e-18  5.252e-16  2.383e-15 
## 
## Coefficients:
##             Estimate Std. Error    t value Pr(>|t|)    
## MAHubble1  1.000e+00  2.013e-16  4.967e+15   <2e-16 ***
## MAHubble2 -3.672e-19  2.552e-18 -1.440e-01    0.887    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.169e-15 on 22 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 2.124e+32 on 2 and 22 DF,  p-value: < 2.2e-16
coef(M2)
##     MAHubble1     MAHubble2 
##  1.000000e+00 -3.672151e-19
summary(M2)$sigma^2
## Warning in summary.lm(M2): essentially perfect fit: summary may be unreliable
## [1] 1.003956e-29
#Velocity = .0000000002889786*60*60*24*365 = .009113229
#Distance = .00000000000001311469 Mpc = 30900000000000000000 * .00000000000001311469  = 405243.9 km
#B = .009113229/405243.9 = .00000002248826
#1/.00000002248826 = 44,467,647 = 44 million

#I have attempted at least five variations on the code above. 44 million years is the most reasonable answer I have gotten so far.  I have also tried to solve the problem using linear models.  The other answers have ranged from 14 quadrillion years to just over a month.  I do not understand this.  I'm not even sure what I'd run a confidence interval on.  That said, for the sake of answering the question, I will run a confidence interval on the linear model "HubbleLM".

confint(HubbleLM)
##                 2.5 %    97.5 %
## (Intercept) -255.7670 269.15950
## hubble$x      56.4387  95.81521
#This indicates that a reasonable starting assumption for the velocity of a galaxy in the exact same position as our own is somewhere between about -255 and about 269 kilometers per second.  From there, every additional Megaparsec of distance drives up velocity by between 56.4 and 95.8 kilometers per second
#While there do exist a small number of galaxies believed to be on a trajectory towards our own, they are all already extremely close.  This indicates that the model might break down within certain ranges of distance.

# using the linear model I was able to get a different answer
# 5 Mpc = 5 * (3.09*10^19) = 1.545*10^20
# Velocity per year = 6.696255 + (76.126957 * (1.545*10^20))
# Convert to seconds =  (((((6.696255 + (76.126957 * (1.545*10^20)))/365)/24)/60)/60)
# This leads me to 372,958,400,000,000
# This is also wrong, and doesn't fit with the data, at least as far as I can understand.  I'm not sure how else to pursue this question.


#######

# Problem 2

df.challenger <- read.csv("https://www.dropbox.com/s/ezxj8d48uh7lzhr/challenger.csv?dl=1")
plot(df.challenger$Temp,df.challenger$O.ring,xlab="Temperature",ylab="Number of incidents")

# N = B0 + B1*T
# N = average number of O-ring failiures on a launch day with the given conditions
# T = Outside Temperature at time of launch
# B0 = The intercept of the model.
# B1 = The slope of T.  How much each marginal unit of temperature changes the average number of O-ring failures on the given day.

ChallengerML = lm(df.challenger$O.ring ~ df.challenger$Temp)
summary(ChallengerML)
## 
## Call:
## lm(formula = df.challenger$O.ring ~ df.challenger$Temp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6582 -0.4389 -0.1594  0.1551  1.9058 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)         4.79365    1.40926   3.402  0.00269 **
## df.challenger$Temp -0.06266    0.02016  -3.108  0.00532 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6673 on 21 degrees of freedom
## Multiple R-squared:  0.3151, Adjusted R-squared:  0.2825 
## F-statistic: 9.661 on 1 and 21 DF,  p-value: 0.005321
coef(ChallengerML)
##        (Intercept) df.challenger$Temp 
##         4.79365079        -0.06265873
abline(ChallengerML)

confint(ChallengerML)
##                         2.5 %      97.5 %
## (Intercept)         1.8629349  7.72436672
## df.challenger$Temp -0.1045819 -0.02073552
t.test(x = df.challenger$O.ring, y = df.challenger$Temp, exact = FALSE, paired = TRUE)
## 
##  Paired t-test
## 
## data:  df.challenger$O.ring and df.challenger$Temp
## t = -44.043, df = 22, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -72.38559 -65.87528
## sample estimates:
## mean of the differences 
##               -69.13043
# We reject the null hypothesis.  This means that the above hypothesis test would indicate that temperature has at least some effect on average per-launch O-ring failures.  However, it does not tell us whether that effect is a positive or a negative correlation.

t.test(x = df.challenger$O.ring, y = df.challenger$Temp, alternative = "less")
## 
##  Welch Two Sample t-test
## 
## data:  df.challenger$O.ring and df.challenger$Temp
## t = -46.689, df = 22.548, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##       -Inf -66.59067
## sample estimates:
##  mean of x  mean of y 
##  0.4347826 69.5652174
# According to this test, we reject the null hypothesis.  This means that the above test indicates that the correlation between temperature and average o-ring failures per launch is less than 0.  To put it another way, the colder the temperatures, the more expected average O-ring failures.

# According to the tests that I have run, launch day temperature has a statistically significant effect of average O-ring failures.  As temperatures decreases, O-ring failures increases.  Specifically, it is reasonable to say that that for each degree lower, average failures increases by about .0627 per launch.  A reasonable range for the effect of temperature is between about .0207 and about .1046 failures per degree of temperature lost.