Completed by Alex Blohm

GPAData <- read.table("CH01PR19.txt") #Import GPA Data
#head(GPAData) #First 6 observations
Y <- GPAData[,1] #Declare Y-value of Freshman Year College GPA
X <- GPAData[,2] #Declare X-value of ACT score
n = nrow(GPAData)
Xbar <- mean(X) #declared
Ybar <- mean(Y)
Sxx = sum((X-Xbar)^2) #needed for MSE to find standard error
b1 <- sum((X-Xbar)*(Y-Ybar))/((n-1)*var(X))
b0 <- Ybar-b1*Xbar
SSE = sum((Y-(b0 + b1*X))^2) #Sum squared error, from residuals
MSE = SSE/(n-2) #Mean Squared Error

LM <- lm(Y~X) #Create Linear Model for Y ~ X
#7a.
coefficients(LM) #Sample Intercept and Slope
## (Intercept)           X 
##  2.11404929  0.03882713
#7c.
new_obs <- data.frame(X <- 30) #Create a new observation matrix
new_obs
##   X....30
## 1      30
predict(LM, new_obs) #Predicted GPA with ACT score of 30.
##        1 
## 3.278863
#8e.
sum(LM$residuals) #Add all residuals from Linear Model LM
## [1] -2.942091e-15
#Yes they sum to basically zero (rounding caused a minor error)
#8.f.
sigma(LM)
## [1] 0.623125
Variance = sigma(LM)^2
Variance
## [1] 0.3882848
#Units GPA points.  AKA on average, Predicted GPA varies by 0.388 points. 
#4.a.
alpha = .01
CI = confint(LM, level = 1-alpha)# 99%CI for b0 and b1
CI[2,]
##       0.5 %      99.5 % 
## 0.005385614 0.072268640

We are 99% confident that the interval 1.27 to 2.95 captures the true slope. Since 0 is not in the interval, this suggests there is a relationship between ACT score and Freshman year GPA.

# names(summary(LM)) helps find the names

#4b. and c.
summary(LM)
## 
## Call:
## lm(formula = Y ~ X)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.74004 -0.33827  0.04062  0.44064  1.22737 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.11405    0.32089   6.588  1.3e-09 ***
## X            0.03883    0.01277   3.040  0.00292 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6231 on 118 degrees of freedom
## Multiple R-squared:  0.07262,    Adjusted R-squared:  0.06476 
## F-statistic:  9.24 on 1 and 118 DF,  p-value: 0.002917
tstarb1 = summary(LM)$coef[2,"t value"]
tstarb1
## [1] 3.039777
RejectionQuantile = qt(1-alpha/2, df= n-2)
RejectionQuantile
## [1] 2.618137
pvalueb1 = summary(LM)$coef[2,"Pr(>|t|)"]
pvalueb1
## [1] 0.002916604
#From the summary we see the t value is 3.04 and P-value is 0.00292 which is less than alpha = 0.01.   Therefore we would reject the null and conclude there is a relationship between ACT score and GPA as discussed in the last homework.

Problem 3.

#2.13 a. First with formulas, then with R functions
# Same data and X,Y from Before

Xh <- 28  
MSE
## [1] 0.3882848
Yhath <- b0 + b1 * Xh # Predicted value for X_h=28
sYHath = sqrt(MSE*((1/n) + (((Xh-Xbar)^2)/Sxx))) #standard error of YHath
alpha = 0.05 #for 95% CI
LowerBYHath = Yhath-(qt(1-alpha/2, n-2)*sYHath)
UpperBYHath = Yhath+(qt(1-alpha/2, n-2)*sYHath)
c(LowerBYHath,UpperBYHath)
## [1] 3.061384 3.341033
# Confidence Inverval for mean response
NewAct = data.frame(X = 28)
ci2 <- predict.lm(LM, NewAct, interval = "confidence", 1-alpha)
ci2
## $fit
##        fit      lwr      upr
## 1 3.201209 3.061384 3.341033
## 
## $se.fit
## [1] 0.07060873
## 
## $df
## [1] 118
## 
## $residual.scale
## [1] 0.623125

We are 95% confident that the interval 3.06 to 3.34 captures the true mean freshman GPA for a student whose ACT score is 28.

#2.13 b.
sYHathNew = sqrt(MSE*(1 + (1/n) + ((Xh-Xbar)^2)/Sxx)) #standard predicted error of YHathNew (1+ at the front)
alpha = 0.05 #for 95% CI
LowerBYHathNew = Yhath-qt(1-alpha/2, n-2)*sYHathNew
UpperBYHathNew = Yhath+qt(1-alpha/2, n-2)*sYHathNew
c(LowerBYHathNew,UpperBYHathNew)
## [1] 1.959355 4.443063
# Prediction Inverval

pi <- predict(LM, NewAct, interval = "prediction")
pi
##        fit      lwr      upr
## 1 3.201209 1.959355 4.443063

With 95% confidence, we predice Mary Jone’s mean freshman year GPA to be within 1.95 and 4.0 (Since she can not obtain a GPA of more than a 4.0)

2.13 c.

The prediction interval is wider than the confidence interval (as it should be) because the prediction is based off a new data value that may depart the data from normality (a more conservative estimate).

#2.13 d. 
LowerBandYHath = Yhath-sqrt(2*qf(1-alpha, 2, n-2))*sYHath
UpperBandYHath = Yhath+sqrt(2*qf(1-alpha, 2, n-2))*sYHath

c(LowerBandYHath,UpperBandYHath)
## [1] 3.026159 3.376258
#2.23 a. 
GPAData <- read.table("CH01PR19.txt") #Import GPA Data
#head(GPAData) #First 6 observations
Y <- GPAData[,1] #Declare Y-value of Freshman Year College GPA
X <- GPAData[,2] #Declare X-value of ACT score
Yhat = b0+b1*X
Res = LM$res
SSE = sum(Res^2)                # Sum of squared error/residuals
SSR = sum((Yhat-Ybar)^2)        # Sum of squared regression
SSTO = (n-1)*var(Y)             # Sum of squared Total
c(SSE, SSR, SSTO, SSTO-SSE)     # SSTO=SSE+SSR
## [1] 45.817608  3.587846 49.405454  3.587846
MSE = SSE/(n-2)
MSR = SSR /1

dfSSE = n-2
dfSSTO = n-1

Table = data.frame("Source" = c("Regression", "Error", "Total"), "ss" = c(SSR, SSE, SSTO), "df" = c(1, dfSSE, dfSSTO), "MS"= c(MSR, MSE, ""))

Table
##       Source        ss  df                MS
## 1 Regression  3.587846   1    3.587845899313
## 2      Error 45.817608 118 0.388284811870229
## 3      Total 49.405454 119
anova = anova(LM)
anova
## Analysis of Variance Table
## 
## Response: Y
##            Df Sum Sq Mean Sq F value   Pr(>F)   
## X           1  3.588  3.5878  9.2402 0.002917 **
## Residuals 118 45.818  0.3883                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
SSR2 = b1^2*Sxx
SSR2
## [1] 3.587846
#b. MSR is the estimated distance from the regression line to the mean of the Y.  They would be the same if beta 1 = 0. 
#c. 
alpha = 0.01
Fstar = (SSR/1)/(SSE/(n-2))
Fstar
## [1] 9.240243
CriticalF = qf(1-alpha, 1, n-2)
CriticalF
## [1] 6.854641
#Since our Fstar is > our CriticalF we reject the null and conclude B1 not = 0. Therefore the slope is not 0.

#d. R^2
# Coefficient of Determination
Rsq = SSR/SSTO    # R square
Rsq
## [1] 0.07262044
#e.
# Coefficient of Correlation
r = sqrt(Rsq)       
r      
## [1] 0.2694818
b1
## [1] 0.03882713
#Since b1 is positive, so is r.

#f.  r has a more clear-cut interpretation as the strength of the relationship between GPA and ACT scores because it includes the direction.
#Problems 1.22, 1.26, 2.7 a,b, 2.26, Plastic Hardness using data set CH01PR22.txt.
Plastic <- read.table("CH01PR22.txt") #Import Plastic Hardness Data

#1.22a.
XTime <- Plastic[,2]
YHardness = Plastic[,1]
n = nrow(Plastic) 

PXbar <- mean(XTime) #declared P for plastic
PYbar <- mean(YHardness)
Pb1 <- sum((XTime-PXbar)*(YHardness-PYbar))/((n-1)*var(XTime))
Pb0 <- PYbar-Pb1*PXbar

plot(XTime,YHardness,xlab="Hours", ylab="Hardness",type="p",pch = 20,cex = 1)

abline(Pb0,Pb1)

#It appears to be a good fit.
lm.p = lm(YHardness~XTime)
#b. 
Pred = Pb0+ Pb1*40
Pred
## [1] 249.975
new_obs <- data.frame(XTime <- 40) #Create a new observation matrix
new_obs
##   XTime....40
## 1          40
predict(lm.p, new_obs) #Predicted GPA with ACT score of 30.
##       1 
## 249.975
#c. 
Pb1
## [1] 2.034375
#1.26 a.
Plastic <- read.table("CH01PR22.txt") #Import Plastic Hardness Data

#1.22a.
XTime <- Plastic[,2]
YHardness = Plastic[,1]
n = nrow(Plastic) 

PXbar <- mean(XTime) #declared P for plastic
PYbar <- mean(YHardness)
Pb1 <- sum((XTime-PXbar)*(YHardness-PYbar))/((n-1)*var(XTime))
Pb0 <- PYbar-Pb1*PXbar

PYhat <- Pb0 + Pb1*XTime         # Predicted values
PRes <- YHardness-PYhat           # Residuals
head(PRes)
## [1] -2.150  3.850 -5.150 -1.150  0.575  2.575
ShouldBeZero = sum(PRes)
ShouldBeZero
## [1] -3.410605e-13
sum(lm.p$residuals)
## [1] -1.998401e-15
#Yes they basically sum to zero
#b.
PMSE <- sum(PRes^2)/(n-2) # Estimated
PMSE
## [1] 10.45893
Psd <- sqrt(PMSE)         # Estimated Standard Deviation
Psd
## [1] 3.234027
summary(lm.p)$sigma
## [1] 3.234027
#Measured in Brinell Units
#2.7 
Plastic <- read.table("CH01PR22.txt") #Import 
XTime <- Plastic[,2]
YHardness = Plastic[,1]
n = nrow(Plastic) 
PXbar <- mean(XTime) #declared P for plastic
PYbar <- mean(YHardness)
Pb1 <- sum((XTime-PXbar)*(YHardness-PYbar))/((n-1)*var(XTime))
Pb0 <- PYbar-Pb1*PXbar
PMSE <- sum(PRes^2)/(n-2) 
alpha = 0.01

# Confidence Interval for beta1
sb1 <- sqrt(PMSE/sum((XTime-PXbar)^2))  # Estimated SD of b1
sb1
## [1] 0.09039379
CI_beta1 <- c(Pb1-qt(1-alpha/2,n-2)*sb1,Pb1+qt(1-alpha/2,n-2)*sb1)  #CI for beta1
CI_beta1  
## [1] 1.765287 2.303463
confint(lm.p,2,level = 1-alpha)  # CI for beta1
##          0.5 %   99.5 %
## XTime 1.765287 2.303463
#We are 99% confident that the interval 1.77 to 2.30 captures the true slope relating plastic hardness to hours.

#b. According to our confidence interval, 2 will be in the interval.

# Test Hypothesis H0: beta1 = 2 vs Ha: beta1 != 2
t1_1 <- (Pb1-2)/sb1     # Test statistic 
t1_1 
## [1] 0.3802805
cri <- qt(1-alpha/2,n-2)# critical point, reject H0 if |t1_1| < cri 
cri                     # Conclusion: do not reject H0 
## [1] 2.976843
p_val_1 <-2*(1-pt(abs(t1_1),n-2)) # p-value reject H0 if p_val< 0.01
p_val_1    
## [1] 0.7094445
# Conclusion: do not (fail to) reject H0 

summary(lm.p)
## 
## Call:
## lm(formula = YHardness ~ XTime)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1500 -2.2188  0.1625  2.6875  5.5750 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 168.60000    2.65702   63.45  < 2e-16 ***
## XTime         2.03438    0.09039   22.51 2.16e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.234 on 14 degrees of freedom
## Multiple R-squared:  0.9731, Adjusted R-squared:  0.9712 
## F-statistic: 506.5 on 1 and 14 DF,  p-value: 2.159e-12
##############################################################
#2.26
PYhat = Pb0+Pb1*XTime
Res = lm.p$res
SSE = sum(Res^2)                # Sum of squared error/residuals
SSR = sum((PYhat-PYbar)^2)        # Sum of squared regression
SSTO = (n-1)*var(YHardness)             # Sum of squared Total
c(SSE, SSR, SSTO, SSTO-SSE)     # SSTO=SSE+SSR
## [1]  146.425 5297.512 5443.938 5297.512
MSE = SSE/(n-2)
MSR = SSR /1

dfSSE = n-2
dfSSTO = n-1

Table = data.frame("Source" = c("Regression", "Error", "Total"), "ss" = c(SSR, SSE, SSTO), "df" = c(1, dfSSE, dfSSTO), "MS"= c(MSR, MSE, ""))

Table
##       Source       ss df               MS
## 1 Regression 5297.512  1        5297.5125
## 2      Error  146.425 14 10.4589285714286
## 3      Total 5443.938 15
anova = anova(lm.p)
anova
## Analysis of Variance Table
## 
## Response: YHardness
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## XTime      1 5297.5  5297.5  506.51 2.159e-12 ***
## Residuals 14  146.4    10.5                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#b. 
alpha = 0.01
Fstar = (SSR/1)/(SSE/(n-2))
Fstar
## [1] 506.5062
CriticalF = qf(1-alpha, 1, n-2)
CriticalF
## [1] 8.861593
#Since our Fstar is > our CriticalF we reject the null and conclude B1 not = 0. Therefore the slope is not 0.

#c.
plot(XTime, Res,xlab="Time (Hours)",ylab="Residuals", ylim=c(-30,30), type="p",pch = 25,cex = 1)
reg = PYhat - PYbar
points(XTime, reg, col = 'red', pch = 20)
abline(h=0)
legend (18, 30, legend = c("SSR", "Residuals"), col=c("red", "black"), pch=c(20,25),cex=1)

#plot(XTime, reg ,xlab="Hours",ylab="Regression-YMean",type="p",pch = 20,cex = 1)

#d. R^2
# Coefficient of Determination
Rsq = SSR/SSTO    # R square
Rsq
## [1] 0.9731031
# Coefficient of Correlation
r = sqrt(Rsq)       
r      
## [1] 0.9864599