Completed by Alex Blohm
GPAData <- read.table("CH01PR19.txt") #Import GPA Data
#head(GPAData) #First 6 observations
Y <- GPAData[,1] #Declare Y-value of Freshman Year College GPA
X <- GPAData[,2] #Declare X-value of ACT score
n = nrow(GPAData)
Xbar <- mean(X) #declared
Ybar <- mean(Y)
Sxx = sum((X-Xbar)^2) #needed for MSE to find standard error
b1 <- sum((X-Xbar)*(Y-Ybar))/((n-1)*var(X)) #Slope estimate: Sxy/Sxx, since (n-1)*var(X) = Sxx
b0 <- Ybar-b1*Xbar #Intercept estimate
SSE = sum((Y-(b0 + b1*X))^2) #Sum squared error, from residuals
MSE = SSE/(n-2) #Mean Squared Error
LM <- lm(Y~X) #Create Linear Model for Y ~ X
#7a.
coefficients(LM) #Sample Intercept and Slope
## (Intercept) X
## 2.11404929 0.03882713
#7c.
new_obs <- data.frame(X = 30) #New observation as a one-row data frame
new_obs
##    X
## 1 30
predict(LM, new_obs) #Predicted GPA with ACT score of 30.
## 1
## 3.278863
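#Optional hand check, using the b0 and b1 computed above: plugging X = 30 into the fitted line should give the same prediction.
b0 + b1*30 #should match the predict() value above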
#8e.
sum(LM$residuals) #Add all residuals from Linear Model LM
## [1] -2.942091e-15
#Yes, the residuals sum to essentially zero; the tiny nonzero value is floating-point rounding error.
#8.f.
sigma(LM)
## [1] 0.623125
Variance = sigma(LM)^2
Variance
## [1] 0.3882848
#Units: the estimated variance is in squared GPA points; its square root, 0.623 GPA points, is the typical deviation of an observed GPA from the fitted line.
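#Quick sanity check (assumes MSE and Variance from above are still in the workspace): the hand-computed MSE and sigma(LM)^2 estimate the same error variance.
all.equal(MSE, Variance) #should be TRUE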
#4.a.
alpha = .01
CI = confint(LM, level = 1-alpha)# 99%CI for b0 and b1
CI[2,]
## 0.5 % 99.5 %
## 0.005385614 0.072268640
We are 99% confident that the interval 0.0054 to 0.0723 captures the true slope. Since 0 is not in the interval, this suggests there is a linear relationship between ACT score and freshman year GPA.
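#As a check on confint(), the same 99% interval can be rebuilt by hand from b1, its standard error sqrt(MSE/Sxx), and the t quantile (this uses MSE, Sxx, and alpha from above).
b1 + c(-1, 1)*qt(1-alpha/2, n-2)*sqrt(MSE/Sxx) #should reproduce CI[2,]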
# names(summary(LM)) lists the components available from the summary object
#4b. and c.
summary(LM)
##
## Call:
## lm(formula = Y ~ X)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.74004 -0.33827 0.04062 0.44064 1.22737
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.11405 0.32089 6.588 1.3e-09 ***
## X 0.03883 0.01277 3.040 0.00292 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6231 on 118 degrees of freedom
## Multiple R-squared: 0.07262, Adjusted R-squared: 0.06476
## F-statistic: 9.24 on 1 and 118 DF, p-value: 0.002917
tstarb1 = summary(LM)$coef[2,"t value"]
tstarb1
## [1] 3.039777
RejectionQuantile = qt(1-alpha/2, df= n-2)
RejectionQuantile
## [1] 2.618137
pvalueb1 = summary(LM)$coef[2,"Pr(>|t|)"]
pvalueb1
## [1] 0.002916604
#From the summary, t* = 3.04 with p-value 0.00292 < alpha = 0.01, and |t*| exceeds the critical value 2.618. Therefore we reject the null hypothesis and conclude there is a linear relationship between ACT score and GPA, consistent with the previous homework.
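#The same t statistic can be reproduced by hand as b1 over its standard error (a quick check using MSE and Sxx from above).
b1/sqrt(MSE/Sxx) #should match tstarb1 from summary(LM)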
#2.13 a. First with formulas, then with R functions
# Same data and X,Y from Before
Xh <- 28
MSE
## [1] 0.3882848
Yhath <- b0 + b1 * Xh # Predicted value for X_h=28
sYHath = sqrt(MSE*((1/n) + (((Xh-Xbar)^2)/Sxx))) #standard error of YHath
alpha = 0.05 #for 95% CI
LowerBYHath = Yhath-(qt(1-alpha/2, n-2)*sYHath)
UpperBYHath = Yhath+(qt(1-alpha/2, n-2)*sYHath)
c(LowerBYHath,UpperBYHath)
## [1] 3.061384 3.341033
# Confidence Interval for the mean response at X = 28
NewAct = data.frame(X = 28)
ci2 <- predict(LM, NewAct, interval = "confidence", level = 1-alpha, se.fit = TRUE) #95% CI for the mean response (se.fit = TRUE returns the full list below)
ci2
## $fit
## fit lwr upr
## 1 3.201209 3.061384 3.341033
##
## $se.fit
## [1] 0.07060873
##
## $df
## [1] 118
##
## $residual.scale
## [1] 0.623125
#2.13 b.
sYHathNew = sqrt(MSE*(1 + (1/n) + ((Xh-Xbar)^2)/Sxx)) #standard error for predicting a new observation (note the extra 1 inside)
alpha = 0.05 #for 95% CI
LowerBYHathNew = Yhath-qt(1-alpha/2, n-2)*sYHathNew
UpperBYHathNew = Yhath+qt(1-alpha/2, n-2)*sYHathNew
c(LowerBYHathNew,UpperBYHathNew)
## [1] 1.959355 4.443063
# Prediction Interval for a new observation at X = 28
pi <- predict(LM, NewAct, interval = "prediction")
pi
## fit lwr upr
## 1 3.201209 1.959355 4.443063
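#The prediction interval is wider than the confidence interval for the mean response because its variance includes the extra MSE term for a single new observation; a quick width comparison (uses pi and ci2 from above):
pi[,"upr"] - pi[,"lwr"] #width of the 95% prediction interval
ci2$fit[,"upr"] - ci2$fit[,"lwr"] #width of the 95% confidence interval (narrower)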
#2.13 d.
LowerBandYHath = Yhath-sqrt(2*qf(1-alpha, 2, n-2))*sYHath #Working-Hotelling 95% confidence band limits at Xh
UpperBandYHath = Yhath+sqrt(2*qf(1-alpha, 2, n-2))*sYHath
c(LowerBandYHath,UpperBandYHath)
## [1] 3.026159 3.376258
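#The band is wider than the pointwise interval from 2.13a because the Working-Hotelling multiplier sqrt(2*F) exceeds the t quantile; comparing the two multipliers at alpha = 0.05:
c(sqrt(2*qf(1-alpha, 2, n-2)), qt(1-alpha/2, n-2)) #the first (band) multiplier should be the larger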
#2.23 a.
GPAData <- read.table("CH01PR19.txt") #Import GPA Data
#head(GPAData) #First 6 observations
Y <- GPAData[,1] #Declare Y-value of Freshman Year College GPA
X <- GPAData[,2] #Declare X-value of ACT score
Yhat = b0+b1*X
Res = LM$res
SSE = sum(Res^2) # Sum of squared error/residuals
SSR = sum((Yhat-Ybar)^2) # Sum of squared regression
SSTO = (n-1)*var(Y) # Sum of squared Total
c(SSE, SSR, SSTO, SSTO-SSE) # SSTO=SSE+SSR
## [1] 45.817608 3.587846 49.405454 3.587846
MSE = SSE/(n-2)
MSR = SSR /1
dfSSE = n-2
dfSSTO = n-1
Table = data.frame("Source" = c("Regression", "Error", "Total"), "ss" = c(SSR, SSE, SSTO), "df" = c(1, dfSSE, dfSSTO), "MS"= c(MSR, MSE, ""))
Table
## Source ss df MS
## 1 Regression 3.587846 1 3.587845899313
## 2 Error 45.817608 118 0.388284811870229
## 3 Total 49.405454 119
anova = anova(LM)
anova
## Analysis of Variance Table
##
## Response: Y
## Df Sum Sq Mean Sq F value Pr(>F)
## X 1 3.588 3.5878 9.2402 0.002917 **
## Residuals 118 45.818 0.3883
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
SSR2 = b1^2*Sxx #SSR via the alternative formula b1^2 * Sxx, as a check
SSR2
## [1] 3.587846
#b. MSR estimates sigma^2 + beta1^2*Sxx (the variability of the fitted values around Ybar), while MSE estimates sigma^2. The two estimate the same quantity only when beta1 = 0.
#c.
alpha = 0.01
Fstar = (SSR/1)/(SSE/(n-2))
Fstar
## [1] 9.240243
CriticalF = qf(1-alpha, 1, n-2)
CriticalF
## [1] 6.854641
#Since F* > the critical F value, we reject the null hypothesis and conclude beta1 != 0, i.e., the slope is not 0.
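#For simple linear regression, the F test of beta1 = 0 is equivalent to the two-sided t test, so F* should equal the square of the t statistic (a check using Fstar and tstarb1 from above).
c(Fstar, tstarb1^2) #the two values should agree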
#d. R^2
# Coefficient of Determination
Rsq = SSR/SSTO # R square
Rsq
## [1] 0.07262044
#e.
# Coefficient of Correlation
r = sqrt(Rsq)
r
## [1] 0.2694818
b1
## [1] 0.03882713
#Since b1 is positive, so is r.
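#As a check, r signed by b1 should equal the sample correlation between ACT score and GPA (assumes X and Y still hold the full GPA data).
sign(b1)*r #coefficient of correlation, signed by the slope
cor(X, Y) #should agree with the value above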
#f. r gives a more direct read on the strength of the relationship between GPA and ACT score because, unlike R^2, it also carries the direction (sign) of the association.
#Problems 1.22, 1.26, 2.7 a,b, 2.26, Plastic Hardness using data set CH01PR22.txt.
Plastic <- read.table("CH01PR22.txt") #Import Plastic Hardness Data
#1.22a.
XTime <- Plastic[,2]
YHardness = Plastic[,1]
n = nrow(Plastic)
PXbar <- mean(XTime) #declared P for plastic
PYbar <- mean(YHardness)
Pb1 <- sum((XTime-PXbar)*(YHardness-PYbar))/((n-1)*var(XTime))
Pb0 <- PYbar-Pb1*PXbar
plot(XTime,YHardness,xlab="Hours", ylab="Hardness",type="p",pch = 20,cex = 1)
abline(Pb0,Pb1)
#It appears to be a good fit.
lm.p = lm(YHardness~XTime)
#b.
Pred = Pb0+ Pb1*40
Pred
## [1] 249.975
new_obs <- data.frame(XTime = 40) #New observation as a one-row data frame
new_obs
##   XTime
## 1    40
predict(lm.p, new_obs) #Predicted hardness after 40 hours of elapsed time.
## 1
## 249.975
#c.
Pb1 #Point estimate of the change in mean hardness per additional hour
## [1] 2.034375
#1.26 a.
Plastic <- read.table("CH01PR22.txt") #Import Plastic Hardness Data
XTime <- Plastic[,2]
YHardness = Plastic[,1]
n = nrow(Plastic)
PXbar <- mean(XTime) #declared P for plastic
PYbar <- mean(YHardness)
Pb1 <- sum((XTime-PXbar)*(YHardness-PYbar))/((n-1)*var(XTime))
Pb0 <- PYbar-Pb1*PXbar
PYhat <- Pb0 + Pb1*XTime # Predicted values
PRes <- YHardness-PYhat # Residuals
head(PRes)
## [1] -2.150 3.850 -5.150 -1.150 0.575 2.575
ShouldBeZero = sum(PRes)
ShouldBeZero
## [1] -3.410605e-13
sum(lm.p$residuals)
## [1] -1.998401e-15
#Yes, both sums are essentially zero (floating-point rounding).
#b.
PMSE <- sum(PRes^2)/(n-2) # Estimated error variance (MSE)
PMSE
## [1] 10.45893
Psd <- sqrt(PMSE) # Estimated Standard Deviation
Psd
## [1] 3.234027
summary(lm.p)$sigma
## [1] 3.234027
#Measured in Brinell Units
#2.7
Plastic <- read.table("CH01PR22.txt") #Import
XTime <- Plastic[,2]
YHardness = Plastic[,1]
n = nrow(Plastic)
PXbar <- mean(XTime) #declared P for plastic
PYbar <- mean(YHardness)
Pb1 <- sum((XTime-PXbar)*(YHardness-PYbar))/((n-1)*var(XTime))
Pb0 <- PYbar-Pb1*PXbar
PMSE <- sum(PRes^2)/(n-2)
alpha = 0.01
# Confidence Interval for beta1
sb1 <- sqrt(PMSE/sum((XTime-PXbar)^2)) # Estimated SD of b1
sb1
## [1] 0.09039379
CI_beta1 <- c(Pb1-qt(1-alpha/2,n-2)*sb1,Pb1+qt(1-alpha/2,n-2)*sb1) #CI for beta1
CI_beta1
## [1] 1.765287 2.303463
confint(lm.p,2,level = 1-alpha) # CI for beta1
## 0.5 % 99.5 %
## XTime 1.765287 2.303463
#We are 99% confident that the interval 1.77 to 2.30 captures the true slope relating plastic hardness to hours.
#b. The hypothesized value beta1 = 2 lies inside the 99% confidence interval, so we expect the test below not to reject H0.
# Test Hypothesis H0: beta1 = 2 vs Ha: beta1 != 2
t1_1 <- (Pb1-2)/sb1 # Test statistic
t1_1
## [1] 0.3802805
cri <- qt(1-alpha/2,n-2) # critical value; reject H0 if |t1_1| > cri
cri # Conclusion: do not reject H0
## [1] 2.976843
p_val_1 <- 2*(1-pt(abs(t1_1),n-2)) # p-value; reject H0 if p_val_1 < 0.01
p_val_1
## [1] 0.7094445
# Conclusion: fail to reject H0; the data are consistent with beta1 = 2.
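#Consistency check: since the 99% interval from part (a) contains 2, the two-sided test at alpha = 0.01 should not reject H0 (uses CI_beta1 from above).
CI_beta1[1] <= 2 & 2 <= CI_beta1[2] #should be TRUE, agreeing with the decision above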
summary(lm.p)
##
## Call:
## lm(formula = YHardness ~ XTime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1500 -2.2188 0.1625 2.6875 5.5750
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 168.60000 2.65702 63.45 < 2e-16 ***
## XTime 2.03438 0.09039 22.51 2.16e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.234 on 14 degrees of freedom
## Multiple R-squared: 0.9731, Adjusted R-squared: 0.9712
## F-statistic: 506.5 on 1 and 14 DF, p-value: 2.159e-12
##############################################################
#2.26
PYhat = Pb0+Pb1*XTime
Res = lm.p$res
SSE = sum(Res^2) # Sum of squared error/residuals
SSR = sum((PYhat-PYbar)^2) # Sum of squared regression
SSTO = (n-1)*var(YHardness) # Sum of squared Total
c(SSE, SSR, SSTO, SSTO-SSE) # SSTO=SSE+SSR
## [1] 146.425 5297.512 5443.938 5297.512
MSE = SSE/(n-2)
MSR = SSR /1
dfSSE = n-2
dfSSTO = n-1
Table = data.frame("Source" = c("Regression", "Error", "Total"), "ss" = c(SSR, SSE, SSTO), "df" = c(1, dfSSE, dfSSTO), "MS"= c(MSR, MSE, ""))
Table
## Source ss df MS
## 1 Regression 5297.512 1 5297.5125
## 2 Error 146.425 14 10.4589285714286
## 3 Total 5443.938 15
anova = anova(lm.p)
anova
## Analysis of Variance Table
##
## Response: YHardness
## Df Sum Sq Mean Sq F value Pr(>F)
## XTime 1 5297.5 5297.5 506.51 2.159e-12 ***
## Residuals 14 146.4 10.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#b.
alpha = 0.01
Fstar = (SSR/1)/(SSE/(n-2))
Fstar
## [1] 506.5062
CriticalF = qf(1-alpha, 1, n-2)
CriticalF
## [1] 8.861593
#Since F* > the critical F value, we reject the null hypothesis and conclude beta1 != 0: there is a linear relationship between elapsed time and hardness.
#c.
plot(XTime, Res,xlab="Time (Hours)",ylab="Residuals", ylim=c(-30,30), type="p",pch = 25,cex = 1) #Residuals against time
reg = PYhat - PYbar #Deviation of each fitted value from the mean (their squares sum to SSR)
points(XTime, reg, col = 'red', pch = 20)
abline(h=0)
legend(18, 30, legend = c("Yhat - Ybar", "Residuals"), col=c("red", "black"), pch=c(20,25),cex=1)
#plot(XTime, reg ,xlab="Hours",ylab="Regression-YMean",type="p",pch = 20,cex = 1)
#d. R^2
# Coefficient of Determination
Rsq = SSR/SSTO # R square
Rsq
## [1] 0.9731031
# Coefficient of Correlation
r = sqrt(Rsq)
r
## [1] 0.9864599
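#Two quick checks, assuming lm.p, Pb1, XTime, and YHardness are still in the workspace: r signed by Pb1 should equal the sample correlation, and Rsq should match the R-squared reported by summary(lm.p).
sign(Pb1)*r #coefficient of correlation, signed by the slope
cor(XTime, YHardness) #should agree
summary(lm.p)$r.squared #should match Rsq above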