uadata = read.table("CH03PR03.txt")
gpa = uadata[,1]
x= uadata[,2]
fit = lm(gpa~x)
boxplot(x, horizontal = TRUE, xlab="act scores")
#There does not appear to be any noteworthy features. Possibly a slight left skew.
hist(x)
# The histogram shows the same thing. The distribution of act scores appears mound-shaped and symmetric.
#c.
#plot(fit$residuals,fit$fitted.values)
plot (fit, which =1)
plot(fit, which=3) #equal variance
plot(x, abs(fit$residuals)) #equal variance
#normality of residuals
boxplot(fit$residuals)
hist(fit$residuals)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(zoo)
bptest(fit)
##
## studentized Breusch-Pagan test
##
## data: fit
## BP = 0.29397, df = 1, p-value = 0.5877
#Looking at the plot the residuals appear to be relatively scattered, implying a good linear fit. There are a few low outliers, however no outliers are beyond 3 standard deviations. The residuals do NOT appear to be normally distributed (left skew). The equal variance assumption does not appear to be met because of the s-shaped line.
#e.
LowACTScores = uadata[which(x<26),] #Subset for ACT < 26
GPALACT = LowACTScores[,1]
ACTLACT = LowACTScores[,2]
fitLow= lm(GPALACT~ACTLACT) #Fit Linear Regression for Low scores
HighACTScores = uadata[which(x>=26),] #Subset for ACT >= 26
GPAHACT = HighACTScores[,1]
ACTHACT = HighACTScores[,2]
fitHigh = lm(GPAHACT~ACTHACT) #Fit Linear Regression for High scores
plot(ACTLACT,abs(fitLow$residuals)) #absolute value of Residuals vs act
library(lmtest)
library(zoo)
bptest(fitLow)
##
## studentized Breusch-Pagan test
##
## data: fitLow
## BP = 6.4818, df = 1, p-value = 0.0109
plot(ACTHACT,abs(fitHigh$residuals)) #absolute value of Residuals vs act
bptest(fitHigh)
##
## studentized Breusch-Pagan test
##
## data: fitHigh
## BP = 0.096399, df = 1, p-value = 0.7562
#Neither plot nor B-P test showed significant deviations from the constant variance assumption with the alpha = 0.01. This is different however the low ACT scores have more issues for constant variance than the high.
#f.
IQCR = read.table("CH03PR03.txt", header = FALSE)
ACT= IQCR[,2] #ACT
IQ= IQCR[,3] #IQ Scores
CR=IQCR[,4] #Percent Class Rank
fit2=lm(gpa~ACT+IQ) #gpa predicted by ACT and IQ score
plot(fit2, which =1)
summary(fit2)
##
## Call:
## lm(formula = gpa ~ ACT + IQ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.14807 -0.23843 -0.02077 0.28775 1.03430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.915124 0.360025 -5.319 5.07e-07 ***
## ACT 0.003609 0.008433 0.428 0.669
## IQ 0.041537 0.003076 13.504 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3912 on 117 degrees of freedom
## Multiple R-squared: 0.6375, Adjusted R-squared: 0.6313
## F-statistic: 102.9 on 2 and 117 DF, p-value: < 2.2e-16
There is a pattern in the residual plot, but only IQ is significant.
fit3=lm(gpa~ACT+CR) #GPA predicted by ACT and Class Rank
plot(fit3, which =1)
summary(fit3)
##
## Call:
## lm(formula = gpa ~ ACT + CR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.11898 -0.35184 0.06358 0.45789 1.29946
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.976621 0.311454 6.346 4.33e-09 ***
## ACT 0.018211 0.013818 1.318 0.19012
## CR 0.008788 0.002699 3.257 0.00148 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5992 on 117 degrees of freedom
## Multiple R-squared: 0.1497, Adjusted R-squared: 0.1352
## F-statistic: 10.3 on 2 and 117 DF, p-value: 7.589e-05
This residual plot shows more scatter and only Class Rank is significant in the summary (but ACT is more significant).
fit4=lm(gpa~ACT+IQ + CR) #GPA predicted by ACT, IQ, and Class rank
plot(fit4, which =1)
summary(fit4)
##
## Call:
## lm(formula = gpa ~ ACT + IQ + CR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.03062 -0.24020 -0.01855 0.28195 1.04770
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.837546 0.362397 -5.071 1.52e-06 ***
## ACT -0.001509 0.009114 -0.166 0.869
## IQ 0.040314 0.003177 12.689 < 2e-16 ***
## CR 0.002623 0.001820 1.441 0.152
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3894 on 116 degrees of freedom
## Multiple R-squared: 0.6439, Adjusted R-squared: 0.6347
## F-statistic: 69.92 on 3 and 116 DF, p-value: < 2.2e-16
This residual plot shows a relatively random scatter, but only IQ is significant.
fit5=lm(gpa~IQ+CR) # GPA predicted by IQ and Class Rank
plot(fit5, which =1)
summary(fit5)
##
## Call:
## lm(formula = gpa ~ IQ + CR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.02910 -0.24494 -0.02009 0.28423 1.05243
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.855628 0.344103 -5.393 3.66e-07 ***
## IQ 0.040224 0.003118 12.902 < 2e-16 ***
## CR 0.002506 0.001669 1.501 0.136
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3878 on 117 degrees of freedom
## Multiple R-squared: 0.6438, Adjusted R-squared: 0.6377
## F-statistic: 105.7 on 2 and 117 DF, p-value: < 2.2e-16
This residual plot shows a slight pattern, IQ is significant. I believe that this is the best model with the most significance (ACT was not a good predictor). ```
Plastic = read.table("CH01PR22.txt")
x = Plastic[,2] #Time
y = Plastic[,1] #Hardness
n=nrow(Plastic)
fitp=lm(y~x)
Yh_hat=predict(fitp,newdata=data.frame(x=c(30,40)))
MSE=sum((fitp$residuals)^2)/(n-2)
xbar=mean(x)
Sxx=sum((x-xbar)^2)
confint(fitp,level=1-(0.1/2)) #don't divide by 4 like in notes
## 2.5 % 97.5 %
## (Intercept) 162.9013 174.29875
## x 1.8405 2.22825
confint(fitp, level= 1-.05)
## 2.5 % 97.5 %
## (Intercept) 162.9013 174.29875
## x 1.8405 2.22825
#We are 95 percent confident that the true intercept beta 0 is in the given confidence interval, and we are 95 percent confident that the true slope beta 1 is in the given confidence interval. We are 90% confident that they both are in their respective confidence intervals.
#b.
xbar
## [1] 28
#Since xbar is positive, then b0 and b1 are negatively correlated. You can't tell in these confidence intervals because the intervals are only an estimate of a single value.
#c. This means that we are 90% (family confidence coefficient .02) confident that both intervals capture the true slope and intercept of the regression line.
#4.9.a.
#Bonferroni CI for X=3,5,7
g=3
alphaF=.10
predict(fitp,new=data.frame(x=c(20,30,40)),interval = "confidence",level=1-alphaF/g)
## fit lwr upr
## 1 209.2875 206.7277 211.8473
## 2 229.6312 227.6762 231.5863
## 3 249.9750 246.7824 253.1676
#We are 90% confident that the the true hardness at 20, 30, and 40 hours is captured by all three confidence intervals.
#4.9.c.
#Scheffe procedure PI
g=2
W1=sqrt(2*qf((1-alphaF),g,(n-2)))
s_Yhpred=sqrt(MSE*(1+1/n+(c(30,40)-xbar)^2/Sxx))
simulPI=cbind(Yh_hat,Yh_hat-W1*s_Yhpred,Yh_hat+W1*s_Yhpred)
colnames(simulPI)=c("fitted","lower","upper")
rownames(simulPI)=c("30","40")
simulPI
## fitted lower upper
## 30 229.6312 221.8354 237.4271
## 40 249.9750 241.7889 258.1611
#Bonferroni PI
predict(fitp,new=data.frame(x = c(30,40)),interval = "predict", level=1-alphaF/g)
## fit lwr upr
## 1 229.6312 222.4710 236.7915
## 2 249.9750 242.4562 257.4938
simulPI
## fitted lower upper
## 30 229.6312 221.8354 237.4271
## 40 249.9750 241.7889 258.1611