The data set in Table 5.28 (given by Wetherill (1986) following Ruppert and Carroll (1980), Carroll and Ruppert (1985)) was obtained during a project to predict summer shrimp harvests from late-spring data in Pamlico Sound, North Carolina. The data set given here is concerned wiith just one part of this project: predicting salinity (col. 2) from lagged salinity (col. 3), the fresh water flow (col. 4), the period of measurement (col. 5) and the year (col. 6). Using any devices such as transformations, nonlinear terms, deleting outliers etc. as you deem appropriate, find a formula to predict salinity from the other variables, and assess the error in predictions made using this formula.
rm(list=ls(all=TRUE))
library(faraway)
d <- read.table("/Users/JieHuang1/Dropbox/Statistics_study/STOR_664/Homework/HW5/salinity.dat",header=F)
names(d)=c("obs", "y", "x3", "x4", "x5", "x6")
attach(d)
d[1:3,]
## obs y x3 x4 x5 x6
## 1 1 7.6 8.2 23.005 4 1972
## 2 2 7.7 7.6 23.873 5 1972
## 3 3 4.3 4.6 26.417 0 1973
d.lm <- lm(y~x3+x4+x5+x6,d)
summary(d.lm)
##
## Call:
## lm(formula = y ~ x3 + x4 + x5 + x6, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.75229 -0.51900 0.05684 0.51389 2.85992
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -831.88289 448.54450 -1.855 0.0765 .
## x3 0.61639 0.11860 5.197 2.86e-05 ***
## x4 -0.26339 0.10299 -2.557 0.0176 *
## x5 0.06002 0.15987 0.375 0.7108
## x6 0.42648 0.22733 1.876 0.0734 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.266 on 23 degrees of freedom
## Multiple R-squared: 0.8494, Adjusted R-squared: 0.8232
## F-statistic: 32.44 on 4 and 23 DF, p-value: 3.768e-09
source("/Users/JieHuang1/Dropbox/Statistics_study/STOR_664/R/R_codes/diagnose.R")
p <- 5
n <- dim(d)[1]
sigma<-summary(d.lm)$sigma
inf<- lm.influence(d.lm)
par(mfrow=c(2,4))
plot(x3,y,pch=16);plot(x4,y,pch=16);plot(x5,y,pch=16);plot(x6,y,pch=16)
# (1)check leverage
plot(inf$hat); abline(2*p/n,0,lty='dotted'); title("Leverage plot");
#(2)Internally standardized residual
e_ISR <- round(stanres(d.lm) , digits =2)
plot(e_ISR); abline(2,0,lty='dotted');abline(-2,0,lty='dotted');title("e_i*")
#(3)Externally studentized residual
d_ESR <- round(studres(d.lm) , digits =2)
plot(d_ESR); abline(qt(0.975,n-p-1),0,lty='dotted');abline(-qt(0.975,n-p-1),0,lty='dotted');title("d_i*")
#(4)DFFITS
DFFits <- round(dffits(d.lm), digits =2);
plot(DFFits);abline(2*sqrt(p/n),0,lty='dotted');abline(-2*sqrt(p/n),0,lty='dotted');
title("DFfits")
par(mfrow=c(2,4))
#(5)DFBETAS
DFBETAS <- round(dfbetas(d.lm),digits =2)
Itcept_df <- DFBETAS[,1]; plot(Itcept_df);abline(2/sqrt(n),0,lty="dotted");title("beta of Itcept")
x3_df<- DFBETAS[,2]; plot(x3_df);abline(2/sqrt(n),0,lty="dotted");title("beta of x3")
x4_df<- DFBETAS[,3]; plot(x4_df);abline(2/sqrt(n),0,lty="dotted");title("beta of x4")
x5_df<- DFBETAS[,4]; plot(x5_df);abline(2/sqrt(n),0,lty="dotted");title("beta of x5")
x6_df<- DFBETAS[,5]; plot(x6_df);abline(2/sqrt(n),0,lty="dotted");title("beta of x6")
# plot(DFBETAS[,1],pch=1,ylab="dfbetas",ylim=c(min(DFBETAS),max(DFBETAS)))
# abline(h=2*sqrt(1/n),lty=2);abline(h=-2*sqrt(1/n),lty=2);title("DFBETAS")
# if (p>1) {
# for (k in 1:p) {
# points(DFBETAS[,k],pch=k)
# }}
# legend("topright", c("beta0","beta3","beta4","beta5","beta6"), text.font = 0.5,pch = c(1:p))
#(6) Cook's D
plot(d.lm, which=4)
#(7) Cov_ratio
first <- ((n-p-1) / (n-p))+ d_ESR ^2/(n-p)
COVRATIO <- first^(-p) /(1-inf$hat)
plot(COVRATIO);abline(h=3*p/n+1,lty=2);abline(h=-3*p/n+1,lty=2);title("COVRATIO")
From the result of all tests, it seems that obs 16 is both an outlier and an influential point, so we will omit this point in the further analysis.
#delete the 16th observation
d2<-rbind(d[1:15,2:6],d[17:28,2:6])
d2.lm <- lm(y~x3+x4+x5+x6,d2)
summary(d2.lm)
##
## Call:
## lm(formula = y ~ x3 + x4 + x5 + x6, data = d2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.22869 -0.27146 0.03959 0.45901 2.13447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -583.77046 371.84278 -1.570 0.131
## x3 0.58852 0.09690 6.074 4.11e-06 ***
## x4 -0.58235 0.12270 -4.746 9.76e-05 ***
## x5 -0.08588 0.13648 -0.629 0.536
## x6 0.30490 0.18824 1.620 0.120
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.031 on 22 degrees of freedom
## Multiple R-squared: 0.9045, Adjusted R-squared: 0.8871
## F-statistic: 52.09 on 4 and 22 DF, p-value: 6.603e-11
library(leaps)
b <- regsubsets(y~x3+x4+x5+x6, data=d2)
summary(b)
## Subset selection object
## Call: regsubsets.formula(y ~ x3 + x4 + x5 + x6, data = d2)
## 4 Variables (and intercept)
## Forced in Forced out
## x3 FALSE FALSE
## x4 FALSE FALSE
## x5 FALSE FALSE
## x6 FALSE FALSE
## 1 subsets of each size up to 4
## Selection Algorithm: exhaustive
## x3 x4 x5 x6
## 1 ( 1 ) "*" " " " " " "
## 2 ( 1 ) "*" "*" " " " "
## 3 ( 1 ) "*" "*" " " "*"
## 4 ( 1 ) "*" "*" "*" "*"
rs<-summary(b)
par(mfrow=c(1,2))
plot(2:5, rs$cp, xlab="No. of Parameters",ylab="Cp Statistic")
abline (0, 1)
plot (2:5, rs$adjr2, xlab="No. of Parameters",ylab="Adjusted R-squqre")
Both tests show that it’s better to use model with 4 variables: incept, x3, x4, x6.
T-test shows that coefficients of x6 is not significant neither. So perhaps we shall do more tests:
g <- lm (y ~ . , data=d2)
step(g)
## Start: AIC=6.1
## y ~ x3 + x4 + x5 + x6
##
## Df Sum of Sq RSS AIC
## - x5 1 0.421 23.786 4.5777
## <none> 23.365 6.0961
## - x6 1 2.786 26.152 7.1379
## - x4 1 23.925 47.290 23.1326
## - x3 1 39.179 62.544 30.6808
##
## Step: AIC=4.58
## y ~ x3 + x4 + x6
##
## Df Sum of Sq RSS AIC
## <none> 23.786 4.5777
## - x6 1 3.935 27.720 6.7110
## - x4 1 28.966 52.751 24.0834
## - x3 1 38.760 62.546 28.6817
##
## Call:
## lm(formula = y ~ x3 + x4 + x6, data = d2)
##
## Coefficients:
## (Intercept) x3 x4 x6
## -660.1989 0.5826 -0.5410 0.3430
g <- lm (y ~ x3+x4+x5+x6, data=d2)
summary(g)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -583.77045703 371.84278250 -1.5699389 1.307015e-01
## x3 0.58852065 0.09689712 6.0736652 4.105488e-06
## x4 -0.58235350 0.12269722 -4.7462647 9.756124e-05
## x5 -0.08588038 0.13648209 -0.6292429 5.356702e-01
## x6 0.30489527 0.18823803 1.6197326 1.195370e-01
g <- update (g,.~. - x5)
summary(g)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -660.1989011 346.80024485 -1.903686 6.953787e-02
## x3 0.5826393 0.09517037 6.122067 3.030435e-06
## x4 -0.5409756 0.10221861 -5.292340 2.265456e-05
## x6 0.3430289 0.17586008 1.950579 6.339583e-02
g <- update (g,.~. - x6)
summary(g)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.2414243 2.8728839 5.653352 8.037305e-06
## x3 0.7131089 0.0715478 9.966887 5.244276e-10
## x4 -0.5584150 0.1076125 -5.189128 2.576210e-05
Above all, I think haveing 4 variables is the best choice: incept, x3, x4, x6.
d3 <- cbind(d2[,1:3],d2[,5])
names(d3) <-c("y", "x3", "x4", "x6")
g <- lm(y~., d3)
round(cor(d3), 3)
## y x3 x4 x6
## y 1.000 0.872 -0.646 0.748
## x3 0.872 1.000 -0.360 0.736
## x4 -0.646 -0.360 1.000 -0.320
## x6 0.748 0.736 -0.320 1.000
There are large correlations between predictors, and between the predictors and reponses.
#eigen decomposition
x <- model.matrix(g)[,-1]
e <- eigen (t(x) %*% x)
e$val #eigen values
## [1] 1.053030e+08 2.819885e+02 9.141320e+01
sqrt(e$val[1]/e$val) #condition index
## [1] 1.0000 611.0893 1073.2870
Several condition index are large, which imply strong association.
vif(x)
## x3 x4 x6
## 2.270092 1.157623 2.201904
VIF is actually fine. But other two tests show strong correlation and collinarity, and we shall perhaps do a Ridge Regression or PCA.
library(MASS)
## Warning: package 'MASS' was built under R version 3.1.3
##
## Attaching package: 'MASS'
##
## The following object is masked _by_ '.GlobalEnv':
##
## studres
bla <-boxcox(g,plotit=T,lambda=seq(-0.3, 1.5, by=0.1))
bla$x[which(bla$y==max(bla$y))] # the lambda value that gives max Log-likelihood
## [1] 0.6272727
It seems like we might want to do a transformation on y with \(\lambda\) = 0.6272727.
#perform boxcox transformation on y, and do the regression
y2 <- d3$y^bla$x[which(bla$y==max(bla$y))]
d4 <- cbind(y2, d3)
reg4 <- lm(y2~x3+x4+x6,d4)
par(oma = c(4,1,1,1))
#make predictions using the existing data of x3, x4, x6:
pred <- predict(reg4, data.frame(d4$x3,d4$x4,d4$x6), interval="predict")
## Warning: 'newdata' had 27 rows but variables found have 28 rows
pred <- pred^(1/bla$x[which(bla$y==max(bla$y))]) #transforming y back
plot(pred[,1],pch='|',ylab="salinity",ylim=c(3,18))
points(pred[,2],col='red',pch='-', type='l')
points(pred[,3],col='red',pch='-', type='l')
points(d4$y, col = 'green', pch="*")
title("Prediction Interval")
legend("bottom", legend=c("Point Pred","Pred Interval", "Observed yi"), pch=c("|","-","*"),col=c("black","red","green"),xpd = TRUE,horiz = TRUE,inset = c(0,-0.4) )
In general, we want to make the boxcox transformation on y, and then do the regression and prediction , then transforming y back.
From the last figure, we can see that the prediction does a good job capturing almost all the observed yi’s in the prediction interval.