Chapter 5 (S & Y) exercises 9

The data set in Table 5.28 (given by Wetherill (1986) following Ruppert and Carroll (1980), Carroll and Ruppert (1985)) was obtained during a project to predict summer shrimp harvests from late-spring data in Pamlico Sound, North Carolina. The data set given here is concerned wiith just one part of this project: predicting salinity (col. 2) from lagged salinity (col. 3), the fresh water flow (col. 4), the period of measurement (col. 5) and the year (col. 6). Using any devices such as transformations, nonlinear terms, deleting outliers etc. as you deem appropriate, find a formula to predict salinity from the other variables, and assess the error in predictions made using this formula.

rm(list=ls(all=TRUE)) 
library(faraway)
d <- read.table("/Users/JieHuang1/Dropbox/Statistics_study/STOR_664/Homework/HW5/salinity.dat",header=F)
names(d)=c("obs", "y", "x3", "x4", "x5", "x6")
attach(d)
d[1:3,]
##   obs   y  x3     x4 x5   x6
## 1   1 7.6 8.2 23.005  4 1972
## 2   2 7.7 7.6 23.873  5 1972
## 3   3 4.3 4.6 26.417  0 1973
d.lm <- lm(y~x3+x4+x5+x6,d)
summary(d.lm)
## 
## Call:
## lm(formula = y ~ x3 + x4 + x5 + x6, data = d)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.75229 -0.51900  0.05684  0.51389  2.85992 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -831.88289  448.54450  -1.855   0.0765 .  
## x3             0.61639    0.11860   5.197 2.86e-05 ***
## x4            -0.26339    0.10299  -2.557   0.0176 *  
## x5             0.06002    0.15987   0.375   0.7108    
## x6             0.42648    0.22733   1.876   0.0734 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.266 on 23 degrees of freedom
## Multiple R-squared:  0.8494, Adjusted R-squared:  0.8232 
## F-statistic: 32.44 on 4 and 23 DF,  p-value: 3.768e-09
(1)check for outliers/influential points:
source("/Users/JieHuang1/Dropbox/Statistics_study/STOR_664/R/R_codes/diagnose.R")
p <- 5
n <- dim(d)[1]
sigma<-summary(d.lm)$sigma
inf<- lm.influence(d.lm)
par(mfrow=c(2,4))
plot(x3,y,pch=16);plot(x4,y,pch=16);plot(x5,y,pch=16);plot(x6,y,pch=16)

# (1)check leverage
plot(inf$hat); abline(2*p/n,0,lty='dotted'); title("Leverage plot");

#(2)Internally standardized residual
e_ISR <- round(stanres(d.lm) , digits =2)
plot(e_ISR); abline(2,0,lty='dotted');abline(-2,0,lty='dotted');title("e_i*")

#(3)Externally studentized residual
d_ESR <- round(studres(d.lm) , digits =2)
plot(d_ESR); abline(qt(0.975,n-p-1),0,lty='dotted');abline(-qt(0.975,n-p-1),0,lty='dotted');title("d_i*")

#(4)DFFITS
DFFits <- round(dffits(d.lm), digits =2); 
plot(DFFits);abline(2*sqrt(p/n),0,lty='dotted');abline(-2*sqrt(p/n),0,lty='dotted');
title("DFfits")

par(mfrow=c(2,4))
#(5)DFBETAS
DFBETAS <- round(dfbetas(d.lm),digits =2)
Itcept_df <- DFBETAS[,1]; plot(Itcept_df);abline(2/sqrt(n),0,lty="dotted");title("beta of Itcept")
x3_df<- DFBETAS[,2]; plot(x3_df);abline(2/sqrt(n),0,lty="dotted");title("beta of x3")
x4_df<- DFBETAS[,3]; plot(x4_df);abline(2/sqrt(n),0,lty="dotted");title("beta of x4")
x5_df<- DFBETAS[,4]; plot(x5_df);abline(2/sqrt(n),0,lty="dotted");title("beta of x5")
x6_df<- DFBETAS[,5]; plot(x6_df);abline(2/sqrt(n),0,lty="dotted");title("beta of x6")

# plot(DFBETAS[,1],pch=1,ylab="dfbetas",ylim=c(min(DFBETAS),max(DFBETAS)))
# abline(h=2*sqrt(1/n),lty=2);abline(h=-2*sqrt(1/n),lty=2);title("DFBETAS")
# if (p>1) {
# for (k in 1:p) {
# points(DFBETAS[,k],pch=k)
# }}
# legend("topright", c("beta0","beta3","beta4","beta5","beta6"), text.font = 0.5,pch = c(1:p))

#(6) Cook's D
plot(d.lm, which=4)

#(7) Cov_ratio
first <- ((n-p-1) / (n-p))+ d_ESR ^2/(n-p)
COVRATIO <- first^(-p) /(1-inf$hat)
plot(COVRATIO);abline(h=3*p/n+1,lty=2);abline(h=-3*p/n+1,lty=2);title("COVRATIO")

From the result of all tests, it seems that obs 16 is both an outlier and an influential point, so we will omit this point in the further analysis.

#delete the 16th observation
d2<-rbind(d[1:15,2:6],d[17:28,2:6])
d2.lm <- lm(y~x3+x4+x5+x6,d2)
summary(d2.lm)
## 
## Call:
## lm(formula = y ~ x3 + x4 + x5 + x6, data = d2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.22869 -0.27146  0.03959  0.45901  2.13447 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -583.77046  371.84278  -1.570    0.131    
## x3             0.58852    0.09690   6.074 4.11e-06 ***
## x4            -0.58235    0.12270  -4.746 9.76e-05 ***
## x5            -0.08588    0.13648  -0.629    0.536    
## x6             0.30490    0.18824   1.620    0.120    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.031 on 22 degrees of freedom
## Multiple R-squared:  0.9045, Adjusted R-squared:  0.8871 
## F-statistic: 52.09 on 4 and 22 DF,  p-value: 6.603e-11
(2) Variable selection
(a)Adjusted \(R^2\) abd Mallows \(C_p\)
library(leaps)
b <- regsubsets(y~x3+x4+x5+x6, data=d2)
summary(b)
## Subset selection object
## Call: regsubsets.formula(y ~ x3 + x4 + x5 + x6, data = d2)
## 4 Variables  (and intercept)
##    Forced in Forced out
## x3     FALSE      FALSE
## x4     FALSE      FALSE
## x5     FALSE      FALSE
## x6     FALSE      FALSE
## 1 subsets of each size up to 4
## Selection Algorithm: exhaustive
##          x3  x4  x5  x6 
## 1  ( 1 ) "*" " " " " " "
## 2  ( 1 ) "*" "*" " " " "
## 3  ( 1 ) "*" "*" " " "*"
## 4  ( 1 ) "*" "*" "*" "*"
rs<-summary(b)
par(mfrow=c(1,2))
plot(2:5, rs$cp, xlab="No. of Parameters",ylab="Cp Statistic")
abline (0, 1)
plot (2:5, rs$adjr2, xlab="No. of Parameters",ylab="Adjusted R-squqre")

Both tests show that it’s better to use model with 4 variables: incept, x3, x4, x6.

T-test shows that coefficients of x6 is not significant neither. So perhaps we shall do more tests:

(b) AIC
g <- lm (y ~ . , data=d2)
step(g)
## Start:  AIC=6.1
## y ~ x3 + x4 + x5 + x6
## 
##        Df Sum of Sq    RSS     AIC
## - x5    1     0.421 23.786  4.5777
## <none>              23.365  6.0961
## - x6    1     2.786 26.152  7.1379
## - x4    1    23.925 47.290 23.1326
## - x3    1    39.179 62.544 30.6808
## 
## Step:  AIC=4.58
## y ~ x3 + x4 + x6
## 
##        Df Sum of Sq    RSS     AIC
## <none>              23.786  4.5777
## - x6    1     3.935 27.720  6.7110
## - x4    1    28.966 52.751 24.0834
## - x3    1    38.760 62.546 28.6817
## 
## Call:
## lm(formula = y ~ x3 + x4 + x6, data = d2)
## 
## Coefficients:
## (Intercept)           x3           x4           x6  
##   -660.1989       0.5826      -0.5410       0.3430
(c)Backward elimination:
g <- lm (y ~ x3+x4+x5+x6, data=d2)
summary(g)$coefficients
##                  Estimate   Std. Error    t value     Pr(>|t|)
## (Intercept) -583.77045703 371.84278250 -1.5699389 1.307015e-01
## x3             0.58852065   0.09689712  6.0736652 4.105488e-06
## x4            -0.58235350   0.12269722 -4.7462647 9.756124e-05
## x5            -0.08588038   0.13648209 -0.6292429 5.356702e-01
## x6             0.30489527   0.18823803  1.6197326 1.195370e-01
g <- update (g,.~. - x5)
summary(g)$coefficients
##                 Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept) -660.1989011 346.80024485 -1.903686 6.953787e-02
## x3             0.5826393   0.09517037  6.122067 3.030435e-06
## x4            -0.5409756   0.10221861 -5.292340 2.265456e-05
## x6             0.3430289   0.17586008  1.950579 6.339583e-02
g <- update (g,.~. - x6)
summary(g)$coefficients
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 16.2414243  2.8728839  5.653352 8.037305e-06
## x3           0.7131089  0.0715478  9.966887 5.244276e-10
## x4          -0.5584150  0.1076125 -5.189128 2.576210e-05

Above all, I think haveing 4 variables is the best choice: incept, x3, x4, x6.

(3) Multicollinearity
(a) pair wise correlation
d3 <- cbind(d2[,1:3],d2[,5])
names(d3) <-c("y", "x3", "x4", "x6")
g <- lm(y~., d3)
round(cor(d3), 3)
##         y     x3     x4     x6
## y   1.000  0.872 -0.646  0.748
## x3  0.872  1.000 -0.360  0.736
## x4 -0.646 -0.360  1.000 -0.320
## x6  0.748  0.736 -0.320  1.000

There are large correlations between predictors, and between the predictors and reponses.

(b) eigen decomposition
#eigen decomposition
x <- model.matrix(g)[,-1] 
e <- eigen (t(x) %*% x)
e$val     #eigen values
## [1] 1.053030e+08 2.819885e+02 9.141320e+01
sqrt(e$val[1]/e$val) #condition index
## [1]    1.0000  611.0893 1073.2870

Several condition index are large, which imply strong association.

(c) VIF
vif(x)
##       x3       x4       x6 
## 2.270092 1.157623 2.201904

VIF is actually fine. But other two tests show strong correlation and collinarity, and we shall perhaps do a Ridge Regression or PCA.

(4) Transformation
library(MASS)
## Warning: package 'MASS' was built under R version 3.1.3
## 
## Attaching package: 'MASS'
## 
## The following object is masked _by_ '.GlobalEnv':
## 
##     studres
bla <-boxcox(g,plotit=T,lambda=seq(-0.3, 1.5, by=0.1))

bla$x[which(bla$y==max(bla$y))]  # the lambda value that gives max Log-likelihood
## [1] 0.6272727

It seems like we might want to do a transformation on y with \(\lambda\) = 0.6272727.

(5) Prediction on Salinity
#perform boxcox transformation on y, and do the regression
y2 <- d3$y^bla$x[which(bla$y==max(bla$y))]  
d4 <- cbind(y2, d3)
reg4 <- lm(y2~x3+x4+x6,d4)

par(oma = c(4,1,1,1))
#make predictions using the existing data of x3, x4, x6:
pred <- predict(reg4, data.frame(d4$x3,d4$x4,d4$x6), interval="predict")
## Warning: 'newdata' had 27 rows but variables found have 28 rows
pred <- pred^(1/bla$x[which(bla$y==max(bla$y))]) #transforming y back
plot(pred[,1],pch='|',ylab="salinity",ylim=c(3,18))
points(pred[,2],col='red',pch='-', type='l')
points(pred[,3],col='red',pch='-', type='l')
points(d4$y, col = 'green', pch="*")
title("Prediction Interval")

legend("bottom", legend=c("Point Pred","Pred Interval", "Observed yi"), pch=c("|","-","*"),col=c("black","red","green"),xpd = TRUE,horiz = TRUE,inset = c(0,-0.4) )

In general, we want to make the boxcox transformation on y, and then do the regression and prediction , then transforming y back.

From the last figure, we can see that the prediction does a good job capturing almost all the observed yi’s in the prediction interval.