Chapter 4 (Faraway book) exercise 10

(Continuation of problem 18 from Chapter 3.) Consider the data set of Table 3.10, in which yarn strengths (X1) are tabulated alongside each of mean fiber length (X2), tensile strength of the fibers (X3) and fiber fineness (X4). Identifying X1 with the response Y and using each of the other three variables X2, X3, X4, or any subset of them that you consider appropriate, as pre- dictors, compute the diagnostics associated with the linear model and state which observations you consider to be outliers, or unduly influential, or both.

rm(list=ls(all=TRUE)) 
source("/Users/JieHuang1/Dropbox/Statistics_study/STOR_664/R/R_codes/diagnose.R")
source("/Users/JieHuang1/Dropbox/Statistics_study/STOR_664/R/R_codes/dnsim.R")
source("/Users/JieHuang1/Dropbox/Statistics_study/STOR_664/R/R_codes/normtest.R")

fiber <- read.table('~/Dropbox/Statistics_study/STOR_664/Homework/HW2/HW2_C_18_data.dat')
names(fiber) <- c('obs', 'y','x2','x3','x4')
fiber = log(fiber)
attach(fiber)

log model, with all three X’s

fiber.lm <- lm(y ~ x2+x3+x4)
summary(fiber.lm)
## 
## Call:
## lm(formula = y ~ x2 + x3 + x4)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.10878 -0.04042 -0.01865  0.07459  0.11288 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.94934    2.20907   0.882   0.3906    
## x2           0.83492    0.15755   5.299 7.19e-05 ***
## x3           0.09126    0.39551   0.231   0.8204    
## x4          -0.38146    0.14939  -2.553   0.0213 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06937 on 16 degrees of freedom
## Multiple R-squared:  0.7184, Adjusted R-squared:  0.6656 
## F-statistic: 13.61 on 3 and 16 DF,  p-value: 0.0001143
shapiro.test(residuals(fiber.lm)) 
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(fiber.lm)
## W = 0.914, p-value = 0.07609
ks.test(residuals(fiber.lm),'pnorm')
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  residuals(fiber.lm)
## D = 0.4567, p-value = 0.0002474
## alternative hypothesis: two-sided
anova(fiber.lm)  #question here, what's anova on multiple x?
## Analysis of Variance Table
## 
## Response: y
##           Df   Sum Sq  Mean Sq F value    Pr(>F)    
## x2         1 0.134618 0.134618 27.9725 7.343e-05 ***
## x3         1 0.030440 0.030440  6.3252   0.02297 *  
## x4         1 0.031377 0.031377  6.5199   0.02126 *  
## Residuals 16 0.077000 0.004813                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
norm.test(y,x2+x3+x4)
##   L-G   K-S   C-V   A-D 
## 0.220 0.341 0.167 0.189
X = model.matrix(~x2+x3+x4, fiber)
XtXi = solve(t(X) %*% X) 
hat = X %*% XtXi %*% t(X)

n = dim(X)[1]
p = dim(X)[2]

# (1)check leverage
hii = round(diag(hat), digits =3)

#(2)Internally standardized residual
e_ISR <- round(stanres(fiber.lm) , digits =2)

#(3)Externally studentized residual
d_ESR <- round(studres(fiber.lm) , digits =2)

#(4)DFFITS
DFFits <- round(dffits(fiber.lm), digits =2)

#(5)DFBETAS
DFB <- round(dfbetas(fiber.lm),digits =2)
Itcept_df <- DFB[,1]
x2_df<- DFB[,2]
x3_df<- DFB[,3]
x4_df<- DFB[,4]

#(6) Simulation
dnsim(y,x2+x3+x4)

## [1] "outliers index:"
## 10  5 19 11 18 17 
## 10 11 12 13 14 15

## [1] "outliers index:"
##  2  5 19 17 18 
##  9 10 11 14 16
## NULL
#summary
betas_count = (abs(Itcept_df) >= 2/sqrt(n)) + +
                (abs(x2_df) >= 2/sqrt(n))+ +
                (abs(x3_df) >= 2/sqrt(n)) + +
                (abs(x4_df) >= 2/sqrt(n))

all_count = (hii >= 2*p/n)+ +
                (abs(d_ESR) >= qt(0.975,n-p))+ +
                (DFFits >= 2 * sqrt(p/n))+ +
                (abs(Itcept_df) >= 2/sqrt(n)) + +
                (abs(x2_df) >= 2/sqrt(n))+ +
                (abs(x3_df) >= 2/sqrt(n)) + +
                (abs(x4_df) >= 2/sqrt(n))

summary <- cbind(1:n, hii, hii >= 2*p/n, e_ISR, 0, d_ESR, abs(d_ESR) >= qt(0.975,n-p), DFFits, DFFits >= 2 * sqrt(p/n), Itcept_df, abs(Itcept_df) >= 2/sqrt(n), x2_df, abs(x2_df) >= 2/sqrt(n), x3_df, abs(x3_df) >= 2/sqrt(n), x4_df, abs(x4_df) >= 2/sqrt(n),betas_count,all_count)

summary
##         hii   e_ISR   d_ESR   DFFits   Itcept_df   x2_df   x3_df   x4_df  
## 1   1 0.150 0 -0.24 0 -0.23 0  -0.10 0      0.06 0 -0.06 0 -0.04 0 -0.05 0
## 2   2 0.142 0 -1.06 0 -1.06 0  -0.43 0      0.34 0 -0.17 0 -0.30 0 -0.23 0
## 3   3 0.068 0  1.12 0  1.13 0   0.31 0      0.14 0 -0.06 0 -0.14 0 -0.05 0
## 4   4 0.087 0  1.28 0  1.31 0   0.41 0      0.17 0 -0.11 0 -0.18 0  0.01 0
## 5   5 0.064 0 -0.33 0 -0.32 0  -0.08 0     -0.02 0  0.01 0  0.03 0  0.00 0
## 6   6 0.188 0  1.53 0  1.60 0   0.77 0      0.49 1 -0.14 0 -0.55 1 -0.10 0
## 7   7 0.194 0  1.21 0  1.23 0   0.60 0      0.39 0 -0.15 0 -0.42 0 -0.08 0
## 8   8 0.387 0  1.37 0  1.41 0   1.12 1     -0.51 1  0.92 1  0.33 0 -0.07 0
## 9   9 0.287 0  0.11 0  0.11 0   0.07 0     -0.02 0  0.05 0  0.00 0 -0.02 0
## 10 10 0.204 0 -0.54 0 -0.53 0  -0.27 0     -0.22 0  0.12 0  0.17 0  0.18 0
## 11 11 0.132 0 -1.06 0 -1.06 0  -0.42 0     -0.06 0 -0.18 0  0.17 0 -0.05 0
## 12 12 0.093 0 -0.21 0 -0.20 0  -0.06 0     -0.01 0 -0.02 0  0.02 0 -0.01 0
## 13 13 0.065 0 -0.03 0 -0.03 0  -0.01 0      0.00 0  0.00 0  0.00 0  0.00 0
## 14 14 0.385 0 -0.83 0 -0.83 0  -0.65 0      0.32 0  0.33 0 -0.38 0 -0.51 1
## 15 15 0.170 0 -1.72 0 -1.85 0  -0.84 0     -0.13 0 -0.31 0  0.34 0 -0.17 0
## 16 16 0.057 0 -0.49 0 -0.48 0  -0.12 0      0.03 0  0.01 0 -0.04 0 -0.03 0
## 17 17 0.286 0 -0.43 0 -0.42 0  -0.27 0     -0.13 0  0.04 0  0.08 0  0.22 0
## 18 18 0.405 1 -0.72 0 -0.71 0  -0.59 0     -0.25 0  0.16 0  0.10 0  0.46 1
## 19 19 0.364 0  2.04 0  2.30 1   1.74 1     -1.15 1 -0.62 1  1.48 1  0.81 1
## 20 20 0.272 0 -0.86 0 -0.85 0  -0.52 0      0.04 0  0.41 0 -0.20 0 -0.04 0
##    betas_count all_count
## 1            0         0
## 2            0         0
## 3            0         0
## 4            0         0
## 5            0         0
## 6            2         2
## 7            0         0
## 8            2         3
## 9            0         0
## 10           0         0
## 11           0         0
## 12           0         0
## 13           0         0
## 14           1         1
## 15           0         0
## 16           0         0
## 17           0         0
## 18           1         2
## 19           4         6
## 20           0         0

log model, with only X2 and X4.

fiber.lm <- lm(y ~ x2+x4)
summary(fiber.lm)
## 
## Call:
## lm(formula = y ~ x2 + x4)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.11149 -0.03979 -0.01930  0.07103  0.12112 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.4244     0.7779   3.117  0.00628 ** 
## x2            0.8355     0.1531   5.458 4.25e-05 ***
## x4           -0.4040     0.1098  -3.681  0.00185 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06741 on 17 degrees of freedom
## Multiple R-squared:  0.7175, Adjusted R-squared:  0.6842 
## F-statistic: 21.58 on 2 and 17 DF,  p-value: 2.159e-05
shapiro.test(residuals(fiber.lm)) 
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(fiber.lm)
## W = 0.923, p-value = 0.1131
ks.test(residuals(fiber.lm),'pnorm')
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  residuals(fiber.lm)
## D = 0.4556, p-value = 0.0002585
## alternative hypothesis: two-sided
anova(fiber.lm)  #question here, what's anova on multiple x?
## Analysis of Variance Table
## 
## Response: y
##           Df   Sum Sq  Mean Sq F value    Pr(>F)    
## x2         1 0.134618 0.134618  29.622 4.388e-05 ***
## x4         1 0.061561 0.061561  13.546  0.001855 ** 
## Residuals 17 0.077256 0.004544                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
norm.test(y,x2+x4)
##   L-G   K-S   C-V   A-D 
## 0.069 0.032 0.044 0.049
X = model.matrix(~x2+x4, fiber)
XtXi = solve(t(X) %*% X) 
hat = X %*% XtXi %*% t(X)

n = dim(X)[1]
p = dim(X)[2]

# (1)check leverage
hii = round(diag(hat), digits =3)

#(2)Internally standardized residual
e_ISR <- round(stanres(fiber.lm) , digits =2)

#(3)Externally studentized residual
d_ESR <- round(studres(fiber.lm) , digits =2)

#(4)DFFITS
DFFits <- round(dffits(fiber.lm), digits =2)

#(5)DFBETAS
DFB <- round(dfbetas(fiber.lm),digits =2)
Itcept_df <- DFB[,1]
x2_df<- DFB[,2]
x4_df<- DFB[,3]

#(6) Simulation
dnsim(y,x2+x4)

## [1] "outliers index:"
## 18  5 17 19 11 10  9 
## 10 11 12 13 14 15 18

## [1] "outliers index:"
## 12  3  5  1 
##  6  8  9 10
## NULL
#summary

betas_count = (abs(Itcept_df) >= 2/sqrt(n)) + +
                (abs(x2_df) >= 2/sqrt(n))+ +
                (abs(x4_df) >= 2/sqrt(n))

all_count = (hii >= 2*p/n)+ +
                (abs(d_ESR) >= qt(0.975,n-p))+ +
                (DFFits >= 2 * sqrt(p/n))+ +
                (abs(Itcept_df) >= 2/sqrt(n)) + +
                (abs(x2_df) >= 2/sqrt(n))+ +
                (abs(x4_df) >= 2/sqrt(n))

summary <- cbind(1:n, hii, hii >= 2*p/n, e_ISR, 0, d_ESR, abs(d_ESR) >= qt(0.975,n-p), DFFits, DFFits >= 2 * sqrt(p/n), Itcept_df, abs(Itcept_df) >= 2/sqrt(n), x2_df, abs(x2_df) >= 2/sqrt(n), x4_df, abs(x4_df) >= 2/sqrt(n), betas_count,all_count)
summary 
##         hii   e_ISR   d_ESR   DFFits   Itcept_df   x2_df   x4_df  
## 1   1 0.119 0 -0.20 0 -0.19 0  -0.07 0      0.05 0 -0.05 0 -0.02 0
## 2   2 0.075 0 -0.98 0 -0.98 0  -0.28 0      0.15 0 -0.16 0 -0.04 0
## 3   3 0.054 0  1.12 0  1.13 0   0.27 0      0.03 0 -0.06 0  0.05 0
## 4   4 0.071 0  1.28 0  1.30 0   0.36 0      0.01 0 -0.11 0  0.16 0
## 5   5 0.057 0 -0.36 0 -0.35 0  -0.09 0      0.01 0  0.01 0 -0.03 0
## 6   6 0.094 0  1.41 0  1.46 0   0.47 0     -0.04 0 -0.12 0  0.29 0
## 7   7 0.100 0  1.10 0  1.11 0   0.37 0      0.00 0 -0.13 0  0.22 0
## 8   8 0.354 1  1.43 0  1.48 0   1.09 1     -0.59 1  0.94 1 -0.38 0
## 9   9 0.286 0  0.12 0  0.12 0   0.08 0     -0.04 0  0.06 0 -0.03 0
## 10 10 0.119 0 -0.60 0 -0.59 0  -0.22 0     -0.17 0  0.13 0  0.10 0
## 11 11 0.110 0 -1.11 0 -1.12 0  -0.39 0      0.27 0 -0.18 0 -0.23 0
## 12 12 0.084 0 -0.23 0 -0.23 0  -0.07 0      0.04 0 -0.02 0 -0.04 0
## 13 13 0.052 0 -0.01 0 -0.01 0   0.00 0      0.00 0  0.00 0  0.00 0
## 14 14 0.252 0 -0.68 0 -0.67 0  -0.39 0     -0.07 0  0.24 0 -0.25 0
## 15 15 0.141 0 -1.78 0 -1.92 0  -0.78 0      0.54 1 -0.31 0 -0.54 1
## 16 16 0.050 0 -0.48 0 -0.47 0  -0.11 0     -0.01 0  0.01 0  0.00 0
## 17 17 0.259 0 -0.48 0 -0.47 0  -0.28 0     -0.17 0  0.04 0  0.25 0
## 18 18 0.392 1 -0.77 0 -0.76 0  -0.61 0     -0.43 0  0.17 0  0.55 1
## 19 19 0.098 0  1.89 0  2.07 0   0.68 0      0.48 1 -0.45 1 -0.16 0
## 20 20 0.232 0 -0.81 0 -0.80 0  -0.44 0     -0.38 0  0.37 0  0.11 0
##    betas_count all_count
## 1            0         0
## 2            0         0
## 3            0         0
## 4            0         0
## 5            0         0
## 6            0         0
## 7            0         0
## 8            2         4
## 9            0         0
## 10           0         0
## 11           0         0
## 12           0         0
## 13           0         0
## 14           0         0
## 15           2         2
## 16           0         0
## 17           0         0
## 18           1         2
## 19           2         2
## 20           0         0