(Continuation of problem 18 from Chapter 3.) Consider the data set of Table 3.10, in which yarn strengths (X1) are tabulated alongside each of mean fiber length (X2), tensile strength of the fibers (X3) and fiber fineness (X4). Identifying X1 with the response Y and using each of the other three variables X2, X3, X4, or any subset of them that you consider appropriate, as pre- dictors, compute the diagnostics associated with the linear model and state which observations you consider to be outliers, or unduly influential, or both.
rm(list=ls(all=TRUE))
source("/Users/JieHuang1/Dropbox/Statistics_study/STOR_664/R/R_codes/diagnose.R")
source("/Users/JieHuang1/Dropbox/Statistics_study/STOR_664/R/R_codes/dnsim.R")
source("/Users/JieHuang1/Dropbox/Statistics_study/STOR_664/R/R_codes/normtest.R")
fiber <- read.table('~/Dropbox/Statistics_study/STOR_664/Homework/HW2/HW2_C_18_data.dat')
names(fiber) <- c('obs', 'y','x2','x3','x4')
fiber = log(fiber)
attach(fiber)
fiber.lm <- lm(y ~ x2+x3+x4)
summary(fiber.lm)
##
## Call:
## lm(formula = y ~ x2 + x3 + x4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.10878 -0.04042 -0.01865 0.07459 0.11288
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.94934 2.20907 0.882 0.3906
## x2 0.83492 0.15755 5.299 7.19e-05 ***
## x3 0.09126 0.39551 0.231 0.8204
## x4 -0.38146 0.14939 -2.553 0.0213 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06937 on 16 degrees of freedom
## Multiple R-squared: 0.7184, Adjusted R-squared: 0.6656
## F-statistic: 13.61 on 3 and 16 DF, p-value: 0.0001143
shapiro.test(residuals(fiber.lm))
##
## Shapiro-Wilk normality test
##
## data: residuals(fiber.lm)
## W = 0.914, p-value = 0.07609
ks.test(residuals(fiber.lm),'pnorm')
##
## One-sample Kolmogorov-Smirnov test
##
## data: residuals(fiber.lm)
## D = 0.4567, p-value = 0.0002474
## alternative hypothesis: two-sided
anova(fiber.lm) #question here, what's anova on multiple x?
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x2 1 0.134618 0.134618 27.9725 7.343e-05 ***
## x3 1 0.030440 0.030440 6.3252 0.02297 *
## x4 1 0.031377 0.031377 6.5199 0.02126 *
## Residuals 16 0.077000 0.004813
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
norm.test(y,x2+x3+x4)
## L-G K-S C-V A-D
## 0.220 0.341 0.167 0.189
X = model.matrix(~x2+x3+x4, fiber)
XtXi = solve(t(X) %*% X)
hat = X %*% XtXi %*% t(X)
n = dim(X)[1]
p = dim(X)[2]
# (1)check leverage
hii = round(diag(hat), digits =3)
#(2)Internally standardized residual
e_ISR <- round(stanres(fiber.lm) , digits =2)
#(3)Externally studentized residual
d_ESR <- round(studres(fiber.lm) , digits =2)
#(4)DFFITS
DFFits <- round(dffits(fiber.lm), digits =2)
#(5)DFBETAS
DFB <- round(dfbetas(fiber.lm),digits =2)
Itcept_df <- DFB[,1]
x2_df<- DFB[,2]
x3_df<- DFB[,3]
x4_df<- DFB[,4]
#(6) Simulation
dnsim(y,x2+x3+x4)
## [1] "outliers index:"
## 10 5 19 11 18 17
## 10 11 12 13 14 15
## [1] "outliers index:"
## 2 5 19 17 18
## 9 10 11 14 16
## NULL
#summary
betas_count = (abs(Itcept_df) >= 2/sqrt(n)) + +
(abs(x2_df) >= 2/sqrt(n))+ +
(abs(x3_df) >= 2/sqrt(n)) + +
(abs(x4_df) >= 2/sqrt(n))
all_count = (hii >= 2*p/n)+ +
(abs(d_ESR) >= qt(0.975,n-p))+ +
(DFFits >= 2 * sqrt(p/n))+ +
(abs(Itcept_df) >= 2/sqrt(n)) + +
(abs(x2_df) >= 2/sqrt(n))+ +
(abs(x3_df) >= 2/sqrt(n)) + +
(abs(x4_df) >= 2/sqrt(n))
summary <- cbind(1:n, hii, hii >= 2*p/n, e_ISR, 0, d_ESR, abs(d_ESR) >= qt(0.975,n-p), DFFits, DFFits >= 2 * sqrt(p/n), Itcept_df, abs(Itcept_df) >= 2/sqrt(n), x2_df, abs(x2_df) >= 2/sqrt(n), x3_df, abs(x3_df) >= 2/sqrt(n), x4_df, abs(x4_df) >= 2/sqrt(n),betas_count,all_count)
summary
## hii e_ISR d_ESR DFFits Itcept_df x2_df x3_df x4_df
## 1 1 0.150 0 -0.24 0 -0.23 0 -0.10 0 0.06 0 -0.06 0 -0.04 0 -0.05 0
## 2 2 0.142 0 -1.06 0 -1.06 0 -0.43 0 0.34 0 -0.17 0 -0.30 0 -0.23 0
## 3 3 0.068 0 1.12 0 1.13 0 0.31 0 0.14 0 -0.06 0 -0.14 0 -0.05 0
## 4 4 0.087 0 1.28 0 1.31 0 0.41 0 0.17 0 -0.11 0 -0.18 0 0.01 0
## 5 5 0.064 0 -0.33 0 -0.32 0 -0.08 0 -0.02 0 0.01 0 0.03 0 0.00 0
## 6 6 0.188 0 1.53 0 1.60 0 0.77 0 0.49 1 -0.14 0 -0.55 1 -0.10 0
## 7 7 0.194 0 1.21 0 1.23 0 0.60 0 0.39 0 -0.15 0 -0.42 0 -0.08 0
## 8 8 0.387 0 1.37 0 1.41 0 1.12 1 -0.51 1 0.92 1 0.33 0 -0.07 0
## 9 9 0.287 0 0.11 0 0.11 0 0.07 0 -0.02 0 0.05 0 0.00 0 -0.02 0
## 10 10 0.204 0 -0.54 0 -0.53 0 -0.27 0 -0.22 0 0.12 0 0.17 0 0.18 0
## 11 11 0.132 0 -1.06 0 -1.06 0 -0.42 0 -0.06 0 -0.18 0 0.17 0 -0.05 0
## 12 12 0.093 0 -0.21 0 -0.20 0 -0.06 0 -0.01 0 -0.02 0 0.02 0 -0.01 0
## 13 13 0.065 0 -0.03 0 -0.03 0 -0.01 0 0.00 0 0.00 0 0.00 0 0.00 0
## 14 14 0.385 0 -0.83 0 -0.83 0 -0.65 0 0.32 0 0.33 0 -0.38 0 -0.51 1
## 15 15 0.170 0 -1.72 0 -1.85 0 -0.84 0 -0.13 0 -0.31 0 0.34 0 -0.17 0
## 16 16 0.057 0 -0.49 0 -0.48 0 -0.12 0 0.03 0 0.01 0 -0.04 0 -0.03 0
## 17 17 0.286 0 -0.43 0 -0.42 0 -0.27 0 -0.13 0 0.04 0 0.08 0 0.22 0
## 18 18 0.405 1 -0.72 0 -0.71 0 -0.59 0 -0.25 0 0.16 0 0.10 0 0.46 1
## 19 19 0.364 0 2.04 0 2.30 1 1.74 1 -1.15 1 -0.62 1 1.48 1 0.81 1
## 20 20 0.272 0 -0.86 0 -0.85 0 -0.52 0 0.04 0 0.41 0 -0.20 0 -0.04 0
## betas_count all_count
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 2 2
## 7 0 0
## 8 2 3
## 9 0 0
## 10 0 0
## 11 0 0
## 12 0 0
## 13 0 0
## 14 1 1
## 15 0 0
## 16 0 0
## 17 0 0
## 18 1 2
## 19 4 6
## 20 0 0
fiber.lm <- lm(y ~ x2+x4)
summary(fiber.lm)
##
## Call:
## lm(formula = y ~ x2 + x4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.11149 -0.03979 -0.01930 0.07103 0.12112
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.4244 0.7779 3.117 0.00628 **
## x2 0.8355 0.1531 5.458 4.25e-05 ***
## x4 -0.4040 0.1098 -3.681 0.00185 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06741 on 17 degrees of freedom
## Multiple R-squared: 0.7175, Adjusted R-squared: 0.6842
## F-statistic: 21.58 on 2 and 17 DF, p-value: 2.159e-05
shapiro.test(residuals(fiber.lm))
##
## Shapiro-Wilk normality test
##
## data: residuals(fiber.lm)
## W = 0.923, p-value = 0.1131
ks.test(residuals(fiber.lm),'pnorm')
##
## One-sample Kolmogorov-Smirnov test
##
## data: residuals(fiber.lm)
## D = 0.4556, p-value = 0.0002585
## alternative hypothesis: two-sided
anova(fiber.lm) #question here, what's anova on multiple x?
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x2 1 0.134618 0.134618 29.622 4.388e-05 ***
## x4 1 0.061561 0.061561 13.546 0.001855 **
## Residuals 17 0.077256 0.004544
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
norm.test(y,x2+x4)
## L-G K-S C-V A-D
## 0.069 0.032 0.044 0.049
X = model.matrix(~x2+x4, fiber)
XtXi = solve(t(X) %*% X)
hat = X %*% XtXi %*% t(X)
n = dim(X)[1]
p = dim(X)[2]
# (1)check leverage
hii = round(diag(hat), digits =3)
#(2)Internally standardized residual
e_ISR <- round(stanres(fiber.lm) , digits =2)
#(3)Externally studentized residual
d_ESR <- round(studres(fiber.lm) , digits =2)
#(4)DFFITS
DFFits <- round(dffits(fiber.lm), digits =2)
#(5)DFBETAS
DFB <- round(dfbetas(fiber.lm),digits =2)
Itcept_df <- DFB[,1]
x2_df<- DFB[,2]
x4_df<- DFB[,3]
#(6) Simulation
dnsim(y,x2+x4)
## [1] "outliers index:"
## 18 5 17 19 11 10 9
## 10 11 12 13 14 15 18
## [1] "outliers index:"
## 12 3 5 1
## 6 8 9 10
## NULL
#summary
betas_count = (abs(Itcept_df) >= 2/sqrt(n)) + +
(abs(x2_df) >= 2/sqrt(n))+ +
(abs(x4_df) >= 2/sqrt(n))
all_count = (hii >= 2*p/n)+ +
(abs(d_ESR) >= qt(0.975,n-p))+ +
(DFFits >= 2 * sqrt(p/n))+ +
(abs(Itcept_df) >= 2/sqrt(n)) + +
(abs(x2_df) >= 2/sqrt(n))+ +
(abs(x4_df) >= 2/sqrt(n))
summary <- cbind(1:n, hii, hii >= 2*p/n, e_ISR, 0, d_ESR, abs(d_ESR) >= qt(0.975,n-p), DFFits, DFFits >= 2 * sqrt(p/n), Itcept_df, abs(Itcept_df) >= 2/sqrt(n), x2_df, abs(x2_df) >= 2/sqrt(n), x4_df, abs(x4_df) >= 2/sqrt(n), betas_count,all_count)
summary
## hii e_ISR d_ESR DFFits Itcept_df x2_df x4_df
## 1 1 0.119 0 -0.20 0 -0.19 0 -0.07 0 0.05 0 -0.05 0 -0.02 0
## 2 2 0.075 0 -0.98 0 -0.98 0 -0.28 0 0.15 0 -0.16 0 -0.04 0
## 3 3 0.054 0 1.12 0 1.13 0 0.27 0 0.03 0 -0.06 0 0.05 0
## 4 4 0.071 0 1.28 0 1.30 0 0.36 0 0.01 0 -0.11 0 0.16 0
## 5 5 0.057 0 -0.36 0 -0.35 0 -0.09 0 0.01 0 0.01 0 -0.03 0
## 6 6 0.094 0 1.41 0 1.46 0 0.47 0 -0.04 0 -0.12 0 0.29 0
## 7 7 0.100 0 1.10 0 1.11 0 0.37 0 0.00 0 -0.13 0 0.22 0
## 8 8 0.354 1 1.43 0 1.48 0 1.09 1 -0.59 1 0.94 1 -0.38 0
## 9 9 0.286 0 0.12 0 0.12 0 0.08 0 -0.04 0 0.06 0 -0.03 0
## 10 10 0.119 0 -0.60 0 -0.59 0 -0.22 0 -0.17 0 0.13 0 0.10 0
## 11 11 0.110 0 -1.11 0 -1.12 0 -0.39 0 0.27 0 -0.18 0 -0.23 0
## 12 12 0.084 0 -0.23 0 -0.23 0 -0.07 0 0.04 0 -0.02 0 -0.04 0
## 13 13 0.052 0 -0.01 0 -0.01 0 0.00 0 0.00 0 0.00 0 0.00 0
## 14 14 0.252 0 -0.68 0 -0.67 0 -0.39 0 -0.07 0 0.24 0 -0.25 0
## 15 15 0.141 0 -1.78 0 -1.92 0 -0.78 0 0.54 1 -0.31 0 -0.54 1
## 16 16 0.050 0 -0.48 0 -0.47 0 -0.11 0 -0.01 0 0.01 0 0.00 0
## 17 17 0.259 0 -0.48 0 -0.47 0 -0.28 0 -0.17 0 0.04 0 0.25 0
## 18 18 0.392 1 -0.77 0 -0.76 0 -0.61 0 -0.43 0 0.17 0 0.55 1
## 19 19 0.098 0 1.89 0 2.07 0 0.68 0 0.48 1 -0.45 1 -0.16 0
## 20 20 0.232 0 -0.81 0 -0.80 0 -0.44 0 -0.38 0 0.37 0 0.11 0
## betas_count all_count
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## 7 0 0
## 8 2 4
## 9 0 0
## 10 0 0
## 11 0 0
## 12 0 0
## 13 0 0
## 14 0 0
## 15 2 2
## 16 0 0
## 17 0 0
## 18 1 2
## 19 2 2
## 20 0 0