rm(list=ls(all=TRUE))
d <- read.table('~/Dropbox/Statistics_study/STOR_664/data/charles.dat')
# x--- Mount Airy, y--- Charston
names(d) <- c('year', "x", "y")
attach(d)
d.lm <- lm(y~x)
summary(d.lm)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.39307 -0.22517 0.04767 0.22915 1.28841
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.63641 2.25682 6.042 2.32e-07 ***
## x 0.56790 0.09595 5.918 3.58e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.447 on 47 degrees of freedom
## Multiple R-squared: 0.427, Adjusted R-squared: 0.4148
## F-statistic: 35.03 on 1 and 47 DF, p-value: 3.577e-07
alpha_hat <- summary(d.lm)$coefficients[1,1]
beta_hat <- summary(d.lm)$coefficients[2,1]
plot(x, summary(d.lm)$coefficients[1,1]+summary(d.lm)$coefficients[2,1]*x, type='l', ylab='Charston', xlab='Mount Airy', main = "Fitted line v.s. Original data points")
points(x,y,col=2)
X = model.matrix(~x , d)
XtXi = solve(t(X) %*% X)
hat = X %*% XtXi %*% t(X)
n = dim(X)[1]
p = dim(X)[2]
# ith diagnoal of hat matrix
hii = diag(hat)
# whether the ith point has a high leverage
plot(year, hii, ylab="leverage"); abline(2*p/n,0)
which((hii>2*p/n) == TRUE) #index of the elements with a high leverage
## 5 20 25 40
## 5 20 25 40
x[hii > 2*p/n] #x value of the elements with a high leverage
## [1] 24.8 22.3 22.3 25.0
year[hii > 2*p/n] #year of the elements with a high leverage
## [1] 1952 1967 1972 1987
So the 5, 20, 25, 40-th observations have high leverage. They correspond to year 1952, 1967, 1972, 1987 with Mount Airy temperature 24.8, 22.3, 22.3, 25, respectively.
e = summary(d.lm)$residual
s = summary(d.lm)$sigma
e_ISR = e/(s*sqrt(1-hii))
plot(x,e_ISR); abline(0,0)
hist(e_ISR, breaks=49, probability=TRUE, main = 'histogram of Internally studentized residual')
lines(density(e_ISR))
We want to take special notice of the observations with larger Internally standardized residuals.
d = e/(1-hii) #according to class notes number(5)
s_i = sqrt(((n-p)*s^2 - e^2/(1-hii))/(n-p-1)) #(9)
s_di = s_i/sqrt(1-hii) #(6)
d_ESR = d/s_di #(8)
#d_ESR has a t distribution with n-p degree of freedom
plot(year, d_ESR); abline(qt(0.975,n-p-1),0); abline(-qt(0.975,n-p-1),0)
which((abs(d_ESR) > qt(0.975,n-p-1)) == TRUE);
## 30 38 41
## 30 38 41
x[abs(d_ESR) > qt(0.975,n-p-1)]; y[abs(d_ESR) > qt(0.975,n-p-1)]; year[abs(d_ESR) > qt(0.975,n-p-1)]
## [1] 24.3 23.2 24.4
## [1] 28.4 28.1 26.1
## [1] 1977 1985 1988
So the 30, 38, 41-th elements are outliers, in year 1977, 1985, 1988, with x values 24.3, 23.2, 24.4 and y vaules 28.4, 28.1, 26.1 respectively.
y_hat = alpha_hat + x * beta_hat
dffits = d_ESR * sqrt(hii/(1-hii))
plot(year, dffits); abline(2 * sqrt(p/n),0); abline(-2 * sqrt(p/n),0)
which((abs(dffits) > 2 * sqrt(p/n)) == TRUE)
## 30 38 40 41
## 30 38 40 41
According to this method, 30, 38-th observations are influential.
# want to see if BETA = 0
# H0: BETA = 0 (here BETA is all the coefficients, including alpha and beta)
BETA_hat <- matrix(data = c(alpha_hat, beta_hat))
#the coefficients with i-th elements deleted:
BETA_delete_i_hat <- t(data.frame(alpha = 0, beta = rep(0,n)))
for (i in 1:n){
BETA_delete_i_hat[,i] = BETA_hat - XtXi %*% matrix(X[i,]) %*% e[i] /(1-hii[i])}
ckk <- rep(0,p)
for (k in 1:p) {ckk[k] = XtXi[k,k]}
# check alpha
DFBETAS_alpha = (BETA_hat[1] - BETA_delete_i_hat[1,])/(s_i * sqrt(ckk[1]))
plot(year, DFBETAS_alpha); abline(2/sqrt(n),0); abline(-2/sqrt(n),0)
which((abs(DFBETAS_alpha) > 2/sqrt(n)) == TRUE)
## 30 40 41
## 30 40 41
# check beta
DFBETAS_beta = (BETA_hat[2] - BETA_delete_i_hat[2,])/(s_i * sqrt(ckk[2]))
plot(year, DFBETAS_beta); abline(2/sqrt(n),0); abline(-2/sqrt(n),0)
which((abs(DFBETAS_beta) > 2/sqrt(n)) == TRUE)
## 30 40 41
## 30 40 41
The 30, 40, 41-th observations have large influence on the \(\alpha\) parameter.
The 30, 40, 41-th observations have large influence on the \(\beta\) parameter.
D = (e^2 * hii) / (p*(s^2)*((1-hii)^2))
plot(year, D); abline(qf(0.1, p, n-p),0);abline(qf(0.5, p, n-p),0)
which((D>qf(0.1, p, n-p)) == TRUE)
## 30 38 40 41
## 30 38 40 41
which((D>qf(0.5, p, n-p)) == TRUE) #a little more relaxed cutoff
## named integer(0)
According to this method, the 30, 38, 40, 41 elements are influential. Although there is one observation that is very close to the cut-off line w(the 41st observation).
covratio = ((n-p-1)/(n-p) + d_ESR^2/(n-p))^(-p) * (1-hii)^(-1)
plot(year,covratio); abline(3*p/n+1,0); abline(-3*p/n+1,0)
which((abs(covratio-1) > 3*p/n) ==TRUE)
## 5 25 38 41
## 5 25 38 41
According to this method, the 5, 25, 38, 41-th elements have influential effects on the variances of the parameter estimates.
summary = cbind(hii, hii > 2*p/n, e_ISR, 0, d_ESR, abs(d_ESR) > qt(0.975,n-p-1), dffits, dffits > 2 * sqrt(p/n), DFBETAS_alpha, abs(DFBETAS_alpha) > 2/sqrt(n), DFBETAS_beta, abs(DFBETAS_beta) > 2/sqrt(n), D, D > qf(0.1, p, n-p), covratio, abs(covratio-1) > 3*p/n)
round(summary, digits= 2)
## hii e_ISR d_ESR dffits DFBETAS_alpha DFBETAS_beta D
## 1 0.02 0 0.49 0 0.49 0 0.07 0 0.00 0 0.00 0 0.00 0
## 2 0.04 0 -0.28 0 -0.28 0 -0.05 0 0.03 0 -0.04 0 0.00 0
## 3 0.04 0 0.82 0 0.82 0 0.16 0 0.11 0 -0.11 0 0.01 0
## 4 0.03 0 -0.38 0 -0.37 0 -0.07 0 0.04 0 -0.04 0 0.00 0
## 5 0.10 1 0.66 0 0.65 0 0.21 0 -0.19 0 0.19 0 0.02 0
## 6 0.04 0 -1.78 0 -1.83 0 -0.38 0 0.27 0 -0.28 0 0.07 0
## 7 0.04 0 1.54 0 1.57 0 0.30 0 -0.20 0 0.20 0 0.05 0
## 8 0.02 0 -0.41 0 -0.41 0 -0.06 0 0.00 0 0.00 0 0.00 0
## 9 0.02 0 0.27 0 0.26 0 0.04 0 0.00 0 0.00 0 0.00 0
## 10 0.02 0 -0.83 0 -0.83 0 -0.13 0 -0.04 0 0.04 0 0.01 0
## 11 0.02 0 0.24 0 0.23 0 0.04 0 -0.01 0 0.01 0 0.00 0
## 12 0.02 0 -0.34 0 -0.34 0 -0.05 0 0.02 0 -0.02 0 0.00 0
## 13 0.02 0 -0.99 0 -0.99 0 -0.14 0 0.02 0 -0.02 0 0.01 0
## 14 0.05 0 -0.29 0 -0.29 0 -0.07 0 -0.05 0 0.05 0 0.00 0
## 15 0.04 0 -0.42 0 -0.42 0 -0.09 0 -0.07 0 0.07 0 0.00 0
## 16 0.03 0 0.00 0 0.00 0 0.00 0 0.00 0 0.00 0 0.00 0
## 17 0.02 0 -0.71 0 -0.70 0 -0.11 0 -0.05 0 0.05 0 0.01 0
## 18 0.05 0 -0.06 0 -0.06 0 -0.01 0 -0.01 0 0.01 0 0.00 0
## 19 0.02 0 -1.39 0 -1.40 0 -0.22 0 -0.10 0 0.09 0 0.02 0
## 20 0.09 1 -0.94 0 -0.94 0 -0.29 0 -0.26 0 0.25 0 0.04 0
## 21 0.02 0 0.72 0 0.72 0 0.10 0 0.00 0 0.00 0 0.01 0
## 22 0.02 0 0.11 0 0.11 0 0.02 0 -0.01 0 0.01 0 0.00 0
## 23 0.02 0 0.94 0 0.94 0 0.14 0 0.01 0 0.00 0 0.01 0
## 24 0.03 0 0.46 0 0.46 0 0.08 0 0.05 0 -0.05 0 0.00 0
## 25 0.09 1 0.00 0 0.00 0 0.00 0 0.00 0 0.00 0 0.00 0
## 26 0.02 0 -0.51 0 -0.51 0 -0.07 0 -0.01 0 0.01 0 0.00 0
## 27 0.06 0 -1.32 0 -1.33 0 -0.33 0 -0.27 0 0.27 0 0.05 0
## 28 0.02 0 0.91 0 0.91 0 0.14 0 -0.03 0 0.04 0 0.01 0
## 29 0.05 0 -0.75 0 -0.75 0 -0.17 0 -0.14 0 0.13 0 0.02 0
## 30 0.05 0 2.21 0 2.31 1 0.53 1 -0.39 1 0.40 1 0.13 1
## 31 0.03 0 0.99 0 0.99 0 0.18 0 -0.10 0 0.11 0 0.02 0
## 32 0.05 0 -0.06 0 -0.06 0 -0.01 0 -0.01 0 0.01 0 0.00 0
## 33 0.05 0 0.38 0 0.37 0 0.08 0 -0.06 0 0.06 0 0.00 0
## 34 0.07 0 0.12 0 0.11 0 0.03 0 -0.02 0 0.03 0 0.00 0
## 35 0.02 0 0.46 0 0.46 0 0.07 0 -0.02 0 0.02 0 0.00 0
## 36 0.05 0 -1.23 0 -1.24 0 -0.28 0 0.21 0 -0.22 0 0.04 0
## 37 0.02 0 0.30 0 0.29 0 0.04 0 0.01 0 -0.01 0 0.00 0
## 38 0.02 0 2.92 0 3.19 1 0.51 1 0.23 0 -0.22 0 0.11 1
## 39 0.07 0 0.12 0 0.11 0 0.03 0 -0.02 0 0.03 0 0.00 0
## 40 0.12 1 -1.27 0 -1.28 0 -0.48 0 0.43 1 -0.44 1 0.11 1
## 41 0.06 0 -3.21 0 -3.59 1 -0.88 0 0.69 1 -0.71 1 0.31 1
## 42 0.02 0 -0.44 0 -0.44 0 -0.07 0 0.02 0 -0.02 0 0.00 0
## 43 0.03 0 0.43 0 0.43 0 0.07 0 -0.03 0 0.04 0 0.00 0
## 44 0.03 0 0.43 0 0.43 0 0.07 0 -0.03 0 0.04 0 0.00 0
## 45 0.06 0 0.53 0 0.52 0 0.13 0 0.11 0 -0.11 0 0.01 0
## 46 0.03 0 0.89 0 0.88 0 0.15 0 -0.07 0 0.08 0 0.01 0
## 47 0.05 0 -0.75 0 -0.75 0 -0.17 0 -0.14 0 0.13 0 0.02 0
## 48 0.02 0 0.65 0 0.65 0 0.10 0 0.05 0 -0.04 0 0.01 0
## 49 0.07 0 0.66 0 0.66 0 0.18 0 0.15 0 -0.15 0 0.02 0
## covratio
## 1 1.05 0
## 2 1.08 0
## 3 1.05 0
## 4 1.07 0
## 5 1.13 1
## 6 0.95 0
## 7 0.98 0
## 8 1.06 0
## 9 1.06 0
## 10 1.04 0
## 11 1.06 0
## 12 1.06 0
## 13 1.02 0
## 14 1.10 0
## 15 1.08 0
## 16 1.08 0
## 17 1.05 0
## 18 1.10 0
## 19 0.98 0
## 20 1.10 0
## 21 1.04 0
## 22 1.07 0
## 23 1.03 0
## 24 1.07 0
## 25 1.14 1
## 26 1.05 0
## 27 1.03 0
## 28 1.03 0
## 29 1.07 0
## 30 0.88 0
## 31 1.03 0
## 32 1.10 0
## 33 1.09 0
## 34 1.12 0
## 35 1.06 0
## 36 1.03 0
## 37 1.06 0
## 38 0.72 1
## 39 1.12 0
## 40 1.11 0
## 41 0.68 1
## 42 1.06 0
## 43 1.06 0
## 44 1.06 0
## 45 1.10 0
## 46 1.04 0
## 47 1.07 0
## 48 1.05 0
## 49 1.10 0
#standarized residuals against fitted value
plot(y_hat, e_ISR, main="standarized residuals against fitted value")
abline(0,0)
#Normal probability plot for standardized residuals
qqnorm(e_ISR, main = "Normal probability plot for standardized residuals")
qqline(e_ISR)
the provided dnism.r file doesn’t give which points are outliers, so I added some code before the return() statement, to print the information out:
# print("Plot (a)")
# print("sorted externally standardized residuals")
# print(rs1)
#
# print("outliers:")
# outliers = rs1<rs2 | rs1 > rs3
# print(outliers)
#
# print("outliers index:")
# outliers_index = which((rs1<rs2 | rs1 > rs3) ==TRUE)
# print(outliers_index)
#
# print("")
# print("Plot (b)")
# print("sorted DFFITS")
# print(df1)
#
# print("outliers:")
# outliers = df1<df2 | df1 > df3
# print(outliers)
#
# print("outliers index:")
# outliers_index = which((df1<df2 | df1 > df3) ==TRUE)
# print(outliers_index)
source("/Users/JieHuang1/Dropbox/Statistics_study/STOR_664/R/R_codes/diagnose.R")
source("/Users/JieHuang1/Dropbox/Statistics_study/STOR_664/R/R_codes/dnsim.R")
dnsim(y,x)
## [1] "Plot (a)"
## [1] "sorted externally standardized residuals"
## 25 16 18 32 22 34
## 0.001124229 0.004471851 0.062788921 0.062788921 0.106814085 0.114811279
## 39 11 9 2 14 37
## 0.114811279 0.233905201 0.264131127 0.276848017 0.290179578 0.294877288
## 12 33 4 8 15 43
## 0.341706836 0.372125628 0.373637977 0.407588132 0.418155003 0.429190299
## 44 42 24 35 1 26
## 0.429190299 0.438499878 0.455447155 0.458595142 0.488790543 0.505007217
## 45 48 5 49 17 21
## 0.524200251 0.649284774 0.654598642 0.658334577 0.702013955 0.715062501
## 29 47 3 10 46 28
## 0.748690951 0.748690951 0.815148631 0.830698603 0.884557369 0.913287878
## 20 23 31 13 36 40
## 0.936805271 0.943721423 0.986397322 0.991638858 1.237153861 1.283585036
## 27 19 7 6 30 38
## 1.326692577 1.399462994 1.567187275 1.825457299 2.310592821 3.190950239
## 41
## 3.592263297
## [1] "outliers:"
## 25 16 18 32 22 34 39 11 9 2 14 37
## TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 12 33 4 8 15 43 44 42 24 35 1 26
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 45 48 5 49 17 21 29 47 3 10 46 28
## TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE
## 20 23 31 13 36 40 27 19 7 6 30 38
## TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE
## 41
## TRUE
## [1] "outliers index:"
## 25 16 45 47 10 46 28 20 23 31 13 27 19 38 41
## 1 2 25 32 34 35 36 37 38 39 40 43 44 48 49
## [1] ""
## [1] "Plot (b)"
## [1] "sorted DFFITS"
## 25 16 18 32 22
## 0.0003489712 0.0008183148 0.0145032984 0.0145032984 0.0168487059
## 34 39 11 9 37
## 0.0304071730 0.0304071730 0.0351370373 0.0381286182 0.0446806803
## 2 12 8 42 14
## 0.0538343944 0.0539003634 0.0588373375 0.0658710728 0.0670271272
## 4 35 1 43 44
## 0.0673408360 0.0688897660 0.0705593022 0.0720488445 0.0720488445
## 26 24 33 15 21
## 0.0739051315 0.0833433707 0.0846023193 0.0893313234 0.1032227645
## 48 17 10 45 23
## 0.1036303871 0.1120463329 0.1258699134 0.1307591690 0.1362307966
## 28 13 46 3 29
## 0.1371933160 0.1444549892 0.1484920243 0.1610394127 0.1729363727
## 47 49 31 5 19
## 0.1729363727 0.1770181943 0.1777785563 0.2146095483 0.2233640733
## 36 20 7 27 6
## 0.2812654599 0.2907929747 0.3047469099 0.3309369248 0.3837864551
## 40 38 30 41
## 0.4799587262 0.5092979565 0.5253105316 0.8822399547
## [1] "outliers:"
## 25 16 18 32 22 34 39 11 9 37 2 12
## FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 8 42 14 4 35 1 43 44 26 24 33 15
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE
## 21 48 17 10 45 23 28 13 46 3 29 47
## FALSE TRUE TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE TRUE
## 49 31 5 19 36 20 7 27 6 40 38 30
## TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## 41
## FALSE
## [1] "outliers index:"
## 16 26 33 15 48 17 28 13 46 3 47 49 31 40
## 2 21 23 24 26 27 31 32 33 34 36 37 38 46
## NULL
# the same with my (2)Internally standardized residual
plot(stanres(d.lm), main = "Internally standardized residual"); abline(0,0)
# the same with my (3)Externally studentized residual
plot(studres(d.lm)) ; abline(qt(0.975,n-p-1),0); abline(-qt(0.975,n-p-1),0)
# the same with my (4)DFFITS
plot(dffits(d.lm)); abline(2 * sqrt(p/n),0); abline(-2 * sqrt(p/n),0)
# the same with my (5)DEBETAS
plot(dfbetas(d.lm)[,1]); abline(2/sqrt(n),0); abline(-2/sqrt(n),0)
plot(dfbetas(d.lm)[,2]); abline(2/sqrt(n),0); abline(-2/sqrt(n),0)
#Under H0: beta = 0
#the following standardized beta_hat should have t distribution with n-p degree of freedom
stanbeta <- sqrt(sum((x-mean(x))^2)) * abs(beta_hat/summary(d.lm)$sigma)
stanbeta > qt(0.975, n-p)
## [1] TRUE
# reject H0, beta_hat is statistical significant.
#test of normality
shapiro.test(residuals(d.lm))
##
## Shapiro-Wilk normality test
##
## data: residuals(d.lm)
## W = 0.9638, p-value = 0.1353
ks.test(residuals(d.lm),'pnorm')
## Warning in ks.test(residuals(d.lm), "pnorm"): ties should not be present
## for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: residuals(d.lm)
## D = 0.2709, p-value = 0.001506
## alternative hypothesis: two-sided
source("/Users/JieHuang1/Dropbox/Statistics_study/STOR_664/R/R_codes/normtest.R")
norm.test(y,x)
## L-G K-S C-V A-D
## 0.050 0.205 0.243 0.157
#ANOVA
anova(d.lm)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x 1 7.0000 7.0000 35.027 3.577e-07 ***
## Residuals 47 9.3927 0.1998
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Overall, the linear model is significant.