이번 학기 '탐색적 자료분석 및 상담' 수업에서 나온 기말 과제 수행 결과를 올린 것이다. 과제 내용은 Multiple Linear Regression 결과와 여러가지 Plot들을 그리는 것이다.
출력할 결과물로는
* ANOVA table
* PRAMETER ESTIMATES
* scatterplots between all variables using pairs() function
* scatterplot of y versus yhat
* residual plot with boxplot
* absolute studentized residual plot
1.에서는 함수를 작성한 코드에 대한 내용이고 2.에서는 그 결과를 임의로 만든 데이터로 시뮬레이션 한 결과와 R 내부에 있는 다른 데이터셋을 가지고 결과를 살펴본 것이다.
# ==================================================# Multivariate
# Regression User-Defiend Function ###
# ==================================================#
MyReg <- function(X) {
Y <- as.matrix(X[, 1]) # response variable in the first column
x <- cbind(1, as.matrix(X[, -1])) # the explanatory variables following
n <- nrow(x) # Number of Obervations
p <- ncol(x) - 1 # Number of Explanatory variables
beta <- (solve((t(x)) %*% x)) %*% (t(x)) %*% Y
pred <- x %*% beta # Compute Fitted Values
residuals <- Y - pred # Compute Residuals
sig2 <- ((t(residuals)) %*% residuals)/(n - p - 1) # Compute Variance.
SST <- sum((Y - mean(Y))^2) # compute SST
df.SST <- n - 1
SSR <- sum((pred - mean(Y))^2) # compute SSR
df.SSR <- p
MSR <- SSR/df.SSR # compute MSR
SSE <- sum((Y - pred)^2) # compute SSE
df.SSE <- n - p - 1
MSE <- SSE/df.SSE # compute MSE
f.stat <- MSR/MSE # compute F-statistic
f.p.value <- pf(f.stat, df.SSR, df.SSE, lower.tail = FALSE) # compute p-value for F-statistic
R2 <- SSR/SST # compute R square
C <- solve((t(x)) %*% x) # define matrix C
std.error <- sqrt(sig2 * diag(C)) # compute Standard error for paramete estimates
t <- beta/std.error # compute t-statistic
t.p.value <- round(pt(abs(t), df.SSE, lower.tail = FALSE) + pt(-abs(t),
df.SSE), 4)
# compute p-value for t-statistic
H <- x %*% (solve((t(x)) %*% x)) %*% (t(x)) # hat matrix
std.resid <- residuals/(sqrt(sig2 * (1 - diag(H)))) # Studentized residuals
# Result List
# ==============================================================================================#
res <- list(beta = as.vector(beta), sig2 = sig2, pred = pred, residuals = residuals,
stdresid = std.resid, SSR = SSR, SSE = SSE, F = f.stat, P.value = f.p.value,
dat = X)
# ANOVA table
# ==============================================================================================#
# asterisks(*) defined function
ast <- function(d) {
if (d >= 0.05 && d < 0.1)
"." else if (d >= 0.01 && d < 0.05)
"*" else if (d >= 0.001 && d < 0.01)
"**" else if (d < 0.001)
"***" else " "
}
#
# -----------------------------------------------------------------------------------------------------------#
cat("\n == ANALYSIS OF VARIANCE ==\n\n", encodeString(c(" Source", "df",
"SS", "MS", "F", "P-value"), width = 8, justify = "r"), "\n", "-------------------------------------------------------\n",
encodeString(c("Regression", df.SSR, round(SSR, 2), round(MSR, 2), round(f.stat,
2), round(f.p.value, 2)), width = 8, justify = "r"), ast(f.p.value),
"\n", encodeString(c(" Error", df.SSE, round(SSE, 2), round(MSE,
2)), width = 8, justify = "r"), "\n", encodeString(c(" Total",
df.SST, round(SST, 2)), width = 8, justify = "r"), "\n", "-------------------------------------------------------\n",
paste("Estimated error variance :", round(sig2, 4)), "\n", paste("R-squares :",
round(R2, 4)), "\n\n\n")
# ============================================== ANOVA End
# ==================================================#
# Prameter estimates
# =======================================================================================#
# all variables name - 1st column
pe.name <- colnames(X)
pe.name[1] <- "(Intercept)"
#
# -----------------------------------------------------------------------------------------------------------#
# variable name + beta + std.error + t value + Pr(>|t|) + asterisks(*)
pe <- matrix(c(pe.name, round(beta, 4), round(std.error, 4), round(t, 4),
round(t.p.value, 4)), nr = length(pe.name))
cat(" == PARAMETER ESTIMATES ==\n\n", encodeString(c(" ", "Estimate",
"Std.Error", "t value", "Pr(>|t|)"), width = 11, justify = "r"), "\n")
for (i in 1:length(pe.name)) {
cat(encodeString(pe[i, ], width = 11, justify = "r"), ast(pe[i, 5]),
"\n")
}
# =========================================== Parameter End
# ================================================#
# Plot
# ====================================================================================================#
# scatterplots between all variables
pairs(X, col = ifelse(((std.resid > 2) | (std.resid < -2)), "red", "black"),
lwd = ifelse(((std.resid > 2) | (std.resid < -2)), 1.2, 1.9), cex = ifelse(((std.resid >
2) | (std.resid < -2)), 1.4, 0.9))
#
# ----------------------------------------------------------------------------------------------------------#
# scatterplot of y versus y.hat
symbols(pred, Y, circles = abs(std.resid), fg = "white", inches = 0.5, bg = ifelse(((std.resid >
2) | (std.resid < -2)), "red", "black"), xlab = "Fitted response values",
ylab = "Response variable")
if (any((std.resid > 2) | (std.resid < -2))) {
text(pred[((std.resid > 2) | (std.resid < -2))], Y[((std.resid > 2) |
(std.resid < -2))], labels = seq(n)[((std.resid > 2) | (std.resid <
-2))], cex = 0.7, col = "royalblue4")
}
abline(0, 1, col = "red")
#
# ----------------------------------------------------------------------------------------------------------#
# residual plot with boxplot
par(fig = c(0, 0.7, 0, 1), oma = c(1, 1, 1, 0)) # split screen
plot(pred, std.resid, xlab = "Fitted response values", ylab = "Studentized residuals",
col = ifelse(((std.resid > 2) | (std.resid < -2)), "red", "black"),
pch = ifelse(((std.resid > 2) | (std.resid < -2)), 16, 1), cex = ifelse(((std.resid >
2) | (std.resid < -2)), 1.3, 0.5))
if (any((std.resid > 2) | (std.resid < -2))) {
text(pred[((std.resid > 2) | (std.resid < -2))], std.resid[((std.resid >
2) | (std.resid < -2))], labels = seq(n)[((std.resid > 2) | (std.resid <
-2))], pos = 3, cex = 0.7, col = "royalblue4")
}
abline(h = c(-2, 0, 2), lty = c("dashed", "solid"), col = "gray")
rug(std.resid, side = 4, col = "gray")
par(fig = c(0.7, 1, 0, 1), oma = c(1, 0, 1, 0), new = T) # split screen
box <- boxplot(std.resid, plot = FALSE)
boxplot(std.resid, border = "royalblue3", col = "skyblue1", axes = F)
for (i in 1:length(box$stats)) {
text(box$stats[i], labels = round(box$stats[i], 2), pos = 2, offset = 1,
col = "red", cex = 0.7, adj = 1)
}
#
# ----------------------------------------------------------------------------------------------------------#
# absolute studentized residual plot
psr <- cbind(pred[order(pred)], abs(std.resid)[order(pred)])
# KDE reference curve
weight <- matrix(rep(0, n * n), nc = n)
kde <- rep(0, n)
for (i in 1:n) {
weight[i, ] <- (1/sqrt(2 * pi)) * exp(-((psr[, 1] - psr[i, 1])^2)/2)
kde[i] <- sum(weight[i, ] * psr[, 2])/sum(weight[i, ])
}
# MA(10) reference curve
ma <- rep(0, n)
for (i in 1:n) {
pos <- order(abs(psr[i, 1] - psr[, 1]))[1:10]
ma[i] <- mean((psr[, 2])[pos])
pos <- NULL
}
par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0))
plot(pred, abs(std.resid), cex = 1.5, lwd = 1.5, pch = ifelse(std.resid <
0, 1, 2), xlab = "Fitted rersponse values", ylab = "Absolute studentized residuals")
lines(psr[, 1], kde, lty = "24", lwd = 2, col = "red") # KDE
text(psr[1, 1], kde[1] + 0.1, "KDE", col = "red", adj = 0)
lines(psr[, 1], ma, lty = "dashed", lwd = 2, col = "royalblue3") # MA(10)
text(psr[1, 1], ma[1] - 0.1, "MA(10)", col = "royalblue3", adj = 0)
legend("topright", legend = c("negative", "positive"), pch = c(1, 2))
# ======================================== Plot End
# ========================================================#
return(res) # 결과값 리스트 형식으로 리턴
}
N <- 100
set.seed(1234)
tX1 <- rnorm(N, 0, 1)
tX2 <- rnorm(N, 3, 1)
tX3 <- rgamma(N, 1, 3)
tX4 <- sample(c(0, 1), N, replace = T)
tX5 <- sample(1:3, N, replace = T, prob = c(1, 2, 3)/6)
tX6 <- rbinom(N, 4, 0.3)
tX <- cbind(rep(1, length(tX1)), tX1, tX2, tX3, tX4, tX5, tX6)
tbeta <- (0:6)/5
ty <- as.vector(tX %*% tbeta) + rnorm(N, 0, 2)
dat <- data.frame(y = ty, age = tX1, height = tX2, weight = tX3, smoking = tX4,
therapy = tX5, surgery = tX6)
res <- MyReg(dat)
##
## == ANALYSIS OF VARIANCE ==
##
## Source df SS MS F P-value
## -------------------------------------------------------
## Regression 6 195.3 32.55 8.72 0 ***
## Error 93 347.11 3.73
## Total 99 542.41
## -------------------------------------------------------
## Estimated error variance : 3.7323
## R-squares : 0.3601
##
##
## == PARAMETER ESTIMATES ==
##
## Estimate Std.Error t value Pr(>|t|)
## (Intercept) -0.7866 1.182 -0.6655 0.5074
## age 0.0198 0.1951 0.1012 0.9196
## height 0.5986 0.1953 3.0653 0.0028 **
## weight -0.0803 0.6073 -0.1322 0.8951
## smoking 0.2927 0.4105 0.713 0.4776
## therapy 1.2801 0.2794 4.5809 0 ***
## surgery 1.0944 0.2018 5.4219 0 ***
str(res) # data structure
## List of 10
## $ beta : num [1:7] -0.7866 0.0198 0.5986 -0.0803 0.2927 ...
## $ sig2 : num [1, 1] 3.73
## $ pred : num [1:100, 1] 7.35 6.73 4.71 4.58 4.16 ...
## $ residuals: num [1:100, 1] 0.273 4.41 1.076 1.635 3.603 ...
## $ stdresid : num [1:100, 1] 0.146 2.321 0.572 0.88 1.909 ...
## $ SSR : num 195
## $ SSE : num 347
## $ F : num 8.72
## $ P.value : num 1.56e-07
## $ dat :'data.frame': 100 obs. of 7 variables:
## ..$ y : num [1:100] 7.62 11.14 5.78 6.21 7.76 ...
## ..$ age : num [1:100] -1.207 0.277 1.084 -2.346 0.429 ...
## ..$ height : num [1:100] 3.41 2.53 3.07 2.5 2.17 ...
## ..$ weight : num [1:100] 0.301 0.371 0.213 0.373 0.248 ...
## ..$ smoking: num [1:100] 1 0 0 1 0 0 0 1 0 1 ...
## ..$ therapy: int [1:100] 2 3 2 2 2 3 2 2 3 1 ...
## ..$ surgery: num [1:100] 3 2 1 1 1 1 1 1 3 2 ...
res <- MyReg(faithful) # test another data frame.
##
## == ANALYSIS OF VARIANCE ==
##
## Source df SS MS F P-value
## -------------------------------------------------------
## Regression 1 286.48 286.48 1162.06 0 ***
## Error 270 66.56 0.25
## Total 271 353.04
## -------------------------------------------------------
## Estimated error variance : 0.2465
## R-squares : 0.8115
##
##
## == PARAMETER ESTIMATES ==
##
## Estimate Std.Error t value Pr(>|t|)
## (Intercept) -1.874 0.1601 -11.7021 0 ***
## waiting 0.0756 0.0022 34.089 0 ***
str(res)
## List of 10
## $ beta : num [1:2] -1.874 0.0756
## $ sig2 : num [1, 1] 0.247
## $ pred : num [1:272, 1] 4.1 2.21 3.72 2.81 4.55 ...
## $ residuals: num [1:272, 1] -0.5006 -0.4099 -0.3895 -0.5319 -0.0214 ...
## $ stdresid : num [1:272, 1] -1.0107 -0.8294 -0.7859 -1.0741 -0.0432 ...
## $ SSR : num 286
## $ SSE : num 66.6
## $ F : num 1162
## $ P.value : num 8.13e-100
## $ dat :'data.frame': 272 obs. of 2 variables:
## ..$ eruptions: num [1:272] 3.6 1.8 3.33 2.28 4.53 ...
## ..$ waiting : num [1:272] 79 54 74 62 85 55 88 85 51 85 ...
Simple Linear Regression도 문제 없이 결과가 나오는 것을 확인 할 수 있다.
Hankuk University of Foreign Studies. Dept of Statistics. Daewoo Choi Lab. Yong Cha.
한국외국어대학교 통계학과 최대우 교수 연구실 차용
e-mail : yong.stat@gmail.com