\(f(x) = \beta_0 + \beta_1x + \epsilon\)
회귀 계수의 해석은 나머지 변수들이 고정 되어 있을 때, 해당 변수가 1단위 변할때 y에 미치는 영향이라 할 수 있다.
partial residual plot으로 개별 변수별로 그림을 볼 수 있지만, 상호작용의 효과를 볼 수 없고 2개의 변수를 다룰 수 없다.
2개의 변수를 \(x, y\)축에 놓고 \(z\)축을 예측값으로 하는 surface plot을 그리면 변수들의 효과를 시각화 할 수 있다.
x, y축 이외의 변수는 연속형의 경우 median, 범주형의 경우 최빈값으로 고정한다.
surface <- function(object, ...) UseMethod('surface')
surface.lm <- function(object, view, data, n.grid = 30) {
# 1. others, central
# quadratic
varnames <- names(object$model)[-1]
varnames <- varnames[!varnames %in% '(weights)']
if (any(grepl('poly\\(', varnames))) {
varnames_poly <- varnames[grepl('poly\\(', varnames)]
varnames_poly <- stringr::str_extract(varnames_poly, '\\([a-zA-Z._]{1,}\\,')
varnames_poly <- gsub('[(,]', '', varnames_poly)
varnames_poly <- unique(varnames_poly)
varnames <- c(varnames[!grepl('poly\\(', varnames)], varnames_poly)
}
# interaction
if (any(grepl(':', varnames))) {
varnames <- varnames[!grepl(':', varnames)]
}
others <- varnames[!varnames %in% view]
# 2. design
dat_others <- data[, others]
design_others <- as.data.frame(lapply(dat_others, centeral))
dat <- object$model[, view]
v1 <- var_grid(dat[, view[1]], n.grid)
v2 <- var_grid(dat[, view[2]], n.grid)
design <- expand.grid(v1,v2)
names(design) <- view
if (nrow(design_others) >= 1) {
design <- cbind(design, design_others)
}
design$pred <- predict(object = object, newdata = design)
z <- t(matrix(design$pred, ncol = n.grid, nrow = n.grid))
# 3. plot
return(plotly_surface(v1, v2, z, view))
}
surface.glm <- function(object, view, data, type = c('response', 'link'), n.grid = 30) {
# arguments
type <- match.arg(type)
# 1. others, central
# quadratic
varnames <- names(object$model)[-1]
varnames <- varnames[!varnames %in% '(weights)']
if (any(grepl('poly\\(', varnames))) {
varnames_poly <- varnames[grepl('poly\\(', varnames)]
varnames_poly <- stringr::str_extract(varnames_poly, '\\([a-zA-Z._]{1,}\\,')
varnames_poly <- gsub('[(,]', '', varnames_poly)
varnames_poly <- unique(varnames_poly)
varnames <- c(varnames[!grepl('poly\\(', varnames)], varnames_poly)
}
# interaction
if (any(grepl(':', varnames))) {
varnames <- varnames[!grepl(':', varnames)]
}
others <- varnames[!varnames %in% view]
# 2. design
dat_others <- data[, others]
design_others <- as.data.frame(lapply(dat_others, centeral))
dat <- object$model[, view]
v1 <- var_grid(dat[, view[1]], n.grid)
v2 <- var_grid(dat[, view[2]], n.grid)
design <- expand.grid(v1,v2)
names(design) <- view
if (nrow(design_others) >= 1) {
design <- cbind(design, design_others)
}
design$pred <- predict(object = object, newdata = design, type = type)
z <- t(matrix(design$pred, ncol = n.grid, nrow = n.grid))
# 3. plot
return(plotly_surface(v1, v2, z, view, ztitle = type))
}
surface.gam <- function(object, view, type = c('link', 'response'), n.grid = 30) {
# arguments
varnames <- names(object$var.summary)
if (missing(view)) {
view <- varnames[1:2]
}
others <- varnames[!varnames %in% view]
viewclass <- sapply(object$var.summary[view], class)
type <- match.arg(type)
# others, central tendency
# qualitative : mode, quantitative : median
central <- function(x) {
if (length(x) >= 2) {
x[2]
} else {
x[1]
}
}
design_others <- as.data.frame(lapply(object$var.summary[others], central))
# view, making grid matrix
# qualitative : all, quantitative : expand min to max as many as n.grid
expand <- function(x) {
if (class(x) %in% c('integer', 'numeric')) {
seq(x[1], x[3], length.out = n.grid) # min, max
} else { # (class(x) %in% c('factor', 'character'))
factor(levels(x), levels(x))
}
}
grid <- lapply(object$var.summary[view], expand)
v1 <- grid[[view[1]]]
v2 <- grid[[view[2]]]
if (is.factor(v1)) {
v1 <- fac.seq(v1, n.grid)
}
if (is.factor(v2)) {
v2 <- fac.seq(v2, n.grid)
}
design <- expand.grid(v1, v2) # data.frame with n.grid * n.grid rows
names(design) <- view
# design <- do.call(expand.grid, grid)
if (nrow(design_others) >= 1) {
design <- cbind(design, design_others)
}
design$lp <- predict(object = object, newdata = design, type = type) # linear predictor
z <- t(matrix(design$lp, ncol = n.grid, nrow = n.grid))
return(plotly_surface(v1, v2, z, view, ztitle = type))
}
centeral <- function(x) {
x_class <- class(x)
if (any(x_class %in% c('integer', 'numeric'))) {
cent <- median(x, na.rm = T)
} else if (any(x_class %in% c('factor'))) {
cent <- factor(get_mode(x), levels = levels(x))
}
return(cent)
}
get_mode <- function(x, order = 1, type = c('value', 'count')) {
type <- match.arg(type)
tab <- table(x)
tab <- tab[order(-tab)]
if (type == 'value') {
result <- names(tab)[order]
} else if (type == 'count') {
result <- tab[order]
result <- unname(result)
}
return(result)
}
# generates a sequence of factor variables of length n.grid
fac.seq <- function(fac, n.grid) {
fn <- length(levels(fac)); gn <- n.grid
if (fn > gn) {
mf <- factor(levels(fac))[1:gn]
} else {
ln <- floor(gn/fn) # length of runs
mf <- rep(levels(fac)[fn], gn)
mf[1:(ln*fn)] <- rep(levels(fac), rep(ln, fn))
mf <- factor(mf, levels = levels(fac))
}
mf
}
var_grid <- function(x, n.grid) {
if (any(class(x) %in% c('integer', 'numeric'))) {
x <- seq(min(x), max(x), length.out = n.grid)
} else if (any(class(x) %in% c('character', 'factor'))) {
cent <- factor(get_mode(x), levels = levels(x))
x <- fac.seq(cent, n.grid)
}
return(x)
}
plotly_surface <- function(v1, v2, z, view, ztitle = 'response') {
plotly_obj <- plotly::plot_ly(x = v1, y = v2)
plotly_obj <- plotly::add_surface(p = plotly_obj, z = z)
plotly_obj <- plotly::layout(
p = plotly_obj,
scene = list(
xaxis = list(title = view[1]),
yaxis = list(title = view[2]),
zaxis = list(title = ztitle)
)
)
return(plotly_obj)
}
자전거 대여 횟수와 날씨 변수에 대한 데이터.
# registered weekend
y <- 'registered' # 'casual', 'registered'
dat_sub <- dat_tr[weekend == 'yes', ] # 'yes', 'no'
dat_sub[, sd := sd(count), by = 'hour']
head(dat_sub)
## date year month hour datetime season holiday weekdays
## 1: 2011-01-01 2011 1 0 2011-01-01 00:00:00 spring no saturday
## 2: 2011-01-01 2011 1 1 2011-01-01 01:00:00 spring no saturday
## 3: 2011-01-01 2011 1 2 2011-01-01 02:00:00 spring no saturday
## 4: 2011-01-01 2011 1 3 2011-01-01 03:00:00 spring no saturday
## 5: 2011-01-01 2011 1 4 2011-01-01 04:00:00 spring no saturday
## 6: 2011-01-01 2011 1 5 2011-01-01 05:00:00 spring no saturday
## weekend workingday weather temp atemp humidity windspeed casual registered
## 1: yes no best 9.84 14.395 81 0.0000 3 13
## 2: yes no best 9.02 13.635 80 0.0000 8 32
## 3: yes no best 9.02 13.635 80 0.0000 5 27
## 4: yes no best 9.84 14.395 75 0.0000 3 10
## 5: yes no best 9.84 14.395 75 0.0000 0 1
## 6: yes no better 9.84 12.880 75 6.0032 0 1
## count weather2 season2 sd
## 1: 16 not_worse spring 46.645703
## 2: 40 not_worse spring 33.038460
## 3: 32 not_worse spring 23.800923
## 4: 13 not_worse spring 13.017951
## 5: 1 not_worse spring 5.195167
## 6: 1 not_worse spring 7.075469
# 1. gam
model <- as.formula(paste(y, ' ~ 1'))
model <- update(
model, . ~ s(hour, k=14) +
s(temp, humidity, k=10) +
s(temp, windspeed, k=12) +
season2 + weather2
)
gam_fit <- gam(
formula = model,
family = quasipoisson(),
weights = 1/sd,
data = dat_sub
)
# 2. glm
model <- as.formula(paste(y, ' ~ 1'))
hour_fit <- gam(
formula = update(model, . ~ s(hour, k=14)),
family = quasipoisson(),
weights = 1/sd,
data = dat_sub
)
dat_sub$y_dh <- dat_sub[[y]] - hour_fit$fitted.values
model <- as.formula(paste('y_dh', ' ~ 1'))
model <- update(
model, . ~ poly(temp, 3, raw = T) + humidity + temp:humidity + season2 + weather2
)
glm_fit <- glm(model, data = dat_sub, weights = 1/sd)
# 3. lm
model <- as.formula(paste('y_dh', ' ~ 1'))
model <- update(
model, . ~ poly(temp, 3, raw = T) + humidity + temp:humidity + season2 + weather2
)
lm_fit <- lm(model, data = dat_sub, weights = 1/sd)
surface(gam_fit, c('temp', 'humidity'), 'response')
surface(glm_fit, c('temp', 'humidity'), data.frame(dat_sub))
surface(lm_fit, c('temp', 'humidity'), data.frame(dat_sub))