회귀식의 구조

\(f(x) = \beta_0 + \beta_1x + \epsilon\)

회귀 계수의 해석은 나머지 변수들이 고정 되어 있을 때, 해당 변수가 1단위 변할때 y에 미치는 영향이라 할 수 있다.

partial residual plot으로 개별 변수별로 그림을 볼 수 있지만, 상호작용의 효과를 볼 수 없고 2개의 변수를 다룰 수 없다.

2개의 변수를 \(x, y\)축에 놓고 \(z\)축을 예측값으로 하는 surface plot을 그리면 변수들의 효과를 시각화 할 수 있다.

x, y축 이외의 변수는 연속형의 경우 median, 범주형의 경우 최빈값으로 고정한다.


Generic functions

surface <- function(object, ...) UseMethod('surface')

surface.lm <- function(object, view, data, n.grid = 30) {
  # 1. others, central
  # quadratic
  varnames <- names(object$model)[-1]
  varnames <- varnames[!varnames %in% '(weights)']
  if (any(grepl('poly\\(', varnames))) {
    varnames_poly <- varnames[grepl('poly\\(', varnames)]
    varnames_poly <- stringr::str_extract(varnames_poly, '\\([a-zA-Z._]{1,}\\,')
    varnames_poly <- gsub('[(,]', '', varnames_poly)
    varnames_poly <- unique(varnames_poly)
    
    varnames <- c(varnames[!grepl('poly\\(', varnames)], varnames_poly)
  }
  
  # interaction
  if (any(grepl(':', varnames))) {
    varnames <- varnames[!grepl(':', varnames)]
  }
  others <- varnames[!varnames %in% view]
  
  # 2. design
  dat_others <- data[, others]
  design_others <- as.data.frame(lapply(dat_others, centeral))
  
  dat <- object$model[, view]
  v1 <- var_grid(dat[, view[1]], n.grid)
  v2 <- var_grid(dat[, view[2]], n.grid)
  
  design <- expand.grid(v1,v2)
  names(design) <- view
  
  if (nrow(design_others) >= 1) {
    design <- cbind(design, design_others)  
  }
  design$pred <- predict(object = object, newdata = design)
  z <- t(matrix(design$pred, ncol = n.grid, nrow = n.grid))
  
  # 3. plot
  return(plotly_surface(v1, v2, z, view))
}

surface.glm <- function(object, view, data, type = c('response', 'link'), n.grid = 30) {
  # arguments
  type <- match.arg(type)
  
  # 1. others, central
  # quadratic
  varnames <- names(object$model)[-1]
  varnames <- varnames[!varnames %in% '(weights)']
  if (any(grepl('poly\\(', varnames))) {
    varnames_poly <- varnames[grepl('poly\\(', varnames)]
    varnames_poly <- stringr::str_extract(varnames_poly, '\\([a-zA-Z._]{1,}\\,')
    varnames_poly <- gsub('[(,]', '', varnames_poly)
    varnames_poly <- unique(varnames_poly)
    
    varnames <- c(varnames[!grepl('poly\\(', varnames)], varnames_poly)
  }
  
  # interaction
  if (any(grepl(':', varnames))) {
    varnames <- varnames[!grepl(':', varnames)]
  }
  others <- varnames[!varnames %in% view]
  
  # 2. design
  dat_others <- data[, others]
  design_others <- as.data.frame(lapply(dat_others, centeral))
  
  dat <- object$model[, view]
  v1 <- var_grid(dat[, view[1]], n.grid)
  v2 <- var_grid(dat[, view[2]], n.grid)
  
  design <- expand.grid(v1,v2)
  names(design) <- view
  
  if (nrow(design_others) >= 1) {
    design <- cbind(design, design_others)  
  }
  design$pred <- predict(object = object, newdata = design, type = type)
  z <- t(matrix(design$pred, ncol = n.grid, nrow = n.grid))
  
  # 3. plot
  return(plotly_surface(v1, v2, z, view, ztitle = type))
}

surface.gam <- function(object, view, type = c('link', 'response'), n.grid = 30) {
  # arguments
  varnames <- names(object$var.summary)
  if (missing(view)) {
    view <- varnames[1:2]
  }
  others <- varnames[!varnames %in% view]
  viewclass <- sapply(object$var.summary[view], class)
  type <- match.arg(type)
  
  # others, central tendency
  # qualitative : mode, quantitative : median
  central <- function(x) {
    if (length(x) >= 2) {
      x[2]
    } else {
      x[1] 
    }
  }
  design_others <- as.data.frame(lapply(object$var.summary[others], central))
  
  # view, making grid matrix
  # qualitative : all, quantitative : expand min to max as many as n.grid
  expand <- function(x) {
    if (class(x) %in% c('integer', 'numeric')) {
      seq(x[1], x[3], length.out = n.grid) # min, max
    } else { # (class(x) %in% c('factor', 'character'))
      factor(levels(x), levels(x))
    }
  }
  grid <- lapply(object$var.summary[view], expand)
  v1 <- grid[[view[1]]]
  v2 <- grid[[view[2]]]
  
  if (is.factor(v1)) {
    v1 <- fac.seq(v1, n.grid)  
  }
  if (is.factor(v2)) {
    v2 <- fac.seq(v2, n.grid)  
  }
  design <- expand.grid(v1, v2) # data.frame with n.grid * n.grid rows
  names(design) <- view
  # design <- do.call(expand.grid, grid)
  if (nrow(design_others) >= 1) {
    design <- cbind(design, design_others)  
  }
  design$lp <- predict(object = object, newdata = design, type = type) # linear predictor
  z <- t(matrix(design$lp, ncol = n.grid, nrow = n.grid))
  
  return(plotly_surface(v1, v2, z, view, ztitle = type))
}




Etc functions

centeral <- function(x) {
  x_class <- class(x)
  
  if (any(x_class %in% c('integer', 'numeric'))) {
    cent <- median(x, na.rm = T)
  } else if (any(x_class %in% c('factor'))) {
    cent <- factor(get_mode(x), levels = levels(x))
  }
  
  return(cent)
}

get_mode <- function(x, order = 1, type = c('value', 'count')) {
  type <- match.arg(type)
  
  tab <- table(x)
  tab <- tab[order(-tab)]
  
  if (type == 'value') {
    result <- names(tab)[order]
  } else if (type == 'count') {
    result <- tab[order]
    result <- unname(result)
  }
  return(result)
}

# generates a sequence of factor variables of length n.grid
fac.seq <- function(fac, n.grid) { 
  fn <- length(levels(fac)); gn <- n.grid
  if (fn > gn) {
    mf <- factor(levels(fac))[1:gn]
  } else { 
    ln <- floor(gn/fn) # length of runs               
    mf <- rep(levels(fac)[fn], gn)
    mf[1:(ln*fn)] <- rep(levels(fac), rep(ln, fn))
    mf <- factor(mf, levels = levels(fac))
  }
  mf
}

var_grid <- function(x, n.grid) {
  if (any(class(x) %in% c('integer', 'numeric'))) {
    x <- seq(min(x), max(x), length.out = n.grid)  
  } else if (any(class(x) %in% c('character', 'factor'))) {
    cent <- factor(get_mode(x), levels = levels(x))
    x <- fac.seq(cent, n.grid)
  }
  return(x)
}

plotly_surface <- function(v1, v2, z, view, ztitle = 'response') {
  plotly_obj <- plotly::plot_ly(x = v1, y = v2)
  plotly_obj <- plotly::add_surface(p = plotly_obj, z = z)
  plotly_obj <- plotly::layout(
    p = plotly_obj,
    scene = list(
      xaxis = list(title = view[1]), 
      yaxis = list(title = view[2]),
      zaxis = list(title = ztitle)
    )
  )
  return(plotly_obj)
}




Data, Models

자전거 대여 횟수와 날씨 변수에 대한 데이터.

# registered weekend 
y <- 'registered' # 'casual', 'registered'
dat_sub <- dat_tr[weekend == 'yes', ] # 'yes', 'no'
dat_sub[, sd := sd(count), by = 'hour']
head(dat_sub)
##          date year month hour            datetime season holiday weekdays
## 1: 2011-01-01 2011     1    0 2011-01-01 00:00:00 spring      no saturday
## 2: 2011-01-01 2011     1    1 2011-01-01 01:00:00 spring      no saturday
## 3: 2011-01-01 2011     1    2 2011-01-01 02:00:00 spring      no saturday
## 4: 2011-01-01 2011     1    3 2011-01-01 03:00:00 spring      no saturday
## 5: 2011-01-01 2011     1    4 2011-01-01 04:00:00 spring      no saturday
## 6: 2011-01-01 2011     1    5 2011-01-01 05:00:00 spring      no saturday
##    weekend workingday weather temp  atemp humidity windspeed casual registered
## 1:     yes         no    best 9.84 14.395       81    0.0000      3         13
## 2:     yes         no    best 9.02 13.635       80    0.0000      8         32
## 3:     yes         no    best 9.02 13.635       80    0.0000      5         27
## 4:     yes         no    best 9.84 14.395       75    0.0000      3         10
## 5:     yes         no    best 9.84 14.395       75    0.0000      0          1
## 6:     yes         no  better 9.84 12.880       75    6.0032      0          1
##    count  weather2 season2        sd
## 1:    16 not_worse  spring 46.645703
## 2:    40 not_worse  spring 33.038460
## 3:    32 not_worse  spring 23.800923
## 4:    13 not_worse  spring 13.017951
## 5:     1 not_worse  spring  5.195167
## 6:     1 not_worse  spring  7.075469
# 1. gam
model <- as.formula(paste(y, ' ~ 1'))
model <- update(
  model, . ~ s(hour, k=14) + 
    s(temp, humidity, k=10) +
    s(temp, windspeed, k=12) + 
    season2 + weather2
)

gam_fit <- gam(
  formula = model,
  family = quasipoisson(),
  weights = 1/sd,
  data = dat_sub
)


# 2. glm
model <- as.formula(paste(y, ' ~ 1'))
hour_fit <- gam(
  formula = update(model, . ~ s(hour, k=14)),
  family = quasipoisson(),
  weights = 1/sd,
  data = dat_sub
)
dat_sub$y_dh <- dat_sub[[y]] - hour_fit$fitted.values
model <- as.formula(paste('y_dh', ' ~ 1'))
model <- update(
  model, . ~ poly(temp, 3, raw = T) + humidity + temp:humidity + season2 + weather2
)

glm_fit <- glm(model, data = dat_sub, weights = 1/sd)


# 3. lm
model <- as.formula(paste('y_dh', ' ~ 1'))
model <- update(
  model, . ~ poly(temp, 3, raw = T) + humidity + temp:humidity + season2 + weather2
)

lm_fit <- lm(model, data = dat_sub, weights = 1/sd)




함수 적용

surface(gam_fit, c('temp', 'humidity'), 'response')




surface(glm_fit, c('temp', 'humidity'), data.frame(dat_sub))




surface(lm_fit, c('temp', 'humidity'), data.frame(dat_sub))