Data visualization in ggplot2,R

1. Stats with geoms
2. Coordinates
3. Facets
4. Best practices

This report is a summary of the lesson by Rick Scavetta, DataCamp

1. Stats with geoms

The geom_/stat_ connection

stat_	geom_
stat_bin()	geom_histogram(), geom_freqpoly()
stat_count()	geom_bar()
stat_smooth()	geom_smooth()
stat_boxplot()	geom_boxplot()
stat_bindot()	geom_dotplot()
stat_bin2d()	geom_bin2d()
stat_binhex()	geom_hex()
stat_contour()	geom_contour()
stat_quantile()	geom_quantile()
stat_sum()	geom_count()

기본적으로 stat과 geom 함수의 기능은 비슷하지만, 특별한 변환이 필요한 경우에는 stat을 활용한다.

# Scatter plot of iris sepal length vs. width, coloured by species, with a
# smoother added per species. No method is passed to geom_smooth(), and the
# standard-error ribbon is switched off.
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_point() +
  geom_smooth(se = FALSE) +
  labs(title = "LOESS")

geom_smooth 내 method 기본값은 LOESS이다. LOESS는 non parametric smoothing algorithm 으로 관측치가 1000개 보다 적을 경우 사용된다. x 축을 기준으로 일정한 window를 설정하고 해당 window들의 가중평균을 계산한다. window의 size는 span 변수로 설정한다. span이 작을수록 noise가 더 심해진다.

# Same iris scatter plot, smoother drawn with span = 0.4.
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_point() +
  geom_smooth(span = 0.4, se = FALSE) +
  labs(title = "geom_smooth(span = 0.4)")

# Same iris scatter plot, smoother drawn with span = 1.
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_point() +
  geom_smooth(span = 1, se = FALSE) +
  labs(title = "geom_smooth(span = 1)")

method는 또한 lm과 같이 parametric model로도 설정가능하다. glm, rlm, gam도 동일하다. 관측치가 1000개가 넘는 경우 gam이 기본값으로 설정된다.

# Per-species linear-model smoother without the standard-error ribbon.
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "method = lm")

# With fullrange = TRUE the regression line is drawn beyond the range of the
# x-axis data.
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_point() +
  geom_smooth(method = "lm", fullrange = TRUE, se = FALSE) +
  labs(title = "method = lm, fullrange = TRUE")

Grouping variables

# NOTE(review): `fcyl` is not a column of the stock mtcars data set and is not
# created anywhere in the visible file -- presumably factor(cyl) added in a
# setup step; confirm before running.

# One linear fit per colour group plus a single overall fit: aes(group = 1)
# on the second smoother puts every observation into one group.
# `se = FALSE` is spelled out because `F` is an ordinary, reassignable variable.
ggplot(mtcars, aes(wt, mpg, color = fcyl)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  stat_smooth(aes(group = 1), method = "lm", se = FALSE) +
  labs(title = "aes(group = 1)")

# Same idea, but the overall fit is mapped to the constant colour "ALL"
# instead of to a group.
ggplot(mtcars, aes(wt, mpg, color = fcyl)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  stat_smooth(aes(color = "ALL"), method = "lm", se = FALSE) +
  labs(title = "aes(color = ALL)")

aes(group = 1)을 통해 하나의 그룹으로 묶어 출력 가능, 숫자 1은 의미 없음.
aes(color = "ALL")은 group과 비슷하지만 범례를 출력한다.

Modifying stat_smooth

# Jittered, semi-transparent points with a linear fit per species; the ribbon
# fill is mapped to Species as well, so ribbons and lines share their colours.
ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_jitter(alpha = 0.25) +
  stat_smooth(
    mapping = aes(fill = Species),
    method = "lm",
    size = 2
  ) +
  labs(
    title = "if standard error ribbons match the lines, it's easier to read the plot"
  )

Quantile

quantile regression은 보통 데이터에 heteroscedasticity(이분산성)가 존재할 경우 사용한다. 이분산성이 있을 경우 모델의 유효성이 떨어지기 때문이다.

heteroscedasticity는 회귀분석 시 오차의 분산이 일정하지 않은 경우, 즉 독립 변수의 값이 변할 때 종속 변수의 분산이 일정하지 않은 경우를 말한다.

# Load the Journals data set.
data("Journals")

# Base scatter plot of log subscriptions against log price per citation.
# Stored in `p` so later layers can be added to it, then displayed.
p <- ggplot(
  data = Journals,
  mapping = aes(x = log(price / citations), y = log(subs))
) +
  geom_point(alpha = 0.5) +
  labs(
    title = "Dealing with heteroscedasticity",
    x = "Library price relative total number of citations (log)",
    y = "Number of library subscriptions (log)"
  )
p

# Fit a linear model of log subscriptions on log price per citation and plot
# its residuals against the predictor, with a dashed red reference line at 0.
model <- lm(log(subs) ~ log(price / citations), data = Journals)

# Keep the residuals next to their predictor values in a data frame rather
# than in a free-standing `residuals` vector: that name masks
# stats::residuals(), and aes() would have to find the vector outside the
# plot data.
residual_data <- data.frame(
  log_price_per_citation = log(Journals$price / Journals$citations),
  residual = resid(model)
)

ggplot(residual_data, aes(x = log_price_per_citation, y = residual)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Residual Plot",
    x = "Library price relative total number of citations (log)",
    y = "Residuals"
  )

이때 geom_quantile을 사용하면, 데이터가 비대칭이거나 이상치가 많은 경우에도 분위수별로 모델링할 수 있다.

# Add quantile-regression lines for the 5%, 50% and 95% quantiles to the
# base plot `p` and replace its title.
p +
  geom_quantile(
    quantiles = c(0.05, 0.5, 0.95)
  ) +
  labs(
    title = "Using geom_quantile"
  )

stat_summary

stat_summary는 기본적으로 geom_pointrange()를 사용하며 다른 plot들도 사용이 가능하다.

# Per-species mean of Sepal.Length drawn as a point, with error bars computed
# by mean_sdl using mult = 1.
ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) +
  stat_summary(geom = "point", fun = mean) +
  stat_summary(
    geom = "errorbar",
    fun.data = mean_sdl,
    fun.args = list(mult = 1),
    width = 0.1
  ) +
  labs(title = "1 sd using mean_sdl")

# Per-species mean of Sepal.Length drawn as a point, with error bars computed
# by mean_cl_normal. The title typo ("cofidence") is corrected.
ggplot(iris, aes(x = Species, y = Sepal.Length)) +
  stat_summary(fun = mean,
               geom = "point") +
  stat_summary(fun.data = mean_cl_normal,
               geom = "errorbar",
               width = .1) +
  labs(title = "95% confidence interval using mean_cl_normal")

### stat_function

특정함수를 설정하고 이론적 확률분포 생성 가능

# log10 of the mammals body column, kept in a one-column data frame.
mam.new <- data.frame(body = log10(mammals$body))

# Density-scaled histogram with a rug, overlaid with a normal density curve
# whose mean and sd are taken from the data.
# after_stat(density) replaces the deprecated `..density..` notation.
ggplot(mam.new, aes(body)) +
  geom_histogram(aes(y = after_stat(density))) +
  geom_rug() +
  stat_function(
    fun = dnorm,
    color = "red",
    args = list(mean = mean(mam.new$body), sd = sd(mam.new$body))
  ) +
  labs(title = "Normal Dist. line using geom_function")

stat_qq

# QQ plot of the log10 body values.
ggplot(data = mam.new, mapping = aes(sample = body)) +
  stat_qq() +
  # Add the reference line
  stat_qq_line(color = "red") +
  labs(title = "QQ plot")

2. Coordinates

Zooming in

coord_cartesian(xlim = ..., expand = ..., clip = ...)
scale_x_continuous(limits = ...)
xlim(...)

scale_x_continuous, xlim으로 확대할 경우 범위 밖의 데이터가 필터링(제거)되어 회귀선이 변한다. coord_cartesian을 사용하면 데이터는 그대로 두고 화면만 확대하므로 회귀선이 변하지 않는다.

# Scatter plot of mpg against wt with axis expansion disabled and clipping
# turned off.
ggplot(mtcars, aes(wt, mpg)) +
  geom_point(size = 2) +
  # `expand` is a logical switch, so pass FALSE rather than the number 0;
  # clip = "off" turns clipping off.
  coord_cartesian(expand = FALSE, clip = "off") +
  theme_classic() +
  # Remove axis lines
  theme(axis.line = element_blank()) +
  labs(title = "expand = 0, clip = off")

Aspect ratio

Height-to-width ratio(종횡비)로 plot을 그릴 때 데이터 특성을 고려해서 설정해야 한다. 보통 척도가 동일하다면 1:1을 사용하는게 일반적이다.

# Data frame built from sunspot.month: its index as `year`, and the `value`
# column of its melted form as `value`.
sunspots.m <- data.frame(
  year = index(sunspot.month),
  value = reshape2::melt(sunspot.month)[["value"]]
)

# Line plot drawn with coord_fixed() and no ratio argument.
ggplot(data = sunspots.m, mapping = aes(x = year, y = value)) +
  geom_line() +
  coord_fixed() +
  labs(title = "1:1 Aspect ratio")

# Same sunspot line plot with the coord_fixed() ratio set to 0.055.
# The title typo ("ration") is corrected.
ggplot(sunspots.m, aes(year, value)) +
  geom_line() +
  coord_fixed(0.055) +
  labs(title = "Aspect ratio = 0.055")

scales

# Jittered strip of raw body weights on a linear x axis limited to 0-7000,
# with a break every 1000.
ggplot(data = msleep, mapping = aes(x = bodywt, y = 1)) +
  geom_jitter() +
  scale_x_continuous(
    limits = c(0, 7000),
    breaks = seq(from = 0, to = 7000, by = 1000)
  ) +
  labs(title = "raw data - positive skewed")

log변환을 해보자.

# Variant 1: log10() applied to the variable inside aes().
ggplot(data = msleep, mapping = aes(x = log10(bodywt), y = 1)) +
  geom_jitter() +
  scale_x_continuous(
    limits = c(-3, 4),
    breaks = -3:4
  ) +
  # Emphasise that the x scale is log-transformed
  annotation_logticks(sides = "b") +
  labs(title = "1. log10 trans using log10")

# Variant 2: raw variable in aes(), log10 applied by the x scale.
ggplot(data = msleep, mapping = aes(x = bodywt, y = 1)) +
  geom_jitter() +
  scale_x_log10(
    limits = c(1e-03, 1e+04)
  ) +
  labs(title = "2. log10 trans using scale_x_log10")

# Variant 3: raw variable in aes(), log10 applied by the coordinate system.
ggplot(data = msleep, mapping = aes(x = bodywt, y = 1)) +
  geom_jitter() +
  coord_trans(
    x = "log10"
  ) +
  labs(title = "3. log10 trans using coord_trans")

coord_trans()의 경우 x 축 label이 raw data scale임을 알 수 있다. 즉 회귀분석 시 사용되는 데이터 역시 변형되지 않은 데이터를 사용하기에 주의해야한다 !!!

Double and flipped axes

coord_flip()

# Dodged bar chart of `fcyl` filled by `fam`, drawn with flipped axes.
# NOTE(review): `fcyl` and `fam` are not columns of the stock mtcars data set
# and are not created in the visible file -- presumably factor versions of
# cyl and am; confirm.
ggplot(data = mtcars, mapping = aes(x = fcyl, fill = fam)) +
  geom_bar(
    position = position_dodge(width = 0.5)
  ) +
  coord_flip() +
  labs(title = "Using coord_flip()")

Polar coordinates

coord_polar(theta = x or y) : x or y 축을 잡고 구부려서 생성

# Single stacked bar at x = 1, filled by `fcyl`, drawn in polar coordinates
# with y mapped to the angle.
# NOTE(review): `fcyl` is not created in the visible file -- presumably
# factor(cyl); confirm.
ggplot(data = mtcars, mapping = aes(x = 1, fill = fcyl)) +
  geom_bar() +
  coord_polar(
    theta = "y"
  ) +
  labs(title = "Using coord_polar() 1")

3. Facets

facet_grid

facet_grid(rows = vars(...), cols = vars(...)) ## Modern notation
facet_grid(rows ~ cols) ## Formula notation

# Scatter plot of mpg against wt, split into a grid of panels:
# one row per value of am and one column per value of cyl.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  facet_grid(
    rows = vars(am),
    cols = vars(cyl)
  )

labels and order

facet_grid(labeller = label_value / label_both / label_context)
fct_recode ## relabeling
fct_relevel ## change the order of levels

4. Best practices

# Heat map: one tile per year/variety filled by yield, one panel per site
# stacked in a single column, using the 9-step "Reds" palette.
ggplot(data = barley, mapping = aes(x = year, y = variety, fill = yield)) +
  geom_tile() +
  facet_wrap(facets = vars(site), ncol = 1) +
  scale_fill_gradientn(
    colors = brewer.pal(9, "Reds")
  ) +
  labs(title = "A heat map")

# Dot plot: yield per variety coloured by year, one panel per site stacked
# in a single column.
ggplot(data = barley, mapping = aes(x = yield, y = variety, color = year)) +
  geom_point() +
  facet_wrap(facets = vars(site), ncol = 1) +
  labs(title = "A dot plot")

# Time series: one line per variety across years, one panel per site laid
# out in a single row.
ggplot(
  data = barley,
  mapping = aes(x = year, y = yield, color = variety, group = variety)
) +
  geom_line() +
  facet_wrap(facets = vars(site), nrow = 1) +
  labs(title = "As a time series")

# Per-site mean yield across years drawn as a line, with error bars computed
# by mean_sdl; both layers use the "dodge" position.
ggplot(
  data = barley,
  mapping = aes(x = year, y = yield, color = site, group = site)
) +
  stat_summary(geom = "line", fun = mean, position = "dodge") +
  stat_summary(geom = "errorbar", fun.data = mean_sdl, position = "dodge") +
  labs(title = "Error bar")