About Us

Data Science in Hsinchu (DSHC) is a non-profit study group. We meet weekly to discuss topics in data thinking, statistical modeling, and machine learning, and to share modern statistical tools and data visualization techniques. We welcome any practitioner, from any industry, to share their expertise or experience in data analysis with us.

Agenda

Today I'll give an introduction to

  • Data Visualization
  • ggplot2 in R (command line tool)
  • plotly (online tool)

Data Visualization

  • 清晰有效地傳達與溝通訊息
  • 教學、研究、宣傳
  • 美學、功能兼顧
  • 統計圖形、訊息可視化

ggplot2 in R

  • 當前最多人使用的視覺化R套件
  • 取自 “The Grammar of Graphics” (Leland Wilkinson, 2005)
  • 由Hadley Wickham於2005年發表 (Chief Scientist in RStudio)
  • 設計理念
    • 採用圖層系統
    • 用抽象的概念來控制圖形,避免細節繁瑣
    • 圖形美觀

ggplot2的基本架構

  • 資料 (data) 和映射 (mapping)
  • 幾何對象 (geometric)
  • 座標尺度 (scale)
  • 統計轉換 (statistics)
  • 座標系統 (coordinate)
  • 圖層 (layer)
  • 刻面 (facet)
  • 主題 (theme)

ggplot2的基本架構

基本語法

ggplot(data=..., aes(x=..., y=...)) + geom_xxx(...) +
  stat_xxx(...) + facet_xxx(...) + ...
  • ggplot 描述 data 從哪來
  • aes 描述圖上的元素跟 data 之間的對應關係
  • geom_xxx 描述要畫圖的類型及相關調整的參數
  • 常用的類型諸如:geom_bar, geom_point, geom_line

注意

  • 使用 data.frame 儲存資料 (不可以丟 matrix 物件)
  • 使用 long format (利用reshape2將資料轉換成 1 row = 1 observation)

Various functions

library(ggplot2)
# List every object in the attached ggplot2 package whose name starts with
# "geom_". The argument is spelled `envir` in full: the previous `env =` only
# reached ls() through R's partial argument matching.
ls(pattern = "^geom_", envir = as.environment("package:ggplot2"))
 [1] "geom_abline"     "geom_area"       "geom_bar"       
 [4] "geom_bin2d"      "geom_blank"      "geom_boxplot"   
 [7] "geom_contour"    "geom_crossbar"   "geom_density"   
[10] "geom_density2d"  "geom_dotplot"    "geom_errorbar"  
[13] "geom_errorbarh"  "geom_freqpoly"   "geom_hex"       
[16] "geom_histogram"  "geom_hline"      "geom_jitter"    
[19] "geom_line"       "geom_linerange"  "geom_map"       
[22] "geom_path"       "geom_point"      "geom_pointrange"
[25] "geom_polygon"    "geom_quantile"   "geom_raster"    
[28] "geom_rect"       "geom_ribbon"     "geom_rug"       
[31] "geom_segment"    "geom_smooth"     "geom_step"      
[34] "geom_text"       "geom_tile"       "geom_violin"    
[37] "geom_vline"     

Barchart

Plot by counting (default)

# No y aesthetic is mapped, so geom_bar() counts the rows per clarity level.
# NOTE(review): this default stat is named "count" in ggplot2 >= 2.0.0; only
# older releases called it "bin" -- confirm against the installed version.
ggplot(data = diamonds, mapping = aes(x = clarity)) +
  geom_bar()

Use stat = "identity" argument

# Pre-compute the counts: table() tallies each clarity level and data.frame()
# turns the tally into the columns Var1 (level) and Freq (count).
tab <- data.frame(table(diamonds$clarity))
# stat = "identity" makes the bar heights come straight from the Freq column.
ggplot(data = tab, mapping = aes(x = Var1, y = Freq)) +
  geom_bar(stat = "identity")

Barchart 2

Add fill layer

# Mapping fill to cut colours each bar's segments by cut; the legend is moved
# below the plot.
ggplot(data = diamonds, mapping = aes(x = clarity, fill = cut)) +
  geom_bar() +
  theme(legend.position = "bottom")

Change position layer

# position = "dodge" places the bars for each cut side by side within a
# clarity level.
# The legend is hidden with legend.position = "none"; the string "NULL" used
# previously is not a valid legend position.
ggplot(data = diamonds, mapping = aes(x = clarity, fill = cut)) +
  geom_bar(position = "dodge") +
  theme(legend.position = "none")

Barchart 3

Change coord layer

# coord_flip() swaps the x and y axes, giving horizontal bars.
# The legend is hidden with legend.position = "none"; the string "NULL" used
# previously is not a valid legend position.
ggplot(data = diamonds, mapping = aes(x = clarity, fill = cut)) +
  geom_bar() +
  coord_flip() +
  theme(legend.position = "none")

Set theme layer

# Keep the data and aesthetic mapping in `g` so layers can be added to it.
g <- ggplot(data = diamonds, mapping = aes(x = clarity, fill = cut))
# theme_bw() is applied first; the theme() call after it then hides the
# legend. "none" is the valid value for that -- the string "NULL" used
# previously is not a legend position.
g +
  geom_bar() +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "none")

Histogram

Plot histogram

# Histogram of diamond prices using bins that are 1000 price units wide.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 1000)

Plot density

# Draw the histogram on the density scale instead of raw counts and overlay a
# red density curve on the same axes.
# NOTE(review): `..density..` is the legacy way to refer to a computed
# statistic; newer ggplot2 releases offer after_stat(density) -- confirm the
# installed version before switching.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(mapping = aes(y = ..density..), binwidth = 1000) +
  geom_density(colour = "red")

Line chart 1

Summarise number of movie ratings by year of movie

library(dplyr)
# Keep only the year and the rating, with the rating binned into the unit
# intervals (0,1], (1,2], ..., (9,10]; then count the rows in every
# (year, rating bin) pair and store the result as a plain data.frame.
# NOTE(review): `movies` is not created in this snippet; presumably it is the
# data set shipped with older ggplot2 releases -- confirm it is available.
mry <- movies %>%
  transmute(year, rating = cut(rating, breaks = 0:10)) %>%
  group_by(year, rating) %>%
  summarise(number = n()) %>%
  data.frame()
# Show the first ten rows.
head(mry, n = 10)
   year rating number
1  1893  (6,7]      1
2  1894  (4,5]      6
3  1894  (5,6]      2
4  1894  (6,7]      1
5  1895  (4,5]      1
6  1895  (5,6]      2
7  1896  (2,3]      1
8  1896  (3,4]      1
9  1896  (4,5]      2
10 1896  (5,6]      5

Draw movies count by rating levels

# Base plot: year on x, number of movies on y, one group per rating bin.
p <- ggplot(
  data = mry,
  mapping = aes(x = year, y = number, group = rating)
)
# Add one line per group.
p + geom_line()

Line chart 2

Set aesthetics to fixed value

# colour is passed outside aes(), so it is a constant: every line is blue.
p + geom_line(colour = "blue")

Add aesthetic mappings

# colour is mapped inside aes(), so each rating bin gets its own line colour.
p + geom_line(aes(colour=rating))

Scatterplot 1

library(gridExtra)
# g1 maps colour to cyl as stored in mtcars; g2 wraps it in factor() so the
# cylinder counts are treated as discrete categories.
g1 <- ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, colour = cyl)) +
  geom_point()
g2 <- ggplot(
  data = mtcars,
  mapping = aes(x = wt, y = mpg, colour = factor(cyl))
) +
  geom_point()
# Show both plots next to each other in a two-column layout.
grid.arrange(g1, g2, ncol = 2)

Scatterplot 2

Add stat_xxx layer

# Large, half-transparent red points with a smoothing layer added on top.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point(colour = "red", size = 5, alpha = 0.5) +
  stat_smooth()

Add scale_xxx layer

# Colour the points by gear on a continuous scale running from yellow (low)
# to red (high).
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, colour = gear)) +
  geom_point(size = 5) +
  scale_colour_continuous(low = "yellow", high = "red")

facet_xxx

# One panel per gear value. Inside each panel the points encode qsec as
# colour, cyl as size and am (converted to a factor) as shape; point sizes
# are limited to the range 3-6.
ggplot(data = mtcars, mapping = aes(x = mpg, y = disp)) +
  geom_point(
    mapping = aes(colour = qsec, size = cyl, shape = as.factor(am))
  ) +
  facet_wrap(~gear) +
  scale_size(range = c(3, 6))

plotly (plot.ly)

Getting started: Plotly for R

# Setup for the plotly R client: install devtools, use it to install plotly
# from GitHub, load plotly, then store the account credentials.
# Step 1: install and load devtools (the next step calls install_github()).
install.packages("devtools")
library("devtools")

# Step 2: install plotly from the GitHub repository ropensci/plotly
install_github("ropensci/plotly")

# Step 3: load the plotly package
library(plotly)

# Step 4: store your Plotly user name and API key
# see: https://plot.ly/settings/api
# NOTE(review): the credentials below are hard-coded; presumably a shared demo
# account -- substitute your own and keep real API keys out of shared files.
set_credentials_file("DemoAccount", "lr1c37zw81")

Publish ggplot2 figure to plotly

# Build the iris scatter plot: petal width against sepal length, coloured by
# species.
p <- ggplot(
  data = iris,
  mapping = aes(x = Petal.Width, y = Sepal.Length, colour = Species)
) +
  geom_point()
# Create a plotly client object and hand the ggplot2 figure to it.
py <- plotly()
out <- py$ggplotly(p) # Publish figure with one line!
# URL returned in the response for the published figure.
out$response$url
[1] "https://plot.ly/~JohnsonHsieh/132"

ggplot2 example

# Scatter plot of petal width against sepal length, coloured by species.
ggplot(
  data = iris,
  mapping = aes(x = Petal.Width, y = Sepal.Length, colour = Species)
) +
  geom_point()

ggplot2 on plotly

Appendix

  • Change themes (library(ggthemes))
  • Customize filename (filename="PlotlyThemeExample")
  • Private graphs (world_readable=FALSE)
# Install ggthemes (and its dependencies) from CRAN, then attach it.
install.packages("ggthemes", dependencies = TRUE)
library(ggthemes)
# Iris scatter plot: petal width against sepal length, coloured by species.
p <- ggplot(
  data = iris,
  mapping = aes(x = Petal.Width, y = Sepal.Length, colour = Species)
) +
  geom_point()
# Apply the ggthemes WSJ theme together with its "colors6" colour scale.
p1 <- p +
  theme_wsj() +
  scale_colour_wsj("colors6")
py1 <- plotly()
# kwargs carries the upload options: a custom file name, overwrite mode, and
# world_readable = FALSE for a private graph.
out1 <- py1$ggplotly(
  p1,
  kwargs = list(
    filename = "PlotlyThemeExample",
    fileopt = "overwrite",
    world_readable = FALSE
  )
)
# URL returned in the response for the published figure.
out1$response$url

學習資源