資料視覺化 (Visualization) by ggplot2

Jying-Nan Wang

2015-12-02

Useful Websites

Why ggplot2?

#install.packages("ggplot2")
# You’ll need to make sure you have the most recent version of R to get the most recent version of ggplot

Basic Components

Example 1: Scatter PLots

# ?mtcars
library(ggplot2)
head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
p <- ggplot(mtcars, aes(wt, mpg))
p + geom_point()

# try it
# p + geom_point(size=3,colour=rgb(0,0.4,0.9),alpha=0.5,shape=1)
p <- ggplot(mtcars, aes(x=wt, y=mpg))
p + geom_point(aes(shape = factor(cyl)))

# try it
# p + geom_point(aes(colour = qsec))

更多ggolot2相關的設定

# 宣告使用ggplot, 資料來源是mtcars,資料mapping到圖的方式是用aes()
p <- ggplot(mtcars, aes(wt, mpg))
# 採用geom_point() 畫scatter plot,geom_smooth()畫迴歸線
p <- p + geom_point(shape=2)+ geom_smooth(lwd = 1, se = TRUE, method = "lm",color=rgb(0,0.5,0.5))
# 設定座標相關資訊
p <- p +xlab("車重") + ylab("油耗") + ggtitle("車重和油耗的關係") + expand_limits(x=c(1,6), y = c(8, 40)) 
# 設定中文(mac專用)
p + theme_bw(base_family = "STHeiti") 

Example 2: Histograms

head(movies)
##                      title year length budget rating votes   r1   r2  r3
## 1                        $ 1971    121     NA    6.4   348  4.5  4.5 4.5
## 2        $1000 a Touchdown 1939     71     NA    6.0    20  0.0 14.5 4.5
## 3   $21 a Day Once a Month 1941      7     NA    8.2     5  0.0  0.0 0.0
## 4                  $40,000 1996     70     NA    8.2     6 14.5  0.0 0.0
## 5 $50,000 Climax Show, The 1975     71     NA    3.4    17 24.5  4.5 0.0
## 6                    $pent 2000     91     NA    4.3    45  4.5  4.5 4.5
##     r4   r5   r6   r7   r8   r9  r10 mpaa Action Animation Comedy Drama
## 1  4.5 14.5 24.5 24.5 14.5  4.5  4.5           0         0      1     1
## 2 24.5 14.5 14.5 14.5  4.5  4.5 14.5           0         0      1     0
## 3  0.0  0.0 24.5  0.0 44.5 24.5 24.5           0         1      0     0
## 4  0.0  0.0  0.0  0.0  0.0 34.5 45.5           0         0      1     0
## 5 14.5 14.5  4.5  0.0  0.0  0.0 24.5           0         0      0     0
## 6 14.5 14.5 14.5  4.5  4.5 14.5 14.5           0         0      0     1
##   Documentary Romance Short
## 1           0       0     0
## 2           0       0     0
## 3           0       0     1
## 4           0       0     0
## 5           0       0     0
## 6           0       0     0
p <- ggplot(movies, aes(x=rating))
p + geom_histogram(alpha=0.8)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

p <- ggplot(movies, aes(x=rating))
p <- p + geom_histogram(alpha=0.8, aes(y = ..density..),binwidth=0.5)
p + geom_density()

p <- ggplot(movies, aes(x=rating))
p <- p + geom_histogram(aes(y = ..density..),colour = "darkgreen", fill = "white", binwidth = 0.5)
p + stat_density(size=1.5,color="#7D110C",alpha=0.2,fill="red")

p <- ggplot(movies, aes(x=rating))
p <- p + geom_histogram(aes(y = ..density..),colour = "darkgreen", fill = "white", binwidth = 0.5)
#p <- p + stat_density(size=1.5,color="#7D110C",alpha=0.2,fill="red")
p + facet_grid(Action ~ Comedy)

Example 3: Box Plots

set.seed(1234)
dat <- data.frame(cond = factor(rep(c("A","B"), each=200)), 
                   rating = c(rnorm(200),rnorm(200, mean=.8)))
# View first few rows
head(dat)
##   cond     rating
## 1    A -1.2070657
## 2    A  0.2774292
## 3    A  1.0844412
## 4    A -2.3456977
## 5    A  0.4291247
## 6    A  0.5060559
# A basic box plot
ggplot(dat, aes(x=cond, y=rating)) + geom_boxplot()

# A basic box with the conditions colored
ggplot(dat, aes(x=cond, y=rating, fill=cond)) + geom_boxplot()

# The above adds a redundant legend. With the legend removed:
ggplot(dat, aes(x=cond, y=rating, fill=cond)) + geom_boxplot() +
    guides(fill=FALSE)

# With flipped axes
ggplot(dat, aes(x=cond, y=rating, fill=cond)) + geom_boxplot() + 
    guides(fill=FALSE) + coord_flip()

# Add a diamond at the mean, and make it larger
ggplot(dat, aes(x=cond, y=rating)) + geom_boxplot() +
    stat_summary(fun.y=mean, geom="point", shape=5, size=4)

Example 3-1: Bar graphs with two variables

dat <- data.frame(
  time = factor(c("Lunch","Dinner"), levels=c("Lunch","Dinner")),
  total_bill = c(14.89, 17.23)
)
dat
##     time total_bill
## 1  Lunch      14.89
## 2 Dinner      17.23
# Very basic bar graph
ggplot(data=dat, aes(x=time, y=total_bill)) +
    geom_bar(stat="identity")

# Map the time of day to different fill colors
ggplot(data=dat, aes(x=time, y=total_bill, fill=time)) +
    geom_bar(stat="identity")

## This would have the same result as above
# ggplot(data=dat, aes(x=time, y=total_bill)) +
#    geom_bar(aes(fill=time), stat="identity")


# Add a black outline
ggplot(data=dat, aes(x=time, y=total_bill, fill=time)) +
    geom_bar(colour="black", stat="identity")

# No legend, since the information is redundant
ggplot(data=dat, aes(x=time, y=total_bill, fill=time)) +
    geom_bar(colour="black", stat="identity") +
    guides(fill=FALSE)

# Add title, narrower bars, fill color, and change axis labels
ggplot(data=dat, aes(x=time, y=total_bill, fill=time)) + 
    geom_bar(colour="black", fill="#DD8888", width=.8, stat="identity") + 
    guides(fill=FALSE) +
    xlab("Time of day") + ylab("Total bill") +
    ggtitle("Average bill for 2 people")

Example 3-2: Bar graphs with three variables

dat1 <- data.frame(
    sex = factor(c("Female","Female","Male","Male")),
    time = factor(c("Lunch","Dinner","Lunch","Dinner"), levels=c("Lunch","Dinner")),
    total_bill = c(13.53, 16.81, 16.24, 17.42)
)
dat1
##      sex   time total_bill
## 1 Female  Lunch      13.53
## 2 Female Dinner      16.81
## 3   Male  Lunch      16.24
## 4   Male Dinner      17.42

These are the variable mappings used here:

# Stacked bar graph -- this is probably not what you want
ggplot(data=dat1, aes(x=time, y=total_bill, fill=sex)) +
    geom_bar(stat="identity")

# Bar graph, time on x-axis, color fill grouped by sex -- use position_dodge()
ggplot(data=dat1, aes(x=time, y=total_bill, fill=sex)) +
    geom_bar(stat="identity", position=position_dodge())

ggplot(data=dat1, aes(x=time, y=total_bill, fill=sex)) +
    geom_bar(stat="identity", position=position_dodge(), colour="black")

# Change colors
ggplot(data=dat1, aes(x=time, y=total_bill, fill=sex)) +
    geom_bar(stat="identity", position=position_dodge(), colour="black") +
    scale_fill_manual(values=c("#999999", "#E69F00"))

# Bar graph, time on x-axis, color fill grouped by sex -- use position_dodge()
ggplot(data=dat1, aes(x=sex, y=total_bill, fill=time)) +
    geom_bar(stat="identity", position=position_dodge(), colour="black")

Example 4-1: Line graphs with two variables

# Basic line graph
ggplot(data=dat, aes(x=time, y=total_bill, group=1)) +
    geom_line()

## This would have the same result as above
# ggplot(data=dat, aes(x=time, y=total_bill)) +
#     geom_line(aes(group=1))

# Add points
ggplot(data=dat, aes(x=time, y=total_bill, group=1)) +
    geom_line() +
    geom_point()

# Change color of both line and points
# Change line type and point type, and use thicker line and larger points
# Change points to circles with white fill
ggplot(data=dat, aes(x=time, y=total_bill, group=1)) + 
    geom_line(colour="red", linetype="dashed", size=1.5) + 
    geom_point(colour="red", size=4, shape=21, fill="white")

# Change the y-range to go from 0 to the maximum value in the total_bill column,
# and change axis labels
ggplot(data=dat, aes(x=time, y=total_bill, group=1)) +
    geom_line(colour="red", linetype="dashed", size=1.5) +
    geom_point(colour="red", size=4, shape=21, fill="white") +
    expand_limits(y=0) +
    xlab("Time of day") + ylab("Total bill") +
    ggtitle("Average bill for 2 people")

Example 4-2: Line graphs with three variables

# Basic line graph with points
ggplot(data=dat1, aes(x=time, y=total_bill, group=sex)) +
    geom_line() +
    geom_point()

# Map sex to color
ggplot(data=dat1, aes(x=time, y=total_bill, group=sex, colour=sex)) +
    geom_line() +
    geom_point()

# Map sex to different point shape, and use larger points
ggplot(data=dat1, aes(x=time, y=total_bill, group=sex, shape=sex)) +
    geom_line() +
    geom_point()

# Use thicker lines and larger points, and hollow white-filled points
ggplot(data=dat1, aes(x=time, y=total_bill, group=sex, shape=sex)) + 
    geom_line(size=1.5) + 
    geom_point(size=3, fill="white") +
    scale_shape_manual(values=c(22,21))

Example 5: Bar and Line graphs

# A bar graph
ggplot(data=dat1, aes(x=time, y=total_bill, fill=sex)) + 
    geom_bar(colour="black", stat="identity",
             position=position_dodge(),
             size=.3) +                        # Thinner lines
    scale_fill_hue(name="Sex of payer") +      # Set legend title
    xlab("Time of day") + ylab("Total bill") + # Set axis labels
    ggtitle("Average bill for 2 people") +     # Set title
    theme_bw()

# A line graph
ggplot(data=dat1, aes(x=time, y=total_bill, group=sex, shape=sex, colour=sex)) + 
    geom_line(aes(linetype=sex), size=1) +     # Set linetype by sex
    geom_point(size=3, fill="white") +         # Use larger points, fill with white
    expand_limits(y=0) +                       # Set y range to include 0
    scale_colour_hue(name="Sex of payer",      # Set legend title
                     l=30)  +                  # Use darker colors (lightness=30)
    scale_shape_manual(name="Sex of payer",
                       values=c(22,21)) +      # Use points with a fill color
    scale_linetype_discrete(name="Sex of payer") +
    xlab("Time of day") + ylab("Total bill") + # Set axis labels
    ggtitle("Average bill for 2 people") +     # Set title
    theme_bw() +
    theme(legend.position=c(.7, .4))           # Position legend inside,This must go after theme_bw

Example: Multiple graphs on one page

head(ChickWeight)
##   weight Time Chick Diet
## 1     42    0     1    1
## 2     51    2     1    1
## 3     59    4     1    1
## 4     64    6     1    1
## 5     76    8     1    1
## 6     93   10     1    1
# This example uses the ChickWeight dataset, which comes with ggplot2
# First plot
p1 <- ggplot(ChickWeight, aes(x=Time, y=weight, colour=Diet, group=Chick)) +
    geom_line() +
    ggtitle("Growth curve for individual chicks")

# Second plot
p2 <- ggplot(ChickWeight, aes(x=Time, y=weight, colour=Diet)) +
    geom_point(alpha=.3) +
    geom_smooth(alpha=.2, size=1) +
    ggtitle("Fitted growth curve per diet")

# Third plot
p3 <- ggplot(subset(ChickWeight, Time==21), aes(x=weight, colour=Diet)) +
    geom_density() +
    ggtitle("Final weight, by diet")

# Fourth plot
p4 <- ggplot(subset(ChickWeight, Time==21), aes(x=weight, fill=Diet)) +
    geom_histogram(colour="black", binwidth=50) +
    facet_grid(Diet ~ .) +
    ggtitle("Final weight, by diet") +
    theme(legend.position="none")        # No legend (redundant in this graph)    

library(gridExtra)
grid.arrange(p1,p2,p3,p4, ncol=2)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

Just try it!! (GDP example)

gapminder=read.csv(file="gapminder.csv", header=TRUE)
str(gapminder)
## 'data.frame':    1692 obs. of  6 variables:
##  $ country  : Factor w/ 141 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ gdpPercap: num  779 821 853 836 740 ...
head(gapminder)
##       country year      pop continent lifeExp gdpPercap
## 1 Afghanistan 1952  8425333      Asia  28.801  779.4453
## 2 Afghanistan 1957  9240934      Asia  30.332  820.8530
## 3 Afghanistan 1962 10267083      Asia  31.997  853.1007
## 4 Afghanistan 1967 11537966      Asia  34.020  836.1971
## 5 Afghanistan 1972 13079460      Asia  36.088  739.9811
## 6 Afghanistan 1977 14880372      Asia  38.438  786.1134
summary(gapminder)
##         country          year           pop               continent  
##  Afghanistan:  12   Min.   :1952   Min.   :6.001e+04   Africa  :624  
##  Albania    :  12   1st Qu.:1966   1st Qu.:2.780e+06   Americas:300  
##  Algeria    :  12   Median :1980   Median :6.985e+06   Asia    :384  
##  Angola     :  12   Mean   :1980   Mean   :2.969e+07   Europe  :360  
##  Argentina  :  12   3rd Qu.:1993   3rd Qu.:1.955e+07   Oceania : 24  
##  Australia  :  12   Max.   :2007   Max.   :1.319e+09                 
##  (Other)    :1620                                                    
##     lifeExp        gdpPercap       
##  Min.   :23.60   Min.   :   241.2  
##  1st Qu.:48.12   1st Qu.:  1195.9  
##  Median :60.53   Median :  3534.8  
##  Mean   :59.45   Mean   :  7248.1  
##  3rd Qu.:70.88   3rd Qu.:  9380.2  
##  Max.   :82.60   Max.   :113523.1  
## 

Question 1

  1. 可否呈現「人均GDP」(gdpPercap) 與 「平均壽命」(lifeExp)之間的關係? 請問兩者有何關係?
  2. (續上題) 將 gdpPercap改為 log(gdpRercap),請問兩者有何關係?
  3. 可否僅呈現2007年的資料?

Answer 1

#temp=gapminder[gapminder$country=="Taiwan",]
#temp=gapminder[gapminder$year=="2007",]

temp=gapminder
p <- ggplot(temp, aes(x = gdpPercap, y = lifeExp)) # just initializes
#' scatterplot
p + geom_point()+ xlab("人均GDP") + ylab("平均壽命") + ggtitle("人均GDP與平均壽命之關係") + theme_gray(base_family = "STHeiti")

# 若是window系統,則不需加上 +theme_gray(base_family = "STHeiti") 
p <- ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) # just initializes
p + geom_point(size=2,alpha=0.8) + scale_x_log10()+ xlab("人均GDP") + ylab("平均壽命") + ggtitle("人均GDP與平均壽命之關係") + theme_bw(base_family = "STHeiti",base_size = 18)

temp=gapminder[gapminder$year=="2007",]
p <- ggplot(temp, aes(x = gdpPercap, y = lifeExp)) # just initializes
#' convey continent by color: MAP continent variable to aesthetic color
p + geom_point(size=3) + scale_x_log10()+ xlab("人均GDP") + ylab("平均壽命") + ggtitle("2007年人均GDP與平均壽命之關係") + theme_gray(base_family = "STHeiti",base_size = 14)

Question 2

Answer 2

temp=gapminder[gapminder$year=="2007",]
p <- ggplot(temp, aes(x = gdpPercap, y = lifeExp)) # just initializes
#' convey continent by color: MAP continent variable to aesthetic color
p + geom_point(aes(color = continent), size=3) + scale_x_log10()+ xlab("人均GDP") + ylab("平均壽命") + ggtitle("2007年人均GDP與平均壽命之關係") + theme_gray(base_family = "STHeiti",base_size = 14)

temp=gapminder[gapminder$year=="2007",]
p <- ggplot(temp, aes(x = gdpPercap, y = lifeExp)) # just initializes
p + geom_point(colour="blue", size=3) + scale_x_log10()+ xlab("人均GDP") + ylab("平均壽命") + ggtitle("2007年人均GDP與平均壽命之關係") + theme_gray(base_family = "STHeiti",base_size = 14)+facet_grid( ~ continent)

temp=gapminder[gapminder$continent=="Asia",]
p <- ggplot(temp, aes(x = gdpPercap, y = lifeExp)) # just initializes
p + geom_point(aes(color = year),alpha = (2/3), size = 3) + scale_x_log10() + xlab("人均GDP") + ylab("平均壽命") + ggtitle("亞洲人均GDP與平均壽命之關係") + theme_gray(base_family = "STHeiti",base_size = 14) 

Question 3

Answer 3

temp=gapminder
p <- ggplot(temp, aes(x = gdpPercap, y = lifeExp)) # just initializes
p + geom_point(alpha = (1/3), size = 2)+ scale_x_log10() + geom_smooth(lwd = 1, se = FALSE, method = "lm")

temp=gapminder
p <- ggplot(temp, aes(x = gdpPercap, y = lifeExp)) # just initializes
p + aes(color = continent) + geom_point()+ scale_x_log10() + geom_smooth(lwd = 1, se = FALSE, method="lm")

Question 4

Answer 4

p <- ggplot(subset(gapminder, country=="Taiwan"), aes(x = year, y = gdpPercap)) 
p + scale_y_log10()+ geom_line(size=1.5) + geom_point(size=4)

jCountries <- c("Taiwan", "Korea", "Japan", "Hong Kong")
ggplot(subset(gapminder, country %in% jCountries),
       aes(x = year, y = lifeExp, color = country)) + geom_line() + geom_point()

ggplot(subset(gapminder, country %in% jCountries),
       aes(x = year, y = gdpPercap, color = country))+ scale_y_log10()  + geom_line() + geom_point()

jCountries <- c("Taiwan", "Korea", "Japan", "Hong Kong")
p <- ggplot(subset(gapminder, country %in% jCountries), aes(x = year, y = gdpPercap, shape = country)) 
p + scale_y_log10()+ geom_line(size=1.5) + geom_point(size=4)

Question 5

Answer 5

temp=gapminder[gapminder$year>"1992",]
p <- ggplot(temp, aes(x=gdpPercap))
p + geom_histogram(aes(y = ..density..),colour = "darkgreen", fill = "white")+ scale_x_log10()+ facet_grid( ~ year)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

地圖資料呈現 (leaflet)

library(leaflet)

m <- leaflet() %>%
  addTiles() %>%  # Add default OpenStreetMap map tiles
  addMarkers(lng=121.389432, lat=25.034963, popup="長庚大學管理學院")
m  # Print the map
m <- leaflet() %>% setView(lng=121.389432, lat=25.034963, zoom = 17)
m %>% addTiles() %>% 
  addMarkers(lng=c(121.389432,121.3905), lat=c(25.034963,25.0337), popup=c("長庚大學管理學院","長庚大學工學院"))
data(quakes)
leaflet(data = quakes[101:120,]) %>% addTiles() %>%
  addMarkers(~long, ~lat, popup = ~as.character(depth))
leaflet(quakes) %>% addTiles() %>% addMarkers(
  clusterOptions = markerClusterOptions()
)
## Assuming 'long' and 'lat' are longitude and latitude, respectively
library(htmltools)

df <- read.csv(textConnection(
  "Name,Lat,Long
  Samurai Noodle,47.597131,-122.327298
  Kukai Ramen,47.6154,-122.327157
  Tsukushinbo,47.59987,-122.326726"
))

leaflet(df) %>% addTiles() %>%
  addMarkers(~Long, ~Lat, popup = ~htmlEscape(Name))
cities <- read.csv(textConnection("
City,Lat,Long,Pop
                                  Boston,42.3601,-71.0589,645966
                                  Hartford,41.7627,-72.6743,125017
                                  New York City,40.7127,-74.0059,8406000
                                  Philadelphia,39.9500,-75.1667,1553000
                                  Pittsburgh,40.4397,-79.9764,305841
                                  Providence,41.8236,-71.4222,177994
                                  "))

leaflet(cities) %>% addTiles() %>%
  addCircles(lng = ~Long, lat = ~Lat, weight = 1,
             radius = ~sqrt(Pop) * 30, popup = ~City
  )

互動式圖形分析資料 (googleVis)

其它 googleVis 圖形

library(googleVis)
gapminder=read.csv(file="gapminder.csv", header=TRUE)

Geo=gvisGeoChart(subset(gapminder, year==2007), locationvar="country", 
                 colorvar="gdpPercap",
                 options=list(projection="kavrayskiy-vii"))
plot(Geo)

Table <- gvisTable(subset(gapminder, year==2007))
plot(Table)

PopTable <- gvisTable(gapminder, 
                      options=list(page='enable'))
plot(PopTable)

# 各國地圖
temp=gapminder[gapminder$year=="2007",]
G <- gvisGeoChart(temp, "country", "gdpPercap", options=list(width=600, height=600))
plot(G)

# 互動示圖形
M1 <- gvisMotionChart(gapminder, "country", "year", options=list(state='{"showTrails":false};'))
plot(M1)

# 合併圖形
GM <- gvisMerge(G,M1, horizontal=TRUE)
plot(GM)