library(tidyverse)
library(ggplot2)
library(lattice)
library(dplyr)

Data visualization

Q1.

The distribution of personal disposable income in Taiwan in 2015 has a story to tell. Revise the following plot to enhance that message.

dta <- read.csv("income_tw.csv",header = T)
dta$Income <- factor(dta$Income, levels = dta$Income)
# sum(dta$Count)/4;sum(dta$Count)/4*2;sum(dta$Count)/4*3;sum(dta$Count)
# 3613068;7226135;10839202;14452270
unlist(lapply(c(1:length(dta$Count)), function(x) sum(dta$Count[1:x])))
##  [1]   807160  1108810  1422802  1752092  2121675  2574346  3069733
##  [8]  3587512  4145298  4729795  5320902  5893073  6459367  6965848
## [15]  7427684  7916647  8326534  8717097  9104407  9458729  9796099
## [22] 10116893 10395532 10652834 10882033 11097913 11400515 11665964
## [29] 11907472 12208045 12467310 12763106 13044702 13266002 13491119
## [36] 13686310 13881342 14069585 14284495 14392450 14452270
# 8 15 25 41
dta <- dta %>% mutate(Q = c(rep("Q1",8),rep("Q2",7),rep("Q3",10),rep("Q4",16)),
                      Income1 = Income) %>% 
  separate(.,Income1,c("from","ig","to")," ") %>% 
  select(.,-ig) 
dta$to[1] = "160,000";dta$to[41] = "2,500,000"
dta$from <- as.numeric(gsub(",","",dta$from))
dta$to <- as.numeric(gsub(",","",dta$to))
dta$med = round((dta$from+dta$to)/2,0)
dta
ggplot(aes(Count/10000, Income, col = Q), data = dta) +
  geom_point() +
  geom_segment(aes(xend=mean(Count/10000), yend=Income)) + 
  labs(x="Number of persons(x10,000)",
       title = "Distribution of disposable personal income in Taiwan in 2015") +
  theme_minimal() +
  theme(axis.text.y = element_text(color = c(rep("#F8766D",8),rep("#6BB100",7),rep("#00BCD8",10),rep("#B983FF",16))))
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

ggplot(aes(Count*med/100000000000, Income, col = Q), data = dta) +
  geom_point() +
  geom_segment(aes(xend=mean(Count*med/100000000000), yend=Income)) + 
  labs(x="Total assets in each income class(x10^11)",
       title = "Distribution of total assets in each income class") +
  theme_minimal() +
  theme(axis.text.y = element_text(color = c(rep("#F8766D",8),rep("#6BB100",7),rep("#00BCD8",10),rep("#B983FF",16))))
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

Q2.

Comment on how the graphs presented in this link violate the principles for effective graphics and how would you revise them.

library(viridis) #主題package
## Loading required package: viridisLite
library(ggthemes) #配色package

data <- read.csv("traffic.csv",fileEncoding="big5",stringsAsFactors = F)

type <- data %>%
  filter(year == 104) %>%
  group_by(type) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))

vtype <- data %>%
  filter(year == 104) %>%
  group_by(vehicle_type) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))

method <- data %>%
  filter(year == 104) %>%
  group_by(method) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))

# write.csv(type,file = "type.csv",row.names = F,fileEncoding="utf-8")
# write.csv(vtype,file = "vtype.csv",row.names = F,fileEncoding="utf-8")
# write.csv(method,file = "method.csv",row.names = F,fileEncoding="utf-8")

type_month <- data %>%
  filter(year == 104) %>%
  group_by(month,type) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))

vtype_month <- data %>%
  filter(year == 104) %>%
  group_by(month,vehicle_type) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))

method_month <- data %>%
  filter(year == 104) %>%
  group_by(month,method) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))

# write.csv(type_month,file = "type_month.csv",row.names = F,fileEncoding="utf-8")
# write.csv(vtype_month,file = "vtype_month.csv",row.names = F,fileEncoding="utf-8")
# write.csv(method_month,file = "method_month.csv",row.names = F,fileEncoding="utf-8")
ggplot(type[1:5,],aes(x=type,y =count,fill = type))+
  geom_bar(stat="identity",width=.5,alpha = 0.8) +
  geom_text(aes(label = count),size = 3.5) +
  ggtitle("104年交通違規舉發件數") +
  theme(text=element_text(family="黑體-繁 中黑"))

顏色沒有幫助,以二維圖形呈現一維資料,legend與x軸資訊重疊

revise

qplot(count, reorder(type, count), data = type[1:5,]) +
  geom_segment(aes(xend = 0, yend = reorder(type, count))) +
  geom_text(aes(label = count), vjust = -0.8, hjust = 1) +
  ggtitle("104年交通違規舉發件數") +
  labs(y="") +
  theme(text=element_text(family="黑體-繁 中黑"))

ggplot(vtype,aes(x=vehicle_type,y =count,fill = vehicle_type))+
  geom_bar(stat="identity",width=.5,alpha = 0.8) +
  geom_text(aes(label = count),size = 3.5) +
  ggtitle("104年交通違規舉發車種別件數") +
  theme(text=element_text(family="黑體-繁 中黑"))

問題同上

revise

qplot(count, reorder(vehicle_type, count), data = vtype) +
  geom_segment(aes(xend = 0, yend = reorder(vehicle_type, count))) +
  geom_text(aes(label = count), vjust = -0.8, hjust = 0.6) +
  ggtitle("104年交通違規舉發車種別件數") +
  labs(y="") +
  theme(text=element_text(family="黑體-繁 中黑"))

ggplot(method,aes(x=method,y =count,fill = method))+
  geom_bar(stat="identity",width=.5,alpha = 0.8) +
  geom_text(aes(label = count),size = 3.5) +
  ggtitle("104年交通違規舉發方式別件數") +
  theme(text=element_text(family="黑體-繁 中黑"))

問題同上

revise

qplot(count, reorder(method, count), data = method) +
  geom_segment(aes(xend = 0, yend = reorder(method, count))) +
  geom_text(aes(label = count), vjust = -0.8, hjust = 0.6) +
  ggtitle("104年交通違規舉發方式別件數") +
  labs(y="") +
  theme(text=element_text(family="黑體-繁 中黑"))

top5type <- type[1:5,1]#取出數量前五的種類
top5 <- c()#產生空的框架
for(i in 1:5){
  X <- type_month %>%
    filter(type == as.character(top5type[i,]))
  top5 <- rbind(top5,X)
}#重新取出數量前五種類的資料

ggplot(top5,aes(x=type,y =count,fill = type))+
  geom_boxplot(alpha = 0.7) +
  ggtitle("104年交通違規盒鬚圖") +
  theme(text=element_text(family="黑體-繁 中黑"))

#畫出盒鬚圖

問題同上

revise

# qplot(type, count, data=top5, geom=c("boxplot","jitter")) +
#   ggtitle("104年交通違規盒鬚圖") +
#   theme(text=element_text(family="黑體-繁 中黑"))
ggplot(top5,aes(x=reorder(type, count),y=count))+
  geom_boxplot(alpha = 0.7) +
  geom_jitter(color = "gray",pch=21) +
  ggtitle("104年交通違規盒鬚圖") +
  labs(x="") +
  theme(text=element_text(family="黑體-繁 中黑"))

hide output

hide

bar <- list()#產生空的list
for(i in 1:12){
  X <- ungroup(top5) %>%
    filter(month == i)
  bar[[i]]<- ggplot(X,aes(x=type,y =count,fill = type))+
    geom_bar(stat="identity",width=.5,alpha = 0.8) +
    geom_text(aes(label = count),size = 3.5) +
    ggtitle(paste("104年",i,"月交通違規舉發")) +
  theme(text=element_text(family="黑體-繁 中黑"))
  print(bar[[i]] )
}#產生1至12月的bars

see

bar <- list()#產生空的list
for(i in 1:12){
  X <- ungroup(top5) %>%
    filter(month == i)
  bar[[i]]<- ggplot(X,aes(x=type,y =count,fill = type))+
    geom_bar(stat="identity",width=.5,alpha = 0.8) +
    geom_text(aes(label = count),size = 3.5) +
    ggtitle(paste("104年",i,"月交通違規舉發")) +
  theme(text=element_text(family="黑體-繁 中黑"))
  print(bar[[i]] )
}#產生1至12月的bars

畫太多張圖,每月無法比較,其餘問題同上

revise

ggplot(aes(count, reorder(type, count)), data = top5) +
  geom_segment(aes(xend = 0, yend = reorder(type, count))) +
  facet_wrap(. ~ month, nrow = 4) +
  ggtitle("104年各月交通違規舉發件數") +
  labs(y="") +
  theme(text=element_text(family="黑體-繁 中黑"),
        axis.text.x = element_text(angle = 90, hjust = 1, size = 8),
        axis.text.y = element_text(size = 7))

hide

trendline <- list() 
for(i in 1:5){
   X <- ungroup(top5) %>%
    filter(type == as.character(top5type[i,]))
  trendline[[i]] <- ggplot(X,aes(x=month,y =count))+
    geom_text(aes(label = count),size = 3.5) +
    geom_line(colour = "darkred",size = 1,alpha=0.8) +
    scale_x_continuous(breaks = 1:12) +
    theme_bw() +
    ggtitle(paste("104年",as.character(top5type[i,]),"趨勢線")) +
    theme(text=element_text(family="黑體-繁 中黑"))
  print(trendline[[i]] )
}#產生前5種類的trendllines

see

trendline <- list() 
for(i in 1:5){
   X <- ungroup(top5) %>%
    filter(type == as.character(top5type[i,]))
  trendline[[i]] <- ggplot(X,aes(x=month,y =count))+
    geom_text(aes(label = count),size = 3.5) +
    geom_line(colour = "darkred",size = 1,alpha=0.8) +
    scale_x_continuous(breaks = 1:12) +
    theme_bw() +
    ggtitle(paste("104年",as.character(top5type[i,]),"趨勢線")) +
    theme(text=element_text(family="黑體-繁 中黑"))
  print(trendline[[i]] )
}#產生前5種類的trendllines

畫太多張圖,每種類型無法比較,y軸沒有統一,其餘問題同上

revise

ggplot(aes(count, month), data = top5) +
  geom_segment(aes(xend = 0, yend = month)) +
  facet_wrap(. ~ type) +
  ggtitle("104年各類型交通違規各月舉發件數") +
  ylim('1','2','3','4','5','6','7','8','9','10','11','12') +
  theme(text=element_text(family="黑體-繁 中黑"),
        axis.text.x = element_text(angle = 90, hjust = 1, size = 5),
        axis.text.y = element_text(size = 5))

month <- data %>%
  filter(year == 104) %>%
  group_by(month) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))

ggplot(month,aes(x=month,y =count))+
  geom_text(aes(label = count),size = 3.5) +
  geom_line(colour = "darkred",size = 1,alpha=0.8) +
  scale_x_continuous(breaks = 1:12) +
  theme_bw() +
  ggtitle(paste("104年交通違規總數趨勢線")) +
  theme(text=element_text(family="黑體-繁 中黑"))

折線沒有意義,文字與線重疊

revise

qplot(month, count, data=month, geom=c("point", "smooth"), se=F) +
  geom_segment(aes(xend = month, yend = 50000)) +
  geom_text(aes(label = count), vjust = -1, size = 2) +
  xlim('1','2','3','4','5','6','7','8','9','10','11','12') +
  ggtitle("104年交通違規總數趨勢線") +
  theme(text=element_text(family="黑體-繁 中黑"))
## Warning: Ignoring unknown parameters: se
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(vtype_month,aes(x=month,y =count,colour=vehicle_type))+
  geom_text(aes(label = count),size = 3.5) +
  geom_line(size = 1,alpha=0.8) +
  scale_x_continuous(breaks = 1:12) +
  theme_bw() +
  ggtitle(paste("104年交通違規車種趨勢線")) +
  theme(text=element_text(family="黑體-繁 中黑"))

車種數量差距大,違規次數差太多,重型機車在此圖看不出趨勢,文字與線重疊導致視覺上混亂,各車種違規趨勢好像也沒有比較的必要,因此傾向分開作圖

revise

機車

vtype_month %>% filter(vehicle_type == "機車") %>% 
  qplot(month, count, data=., geom=c("point", "smooth"), se=F) +
    geom_segment(aes(xend = month, yend = 25000)) +
    geom_text(aes(label = count), vjust = -1, size = 2) +
    xlim('1','2','3','4','5','6','7','8','9','10','11','12') +
    ggtitle("104年交通違規機車趨勢線") +
    theme(text=element_text(family="黑體-繁 中黑"))
## Warning: Ignoring unknown parameters: se
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

汽車

vtype_month %>% filter(vehicle_type == "汽車") %>% 
  qplot(month, count, data=., geom=c("point", "smooth"), se=F) +
    geom_segment(aes(xend = month, yend = 25000)) +
    geom_text(aes(label = count), vjust = -1, size = 2) +
    xlim('1','2','3','4','5','6','7','8','9','10','11','12') +
    ggtitle("104年交通違規汽車趨勢線") +
    theme(text=element_text(family="黑體-繁 中黑"))
## Warning: Ignoring unknown parameters: se
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

重型機車

vtype_month %>% filter(vehicle_type == "重型機車(250CC以上)") %>% 
  qplot(month, count, data=., geom=c("point", "smooth"), se=F) +
    geom_segment(aes(xend = month, yend = 200)) +
    geom_text(aes(label = count), vjust = -1, size = 2) +
    xlim('1','2','3','4','5','6','7','8','9','10','11','12') +
    ggtitle("104年交通違規重型機車(250CC以上)趨勢線") +
    theme(text=element_text(family="黑體-繁 中黑"))
## Warning: Ignoring unknown parameters: se
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(method_month,aes(x=month,y =count,colour=method))+
  geom_text(aes(label = count),size = 3.5) +
  geom_line(size = 1,alpha=0.8) +
  scale_x_continuous(breaks = 1:12) +
  theme_bw() +
  ggtitle(paste("104年交通違規舉發方式趨勢線")) +
  theme(text=element_text(family="黑體-繁 中黑"))

加上總數

revise

rb <-method_month %>% group_by(month) %>% summarise(count=sum(count)) %>% mutate(method = "總數")
method_month <- bind_rows(method_month,rb)
method_month$jit = 0
method_month[method_month$method == "攔停",]$jit = 0.1
method_month[method_month$method == "逕舉",]$jit = -0.1
qplot(month+jit, count, data=method_month, 
      colour=method, geom=c("point", "smooth"), se=F) +
  geom_segment(aes(xend = month+jit, yend = 10000)) +
  geom_text(aes(label = count), vjust = -1, size = 2) +
  xlim('1','2','3','4','5','6','7','8','9','10','11','12') +
  labs(x="month") +
  ggtitle("104年交通違規重型機車(250CC以上)趨勢線") +
  theme(text=element_text(family="黑體-繁 中黑"))
## Warning: Ignoring unknown parameters: se
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#熱度圖
ggplot(ungroup(top5),aes(x=factor(month), y=factor(type), fill=count)) +
 geom_tile(color="white", size=0.1) +
 scale_fill_viridis(name="件數")+ #使用viridis
 coord_equal()+
 labs(x=NULL, y=NULL, title="104年交通違規熱度圖")+ 
 theme_tufte(base_family="Helvetica")+ #使用ggthemes
 theme(text=element_text(family="黑體-繁 中黑"),
       plot.title=element_text(hjust=0),
       axis.ticks=element_blank(), 
       axis.text=element_text(size=7),
       legend.title=element_text(size=8),
       legend.text=element_text(size=6))

人類視覺對亮度較為敏感,因此改為灰階

revise

#熱度圖
ggplot(ungroup(top5),aes(x=factor(month), y=factor(type), fill=count)) +
  geom_tile(color="white", size=0.1) +
  scale_fill_gradient(name = "count",
                      low = "#FFFFFF",
                      high = "#012345") +
  coord_equal()+
  labs(x=NULL, y=NULL, title="104年交通違規熱度圖")+ 
  theme_bw()+
  theme(text=element_text(family="黑體-繁 中黑"),
       plot.title=element_text(hjust=0),
       axis.ticks=element_blank(), 
       axis.text=element_text(size=7),
       legend.title=element_text(size=8),
       legend.text=element_text(size=6))

Q3.

Sarah Leo at the Economist magazine published a data set to accompany the story about how scientific publishing is dominated by men. The plot on the left panel below is the orignal graph that appeared in the article. Help her find a better plot.

dta <- read.csv("dtaVisQ3data.csv",header = T)
names(dta)<-c("Country","Health Sciences","Physical Sciences","Engineering","CS and Math","Women Inventores")
dta <- gather(dta, subject, percentage, 2:6)
head(dta)
ggplot(aes(percentage, reorder(Country,percentage)), data = dta) +
  geom_point()+
  geom_segment(aes(xend = 0.2, yend = reorder(Country,percentage))) +
  facet_wrap(. ~ reorder(subject,percentage), nrow = 1) +
  labs(y="Country") +
  ggtitle("Still a man's world") 

各國比例皆接近0.2 (Health Sciences除外)