inclass1

The distribution of personal disposable income in Taiwan in 2015 has a story to tell. Revise the following plot to enhance that message.

pacman::p_load(forcats, dplyr, readr)
dta1 <-read.csv("income_tw.csv", header = T)
dta1 <-dta1 |> mutate(
  num = seq(1:n())
)
dta1$H <- parse_number(dta1$Income)

str(dta1)
'data.frame':   41 obs. of  4 variables:
 $ Income: chr  "160,000 and under" "160,000 to 179,999" "180,000 to 199,999" "200,000 to 219,999" ...
 $ Count : int  807160 301650 313992 329290 369583 452671 495387 517779 557786 584497 ...
 $ num   : int  1 2 3 4 5 6 7 8 9 10 ...
 $ H     : num  160000 160000 180000 200000 220000 240000 260000 280000 300000 320000 ...
summary(dta1)
    Income              Count             num           H          
 Length:41          Min.   : 59820   Min.   : 1   Min.   : 160000  
 Class :character   1st Qu.:241508   1st Qu.:11   1st Qu.: 340000  
 Mode  :character   Median :313992   Median :21   Median : 540000  
                    Mean   :352494   Mean   :21   Mean   : 648537  
                    3rd Qu.:461836   3rd Qu.:31   3rd Qu.: 790000  
                    Max.   :807160   Max.   :41   Max.   :2500000  

original plot

library(ggplot2)
  ggplot(dta1, aes(x=Count/10000 , y=reorder(Income,-num)))+ #加負號income改從低到高
  geom_point(size=rel(1)) +
  labs(x="Number of persons", 
       y="",
       title = "Distribution of personal disposable income in Taiwan (2015)")+
  theme_minimal()

order count

library(ggplot2)
  ggplot(dta1, aes(x=Count/10000 , y=reorder(Income,Count))) +
  geom_point(size=rel(1)) +
  labs(x="Number of persons (x 10,000)", 
       y="",
       title = "Distribution of personal disposable income in Taiwan (2015)")+
  theme_minimal()

排序以後可以更明顯看到,低於16萬和高於25萬為人數最多和最低

pacman::p_load(dplyr, readr, stringr)
dta1 <-dta1 |>
  mutate(percent = Count / sum(Count) * 100,
         cum = cumsum(Count)/sum(Count) *100,
         group6 = cut(cum, 
                     breaks = c(0,10,25,50,75,90,100),
                     labels = c("Low10%","10-25%","25-50%","50-75%","75-90%","Top10%"), include.lowest= T),
         group4 = cut(cum, 
                     breaks = c(0,25,50,75,100),
                     labels = c("Q1","Q2","Q3","Q4"), include.lowest= T)
)

library(ggplot2)
  ggplot(dta1, aes(x=Count/10000 , y=reorder(Income,-num), color=group6)) +
  geom_point(aes(x = Count/10000, y=reorder(Income,-num), color=group6)) +
  facet_grid(~group4)+
  labs(x="Number of persons (x 10,000)", 
       y="",
       title = "Distribution of personal disposable income in Taiwan (2015)")+
  theme_minimal()

# for geom_text_repel() lable
dta1v <- subset(dta1, num %in% c("1","8","12","24","32","41"))
dta1v$label<-c("under20萬","20-29萬","30-41萬","42-61萬","62-87萬","higher88萬")


dtav<-dta1 |> group_by(group6) |>
  dplyr::summarize(group_count=sum(Count))|>
  mutate(Gp=group_count/sum(group_count)*100,
         Gp = paste0(round(Gp,1), "%")
)

pacman::p_load(ggrepel, ggthemes, tidyr)
dta1v <-cbind(dta1v,dtav[,3]) 
dta1v<- unite(dta1v,labelv,c("label","Gp"), sep="-",remove=F)
ggplot(dta1, aes(Count/10000, H/10000))+
  geom_point(aes(color = group6),pch=20,size = rel(2))+
  # add point label (annotation is an alternative)
  geom_label_repel(dta1v, mapping=aes(label =label, fontface="bold", hjust=1, color = group6), show.legend = FALSE)+ #show.legend = FALSE,可以
    # setting the breaks in x and y axis
  scale_y_continuous(limits = c(0, 260), breaks = c(0, 100, 150, 200, 250))+ 
  scale_x_continuous(limits = c(0, 100), breaks = c(0, 20, 40, 60, 80,100))+
  # theme
  theme_hc()+
  # plot and axis title
  labs(x = "Number of persons (x 10,000)", 
       y = "Income (y 10,000)",
       title = "Distribution of personal disposable income in Taiwan (2015)")+
  guides(color= guide_legend(title="Income", reverse = T))

inclass2

Comment on how the graphs presented in this link violate the principles for effective graphics and how would you revise them.

pacman::p_load(dplyr, ggplot2, viridis, ggthemes)

dta2 <- read.csv("traffic.csv",fileEncoding="big5",stringsAsFactors = F)
type <- dta2 %>%
  filter(year == 104) %>%
  group_by(type) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))

vtype <- dta2 %>%
  filter(year == 104) %>%
  group_by(vehicle_type) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))

method <- dta2 %>%
  filter(year == 104) %>%
  group_by(method) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))
type_month <- dta2 %>%
  filter(year == 104) %>%
  group_by(month,type) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))

vtype_month <- dta2 %>%
  filter(year == 104) %>%
  group_by(month,vehicle_type) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))

method_month <- dta2 %>%
  filter(year == 104) %>%
  group_by(month,method) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))

original plot- part1

針對前五種舉發種類(不依規定轉彎、闖紅燈直行左轉、違規停車、違規停車拖吊、行車速度超速60公里以下)進行長條圖與盒鬚圖視覺化:

ggplot(type[1:5,],aes(x=type,y =count,fill = type))+
  geom_bar(stat="identity",width=.5,alpha = 0.8) +
  geom_text(aes(label = count),size = 3.5) +
  ggtitle("104") +
  theme(text = element_text(family = "Songti SC")) #字體

ggplot(vtype,aes(x=vehicle_type,y =count,fill = vehicle_type))+
  geom_bar(stat="identity",width=.5,alpha = 0.8) +
  geom_text(aes(label = count),size = 3.5) +
  ggtitle("104年交通違規舉發車種別件數") +
  theme(text = element_text(family = "Songti SC"))

ggplot(method,aes(x=method,y =count,fill = method))+
  geom_bar(stat="identity",width=.5,alpha = 0.8) +
  geom_text(aes(label = count),size = 3.5) +
  ggtitle("104年交通違規舉發方式別件數")  +
  theme(text = element_text(family = "Songti SC"))

整體而言視覺化有以下問題:

  • 顏色無意義

  • 缺乏排序、強調重點

  • 直方圖高data-ink ratio,卻又無意義

  • 用count缺乏分母,無法感受出top5有多大的比例

plot part1 - revised

# data management
type <-type |> mutate(
  percent = round(count/sum(as.numeric(count))*100,2),
  Type = "Type")
typeT5<- type[1:5,]
typeT6<- type[6:116,]
sum(typeT6$percent) #15.8%
[1] 15.8
sum(typeT6$count) #129161
[1] 129161
df<- data.frame("其他",129161,15.8,"Type")
names(df)<- c("type", "count", "percent", "Type")
top5v <- rbind(typeT5,df) 

#
vtype <-vtype |> mutate(
  percent = round(count/sum(as.numeric(count))*100,2),
  Type = "Type")

#
method <-method |> mutate(
  percent = round(count/sum(as.numeric(count))*100,2),
  Type = "Type")
#
p<-ggplot(top5v,aes(x=percent,y =reorder(type,percent), color = type))+
  geom_segment(aes(xend=0, yend=type),color="grey", size=1)+ #線段
  geom_point(size = 3) +
  geom_text(aes(label = percent),size = 3, color="black", nudge_x = 4) +
  scale_x_continuous(limits = c(0, 35), breaks = c(0, 10, 20, 30))+
  ggtitle("104年桃園市交通違規舉發項目百分比") +
  labs(x = "Percent", y = "")+
  theme_minimal()+
  theme(text = element_text(family = "Songti SC"))+ #字體
  theme(legend.position = "")
#
q<-ggplot(top5v,aes(x=percent,y =Type, fill = type, group=reorder(type,-percent)))+ #利用group讓y軸將六個項目combine一個
  geom_bar(stat="identity", width = 0.2)+
  geom_text(aes(label =type), size=1.5, position = position_stack(vjust = .5))+
  ggtitle("104年交通違規舉發項目百分比") +
  labs(x = "Proportion", y = "")+
  theme_minimal()+
  theme(text = element_text(family = "Songti SC"))+ #字體
  theme(legend.position = "")

pacman::p_load(gridExtra)
grid.arrange(p,q)

應該要再調整一下兩個圖的size會比較好

#
ggplot(vtype,aes(x=percent,y =Type, fill = vehicle_type, group=reorder(Type,percent)))+ #可以在這邊排序
  geom_bar(stat="identity", width = 0.3, position = "stack")+ #position讓百分比可以疊起來
  geom_text(aes(label =percent), position = position_stack(vjust = .5))+ #position_stack才會讓數字跟圖形一樣stack,vjust=.5置中於bar中
  ggtitle("104年交通違規舉發車種百分比") +
  labs(x = "Proportion", y = "")+
  theme_minimal()+
  theme(text = element_text(family = "Songti SC"))+ #字體
  guides(fill=
           guide_legend(
  title="vehicle_type",reverse = T)) #reverse= legend 改變順序(123 to 321)

如何讓圖不要留這麼多空白…

#
ggplot(method,aes(x=percent,y =Type, fill = method))+
  geom_bar(stat="identity", width = 0.3, position = "stack")+ #position讓百分比可以疊起來
  geom_text(aes(label =percent), position = position_stack(vjust = .5))+ #position_stack才會讓數字跟圖形一樣stack,vjust=.5置中於bar中,
  ggtitle("104年交通違規舉發方式百分比") +
  labs(x = "Proportion", y = "")+
  theme_minimal()+
  theme(text = element_text(family = "Songti SC"))+ #字體
  guides(fill=
           guide_legend(
  title="method",reverse = T)) #reverse= legend 改變順序(123 to 321)

original plot- part2

top5type <- type[1:5,1]#取出數量前五的種類
top5 <- data.frame()#產生空的框架
for(i in 1:5){
  X <- type_month %>%
    filter(type == as.character(top5type[i,])) #擷取兩個dataset type相同
  top5 <- rbind(top5,X) #取出後垂直合併成top5 dataset
}
top5 <-top5|> mutate(
  percent = round(count/sum(as.numeric(count))*100,2),
  Type = "Type")
bar <- list()#產生空的list
for(i in 1:12){
  X <- ungroup(top5) %>%
    filter(month == i)
  bar[[i]]<- ggplot(X,aes(x=type,y =count,fill = type))+
    geom_bar(stat="identity",width=.5,alpha = 0.8) +
    geom_text(aes(label = count),size = 3.5) +
    ggtitle(paste("104年",i,"月交通違規舉發")) +
    theme(text = element_text(family = "Songti SC"))
  print(bar[[i]] )

}#產生1至12月的bars

trendline <- list() 
for(i in 1:5){
   X <- ungroup(top5) %>%
    filter(type == as.character(top5type[i,]))
  trendline[[i]] <- ggplot(X,aes(x=month,y =count))+
    geom_text(aes(label = count),size = 3.5) +
    geom_line(colour = "darkred",size = 1,alpha=0.8) +
    scale_x_continuous(breaks = 1:12) +
    theme_bw() +
    ggtitle(paste("104年",as.character(top5type[i,]),"趨勢線")) +
    theme(text = element_text(family = "Songti SC"))
  print(trendline[[i]] )

}

#產生前5種類的trendllines

整體而言視覺化有以下問題:

  • 分別產生很多張圖,難以比較

  • 顏色的使用無必要性

  • 直方圖高data-ink ratio,卻又無意義

plot part2 - revised

ggplot(top5,aes(x=percent,y =type,color = month))+
    geom_point(size = 3) +
    ggtitle(paste("104年每月交通違規舉發")) +
    theme_minimal()+
    theme(text = element_text(family = "Songti SC"))

雖然點會重疊在一起,但是可以看出違規項目以及月份分布上具差異

另外,也可以透過顏色較直觀的看到離群月份是在上半年還是下半年,雖然理想上希望3月可以是淺色,12月是深色(待調整與研究)

ggplot(top5,aes(x=month,y =count, group=type, color=type))+
    geom_line(size = 0.5,alpha=0.8, color="black") +
    scale_x_continuous(breaks = 1:12) +
    theme_bw() +
    facet_wrap(.~type, ncol=5)+
    ggtitle(paste("104年各月趨勢線")) +
    theme(text = element_text(family = "Songti SC"))+
    theme(legend.position = "")

透過將五個違規項目拆成小圖放在一起,比較期趨勢線的差異。

inclass3

Use the free recall data to improve on the figure

reported in Murdock, B. B. (1962). The serial position effect of free recall. Journal of Experimental Psychology, 64, 482-488.

讀入多個檔案

# list.files
fls <-list.files(path = "C:/Users/user/Desktop/p_desktop/data management/1129 data visualization/Murd62", pattern = "fr")

# give it path
fL <- paste0("C:/Users/user/Desktop/p_desktop/data management/1129 data visualization/Murd62/", fls)

# 
pacman::p_load(stringr, dplyr,purrr)


# 計算每個檔案最大欄為長度
## 由於不確定每個檔案的column數,透過str_length計算欄位長度
lapply(fL, function(x){scan(x, what="numeric", sep="\t") |> str_length() |> max()}) #dtaleghth=c(24,39,39,42,45,44)
[[1]]
[1] 24

[[2]]
[1] 39

[[3]]
[1] 39

[[4]]
[1] 42

[[5]]
[1] 45

[[6]]
[1] 44
# input data by scan function
ff<-lapply(fL, function(x){scan(x, what="numeric", sep="\t")})

# 用strsplit分割字符串成column
## 根據估計的欄位長度,進行切割(最大欄位長度45,若以全部為"88"估計,最多23欄)
## 針對column內全是空的值刪除
ffs<-lapply(ff, function(x){
  y <- str_split_fixed(sapply(x, strsplit, "_and_") , " ", 23)
  y <-y[,!apply(y == "", 2, all)] 
  as.data.frame(y)
})

# 計算每個item的頻率
## 由於level順序有誤(1,10,2...)因此從新order資料順序
dta3a <-lapply(ffs, function(x) {
  y <- x |> tidyr::pivot_longer(cols = starts_with("V"),names_to = "Serial", values_to = "Item") # 轉long form
  y <- as.data.frame(table(y$Item)/1200) #計算item平均頻率
  y$Var1<-factor(as.numeric(as.character(y$Var1))) 
  y <-y[order(y$Var1),]
})

# 處理NA與不合理值
groupname<-c("10-2","15-2","20-1","20-2","30-1","40-1")
for (i in 1:6){
dta3a[[i]]$group <-groupname[i]
dta3a[[i]][dta3a[[i]]=="88" | dta3a[[i]]==""]<-NA # 將"88"與blank轉為NA
dta3a[[i]]$maxlist <- substr(dta3a[[i]]$group,start = 1, stop = 2)
# 透過找出每一data最大serial position,超出就是na
a<-as.numeric(unlist(dta3a[[i]]$Var1)) 
b<-as.numeric(dta3a[[i]]$maxlist[1])
dta3a[[i]]$Var1<-ifelse(a>b,NA,a)
}
pacman::p_load(data.table)
dta3r<-rbindlist(dta3a)

original plot

# 設置畫圖區域
## xlim 設大一點,最後一個點就不會超出了
plot(0,0, bty="n", xlim = c(0,42), ylim=c(0,1),
     xlab = "SERIA POSITION",
     ylab = "PROBABILITY OF RECALL",
     xaxs = "i", yaxs ="i") # i, 軸線相連;r,軸線有縫隙

# 用for loop一次畫六個資料的點與線
pchlist<-c(19,1,19,1,1,19) #針對每一資料圖形設定
for (i in 1:6){
points(dta3a[[i]]$Var1, 
       dta3a[[i]]$Freq,
       pch = pchlist[i])
lines(dta3a[[i]]$Var1, 
       dta3a[[i]]$Freq)}

# text label
text(2, 0.9, "10-2") #text(x,y,lable)
lines(c(2, 6), c(0.85, 0.6)) #lines(c(x1,x2),c(y1,y2))

text(14, 0.65, "15-2")
lines(c(12, 13), c(0.61, 0.63))

text(13, 0.48, "20-2")
lines(c(12, 13), c(0.45, 0.3))

text(18.5, 0.44, "20-1")
lines(c(16, 17), c(0.35, 0.4))

text(23, 0.6, "30-1")
lines(c(21.5, 21, 26), c(0.55, 0.52, 0.39))

text(35, 0.8, "40-1")
lines(c(33.5, 33, 37.5), c(0.77, 0.75, 0.59))

# axis (圖上的軸在這邊設定)
axis(1,seq(0, 40, 10), labels = FALSE)
axis(2,seq(0, 1, 0.1), labels = FALSE)

plot by primary and recency effect

primary effect 會發生在前3個詞,recency effect會發生在後8個詞

pacman::p_load(ggplot2,ggrepel)
dta3rv <- dta3r[!is.na(dta3r$Var1), ]|> 
  mutate(seggroup = ifelse(Var1 %in% c(1,2,3), "primary", 
                    ifelse(group=="10-2" & Var1 %in% c(4:10),"recency",
                    ifelse(group=="15-2" & Var1 %in% c(8:15),"recency",
                    ifelse(group=="20-1" & Var1 %in% c(13:20),"recency",
                    ifelse(group=="20-2" & Var1 %in% c(13:20),"recency",
                    ifelse(group=="30-1" & Var1 %in% c(23:30),"recency",
                    ifelse(group=="40-1" & Var1 %in% c(33:40),"recency",
                  "non"))))))),
         seggroup =factor(seggroup, levels = c("primary","non","recency")))

str(dta3rv)
Classes 'data.table' and 'data.frame':  135 obs. of  5 variables:
 $ Var1    : num  1 2 3 4 5 6 7 8 9 10 ...
 $ Freq    : num  0.703 0.569 0.475 0.463 0.459 ...
 $ group   : chr  "10-2" "10-2" "10-2" "10-2" ...
 $ maxlist : chr  "10" "10" "10" "10" ...
 $ seggroup: Factor w/ 3 levels "primary","non",..: 1 1 1 3 3 3 3 3 3 3 ...
 - attr(*, ".internal.selfref")=<externalptr> 
ggplot(dta3rv, aes(x=Var1, y= Freq, group=group, color=seggroup))+
  geom_point()+
  geom_line()+
  geom_text_repel(data=dta3rv[Freq>0.9], aes(label = group), vjust = 1, color="black")+
  scale_x_continuous(breaks=c(0, 10, 15, 20, 30, 40))+
  scale_y_continuous(limits = c(0, 1))+
  scale_color_manual(name = "Effect", values = c("lightpink1", 
                                           "grey", 
                                           "tomato1"))+
  labs(x = "Serial position", y = "Proportion of recall")+
  theme_minimal()+
  theme(legend.position = "bottom")

inclass4

Sarah Leo at the Economist magazine published a data set to accompany the story about how scientific publishing is dominated by men. The plot on the left panel below is the orignal graph that appeared in the article. Help her find a better plot.

dta4 <- read.csv("Economist_women-research.csv", header = T, skip= 1)
names(dta4)<- c("Country", "Health", "Physical", "Engineering", "ComputerMaths", "Inventor")
dta4 <-dta4[-c(13:18),]
dta4$Health <- as.numeric(dta4$Health)

pacman::p_load(tidyr)
dta4L<-dta4 |> pivot_longer(cols=-Country,
                     names_to= "Subject",
                     values_to= "F.per")|>
  mutate(M.per= 1-F.per,
         Differ=M.per-F.per)

dta4Lv<-dta4L |> pivot_longer(cols=-c(Country,Subject,Differ),
                     names_to= "Gender",
                     values_to= "Percent")
dta4Lv$Gender = substr(dta4Lv$Gender, start = 1, stop = 1)
library(ggplot2)
  ggplot(dta4Lv, aes(x=Percent , y=reorder(Country,Differ), color=Gender, shape=Gender, group=reorder(Country,Differ))) + # group by country後線就會依照國家兩性差異畫
  geom_point(aes(x = Percent, y=reorder(Country,Differ), color=Gender, shape=Gender)) +
  geom_line(color="black", lty=1)+
  scale_shape_manual(name="Gender", values = c(16,16))+
  scale_color_manual(name="Gender", values =c("red","blue"))+
  scale_x_continuous(limits = c(0,1), breaks=c(.2,.4,.6,.8,1))+
  facet_grid(~reorder(Subject,Differ))+
  labs(x="Percent", 
       y="Country",
       title = "")+
  theme_minimal()

用顏色標示性別比例,黑線代表其差距,並按照兩性差距進行國家與科目的排序。由圖可知:

  • 在健康領域兩性差異最小
  • 在國家方面,日本在五個領域的兩性差距都是最大的