The distribution of personal disposable income in Taiwan in 2015 has a story to tell. Revise the following plot to enhance that message.
::p_load(forcats, dplyr, readr)
pacman<-read.csv("income_tw.csv", header = T)
dta1 <-dta1 |> mutate(
dta1 num = seq(1:n())
)$H <- parse_number(dta1$Income)
dta1
str(dta1)
'data.frame': 41 obs. of 4 variables:
$ Income: chr "160,000 and under" "160,000 to 179,999" "180,000 to 199,999" "200,000 to 219,999" ...
$ Count : int 807160 301650 313992 329290 369583 452671 495387 517779 557786 584497 ...
$ num : int 1 2 3 4 5 6 7 8 9 10 ...
$ H : num 160000 160000 180000 200000 220000 240000 260000 280000 300000 320000 ...
summary(dta1)
Income Count num H
Length:41 Min. : 59820 Min. : 1 Min. : 160000
Class :character 1st Qu.:241508 1st Qu.:11 1st Qu.: 340000
Mode :character Median :313992 Median :21 Median : 540000
Mean :352494 Mean :21 Mean : 648537
3rd Qu.:461836 3rd Qu.:31 3rd Qu.: 790000
Max. :807160 Max. :41 Max. :2500000
library(ggplot2)
ggplot(dta1, aes(x=Count/10000 , y=reorder(Income,-num)))+ #加負號income改從低到高
geom_point(size=rel(1)) +
labs(x="Number of persons",
y="",
title = "Distribution of personal disposable income in Taiwan (2015)")+
theme_minimal()
library(ggplot2)
ggplot(dta1, aes(x=Count/10000 , y=reorder(Income,Count))) +
geom_point(size=rel(1)) +
labs(x="Number of persons (x 10,000)",
y="",
title = "Distribution of personal disposable income in Taiwan (2015)")+
theme_minimal()
排序以後可以更明顯看到,低於16萬和高於25萬為人數最多和最低
::p_load(dplyr, readr, stringr)
pacman<-dta1 |>
dta1 mutate(percent = Count / sum(Count) * 100,
cum = cumsum(Count)/sum(Count) *100,
group6 = cut(cum,
breaks = c(0,10,25,50,75,90,100),
labels = c("Low10%","10-25%","25-50%","50-75%","75-90%","Top10%"), include.lowest= T),
group4 = cut(cum,
breaks = c(0,25,50,75,100),
labels = c("Q1","Q2","Q3","Q4"), include.lowest= T)
)
library(ggplot2)
ggplot(dta1, aes(x=Count/10000 , y=reorder(Income,-num), color=group6)) +
geom_point(aes(x = Count/10000, y=reorder(Income,-num), color=group6)) +
facet_grid(~group4)+
labs(x="Number of persons (x 10,000)",
y="",
title = "Distribution of personal disposable income in Taiwan (2015)")+
theme_minimal()
# for geom_text_repel() lable
<- subset(dta1, num %in% c("1","8","12","24","32","41"))
dta1v $label<-c("under20萬","20-29萬","30-41萬","42-61萬","62-87萬","higher88萬")
dta1v
<-dta1 |> group_by(group6) |>
dtav::summarize(group_count=sum(Count))|>
dplyrmutate(Gp=group_count/sum(group_count)*100,
Gp = paste0(round(Gp,1), "%")
)
::p_load(ggrepel, ggthemes, tidyr)
pacman<-cbind(dta1v,dtav[,3])
dta1v <- unite(dta1v,labelv,c("label","Gp"), sep="-",remove=F) dta1v
ggplot(dta1, aes(Count/10000, H/10000))+
geom_point(aes(color = group6),pch=20,size = rel(2))+
# add point label (annotation is an alternative)
geom_label_repel(dta1v, mapping=aes(label =label, fontface="bold", hjust=1, color = group6), show.legend = FALSE)+ #show.legend = FALSE,可以
# setting the breaks in x and y axis
scale_y_continuous(limits = c(0, 260), breaks = c(0, 100, 150, 200, 250))+
scale_x_continuous(limits = c(0, 100), breaks = c(0, 20, 40, 60, 80,100))+
# theme
theme_hc()+
# plot and axis title
labs(x = "Number of persons (x 10,000)",
y = "Income (y 10,000)",
title = "Distribution of personal disposable income in Taiwan (2015)")+
guides(color= guide_legend(title="Income", reverse = T))
Comment on how the graphs presented in this link violate the principles for effective graphics and how would you revise them.
::p_load(dplyr, ggplot2, viridis, ggthemes)
pacman
<- read.csv("traffic.csv",fileEncoding="big5",stringsAsFactors = F) dta2
<- dta2 %>%
type filter(year == 104) %>%
group_by(type) %>%
summarise(count = sum(count)) %>%
arrange(desc(count))
<- dta2 %>%
vtype filter(year == 104) %>%
group_by(vehicle_type) %>%
summarise(count = sum(count)) %>%
arrange(desc(count))
<- dta2 %>%
method filter(year == 104) %>%
group_by(method) %>%
summarise(count = sum(count)) %>%
arrange(desc(count))
<- dta2 %>%
type_month filter(year == 104) %>%
group_by(month,type) %>%
summarise(count = sum(count)) %>%
arrange(desc(count))
<- dta2 %>%
vtype_month filter(year == 104) %>%
group_by(month,vehicle_type) %>%
summarise(count = sum(count)) %>%
arrange(desc(count))
<- dta2 %>%
method_month filter(year == 104) %>%
group_by(month,method) %>%
summarise(count = sum(count)) %>%
arrange(desc(count))
針對前五種舉發種類(不依規定轉彎、闖紅燈直行左轉、違規停車、違規停車拖吊、行車速度超速60公里以下)進行長條圖與盒鬚圖視覺化:
ggplot(type[1:5,],aes(x=type,y =count,fill = type))+
geom_bar(stat="identity",width=.5,alpha = 0.8) +
geom_text(aes(label = count),size = 3.5) +
ggtitle("104") +
theme(text = element_text(family = "Songti SC")) #字體
ggplot(vtype,aes(x=vehicle_type,y =count,fill = vehicle_type))+
geom_bar(stat="identity",width=.5,alpha = 0.8) +
geom_text(aes(label = count),size = 3.5) +
ggtitle("104年交通違規舉發車種別件數") +
theme(text = element_text(family = "Songti SC"))
ggplot(method,aes(x=method,y =count,fill = method))+
geom_bar(stat="identity",width=.5,alpha = 0.8) +
geom_text(aes(label = count),size = 3.5) +
ggtitle("104年交通違規舉發方式別件數") +
theme(text = element_text(family = "Songti SC"))
整體而言視覺化有以下問題:
顏色無意義
缺乏排序、強調重點
直方圖高data-ink ratio,卻又無意義
用count缺乏分母,無法感受出top5有多大的比例
# data management
<-type |> mutate(
type percent = round(count/sum(as.numeric(count))*100,2),
Type = "Type")
<- type[1:5,]
typeT5<- type[6:116,]
typeT6sum(typeT6$percent) #15.8%
[1] 15.8
sum(typeT6$count) #129161
[1] 129161
<- data.frame("其他",129161,15.8,"Type")
dfnames(df)<- c("type", "count", "percent", "Type")
<- rbind(typeT5,df)
top5v
#
<-vtype |> mutate(
vtype percent = round(count/sum(as.numeric(count))*100,2),
Type = "Type")
#
<-method |> mutate(
method percent = round(count/sum(as.numeric(count))*100,2),
Type = "Type")
#
<-ggplot(top5v,aes(x=percent,y =reorder(type,percent), color = type))+
pgeom_segment(aes(xend=0, yend=type),color="grey", size=1)+ #線段
geom_point(size = 3) +
geom_text(aes(label = percent),size = 3, color="black", nudge_x = 4) +
scale_x_continuous(limits = c(0, 35), breaks = c(0, 10, 20, 30))+
ggtitle("104年桃園市交通違規舉發項目百分比") +
labs(x = "Percent", y = "")+
theme_minimal()+
theme(text = element_text(family = "Songti SC"))+ #字體
theme(legend.position = "")
#
<-ggplot(top5v,aes(x=percent,y =Type, fill = type, group=reorder(type,-percent)))+ #利用group讓y軸將六個項目combine一個
qgeom_bar(stat="identity", width = 0.2)+
geom_text(aes(label =type), size=1.5, position = position_stack(vjust = .5))+
ggtitle("104年交通違規舉發項目百分比") +
labs(x = "Proportion", y = "")+
theme_minimal()+
theme(text = element_text(family = "Songti SC"))+ #字體
theme(legend.position = "")
::p_load(gridExtra)
pacmangrid.arrange(p,q)
應該要再調整一下兩個圖的size會比較好
#
ggplot(vtype,aes(x=percent,y =Type, fill = vehicle_type, group=reorder(Type,percent)))+ #可以在這邊排序
geom_bar(stat="identity", width = 0.3, position = "stack")+ #position讓百分比可以疊起來
geom_text(aes(label =percent), position = position_stack(vjust = .5))+ #position_stack才會讓數字跟圖形一樣stack,vjust=.5置中於bar中
ggtitle("104年交通違規舉發車種百分比") +
labs(x = "Proportion", y = "")+
theme_minimal()+
theme(text = element_text(family = "Songti SC"))+ #字體
guides(fill=
guide_legend(
title="vehicle_type",reverse = T)) #reverse= legend 改變順序(123 to 321)
如何讓圖不要留這麼多空白…
#
ggplot(method,aes(x=percent,y =Type, fill = method))+
geom_bar(stat="identity", width = 0.3, position = "stack")+ #position讓百分比可以疊起來
geom_text(aes(label =percent), position = position_stack(vjust = .5))+ #position_stack才會讓數字跟圖形一樣stack,vjust=.5置中於bar中,
ggtitle("104年交通違規舉發方式百分比") +
labs(x = "Proportion", y = "")+
theme_minimal()+
theme(text = element_text(family = "Songti SC"))+ #字體
guides(fill=
guide_legend(
title="method",reverse = T)) #reverse= legend 改變順序(123 to 321)
<- type[1:5,1]#取出數量前五的種類
top5type <- data.frame()#產生空的框架
top5 for(i in 1:5){
<- type_month %>%
X filter(type == as.character(top5type[i,])) #擷取兩個dataset type相同
<- rbind(top5,X) #取出後垂直合併成top5 dataset
top5
}<-top5|> mutate(
top5 percent = round(count/sum(as.numeric(count))*100,2),
Type = "Type")
<- list()#產生空的list
bar for(i in 1:12){
<- ungroup(top5) %>%
X filter(month == i)
<- ggplot(X,aes(x=type,y =count,fill = type))+
bar[[i]]geom_bar(stat="identity",width=.5,alpha = 0.8) +
geom_text(aes(label = count),size = 3.5) +
ggtitle(paste("104年",i,"月交通違規舉發")) +
theme(text = element_text(family = "Songti SC"))
print(bar[[i]] )
#產生1至12月的bars }
<- list()
trendline for(i in 1:5){
<- ungroup(top5) %>%
X filter(type == as.character(top5type[i,]))
<- ggplot(X,aes(x=month,y =count))+
trendline[[i]] geom_text(aes(label = count),size = 3.5) +
geom_line(colour = "darkred",size = 1,alpha=0.8) +
scale_x_continuous(breaks = 1:12) +
theme_bw() +
ggtitle(paste("104年",as.character(top5type[i,]),"趨勢線")) +
theme(text = element_text(family = "Songti SC"))
print(trendline[[i]] )
}
#產生前5種類的trendllines
整體而言視覺化有以下問題:
分別產生很多張圖,難以比較
顏色的使用無必要性
直方圖高data-ink ratio,卻又無意義
ggplot(top5,aes(x=percent,y =type,color = month))+
geom_point(size = 3) +
ggtitle(paste("104年每月交通違規舉發")) +
theme_minimal()+
theme(text = element_text(family = "Songti SC"))
雖然點會重疊在一起,但是可以看出違規項目以及月份分布上具差異
另外,也可以透過顏色較直觀的看到離群月份是在上半年還是下半年,雖然理想上希望3月可以是淺色,12月是深色(待調整與研究)
ggplot(top5,aes(x=month,y =count, group=type, color=type))+
geom_line(size = 0.5,alpha=0.8, color="black") +
scale_x_continuous(breaks = 1:12) +
theme_bw() +
facet_wrap(.~type, ncol=5)+
ggtitle(paste("104年各月趨勢線")) +
theme(text = element_text(family = "Songti SC"))+
theme(legend.position = "")
透過將五個違規項目拆成小圖放在一起,比較期趨勢線的差異。
Use the free recall data to improve on the figure
reported in Murdock, B. B. (1962). The serial position effect of free recall. Journal of Experimental Psychology, 64, 482-488.
# list.files
<-list.files(path = "C:/Users/user/Desktop/p_desktop/data management/1129 data visualization/Murd62", pattern = "fr")
fls
# give it path
<- paste0("C:/Users/user/Desktop/p_desktop/data management/1129 data visualization/Murd62/", fls)
fL
#
::p_load(stringr, dplyr,purrr)
pacman
# 計算每個檔案最大欄為長度
## 由於不確定每個檔案的column數,透過str_length計算欄位長度
lapply(fL, function(x){scan(x, what="numeric", sep="\t") |> str_length() |> max()}) #dtaleghth=c(24,39,39,42,45,44)
[[1]]
[1] 24
[[2]]
[1] 39
[[3]]
[1] 39
[[4]]
[1] 42
[[5]]
[1] 45
[[6]]
[1] 44
# input data by scan function
<-lapply(fL, function(x){scan(x, what="numeric", sep="\t")})
ff
# 用strsplit分割字符串成column
## 根據估計的欄位長度,進行切割(最大欄位長度45,若以全部為"88"估計,最多23欄)
## 針對column內全是空的值刪除
<-lapply(ff, function(x){
ffs<- str_split_fixed(sapply(x, strsplit, "_and_") , " ", 23)
y <-y[,!apply(y == "", 2, all)]
y as.data.frame(y)
})
# 計算每個item的頻率
## 由於level順序有誤(1,10,2...)因此從新order資料順序
<-lapply(ffs, function(x) {
dta3a <- x |> tidyr::pivot_longer(cols = starts_with("V"),names_to = "Serial", values_to = "Item") # 轉long form
y <- as.data.frame(table(y$Item)/1200) #計算item平均頻率
y $Var1<-factor(as.numeric(as.character(y$Var1)))
y<-y[order(y$Var1),]
y
})
# 處理NA與不合理值
<-c("10-2","15-2","20-1","20-2","30-1","40-1")
groupnamefor (i in 1:6){
$group <-groupname[i]
dta3a[[i]]=="88" | dta3a[[i]]==""]<-NA # 將"88"與blank轉為NA
dta3a[[i]][dta3a[[i]]$maxlist <- substr(dta3a[[i]]$group,start = 1, stop = 2)
dta3a[[i]]# 透過找出每一data最大serial position,超出就是na
<-as.numeric(unlist(dta3a[[i]]$Var1))
a<-as.numeric(dta3a[[i]]$maxlist[1])
b$Var1<-ifelse(a>b,NA,a)
dta3a[[i]] }
::p_load(data.table)
pacman<-rbindlist(dta3a) dta3r
# 設置畫圖區域
## xlim 設大一點,最後一個點就不會超出了
plot(0,0, bty="n", xlim = c(0,42), ylim=c(0,1),
xlab = "SERIA POSITION",
ylab = "PROBABILITY OF RECALL",
xaxs = "i", yaxs ="i") # i, 軸線相連;r,軸線有縫隙
# 用for loop一次畫六個資料的點與線
<-c(19,1,19,1,1,19) #針對每一資料圖形設定
pchlistfor (i in 1:6){
points(dta3a[[i]]$Var1,
$Freq,
dta3a[[i]]pch = pchlist[i])
lines(dta3a[[i]]$Var1,
$Freq)}
dta3a[[i]]
# text label
text(2, 0.9, "10-2") #text(x,y,lable)
lines(c(2, 6), c(0.85, 0.6)) #lines(c(x1,x2),c(y1,y2))
text(14, 0.65, "15-2")
lines(c(12, 13), c(0.61, 0.63))
text(13, 0.48, "20-2")
lines(c(12, 13), c(0.45, 0.3))
text(18.5, 0.44, "20-1")
lines(c(16, 17), c(0.35, 0.4))
text(23, 0.6, "30-1")
lines(c(21.5, 21, 26), c(0.55, 0.52, 0.39))
text(35, 0.8, "40-1")
lines(c(33.5, 33, 37.5), c(0.77, 0.75, 0.59))
# axis (圖上的軸在這邊設定)
axis(1,seq(0, 40, 10), labels = FALSE)
axis(2,seq(0, 1, 0.1), labels = FALSE)
primary effect 會發生在前3個詞,recency effect會發生在後8個詞
::p_load(ggplot2,ggrepel)
pacman<- dta3r[!is.na(dta3r$Var1), ]|>
dta3rv mutate(seggroup = ifelse(Var1 %in% c(1,2,3), "primary",
ifelse(group=="10-2" & Var1 %in% c(4:10),"recency",
ifelse(group=="15-2" & Var1 %in% c(8:15),"recency",
ifelse(group=="20-1" & Var1 %in% c(13:20),"recency",
ifelse(group=="20-2" & Var1 %in% c(13:20),"recency",
ifelse(group=="30-1" & Var1 %in% c(23:30),"recency",
ifelse(group=="40-1" & Var1 %in% c(33:40),"recency",
"non"))))))),
seggroup =factor(seggroup, levels = c("primary","non","recency")))
str(dta3rv)
Classes 'data.table' and 'data.frame': 135 obs. of 5 variables:
$ Var1 : num 1 2 3 4 5 6 7 8 9 10 ...
$ Freq : num 0.703 0.569 0.475 0.463 0.459 ...
$ group : chr "10-2" "10-2" "10-2" "10-2" ...
$ maxlist : chr "10" "10" "10" "10" ...
$ seggroup: Factor w/ 3 levels "primary","non",..: 1 1 1 3 3 3 3 3 3 3 ...
- attr(*, ".internal.selfref")=<externalptr>
ggplot(dta3rv, aes(x=Var1, y= Freq, group=group, color=seggroup))+
geom_point()+
geom_line()+
geom_text_repel(data=dta3rv[Freq>0.9], aes(label = group), vjust = 1, color="black")+
scale_x_continuous(breaks=c(0, 10, 15, 20, 30, 40))+
scale_y_continuous(limits = c(0, 1))+
scale_color_manual(name = "Effect", values = c("lightpink1",
"grey",
"tomato1"))+
labs(x = "Serial position", y = "Proportion of recall")+
theme_minimal()+
theme(legend.position = "bottom")
Sarah Leo at the Economist magazine published a data set to accompany the story about how scientific publishing is dominated by men. The plot on the left panel below is the orignal graph that appeared in the article. Help her find a better plot.
<- read.csv("Economist_women-research.csv", header = T, skip= 1)
dta4 names(dta4)<- c("Country", "Health", "Physical", "Engineering", "ComputerMaths", "Inventor")
<-dta4[-c(13:18),]
dta4 $Health <- as.numeric(dta4$Health)
dta4
::p_load(tidyr)
pacman<-dta4 |> pivot_longer(cols=-Country,
dta4Lnames_to= "Subject",
values_to= "F.per")|>
mutate(M.per= 1-F.per,
Differ=M.per-F.per)
<-dta4L |> pivot_longer(cols=-c(Country,Subject,Differ),
dta4Lvnames_to= "Gender",
values_to= "Percent")
$Gender = substr(dta4Lv$Gender, start = 1, stop = 1) dta4Lv
library(ggplot2)
ggplot(dta4Lv, aes(x=Percent , y=reorder(Country,Differ), color=Gender, shape=Gender, group=reorder(Country,Differ))) + # group by country後線就會依照國家兩性差異畫
geom_point(aes(x = Percent, y=reorder(Country,Differ), color=Gender, shape=Gender)) +
geom_line(color="black", lty=1)+
scale_shape_manual(name="Gender", values = c(16,16))+
scale_color_manual(name="Gender", values =c("red","blue"))+
scale_x_continuous(limits = c(0,1), breaks=c(.2,.4,.6,.8,1))+
facet_grid(~reorder(Subject,Differ))+
labs(x="Percent",
y="Country",
title = "")+
theme_minimal()
用顏色標示性別比例,黑線代表其差距,並按照兩性差距進行國家與科目的排序。由圖可知: