Task: The distribution of personal disposable income in Taiwan in 2015 has a story to tell. Revise the following plot to enhance that message.
source: https://www.stat.gov.tw/ct.asp?xItem=40875&ctNode=511&mp=4
::p_load(ggplot2, dplyr, readr, tidyr)
pacman
<- read.csv("income_tw.csv", header = T)
dat
<- dat |>
dat mutate(Income = factor(Income, levels = Income),
proportionCumulativeCount = rev(cumsum(rev(Count))) / sum(Count),
proportionCumulativeCount_group = cut(proportionCumulativeCount,
breaks = c(0, .01, .05, .1, .25, .5, 1),
labels = c("Top 1%",
"Top 5%",
"Top 10%",
"Top 25%",
"Top 50%",
"Below 50%"),
include.lowest = T))
ggplot(dat, aes(x = Income, y = Count/10^4,
fill = proportionCumulativeCount_group,
shape = proportionCumulativeCount_group)) +
geom_hline(yintercept = median(dat$Count/10^4), color = "gray90", lwd = .5, lty = 2) +
geom_vline(xintercept = "260,000 to 279,999",color = "gray80", lwd = .8, lty = 1) +
geom_point(aes(x = Income, y = Count/10^4), size=2) +
geom_segment(aes(xend = Income, yend = 0)) +
scale_fill_manual(name = "% of population",
values = c("gray80",
"gray70",
"gray60",
"gray50",
"gray40",
"black"))+
scale_shape_manual(name = "% of population", values = c(24, 24, 24, 24, 24, 25))+
#scale_linetype_manual(name = "% of population", values = c(1:6))+
labs(x = "Personal disposable income",
y = "Number of persons(x 10,000)",
title = "Distribution of personal disposable income in Taiwan (2015)")+
annotate("text", x = "340,000 to 359,999", y = 78,
label ="Median DPI=266,490", color="gray40",
size=3)+
theme_minimal()+
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, margin = margin(t = 6), hjust = 1, size = 5),
panel.grid.major.x = element_blank(),
legend.position = "top")+
# coord_flip()+
guides(fill=guide_legend(ncol = 6, reverse = TRUE),
shape=guide_legend(ncol = 6, reverse = TRUE))
ggplot(dat, aes(x = Income, y = Count/10^4,
color = proportionCumulativeCount_group,
shape = proportionCumulativeCount_group)) +
geom_hline(yintercept = median(dat$Count/10^4), color = "gray90", lwd = .5, lty = 2) +
geom_vline(xintercept = "260,000 to 279,999",color = "gray80", lwd = .8, lty = 1) +
geom_point(aes(x = Income, y = Count/10^4), size=2) +
geom_segment(aes(xend = Income, yend = 0)) +
scale_color_manual(name = "% of population",
values = c("royalblue4",
"royalblue3",
"blue",
"royalblue2",
"royalblue1",
"tomato"))+
scale_shape_manual(name = "% of population", values = c(17, 17, 17, 17, 17, 25))+
#scale_linetype_manual(name = "% of population", values = c(1:6))+
labs(x = "Personal disposable income",
y = "Number of persons(x 10,000)",
title = "Distribution of personal disposable income in Taiwan (2015)")+
annotate("text", x = "340,000 to 359,999", y = 78,
label ="Median DPI=266,490", color="gray40",
size=3)+
theme_minimal()+
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, margin = margin(t = 6), hjust = 1, size = 5),
panel.grid.major.x = element_blank(),
legend.position = "top")+
# coord_flip()+
guides(color=guide_legend(ncol = 6, reverse = TRUE),
shape=guide_legend(ncol = 6, reverse = TRUE))
Task: Comment on how the graphs presented in this link violate the principles for effective graphics and how would you revise them.
::p_load(dplyr, ggplot2, viridis, ggthemes)
pacman
<- read.csv("traffic.csv",fileEncoding="big5",stringsAsFactors = F) dat
<- dat %>%
type filter(year == 104) %>%
group_by(type) %>%
summarise(count = sum(count)) %>%
arrange(desc(count))
<- dat %>%
vtype filter(year == 104) %>%
group_by(vehicle_type) %>%
summarise(count = sum(count)) %>%
arrange(desc(count))
<- dat %>%
method filter(year == 104) %>%
group_by(method) %>%
summarise(count = sum(count)) %>%
arrange(desc(count))
<- dat %>%
type_month filter(year == 104) %>%
group_by(month,type) %>%
summarise(count=sum(count)) %>%
arrange(desc(count))
## `summarise()` has grouped output by 'month'. You can override using the `.groups` argument.
<- dat %>%
vtype_month filter(year == 104) %>%
group_by(month,vehicle_type) %>%
summarise(count=sum(count)) %>%
arrange(desc(count))
## `summarise()` has grouped output by 'month'. You can override using the `.groups` argument.
<- dat %>%
method_month filter(year == 104) %>%
group_by(month,method) %>%
summarise(count=sum(count)) %>%
arrange(desc(count))
## `summarise()` has grouped output by 'month'. You can override using the `.groups` argument.
原本圖的缺點:
- 種類無排序
- 顏色無意義
- 使用長條圖data-ink ratio 太高
- 缺乏整合比較
# add font
windowsFonts(A=windowsFont("Microsoft JhengHei UI"))
1:5,] %>%
type[arrange(count) %>%
mutate(type=factor(type, levels=type)) %>%
ggplot(., aes(x=count/1000, y=type, color=type))+
geom_point()+
geom_segment(aes(xend=0, yend=type)) +
scale_color_manual(values=c("gray50", "gray50", "gray50", "gray50", "tomato")) +
geom_text(aes(label=count/1000), size=3, nudge_x=12)+
ggtitle("104年桃園市交通違規取締前五名舉發件數之種類")+
labs(x = "舉發件數 (x 1000)",
y = "舉發種類")+
theme_minimal()+
theme(legend.position = "",
text = element_text(family = "A"))
%>%
vtype arrange(count) %>%
mutate(vehicle_type=factor(vehicle_type, levels=vehicle_type)) %>%
ggplot(., aes(x=count/1000, y=vehicle_type, color=vehicle_type))+
geom_point()+
geom_segment(aes(xend=0, yend=vehicle_type)) +
scale_color_manual(values=c("gray50", "gray50", "tomato")) +
geom_text(aes(label=count/1000), size=3.5, nudge_x=28) +
ggtitle("104年交通違規舉發車種別件數")+
labs(x = "舉發件數 (x 1000)",
y = "車種別")+
theme_minimal()+
theme(legend.position = "",
text = element_text(family = "A"))
%>%
method arrange(count) %>%
mutate(method=factor(method, levels=method)) %>%
ggplot(., aes(x=count/1000, y=method, color=method))+
geom_point()+
geom_segment(aes(xend=0, yend=method)) +
scale_color_manual(values=c("gray50", "tomato")) +
geom_text(aes(label=count/1000), size=3.5, nudge_x=40) +
ggtitle("104年交通違規舉發方式別件數")+
labs(x = "舉發件數 (x 1000)",
y = "舉發方式")+
theme_minimal()+
theme(legend.position = "",
text = element_text(family = "A"))
%>%
dat filter(year == 104) %>%
filter(type %in% c("違規停車",
"行車速度超速60公里以下",
"違規停車拖吊",
"闖紅燈直行左轉",
"不依規定轉彎"))%>%
group_by(type, method, vehicle_type) %>%
summarise(count=sum(count)) %>%
mutate(type=factor(type, levels=c("不依規定轉彎",
"闖紅燈直行左轉",
"違規停車拖吊",
"行車速度超速60公里以下",
"違規停車")),
vehicle_type=factor(vehicle_type, levels=c("機車",
"汽車",
"重型機車(250CC以上)")),
method=factor(method, levels=c("逕舉","攔停")))%>%
ggplot(., aes(x=count/1000, y=type, color=vehicle_type))+
geom_point()+
facet_wrap(method ~ vehicle_type)+
labs(x = "舉發件數 (x 1000)",
y = "舉發種類",
title = "104年桃園市交通違規取締",
subtitle = "前五名舉發種類、舉發方式及車種別")+
theme_minimal()+
theme(legend.position = "top",
text = element_text(family = "A"))+
guides(color=guide_legend(title="車種別", ncol = 3, reverse = F))
%>%
dat filter(year == 104) %>%
filter(type %in% c("違規停車",
"行車速度超速60公里以下",
"違規停車拖吊",
"闖紅燈直行左轉",
"不依規定轉彎"))%>%
group_by(type, method, vehicle_type) %>%
# summarise(count=sum(count)) %>%
mutate(type=factor(type, levels=c("不依規定轉彎",
"闖紅燈直行左轉",
"違規停車拖吊",
"行車速度超速60公里以下",
"違規停車")),
vehicle_type=factor(vehicle_type, levels=c("機車",
"汽車",
"重型機車(250CC以上)")),
method=factor(method, levels=c("逕舉","攔停")))%>%
ggplot(., aes(x=count/1000, y=type, color=vehicle_type))+
geom_boxplot()+
facet_wrap(. ~ method)+
labs(x = "舉發件數 (x 1000)",
y = "舉發種類",
title = "104年桃園市交通違規盒鬚圖",
subtitle = "前五名舉發種類、舉發方式及車種別")+
theme_minimal()+
theme(legend.position = "top",
text = element_text(family = "A"))+
guides(color=guide_legend(title="車種別", ncol = 3, reverse = T))
%>%
dat filter(year == 104) %>%
filter(type %in% c("違規停車",
"行車速度超速60公里以下",
"違規停車拖吊",
"闖紅燈直行左轉",
"不依規定轉彎"))%>%
group_by(month, type, method, vehicle_type) %>%
summarise(count=sum(count)) %>%
mutate(type=factor(type, levels=c("違規停車",
"行車速度超速60公里以下",
"違規停車拖吊",
"闖紅燈直行左轉",
"不依規定轉彎")),
vehicle_type=factor(vehicle_type, levels=c("機車",
"汽車",
"重型機車(250CC以上)")),
method=factor(method, levels=c("逕舉","攔停")))%>%
ggplot(., aes(x=count/1000, y=month, fill=vehicle_type))+
geom_bar(position="stack", stat="identity", orientation="y")+
scale_y_reverse(breaks = 1:12) +
facet_wrap(type ~ method, ncol=2, nrow=5)+
scale_fill_viridis(discrete = T) +
labs(x = "舉發件數 (x 1000)",
y = "月份",
title = "104年各月份桃園市交通違規累積件數",
subtitle = "前五名舉發種類、舉發方式及車種別")+
theme_minimal()+
theme(legend.position = "top",
text = element_text(family = "A"))+
guides(fill=guide_legend(title="車種別", ncol = 3, reverse = T))
## `summarise()` has grouped output by 'month', 'type', 'method'. You can override using the `.groups` argument.
%>%
dat filter(year == 104) %>%
filter(type %in% c("違規停車",
"行車速度超速60公里以下",
"違規停車拖吊",
"闖紅燈直行左轉",
"不依規定轉彎"))%>%
group_by(month, type, method, vehicle_type) %>%
summarise(count=sum(count)) %>%
mutate(type=factor(type, levels=c("違規停車",
"行車速度超速60公里以下",
"違規停車拖吊",
"闖紅燈直行左轉",
"不依規定轉彎")),
vehicle_type=factor(vehicle_type, levels=c("機車",
"汽車",
"重型機車(250CC以上)")),
method=factor(method, levels=c("逕舉","攔停")))%>%
ggplot(., aes(x=count/1000, y=month, fill=vehicle_type))+
geom_bar(position="fill", stat="identity", orientation="y")+
# geom_point()+
# geom_line()+
scale_y_reverse(breaks = 1:12) +
facet_wrap(type ~ method, ncol=2, nrow=5)+
scale_fill_viridis(discrete = T) +
labs(x = "舉發件數 (x 1000)",
y = "月份",
title = "104年各月份桃園市交通違規 比例",
subtitle = "前五名舉發種類、舉發方式及車種別")+
theme_minimal()+
theme(legend.position = "top",
text = element_text(family = "A"))+
guides(fill=guide_legend(title="車種別", ncol = 3, reverse = T))
## `summarise()` has grouped output by 'month', 'type', 'method'. You can override using the `.groups` argument.
## Warning: Removed 54 rows containing missing values (geom_bar).
<- type[1:5,1]#取出數量前五的種類
top5type <- data.frame()#產生空的框架
top5 for(i in 1:5){
<- type_month %>%
X filter(type == as.character(top5type[i,]))
<- rbind(top5,X)
top5 #重新取出數量前五種類的資料 }
<- list()#產生空的list
bar for(i in 1:12){
<- ungroup(top5) %>%
X filter(month == i)
<- ggplot(X,aes(x=type,y =count,fill = type))+
bar[[i]]geom_bar(stat="identity",width=.5,alpha = 0.8) +
geom_text(aes(label = count),size = 3.5) +
ggtitle(paste("104年",i,"月交通違規舉發")) +
theme(text = element_text(family = "A"))
print(bar[[i]] )
#產生1至12月的bars }
<- list()
trendline for(i in 1:5){
<- ungroup(top5) %>%
X filter(type == as.character(top5type[i,]))
<- ggplot(X,aes(x=month,y =count))+
trendline[[i]] geom_text(aes(label = count),size = 3.5) +
geom_line(colour = "darkred",size = 1,alpha=0.8) +
scale_x_continuous(breaks = 1:12) +
theme_bw() +
ggtitle(paste("104年",as.character(top5type[i,]),"趨勢線")) +
theme(text = element_text(family = "A"))
print(trendline[[i]] )
}#產生前5種類的trendllines
# 交通違規總數的月份趨勢圖:
<- dat %>%
month filter(year == 104) %>%
group_by(month) %>%
summarise(count = sum(count)) %>%
arrange(desc(count))
ggplot(month,aes(x=month,y =count))+
geom_text(aes(label = count),size = 3.5) +
geom_line(colour = "darkred",size = 1,alpha=0.8) +
scale_x_continuous(breaks = 1:12) +
theme_bw() +
ggtitle(paste("104年交通違規總數趨勢線")) +
theme(text = element_text(family = "A"))
# 針對車種的各月份趨勢圖:
ggplot(vtype_month,aes(x=month,y =count,colour=vehicle_type))+
geom_text(aes(label = count),size = 3.5) +
geom_line(size = 1,alpha=0.8) +
scale_x_continuous(breaks = 1:12) +
theme_bw() +
ggtitle(paste("104年交通違規車種趨勢線")) +
theme(text = element_text(family = "A"))
ggplot(method_month,aes(x=month,y =count,colour=method))+
geom_text(aes(label = count),size = 3.5) +
geom_line(size = 1,alpha=0.8) +
scale_x_continuous(breaks = 1:12) +
theme_bw() +
ggtitle(paste("104年交通違規舉發方式趨勢線")) +
theme(text = element_text(family = "A"))
<- dat %>%
month filter(year == 104) %>%
group_by(month) %>%
summarise(count = sum(count)) %>%
arrange(desc(count))
# 最後,我們針對前五種類的舉發種類與各月份做了熱度圖:
ggplot(ungroup(top5),aes(x=factor(month), y=factor(type), fill=count)) +
geom_tile(color="white", size=0.1) +
scale_fill_viridis(name="件數")+ #使用viridis
coord_equal()+
labs(x=NULL, y=NULL, title="104年交通違規熱度圖")+
theme_tufte(base_family="Helvetica")+ #使用ggthemes
theme(plot.title=element_text(hjust=0))+
theme(axis.ticks=element_blank())+
theme(axis.text=element_text(size=7))+
theme(legend.title=element_text(size=8))+
theme(legend.text=element_text(size=6)) +
theme(text = element_text(family = "A"))
Task: Use the free recall data to improve on the figure reported in Murdock, B. B. (1962). The serial position effect of free recall. Journal of Experimental Psychology, 64, 482-488.
untar("Murd62.data.tgz")
<- read.table("Murd62/fr10-2.txt", sep = "", col.names = c(1:10), fill = T, nrows = 1200)
dat10_2 "Group"] = "10-2"
dat10_2[
<- read.table("Murd62/fr15-2.txt", sep = "", col.names = c(1:15), fill = T)
dat15_2 "Group"] = "15-2"
dat15_2[
<- read.table("Murd62/fr20-1.txt", sep = "", col.names = c(1:20), fill = T)
dat20_1 "Group"] = "20-1"
dat20_1[
<- read.table("Murd62/fr20-2.txt", sep = "", col.names = c(1:20), fill = T)
dat20_2 "Group"] = "20-2"
dat20_2[
<- read.table("Murd62/fr30-1.txt", sep = "", col.names = c(1:30), fill = T)
dat30_1 "Group"] = "30-1"
dat30_1[
<- read.table("Murd62/fr40-1.txt", sep = "", col.names = c(1:40), fill = T)
dat40_1 "Group"] = "40-1"
dat40_1[
<- list(dat10_2, dat15_2, dat20_1, dat20_2, dat30_1, dat40_1) dat_recall
<- lapply(dat_recall,
dat3 function(x) {y <- x |>
::pivot_longer(cols = starts_with("X"),
tidyrnames_to = "Serial",
values_to = "Item")
<- as.data.frame(table(y$Item)/1200) |>
y ::mutate(Group = x$Group[1])
dplyr|>
} ) ::bind_rows() dplyr
$Group <- factor(dat3$Group)
dat3
# remove "88"
<- dat3[!dat3$Var1 == "88", ]
dat3_re <- dat3_re[!(dat3_re$Var1 == "16" & dat3_re$Group == "15-2"), ]
dat3_re <- dat3_re[!(dat3_re$Var1 == "31" & dat3_re$Group == "30-1"), ]
dat3_re <- dat3_re[!(dat3_re$Var1 == "41" & dat3_re$Group == "40-1"), ]
dat3_re <- dat3_re[!(dat3_re$Var1 == "50" & dat3_re$Group == "40-1"), ] dat3_re
<-dat3_re|>
dat3_remutate(Var1 = as.numeric(Var1),
effectgroup=ifelse(Freq>=0.5 & Var1>5,"Recency effect",
ifelse(Freq>=0.5 & Var1<5, "Primacy effect", "none")))
<-dat3_re|>
dat3_remutate(effectgroup=factor(effectgroup, levels=c("Primacy effect",
"none",
"Recency effect")))
ggplot(dat3_re, aes(x=Var1, y= Freq, group=Group, color=Group))+
geom_point()+
geom_line()+
::geom_text_repel(data=dat3_re %>% filter(Freq>0.9), # Filter data first
ggrepelaes(label=Group)) +
# geom_label(data=dat3_re %>% filter(Freq>0.9), # Filter data first
# aes(label=Group))+
scale_x_continuous(breaks=c(0, 10, 15, 20, 30, 40))+
scale_color_manual(name = "", values = c("gray10",
"gray20",
"gray30",
"gray40",
"gray50",
"gray60"))+
labs(x = "Serial position", y = "Proportion of recall")+
theme_minimal()+
theme(legend.position = "")
color by effect
ggplot(dat3_re, aes(x=Var1, y= Freq, group=Group, color=effectgroup))+
geom_point()+
geom_line()+
::geom_text_repel(data=dat3_re %>% filter(Freq>0.9), # Filter data first
ggrepelaes(label=Group),
color="black") +
# geom_label(data=dat3_re %>% filter(Freq>0.9), # Filter data first
# aes(label=Group))+
scale_x_continuous(breaks=c(0, 10, 15, 20, 30, 40))+
scale_color_manual(name = "Effect type",
values = c("pink","gray80", "tomato"))+
labs(x = "Serial position", y = "Proportion of recall")+
theme_minimal()+
theme(legend.position = "bottom")
Sarah Leo at the Economist magazine published a data set to accompany the story about how scientific publishing is dominated by men.
Task: The plot on the left panel below is the orignal graph that appeared in the article. Help her find a better plot.
<- read.csv("Economist_women-research.csv", header = T)
dat # remove the extra information
<- dat[-c(1, 14:19),]
dat
# variable rename
colnames(dat) <- c("Country", "Health", "Physical", "Engineering", "ComputerMaths", "Inventor")
# reset row name
rownames(dat) <- NULL
::kable(dat) knitr
Country | Health | Physical | Engineering | ComputerMaths | Inventor |
---|---|---|---|---|---|
Japan | 0.24 | 0.11 | 0.11 | 0.11 | 0.08 |
Chile | 0.43 | 0.23 | 0.22 | 0.16 | 0.19 |
United Kingdom | 0.45 | 0.21 | 0.22 | 0.21 | 0.12 |
United States | 0.46 | 0.2 | 0.22 | 0.22 | 0.14 |
Mexico | 0.46 | 0.25 | 0.26 | 0.22 | 0.18 |
Denmark | 0.47 | 0.22 | 0.23 | 0.18 | 0.13 |
EU28 | 0.48 | 0.25 | 0.25 | 0.22 | 0.12 |
France | 0.48 | 0.24 | 0.25 | 0.22 | 0.17 |
Canada | 0.49 | 0.21 | 0.22 | 0.22 | 0.13 |
Australia | 0.5 | 0.23 | 0.25 | 0.24 | 0.12 |
Brazil | 0.57 | 0.33 | 0.32 | 0.24 | 0.19 |
Portugal | 0.57 | 0.37 | 0.36 | 0.27 | 0.26 |
<- dat|>
dat_l pivot_longer(cols = c(2:6), # wide to long
names_to = "Field",
values_to = "Female") |>
mutate(Female=as.numeric(Female),
Male=1-Female) |>
pivot_longer(cols = c(3:4), # wide to long
names_to = "Gender",
values_to = "Proportion")|>
mutate(Field=factor(Field, levels=c("Health",
"Physical",
"Engineering",
"ComputerMaths",
"Inventor"),
labels = c("Health sciences",
"Physical sciences",
"Engineering",
"Computer science and maths",
"Inventores (patent applications)")),
Gender=factor(Gender, levels=c("Male",
"Female"),
labels = c("Men","Women")),
Country=factor(Country, levels=c("Portugal",
"Brazil",
"Australia",
"Canada",
"EU28",
"France",
"Denmark",
"Mexico",
"United States",
"United Kingdom",
"Chile",
"Japan")))
windowsFonts("Arial Narrow"=windowsFont("Arial Narrow"))
ggplot(dat_l, aes(x = Proportion, y = forcats::fct_rev(Country), fill = Gender, width=.5)) +
geom_col(position = "fill")+
scale_fill_manual(name = "Gender",
values = c("tomato", "pink"))+
geom_vline(xintercept = 0.5,color = "firebrick", lwd = 0.5, lty = 1) +
geom_vline(xintercept = 0.25,color = "firebrick4", lwd = 0.5, lty = 3) +
facet_wrap(.~Field, nrow=1)+
labs(x="Proportion (%)",
y="",
title=expression(paste(bold("Still a man's world"))),
subtitle = "Women among researchers with papers published* (index in Scopus) in 2011-2015",
caption = expression(paste(' Source: "Gender in the Global Research Landscape" by Elsevier;',
italic("The Economist"))))+
::theme_ipsum()+
hrbrthemestheme(legend.position = "top",
axis.text.x = element_text(size = 6),
strip.text.x = element_text(size = 7.5))+
guides(fill=guide_legend(ncol = 2, reverse = TRUE))
<-glm(Gender~Country+Field+Proportion, data=dat_l, family = binomial(link = "logit"))
m1
<- dat_l|>
dat_l mutate(odd=Proportion/(1-Proportion),
fit=m1$fitted.values,
resid=m1$residuals)
ggplot(subset(dat_l, Gender=="Women"), aes(x = odd, y = forcats::fct_rev(Country), color=Field)) +
geom_vline(xintercept = 1,color = "firebrick", lwd = 0.5, lty = 1) +
geom_point(size=1.5)+
labs(x="Odds(Women to Men)",
y="Country",
title=expression(paste(bold("Still a man's world"))),
subtitle = "Women among researchers with papers published* (index in Scopus) in 2011-2015",
caption = expression(paste(' Source: "Gender in the Global Research Landscape" by Elsevier;',
italic("The Economist"))))+
::theme_ipsum()+
hrbrthemestheme(legend.position = "top")+
guides(color=guide_legend(ncol = 5, reverse = T))