R
的繪圖指令,包括ggplot2
的套件中的指令,以及lattice
套件,讓同學熟悉資料視覺化。雖然之前我們已經看過很多例子,但是仍然有必要詳細介紹包括直方圖、折線圖、散佈圖等等圖形,結合dplyr
套件中的指令,以培養視覺化資料的能力,並且在圖形上面顯示統計結果。例如我們可以視覺化資料的分佈並且找出每一個類別的平均數如圖 1.1:
file<-here("data","studentsfull.txt")
students<-read.table(file, sep='', header=TRUE)
p <- ggplot(students, aes(x=Department, y=Score)) +
geom_point() +
theme_classic()
p + ggplot2::stat_summary(fun.y='mean', colour='red',size=3, geom = "point")
Figure 1.1: 類別平均值的散佈圖
tmp1 <- nycflights13::weather
tmp1 <- tmp1[which(tmp1$origin=='JFK'), ]
tmp2 <- tmp1 %>%
group_by(month) %>%
summarise(Temp=mean(temp))
p1 <- ggplot(aes(x=as.factor(month), y=Temp, group=1), data = tmp2) +
geom_line(stat = 'identity') +
labs(x = "Month", y = "Temperature")
p1 + geom_emoji(emoji = "1f31e")
Figure 1.2: Emoji的折線圖
emoGG
套件,語法如下:
# downloading the package from GitHub
devtools::install_github(
repo = "dill/emoGG", # package path on GitHub
dependencies = TRUE, # assumes you have already installed needed packages
quick = TRUE # skips docs, demos, and vignettes)
#read data
file<-here::here('data','PP2197E5C.sav')
dat<-sjlabelled::read_spss(file)
library(reshape2)
dat$bluegreen<-car::recode(dat$partyid, "1=1;2=2;3=1;
4=1;5=2;6=2;7=3;8:9=4")
#function
myrecode <- function(x){
recode(x, '1'=0, '2'=0, '3'=1, '4' = 1)
}
dat<-dat %>% mutate_at(c("Q1","Q2","Q3","Q5","Q7","Q10", "Q11","Q15","Q18","Q19","Q20","Q21","Q22","Q23",
"Q24","Q25"), myrecode)
varname<-c(1,2,3,5,7,10,11,15,18,19,20,21,22,23,24,25)
S<-paste('Q',varname, sep='')
listtable<-lapply(dat[,S], function(x) xtabs(~x + dat$bluegreen))
MyList <- lapply(listtable, prop.table, 2)
k <- matrix(rep(NA,64),nrow=16,ncol=4)
for (i in 1:16){
k[i,]<-as.vector(MyList[[i]][2,])
}
k<-as.data.frame(k)
#data set
rownames(k)<-NULL
colnames(k)<-c("Pan-blue","Pan-green","TPP","IND")
#reverse the order of questions so Q1 changes to Q25
k$Q<-noquote(paste('Q', rev(varname), sep=''))
tmpo<-melt(k, id.vars=c("Q"))
tmpo$Qnew <-as.character(tmpo$Q)
#tmpop$value<-100*tmpop$value
tmpo<-tmpo%>% mutate(party=variable,
value=100*value,
Qnew=factor(Qnew,levels=S))
showtext::showtext_auto()
ylabel<-c('滿意蔡英文','滿意蘇貞昌','滿意陳時中','滿意紓困措施','校正回歸適合','對指揮中心信心','對政府防疫信心','地方可以買疫苗','願打中國疫苗','願打代理歐美疫苗','滿意買疫苗努力','指揮中心決定正確',
'指揮中心考慮民眾福利',
'相信陳時中','政府可以用個資','擔心資安被侵害')
G1<-ggplot(data=tmpo, aes(x=Qnew,y=value,label=value)) +
geom_point(stat="identity", aes(color=party), size = 6) +geom_text(aes(label=sprintf("%0.1f", value)),vjust = 2, size=3) +
labs(caption='資料來源:請見中華民國政大選研之友協會臉書粉絲頁version 4',
title='', x='', y='') +
geom_vline(xintercept = 50, linetype='dashed') +
theme_economist() +
theme(text=element_text(size=14,family='Georgia'),
plot.caption = element_text(size=11),
axis.text.x = element_text(size=14, family='YouYuan'),
plot.title = element_text(family='YouYuan', size=10),
legend.title = element_blank()) +
guides(fill=FALSE) +
coord_flip()+
scale_color_manual(values=c('#3322EE','#2AA111','white','yellow','#CDC1B2', '#B1000B'))+
scale_x_discrete(breaks=S,
labels=rev(ylabel))
G1
Figure 1.3: 民眾對疫情看法
R
內建了許多基本的繪圖指令,讓我們可以很輕鬆地畫出專業的圖形,但是ggplot2
也有許多技巧,所以我們必須學習並且熟練這些語法。一旦熟悉之後,你可以客製化你想要的感覺。
R
的基礎繪圖功能中,繪圖的元素有「點」、「線」、「大小」、「粗細」以及「顏色」等等。
xn<-c(1:10)
yn<-c(1:10)
col<-c(80, 75, 70, 60, 50, 40, 30, 20, 10, 0)
graycol<-paste("gray",col,sep="")
graycol
## [1] "gray80" "gray75" "gray70" "gray60" "gray50" "gray40" "gray30" "gray20"
## [9] "gray10" "gray0"
plot(xn, yn,
cex=c(rep(1, 5), rep(2,5)),
col=graycol,
pch=c(1,2,5,6,8, 16, 17, 20, 22, 24),
xlab="pch", ylab="cex",
xaxt="n", yaxt="n",
xlim=c(0,11),ylim=c(0,11), font.lab=2, cex.lab=1.5)
axis(1, at=c(1:10), labels=c(1,2,5,6,8, 16, 17, 20, 22, 24))
axis(2, at=c(1:10), labels=c(rep(1, 5), rep(2,5)))
abline(v=6, lwd=2, col='red'); abline(h=6, lwd=3, lty=2)
Figure 2.1: 圖型的元素
with(anscombe, plot(x1, y1, pch=16, cex=2, col='blue',
xlab='X', ylab='Y', main='x1-y1', xaxt='n'))
axis(side=1, labels=c(1,"" ,"" ,"" ,"" ,6), at=c(4,6,8,10,12,14))
Figure 2.2: 散佈圖一
TXT<-here("data","studentsfull.txt")
students<-read.table(TXT, header=T, sep='')
stu <- table(students$Department)
kableExtra::kable_styling(knitr::kable(stu, caption="科系分佈的次數"))
Var1 | Freq |
---|---|
Aerospace | 4 |
Chemistry | 3 |
Economics | 4 |
English | 3 |
Journalism | 4 |
Mechanics | 5 |
Physics | 3 |
barplot(stu, main="Departments", xlab="", ylab="frequency",
cex.names = 0.4)
Figure 2.3: 直方圖一
diamonds
這筆資料的cut變數,顯示其直方圖 2.4:
o.par <- par()
par(bg = '#a0a1c2')
barplot(table(diamonds$cut))
Figure 2.4: 鑽石切割等級的直方圖
par(o.par)
plot.new()
barplot(100*stu/nrow(students), main="Departments",
xlab="", ylab="Percent", border='red',
col='#0011EE22', ylim=c(0, 20), cex.names = 0.5)
Figure 2.5: 直方圖二
student.table <- table(students$Gender, students$Department)
barplot(100*prop.table(student.table, margin=2),
col=c('brown', 'white'), cex.axis= 0.5,
legend = levels(students$Gender))
Figure 2.6: 直方圖三
student.table <- table(students$Gender, students$Department)
barplot(100*prop.table(student.table, margin=2),
col=c('blue', '#EE330011'), cex.axis=0.7)
legend("top", fill=c('blue', '#EE330011'), c("M","F"))
Figure 2.7: 直方圖與圖例
par(xpd=T, mar=par()$mar+c(0,0,0,6))
TXT<-here::here("data","studentsfull.txt")
students<-read.table(TXT, header=T, sep='')
stu <- table(students$Department)
student.table <- table(students$Gender, students$Department)
barplot(student.table, col=c("red", "blue"),
legend=c('female','male'))
Figure 2.8: 圖例在圖外面
pt.students<-100*prop.table(student.table, 2)
barplot(pt.students, col=c("salmon1", "royalblue1"),
ylim=c(0,100),xlim=c(0.3,8), cex.lab=0.1)
legend(8.5, 90, c('female','male'),
col=c("salmon1", "royalblue1"),
pch=c(20,20), bty='n')
Figure 2.9: 圖例在圖外面
barplot(100*prop.table(student.table, margin=2),
names.arg=c("Aer","Che.", "Eco.", "Eng.", "Jou.", "Mec.","Phy."),
cex.labs=0.7)
Figure 2.10: 直方圖四
☛請嘗試讀取PP0797B2.sav這筆資料,然後畫長條圖表示partyid這個變數中,政黨各類別的相對次數。
boxplot(mtcars$mpg, ylim=c(0,40), yaxt='n')
Figure 2.11: 箱型圖一
quantile(mtcars$mpg, c(.25, .5, .75), type=6)
## 25% 50% 75%
## 15.27 19.20 22.80
R
計算如下:
qu<-quantile(mtcars$mpg, c(.25, .5, .75), type=7)
qu
## 25% 50% 75%
## 15.43 19.20 22.80
upper<- qu[3]+1.5*(qu[3]-qu[1])
lower<- qu[1]-1.5*(qu[3]-qu[1])
cat("upper", upper); cat("/", "lower", lower)
## upper 33.86
## / lower 4.363
# upper inner fence
min(max(mtcars$mpg), upper)
## [1] 33.86
# lower inner fence
max(min(mtcars$mpg), lower)
## [1] 10.4
boxplot(mtcars$mpg, ylim=c(0,40), yaxt='n')
axis(2, at=c(1:40, by=5), labels=c(1:40,by=5))
Figure 2.12: 盒型圖二
boxplot(state.area, ylab="Area of State")
Figure 2.13: 盒型圖三
y<-state.abb
identify(rep(1, length(y)), y, labels=seq_along(y)))
Orange$tree <-ordered(Orange$Tree, levels=c(1,2,3,4,5))
with(Orange, boxplot(circumference ~ tree))
Figure 2.14: 盒型圖四
☛請試著畫ISLR
套件中
par(mfrow=c(1,2))
hist(USArrests$Assault, col="tomato2", main="breaks_default")
hist(USArrests$Assault, breaks = 15, col="tomato3",main="breaks_15")
Figure 2.15: 長條圖一
hist(USArrests$Assault, col="tomato4", freq = F,
xlab="Assault", main="Assault in 50 States", breaks = 10)
Figure 2.16: 長條圖二
R
會自動挑選適合該變數分佈離散程度的寬度,breaks參數越大,寬度越小,有可能出現某一間隔沒有任何觀察值之狀況。但是寬度越大,變數的分佈越粗略。
par(mfrow=c(1,2))
y <- c()
for (i in 1:1000)
{x=rnorm(1000,0,1)
x.sample <- sample(x, 100)
y[i]=mean(x.sample)}
hist(y, 10, probability = T)
rug(jitter(y))
hist(y, 50, probability = T)
rug(jitter(y))
Figure 2.17: 兩個長條圖並列
par(mfrow=c(1,1))
set.seed(02138)
y <- c()
for (i in 1:1000)
{x=rnorm(1000,0,1)
x.sample <- sample(x, 100)
y[i]=mean(x.sample)}
hist(y, 100, probability = T, col="gray90")
curve(dunif(x, min=min(y), max=max(y)), add=T, col="blue", lwd=2)
curve(dnorm(x, mean=mean(y), sd=sd(y)), add=T, col="red", lwd=2)
Figure 2.18: 加上機率密度曲線的長條圖
Y<-ggplot2::diamonds$price/1000
hist(Y, freq = F , breaks=25)
lines(density(Y), lty=2,
lwd=2, col="#e122aa")
curve(dchisq(x, df=4), add=T,
lwd=2, col="darkblue")
curve(dexp(x, rate=.25), lwd=2, add=T)
Figure 2.19: 鑽石價格機率密度
with(airquality, plot(Wind, type='b'))
Figure 2.20: 空氣品質折線圖一
with(airquality, plot(Ozone, type='o', pch=16,
cex=1.2, lty=2, lwd=2, col='red'))
Figure 2.21: 空氣品質折線圖二
plot(LakeHuron, type = "o", pch=16, cex=1.2, lty=2) ## Index plot
Figure 2.22: Lake Huron折線圖1
library(tsbox)
ts_plot(LakeHuron)
Figure 2.23: Lake Huron折線圖2
x <- as.vector(airquality$Wind)
my_ts <- ts(x, frequency = 31, start=c(5,1),
end=c(9,30))
plot.ts(my_ts, type='b')
Figure 2.24: 風速趨勢圖
圖 2.24顯示5, 6, 7, 8, 9這五個月的風力。
第二個例子是用我們自己收集的統獨趨勢資料,裡面有統一、獨立、維持現狀、無反應等四個變數,也就是要畫四條線。首先轉換資料:
CSV<-here::here("data","Tondutrend.csv")
trend<-read.csv(CSV, header=T, sep=",")
tonduts<-ts(trend, start=c(1992.1), frequency=2)
tonduts
## Time Series:
## Start = 1992.1
## End = 2020.1
## Frequency = 2
## 統一 維持現狀 獨立 拒答
## 1992 2.40 66.10 7.70 23.80
## 1993 18.60 42.20 4.00 35.20
## 1993 20.40 51.40 11.40 16.70
## 1994 NA NA NA NA
## 1994 NA NA NA NA
## 1995 NA NA NA NA
## 1995 19.10 46.10 11.10 23.80
## 1996 22.30 41.10 12.20 24.40
## 1996 21.00 48.50 14.10 16.40
## 1997 27.80 47.10 16.10 9.00
## 1997 22.80 50.80 17.90 9.00
## 1998 20.50 49.90 17.40 12.20
## 1998 20.30 50.50 20.00 9.20
## 1999 18.80 53.80 18.90 8.50
## 1999 13.30 58.80 17.80 10.10
## 2000 20.80 52.50 16.20 10.60
## 2000 26.80 51.10 9.00 13.00
## 2001 23.80 55.50 13.30 7.40
## 2001 26.20 52.50 15.40 6.00
## 2002 21.50 53.50 17.90 7.10
## 2002 21.60 52.90 18.80 6.80
## 2003 17.60 56.60 19.20 6.60
## 2003 14.90 55.90 21.60 7.70
## 2004 12.80 60.20 19.60 7.40
## 2004 14.10 57.70 20.70 7.50
## 2005 15.70 59.30 19.80 5.20
## 2005 16.20 58.40 20.10 5.20
## 2006 14.90 60.90 18.50 5.70
## 2006 15.90 60.70 18.60 4.80
## 2007 18.20 53.30 23.10 5.40
## 2007 12.90 58.20 20.80 8.00
## 2008 11.50 59.60 20.40 8.50
## 2008 10.50 59.60 25.70 4.10
## 2009 10.20 63.10 21.30 5.50
## 2009 10.60 63.90 19.60 5.90
## 2010 12.30 61.00 22.80 3.80
## 2010 10.00 64.60 21.00 4.40
## 2011 11.10 61.30 23.00 4.60
## 2011 10.80 63.70 19.10 6.40
## 2012 10.70 64.80 19.10 5.40
## 2012 11.10 62.80 19.60 6.50
## 2013 12.30 59.20 23.30 5.20
## 2013 11.60 61.30 21.70 5.40
## 2014 10.90 60.80 22.90 5.40
## 2014 8.80 61.60 23.60 5.90
## 2015 9.60 61.60 20.50 8.40
## 2015 10.80 60.60 22.20 6.40
## 2016 10.00 61.30 22.80 5.90
## 2016 11.40 60.10 22.50 5.90
## 2017 13.20 58.70 23.10 5.00
## 2017 14.40 59.80 19.90 5.90
## 2018 16.50 58.10 20.00 5.50
## 2018 17.50 57.90 19.50 5.20
## 2019 10.60 57.60 25.80 6.00
## 2019 7.50 57.60 28.00 6.98
## 2020 8.61 56.56 30.30 4.54
## 2020 7.83 56.08 30.96 5.13
R
的優點在於允許時間點有缺漏,畫出來的圖形就有缺少某個或是某些時間點,不會強制連接前後兩個時間點。接下來是畫折線圖。圖 2.23 顯示2004年以來統獨趨勢一直相當穩定,但是在2016年之後偏統的支持度似乎上升,到了2019年又下降,而偏獨立的民意上升:
# Margin
par(xpd=NA, mar=par()$mar+c(2.5, 0, 0, 0),family='HanWangKaiMediumChuIn')
# Plot
plot(tonduts, plot.type=c("single"), lty=c(2,3,1,2),ylab="%",xlab=NULL,pch='1', lwd=3,frame.plot=F,col=c("gray20","gray60", "black", "gray80"),xaxt="n", main="台灣民眾的統獨立場, 1992.6-2021.12")
axis(1, at=seq(1992,2021,by=2))
axis(2, at=seq(10,70,by=10))
legend("bottomright", c("統一","維持現狀"),inset=c(0.35, -0.6), col=c("gray20","gray60"),lty=c(2,3), bty='n', lwd=3)
legend("bottomright", c("獨立","無反應"), inset=c(0, -0.6), col=c("black", "gray80"),lty=c(1,2), bty='n', lwd=3)
text(2000, 30, paste("第一次政黨輪替"))
Figure 2.25: 統獨趨勢折線圖1
R
會出現figure margin too large的錯誤訊息,這時候請調整一下par(mar=c()),例如par(mar=c(-0.2,-0.2,0,0))
par(family='TaipeiSansTCBeta')
png('./Fig/unplot.png')
ts_plot(tonduts, title='')
dev.off()
gra1<-here::here('Fig','unplot.png')
knitr::include_graphics(gra1, dpi=100)
library(here);library(foriegn)
CSV<-here("data","Tondutrend.csv")
trend<-read.csv(CSV, header=T, sep=",")
tonduts<-ts(trend, start=1992.6, frequency=2)
par(xpd=NA, mar=par()$mar+c(2.5, 0, 0, 0),
family='HanWangKaiMediumChuIn')
library(here);library(foriegn);library(showtext)
showtext_auto()
font.add("細明體", "MingLiu.ttf")
par(xpd=NA,
mar=par()$mar+c(0, 0, 0, 0),
family='細明體')
☛請畫圖表示MASS::accdeaths的趨勢。
with(anscombe, plot(x1, y1, pch=16, cex=1.2, col='gray25'))
Figure 2.26: 散佈圖
with(anscombe, summary(x1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.0 6.5 9.0 9.0 11.5 14.0
with(anscombe, summary(y1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.26 6.32 7.58 7.50 8.57 10.84
x<-c(1: max(anscombe$x1))
y<-c(1: max(anscombe$x1))
plot(x, y, type='n')
with(anscombe, points(x1, y1, pch=16, cex=2, col='gray25'))
Figure 2.27: 散佈圖二
x<-c(1: max(anscombe$x1))
y<-c(1: max(anscombe$x1))
plot(x, y, type='n')
with(anscombe, points(x1[which(x1<=8)],
y1[which(x1<=8)], pch=16, cex=2, col='red'))
with(anscombe, points(x1[which(x1>8)],
y1[which(x1>8)], pch=10, cex=2, col='#ee0022'))
abline(v=8, lwd=2, lty=3)
Figure 2.28: 散佈圖三
#RRGGBB
的顏色。可以查這個網站輸入Hex6位數找到喜歡的顏色。
plot(x, y, type='n')
with(anscombe, points(x1[which(x1<=8)],
y1[which(x1<=8)], pch=16, cex=2, col='red'))
with(anscombe, points(x1[which(x1>8)],
y1[which(x1>8)], pch=16, cex=2, col='darkblue'))
abline(v=8, lwd=2, lty=3)
with(anscombe, text(x1, y1,
c(1: nrow(anscombe)), pos=4))
Figure 2.29: 散佈圖四
plot(x, y, type='n', xlim=c(1, 16), xaxt='n')
axis(1, labels = c(2:15), at=c(2:15))
with(anscombe, points(x1[which(x1<=8)], y1[which(x1<=8)], pch=16,
cex=2, col='red'))
with(anscombe, points(x1[which(x1>8)], y1[which(x1>8)], pch=22,
cex=2, col='darkblue'))
abline(v=8, lwd=2, lty=3)
with(anscombe, text(x1, y1, paste(x1, y1, sep=","), pos=4))
Figure 2.30: 散佈圖五
x77 <- data.frame(state.x77)
with(x77, plot(Income, HS.Grad))
with(x77, text(Income, HS.Grad, rownames(x77), adj = -0.2, col='blue'))
Figure 2.31: 各州收入與高中畢業比例散佈圖
☛ 請根據ISLR::College這筆資料的Top10perc排序,篩選出新生為高中前10%畢業生的比率最高的前10名學校,觀察錄取率(Accept/Apps)與曾捐款的校友比率(perc.alumni)的關係,然後註記學校的名稱。
with(anscombe, symbols(x2, y1, circles=anscombe$x1, inches=0.2, fg='blue'))
Figure 2.32: 特殊點狀圖一
with(USArrests, symbols(Murder, Assault, circles=UrbanPop, inches=0.12, bg="red"))
Figure 2.33: 特殊點狀圖二
m.arrest<-with(USArrests, lm(Assault ~ Murder+UrbanPop))
summary(m.arrest)
##
## Call:
## lm(formula = Assault ~ Murder + UrbanPop)
##
## Residuals:
## Min 1Q Median 3Q Max
## -107.78 -25.99 -1.97 22.49 111.82
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -23.620 33.217 -0.71 0.481
## Murder 15.071 1.572 9.59 1.2e-12 ***
## UrbanPop 1.175 0.473 2.48 0.017 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47.8 on 47 degrees of freedom
## Multiple R-squared: 0.684, Adjusted R-squared: 0.671
## F-statistic: 51 on 2 and 47 DF, p-value: 1.69e-12
☛請畫圖表示MASS::Boston這筆資料中,與生師比(ptratio)與房價中位數(medv)的關係,並且考慮低社會地位人口比例(lstat)的作用。
#設定路徑
setwd(here::here())
setwd(here::here('Fig'))
#設定圖形輸出的檔案名稱與大小
png("pie.png", width=4, height = 6, units = 'in', res=300)
#繪圖
pie(table(ISLR::Auto$origin))
#結束
dev.off()
quartz_off_screen 2
接下來我們介紹用ggplot2
繪圖。基本的指令為:
ggplot2
的例子。ggplot(ChickWeight, aes(x=Diet)) +
geom_bar(fill='#ee22ffaa') +
ggtitle('Diet')
Figure 3.1: ggplot2直方圖1
我們可以在
在產生直方圖時,我們也可以使用
ggplot(ChickWeight, aes(x=Diet)) +
geom_bar(stat='count', fill='#11ff44cc') +
ggtitle('Diet')
Figure 3.2: ggplot2直方圖2
stu<-here::here('data','studentsfull.txt')
students<-read.table(stu, header=T, sep='')
stu.t <- as.data.frame(table(students$Department, dnn=c("Dep")))
stu.t
## Dep Freq
## 1 Aerospace 4
## 2 Chemistry 3
## 3 Economics 4
## 4 English 3
## 5 Journalism 4
## 6 Mechanics 5
## 7 Physics 3
g1 <- ggplot(stu.t, aes(x=Dep, y=Freq)) +
geom_bar(stat = 'identity', fill="#44bb33") + ylab('Count')
g1
Figure 3.3: ggplot2直方圖3
ggplot(students, aes(x=Department)) +
geom_bar(fill='#33aacc')
Figure 3.4: ggplot2直方圖4
dt <- scan(what=list(company="character",
marketvalue="numeric"))
#Apple 851
#Microsoft 703
#Amazon 701
#Facebook 464
#DT<-data.table::setDT(dt)
dt <- data.frame(company=c("Apple", "Microsoft",
"Amazon", "Facebook"),
marketvalue=c(851,703,701,464))
DT <- dt %>% mutate(company=as.factor(company),
marketvalue=as.numeric(marketvalue))
ggplot(DT, aes(x=reorder(company, -marketvalue),
y=marketvalue)) +
geom_bar(aes(fill=company),stat="identity") +
theme_classic()
Figure 3.5: 次數分配轉成直方圖
ggplot2
裡面用☛請由小而大,呈現reshape2
的
ggplot(data=tips, aes(x=time)) +
geom_bar(aes(y=(..count..)/sum(..count..)),
stat="count", fill="#ee0011") +
scale_y_continuous(label = scales::percent)
Figure 3.6: 加上百分比的直方圖
g1 + geom_text(data=stu.t, aes(x=Dep, label=Freq, y=Freq+0.2), size=8)
Figure 3.7: 加上資料標籤的直方圖
# percentage
stu.t <- stu.t %>% mutate(prop.d=round(Freq/sum(Freq),3))
g2 <- ggplot(stu.t, aes(x = Dep, y=prop.d, fill=Dep)) +
geom_bar(stat="identity", position="dodge",
color="black", width = 0.75)+
geom_text(aes(x=Dep, y=prop.d,
label=sprintf("%1.1f%%", 100*prop.d)), size=8, colour='gray40', hjust=1) +
scale_y_continuous(labels = scales::percent) +
labs(y='%') +
coord_flip() +
theme(legend.position = 'none')
g2
Figure 3.8: 加上資料標籤的水平直方圖
salary<-alr4::salary
ggplot(salary, aes(x=rank)) +
stat_count(geom='bar', fill='#0aaee1') +
geom_text(aes(label=..count..), stat='count', colour='white', vjust=1.6)
Figure 3.9: ggplot2直方圖三
salary<-alr4::salary
ggplot(salary, aes(x=rank)) +
geom_bar(fill='#0aaee1') +
geom_text(aes(label=..count..), stat='count', colour='white', vjust=1.6)
Figure 3.10: ggplot2直方圖四
#ggplot2
p1<-ggplot(salary)+geom_bar(aes(x=rank, y=stat(prop), group=1), fill='#bb00cc')
#layer
p1+stat_count(aes(x=rank,y=stat(prop), group =1, label=round(100*stat(prop),digits = 3)),
geom='text', color='white',vjust=1.3) +
scale_y_continuous(label = scales::percent)
Figure 3.11: ggplot2直方圖五
full<-here::here('data','studentsfull.txt')
students<-read.table(full, header=T, sep='')
stu <- dplyr::mutate(students, Pass=(Score>70))
kableExtra::kable_styling(knitr::kable(stu,caption ='學生成績資料'))
ID | Name | Department | Score | Gender | Pass |
---|---|---|---|---|---|
10322011 | Ariel | Aerospace | 78 | F | TRUE |
10325023 | Becky | Physics | 86 | F | TRUE |
10430101 | Carl | Journalism | 69 | M | FALSE |
10401032 | Dimitri | English | 83 | M | TRUE |
10307120 | Enrique | Chemistry | 80 | M | TRUE |
10207005 | Fernando | Chemistry | 66 | M | FALSE |
10305019 | George | Mechanics | 75 | F | TRUE |
10305022 | Howell | Mechanics | 81 | M | TRUE |
10305029 | Ian | Mechanics | 60 | M | FALSE |
10305031 | Julio | Mechanics | 89 | M | TRUE |
10322014 | Kaori | Aerospace | 82 | F | TRUE |
10425026 | Luke | Physics | 88 | M | TRUE |
10401022 | Miguel | English | 92 | M | TRUE |
10501006 | Neo | English | 77 | M | TRUE |
10321010 | Olivia | Economics | 85 | F | TRUE |
10321011 | Peter | Economics | 88 | M | TRUE |
10405017 | Qing | Mechanics | 88 | F | TRUE |
10422007 | Ricky | Aerospace | 91 | M | TRUE |
10422008 | Seiko | Aerospace | 80 | F | TRUE |
10430005 | Terresa | Journalism | 62 | F | FALSE |
10530009 | Usla | Journalism | 87 | F | TRUE |
10421001 | Vivian | Economics | 70 | F | FALSE |
10307018 | Wendy | Chemistry | 85 | F | TRUE |
10425003 | Physics | 93 | M | TRUE | |
10221030 | Yoko | Economics | 66 | F | FALSE |
10430015 | Zoe | Journalism | 92 | F | TRUE |
ggplot(stu, aes(x=Department, fill=Pass)) +
geom_bar(position='dodge')
Figure 3.12: 並列直方圖
library(dplyr)
stu <- students %>% mutate(Pass=(Score>70))
stu.ag <- summarize(group_by(stu, Department, Pass), Count=n())
stu.ag <- stu.ag %>% mutate(Pct=Count/sum(Count))
kableExtra::kable_styling(knitr::kable(stu.ag, caption='學生成績統計'))
Department | Pass | Count | Pct |
---|---|---|---|
Aerospace | TRUE | 4 | 1.0000 |
Chemistry | FALSE | 1 | 0.3333 |
Chemistry | TRUE | 2 | 0.6667 |
Economics | FALSE | 2 | 0.5000 |
Economics | TRUE | 2 | 0.5000 |
English | TRUE | 3 | 1.0000 |
Journalism | FALSE | 2 | 0.5000 |
Journalism | TRUE | 2 | 0.5000 |
Mechanics | FALSE | 1 | 0.2000 |
Mechanics | TRUE | 4 | 0.8000 |
Physics | TRUE | 3 | 1.0000 |
ggplot(stu.ag, aes(x=Department, y=Pct, fill=Pass)) +
geom_bar(stat='identity') +
scale_y_continuous(label = scales::percent)
Figure 3.13: ggbarplot2堆疊直方圖
newstu<-stu.ag[stu.ag$Pass==TRUE,]
ggplot(newstu, aes(x=Department, y=Pct, fill=Pass)) +
geom_bar(stat='identity') +
scale_y_continuous(label = scales::percent)
Figure 3.14: ggplot2堆疊直方圖
janitor
套件中有一個產生次數分配的函式:%>%
這個符號可以結合ggplot2
,例如:
t1.g <- students %>%
janitor::tabyl(Department) %>%
ggplot() + aes(x=Department,y=n, group=1) +
geom_bar(stat='identity', fill='#ABC013')
t1.g
Figure 3.15: tabyl直方圖ㄧ
ggplot2
的資料,產出圖3.15。
t1 <- students %>%
janitor::tabyl(Department)
kable(t1, caption='學生系別次數分配')
Department | n | percent |
---|---|---|
Aerospace | 4 | 0.1538 |
Chemistry | 3 | 0.1154 |
Economics | 4 | 0.1538 |
English | 3 | 0.1154 |
Journalism | 4 | 0.1538 |
Mechanics | 5 | 0.1923 |
Physics | 3 | 0.1154 |
t2 <- students %>%
janitor::tabyl(Department) %>%
ggplot() + aes(x=Department,y=percent, group=1) +
geom_bar(stat='identity', fill='#11FF00AA') +
scale_y_continuous(label = scales::percent)
t2
Figure 3.16: tabyl直方圖二
t3 <- students %>%
janitor::tabyl(Department, Gender) %>%
janitor::adorn_percentages('col')
kableExtra::kable_styling(knitr::kable(t3, caption='性別與科系交叉表'))
Department | F | M |
---|---|---|
Aerospace | 0.2308 | 0.0769 |
Chemistry | 0.0769 | 0.1538 |
Economics | 0.2308 | 0.0769 |
English | 0.0000 | 0.2308 |
Journalism | 0.2308 | 0.0769 |
Mechanics | 0.1538 | 0.2308 |
Physics | 0.0769 | 0.1538 |
t3.n <- reshape2::melt(t3)
kable(t3.n)
Department | variable | value |
---|---|---|
Aerospace | F | 0.2308 |
Chemistry | F | 0.0769 |
Economics | F | 0.2308 |
English | F | 0.0000 |
Journalism | F | 0.2308 |
Mechanics | F | 0.1538 |
Physics | F | 0.0769 |
Aerospace | M | 0.0769 |
Chemistry | M | 0.1538 |
Economics | M | 0.0769 |
English | M | 0.2308 |
Journalism | M | 0.0769 |
Mechanics | M | 0.2308 |
Physics | M | 0.1538 |
ggplot2
得到以下的堆疊圖3.17:
t3.g <- students %>%
janitor::tabyl(Department, Gender) %>%
janitor::adorn_percentages('col') %>%
melt() %>%
ggplot() + aes(x=Department,y=value, fill=variable) +
geom_bar(stat='identity') +
scale_y_continuous(label = scales::percent)
t3.g
Figure 3.17: tabyl直方圖三
t4 <- students %>%
janitor::tabyl(Department, Gender) %>%
janitor::adorn_percentages('row') %>% adorn_ns()
kableExtra::kable_styling(knitr::kable(t4, caption='性別與系別交叉表(系為100%)'))
Department | F | M |
---|---|---|
Aerospace | 0.7500 (3) | 0.2500 (1) |
Chemistry | 0.3333 (1) | 0.6667 (2) |
Economics | 0.7500 (3) | 0.2500 (1) |
English | 0.0000 (0) | 1.0000 (3) |
Journalism | 0.7500 (3) | 0.2500 (1) |
Mechanics | 0.4000 (2) | 0.6000 (3) |
Physics | 0.3333 (1) | 0.6667 (2) |
%>%
串連,送到ggplot2
,就大功告成,見圖3.18:
t4.g <- students %>%
janitor::tabyl(Department, Gender) %>%
janitor::adorn_percentages('row') %>%
melt() %>%
ggplot()+
aes(x=Department,y=value,fill=variable) +
geom_bar(stat='identity') +
scale_y_continuous(label = scales::percent) +
scale_fill_manual(values=c('#C4106A','#12026C'))
t4.g
Figure 3.18: tabyl直方圖四
☛請用
ggplot2
的長條圖使用的函數為ggplot(mtcars, aes(x=mpg)) +
geom_histogram(stat='bin', binwidth=1, fill="#a310ec")
Figure 3.19: ggplot2長條圖一
ggplot(mtcars, aes(x=wt, fill=as.factor(am))) +
geom_density(position="identity", alpha=.4)
Figure 3.20: ggplot2長條圖二
☛請畫出
\[\frac{H+2\times Double+3\times Triple+4\times HR}{AB}\]
ggplot(mtcars, aes(x=wt, y=mpg)) +
geom_point()
Figure 3.21: ggplot2散佈圖一
set.seed(02138)
ggplot(data=data.frame(x=rnorm(100, 1, 1.5),y=rnorm(100,0,1)),
aes(x=x,y=y))+
geom_point(col="#b10a2c", shape=5, size=2)
Figure 3.22: ggplot2散佈圖二
ggplot(mtcars, aes(x=wt, y=mpg)) +
geom_point(col='red') +
geom_smooth(method="lm", se=F, col='blue')
Figure 3.23: ggplot2散佈圖加上迴歸線
ggplot(Orange, aes(x=age, y=circumference)) +
geom_point(aes(col=Tree), size=3) +
geom_smooth(method="loess", se=F) +
theme_bw()
Figure 3.24: 散佈圖加上無母數迴歸線
☛請用散佈圖加迴歸線表示長打率跟三振率(SO/AB)的關係:
ggplot(anscombe, aes(x=x1, y=y1)) +
geom_point(size=3)
Figure 3.25: 三個變數散佈圖一
n <- nrow(anscombe)
anscombe$r <- rep(0, n)
anscombe$r[anscombe$x1>8]<-1
sc1<-ggplot(anscombe, aes(x=x1, y=y1, shape=factor(r)))
sc1 + geom_point(aes(color=factor(r)) , size=3)
Figure 3.26: 三個變數散佈圖二
sc2<-ggplot(anscombe, aes(x=x1, y=y1))
sc2 + geom_point(aes(color=x2) , size=3)
Figure 3.27: 三個變數散佈圖三
☛reshape2
裡面的 french_fries 針對油炸與薯條的味道進行實驗。在這筆資料裡有幾個測量薯條口味的變數,請找兩個變數,畫出散佈圖,但是同時要顯示三個實驗組別的分佈。
ggplot2
無法直接畫出一個連續變數的盒型圖,所以我們先創造一個只有一個值的變數,然後對應我們要顯示的連續變數,例如圖 3.28:
mtcars <- mutate(mtcars, X=1)
ggplot(mtcars, aes(x=as.factor(X), y=mpg)) +
geom_boxplot() +
labs(x="",y='mpg') +
stat_summary(fun.y=median, geom="point", shape=16, size=2) +
theme_bw()
Figure 3.28: 盒型圖一
g1 <-ggplot(mtcars, aes(x=as.factor(cyl), y=hp, group=cyl)) +
geom_boxplot() +
labs(x='Cylinder', y="Horse Power") +
stat_summary(fun.y=median, geom="point", shape=16, size=2) +
theme_bw()
g1
Figure 3.29: 盒型圖二
當我們需要改變在X軸的類別變數的名稱時,可以用
o1<-ggplot(Orange, aes(x=Tree, y=circumference)) +
geom_boxplot(aes(colour=Tree), size=1.2)
o1+scale_x_discrete(breaks=1:5,
labels=c("A","B", "C", "D", "E"))
Figure 3.30: 橘子樹箱型圖1
Orange <- Orange %>% mutate(trees=factor(Tree,
levels=c(1,2,3,4,5)))
o1<-ggplot(Orange, aes(x=trees, y=circumference)) +
geom_boxplot(aes(colour=trees), size=1.2)
o1+scale_x_discrete(breaks=1:5,
labels=c("A","B", "C", "D", "E"))
Figure 3.31: 橘子樹箱型圖2
g1 + scale_y_continuous(limits = c(50, 150)) +
theme_bw()
Figure 3.32: 盒型圖四
☛請分別畫出
ggplot2
裡面的的diamonds
資料中的cut的次數分配,先針對這個變數計算個數如表 3.6:
dt <-group_by(diamonds, cut)
dat = summarize(dt, count=n())
kableExtra::kable_styling(knitr::kable(dat, caption='鑽石切割等級次數分佈'))
cut | count |
---|---|
Fair | 1610 |
Good | 4906 |
Very Good | 12082 |
Premium | 13791 |
Ideal | 21551 |
# Add addition columns, needed for drawing with geom_rect.
dat$fraction = dat$count / sum(dat$count)
dat = dat[order(dat$fraction), ]
dat$ymax = cumsum(dat$fraction)
dat$ymin = c(0, head(dat$ymax, n=-1))
kableExtra::kable_styling(knitr::kable(dat))
cut | count | fraction | ymax | ymin |
---|---|---|---|---|
Fair | 1610 | 0.0298 | 0.0298 | 0.0000 |
Good | 4906 | 0.0910 | 0.1208 | 0.0298 |
Very Good | 12082 | 0.2240 | 0.3448 | 0.1208 |
Premium | 13791 | 0.2557 | 0.6005 | 0.3448 |
Ideal | 21551 | 0.3995 | 1.0000 | 0.6005 |
p1 = ggplot(dat, aes(fill=cut, ymax=ymax, ymin=ymin, xmax=4, xmin=3)) +
geom_rect() +
coord_polar(theta="y") +
xlim(c(0, 4)) +
theme(panel.grid=element_blank()) +
theme(axis.text=element_blank()) +
theme(axis.ticks=element_blank()) +
annotate("text", x = 0, y = 0, label = "Diamond Cut") +
labs(title="", x="", y='')
p1
Figure 3.33: 甜甜圈圖一
p1 = ggplot(dat, aes(fill=cut, ymax=ymax, ymin=ymin, xmax=4, xmin=0)) +
geom_rect() +
coord_polar(theta="y") +
xlim(c(0, 4)) +
theme(panel.grid=element_blank()) +
theme(axis.text=element_blank()) +
theme(axis.ticks=element_blank()) +
# annotate("text", x = 0, y = 0, label = "Diamond Cut") +
labs(title="Diamond Cut",x="",y='')
p1
Figure 3.34: 甜甜圈圖二
☛請嘗試把
R
的基本指令對airquality
這筆時間序列資料畫折線圖,例如圖 3.35:
ggplot(airquality, aes(x=Day, y=Wind)) +
geom_line()
Figure 3.35: ggplot折線圖一
N <- summarize(airquality, n())
airquality <- mutate(airquality, index=seq_along(1:N[,1]))
ggplot(airquality, aes(x=index, y=Wind)) +
geom_line()
Figure 3.36: ggplot折線圖二
airquality$Date <- paste0(airquality$Month, "/", airquality$Day)
airquality$Date <- as.Date(airquality$Date, format="%m/%d")
ggplot(airquality, aes(x=Date, y=Wind)) +
geom_line()
Figure 3.37: ggplot2折線圖三
airquality.n <- select(airquality, Date, Wind, Temp)
library(reshape2)
airquality.l<-melt(airquality.n, id.vars=c('Date'))
head(airquality.l)
## Date variable value
## 1 2024-05-01 Wind 7.4
## 2 2024-05-02 Wind 8.0
## 3 2024-05-03 Wind 12.6
## 4 2024-05-04 Wind 11.5
## 5 2024-05-05 Wind 14.3
## 6 2024-05-06 Wind 14.9
ggplot(airquality.l, aes(x=Date)) +
geom_line(aes(y=value, col=variable)) +
labs(x='Month')
Figure 3.38: ggplot2折線圖四
☛請畫出
ggplot2
可以應用在比較迴歸係數的大小以及標準誤的範圍,方便讀者瞭解模型的估計結果。例如我們估計有三個自變數的迴歸模型:
m1 <- with(mtcars, lm(mpg ~ wt + hp + am))
summary(m1)
##
## Call:
## lm(formula = mpg ~ wt + hp + am)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.422 -1.792 -0.379 1.225 5.532
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.00288 2.64266 12.87 2.8e-13 ***
## wt -2.87858 0.90497 -3.18 0.00357 **
## hp -0.03748 0.00961 -3.90 0.00055 ***
## am 2.08371 1.37642 1.51 0.14127
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.54 on 28 degrees of freedom
## Multiple R-squared: 0.84, Adjusted R-squared: 0.823
## F-statistic: 49 on 3 and 28 DF, p-value: 2.91e-11
df <- data.frame(coef=coef(m1)[-1], se=coef(summary(m1))[-1,
"Std. Error"] )
dt <- mutate(df, lower=coef-qt(0.975,
nrow(mtcars))*se, upper=coef+qt(0.975,
nrow(mtcars))*se)
s1 <- ggplot() +
geom_segment(data=dt,
mapping=aes(x=row.names(dt), y=lower,
xend=row.names(dt), yend=upper),
size=2, color="blue")
s1
Figure 3.39: ggplot圓點圖一
s1 + geom_point(data=dt,
aes(x=row.names(dt), y=coef), size=4, shape=21, fill="white") +
scale_x_discrete(breaks=c('wt','hp','am'),
labels=c("Weight", "Horse Power", "Auto")) +
xlab("") +
ylab("Estimates") +
coord_flip() +
theme(axis.text.y = element_text(face="bold", color="#993333", size=10))
Figure 3.40: ggplot圓點圖二
☛請把係數名稱跟係數順序根據圖 3.39 得到的順序排列一遍:
tmp <- dt[order(rev(row.names(dt))),]
s2=ggplot() +
geom_point(data=tmp, aes(x=row.names(tmp),
y=rev(coef)), size=4, shape=21, fill="#e21b33") +
scale_x_discrete(breaks=c("wt","hp","am"),
labels=c( "Auto", "Horse Power","Weight")) +
ylab("") +
xlab("") +
coord_flip() +
theme(axis.text.y = element_text(face="bold", color="#00ABFE", size=14) )
s2+geom_segment(data=dt, mapping=aes(x=row.names(dt), y=rev(lower), xend=row.names(dt), yend=rev(upper)),
size=2, color="#FABEC1")
Figure 3.41: ggplot2係數圖
ggplot2
還有更多有趣的圖形,也允許使用者設定圖形風格、資料點顏色、副標題、圖例的標題等等,請同學上網多多參考他人的例子。例如圖3.42顯示經濟學人雜誌風格的直方圖:
library(ggthemes)
# Number of Cases
dt <- data.frame(Year=c(2013, 2014, 2015, 2016, 2017),
Amount=c(65, 56, 78, 85, 79))
# Economist theme
p1<-ggplot() + theme_economist() + scale_fill_economist() +
geom_bar(aes(x=Year, y=Amount),
data=dt, stat='identity')
#data label
p3 <- p1+ theme(axis.title= element_text(color="blue", size=14, face="bold"),
axis.text = element_text(size=14),
legend.text=element_text(size=18))+
ggtitle("Number of Cases") +
labs(y="N", subtitle="Children, Spouses, Financial, etc.")
p3
Figure 3.42: 主題風繪圖
ggplot(diamonds, aes(cut, price)) +
geom_bar(stat = "summary_bin", fun.y = "mean", aes(fill=cut))
Figure 3.43: 平均值直方圖
pscl
這個套件裡面的admit
資料為例,用直方圖 3.44顯示不同分數等級的人數:
d1 <- ggplot(pscl::admit, aes(score))
d1 + geom_bar(fill = '#b4b106')
Figure 3.44: 分數等級之直方圖
d1 + stat_summary(aes(y = gre.verbal), fun = 'mean', geom='bar', fill='#3bb010')
Figure 3.45: 分數等級之平均值直方圖
ggplot2
這個套件裡面的mpg
資料為例,圖 3.46顯示不同車型在市區行駛的每加侖哩程數,這裡我們用ggplot(mpg, aes(class, cty)) +
geom_point() +
stat_summary(geom = 'pointrange',
fun.y = mean, colour = "red", size = 0.6)
Figure 3.46: 平均值點狀圖ㄧ
ggplot(mpg, aes(class, cty)) +
geom_point() +
stat_summary(fun='mean', colour = "orange", size = 0.6) +
stat_summary(fun='median', colour = "blue", size = 0.6)
Figure 3.47: 平均值點狀圖二
G1<-ggplot(mpg, aes(as.factor(drv), cty, fill = as.factor(drv))) +
geom_boxplot() +
stat_summary(geom = 'crossbar',
width=0.8, fatten=0.5, colour='white',
fun.data=function(x){return(c(y=mean(x),
ymin=mean(x),ymax=mean(x)))}) +
theme(legend.position = 'none')
G1
Figure 3.48: 平均值箱型圖
HH
套件裡面有一個函式叫做\(\texttt{likert}\),可以用來分析多個題目的同意程度,呈現每一題同意的多寡,讓讀者比較哪些題目有相對比較高的同意程度。library(HH)
file <- here::here('data','HXC23014 Harvard Poll Data.sav')
dt <- sjlabelled::read_spss(file)
dt$pid<-c()
dt$pid[dt$Q61==1] <- 1
dt$pid[dt$Q61==2] <- 2
dt$pid[dt$Q61==3 & dt$Q62==1] <- 2
dt$pid[dt$Q61==4 & dt$Q62==1] <- 2
dt$pid[dt$Q61==5 & dt$Q62==1] <- 2
dt$pid[dt$Q61==3 & dt$Q62==2] <- 1
dt$pid[dt$Q61==4 & dt$Q62==2] <- 1
dt$pid[dt$Q61==5 & dt$Q62==2] <- 1
dt <- dt|>mutate(Gender = recode_factor(Q1, `1` = 'Male', `2`='Female',
.default = NA_character_)) |>
mutate(heritage = dplyr::recode_factor(Q32,
`1`="Heritage Doesn't Influence",
`2`="Heritage Have Influence")) |>
mutate(pid = dplyr::recode(pid,
`1`='Democrats', `2`='Republicans')) |>
mutate(nD103 = as.numeric(D103)) |>
mutate(nD103 = car::recode(nD103, "1:2=1;3=2;4:5=3;6:8=4")) |>
mutate(edu = dplyr::recode_factor(nD103,
`1`="Less than high school",
`2`="High school",
`3` = "Some college",
`4` = "Bachlor degree")) |>
mutate(age = car::recode(Q57, "18:34=1;35:49=2;50:64=3;65:90=4;95:98=NA")) |>
mutate(age.f = recode_factor(age, `1`= "18-34 years old",
`2`= "35-49 years old",
`3`= "50-64 years old",
`4`= "65-90 years old",
.default = NA_character_))
likertdf24 <- dt %>% dplyr::select(heritage, edu, age.f, Gender)
t0<-with(likertdf24, table(heritage))
t1<-with(likertdf24, table(Gender, heritage))
t2<-with(likertdf24, table(age.f, heritage))
t3<-with(likertdf24, table(edu, heritage))
t.all <- data.frame(rbind(t0, t1, t2, t3))
t.all
Heritage.Doesn.t.Influence Heritage.Have.Influence
t0 924 723 Male 414 401 Female 506 318 18-34 years old 231 279 35-49 years old 273 282 50-64 years old 242 97 65-90 years old 177 64 Less than high school 59 44 High school 257 113 Some college 301 150 Bachlor degree 307 416
colnames(t.all)<-c("Heritage Doesn't Influence", "Heritage Have Influence")
alltmp1 <- data.frame(Question=row.names(t.all), t.all[,1:2])
rownames(alltmp1) <- c(1:11)
alltmp1$Subtable <- c('Heritage Have Something with You?',
rep('Gender',2), rep('Age', 4),
rep('Education', 4))
colnames(alltmp1)<-c('Question',"Heritage Doesn't Influence", "Heritage Have Influence",'Subtable')
alltmp1[1,1]<-'All survey responses'
p1 <- HH::likert(Question ~.|Subtable, alltmp1, as.percent = TRUE,
positive.order = F,
scales = list(y = list(relation = "free")), layout = c(1, 5),
main='', ylab='')
p1
Figure 3.49: 傳統態度與人口背景
x<-rbinom(1000, 610, 0.2)
y<-rgamma(1000, 3)
qplot(x, y, color='#4b4102') +
theme(legend.position = 'none')
Figure 4.1: qplot散佈圖1
可以看出不需要變數在資料框,就可以畫圖。而且語法更為直觀,但是可以加入
利用
x <- seq(-2, 2, length = 100)
y <- x^2
qplot(x, y, geom = "line")
Figure 4.2: qplot函數圖
library(carData)
qplot(data=WeightLoss, group, y = ..count../sum(..count..),
fill = group, geom = c('bar')) +
scale_fill_brewer(palette = "Set5") +
scale_y_continuous(labels = scales::percent) +
labs(y='')
Figure 4.3: qplot直方圖
library(ggplot2)
qplot(cut, data = diamonds,
geom = c("bar"), fill = color, group = color)
Figure 4.4: qplot直方堆疊圖
library(ggplot2)
df <- diamonds %>% dplyr::select(cut, color)
newdf <-reshape2::melt(df, id.vars=c("cut"))
colnames(newdf)<-c("cut", "color.g", "color")
qdf <- summarize(group_by(newdf, cut, color), N=n())
g1 <- ggplot(qdf, aes(x=cut, y=N, fill=color)) +
geom_bar(stat="identity")
g1 + scale_fill_grey(start = 0.2, end = 0.9) +
theme_bw()
Figure 4.5: 灰階原始數目直方圖
library(carData)
set.seed(02138)
SLID.sub <- SLID[sample(nrow(SLID), 200),]
qplot(education, wages, data=SLID.sub, size=I(2),shape=I(17),color=sex)
Figure 4.6: qplot散佈圖2
qplot(age, wages, data=SLID.sub, size=I(2),shape=I(4),
geom=c('point','smooth'))
Figure 4.7: qplot散佈圖3
set.seed(11601)
x1<-c()
for (i in 1:100){x1[i] <- sample(SLID$wages, 1)
}
x2<-c()
for (i in 1:500){x2[i] <- sample(SLID$wages, 1)
}
x3<-c()
for (i in 1:1000){x3[i] <- sample(SLID$wages, 1)
}
x4<-c()
for (i in 1:2000){x4[i] <- sample(SLID$wages, 1)
}
mydata <- data.frame(X=c(x1,x2,x3,x4), sample.n=c(rep('100',100),
rep('500',500),rep('1000',1000),rep('2000',2000)))
mydata$sample.n <- factor(mydata$sample.n, levels=c('100','500',
'1000','2000'))
p<-qplot(X, geom = 'histogram', bins=30, facets = sample.n ~., data=mydata, fill=sample.n)
p + theme_bw()
Figure 4.8: qplot多重圖形
☛請把係數名稱跟係數順序根據圖 3.39 得到的順序排列一遍:
lattice
套件用格子(trellis)圖形來產生高品質的資料視覺化,尤其是用在多變量的資料。
Deepayan Sakar在2008年出版「Lattice: Multivariate Data Visualisation in R」,詳細地介紹lattice套件的功能。有興趣的同學可以閱讀。
lattice
套件的直方圖、長條圖、雙變數直方圖、散佈圖、折線圖、雙變數長條圖、箱型圖、點狀圖功能等等。
library(lattice)
barchart(starwars$skin_color,col="gray60",
scale=list(cex=0.5, relation="free"),
aspect=.8)
Figure 5.1: lattice直方圖一
lattice
的直方圖是橫的,適合類別多的變數,例如我國面對最重要的問題,或者最常看的網站,如果用橫的直方圖會比較容易呈現。
☛請嘗試對
histogram(faithful$eruptions, col='#a01eb2')
Figure 5.2: lattice長條圖一
R
基礎指令hist函數得到的圖形相比,多了一個外框。
lattice
並沒有單變數的直方圖,以長條圖代替直方圖。但是比較類別變數與連續變數之間的關係時,可以用直方圖,並且可以控制另一個類別變數。例如圖 5.3顯示品種跟顏色的關係:
library(MASS)
data(crabs)
crabs$sp<-factor(crabs$sp, labels=c("Blue", "Orange"))
crabs$sex<-factor(crabs$sex, labels=c("Female", "Male"))
barchart(sp ~ BD|sex, data=crabs,
col=RColorBrewer::brewer.pal(n=4, 'RdGy'),
main=paste0("Crab's Body Depth(mm)"))
Figure 5.3: lattice直方圖二
lattice::dotplot(VADeaths, group=F, type="o")
Figure 5.4: lattice點狀圖一
vad <- as.data.frame.table(VADeaths)
names(vad) <- c("age", "demographic", "deaths")
head(vad, n=3)
## age demographic deaths
## 1 50-54 Rural Male 11.7
## 2 55-59 Rural Male 18.1
## 3 60-64 Rural Male 26.9
dpt<-dotplot(age ~ deaths, vad, group=demographic,
type = "o")
update(dpt, auto.key=list(points=T, lines=T))
Figure 5.5: lattice點狀圖二
orange.mean <- tapply(Orange$circumference,
Orange$Tree, mean)
dotplot(names(orange.mean) ~ orange.mean,
aspect = .5,
ylab = "Group",
xlab = "Mean Circumference")
Figure 5.6: lattice點狀圖一
dplyr
的o.mean<-Orange %>% group_by(Tree) %>% summarise(mean(circumference))
dotplot(Tree ~ `mean(circumference)`, aspect=0.5, data=o.mean,
ylab = "Group", cex=1.5,
xlab = "Mean Circumference")
Figure 5.7: lattice點狀圖二
dplyr
的pipeline,而是要在函數內設定data=。o.mean<-Orange %>% group_by(Tree) %>% summarise(avg=mean(circumference))
d3<-dotplot(reorder(Tree,-avg) ~ avg, aspect=0.5, data=o.mean,
ylab = "Group",
xlab = "Mean Circumference",
cex=1.5)
d3
Figure 5.8: lattice點狀圖三
with(lattice::ethanol, xyplot(NOx ~ E))
Figure 5.9: lattice散佈圖一
with(lattice::ethanol, xyplot(NOx ~ E|C, col='#FF3300'))
Figure 5.10: lattice散佈圖二
lattice
,類別變數對應連續變數的密度長條圖,可以一次展示在同一個圖上面。用以下指令畫圖5.11:
histogram(~temp|origin, data=nycflights13::weather,
type="density", layout=c(1,3))
Figure 5.11: lattice長條圖
lattice::bwplot(as.factor(cyl) ~ mpg, data=mtcars)
Figure 5.12: lattice箱型圖
lattice::xyplot(MASS::accdeaths)
Figure 5.13: lattice折線圖一
tsp<-TSstudio::ts_reshape(MASS::accdeaths,
type = 'long')
head(tsp, n=5)
## year month value
## 1 1973 1 9007
## 2 1973 2 8106
## 3 1973 3 8928
## 4 1973 4 9137
## 5 1973 5 10017
xyplot(value ~ month | as.factor(year),
data=tsp, type="l", col="black",
index.cond=list(c(4,5,6,1,2,3)),
scales = list(at=c(1,6, 12),
labels=c("Jan.","Jun.","Dec.")),
strip = strip.custom(bg="gray90"),
ylab="percent")
Figure 5.14: 每年的意外死亡折線圖
ggplot2
,但是提供ggplot2
以外的選擇。
ggpubr
指的是ggplot publication ready。請下載ggpubr
這個套件。詳細介紹可以參考這個網站。
ggpubr
的優點是美化ggplot2
的圖形,指令相當類似,例如圖6.1顯示鄉村與城市的死亡率長條圖,並且以虛線標示變數的平均值:
mor<-lattice::USRegionalMortality
ggpubr::gghistogram(mor, x = "Rate", bins=40,
add = "mean", rug = TRUE,
color = "Status", fill = "Status",
palette = c("#EE0011", "#111100"))
Figure 6.1: ggpubr長條圖
ggpubr
的圖形顯示出觀察值。
p <- ggpubr::ggboxplot(mtcars, x = "cyl", y = "mpg",
color = "cyl", palette =c("#00AFBB", "#E7B800", "#FC4E07"),
add = "jitter", shape = "cyl")
p
Figure 6.2: ggpubr的箱型圖
ggstatsplot
套件結合圖形與統計,直接呈現圖形上面的統計結果。先下載套件:
utils::install.packages(pkgs = "ggstatsplot")
# for reproducibility
set.seed(02139)
data(ISLR::Default)
# plot
ggstatsplot::ggpiestats(
data = ISLR::Default,
x = student,
y = default,
title = "Student and Default", # title for the plot
legend.title = "student", # title for the legend
caption = substitute(paste(italic("Source"), ": ISLR")),
messages = FALSE
)
Figure 6.3: 卡方檢定圓餅圖
ggstatsplot
可以顯示相關係數。在執行下面指令前先下載ggside
,方法如下:
# downloading the package from GitHub
devtools::install_github(
repo = "jtlandis/ggside", # package path on GitHub
dependencies = FALSE, # assumes you have already installed needed packages
quick = TRUE # skips docs, demos, and vignettes
)
library(ggside)
ggstatsplot::ggscatterstats(
data = alr4::florida,
x = Gore,
y = Bush,
xlab = "Vote for Gore",
ylab = "Vote for Bush",
title = "County-by-county vote for president in Florida in 2000 for Bush, Gore",
caption = substitute(paste(italic("Source"), ": alr4")),
messages = FALSE,
bf.message = F,
results.subtitle = T
)
Figure 6.4: ggstatsplot散佈圖與迴歸係數
還記得一開始的課程目標是畫出類別變數與連續變數之間的關係,並且標示各類別的平均值?請見圖 7.1。
library(alr4)
data(salary)
p <- ggplot(salary, aes(x=rank, y=salary)) +
geom_point() +
theme_classic()
p + ggplot2::stat_summary(fun='mean', colour='red',size=3, geom = "point")
Figure 7.1: ggplot2散佈圖加上平均值
p + stat_summary(fun = mean, colour='#2200ae',size=1.3,
fun.min = function(x) mean(x) - sd(x),
fun.max = function(x) mean(x) + sd(x),
geom = "pointrange") +
stat_summary(fun = mean, geom = "line") +
scale_x_discrete(breaks=c('Asst','Assoc','Prof'),
labels=c('Assistant Professor','Associate Professor', 'Full Professor'))
Figure 7.2: ggplot2散佈圖加上平均值及標準差
carData
套件裡面的Ericksen這筆資料,製作poverty以及crime的散佈圖,並且加上城市的名稱。
MASS::birthwt
)的分佈。
reshape2::tips
這筆資料的tips與total_bill兩個變數的關係,並且標記顧客是否抽煙(smoker)。請記得加上圖例。
ggplot2
繪製「灰階」堆疊圖顯示hsb2的race與ses的關係。
flights
這筆資料之中的arr_delay,畫圖顯示每一家航空公司(carrier)的分布情況:
airquality
這筆資料的8,9兩個月份的資料,並且用ggplot2
畫出
library(dplyr)
DT <- tibble(Country=c("Spain", "France", "Germany","Italy", "Sweden"),
Populist=c(52,58,65,50, 73),
Mixed=c(70,72,70,65,81),
Nonpopulist=c(85,84,88,68, 90))
kableExtra::kable_styling(knitr::kable(DT))
Country | Populist | Mixed | Nonpopulist |
---|---|---|---|
Spain | 52 | 70 | 85 |
France | 58 | 72 | 84 |
Germany | 65 | 70 | 88 |
Italy | 50 | 65 | 68 |
Sweden | 73 | 81 | 90 |
ggthemes
套件set.seed(2019)
m=1000; n=580; p1=0.54; p2=0.60; p3=0.70; p4=0.80
res1=rbinom(m, n, p1);res2=rbinom(m, n, p2); res3=rbinom(m,n,p3);
res4=rbinom(m, n, p4)
dplyr
指令整理資料。圖上面加上資料數值。)
lattice
畫圖顯示
最後更新時間: 2024-04-01 19:53:32