Find out what each code chunk (indicated by ‘##’) in the R script does and provide comments.
#women是內建資料檔,只有2個variables(height和weight)
head(women) height weight
1 58 115
2 59 117
3 60 120
4 61 123
5 62 126
6 63 129
#basic R,畫women的plot
#The type = "n" is used when you want to draw multiple graphs (say lines) on same plot then type = "n" can be used prepare the layout with limits and label of axis.
plot(women, type='n')
#畫上一個點,women第一個row的資料
points(women[1,])# 用lattice畫xyplot
# Y=weight X=height data來自women
# 只畫一筆資料subset=row.names(women)==1
# type:1-character string giving the type of plot desired. default就是p
# 這邊只有一個subject,所以選其他的type不會顯示
lattice::xyplot(weight ~ height,
data=women,
subset=row.names(women)==1, type='p')lattice::xyplot(weight ~ height,
data=women,
type='p')# weight ~ height的資料以line表示
lattice::xyplot(weight ~ height,
data=women,
type='l')library(ggplot2)
# 畫ggplot只畫表格X、Y軸
# aes(X=height, Y=weight),不寫X、Y也沒關係,用位置判斷
ggplot(data=women[1,], aes(height, weight)) #畫好表格後+加上資料,延用資料data=women[1,],所以只會有一個point
ggplot(data=women[1,], aes(height, weight)) +
geom_point()# 比較basic plot跟lattice xyplot畫出來資料的class
w1 <- plot(women, type='n')w2 <- lattice::xyplot(weight ~ height, data=women)
# class(w1) R不知道這是甚麼,因為basic plot在class上沒甚麼意義
class(w1)[1] "NULL"
# class(w2) lattice:A trellis object, as returned by high level lattice functions like xyplot, is a list with the "class" attribute set to "trellis"
class(w2)[1] "trellis"
# trellis的內建功能有哪些
methods(class='trellis')[1] [ dim dimnames dimnames<- plot print summary
[8] t update
see '?methods' for accessing help and source code
The data set is concerned with grade 8 pupils (age about 11 years) in elementary schools in the Netherlands. After deleting pupils with missing values, the number of pupils is 2,287 and the number of schools is 131. Class size ranges from 4 to 35. The response variables are score on a language test and that on an arithmetic test. The research intest is on how the two test scores depend on the pupil’s intelligence (verbal IQ) and on the number of pupils in a school class. The class size is categorized into small, medium, and large with roughly equal number of observations in each category. The verbal IQ is categorized into low, middle and high with roughly equal number of observations in each category. Reproduce the plot below. Source: Snijders, T. & Bosker, R. (2002). Multilevel Analysis.
班級分為小、中、大(依比例) IQ分為低、中、高(依比例)
knitr::include_graphics("langMath.png")dta<-read.table("D:/langMathDutch.txt", header=TRUE,sep = "")
head(dta) school pupil IQV size lang arith
1 1 17001 15.0 29 46 24
2 1 17002 14.5 29 45 19
3 1 17003 9.5 29 33 24
4 1 17004 11.0 29 46 26
5 1 17005 8.0 29 20 9
6 1 17006 9.5 29 30 13
str(dta)'data.frame': 2287 obs. of 6 variables:
$ school: int 1 1 1 1 1 1 1 1 1 1 ...
$ pupil : int 17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
$ IQV : num 15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
$ size : int 29 29 29 29 29 29 29 29 29 29 ...
$ lang : int 46 45 33 46 20 30 30 57 36 36 ...
$ arith : int 24 19 24 26 9 13 13 30 23 22 ...
summary(dta) school pupil IQV size lang
Min. : 1 Min. : 17001 Min. : 4.00 Min. : 5.0 Min. : 9.00
1st Qu.: 67 1st Qu.: 677013 1st Qu.:10.50 1st Qu.:17.0 1st Qu.:35.00
Median :141 Median :1417007 Median :12.00 Median :24.0 Median :42.00
Mean :133 Mean :1337225 Mean :11.83 Mean :23.1 Mean :40.93
3rd Qu.:195 3rd Qu.:1957204 3rd Qu.:13.00 3rd Qu.:28.0 3rd Qu.:48.00
Max. :258 Max. :2587010 Max. :18.00 Max. :37.0 Max. :58.00
arith
Min. : 2.00
1st Qu.:14.00
Median :20.00
Mean :19.44
3rd Qu.:25.00
Max. :30.00
library(dplyr)
#class size分組(column size),以總人數均分small, medium, large
#verbal IQ分組(column IQV),以人數均分為low, middle, high
#選前六項,列成table
dta %<>%
mutate(class_f=cut(size,
breaks=quantile(size, probs=c(0, .33, .66, 1)),
label=c("small", "medium", "large"),
ordered=T)) %>%
mutate(IQV_f=cut(IQV,
breaks=quantile(IQV, probs=c(0, .33, .66, 1)),
label=c("low", "middle", "high"),
ordered=T))
head(dta) school pupil IQV size lang arith class_f IQV_f
1 1 17001 15.0 29 46 24 large high
2 1 17002 14.5 29 45 19 large high
3 1 17003 9.5 29 33 24 large low
4 1 17004 11.0 29 46 26 large low
5 1 17005 8.0 29 20 9 large low
6 1 17006 9.5 29 30 13 large low
str(dta)'data.frame': 2287 obs. of 8 variables:
$ school : int 1 1 1 1 1 1 1 1 1 1 ...
$ pupil : int 17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
$ IQV : num 15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
$ size : int 29 29 29 29 29 29 29 29 29 29 ...
$ lang : int 46 45 33 46 20 30 30 57 36 36 ...
$ arith : int 24 19 24 26 9 13 13 30 23 22 ...
$ class_f: Ord.factor w/ 3 levels "small"<"medium"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ IQV_f : Ord.factor w/ 3 levels "low"<"middle"<..: 3 3 1 1 1 1 1 3 1 1 ...
p1<- ggplot(dta,
aes(lang, arith)) +
geom_point(size=rel(.5)) +
stat_smooth(method="lm",
formula=y ~ x,
se=T, #95%的預測線,default是T,不要顯示預測線要=F
col='blue',
size=rel(.5)) +
labs(x='Language score',
y='Arithmetic score')+
theme_bw()
p1p2<-p1+
facet_grid(dta$class_f~dta$IQV_f)
p2#發現圖有NA
#看一下有多少NA數值
sum(is.na(dta))[1] 7
#刪除遺漏值存成rm.dta
rm.dta <- dta[complete.cases(dta), ]
#重畫一次
p1 <- ggplot(rm.dta,
aes(lang, arith)) +
geom_point(size=rel(.5),
alpha=.5) +
stat_smooth(method="lm",
formula=y ~ x,
se=T,
col='blue',
size=rel(.5)) +
labs(x='Language score',
y='Arithmetic score')+
theme_bw()
p1p2<-p1+
facet_grid(rm.dta$class_f~rm.dta$IQV_f)
p2#畫出來跟我們想要的不太一樣(雖然意思一樣
#參考婉禎建議facet_wrap指令
#facets表示形式爲:~變量(~單元格) #facet_grid是基於兩個因子進行設置,facets表示形式爲:變量~變量(行~列)
#如果把一個因子用點表示,也可以達到facet_wrap的效果,也可以用加號設置成兩個以上變量,例如:變量+變量~變量 的形式,表示對三個變量設置分面。
p3<-p1+
facet_wrap(.~IQV_f+class_f, #設定哪兩個變項要放一起
nrow=3)
p3p4<-p1+
facet_wrap(.~IQV_f+class_f, #設定哪兩個變項要放一起
nrow=3, #設定圖形有幾個row
labeller=labeller(.mlti_line=F)) #有沒有加這行似乎沒甚麼差別
p4sample of 158 children with autisim spectrum disorder were recruited. Social development was assessed using the Vineland Adaptive Behavior Interview survey form, a parent-reported measure of socialization. It is a combined score that included assessment of interpersonal relationships, play/leisure time activities, and coping skills. Initial language development was assessed using the Sequenced Inventory of Communication Development (SICD) scale. These assessments were repeated on these children when they were 3, 5, 9, 13 years of age. Source: West, B.T., Welch, K.B., & Galecki, A.T. (2002). Linear Mixed Models: Practical Guide Using Statistical Software. p. 220-271. Data: autism{WWGbook}
knitr::include_graphics("VSAE1.png")knitr::include_graphics("VSAE2.png")library(WWGbook)
dta<-autism
#'data.frame': 612 obs. of 4 variables
attach(dta)
str(dta)'data.frame': 612 obs. of 4 variables:
$ age : int 2 3 5 9 13 2 3 5 9 13 ...
$ vsae : int 6 7 18 25 27 17 18 12 18 24 ...
$ sicdegp: int 3 3 3 3 3 3 3 3 3 3 ...
$ childid: int 1 1 1 1 1 3 3 3 3 3 ...
head(dta) age vsae sicdegp childid
1 2 6 3 1
2 3 7 3 1
3 5 18 3 1
4 9 25 3 1
5 13 27 3 1
6 2 17 3 3
#看一下各類別變項的次數分配狀況
sum1<-summarise(dta,
nchildren=n(),
nage=n_distinct(age),
nsicdegp=n_distinct(sicdegp))
sum1 nchildren nage nsicdegp
1 612 5 3
#看不懂原始圖片(Age, in years, centered)的意思
#先看一下dta$age的資料結構
summary(dta$age) Min. 1st Qu. Median Mean 3rd Qu. Max.
2.000 2.000 4.000 5.771 9.000 13.000
#猜測center是mean,但看起來不像
p0<- ggplot(dta,
aes(x=age-5.771, y=vsae)) +
geom_line(aes(group=childid),linetype='solid', Width=1, col="gray80")+
geom_point(size=rel(1))
p0#猜測center是median,但看起來也是不像
p0m<- ggplot(dta,
aes(x=age-4, y=vsae)) +
geom_line(aes(group=childid),linetype='solid', Width=1, col="gray80")+
geom_point(size=rel(1))
p0m#放棄猜測center的意思
p1<- ggplot(dta,
aes(x=age, y=vsae)) +
geom_line(aes(group=childid),linetype='solid', Width=1, col="gray80")+
geom_point(size=rel(1))
p1p2<-p1+
stat_smooth(method="lm",
formula=y ~ x,
se=T,
col='blue',
size=rel(1)) +
labs(x='Age',
y='VSA score')+
theme_bw()
p2#加上facet_grid後,資料按照原來的label 1,2,3
p3<-p2+
facet_grid(.~dta$sicdegp)
p3#給變項sicdegp新的label name
sicdegp<- factor(sicdegp)
dta$sicdegp <-factor(dta$sicdegp,
levels=c(1, 2, 3),
labels=c("L", "M", "H"))
p1<- ggplot(dta,
aes(x=age, y=vsae)) +
geom_line(aes(group=childid),linetype='solid', Width=1, col="gray80")+
geom_point(size=rel(1))
p1p2<-p1+
stat_smooth(method="lm",
formula=y ~ x,
se=T,
col='blue',
size=rel(1)) +
labs(x='Age',
y='VSA score')+
theme_bw()
p2p3<-p2+
facet_grid(.~dta$sicdegp)
p3#除了(Age, in years, centered)之外,其他還蠻像的#畫圖要依據age的分組先算mean跟se of vsae
p <- dta %>% group_by(age) %>%
summarize(m_vsae=mean(vsae),
se_vsae=sd(vsae)/sqrt(n()), .groups='drop')
p# A tibble: 5 x 3
age m_vsae se_vsae
<int> <dbl> <dbl>
1 2 9.09 0.309
2 3 NA NA
3 5 21.5 1.40
4 9 NA NA
5 13 60.6 5.02
#不知道為何age=3、age=5這兩組mean跟se算不出來?
#mean(dta$vsae)也是NA
mean(dta$vsae)[1] NA
#感覺是資料有問題
#結果發現column vsae 有 NA's:2
summary(dta) age vsae sicdegp childid
Min. : 2.000 Min. : 1.00 L:192 Min. : 1.00
1st Qu.: 2.000 1st Qu.: 10.00 M:255 1st Qu.: 48.75
Median : 4.000 Median : 14.00 H:165 Median :107.50
Mean : 5.771 Mean : 26.41 Mean :105.38
3rd Qu.: 9.000 3rd Qu.: 27.00 3rd Qu.:158.00
Max. :13.000 Max. :198.00 Max. :212.00
NA's :2
#再確認一次na的個數=2
sum(is.na(dta))[1] 2
K-Nearest Neighbours(KNN)運用在遺漏值填補上:{DMwR}
找和自己很像的K個附近的資料,然後從他們身上複製自己所沒有的東西。 安裝的時候出現Warning in install.packages :package ‘DMwR’ is not available for this version of R.A version of this package for your version of R might be available elsewhere,see the ideas at
https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
MICE運用在遺漏值填補上:{mice}
提供了很多資料探勘的模型來預測遺漏值
library(mice)
#遺漏值的處理
#選擇1:直接刪除一整筆資料,缺點是少了一筆資料
#選擇2:平均值來填補遺漏值,缺點是對資料不會有甚麼影響(好像也是優點?)
#選擇3:最推崇的就是「k-NearestNeighbours」或「mice套件」來填補遺漏值
#mice的全名為Multivariate Imputation via Chained Equations
#看missing data的pattern
# There are 610 observations with no missing values.
# There are 2 observations with missing values in vsae.
md.pattern(dta) age sicdegp childid vsae
610 1 1 1 1 0
2 1 1 1 0 1
0 0 0 2 2
#填補遺漏值
#細節有點複雜(還搞不太懂)
imputed_dta<-mice(dta, m=5, maxit=50, method='pmm', seed=500)
iter imp variable
1 1 vsae
1 2 vsae
1 3 vsae
1 4 vsae
1 5 vsae
2 1 vsae
2 2 vsae
2 3 vsae
2 4 vsae
2 5 vsae
3 1 vsae
3 2 vsae
3 3 vsae
3 4 vsae
3 5 vsae
4 1 vsae
4 2 vsae
4 3 vsae
4 4 vsae
4 5 vsae
5 1 vsae
5 2 vsae
5 3 vsae
5 4 vsae
5 5 vsae
6 1 vsae
6 2 vsae
6 3 vsae
6 4 vsae
6 5 vsae
7 1 vsae
7 2 vsae
7 3 vsae
7 4 vsae
7 5 vsae
8 1 vsae
8 2 vsae
8 3 vsae
8 4 vsae
8 5 vsae
9 1 vsae
9 2 vsae
9 3 vsae
9 4 vsae
9 5 vsae
10 1 vsae
10 2 vsae
10 3 vsae
10 4 vsae
10 5 vsae
11 1 vsae
11 2 vsae
11 3 vsae
11 4 vsae
11 5 vsae
12 1 vsae
12 2 vsae
12 3 vsae
12 4 vsae
12 5 vsae
13 1 vsae
13 2 vsae
13 3 vsae
13 4 vsae
13 5 vsae
14 1 vsae
14 2 vsae
14 3 vsae
14 4 vsae
14 5 vsae
15 1 vsae
15 2 vsae
15 3 vsae
15 4 vsae
15 5 vsae
16 1 vsae
16 2 vsae
16 3 vsae
16 4 vsae
16 5 vsae
17 1 vsae
17 2 vsae
17 3 vsae
17 4 vsae
17 5 vsae
18 1 vsae
18 2 vsae
18 3 vsae
18 4 vsae
18 5 vsae
19 1 vsae
19 2 vsae
19 3 vsae
19 4 vsae
19 5 vsae
20 1 vsae
20 2 vsae
20 3 vsae
20 4 vsae
20 5 vsae
21 1 vsae
21 2 vsae
21 3 vsae
21 4 vsae
21 5 vsae
22 1 vsae
22 2 vsae
22 3 vsae
22 4 vsae
22 5 vsae
23 1 vsae
23 2 vsae
23 3 vsae
23 4 vsae
23 5 vsae
24 1 vsae
24 2 vsae
24 3 vsae
24 4 vsae
24 5 vsae
25 1 vsae
25 2 vsae
25 3 vsae
25 4 vsae
25 5 vsae
26 1 vsae
26 2 vsae
26 3 vsae
26 4 vsae
26 5 vsae
27 1 vsae
27 2 vsae
27 3 vsae
27 4 vsae
27 5 vsae
28 1 vsae
28 2 vsae
28 3 vsae
28 4 vsae
28 5 vsae
29 1 vsae
29 2 vsae
29 3 vsae
29 4 vsae
29 5 vsae
30 1 vsae
30 2 vsae
30 3 vsae
30 4 vsae
30 5 vsae
31 1 vsae
31 2 vsae
31 3 vsae
31 4 vsae
31 5 vsae
32 1 vsae
32 2 vsae
32 3 vsae
32 4 vsae
32 5 vsae
33 1 vsae
33 2 vsae
33 3 vsae
33 4 vsae
33 5 vsae
34 1 vsae
34 2 vsae
34 3 vsae
34 4 vsae
34 5 vsae
35 1 vsae
35 2 vsae
35 3 vsae
35 4 vsae
35 5 vsae
36 1 vsae
36 2 vsae
36 3 vsae
36 4 vsae
36 5 vsae
37 1 vsae
37 2 vsae
37 3 vsae
37 4 vsae
37 5 vsae
38 1 vsae
38 2 vsae
38 3 vsae
38 4 vsae
38 5 vsae
39 1 vsae
39 2 vsae
39 3 vsae
39 4 vsae
39 5 vsae
40 1 vsae
40 2 vsae
40 3 vsae
40 4 vsae
40 5 vsae
41 1 vsae
41 2 vsae
41 3 vsae
41 4 vsae
41 5 vsae
42 1 vsae
42 2 vsae
42 3 vsae
42 4 vsae
42 5 vsae
43 1 vsae
43 2 vsae
43 3 vsae
43 4 vsae
43 5 vsae
44 1 vsae
44 2 vsae
44 3 vsae
44 4 vsae
44 5 vsae
45 1 vsae
45 2 vsae
45 3 vsae
45 4 vsae
45 5 vsae
46 1 vsae
46 2 vsae
46 3 vsae
46 4 vsae
46 5 vsae
47 1 vsae
47 2 vsae
47 3 vsae
47 4 vsae
47 5 vsae
48 1 vsae
48 2 vsae
48 3 vsae
48 4 vsae
48 5 vsae
49 1 vsae
49 2 vsae
49 3 vsae
49 4 vsae
49 5 vsae
50 1 vsae
50 2 vsae
50 3 vsae
50 4 vsae
50 5 vsae
completedta <-complete(imputed_dta,1)mice package 參考資料:https://www.gushiciku.cn/pl/258g/zh-tw
參考資料 https://www.analyticsvidhya.com/blog/2016/03/tutorial-powerful-packages-imputing-missing-values/
attach(completedta)
head(completedta) age vsae sicdegp childid
1 2 6 H 1
2 3 7 H 1
3 5 18 H 1
4 9 25 H 1
5 13 27 H 1
6 2 17 H 3
summary(completedta) age vsae sicdegp childid
Min. : 2.000 Min. : 1.00 L:192 Min. : 1.00
1st Qu.: 2.000 1st Qu.: 10.00 M:255 1st Qu.: 48.75
Median : 4.000 Median : 14.00 H:165 Median :107.50
Mean : 5.771 Mean : 26.53 Mean :105.38
3rd Qu.: 9.000 3rd Qu.: 27.00 3rd Qu.:158.00
Max. :13.000 Max. :198.00 Max. :212.00
completedta$age_g<-as.factor(completedta$age)
completedta$group<-as.factor(completedta$sicdegp)
str(completedta)'data.frame': 612 obs. of 6 variables:
$ age : int 2 3 5 9 13 2 3 5 9 13 ...
$ vsae : int 6 7 18 25 27 17 18 12 18 24 ...
$ sicdegp: Factor w/ 3 levels "L","M","H": 3 3 3 3 3 3 3 3 3 3 ...
$ childid: int 1 1 1 1 1 3 3 3 3 3 ...
$ age_g : Factor w/ 5 levels "2","3","5","9",..: 1 2 3 4 5 1 2 3 4 5 ...
$ group : Factor w/ 3 levels "L","M","H": 3 3 3 3 3 3 3 3 3 3 ...
#填補遺漏值後再算一遍
library(magrittr)
library(ggplot2)
pd <- position_dodge(.3)
pd<ggproto object: Class PositionDodge, Position, gg>
compute_layer: function
compute_panel: function
preserve: total
required_aes:
setup_data: function
setup_params: function
width: 0.3
super: <ggproto object: Class PositionDodge, Position, gg>
p <- completedta %>% group_by(age) %>%
summarize(m_vsae=mean(vsae),
se_vsae=sd(vsae)/sqrt(n()), .groups='drop')
p# A tibble: 5 x 3
age m_vsae se_vsae
<int> <dbl> <dbl>
1 2 9.09 0.309
2 3 15.3 0.649
3 5 21.5 1.40
4 9 40.2 3.03
5 13 60.6 5.02
#填補遺漏值後再算一遍
library(magrittr)
library(ggplot2)
pd <- position_dodge(.3) #點之間的位置錯開0.3單位
completedta %>% group_by(age, group) %>%
summarize(m_vsae=mean(vsae),
se_vsae=sd(vsae)/sqrt(n()), .groups='drop') %>%
ggplot() +
aes(x=age-2, m_vsae,
group=group,
shape=group)+
geom_errorbar(aes(ymin=m_vsae - se_vsae,
ymax=m_vsae + se_vsae),
width=.2, size=.3,
position=pd)+
geom_line(position=pd,
linetype='dotted') + #線的樣態
geom_point(position=pd,
size=rel(2)) + #改點的大小
scale_shape(guide=guide_legend(title='Group')) +
labs(x="Age(in years-2)", y="VSAE score") +
theme_bw()+ #theme設為bw的模板
theme(legend.position=c(.1, .7)) #改legend位置Use the diabetes dataset to generate a plot similar to the one below and inteprete the plot.
#讀檔案
dta<-read.csv("D://diabetes_mell.csv", head=T)
#看資料
head(dta) SEQN RIAGENDR RIDRETH1 DIQ010 BMXBMI gender race diabetes BMI
1 51624 1 3 2 32.22 Males White No Overweight
2 51626 1 4 2 22.00 Males Black No Normal weight
3 51627 1 4 2 18.22 Males Black No Normal weight
4 51628 2 4 1 42.39 Females Black Yes Overweight
5 51629 1 1 2 32.61 Males Hispanic No Overweight
6 51630 2 3 2 30.57 Females White No Overweight
summary(dta) SEQN RIAGENDR RIDRETH1 DIQ010
Min. :51624 Min. :1.000 Min. :1.000 Min. :1.000
1st Qu.:54229 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:2.000
Median :56852 Median :2.000 Median :3.000 Median :2.000
Mean :56869 Mean :1.502 Mean :2.623 Mean :1.924
3rd Qu.:59504 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:2.000
Max. :62160 Max. :2.000 Max. :4.000 Max. :2.000
BMXBMI gender race diabetes
Min. :12.58 Length:8706 Length:8706 Length:8706
1st Qu.:20.22 Class :character Class :character Class :character
Median :25.44 Mode :character Mode :character Mode :character
Mean :26.01
3rd Qu.:30.40
Max. :84.87
BMI
Length:8706
Class :character
Mode :character
str(dta)'data.frame': 8706 obs. of 9 variables:
$ SEQN : int 51624 51626 51627 51628 51629 51630 51632 51633 51634 51635 ...
$ RIAGENDR: int 1 1 1 2 1 2 1 1 1 1 ...
$ RIDRETH1: int 3 4 4 4 1 3 2 3 1 3 ...
$ DIQ010 : int 2 2 2 1 2 2 2 2 2 1 ...
$ BMXBMI : num 32.2 22 18.2 42.4 32.6 ...
$ gender : chr "Males" "Males" "Males" "Females" ...
$ race : chr "White" "Black" "Black" "Black" ...
$ diabetes: chr "No" "No" "No" "Yes" ...
$ BMI : chr "Overweight" "Normal weight" "Normal weight" "Overweight" ...
#講義page 38
#參考https://cran.r-project.org/web/packages/ggalluvial/vignettes/order-rectangles.html
#pacman::p_load(ggalluvial)
library(ggalluvial)
dta_1 <- data.frame(with(dta[, c("BMI", "race", "gender", "diabetes")],
xtabs(~ BMI + diabetes + gender + race)))
attach(dta_1)
p1<-ggplot(dta_1,
aes(axis1=race,
axis2=gender,
axis3=diabetes,
y=Freq))
p1p2<-p1+
#x軸畫出離散圖,column為race, gender, diabetes
#expands是column的間距
scale_x_discrete(limits=c("race",
"gender",
"diabetes"),
expand=c(0.1, 0.05)) +
#X軸名稱空著
#Y軸命名
#圖片給主題
labs(x='',y='No. individuals',
title="Diabetes in overall population in US 2009-2010",
front=1,
subtitle="stratified by race, gender and diabetes mellitus")
p2#用BMI填入資料中,製作成沖積圖alluvium
p3<-p2+
geom_alluvium(aes(fill=BMI))
p3p4<-p3+
geom_stratum(reverse = FALSE)
#加上reverse=FALSE 資料會反轉
p4p5<-p4+
geom_text(stat = "stratum", size=3,
aes(label = after_stat(stratum)),
reverse = FALSE)
#同p4,需加reverse=FALSE,否則標籤部會隨著資料反轉
p5p6<-p5+
scale_fill_manual(values=c('gray15','orange'))+
#標籤放在底部
theme(legend.position='bottom')
p6