inclass 1

Find out what each code chunk (indicated by ‘##’) in the R script does and provide comments.

#women是內建資料檔,只有2個variables(height和weight)
head(women)
  height weight
1     58    115
2     59    117
3     60    120
4     61    123
5     62    126
6     63    129
#basic R,畫women的plot
#The type = "n" is used when you want to draw multiple graphs (say lines) on same plot then type = "n" can be used prepare the layout with limits and label of axis. 
plot(women, type='n')
#畫上一個點,women第一個row的資料
points(women[1,])

# 用lattice畫xyplot
# Y=weight X=height data來自women
# 只畫一筆資料subset=row.names(women)==1  
# type:1-character string giving the type of plot desired. default就是p
# 這邊只有一個subject,所以選其他的type不會顯示
lattice::xyplot(weight ~ height, 
                data=women,
                subset=row.names(women)==1, type='p')

lattice::xyplot(weight ~ height, 
                data=women,
                type='p')

# weight ~ height的資料以line表示
lattice::xyplot(weight ~ height, 
                data=women,
                type='l')

library(ggplot2)
# 畫ggplot只畫表格X、Y軸
# aes(X=height, Y=weight),不寫X、Y也沒關係,用位置判斷
ggplot(data=women[1,], aes(height, weight)) 

#畫好表格後+加上資料,延用資料data=women[1,],所以只會有一個point
ggplot(data=women[1,], aes(height, weight)) +
  geom_point()

# 比較basic plot跟lattice xyplot畫出來資料的class
w1 <- plot(women, type='n')

w2 <- lattice::xyplot(weight ~ height, data=women)
# class(w1) R不知道這是甚麼,因為basic plot在class上沒甚麼意義
class(w1)
[1] "NULL"
# class(w2) lattice:A trellis object, as returned by high level lattice functions like xyplot, is a list with the "class" attribute set to "trellis"
class(w2)
[1] "trellis"
# trellis的內建功能有哪些
methods(class='trellis')
[1] [          dim        dimnames   dimnames<- plot       print      summary   
[8] t          update    
see '?methods' for accessing help and source code

inclass 2

The data set is concerned with grade 8 pupils (age about 11 years) in elementary schools in the Netherlands. After deleting pupils with missing values, the number of pupils is 2,287 and the number of schools is 131. Class size ranges from 4 to 35. The response variables are score on a language test and that on an arithmetic test. The research intest is on how the two test scores depend on the pupil’s intelligence (verbal IQ) and on the number of pupils in a school class. The class size is categorized into small, medium, and large with roughly equal number of observations in each category. The verbal IQ is categorized into low, middle and high with roughly equal number of observations in each category. Reproduce the plot below. Source: Snijders, T. & Bosker, R. (2002). Multilevel Analysis.

班級分為小、中、大(依比例) IQ分為低、中、高(依比例)

knitr::include_graphics("langMath.png")

dta<-read.table("D:/langMathDutch.txt", header=TRUE,sep = "")
head(dta)
  school pupil  IQV size lang arith
1      1 17001 15.0   29   46    24
2      1 17002 14.5   29   45    19
3      1 17003  9.5   29   33    24
4      1 17004 11.0   29   46    26
5      1 17005  8.0   29   20     9
6      1 17006  9.5   29   30    13
str(dta)
'data.frame':   2287 obs. of  6 variables:
 $ school: int  1 1 1 1 1 1 1 1 1 1 ...
 $ pupil : int  17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
 $ IQV   : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
 $ size  : int  29 29 29 29 29 29 29 29 29 29 ...
 $ lang  : int  46 45 33 46 20 30 30 57 36 36 ...
 $ arith : int  24 19 24 26 9 13 13 30 23 22 ...
summary(dta)
     school        pupil              IQV             size           lang      
 Min.   :  1   Min.   :  17001   Min.   : 4.00   Min.   : 5.0   Min.   : 9.00  
 1st Qu.: 67   1st Qu.: 677013   1st Qu.:10.50   1st Qu.:17.0   1st Qu.:35.00  
 Median :141   Median :1417007   Median :12.00   Median :24.0   Median :42.00  
 Mean   :133   Mean   :1337225   Mean   :11.83   Mean   :23.1   Mean   :40.93  
 3rd Qu.:195   3rd Qu.:1957204   3rd Qu.:13.00   3rd Qu.:28.0   3rd Qu.:48.00  
 Max.   :258   Max.   :2587010   Max.   :18.00   Max.   :37.0   Max.   :58.00  
     arith      
 Min.   : 2.00  
 1st Qu.:14.00  
 Median :20.00  
 Mean   :19.44  
 3rd Qu.:25.00  
 Max.   :30.00  
library(dplyr)
#class size分組(column size),以總人數均分small, medium, large
#verbal IQ分組(column IQV),以人數均分為low, middle, high 
#選前六項,列成table
dta %<>% 
 mutate(class_f=cut(size, 
                    breaks=quantile(size, probs=c(0, .33, .66, 1)),
                    label=c("small", "medium", "large"), 
                    ordered=T)) %>%
 mutate(IQV_f=cut(IQV, 
                    breaks=quantile(IQV, probs=c(0, .33, .66, 1)),
                    label=c("low", "middle", "high"), 
                    ordered=T)) 
head(dta)
  school pupil  IQV size lang arith class_f IQV_f
1      1 17001 15.0   29   46    24   large  high
2      1 17002 14.5   29   45    19   large  high
3      1 17003  9.5   29   33    24   large   low
4      1 17004 11.0   29   46    26   large   low
5      1 17005  8.0   29   20     9   large   low
6      1 17006  9.5   29   30    13   large   low
str(dta)
'data.frame':   2287 obs. of  8 variables:
 $ school : int  1 1 1 1 1 1 1 1 1 1 ...
 $ pupil  : int  17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
 $ IQV    : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
 $ size   : int  29 29 29 29 29 29 29 29 29 29 ...
 $ lang   : int  46 45 33 46 20 30 30 57 36 36 ...
 $ arith  : int  24 19 24 26 9 13 13 30 23 22 ...
 $ class_f: Ord.factor w/ 3 levels "small"<"medium"<..: 3 3 3 3 3 3 3 3 3 3 ...
 $ IQV_f  : Ord.factor w/ 3 levels "low"<"middle"<..: 3 3 1 1 1 1 1 3 1 1 ...
p1<- ggplot(dta, 
            aes(lang, arith)) +
  geom_point(size=rel(.5)) +
  stat_smooth(method="lm", 
              formula=y ~ x,
              se=T, #95%的預測線,default是T,不要顯示預測線要=F
              col='blue',
              size=rel(.5)) +
  labs(x='Language score',
       y='Arithmetic score')+
  theme_bw()
p1

p2<-p1+
  facet_grid(dta$class_f~dta$IQV_f)
p2

#發現圖有NA
#看一下有多少NA數值
sum(is.na(dta))
[1] 7
#刪除遺漏值存成rm.dta
rm.dta <- dta[complete.cases(dta), ]
#重畫一次
p1 <- ggplot(rm.dta, 
             aes(lang, arith)) +
  geom_point(size=rel(.5), 
             alpha=.5) +
  stat_smooth(method="lm", 
              formula=y ~ x,
              se=T,
              col='blue',
              size=rel(.5)) +
  labs(x='Language score',
       y='Arithmetic score')+
  theme_bw()
p1

p2<-p1+
  facet_grid(rm.dta$class_f~rm.dta$IQV_f) 
p2

#畫出來跟我們想要的不太一樣(雖然意思一樣
#參考婉禎建議facet_wrap指令
#facets表示形式爲:~變量(~單元格) #facet_grid是基於兩個因子進行設置,facets表示形式爲:變量~變量(行~列)
#如果把一個因子用點表示,也可以達到facet_wrap的效果,也可以用加號設置成兩個以上變量,例如:變量+變量~變量 的形式,表示對三個變量設置分面。
p3<-p1+
  facet_wrap(.~IQV_f+class_f,  #設定哪兩個變項要放一起
             nrow=3)
p3

p4<-p1+
  facet_wrap(.~IQV_f+class_f,  #設定哪兩個變項要放一起
             nrow=3, #設定圖形有幾個row 
             labeller=labeller(.mlti_line=F)) #有沒有加這行似乎沒甚麼差別
p4

inclass 3

sample of 158 children with autisim spectrum disorder were recruited. Social development was assessed using the Vineland Adaptive Behavior Interview survey form, a parent-reported measure of socialization. It is a combined score that included assessment of interpersonal relationships, play/leisure time activities, and coping skills. Initial language development was assessed using the Sequenced Inventory of Communication Development (SICD) scale. These assessments were repeated on these children when they were 3, 5, 9, 13 years of age. Source: West, B.T., Welch, K.B., & Galecki, A.T. (2002). Linear Mixed Models: Practical Guide Using Statistical Software. p. 220-271. Data: autism{WWGbook}

knitr::include_graphics("VSAE1.png")

knitr::include_graphics("VSAE2.png")

library(WWGbook)
dta<-autism
#'data.frame':  612 obs. of  4 variables
attach(dta)
str(dta)
'data.frame':   612 obs. of  4 variables:
 $ age    : int  2 3 5 9 13 2 3 5 9 13 ...
 $ vsae   : int  6 7 18 25 27 17 18 12 18 24 ...
 $ sicdegp: int  3 3 3 3 3 3 3 3 3 3 ...
 $ childid: int  1 1 1 1 1 3 3 3 3 3 ...
head(dta)
  age vsae sicdegp childid
1   2    6       3       1
2   3    7       3       1
3   5   18       3       1
4   9   25       3       1
5  13   27       3       1
6   2   17       3       3
#看一下各類別變項的次數分配狀況
sum1<-summarise(dta,
                nchildren=n(),
                nage=n_distinct(age),
                nsicdegp=n_distinct(sicdegp))
sum1
  nchildren nage nsicdegp
1       612    5        3
#看不懂原始圖片(Age, in years, centered)的意思
#先看一下dta$age的資料結構
summary(dta$age)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.000   2.000   4.000   5.771   9.000  13.000 
#猜測center是mean,但看起來不像
p0<- ggplot(dta, 
            aes(x=age-5.771, y=vsae)) +
  geom_line(aes(group=childid),linetype='solid', Width=1, col="gray80")+
  geom_point(size=rel(1)) 
p0

#猜測center是median,但看起來也是不像
p0m<- ggplot(dta, 
            aes(x=age-4, y=vsae)) +
  geom_line(aes(group=childid),linetype='solid', Width=1, col="gray80")+
  geom_point(size=rel(1)) 
p0m

#放棄猜測center的意思
p1<- ggplot(dta, 
            aes(x=age, y=vsae)) +
  geom_line(aes(group=childid),linetype='solid', Width=1, col="gray80")+
  geom_point(size=rel(1)) 
p1

p2<-p1+
  stat_smooth(method="lm", 
              formula=y ~ x,
              se=T,
              col='blue',
              size=rel(1)) +
  labs(x='Age',
       y='VSA score')+
  theme_bw()
p2

#加上facet_grid後,資料按照原來的label 1,2,3
p3<-p2+
  facet_grid(.~dta$sicdegp) 
p3

#給變項sicdegp新的label name

sicdegp<- factor(sicdegp)
dta$sicdegp <-factor(dta$sicdegp, 
                      levels=c(1, 2, 3),
                      labels=c("L", "M", "H"))

p1<- ggplot(dta, 
            aes(x=age, y=vsae)) +
  geom_line(aes(group=childid),linetype='solid', Width=1, col="gray80")+
  geom_point(size=rel(1)) 
p1

p2<-p1+
  stat_smooth(method="lm", 
              formula=y ~ x,
              se=T,
              col='blue',
              size=rel(1)) +
  labs(x='Age',
       y='VSA score')+
  theme_bw()
p2

p3<-p2+
  facet_grid(.~dta$sicdegp) 
p3

#除了(Age, in years, centered)之外,其他還蠻像的
#畫圖要依據age的分組先算mean跟se of vsae 
p <- dta %>% group_by(age) %>%
summarize(m_vsae=mean(vsae), 
se_vsae=sd(vsae)/sqrt(n()), .groups='drop') 
p
# A tibble: 5 x 3
    age m_vsae se_vsae
  <int>  <dbl>   <dbl>
1     2   9.09   0.309
2     3  NA     NA    
3     5  21.5    1.40 
4     9  NA     NA    
5    13  60.6    5.02 
#不知道為何age=3、age=5這兩組mean跟se算不出來?
#mean(dta$vsae)也是NA
mean(dta$vsae)
[1] NA
#感覺是資料有問題
#結果發現column vsae 有 NA's:2  
summary(dta)
      age              vsae        sicdegp    childid      
 Min.   : 2.000   Min.   :  1.00   L:192   Min.   :  1.00  
 1st Qu.: 2.000   1st Qu.: 10.00   M:255   1st Qu.: 48.75  
 Median : 4.000   Median : 14.00   H:165   Median :107.50  
 Mean   : 5.771   Mean   : 26.41           Mean   :105.38  
 3rd Qu.: 9.000   3rd Qu.: 27.00           3rd Qu.:158.00  
 Max.   :13.000   Max.   :198.00           Max.   :212.00  
                  NA's   :2                                
#再確認一次na的個數=2
sum(is.na(dta))
[1] 2

K-Nearest Neighbours(KNN)運用在遺漏值填補上:{DMwR}

找和自己很像的K個附近的資料,然後從他們身上複製自己所沒有的東西。 安裝的時候出現Warning in install.packages :package ‘DMwR’ is not available for this version of R.A version of this package for your version of R might be available elsewhere,see the ideas at

https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages

MICE運用在遺漏值填補上:{mice}

提供了很多資料探勘的模型來預測遺漏值

library(mice)
#遺漏值的處理
#選擇1:直接刪除一整筆資料,缺點是少了一筆資料
#選擇2:平均值來填補遺漏值,缺點是對資料不會有甚麼影響(好像也是優點?)
#選擇3:最推崇的就是「k-NearestNeighbours」或「mice套件」來填補遺漏值
#mice的全名為Multivariate Imputation via Chained Equations
#看missing data的pattern
# There are 610 observations with no missing values. 
# There are 2 observations with missing values in vsae. 
md.pattern(dta)

    age sicdegp childid vsae  
610   1       1       1    1 0
2     1       1       1    0 1
      0       0       0    2 2
#填補遺漏值
#細節有點複雜(還搞不太懂)
imputed_dta<-mice(dta, m=5, maxit=50, method='pmm', seed=500)

 iter imp variable
  1   1  vsae
  1   2  vsae
  1   3  vsae
  1   4  vsae
  1   5  vsae
  2   1  vsae
  2   2  vsae
  2   3  vsae
  2   4  vsae
  2   5  vsae
  3   1  vsae
  3   2  vsae
  3   3  vsae
  3   4  vsae
  3   5  vsae
  4   1  vsae
  4   2  vsae
  4   3  vsae
  4   4  vsae
  4   5  vsae
  5   1  vsae
  5   2  vsae
  5   3  vsae
  5   4  vsae
  5   5  vsae
  6   1  vsae
  6   2  vsae
  6   3  vsae
  6   4  vsae
  6   5  vsae
  7   1  vsae
  7   2  vsae
  7   3  vsae
  7   4  vsae
  7   5  vsae
  8   1  vsae
  8   2  vsae
  8   3  vsae
  8   4  vsae
  8   5  vsae
  9   1  vsae
  9   2  vsae
  9   3  vsae
  9   4  vsae
  9   5  vsae
  10   1  vsae
  10   2  vsae
  10   3  vsae
  10   4  vsae
  10   5  vsae
  11   1  vsae
  11   2  vsae
  11   3  vsae
  11   4  vsae
  11   5  vsae
  12   1  vsae
  12   2  vsae
  12   3  vsae
  12   4  vsae
  12   5  vsae
  13   1  vsae
  13   2  vsae
  13   3  vsae
  13   4  vsae
  13   5  vsae
  14   1  vsae
  14   2  vsae
  14   3  vsae
  14   4  vsae
  14   5  vsae
  15   1  vsae
  15   2  vsae
  15   3  vsae
  15   4  vsae
  15   5  vsae
  16   1  vsae
  16   2  vsae
  16   3  vsae
  16   4  vsae
  16   5  vsae
  17   1  vsae
  17   2  vsae
  17   3  vsae
  17   4  vsae
  17   5  vsae
  18   1  vsae
  18   2  vsae
  18   3  vsae
  18   4  vsae
  18   5  vsae
  19   1  vsae
  19   2  vsae
  19   3  vsae
  19   4  vsae
  19   5  vsae
  20   1  vsae
  20   2  vsae
  20   3  vsae
  20   4  vsae
  20   5  vsae
  21   1  vsae
  21   2  vsae
  21   3  vsae
  21   4  vsae
  21   5  vsae
  22   1  vsae
  22   2  vsae
  22   3  vsae
  22   4  vsae
  22   5  vsae
  23   1  vsae
  23   2  vsae
  23   3  vsae
  23   4  vsae
  23   5  vsae
  24   1  vsae
  24   2  vsae
  24   3  vsae
  24   4  vsae
  24   5  vsae
  25   1  vsae
  25   2  vsae
  25   3  vsae
  25   4  vsae
  25   5  vsae
  26   1  vsae
  26   2  vsae
  26   3  vsae
  26   4  vsae
  26   5  vsae
  27   1  vsae
  27   2  vsae
  27   3  vsae
  27   4  vsae
  27   5  vsae
  28   1  vsae
  28   2  vsae
  28   3  vsae
  28   4  vsae
  28   5  vsae
  29   1  vsae
  29   2  vsae
  29   3  vsae
  29   4  vsae
  29   5  vsae
  30   1  vsae
  30   2  vsae
  30   3  vsae
  30   4  vsae
  30   5  vsae
  31   1  vsae
  31   2  vsae
  31   3  vsae
  31   4  vsae
  31   5  vsae
  32   1  vsae
  32   2  vsae
  32   3  vsae
  32   4  vsae
  32   5  vsae
  33   1  vsae
  33   2  vsae
  33   3  vsae
  33   4  vsae
  33   5  vsae
  34   1  vsae
  34   2  vsae
  34   3  vsae
  34   4  vsae
  34   5  vsae
  35   1  vsae
  35   2  vsae
  35   3  vsae
  35   4  vsae
  35   5  vsae
  36   1  vsae
  36   2  vsae
  36   3  vsae
  36   4  vsae
  36   5  vsae
  37   1  vsae
  37   2  vsae
  37   3  vsae
  37   4  vsae
  37   5  vsae
  38   1  vsae
  38   2  vsae
  38   3  vsae
  38   4  vsae
  38   5  vsae
  39   1  vsae
  39   2  vsae
  39   3  vsae
  39   4  vsae
  39   5  vsae
  40   1  vsae
  40   2  vsae
  40   3  vsae
  40   4  vsae
  40   5  vsae
  41   1  vsae
  41   2  vsae
  41   3  vsae
  41   4  vsae
  41   5  vsae
  42   1  vsae
  42   2  vsae
  42   3  vsae
  42   4  vsae
  42   5  vsae
  43   1  vsae
  43   2  vsae
  43   3  vsae
  43   4  vsae
  43   5  vsae
  44   1  vsae
  44   2  vsae
  44   3  vsae
  44   4  vsae
  44   5  vsae
  45   1  vsae
  45   2  vsae
  45   3  vsae
  45   4  vsae
  45   5  vsae
  46   1  vsae
  46   2  vsae
  46   3  vsae
  46   4  vsae
  46   5  vsae
  47   1  vsae
  47   2  vsae
  47   3  vsae
  47   4  vsae
  47   5  vsae
  48   1  vsae
  48   2  vsae
  48   3  vsae
  48   4  vsae
  48   5  vsae
  49   1  vsae
  49   2  vsae
  49   3  vsae
  49   4  vsae
  49   5  vsae
  50   1  vsae
  50   2  vsae
  50   3  vsae
  50   4  vsae
  50   5  vsae
completedta <-complete(imputed_dta,1)

mice package 參考資料:https://www.gushiciku.cn/pl/258g/zh-tw

參考資料 https://www.analyticsvidhya.com/blog/2016/03/tutorial-powerful-packages-imputing-missing-values/

attach(completedta)
head(completedta)
  age vsae sicdegp childid
1   2    6       H       1
2   3    7       H       1
3   5   18       H       1
4   9   25       H       1
5  13   27       H       1
6   2   17       H       3
summary(completedta)
      age              vsae        sicdegp    childid      
 Min.   : 2.000   Min.   :  1.00   L:192   Min.   :  1.00  
 1st Qu.: 2.000   1st Qu.: 10.00   M:255   1st Qu.: 48.75  
 Median : 4.000   Median : 14.00   H:165   Median :107.50  
 Mean   : 5.771   Mean   : 26.53           Mean   :105.38  
 3rd Qu.: 9.000   3rd Qu.: 27.00           3rd Qu.:158.00  
 Max.   :13.000   Max.   :198.00           Max.   :212.00  
completedta$age_g<-as.factor(completedta$age)
completedta$group<-as.factor(completedta$sicdegp)

str(completedta)
'data.frame':   612 obs. of  6 variables:
 $ age    : int  2 3 5 9 13 2 3 5 9 13 ...
 $ vsae   : int  6 7 18 25 27 17 18 12 18 24 ...
 $ sicdegp: Factor w/ 3 levels "L","M","H": 3 3 3 3 3 3 3 3 3 3 ...
 $ childid: int  1 1 1 1 1 3 3 3 3 3 ...
 $ age_g  : Factor w/ 5 levels "2","3","5","9",..: 1 2 3 4 5 1 2 3 4 5 ...
 $ group  : Factor w/ 3 levels "L","M","H": 3 3 3 3 3 3 3 3 3 3 ...
#填補遺漏值後再算一遍
library(magrittr)
library(ggplot2)
pd <- position_dodge(.3)
pd
<ggproto object: Class PositionDodge, Position, gg>
    compute_layer: function
    compute_panel: function
    preserve: total
    required_aes: 
    setup_data: function
    setup_params: function
    width: 0.3
    super:  <ggproto object: Class PositionDodge, Position, gg>
p <- completedta %>% group_by(age) %>%
summarize(m_vsae=mean(vsae), 
se_vsae=sd(vsae)/sqrt(n()), .groups='drop')
p
# A tibble: 5 x 3
    age m_vsae se_vsae
  <int>  <dbl>   <dbl>
1     2   9.09   0.309
2     3  15.3    0.649
3     5  21.5    1.40 
4     9  40.2    3.03 
5    13  60.6    5.02 
#填補遺漏值後再算一遍
library(magrittr)
library(ggplot2)
pd <- position_dodge(.3)   #點之間的位置錯開0.3單位

completedta %>% group_by(age, group) %>%
summarize(m_vsae=mean(vsae), 
se_vsae=sd(vsae)/sqrt(n()), .groups='drop') %>%
ggplot() +
  aes(x=age-2, m_vsae, 
      group=group,
      shape=group)+
  geom_errorbar(aes(ymin=m_vsae - se_vsae,
                    ymax=m_vsae + se_vsae),
                width=.2, size=.3, 
                position=pd)+
  geom_line(position=pd, 
            linetype='dotted') +   #線的樣態
  geom_point(position=pd, 
             size=rel(2)) +        #改點的大小
  scale_shape(guide=guide_legend(title='Group')) +
  labs(x="Age(in years-2)", y="VSAE score") +
  theme_bw()+   #theme設為bw的模板
  theme(legend.position=c(.1, .7)) #改legend位置

Inclass 4

Use the diabetes dataset to generate a plot similar to the one below and inteprete the plot.

#讀檔案
dta<-read.csv("D://diabetes_mell.csv", head=T)  
#看資料
head(dta)
   SEQN RIAGENDR RIDRETH1 DIQ010 BMXBMI  gender     race diabetes           BMI
1 51624        1        3      2  32.22   Males    White       No    Overweight
2 51626        1        4      2  22.00   Males    Black       No Normal weight
3 51627        1        4      2  18.22   Males    Black       No Normal weight
4 51628        2        4      1  42.39 Females    Black      Yes    Overweight
5 51629        1        1      2  32.61   Males Hispanic       No    Overweight
6 51630        2        3      2  30.57 Females    White       No    Overweight
summary(dta)
      SEQN          RIAGENDR        RIDRETH1         DIQ010     
 Min.   :51624   Min.   :1.000   Min.   :1.000   Min.   :1.000  
 1st Qu.:54229   1st Qu.:1.000   1st Qu.:2.000   1st Qu.:2.000  
 Median :56852   Median :2.000   Median :3.000   Median :2.000  
 Mean   :56869   Mean   :1.502   Mean   :2.623   Mean   :1.924  
 3rd Qu.:59504   3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.:2.000  
 Max.   :62160   Max.   :2.000   Max.   :4.000   Max.   :2.000  
     BMXBMI         gender              race             diabetes        
 Min.   :12.58   Length:8706        Length:8706        Length:8706       
 1st Qu.:20.22   Class :character   Class :character   Class :character  
 Median :25.44   Mode  :character   Mode  :character   Mode  :character  
 Mean   :26.01                                                           
 3rd Qu.:30.40                                                           
 Max.   :84.87                                                           
     BMI           
 Length:8706       
 Class :character  
 Mode  :character  
                   
                   
                   
str(dta)
'data.frame':   8706 obs. of  9 variables:
 $ SEQN    : int  51624 51626 51627 51628 51629 51630 51632 51633 51634 51635 ...
 $ RIAGENDR: int  1 1 1 2 1 2 1 1 1 1 ...
 $ RIDRETH1: int  3 4 4 4 1 3 2 3 1 3 ...
 $ DIQ010  : int  2 2 2 1 2 2 2 2 2 1 ...
 $ BMXBMI  : num  32.2 22 18.2 42.4 32.6 ...
 $ gender  : chr  "Males" "Males" "Males" "Females" ...
 $ race    : chr  "White" "Black" "Black" "Black" ...
 $ diabetes: chr  "No" "No" "No" "Yes" ...
 $ BMI     : chr  "Overweight" "Normal weight" "Normal weight" "Overweight" ...
#講義page 38
#參考https://cran.r-project.org/web/packages/ggalluvial/vignettes/order-rectangles.html
#pacman::p_load(ggalluvial)
library(ggalluvial)
dta_1 <- data.frame(with(dta[, c("BMI", "race", "gender", "diabetes")], 
                         xtabs(~ BMI + diabetes + gender + race)))
attach(dta_1)
p1<-ggplot(dta_1, 
       aes(axis1=race,
           axis2=gender, 
           axis3=diabetes, 
           y=Freq)) 
p1

p2<-p1+
#x軸畫出離散圖,column為race, gender, diabetes
#expands是column的間距
  scale_x_discrete(limits=c("race", 
                            "gender", 
                            "diabetes"), 
                   expand=c(0.1, 0.05)) +
#X軸名稱空著
#Y軸命名
#圖片給主題
  labs(x='',y='No. individuals',
       title="Diabetes in overall population in US 2009-2010",
       front=1,
       subtitle="stratified by race, gender and diabetes mellitus")
p2

#用BMI填入資料中,製作成沖積圖alluvium
p3<-p2+
  geom_alluvium(aes(fill=BMI)) 
p3

p4<-p3+
  geom_stratum(reverse = FALSE) 
#加上reverse=FALSE 資料會反轉
p4

p5<-p4+
  geom_text(stat = "stratum", size=3, 
            aes(label = after_stat(stratum)),
            reverse = FALSE)
#同p4,需加reverse=FALSE,否則標籤部會隨著資料反轉
p5

p6<-p5+
  scale_fill_manual(values=c('gray15','orange'))+
#標籤放在底部
  theme(legend.position='bottom')
p6