Q1

Find out what each code chunk (indicated by ‘##’) in the R scriptdoes and provide comments ##

#先畫框
plot(women, type='n') #type='n'是指不畫點或線

#再畫第一列資料點
points(women[1,]) 

#上兩式合起來可以這樣表達
plot(women[1,], type='p') #type='p'畫點

# 使用 lattice
lattice::xyplot(weight ~ height, 
  data=women,
  subset=row.names(women)==1, type='p') #type='p'畫點

# 使用 ggplot2
library(ggplot2)

#aes(height, weight)產生x 軸為身高, y 軸為體重的畫布
ggplot(data=women[1,], aes(height, weight))+ 
  geom_point() #add points

Q2

The data set is concerned with grade 8 pupils (age about 11 years) in elementary schools in the Netherlands.

After deleting pupils with missing values, the number of pupils is 2,287 and the number of schools is 131.Class size ranges from 4 to 35.

The response variables are score on a language test and that on an arithmetic test.

The research interest is on how the two test scores depend on the pupil’s intelligence (verbal IQ) and on the number of pupils in a school class.

The class size is categorized into small, medium, and large with roughly equal number of observations in each category.

The verbal IQ is categorized into low, middle and high with roughly equal number of observations in each category.

Reproduce the plot below.

Source: Snijders, T. & Bosker, R. (2002). Multilevel Analysis.

Column 1: School ID Column 2: Pupil ID Column 3: Verbal IQ score Column 4: The number of pupils in a class Column 5: Language test score Column 6: Arithmetic test score

Data wrangling

# input data
dta2 <-read.csv("C:/Users/Ching-Fang Wu/Documents/dataM/langMathDutch.txt",header=T,sep='') #sep=''以空白鍵分隔

# first 6 rows 
head(dta2)
##   school pupil  IQV size lang arith
## 1      1 17001 15.0   29   46    24
## 2      1 17002 14.5   29   45    19
## 3      1 17003  9.5   29   33    24
## 4      1 17004 11.0   29   46    26
## 5      1 17005  8.0   29   20     9
## 6      1 17006  9.5   29   30    13
# structure of data
str(dta2)
## 'data.frame':    2287 obs. of  6 variables:
##  $ school: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pupil : int  17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
##  $ IQV   : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
##  $ size  : int  29 29 29 29 29 29 29 29 29 29 ...
##  $ lang  : int  46 45 33 46 20 30 30 57 36 36 ...
##  $ arith : int  24 19 24 26 9 13 13 30 23 22 ...
typeof(dta2)
## [1] "list"
# install tools
pacman::p_load(dplyr, magrittr)

# rename
dta2 %<>%
 dplyr::rename(School_ID=school, Pupil_ID=pupil,class_size=size) 
# 
summary(dta2$class_size)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     5.0    17.0    24.0    23.1    28.0    37.0
# class_size百分位數
quantile(dta2$class_size, probs=seq(from=0, to=1, by=.33)) 
##  0% 33% 66% 99% 
##   5  20  27  37
quantile(order(dta2$class_size), probs=seq(from=0, to=1, by=.33))
##      0%     33%     66%     99% 
##    1.00  755.38 1509.76 2264.14
# 將資料切割分為三組,並新增變數
dta2$sizef <- with(dta2, cut(class_size, ordered=T, breaks=c(0, 19, 27, 50), labels=c("Small", "Medium", "Large"))) #break=3則是依照class size的全距來切三段

#
with(dta2, table(sizef))
## sizef
##  Small Medium  Large 
##    749    837    701

班級人數在[0-10)人為small,總計有749人;[19-27)人為medium,總計有837人; 27人以上為large,總計有701人

# 
summary(dta2$IQV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.00   10.50   12.00   11.83   13.00   18.00
# verbal IQ百分位數
quantile(dta2$IQV, probs=seq(from=0, to=1, by=.33)) 
##   0%  33%  66%  99% 
##  4.0 11.0 12.5 17.0
quantile(order(dta2$IQV), probs=seq(from=0, to=1, by=.33))
##      0%     33%     66%     99% 
##    1.00  755.38 1509.76 2264.14
# 將IQ資料切割分為三組,並新增變數
dta2$IQVf <- with(dta2, cut(IQV, order=T, breaks = c(0, 11, 12.5, 20), labels = c("Low", "Middle", "High")))

#
with(dta2, table(IQVf))
## IQVf
##    Low Middle   High 
##    825    758    704

Verbal IQ在[0-11)者為Low組,總計有825人;在[11-12.5)者為Middle組,總計有758人;12.5分以上者為High組,總計有704人。

ggplot 繪圖

library(ggplot2)

# 開一張畫布canvas
p0 <- ggplot(data=dta2, 
             aes(x=lang, y=arith))
p0

# add points of data
p1 <- p0 +
  geom_point()
p1

#grouping by color
p2 <- p1 + 
  geom_point(data=dta2, 
             aes(color=IQVf))
p2

#grouping by color
p3 <- p1 + 
  geom_point(data=dta2, 
             aes(color=sizef))
p3

#smoothing lines加回歸線和信賴區間

##整體資料的回歸線
p4 <- p1 +
  stat_smooth(data=dta2, 
              formula=y ~ x,
              method='lm', 
              se=TRUE) #se=TRUE加信賴區間
p4

##IQ的回歸線和信賴區間
p5 <- p2 +
  stat_smooth(data=dta2, 
              aes(color=IQVf), 
              formula = y ~ x,
              method='lm', 
              se=TRUE)
p5

##class size的回歸線和信賴區間
p6 <- p3 +
  stat_smooth(data=dta2, 
              aes(color=sizef), 
              formula = y ~ x,
              method='lm', 
              se=TRUE)
p6

#加上XY軸標籤axis labels
p7 <- p4+
  labs(x="Language score", 
       y="Arithmetic score")
p7

#調整Y軸vertical axis
p8 <-p7+scale_y_continuous(limits=c(0, 30), 
                    breaks=seq(5, 30, 5))
p8

# 分組panel grid
p9 <- p8 +
 facet_grid(. ~ sizef + IQVf)
p9

p10 <- p8 +
  facet_grid(IQVf ~ sizef)
  #facet_wrap(.~IQVf +sizef, nrow=3, labeller=label(.mlti_line=F))
p10

排列方式對了,但是分組標籤和題目的要求不一樣…

#facet_wrap()和facet_grid()效果不同

pacman::p_load(patchwork, gridExtra)

p10 <- p8 +
  facet_wrap(.~IQVf +sizef, nrow=3, labeller= labeller(.multi_line= F))
p10

參考資料facet_wrap()和facet_grid()差異

(1)http://zevross.com/blog/2019/04/02/easy-multi-panel-plots-in-r-using-facet_wrap-and-facet_grid-from-ggplot2/

(2)https://statisticsglobe.com/difference-between-facet_grid-and-facet_wrap-ggplot2-r

Q3

A sample of 158 children with autisim spectrum disorder were recruited.

Social development was assessed using the Vineland Adaptive Behavior Interview survey form, a parent-reported measure of socialization.

It is a combined score that included assessment of interpersonal relationships, play/leisure time activities, and coping skills.

Initial language development was assessed using the Sequenced Inventory of Communication Development (SICD) scale.

These assessments were repeated on these children when they were 3, 5, 9, 13 years of age.

Source: West, B.T., Welch, K.B., & Galecki, A.T. (2002). Linear Mixed Models: Practical Guide Using Statistical Software. p. 220-271.

Data: autism{WWGbook}

Column 1: Age (in years) Column 2: Vineland Socialization Age Equivalent score Column 3: Sequenced Inventory of Communication Development Expressive Group (1 = Low, 2 = Medium, 3 = High) Column 4: Child ID

Replicate the two plots above using ggplot2.

# input data
data(autism, package="WWGbook")

autism |> head() |> knitr::kable()
age vsae sicdegp childid
2 6 3 1
3 7 3 1
5 18 3 1
9 25 3 1
13 27 3 1
2 17 3 3
dta3<-autism
str(dta3)
## 'data.frame':    612 obs. of  4 variables:
##  $ age    : int  2 3 5 9 13 2 3 5 9 13 ...
##  $ vsae   : int  6 7 18 25 27 17 18 12 18 24 ...
##  $ sicdegp: int  3 3 3 3 3 3 3 3 3 3 ...
##  $ childid: int  1 1 1 1 1 3 3 3 3 3 ...
typeof(dta3)
## [1] "list"
# 新增sicdf為ordinal variable
dta3$sicdf <- factor(dta3$sicdegp, levels = c(1, 2, 3), labels = c("L", "M", "H"))

# 新增平均年齡meanAge
summary(dta3$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.000   4.000   5.771   9.000  13.000
m<-mean(dta3$age)
dta3$meanAge<-(dta3$age-m)

graphic 1

library(ggplot2)

#produce canvas開一個畫布
p0<-ggplot(data=dta3, aes(x=meanAge, y=vsae))+
    labs(x="Age (in years, centered)", y="VASE score") #axis labels
p0

#
p1<-p0+geom_point()+                         #add points
    geom_line(aes(group = childid))+         #add lines
    geom_smooth(formula=(y~x), method="lm")+ #smoothing lines
    facet_grid(. ~sicdf)                     #panel grid

      
p1
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).

graphic 2

#新增變數
dta3$Age2<-(dta3$age-2)

#參考講義p.27-29
pd <- position_dodge(.3)

p <- na.omit(dta3) %>% group_by(sicdf, Age2) %>%  #na.omit(dta3)把na值刪掉就可以出現第三條線了。
  summarize(m_p=mean(vsae), 
            se_p=sd(vsae)/sqrt(n()), .groups='drop') %>%
  ggplot() + 
  aes(Age2, m_p, 
      group=sicdf,
      shape=sicdf) +
  geom_errorbar(aes(ymin=m_p - se_p,
                    ymax=m_p + se_p),
                width=.2, size=.3, 
                position=pd) +
  geom_line(position=pd, 
            linetype='dotted') +
  geom_point(position=pd, 
             size=rel(3)) +
  scale_shape(guide=guide_legend(title='Group')) +
   labs(x="Age (in years - 2)", y="VSAE score") +
  theme(legend.position=c(.1, .8))
p

還不知道該如何修改圖示(未完待續….)

Q4

Use the diabetes dataset to generate a plot similar to the one below and inteprete the plot.

# 作法參照上課講義p.38
# tool
pacman::p_load(ggalluvial,ggplot2)
# input data
dta4 <-read.csv("C:/Users/Ching-Fang Wu/Documents/dataM/diabetes_mell.csv",header=T)

head(dta4)
##    SEQN RIAGENDR RIDRETH1 DIQ010 BMXBMI  gender     race diabetes           BMI
## 1 51624        1        3      2  32.22   Males    White       No    Overweight
## 2 51626        1        4      2  22.00   Males    Black       No Normal weight
## 3 51627        1        4      2  18.22   Males    Black       No Normal weight
## 4 51628        2        4      1  42.39 Females    Black      Yes    Overweight
## 5 51629        1        1      2  32.61   Males Hispanic       No    Overweight
## 6 51630        2        3      2  30.57 Females    White       No    Overweight
str(dta4)
## 'data.frame':    8706 obs. of  9 variables:
##  $ SEQN    : int  51624 51626 51627 51628 51629 51630 51632 51633 51634 51635 ...
##  $ RIAGENDR: int  1 1 1 2 1 2 1 1 1 1 ...
##  $ RIDRETH1: int  3 4 4 4 1 3 2 3 1 3 ...
##  $ DIQ010  : int  2 2 2 1 2 2 2 2 2 1 ...
##  $ BMXBMI  : num  32.2 22 18.2 42.4 32.6 ...
##  $ gender  : chr  "Males" "Males" "Males" "Females" ...
##  $ race    : chr  "White" "Black" "Black" "Black" ...
##  $ diabetes: chr  "No" "No" "No" "Yes" ...
##  $ BMI     : chr  "Overweight" "Normal weight" "Normal weight" "Overweight" ...
typeof(dta4)
## [1] "list"
dta_v4 <- data.frame(with(dta4[, c("gender", "race", "diabetes", "BMI")], 
                         xtabs(~ gender + race + diabetes + BMI)))
dta_v4
##     gender     race diabetes           BMI Freq
## 1  Females    Black       No Normal weight  347
## 2    Males    Black       No Normal weight  429
## 3  Females Hispanic       No Normal weight  712
## 4    Males Hispanic       No Normal weight  706
## 5  Females    White       No Normal weight  998
## 6    Males    White       No Normal weight  873
## 7  Females    Black      Yes Normal weight    6
## 8    Males    Black      Yes Normal weight   15
## 9  Females Hispanic      Yes Normal weight   11
## 10   Males Hispanic      Yes Normal weight   11
## 11 Females    White      Yes Normal weight   12
## 12   Males    White      Yes Normal weight   20
## 13 Females    Black       No    Overweight  431
## 14   Males    Black       No    Overweight  363
## 15 Females Hispanic       No    Overweight  688
## 16   Males Hispanic       No    Overweight  671
## 17 Females    White       No    Overweight  868
## 18   Males    White       No    Overweight  956
## 19 Females    Black      Yes    Overweight   87
## 20   Males    Black      Yes    Overweight   71
## 21 Females Hispanic      Yes    Overweight  107
## 22   Males Hispanic      Yes    Overweight  103
## 23 Females    White      Yes    Overweight  103
## 24   Males    White      Yes    Overweight  118
## plot
ggplot(dta_v4, 
       aes(axis1=race,
           axis2=diabetes, 
           axis3=BMI, 
           y=Freq)) +
  scale_x_discrete(limits=c("race", 
                            "diabetes", 
                            "BMI"), 
                   expand=c(.1, .05)) +
  labs(x='Health and Behavior between Races', 
       y='Count') +
  geom_alluvium(aes(fill=gender)) +
  geom_stratum() +
  geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
  scale_fill_manual(values=c('deepskyblue3','hotpink3'))+
  theme_minimal() +
  ggtitle("Subjects stratified by race, diabetes, BMI, and gender")

更換顏色可參考:https://www.datanovia.com/en/blog/awesome-list-of-657-r-color-names/