Find out what each code chunk (indicated by ‘##’) in the R scriptdoes and provide comments ##
#先畫框
plot(women, type='n') #type='n'是指不畫點或線
#再畫第一列資料點
points(women[1,])
#上兩式合起來可以這樣表達
plot(women[1,], type='p') #type='p'畫點
# 使用 lattice
::xyplot(weight ~ height,
latticedata=women,
subset=row.names(women)==1, type='p') #type='p'畫點
# 使用 ggplot2
library(ggplot2)
#aes(height, weight)產生x 軸為身高, y 軸為體重的畫布
ggplot(data=women[1,], aes(height, weight))+
geom_point() #add points
The data set is concerned with grade 8 pupils (age about 11 years) in elementary schools in the Netherlands.
After deleting pupils with missing values, the number of pupils is 2,287 and the number of schools is 131.Class size ranges from 4 to 35.
The response variables are score on a language test and that on an arithmetic test.
The research interest is on how the two test scores depend on the pupil’s intelligence (verbal IQ) and on the number of pupils in a school class.
The class size is categorized into small, medium, and large with roughly equal number of observations in each category.
The verbal IQ is categorized into low, middle and high with roughly equal number of observations in each category.
Reproduce the plot below.
Source: Snijders, T. & Bosker, R. (2002). Multilevel Analysis.
Column 1: School ID Column 2: Pupil ID Column 3: Verbal IQ score Column 4: The number of pupils in a class Column 5: Language test score Column 6: Arithmetic test score
# input data
<-read.csv("C:/Users/Ching-Fang Wu/Documents/dataM/langMathDutch.txt",header=T,sep='') #sep=''以空白鍵分隔
dta2
# first 6 rows
head(dta2)
## school pupil IQV size lang arith
## 1 1 17001 15.0 29 46 24
## 2 1 17002 14.5 29 45 19
## 3 1 17003 9.5 29 33 24
## 4 1 17004 11.0 29 46 26
## 5 1 17005 8.0 29 20 9
## 6 1 17006 9.5 29 30 13
# structure of data
str(dta2)
## 'data.frame': 2287 obs. of 6 variables:
## $ school: int 1 1 1 1 1 1 1 1 1 1 ...
## $ pupil : int 17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
## $ IQV : num 15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
## $ size : int 29 29 29 29 29 29 29 29 29 29 ...
## $ lang : int 46 45 33 46 20 30 30 57 36 36 ...
## $ arith : int 24 19 24 26 9 13 13 30 23 22 ...
typeof(dta2)
## [1] "list"
# install tools
::p_load(dplyr, magrittr)
pacman
# rename
%<>%
dta2 ::rename(School_ID=school, Pupil_ID=pupil,class_size=size) dplyr
#
summary(dta2$class_size)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.0 17.0 24.0 23.1 28.0 37.0
# class_size百分位數
quantile(dta2$class_size, probs=seq(from=0, to=1, by=.33))
## 0% 33% 66% 99%
## 5 20 27 37
quantile(order(dta2$class_size), probs=seq(from=0, to=1, by=.33))
## 0% 33% 66% 99%
## 1.00 755.38 1509.76 2264.14
# 將資料切割分為三組,並新增變數
$sizef <- with(dta2, cut(class_size, ordered=T, breaks=c(0, 19, 27, 50), labels=c("Small", "Medium", "Large"))) #break=3則是依照class size的全距來切三段
dta2
#
with(dta2, table(sizef))
## sizef
## Small Medium Large
## 749 837 701
班級人數在[0-10)人為small,總計有749人;[19-27)人為medium,總計有837人; 27人以上為large,總計有701人
#
summary(dta2$IQV)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.00 10.50 12.00 11.83 13.00 18.00
# verbal IQ百分位數
quantile(dta2$IQV, probs=seq(from=0, to=1, by=.33))
## 0% 33% 66% 99%
## 4.0 11.0 12.5 17.0
quantile(order(dta2$IQV), probs=seq(from=0, to=1, by=.33))
## 0% 33% 66% 99%
## 1.00 755.38 1509.76 2264.14
# 將IQ資料切割分為三組,並新增變數
$IQVf <- with(dta2, cut(IQV, order=T, breaks = c(0, 11, 12.5, 20), labels = c("Low", "Middle", "High")))
dta2
#
with(dta2, table(IQVf))
## IQVf
## Low Middle High
## 825 758 704
Verbal IQ在[0-11)者為Low組,總計有825人;在[11-12.5)者為Middle組,總計有758人;12.5分以上者為High組,總計有704人。
library(ggplot2)
# 開一張畫布canvas
<- ggplot(data=dta2,
p0 aes(x=lang, y=arith))
p0
# add points of data
<- p0 +
p1 geom_point()
p1
#grouping by color
<- p1 +
p2 geom_point(data=dta2,
aes(color=IQVf))
p2
#grouping by color
<- p1 +
p3 geom_point(data=dta2,
aes(color=sizef))
p3
#smoothing lines加回歸線和信賴區間
##整體資料的回歸線
<- p1 +
p4 stat_smooth(data=dta2,
formula=y ~ x,
method='lm',
se=TRUE) #se=TRUE加信賴區間
p4
##IQ的回歸線和信賴區間
<- p2 +
p5 stat_smooth(data=dta2,
aes(color=IQVf),
formula = y ~ x,
method='lm',
se=TRUE)
p5
##class size的回歸線和信賴區間
<- p3 +
p6 stat_smooth(data=dta2,
aes(color=sizef),
formula = y ~ x,
method='lm',
se=TRUE)
p6
#加上XY軸標籤axis labels
<- p4+
p7 labs(x="Language score",
y="Arithmetic score")
p7
#調整Y軸vertical axis
<-p7+scale_y_continuous(limits=c(0, 30),
p8 breaks=seq(5, 30, 5))
p8
# 分組panel grid
<- p8 +
p9 facet_grid(. ~ sizef + IQVf)
p9
<- p8 +
p10 facet_grid(IQVf ~ sizef)
#facet_wrap(.~IQVf +sizef, nrow=3, labeller=label(.mlti_line=F))
p10
排列方式對了,但是分組標籤和題目的要求不一樣…
#facet_wrap()和facet_grid()效果不同
::p_load(patchwork, gridExtra)
pacman
<- p8 +
p10 facet_wrap(.~IQVf +sizef, nrow=3, labeller= labeller(.multi_line= F))
p10
參考資料facet_wrap()和facet_grid()差異
(2)https://statisticsglobe.com/difference-between-facet_grid-and-facet_wrap-ggplot2-r
A sample of 158 children with autisim spectrum disorder were recruited.
Social development was assessed using the Vineland Adaptive Behavior Interview survey form, a parent-reported measure of socialization.
It is a combined score that included assessment of interpersonal relationships, play/leisure time activities, and coping skills.
Initial language development was assessed using the Sequenced Inventory of Communication Development (SICD) scale.
These assessments were repeated on these children when they were 3, 5, 9, 13 years of age.
Source: West, B.T., Welch, K.B., & Galecki, A.T. (2002). Linear Mixed Models: Practical Guide Using Statistical Software. p. 220-271.
Data: autism{WWGbook}
Column 1: Age (in years) Column 2: Vineland Socialization Age Equivalent score Column 3: Sequenced Inventory of Communication Development Expressive Group (1 = Low, 2 = Medium, 3 = High) Column 4: Child ID
Replicate the two plots above using ggplot2.
# input data
data(autism, package="WWGbook")
|> head() |> knitr::kable() autism
age | vsae | sicdegp | childid |
---|---|---|---|
2 | 6 | 3 | 1 |
3 | 7 | 3 | 1 |
5 | 18 | 3 | 1 |
9 | 25 | 3 | 1 |
13 | 27 | 3 | 1 |
2 | 17 | 3 | 3 |
<-autism
dta3str(dta3)
## 'data.frame': 612 obs. of 4 variables:
## $ age : int 2 3 5 9 13 2 3 5 9 13 ...
## $ vsae : int 6 7 18 25 27 17 18 12 18 24 ...
## $ sicdegp: int 3 3 3 3 3 3 3 3 3 3 ...
## $ childid: int 1 1 1 1 1 3 3 3 3 3 ...
typeof(dta3)
## [1] "list"
# 新增sicdf為ordinal variable
$sicdf <- factor(dta3$sicdegp, levels = c(1, 2, 3), labels = c("L", "M", "H"))
dta3
# 新增平均年齡meanAge
summary(dta3$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 4.000 5.771 9.000 13.000
<-mean(dta3$age)
m$meanAge<-(dta3$age-m) dta3
library(ggplot2)
#produce canvas開一個畫布
<-ggplot(data=dta3, aes(x=meanAge, y=vsae))+
p0labs(x="Age (in years, centered)", y="VASE score") #axis labels
p0
#
<-p0+geom_point()+ #add points
p1geom_line(aes(group = childid))+ #add lines
geom_smooth(formula=(y~x), method="lm")+ #smoothing lines
facet_grid(. ~sicdf) #panel grid
p1
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
#新增變數
$Age2<-(dta3$age-2)
dta3
#參考講義p.27-29
<- position_dodge(.3)
pd
<- na.omit(dta3) %>% group_by(sicdf, Age2) %>% #na.omit(dta3)把na值刪掉就可以出現第三條線了。
p summarize(m_p=mean(vsae),
se_p=sd(vsae)/sqrt(n()), .groups='drop') %>%
ggplot() +
aes(Age2, m_p,
group=sicdf,
shape=sicdf) +
geom_errorbar(aes(ymin=m_p - se_p,
ymax=m_p + se_p),
width=.2, size=.3,
position=pd) +
geom_line(position=pd,
linetype='dotted') +
geom_point(position=pd,
size=rel(3)) +
scale_shape(guide=guide_legend(title='Group')) +
labs(x="Age (in years - 2)", y="VSAE score") +
theme(legend.position=c(.1, .8))
p
還不知道該如何修改圖示(未完待續….)
Use the diabetes dataset to generate a plot similar to the one below and inteprete the plot.
# 作法參照上課講義p.38
# tool
::p_load(ggalluvial,ggplot2)
pacman# input data
<-read.csv("C:/Users/Ching-Fang Wu/Documents/dataM/diabetes_mell.csv",header=T)
dta4
head(dta4)
## SEQN RIAGENDR RIDRETH1 DIQ010 BMXBMI gender race diabetes BMI
## 1 51624 1 3 2 32.22 Males White No Overweight
## 2 51626 1 4 2 22.00 Males Black No Normal weight
## 3 51627 1 4 2 18.22 Males Black No Normal weight
## 4 51628 2 4 1 42.39 Females Black Yes Overweight
## 5 51629 1 1 2 32.61 Males Hispanic No Overweight
## 6 51630 2 3 2 30.57 Females White No Overweight
str(dta4)
## 'data.frame': 8706 obs. of 9 variables:
## $ SEQN : int 51624 51626 51627 51628 51629 51630 51632 51633 51634 51635 ...
## $ RIAGENDR: int 1 1 1 2 1 2 1 1 1 1 ...
## $ RIDRETH1: int 3 4 4 4 1 3 2 3 1 3 ...
## $ DIQ010 : int 2 2 2 1 2 2 2 2 2 1 ...
## $ BMXBMI : num 32.2 22 18.2 42.4 32.6 ...
## $ gender : chr "Males" "Males" "Males" "Females" ...
## $ race : chr "White" "Black" "Black" "Black" ...
## $ diabetes: chr "No" "No" "No" "Yes" ...
## $ BMI : chr "Overweight" "Normal weight" "Normal weight" "Overweight" ...
typeof(dta4)
## [1] "list"
<- data.frame(with(dta4[, c("gender", "race", "diabetes", "BMI")],
dta_v4 xtabs(~ gender + race + diabetes + BMI)))
dta_v4
## gender race diabetes BMI Freq
## 1 Females Black No Normal weight 347
## 2 Males Black No Normal weight 429
## 3 Females Hispanic No Normal weight 712
## 4 Males Hispanic No Normal weight 706
## 5 Females White No Normal weight 998
## 6 Males White No Normal weight 873
## 7 Females Black Yes Normal weight 6
## 8 Males Black Yes Normal weight 15
## 9 Females Hispanic Yes Normal weight 11
## 10 Males Hispanic Yes Normal weight 11
## 11 Females White Yes Normal weight 12
## 12 Males White Yes Normal weight 20
## 13 Females Black No Overweight 431
## 14 Males Black No Overweight 363
## 15 Females Hispanic No Overweight 688
## 16 Males Hispanic No Overweight 671
## 17 Females White No Overweight 868
## 18 Males White No Overweight 956
## 19 Females Black Yes Overweight 87
## 20 Males Black Yes Overweight 71
## 21 Females Hispanic Yes Overweight 107
## 22 Males Hispanic Yes Overweight 103
## 23 Females White Yes Overweight 103
## 24 Males White Yes Overweight 118
## plot
ggplot(dta_v4,
aes(axis1=race,
axis2=diabetes,
axis3=BMI,
y=Freq)) +
scale_x_discrete(limits=c("race",
"diabetes",
"BMI"),
expand=c(.1, .05)) +
labs(x='Health and Behavior between Races',
y='Count') +
geom_alluvium(aes(fill=gender)) +
geom_stratum() +
geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
scale_fill_manual(values=c('deepskyblue3','hotpink3'))+
theme_minimal() +
ggtitle("Subjects stratified by race, diabetes, BMI, and gender")
更換顏色可參考:https://www.datanovia.com/en/blog/awesome-list-of-657-r-color-names/