Find out what each code chunk (indicated by ‘##’) in the R script does and provide comments.
# basic plot
plot(women, type='n') #只有畫布
points(women[1,]) #畫點
# Trellis
::xyplot(weight~height, #先告知變項名
latticedata=women, #資料
subset=row.names(women)==1, type='p') #用subset只取row1
# Trellis
::xyplot(weight~height,
latticedata=women[1,], #指定row1資料
type='p') #畫點
# ggplot
::p_load(ggplot2)
pacmanggplot(data=women[1,], aes(height, weight))+
geom_point()
# save basic plot as an object
<- plot(women, type='n') w1
# save xyplot as an object
<-lattice::xyplot(weight~height, data=women) w2
# NULL, basic plot can not save as an object
class(w1)
[1] "NULL"
# trellis, xyplot save as a trellis object
class(w2)
[1] "trellis"
# list all avaliable method in trellis object
methods(class='trellis')
[1] [ dim dimnames dimnames<- plot print summary
[8] t update
see '?methods' for accessing help and source code
The data set is concerned with grade 8 pupils (age about 11 years) in elementary schools in the Netherlands. After deleting pupils with missing values, the number of pupils is 2,287 and the number of schools is 131. Class size ranges from 4 to 35. The response variables are score on a language test and that on an arithmetic test. The research intest is on how the two test scores depend on the pupil’s intelligence (verbal IQ) and on the number of pupils in a school class. The class size is categorized into small, medium, and large with roughly equal number of observations in each category. The verbal IQ is categorized into low, middle and high with roughly equal number of observations in each category. Reproduce the plot below.
Source: Snijders, T. & Bosker, R. (2002). Multilevel Analysis.
Column 1: School ID Column 2: Pupil ID Column 3: Verbal IQ score Column 4: The number of pupils in a class Column 5: Language test score Column 6: Arithmetic test score
#IQ categorized into low, medium, high
#class size categorized into small, medium, large
::p_load(dplyr)
pacman<- read.table("langMathDutch.txt", header = T)
dta2 <- dta2 |> mutate(
dta2 IQL = cut(IQV,
breaks= quantile(IQV, probs=c(0, .25, .75, 1)),
label= c("Low","Middle","High"),
ordered=T,
include.lowest=T),
classL = cut(size,
breaks= quantile(size, probs=c(0, .25, .75, 1)),
label= c("Small","Medium","High"),
ordered=T,
include.lowest=T)
)table(dta2$IQL)
Low Middle High
593 1188 506
table(dta2$classL)
Small Medium High
608 1120 559
::p_load(patchwork, gridExtra)
pacman<-ggplot(dta2,
p1aes(lang, arith)) +
stat_smooth(method="lm",
formula=y ~ x) +
geom_point(shape=20) +
labs(x="Language score", y="Arithmetic score") +
theme_bw()
<- p1+facet_wrap(.~classL+IQL, nrow = 3, labeller= labeller(.multi_line = F)) #.multi_line 可以讓兩個變項的LABLE合成一個
p2
<- p1+facet_grid(~ classL+ IQL)#facet_grid(. ~ classL+ IQL)
p3
p1
#facet_wrap可以自由排版分面行方向的个数(nrow=)和列方向的个数(ncol=) p2
#横分面是.~x,縱分面是y.~,y~x表兩維度分面繪圖,無法自由排版分面 p3
A sample of 158 children with autisim spectrum disorder were recruited. Social development was assessed using the Vineland Adaptive Behavior Interview survey form, a parent-reported measure of socialization. It is a combined score that included assessment of interpersonal relationships, play/leisure time activities, and coping skills. Initial language development was assessed using the Sequenced Inventory of Communication Development (SICD) scale. These assessments were repeated on these children when they were 3, 5, 9, 13 years of age. Source: West, B.T., Welch, K.B., & Galecki, A.T. (2002). Linear Mixed Models: Practical Guide Using Statistical Software. p. 220-271.
Data: autism{WWGbook}
Column 1: Age (in years) Column 2: Vineland Socialization Age Equivalent score Column 3: Sequenced Inventory of Communication Development Expressive Group (1 = Low, 2 = Medium, 3 = High) Column 4: Child ID
data(autism, package = "WWGbook")
<-autism
dta3str(dta3)
'data.frame': 612 obs. of 4 variables:
$ age : int 2 3 5 9 13 2 3 5 9 13 ...
$ vsae : int 6 7 18 25 27 17 18 12 18 24 ...
$ sicdegp: int 3 3 3 3 3 3 3 3 3 3 ...
$ childid: int 1 1 1 1 1 3 3 3 3 3 ...
$sicdegp <- factor(dta3$sicdegp, levels = 1:3, labels = c("L","M","H")) dta3
ggplot(dta3, aes(x= scale(age, center = T, scale = F),
y=vsae))+
geom_line(aes(group=childid))+ #line grouped by childid
geom_point()+
geom_smooth(formula=y~poly(x,2), method="lm", se=T)+ #poly curve
facet_grid(.~sicdegp)+
labs(x="Age (in years, centered)", y="VSAE score")+
theme_minimal()
<-position_dodge(.3)#避免重疊
pd str(dta3)
'data.frame': 612 obs. of 4 variables:
$ age : int 2 3 5 9 13 2 3 5 9 13 ...
$ vsae : int 6 7 18 25 27 17 18 12 18 24 ...
$ sicdegp: Factor w/ 3 levels "L","M","H": 3 3 3 3 3 3 3 3 3 3 ...
$ childid: int 1 1 1 1 1 3 3 3 3 3 ...
na.omit(dta3) |>
group_by(age, sicdegp) |>
summarize(m_p = mean(vsae),
se_p = sd(vsae)/sqrt(n()), .groups = "drop") |> #becomes ungrouped
ggplot(aes(x = age-2, m_p,
group = sicdegp,
shape = sicdegp,
linetype = sicdegp))+ # 資料中個變數與圖形上屬性對應,透過aesthtic包裹起來
geom_errorbar(aes(ymin = m_p - se_p,
ymax = m_p + se_p),
width = .2, size = .3,
position = pd,
linetype = 1) +
geom_line(position = pd) +
geom_point(position = pd, size = rel(3)) + # rel() meand relative, 點大小相對縮放
scale_shape_manual(values = c(1,2,16)) + # symbol
labs(x = "Age (in years, -2)", y = "VSEA score") +
theme_bw() +
theme(legend.position = c(.1, .8),
legend.background = element_rect(fill="white",
size=0.5, linetype="solid",
colour = "black"))
原本會獲得Warning: Removed 2 rows containing missing values (geom_point). 因此,第三條線畫不出來,須把它刪除。
Use the diabetes dataset to generate a plot similar to the one below and inteprete the plot.
<- read.csv("diabetes_mell.csv", header = T)
dta4
::p_load(ggalluvial)
pacman
# 建4個變項freq的dataframe
<- data.frame(with(dta4[,c("BMI", "race", "gender", "diabetes")],
dtav4 xtabs(~BMI + race + gender + diabetes)))
str(dtav4)
'data.frame': 24 obs. of 5 variables:
$ BMI : Factor w/ 2 levels "Normal weight",..: 1 2 1 2 1 2 1 2 1 2 ...
$ race : Factor w/ 3 levels "Black","Hispanic",..: 1 1 2 2 3 3 1 1 2 2 ...
$ gender : Factor w/ 2 levels "Females","Males": 1 1 1 1 1 1 2 2 2 2 ...
$ diabetes: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ Freq : int 347 431 712 688 998 868 429 363 706 671 ...
# 調整變項level順序
levels(dtav4$race) <- list(Hispanic = "Hispanic", White ="White", Black = "Black")
$gender <- relevel(dtav4$gender, "Males")
dtav4$diabetes <- relevel(dtav4$diabetes, "Yes")
dtav4
# plot
ggplot(dtav4,
aes(axis1=race,
axis2=gender,
axis3=diabetes,
y=Freq))+
scale_x_discrete(limits=c("race", "gender", "diabetes"),
expand = c(.1, .05))+
labs(y = "No. individuals") +
geom_alluvium(aes(fill=BMI)) +
geom_stratum() +
geom_text(stat = "stratum",
aes(label = after_stat(stratum))) +
scale_fill_manual(values = c("#A6aeb1", "#EFCAAA")) +
theme_minimal() +
ggtitle("Diabetes in overall population in US 2009-2010",
subtitle = "stratified by race, gender and diabetes mellitus") +
theme(legend.position = "bottom")