inclass1- basic, trellis, ggplot

Find out what each code chunk (indicated by ‘##’) in the R script does and provide comments.

# basic plot
plot(women, type='n') #只有畫布
points(women[1,]) #畫點

# Trellis
lattice::xyplot(weight~height, #先告知變項名
                data=women, #資料
                subset=row.names(women)==1, type='p') #用subset只取row1

# Trellis
lattice::xyplot(weight~height, 
                data=women[1,], #指定row1資料
                type='p') #畫點

# ggplot
pacman::p_load(ggplot2)
ggplot(data=women[1,], aes(height, weight))+
  geom_point()

# save basic plot as an object
w1 <- plot(women, type='n')

# save xyplot as an object
w2 <-lattice::xyplot(weight~height, data=women)
# NULL, basic plot can not save as an object
class(w1)
[1] "NULL"
# trellis, xyplot save as a trellis object
class(w2)
[1] "trellis"
# list all avaliable method in trellis object
methods(class='trellis')
[1] [          dim        dimnames   dimnames<- plot       print      summary   
[8] t          update    
see '?methods' for accessing help and source code

inclass2

The data set is concerned with grade 8 pupils (age about 11 years) in elementary schools in the Netherlands. After deleting pupils with missing values, the number of pupils is 2,287 and the number of schools is 131. Class size ranges from 4 to 35. The response variables are score on a language test and that on an arithmetic test. The research intest is on how the two test scores depend on the pupil’s intelligence (verbal IQ) and on the number of pupils in a school class. The class size is categorized into small, medium, and large with roughly equal number of observations in each category. The verbal IQ is categorized into low, middle and high with roughly equal number of observations in each category. Reproduce the plot below.

Source: Snijders, T. & Bosker, R. (2002). Multilevel Analysis.

Column 1: School ID Column 2: Pupil ID Column 3: Verbal IQ score Column 4: The number of pupils in a class Column 5: Language test score Column 6: Arithmetic test score

#IQ categorized into low, medium, high
#class size categorized into small, medium, large
pacman::p_load(dplyr)
dta2 <- read.table("langMathDutch.txt", header = T)
dta2 <- dta2 |> mutate(
  IQL = cut(IQV, 
             breaks= quantile(IQV, probs=c(0, .25, .75, 1)),
             label= c("Low","Middle","High"),
             ordered=T,
             include.lowest=T),
   classL = cut(size, 
             breaks= quantile(size, probs=c(0, .25, .75, 1)),
             label= c("Small","Medium","High"),
             ordered=T,
             include.lowest=T) 
)
table(dta2$IQL)

   Low Middle   High 
   593   1188    506 
table(dta2$classL)

 Small Medium   High 
   608   1120    559 
pacman::p_load(patchwork, gridExtra)
p1<-ggplot(dta2,
aes(lang, arith)) +
stat_smooth(method="lm",
formula=y ~ x) +
geom_point(shape=20) +
labs(x="Language score", y="Arithmetic score") +
theme_bw()

p2<- p1+facet_wrap(.~classL+IQL, nrow = 3, labeller= labeller(.multi_line = F)) #.multi_line 可以讓兩個變項的LABLE合成一個

p3<- p1+facet_grid(~ classL+ IQL)#facet_grid(. ~ classL+ IQL)

p1

p2 #facet_wrap可以自由排版分面行方向的个数(nrow=)和列方向的个数(ncol=)

p3 #横分面是.~x,縱分面是y.~,y~x表兩維度分面繪圖,無法自由排版分面

inclass3

A sample of 158 children with autisim spectrum disorder were recruited. Social development was assessed using the Vineland Adaptive Behavior Interview survey form, a parent-reported measure of socialization. It is a combined score that included assessment of interpersonal relationships, play/leisure time activities, and coping skills. Initial language development was assessed using the Sequenced Inventory of Communication Development (SICD) scale. These assessments were repeated on these children when they were 3, 5, 9, 13 years of age. Source: West, B.T., Welch, K.B., & Galecki, A.T. (2002). Linear Mixed Models: Practical Guide Using Statistical Software. p. 220-271.

Data: autism{WWGbook}

Column 1: Age (in years) Column 2: Vineland Socialization Age Equivalent score Column 3: Sequenced Inventory of Communication Development Expressive Group (1 = Low, 2 = Medium, 3 = High) Column 4: Child ID

data(autism, package = "WWGbook")
dta3<-autism
str(dta3)
'data.frame':   612 obs. of  4 variables:
 $ age    : int  2 3 5 9 13 2 3 5 9 13 ...
 $ vsae   : int  6 7 18 25 27 17 18 12 18 24 ...
 $ sicdegp: int  3 3 3 3 3 3 3 3 3 3 ...
 $ childid: int  1 1 1 1 1 3 3 3 3 3 ...
dta3$sicdegp <- factor(dta3$sicdegp, levels = 1:3, labels = c("L","M","H"))
ggplot(dta3, aes(x= scale(age, center = T, scale = F), 
                 y=vsae))+
       geom_line(aes(group=childid))+ #line grouped by childid  
         geom_point()+
         geom_smooth(formula=y~poly(x,2), method="lm", se=T)+ #poly curve
         facet_grid(.~sicdegp)+
  labs(x="Age (in years, centered)", y="VSAE score")+
  theme_minimal()

pd <-position_dodge(.3)#避免重疊
str(dta3)
'data.frame':   612 obs. of  4 variables:
 $ age    : int  2 3 5 9 13 2 3 5 9 13 ...
 $ vsae   : int  6 7 18 25 27 17 18 12 18 24 ...
 $ sicdegp: Factor w/ 3 levels "L","M","H": 3 3 3 3 3 3 3 3 3 3 ...
 $ childid: int  1 1 1 1 1 3 3 3 3 3 ...
na.omit(dta3) |>
  group_by(age, sicdegp) |>
  summarize(m_p = mean(vsae),
            se_p = sd(vsae)/sqrt(n()), .groups = "drop") |> #becomes ungrouped
  ggplot(aes(x = age-2, m_p,
             group = sicdegp,
             shape = sicdegp,
             linetype = sicdegp))+ # 資料中個變數與圖形上屬性對應,透過aesthtic包裹起來 
  geom_errorbar(aes(ymin = m_p - se_p,
                    ymax = m_p + se_p),
                width = .2, size = .3,
                position = pd,
                linetype = 1) +
  geom_line(position = pd) +
  geom_point(position = pd, size = rel(3)) + # rel() meand relative, 點大小相對縮放
  scale_shape_manual(values = c(1,2,16)) + # symbol
  labs(x = "Age (in years, -2)", y = "VSEA score") +
  theme_bw() +
  theme(legend.position = c(.1, .8),
        legend.background = element_rect(fill="white",
                                         size=0.5, linetype="solid",
                                         colour = "black"))

原本會獲得Warning: Removed 2 rows containing missing values (geom_point). 因此,第三條線畫不出來,須把它刪除。

inclass4- Alluvial

Use the diabetes dataset to generate a plot similar to the one below and inteprete the plot.

dta4 <- read.csv("diabetes_mell.csv", header = T)

pacman::p_load(ggalluvial)

# 建4個變項freq的dataframe
dtav4 <- data.frame(with(dta4[,c("BMI", "race", "gender", "diabetes")],
                         xtabs(~BMI + race + gender + diabetes)))
str(dtav4)
'data.frame':   24 obs. of  5 variables:
 $ BMI     : Factor w/ 2 levels "Normal weight",..: 1 2 1 2 1 2 1 2 1 2 ...
 $ race    : Factor w/ 3 levels "Black","Hispanic",..: 1 1 2 2 3 3 1 1 2 2 ...
 $ gender  : Factor w/ 2 levels "Females","Males": 1 1 1 1 1 1 2 2 2 2 ...
 $ diabetes: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ Freq    : int  347 431 712 688 998 868 429 363 706 671 ...
# 調整變項level順序
levels(dtav4$race) <- list(Hispanic = "Hispanic", White ="White", Black = "Black")
dtav4$gender <- relevel(dtav4$gender, "Males")
dtav4$diabetes <- relevel(dtav4$diabetes, "Yes")

# plot
ggplot(dtav4,
       aes(axis1=race,
           axis2=gender,
           axis3=diabetes,
           y=Freq))+
  scale_x_discrete(limits=c("race", "gender", "diabetes"),
                   expand = c(.1, .05))+
  labs(y = "No. individuals") +
  geom_alluvium(aes(fill=BMI)) + 
  geom_stratum() +
  geom_text(stat = "stratum",
            aes(label = after_stat(stratum))) +
  scale_fill_manual(values = c("#A6aeb1", "#EFCAAA")) +
  theme_minimal() +
  ggtitle("Diabetes in overall population in US 2009-2010",
          subtitle = "stratified by race, gender and diabetes mellitus") +
  theme(legend.position = "bottom")