1 In-class 1

The ChickWeight.R script illustrates how to implement ‘small multiples’ in base graphics given the 4 different diets of the ChickWeight{datasets} example.

Task: Adapt the script to produce a plot of 5 panels in which each panel shows a histogram of IQ for each of 5 classes with over 30 pupils in the nlschools{MASS} dataset.

1.1 ChickWeight

# input data
dta <- ChickWeight

# split into 4 list based on the Diet column
dta_diet <- split(dta, dta$Diet)

# set Graphical Parameters
# mfrow: A vector of the form c(nr, nc). Subsequent figures will be drawn in an nr-by-nc array on the device by columns (mfcol), or rows (mfrow), respectively.
# mar: A numerical vector of the form c(bottom, left, top, right) which gives the number of lines of margin to be specified on the four sides of the plot. The default is c(5, 4, 4, 2) + 0.1.

par(mfrow=c(2, 2), mar=c(4, 4, 1, 1))

# scatter plot
lapply(dta_diet, 
       function(x) {
         plot(x$weight ~ x$Time, 
              xlab="Time (day)", 
              ylab="Weight (gm)")
         legend('topleft', 
                paste("Diet", x$Diet[1], sep="="), 
                bty='n')}
       )|> 
  invisible()

1.2 nlschools

# input data
dat1 <- MASS::nlschools

# split by class
dat1_class <- split(dat1, dat1$class)

# find the classes with over 30 pupils
pupils30 <- lapply(dat1_class, function(x) nrow(x[])>30)
which(pupils30 == TRUE)
##  5480 15580 15980 16180 18380 
##    26    78    80    82    91
# set plot
par(mfrow=c(1, 5), mar=c(4, 4, 1, 1))

# histogram
lapply(dat1_class[c("5480","15580","15980","16180","18380")], 
       function(x) {
         hist(x$IQ, 
              xlab = "IQ",
              ylab = "Number of student",
              main = paste("class", x$class[1], sep=" "))
         }
       )|> 
  invisible()

2 In-class 2

Deaths per 100,000 from male suicides for 5 age groups and 15 countries are given in the table below. The data set is available as suicides2{HSAUR3}.

Task: Construct a side-by-side box plot of the data across different age groups and comment briefly.

2.1 Data input and manage

# input data
dat2 <- HSAUR3::suicides2

# column rename
names(dat2) <- c("25-34", "35-44", "45-54", "55-64","65-74")

# wide to long
dat2_l <- data.table::melt(dat2, variable.name = "Age", value.name="Number")

2.2 Plot

boxplot(Number ~ Age, 
        data = dat2_l,
        horizontal=T,
        varwidth = T,
        frame = FALSE,
        cex.axis = 0.8,
        xlab="Deaths per 100,000 from male suicides")
abline(v=seq(0, 100, 10), lty=3, col='gray') # grid
abline(v=mean(dat2_l$Number), lty=2, col="#00AD9A") # mean line 
stripchart(Number ~ Age,
           data = dat2_l, 
           add=T,
           col='darkgray', pch=1, 
           cex=.8,
           method='jitter')

  • the higher age group, the higher deaths per 100,000 from male suicides
  • 年輕人的命比較韌?

3 In-class 3

Task: Use the sat_gpa.txt dataset to replicate the plot below:

3.1 Input data

dat3 <- read.table("sat_gpa.txt", header = T)

3.2 Plot

# set margin area for annotation
par(mar=c(10, 2, 2, 2))

with(dat3, plot(SAT_No ~ GPA_No, 
                type="n",
                xlim=c(1150, 1400), 
                ylim=c(2.6, 3.4),
                xlab="SAT(V+M)", 
                ylab="First Year GPA"))
# add pair-line
with(dat3[-3,], segments(SAT_No, GPA_No, SAT_Yes, GPA_Yes, lwd=2, col="black"))
segments(x0=1200, y0=2.85, x1=1312, y1=3.12, lwd=4, col=rgb(0,0,0,.6))

# Point
with(dat3, points(x=SAT_Yes, y=GPA_Yes, pch=21, bg="white", cex=2))
with(dat3, points(x=SAT_No, y=GPA_No, pch=21, bg="black", cex=2))

# Text label of College
with(dat3[-3,], text(SAT_Yes+25, GPA_Yes, labels=College, cex=.8))
## font = 1 is the default text font (neither italic nor bold), font = 2 specifies bold face text, font = 3 specifies italic text, and font = 4 specifies both bold and italic text.
with(dat3, text(1312+25, 3.12, labels="Bowdoin", cex=.8, font=2))

# add legend
legend("topleft", c("Submitted SAT Scores", "Did NOT Submit SAT Scores"), 
       pch=c(21, 19), 
       bg=c("white","black"), 
       pt.cex=1.5, bty="n")

# set axis and scale but without label
axis(1, seq(1150, 1400, length=16), labels = F)
axis(2, seq(2.6, 3.4, length=21), labels = F)

# add annotation
mtext("Figure 1.4 The mean SAT coupled with the mean first-year GPA for the class \n of 1999 at six schools shown for those who submitted SAT scores for admis- \n sion and those who did not", side=1, cex=1, adj = 0, line=7)

3.3 Reference

  • Source: Wainer, H. (2001). Uneducated Guesses. Using Evidence to Uncover Misguided Education Policies.

4 Homework 1

The Law School Admission Test (LSAT) consists of a number of dichotomous items, which can be answered correctly or incorrectly. These items are constructed so that a person who scores high on the test is more likely to do well in a law school. The data set LSAT{ltm} contains answers by 1,000 persons to 5 test items.

Task: Generate a plot similar to the one below:

4.1 Input data

dat1 <- ltm::LSAT

names(dat1) <- c("Item1", "Item2", "Item3", "Item4","Item5")

dat1L <- dat1 |>
  dplyr::mutate(Total = rowSums(dat1,c(1:5)))|>
  tidyr::pivot_longer(cols = starts_with("Item"), 
                      names_to = "Item", 
                      values_to = "Score")

dat1LFreq <- as.data.frame(table(dat1L))

dat1final <- dat1LFreq |>
  dplyr::group_by(Total, Item)|>
  dplyr::mutate(Procorrect=Freq/sum(Freq))|>
  subset(Total!=c(0,5)& Score==1)

dat1final$Total<-as.numeric(as.character(dat1final$Total))

4.2 Plot

plot(0, 0, bty = "n", type = "n",
     xlim = c(0, 5), 
     ylim = c(0, 1), 
     xlab = "Total score", 
     ylab = "Proportional Correct")

# point and line
lines(dat1final$Total[dat1final$Item == "Item1"], 
      dat1final$Procorrect[dat1final$Item == "Item1"], 
      type = "b",
      col=1,
      pch = 1)

lines(dat1final$Total[dat1final$Item == "Item2"], 
      dat1final$Procorrect[dat1final$Item == "Item2"], 
      type = "b",
      col = 2,
      pch = 2)

lines(dat1final$Total[dat1final$Item == "Item3"], 
      dat1final$Procorrect[dat1final$Item == "Item3"], 
      type = "b",
      col = 3,
      pch = 3)

lines(dat1final$Total[dat1final$Item == "Item4"], 
      dat1final$Procorrect[dat1final$Item == "Item4"],
      type = "b",
      col = 4,
      pch = 4)

lines(dat1final$Total[dat1final$Item == "Item5"], 
      dat1final$Procorrect[dat1final$Item == "Item5"],
      type = "b",
      col = 5,
      pch = 5)

grid()

legend("topleft", 
       c("Item 1", "Item 2", "Item 3", "Item 4", "Item 5"), 
       pch = c(1, 2, 3, 4, 5), 
       col = c(1, 2, 3, 4, 5),
       bty = "n", lty = 1)

5 Homework 2

Task: Complete the hw_bmi.R script to create the following plot

5.1 Create data

# height, weight and bmi

# generate weights
wt <- seq(40, 160, length.out=600)

# generate heights
ht <- seq(1.40, 2.00, length.out=600)

# cross wt by ht
wtht <- expand.grid(wt=wt, ht=ht)

# function to compute bmi from wt and ht
bmi_wh <- function(w, h) {w/(h*h)}

# generate data matrix
bmiwtht <- matrix(bmi_wh(wtht$wt, wtht$ht), length(ht), length(wt))

5.2 Plot

# draw the contour

colorbar<-c(rgb(1, 1, 1, 0), rgb( 1, 1, 1, 0),rgb(.94, .50, .50, 0.3))

contour(wt, ht, bmiwtht, bty="n",
        levels = c(18.5, 24.9, 30), 
        drawlabels = F,
        ylab = "Height (m)",
        xlab = "Weight (kg)",
        main = "BMI categories by height and weight")

# add grid lines
grid()

# annotate the bmi categories
text(105, 1.8, "Obese", cex=1, srt=45)
text(92, 1.8, "Overweight", cex=1, srt=45)
text(75, 1.8, "Normal", cex=1, srt=45)
text(55, 1.8, "Underweight", cex=1, srt=55)

image(wt, ht, bmiwtht, col = colorbar, breaks= c(18.5, 24.9, 30, 100), bty="n",
      ylab = "Height (m)",
      xlab = "Weight (kg)",
      main = "BMI categories by height and weight",
      axes = T,
      xlim = c(40, 160),
      ylim = c(1.4, 2.0),
      add = TRUE)

6 Homework 3

Task: Use the free recall data to replicate the figure below: MURDOCK 1962

  • The format of the data in these files is serial position as a function of output position. Each row of data corresponds to one subject=trial.

  • For example, if the data were: 20 19 18 15 1 5 8, that means that the subject recalled item 20, then item 19, etc.

  • The code 88 means that the subject made an extra-list intrusion

6.1 Input data

untar("Murd62.data.tgz")

dat10_2 <- read.table("Murd62/fr10-2.txt", sep = "", col.names = c(1:10), fill = T, nrows = 1200)
dat10_2["Group"] = "10-2"

dat15_2 <- read.table("Murd62/fr15-2.txt", sep = "", col.names = c(1:15), fill = T)
dat15_2["Group"] = "15-2"

dat20_1 <- read.table("Murd62/fr20-1.txt", sep = "", col.names = c(1:20), fill = T)
dat20_1["Group"] = "20-1"

dat20_2 <- read.table("Murd62/fr20-2.txt", sep = "", col.names = c(1:20), fill = T)
dat20_2["Group"] = "20-2"

dat30_1 <- read.table("Murd62/fr30-1.txt", sep = "", col.names = c(1:30), fill = T)
dat30_1["Group"] = "30-1"

dat40_1 <- read.table("Murd62/fr40-1.txt", sep = "", col.names = c(1:40), fill = T)
dat40_1["Group"] = "40-1"

dat_recall <- list(dat10_2, dat15_2, dat20_1, dat20_2, dat30_1, dat40_1)

dat3 <- lapply(dat_recall, 
               function(x) {y <- x |> 
                            tidyr::pivot_longer(cols = starts_with("X"),
                                                names_to = "Serial",
                                                values_to = "Item")
                            y <- as.data.frame(table(y$Item)/1200) |> 
                            dplyr::mutate(Group = x$Group[1])
                            } ) |>
                            dplyr::bind_rows()
dat3$Group <- factor(dat3$Group)

# remove "88"
dat3_re <- dat3[!dat3$Var1 == "88", ]
dat3_re <- dat3_re[!(dat3_re$Var1 == "16" & dat3_re$Group == "15-2"), ]
dat3_re <- dat3_re[!(dat3_re$Var1 == "31" & dat3_re$Group == "30-1"), ]
dat3_re <- dat3_re[!(dat3_re$Var1 == "41" & dat3_re$Group == "40-1"), ]
dat3_re <- dat3_re[!(dat3_re$Var1 == "50" & dat3_re$Group == "40-1"), ]

head(dat3_re)
##   Var1      Freq Group
## 1    1 0.7025000  10-2
## 2    2 0.5691667  10-2
## 3    3 0.4750000  10-2
## 4    4 0.4633333  10-2
## 5    5 0.4591667  10-2
## 6    6 0.5675000  10-2

6.2 Plot

plot(0,0, bty="n", type = "n",xlim = c(0, 40), ylim = c(0, 1), 
     xlab = "SERIAL POSITION", 
     ylab = "PROBABILITY OF RECALL", 
     xaxs = "i", yaxs = "i")

# point and line
points(dat3_re$Var1[dat3_re$Group == "10-2"], 
       dat3_re$Freq[dat3_re$Group == "10-2"],
       pch = 19)
lines(dat3_re$Var1[dat3_re$Group == "10-2"], 
      dat3_re$Freq[dat3_re$Group == "10-2"])

points(dat3_re$Var1[dat3_re$Group == "15-2"], 
       dat3_re$Freq[dat3_re$Group == "15-2"],
       pch = 1)
lines(dat3_re$Var1[dat3_re$Group == "15-2"], 
      dat3_re$Freq[dat3_re$Group == "15-2"])

points(dat3_re$Var1[dat3_re$Group == "20-2"], 
       dat3_re$Freq[dat3_re$Group == "20-2"],
       pch = 19)
lines(dat3_re$Var1[dat3_re$Group == "20-2"], 
      dat3_re$Freq[dat3_re$Group == "20-2"])

points(dat3_re$Var1[dat3_re$Group == "20-1"], 
       dat3_re$Freq[dat3_re$Group == "20-1"],
       pch = 1)
lines(dat3_re$Var1[dat3_re$Group == "20-1"], 
      dat3_re$Freq[dat3_re$Group == "20-1"])

points(dat3_re$Var1[dat3_re$Group == "30-1"], 
       dat3_re$Freq[dat3_re$Group == "30-1"],
       pch = 1)
lines(dat3_re$Var1[dat3_re$Group == "30-1"], 
      dat3_re$Freq[dat3_re$Group == "30-1"])

points(dat3_re$Var1[dat3_re$Group == "40-1"], 
       dat3_re$Freq[dat3_re$Group == "40-1"],
       pch = 19)
lines(dat3_re$Var1[dat3_re$Group == "40-1"], 
      dat3_re$Freq[dat3_re$Group == "40-1"])

# text label
text(2, 0.9, "10-2")
lines(c(2, 6), c(0.85, 0.6))

text(15, 0.63, "15-2")
lines(c(14, 13.5), c(0.65, 0.7))

text(13, 0.48, "20-2")
lines(c(13, 14), c(0.45, 0.3))

text(19, 0.28, "20-1")
lines(c(17.5, 17), c(0.28, 0.36))

text(23, 0.6, "30-1")
lines(c(21.5, 21, 26), c(0.55, 0.52, 0.39))

text(35, 0.8, "40-1")
lines(c(33.5, 33, 37.5), c(0.77, 0.75, 0.59))

# axis
axis(1,seq(0, 40, 2), labels = FALSE)
axis(2,seq(0, 1, 0.1), labels = FALSE)

6.3 Reference

  • Murdock, B. B. (1962). The serial position effect of free recall. Journal of Experimental Psychology, 64, 482-488.

7 Optional Homework

A dataset, raz2005{rmcorr}, contains two repeated measures, on two occasions (Time), of age (in years) and adjusted volume of cerebellar hemispheres from 72 participants.

  • Participant: Participant ID
  • Time: Measurement time
  • Age: Participant’s age (years)
  • Volume: Adjusted volume of cerebellar hemispheres (cm^3)

Task: Use it to replicate the following figure in which participants older than 65 years of age at Time One are plotted on a shifted panel and those whose cerebellar volume increased from Time One to Time Two are shown in blue color.

7.1 Input data

# Input
dat5 <- rmcorr::raz2005

# kind of similar to the concept of slope and intercept
dat5_re <- dat5 |>
 dplyr::group_by(Participant) |>
    dplyr::mutate(DiffA = Age - dplyr::lag(Age),
                  DiffV = Volume - dplyr::lag(Volume))

# find out whose cerebellar volume increased from Time 1 to Time 2
as.data.frame(subset(dat5_re, DiffA>0 & DiffV>0))
##   Participant Time      Age   Volume    DiffA      DiffV
## 1          18    2 62.70207 148.6754 5.780508 10.7698922
## 2          22    2 66.99092 135.3493 4.930349  4.8035595
## 3          29    2 52.53263 125.8514 5.534490  5.8572004
## 4          36    2 50.78699 129.5912 5.897684  0.2437722
## 5          44    2 55.72681 131.1199 5.715749  3.1674448
## 6          62    2 48.99494 137.7438 2.174556  0.2292488
## 7          63    2 52.04664 138.4257 4.304182  1.5300961
# subset data
dat5_positive <- subset(dat5, Participant %in% c(18, 22, 29, 36, 44, 62, 63))

7.2 Plot

with(dat5, plot(Age, Volume, bty="n", 
                xlim = c(20, 90), 
                ylim = c(100, 170),
                xlab = "Age (year)", 
                ylab = "Cerebellar volume"))
# add grid
grid(nx=NA, ny=NULL)

# arrows for all
arrows(dat5$Age[dat5$Time == 1], 
       dat5$Volume[dat5$Time == 1],
       dat5$Age[dat5$Time == 2], 
       dat5$Volume[dat5$Time == 2],
       length=.08,
       col="gray")

# cover the arrows with increased
arrows(dat5_positive$Age[dat5_positive$Time == 1], 
       dat5_positive$Volume[dat5_positive$Time == 1],
       dat5_positive$Age[dat5_positive$Time == 2], 
       dat5_positive$Volume[dat5_positive$Time == 2],
       length=.08, col="skyblue", lwd = 2)