DataM: Homework Exercise 0413 2-5

HW exercise 2.

Deaths per 100,000 from male suicides for 5 age groups and 15 countries are given in the table below. The data set is available as suicides2{HSAUR3}. Construct side-by-side box plots for the data from different age groups and comment briefly.

Target output

Source: Everitt, B.S., & Hothorn, T. (2009). A Handbook of Statistical Analyses Using R. 2nd ed. p. 15.

[Solution and Answer]

Load in the package and the data set and check the data structure

library(HSAUR3)
data(suicides2)
head(suicides2)

str(suicides2)

'data.frame':   15 obs. of  5 variables:
 $ A25.34: num  22 9 22 29 16 28 48 7 8 26 ...
 $ A35.44: num  27 19 19 40 25 35 65 8 11 29 ...
 $ A45.54: num  31 10 21 52 36 41 84 11 18 36 ...
 $ A55.64: num  34 14 31 53 47 49 81 18 20 32 ...
 $ A65.74: num  24 27 49 69 56 52 107 27 28 28 ...

Use `help(suicides2)` to confirm the meanings of variables in `suicides2`.

A25.34: number of suicides (per 100000 males) between 25 and 34 years old.
A35.44: number of suicides (per 100000 males) between 35 and 44 years old.
A45.54: number of suicides (per 100000 males) between 45 and 54 years old.
A55.64: number of suicides (per 100000 males) between 55 and 64 years old.
A65.74: number of suicides (per 100000 males) between 65 and 74 years old.

Stack the data set

dta2_long <- stack(suicides2)
colnames(dta2_long) <- c('number', 'age_group')
levels(dta2_long$age_group) <- paste0('age ', 2:6, 5, ' - ', 3:7, '4')
str(dta2_long)

'data.frame':   75 obs. of  2 variables:
 $ number   : num  22 9 22 29 16 28 48 7 8 26 ...
 $ age_group: Factor w/ 5 levels "age 25 - 34",..: 1 1 1 1 1 1 1 1 1 1 ...

Plot

boxplot(number ~ age_group, data=dta2_long, col=blues9[1:5],
        xlab = 'Age stage', ylab = 'Number of suicides (per 100,000 males)', 
        main = 'Box Plots of The Number of Suicides of Different Age Stages')

HW exercise 3.

The following R script illustrates how to implement ‘small multiples’ in base graphics given the 4 different diets of the ChickWeight{datasets} example. Adapt the script to produce a plot of 5 panels in which each panel shows a histogram of IQ for each of 5 classes with over 30 pupils in the nlschools{MASS} dataset.

dta <- ChickWeight
dta_diet <- split(dta, dta$Diet)

par(mfrow=c(2, 2), mar=c(4, 4, 0, 0))
lapply(dta_diet, function(x) {
                  plot(x$weight ~ x$Time, xlab="Time (day)", ylab="Weight (gm)")
                  legend('topleft', paste("Diet", x$Diet[1], sep="="), bty='n')}
)

[Solution and Answer]

Load in the data

library(MASS)
data(nlschools)
head(nlschools)

str(nlschools)

'data.frame':   2287 obs. of  6 variables:
 $ lang : int  46 45 33 46 20 30 30 57 36 36 ...
 $ IQ   : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
 $ class: Factor w/ 133 levels "180","280","1082",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ GS   : int  29 29 29 29 29 29 29 29 29 29 ...
 $ SES  : int  23 10 15 23 10 10 23 10 13 15 ...
 $ COMB : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

Adjust the data set

library(dplyr)
count_class <- table(nlschools$class)
dta_class <- nlschools %>% dplyr::filter(class %in% names(count_class)[count_class > 30])
dta_class$class <- dta_class$class %>% as.character() %>% as.factor()
dta_class <- split(dta_class, dta_class$class)

Plot

par(mfrow=c(2, 3), mar=c(3, 3, 3, 3))
lapply(dta_class,
       function(x) {hist(x$IQ, freq=FALSE, 
                         col = RColorBrewer::brewer.pal(9, 'BuPu')[x$class[1]],
                         main=paste0('Histogram of IQ of class ', x$class[1]))})

Finding

The distributions of IQ of these five classes are quite different. Class 15580 and class 18380 have a relatively symmetric distribution. While class 15980 and class 5480 have a relatively right-skewed distribution.

HW exercise 4.

Use the dataset to replicate the plot below:

Target output

Source: Wainer, H. (2001). Uneducated Guesses. Using Evidence to Uncover Misguided Education Policies.

[Solution and Answer]

Load in the data set

dta4 <- read.table('../data/hw0413_4.txt', header = TRUE)
head(dta4)

str(dta4)

'data.frame':   6 obs. of  5 variables:
 $ College: Factor w/ 6 levels "Barnard","Bowdoin",..: 1 6 2 4 3 5
 $ SAT_No : int  1210 1243 1200 1220 1237 1233
 $ GPA_No : num  3.08 3.1 2.85 2.9 2.7 2.62
 $ SAT_Yes: int  1317 1333 1312 1280 1308 1287
 $ GPA_Yes: num  3.3 3.24 3.12 3.04 2.94 2.8

Plot

[Note] The x-axis scale of the original plot is weird (the distance of one unit is not an integer) thus I adjust it to make it be understood more easily.

# Set the outer margins
par(oma=c(3.5, 1, 1, 1))

# Plot the main frame with black points
plot(dta4$SAT_No, dta4$GPA_No, xlim=c(1150, 1400), ylim=c(2.6, 3.4),
     pch=19, cex=2, xaxt='n', yaxt ='n', xaxs='i', yaxs='i', 
     xlab='SAT (V+W)', ylab='First Year GPA')

# Add the segments before adding white points to make white points be above on the segments
with(dta4[dta4$College != 'Bowdoin',],
     segments(x0=SAT_No, y0=GPA_No, x1=SAT_Yes, y1=GPA_Yes))
with(dta4[dta4$College == 'Bowdoin',], 
     segments(x0=SAT_No, y0=GPA_No, x1=SAT_Yes, y1=GPA_Yes, lwd=2)) #thicker

# Add the white points
points(dta4$SAT_Yes, dta4$GPA_Yes, cex=2)

# Add the labels of college next to white points
with(dta4[dta4$College != 'Bowdoin',],
     text(GPA_Yes ~ SAT_Yes, labels=College, cex=1.0, font=1, adj=c(-.2, -.2)))
with(dta4[dta4$College == 'Bowdoin',],
     text(GPA_Yes ~ SAT_Yes, labels=College, cex=1.2, font=1.5, adj=c(-.2, -.2))) #thicker and larger

# Add the axes with scale and label setting
axis(1, at = seq(1150, 1400, by=(1400-1150)/10), 
     labels = c(seq(1150, 1400, by=(1400-1150)/5)[1:5] %>% as.list() %>% 
                  sapply(., function(x) {c(x, NA)}) %>% as.vector(), 1400))
axis(2, at = seq(2.6, 3.4, by=(3.4-2.6)/20), las = 1,
     labels = c('2.6', rep(NA, 4), '2.8', rep(NA, 4), 
                '3.0', rep(NA, 4), '3.2', rep(NA, 4), '3.4'))

# Add the legend
legend(1155, 3.375, paste0(c('Submitted ', 'Did NOT Submit '), 'SAT Scores'),
       pch=c(1, 19), cex=c(1, 1), bty='n')

# Add the caption
caption <- 'Figure 1.4 The mean SAT coupled with the mean first-year GPA for the class \nof 1999 at six schools shown for those who submitted SAT scores for admis- \nsion and those who did not'
mtext(caption, side=1, line=1.5, cex=1, outer=TRUE, adj=0)

HW exercise 5.

Use the free recall data to replicate the figure.

Target output

reported in Murdock, B. B. (1962). The serial position effect of free recall. Journal of Experimental Psychology, 64, 482-488.

[Solution and Answer]

Load in the data sets and combine them into a data frame at the same time.

dta5 <- data.frame()
file_names <- c('10-2', '15-2', '20-1', '20-2', '30-1', '40-1')

for (k in 1:length(file_names)) {
  # Load in the new data set
  df_new <-  read.table(paste0('../data/Murd62/fr', file_names[k], '.txt'),
                        fill=TRUE, na.strings = 88, sep = ' ',
                        col.names = paste0('X', 1:15))
  
  # Create the label of the original data set
  df_new$group <- file_names[k]
  
  # Combine the larger dataset and the new dataset
  dta5 <- rbind(dta5, df_new)    
}

# Check the structure of the combined data set
head(dta5)

summary(dta5)

       X1              X2              X3              X4       
 Min.   : 0.00   Min.   : 1.00   Min.   : 1.00   Min.   : 1.00  
 1st Qu.:11.00   1st Qu.:12.00   1st Qu.:11.00   1st Qu.: 9.00  
 Median :17.00   Median :18.00   Median :18.00   Median :15.00  
 Mean   :18.83   Mean   :19.48   Mean   :19.28   Mean   :16.78  
 3rd Qu.:28.00   3rd Qu.:28.00   3rd Qu.:28.00   3rd Qu.:23.00  
 Max.   :40.00   Max.   :40.00   Max.   :41.00   Max.   :41.00  
 NA's   :242     NA's   :315     NA's   :391     NA's   :485    
       X5              X6              X7               X8        
 Min.   : 1.00   Min.   : 1.00   Min.   : 1.000   Min.   : 1.000  
 1st Qu.: 5.00   1st Qu.: 4.00   1st Qu.: 3.000   1st Qu.: 3.000  
 Median :11.00   Median : 9.00   Median : 8.000   Median : 7.000  
 Mean   :13.33   Mean   :11.08   Mean   : 9.951   Mean   : 9.767  
 3rd Qu.:20.00   3rd Qu.:16.00   3rd Qu.:14.000   3rd Qu.:14.000  
 Max.   :40.00   Max.   :40.00   Max.   :40.000   Max.   :50.000  
 NA's   :842     NA's   :1453    NA's   :2393     NA's   :3393    
       X9              X10             X11             X12       
 Min.   : 1.000   Min.   : 1.00   Min.   : 1.00   Min.   : 1.00  
 1st Qu.: 3.000   1st Qu.: 4.00   1st Qu.: 4.00   1st Qu.: 4.00  
 Median : 8.000   Median : 8.00   Median : 9.00   Median : 9.00  
 Mean   : 9.845   Mean   :10.14   Mean   :10.54   Mean   :10.94  
 3rd Qu.:14.000   3rd Qu.:14.00   3rd Qu.:15.00   3rd Qu.:15.00  
 Max.   :40.000   Max.   :38.00   Max.   :39.00   Max.   :38.00  
 NA's   :4446     NA's   :5253    NA's   :5921    NA's   :6384   
      X13            X14             X15           group          
 Min.   : 1.0   Min.   : 1.00   Min.   : 1.00   Length:7201       
 1st Qu.: 4.0   1st Qu.: 5.00   1st Qu.: 5.75   Class :character  
 Median : 9.0   Median : 9.00   Median :10.00   Mode  :character  
 Mean   :10.7   Mean   :11.11   Mean   :12.37                     
 3rd Qu.:15.0   3rd Qu.:16.00   3rd Qu.:17.00                     
 Max.   :36.0   Max.   :34.00   Max.   :37.00                     
 NA's   :6744   NA's   :6944    NA's   :7073

Adjust the data form

Create binary columns to represent if the item was recalled in each trial (row) or not.

for (i in 1:40) {
  dta5[paste0('V', i)] <- apply(dta5[,1:15], 1, function(x) {i %in% x})
}

Pick binary columns and group them by group.
Compute column means (the probability of each item be recalled) for the data frame of each group.

df_prob <- dta5 %>% select(contains('V')) %>% split(., dta5$group) %>% 
  sapply(., colMeans) %>% as.data.frame()
head(df_prob)

Stack the data set of probability in order to plot.

df_prob_long <- stack(df_prob)
head(df_prob_long)

Plot

[Note] Part (b) also can be done by using lapply to avoid to use for loop. However, it is not convenient to set pch for each polyline when using lapply instead of for loop. Hence, I chose to use for loop.

# (a) Plot the main frame and data of the first dataset
# (Data points with pr<.05 should be removed first)
with(df_prob_long %>% filter((ind == file_names[1]) & (values > .05)),
     plot(values, type='o', pch=16, xlim = c(0, 40.5), ylim=c(0, 1),
          xaxs='i', yaxs='i', axes=FALSE,
          xlab='SERIAL POSITION', ylab='PROBABILITY OF RECALL'))

# (b) Add polylines of data of other 5 datasets with color setting
# (Data points with pr<.05 should be removed first)
pch_lst = c(16, 1, 16, 16, 1, 16)
for (k in 2:length(file_names)) {
  with(df_prob_long %>% filter((ind == file_names[k]) & (values > .05)),
     plot.xy(xy.coords(values), type='o', pch=pch_lst[k]))
}

# (c) Add the axes with scale and label setting
axis(1, at = 0:40, 
     labels = c(seq(0, 40-5, by=5) %>% as.list() %>% 
                  sapply(., function(x) {c(x, rep(NA, 4))}) %>% as.vector(), 40))
axis(2, at = seq(0.0, 1.0, by=1/20), las = 1,
     labels = c('0.00', rep(NA, 3), '0.20', rep(NA, 3), '0.40', rep(NA, 3), 
                '0.60', rep(NA, 3), '0.80', rep(NA, 3), '1.00'))

# (d) Add the label and pointing line for each polyline
text(x=c(2, 14.5, 11, 19, 21, 33), y=c(.9, .61, .4, .25, .55, .8),
     file_names, cex=.9, font=2)
segments(x0=c(2, 14.3, 11, 19, 21, 33.0), y0=c(.85, .63, .39, .27, .52, .75),
         x1=c(6, 12.8, 13, 16, 26, 37.5), y1=c(.60, .70, .30, .35, .39, .59))

DataM: Homework Exercise 0413 2-5