In-class exercise

library(tidyverse)

Q1.

Run the R script to see what happens. Change the ‘df’ parameter to a slightly larger integer and do it again. What statistical concept does this script illustrate?

x <- seq(-pi*2, 2*pi, .05)
z <- dnorm(x)
plot(x, z, type="l", bty="L", xlab="Standard unit", ylab="Density")
y <- dt(x, df=3) # T分佈：x 所對應的密度：dt(x,df)；df：自由度；
polygon(c(x, rev(x)), c(y, rev(z)), col='light gray') # 填色
lines(x, y, col='cadetblue') 
y <- dt(x, df=8) # T分佈：x 所對應的密度：dt(x,df)；df：自由度；
polygon(c(x, rev(x)), c(y, rev(z)), col='yellow') # 填色
lines(x, y, col='cadetblue')

df自由度越高兩條線越契合

Q2.

Doll (1955) showed per capita consumption of cigarettes in 11 countries in 1930, and the death rates from lung cancer for men in 1950. Use R base graphics to generate the plot shown below. Source: Freedman, et al. (1997). Statistics. pp. 148-150.

dta <- read.table("cigarettes.txt",header = T)
head(dta)

m0 <- lm(death ~ consumption, data = dta)
with(dta, plot(consumption, death, type = "n", xlab = " Consumption(per capita per year)", ylab = "Death rate(per million)", xlim = c(min(consumption)*0.9,max(consumption)*1.1),ylim = c(min(death)*0.9,max(death)*1.1)))
with(dta, text(consumption, death, labels = Country, cex = 0.5))
abline(m0, lty =2)
title(main = "Lung cancer and Cigarettes Consumption")

Q3.

Use R base graphics to create the national flag of Denmark

width <- 35
height <- 28
red <- cbind(c(0,width,width,0),c(0,0,height,height))
plot(red, type="n", col="red", axes=F, xlim=c(0,width), ylim=c(0,height), xlab="", ylab="")
polygon(red, density=NULL, col="red", border="red")
whiteH <- cbind(c(0,width,width,0),c(12,12,16,16))
polygon(whiteH, density=NULL, col="white", border="white")
whiteV <- cbind(c(11.35,15.14,15.14,11.35),c(0,0,height,height))
polygon(whiteV, density=NULL, col="white", border="white")

Q4.

Run the R script to see what happens first and then explain how the effect is achieved by the script.

n <- 60
t <- seq(0, 2*pi, length=n)
x <- sin(t)
y <- cos(t)
par(pty = "s")
for (i in 1:n) {
  plot.new()
  plot.window(c(-1, 1), c(-1, 1))
  lines(x*y, -y, col="gray") # 8字形
  points(x[i]*y[i], -y[i], pch=16,# 沿著構成8的每一點走一圈
  col=gray((i-1)/(n+1)))# 顏色隨著n改變
  Sys.sleep(.05)
}

Q5.

Draw a pie chart to represent 50 shades of gray. Hint: Use ‘?gray’ to examine the gray level specification documented for the gray{grDevices}. Use ‘?pie’ to study the function for making pie charts documented for pie{graphics}.

pie(rep(1, 50), col=gray(seq(0, 1, length=50)),
    labels=grey(seq(0, 1, length=50)), clockwise=T, cex=0.5)

Exercises

Q1.

This R script illustrates how to split the plot region to include histograms on the margins of a scatter diagram using the Galton{HistData} data set. Compile it as a html document with comments on each code chunk.

# Galton's data on the heights of parents and their children
# install.packages("HistData")
# get data
dta <- HistData::Galton
# create matrix，是等一下繪圖的順序（0不畫）
zones <- matrix(c(2, 0, 1, 3), ncol=2, byrow=TRUE)
# 分隔畫布，寬高指分成兩份的比例
layout(zones, widths=c(4/5, 1/5), heights = c(1/5, 4/5))
# parent的histogram，先不畫 
xh <- with(dta, hist(parent, plot=FALSE))
# child的histogram，先不畫
yh <- with(dta, hist(child, plot=FALSE))
# 找出兩個直方圖的column最大是多大
ub <- max(c(xh$counts, yh$counts))
# par函數設定繪圖參數
# mar設定圖表邊界距離畫布邊界要留多寬，mar = c(bottom, left, top, right)
par(mar=c(3, 3, 1, 1)) # 在第一個要畫的區域（左下）左邊下面個留三行，右上留一行
# sunflowerplot
with(dta, sunflowerplot(parent, child))
# 在第二個要畫的區域（左上）左邊留三行，右上留一行
par(mar=c(0, 3, 1, 1))
# parent barplot
barplot(xh$counts, axes=FALSE, ylim=c(0, ub), space=0)
# 在第三個要畫的區域（右下）下面留三行，右上留一行
par(mar=c(3, 0, 1, 1))
# child barplot
barplot(yh$counts, axes=FALSE, xlim=c(0, ub), space=0, horiz=TRUE)
# oma設定圖形的外邊界大小，oma = c(bottom,left,to,right)
par(oma=c(3, 3, 0, 0))
# mtext() 在現有的圖形之四個邊緣之一加上文字
# side(1下 2左 3上 4右）
# line文字對應座標軸要再有多少位移
# at自己設座標軸要出現的位置
# adj=0對左或對下對齊，adj=1對右或對上對齊
mtext("Average height of parents (in inch)", side=1, line=2, 
      outer=TRUE, adj=0, 
      at=.4 * (mean(dta$parent) - min(dta$parent))/(diff(range(dta$parent))))
mtext("Height of child (in inch)", side=2, line=2, 
      outer=TRUE, adj=0,
      at=.4 * (mean(dta$child) - min(dta$child))/(diff(range(dta$child))))

Q2.

Deaths per 100,000 from male suicides for 5 age groups and 15 countries are given in the table below. The data set is available as suicides2{HSAUR3}. Construct side-by-side box plots for the data from different age groups and comment briefly.

                         Age                   
             25-34  35-44  45-54  55-64  65-74
Country
Canada          22     27     31     34     24
Israel           9     19     10     14     27
Japan           22     19     21     31     49
Austria         29     40     52     53     69
France          16     25     36     47     56
Germany         28     35     41     49     52
Hungary         48     65     84     81    107
Italy            7      8     11     18     27
Netherlands      8     11     18     20     28
Poland          26     29     36     32     28
Spain            4      7     10     16     22
Sweden          28     41     46     51     35
Switzerland     22     34     41     50     51
UK              10     13     15     17     22
USA             20     22     28     33     37

Source: Everitt, B.S., & Hothorn, T. (2009). A Handbook of Statistical Analyses Using R. 2nd ed. p. 15.

dta <- HSAUR3::suicides2
dtaL <- dta %>% gather(key=Age, value=Counts, contains("A"))
head(dtaL)

boxplot(Counts ~ Age, data=dtaL, xlab="Age", ylab="Deaths per 100,000 from male suicides")
# indicate grand mean
abline(h=mean(dtaL$Counts), lty=2)

Q3.

The R script illustrates how to implement ‘small multiples’ in base graphics given the 4 different diets of the ChickWeight{datasets} example. Adapt the script to produce a plot of 5 panels in which each panel shows a histogram of IQ for each of 5 classes with over 30 pupils in the nlschools{MASS} dataset.

dta <- ChickWeight
dta_diet <- split(dta, dta$Diet)
par(mfrow=c(2, 2), mar=c(4, 4, 0, 0))
lapply(dta_diet, function(x) {
                  plot(x$weight ~ x$Time, xlab="Time (day)", ylab="Weight (gm)")
                  legend('topleft', paste("Diet", x$Diet[1], sep="="), bty='n')}
)

dta <- MASS::nlschools
head(dta)
dta_class <- split(dta, dta$class)[25:30]
par(mfrow=c(2, 3), mar=c(3, 3, 3, 3))
lapply(dta_class, function(x) {
                  hist(x$IQ, xlab="IQ", main = paste("class", x$class[1], sep=" "))}
)

Q4.

Use the dataset to replicate the plot below:

dta <- read.table("sat_gpa.txt",header = T)
dta

with(dta, plot(SAT_No, GPA_No, xlim=c(1150, 1400), ylim=c(2.6,3.4), xaxs="i", yaxs="i", 
               xlab="SAT(V+M)", ylab="First Year GPA", font.lab=2, pch=19, cex=2))
with(dta, segments(SAT_No, GPA_No, SAT_Yes, GPA_Yes, lwd=2, col="black"))
with(dta, segments(1200, 2.85, 1312, 3.12, lwd=3, col="black"))
with(dta, points(SAT_Yes, GPA_Yes, pch=21, bg="white", cex=2))
with(dta, text(1312+26, 3.12, labels='Bowdoin', cex=.8))
with(dta, text(SAT_Yes+26, GPA_Yes, labels=College, cex=.8))
legend("topleft", c("Submitted SAT Scores", "Did NOT Submit SAT Scores"), pch=c(21, 19), bg=c("white","black"), pt.cex=1.5, bty="n")
axis(1,seq(1150, 1400, 50/3), labels = FALSE)
axis(2,seq(2.6, 3.4, 0.2/5), labels = FALSE)

Q5.

Use the free recall data to replicate the figure

reported in Murdock, B. B. (1962). The serial position effect of free recall. Journal of Experimental Psychology, 64, 482-488.

names1 <- paste("X", 1:10)
names2 <- paste("X", 1:15)
dta10.2 <- read.table("Murd62/fr10-2.txt", sep = "", col.names = names1, fill = T, nrows = 1200)
dta10.2['tag'] = '10.2'
dta15.2 <- read.table("Murd62/fr15-2.txt", sep = "", col.names = names2, fill = T)
dta15.2['tag'] = '15.2'
dta20.1 <- read.table("Murd62/fr20-1.txt", sep = "", col.names = names2, fill = T)
dta20.1['tag'] = '20.1'
dta20.2 <- read.table("Murd62/fr20-2.txt", sep = "", col.names = names2, fill = T)
dta20.2['tag'] = '20.2'
dta30.1 <- read.table("Murd62/fr30-1.txt", sep = "", col.names = names2, fill = T)
dta30.1['tag'] = '30.1'
dta40.1 <- read.table("Murd62/fr40-1.txt", sep = "", col.names = names2, fill = T)
dta40.1['tag'] = '40.1'

dta_recall <- list(dta10.2, dta15.2, dta20.1, dta20.2, dta30.1, dta40.1)
dta <- lapply(dta_recall, function(x){y <- stack(x[1:10])
                                      y <- as.data.frame(table(y$values)/1200) %>% 
                                           mutate(tag=x[1,"tag"])
                                      }) %>% bind_rows()

## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

dta$tag <- factor(dta$tag)
# remove rows have index == 88
dta <- dta[!dta$Var1 == "88", ]
dta <- dta[!(dta$Var1 == 16 & dta$tag == "15.2"), ]
dta <- dta[!(dta$Var1 == 31 & dta$tag == "30.1"), ]
dta <- dta[!(dta$Var1 == 41 & dta$tag == "40.1"), ]
dta <- dta[!(dta$Var1 == 50 & dta$tag == "40.1"), ]

head(dta)

# plot(dta$Var1,dta$Freq, type = "b",xlim = c(0, 40), ylim = c(0, 1), xlab = "SERIAL POSITION", ylab = "PROBABILITY OF RECALL", xaxs = "i", yaxs = "i")
plot(0,0, type = "n",xlim = c(0, 40), ylim = c(0, 1), xlab = "SERIAL POSITION", ylab = "PROBABILITY OF RECALL", xaxs = "i", yaxs = "i")
pchl = c(1,19)
for (i in 1:length(levels(dta$tag))) {
  points(dta$Var1[dta$tag == levels(dta$tag)[i]], 
        dta$Freq[dta$tag == levels(dta$tag)[i]],pch=pchl[i%%2+1])
  lines(dta$Var1[dta$tag == levels(dta$tag)[i]], 
        dta$Freq[dta$tag == levels(dta$tag)[i]])
}
axis(1,seq(0, 40, 2), labels = FALSE)
axis(2,seq(0, 1, 0.1), labels = FALSE)
# text label
text(2, 0.9, "10-2")
lines(c(2, 6), c(0.85, 0.6))
text(16, 0.7, "15-2")
lines(c(14, 12.5), c(0.7, 0.7))
text(11, 0.41, "20-2")
lines(c(11, 13), c(0.39, 0.3))
text(19, 0.25, "20-1")
lines(c(19, 16), c(0.27, 0.35))
text(21, 0.55, "30-1")
lines(c(21, 26), c(0.52, 0.39))
text(33, 0.8, "40-1")
lines(c(33, 37.5), c(0.75, 0.59))

Class6 : Base graphics

0419

In-class exercise

Q1.

Q2.

Q3.

Q4.

Q5.

Exercises

Q1.

Q2.

Q3.

Q4.

Q5.