In-class exercise

library(tidyverse)
library(ggplot2)
library(dplyr)
library(animation)
library(gganimate)
library(gifski)

Q1.

What does the R script do? Revise the code so that it does the same job without resorting to the use of a nested ‘for’ loop.

ci_norm <- function(n=100, nci=50, nRun=3, level=0.95, pause=0.05) {
  ci <- NULL
  z0 = qnorm(1 - (1 - level)/2)
  for ( j in 1:nRun ) {
      zbar <- colMeans(replicate(nci, rnorm(n)))     
      zl <- zbar - z0 * 1/sqrt(n); zu <- zbar + z0 * 1/sqrt(n)
      plot(1, xlim = c(0.5, nci + 0.5), ylim = c(min(zl), max(zu)), 
           type = "n", xlab = "Sample ID", ylab = "Average") 
      abline(h = 0, lty = 2)
      for( i in 1:nci ) {
          arrows(i, zl[i], i, zu[i], length = 0.05, angle = 90, 
               code = 3,
               col = ifelse(0 > zl[i] & 0 < zu[i], "gray", "red"))
          points(i, zbar[i], pch = 19, col = "gray")
          Sys.sleep(pause)
      }
      Sys.sleep(3)
   }
}

##
ci_norm(n = 36, nci = 51, nRun = 2)

ci_norm_M <- function(n=100, nci=50, nRun=3, level=0.95, pause=0.05) {
  ci <- NULL
  z0 = qnorm(1 - (1 - level)/2)
  sapply(1:nRun, function(x){
    zbar <- colMeans(replicate(nci, rnorm(n)))     
    zl <- zbar - z0 * 1/sqrt(n); zu <- zbar + z0 * 1/sqrt(n)
    plot(1, xlim = c(0.5, nci + 0.5), ylim = c(min(zl), max(zu)), 
           type = "n", xlab = "Sample ID", ylab = "Average") 
    abline(h = 0, lty = 2)
    sapply(1:nci, function(x){
      arrows(x, zl[x], x, zu[x], length = 0.05, angle = 90, 
             code = 3, col = ifelse(0 > zl[x] & 0 < zu[x], "gray", "red"))
      points(x, zbar[x], pch = 19, col = "gray")
      # Sys.sleep(pause)
    })
    # Sys.sleep(3)
  })
  return()
}
ci_norm_M(n = 36, nci = 51, nRun = 2)

## NULL

Q2.

Use the read and math variables from the high schools data example for this problem. First firgure out what this R script does and then eliminate the for loop in the code segment.

dta <- read.table("hs0.txt", header = T)
head(dta)
dta.asian <- subset(dta, race=="asian")
r0 <- cor(dta.asian$math, dta.asian$socst)

# Method 1
cnt <- 0
nIter <- 1001
for (i in 1:nIter) {
    new <- sample(dta.asian$read) # sample 11 個值
    r <- cor(new, dta.asian$math)
    if ( r0 <= r ) cnt <- cnt+1
}
cnt/nIter
## [1] 0.04295704
# Method 2
# replicate 對指定的運算式重複執行指定的次數
newread <- replicate(nIter, sample(dta.asian$read))
newread <- data.frame(unlist(newread))
unlist(head(with(newread, lapply(names(newread), function(x)
  cor(dta.asian$math, eval(substitute(tmp, list(tmp=as.name(x))))))
)))
## [1] -0.3066232  0.2957185 -0.1711931 -0.5194420  0.1564189 -0.4768783
# Method 3
cor.test(dta[dta$race=="asian", "math"], dta[dta$race=="asian", "socst"])
## 
##  Pearson's product-moment correlation
## 
## data:  dta[dta$race == "asian", "math"] and dta[dta$race == "asian", "socst"]
## t = 1.9887, df = 9, p-value = 0.07796
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.07083501  0.86552255
## sample estimates:
##       cor 
## 0.5525177
corTest <- function(a,b,nIter=1001){
  r0 <- cor(a, b)
  t <- sapply(c(1:nIter), function(x){
    s <- sample(a)
    r <- cor(s,b)
    return(r)
  })
  cat(length(t[t>=r0])/nIter)
}
corTest(dta.asian$math, dta.asian$read)
## 0.02597403

Q3.

An example of obtaining (nonparametric) bootstrap estimates of coefficients for a multiple linear regression model for the anorexia{MASS} data set is presented in the following script . Figure out how it works and improve the code.

data(anorexia, package="MASS")
dta <- anorexia
summary(dta)
##   Treat        Prewt           Postwt      
##  CBT :29   Min.   :70.00   Min.   : 71.30  
##  Cont:26   1st Qu.:79.60   1st Qu.: 79.33  
##  FT  :17   Median :82.30   Median : 84.05  
##            Mean   :82.41   Mean   : 85.17  
##            3rd Qu.:86.00   3rd Qu.: 91.55  
##            Max.   :94.90   Max.   :103.60
dim(dta)
## [1] 72  3
row.names(dta)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14"
## [15] "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28"
## [29] "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42"
## [43] "43" "44" "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56"
## [57] "57" "58" "59" "60" "61" "62" "63" "64" "65" "66" "67" "68" "69" "70"
## [71] "71" "72"
anrx_lm <- lm(Postwt ~ Prewt + Treat, data=dta)
summary(anrx_lm)
## 
## Call:
## lm(formula = Postwt ~ Prewt + Treat, data = dta)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.1083  -4.2773  -0.5484   5.4838  15.2922 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  49.7711    13.3910   3.717  0.00041 ***
## Prewt         0.4345     0.1612   2.695  0.00885 ** 
## TreatCont    -4.0971     1.8935  -2.164  0.03400 *  
## TreatFT       4.5631     2.1333   2.139  0.03604 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.978 on 68 degrees of freedom
## Multiple R-squared:  0.2777, Adjusted R-squared:  0.2458 
## F-statistic: 8.713 on 3 and 68 DF,  p-value: 5.719e-05
coef(anrx_lm)
## (Intercept)       Prewt   TreatCont     TreatFT 
##  49.7711090   0.4344612  -4.0970655   4.5630627
bootstrap_function  <- function(string_formula, nc, model_data, ndraws) {
      coeff_mtx <- matrix(0, nrow = ndraws,  ncol = nc)    
      for (i in 1:ndraws) {
        bootstrap_ids   <- sample(seq(nrow(model_data)), nrow(model_data), replace = TRUE)
        bootstrap_data <- model_data[bootstrap_ids,]
        bootstrap_model <- lm(as.formula(string_formula), bootstrap_data)
        coeff_mtx[i,]   <- bootstrap_model$coefficients
        if (i == 1) {
          print(bootstrap_model$coefficients)
        }
      }     
      return(coeff_mtx)
    }
  
bootstrap_estimates <- bootstrap_function("Postwt ~ Prewt + Treat", nc=4, dta, 500)
## (Intercept)       Prewt   TreatCont     TreatFT 
##  51.8422884   0.3881843  -2.0244971   5.4583632
bootstrap_CIs <- matrix(0, nrow = 4, ncol = 2)
for (i in 1:4) {
  bootstrap_CIs[i,1]  <- quantile(bootstrap_estimates[,i], 0.025)
  bootstrap_CIs[i,2]  <- quantile(bootstrap_estimates[,i], 0.975)
}

print(bootstrap_CIs)
##             [,1]       [,2]
## [1,] 17.09660263 79.0948519
## [2,]  0.08239626  0.8169008
## [3,] -7.82788325 -0.7663607
## [4,]  0.17300893  8.9687487
confint(anrx_lm)
##                  2.5 %     97.5 %
## (Intercept) 23.0498681 76.4923499
## Prewt        0.1128268  0.7560955
## TreatCont   -7.8754712 -0.3186599
## TreatFT      0.3060571  8.8200682
## 
par(mfrow=c(2,2))
hist(bootstrap_estimates[,1])
hist(bootstrap_estimates[,2])
hist(bootstrap_estimates[,3])
hist(bootstrap_estimates[,4])

# 全部打包在一起
bootstrap_CI_plot_M <- function(string_formula, nc, model_data, ndraws){
  coeff_mtx <- matrix(0, nrow = ndraws,  ncol = nc) 
  for (i in 1:ndraws) {
    bootstrap_ids   <- sample(seq(nrow(model_data)), nrow(model_data), replace = TRUE)
    bootstrap_data  <- model_data[bootstrap_ids,]
    bootstrap_model <- lm(as.formula(string_formula), bootstrap_data)
    coeff_mtx[i,]   <- bootstrap_model$coefficients
  }
  bootstrap_CIs <- matrix(0, nrow = nc, ncol = 2)
  for (i in 1:nc) bootstrap_CIs[i,] <- quantile(coeff_mtx[,i], c(0.025, 0.975))
  cat("estimate:\n");print(bootstrap_CIs)
  lm <- lm(as.formula(string_formula), model_data)
  cat("lm:\n");print(confint(lm))
  par(mfrow=c(nc/2,2))
  for (i in 1:nc) hist(coeff_mtx[,i],main=names(coef(bootstrap_model))[i],xlab="")
}
bootstrap_CI_plot_M("Postwt ~ Prewt + Treat", nc=4, dta, 500)
## estimate:
##            [,1]       [,2]
## [1,] 13.3493060 77.5083129
## [2,]  0.1082941  0.8886533
## [3,] -7.6072462 -0.4832734
## [4,]  0.2887894  8.9438489
## lm:
##                  2.5 %     97.5 %
## (Intercept) 23.0498681 76.4923499
## Prewt        0.1128268  0.7560955
## TreatCont   -7.8754712 -0.3186599
## TreatFT      0.3060571  8.8200682

bootstrap_CI_plot_M("Postwt ~ Prewt + Treat", nc=4, dta, 1000)
## estimate:
##             [,1]       [,2]
## [1,] 17.78019346 78.0459616
## [2,]  0.09354962  0.8132488
## [3,] -7.47513934 -0.6040686
## [4,] -0.22859953  9.1089925
## lm:
##                  2.5 %     97.5 %
## (Intercept) 23.0498681 76.4923499
## Prewt        0.1128268  0.7560955
## TreatCont   -7.8754712 -0.3186599
## TreatFT      0.3060571  8.8200682

Q4.

What does the R script do? Replace the “while” loop in the code with a different iteration control structure.

# n個點隨機亂動
Brownian <- function(n = 11, pause = 0.05, nIter = 512, ...) {
    x = rnorm(n)
    y = rnorm(n)
    # i = 1
#   while (i <= nIter) {
    for(i in 1:nIter){   
        plot(x, y, ...)
        text(x, y, cex = 0.5)
        x = x + rnorm(n)
        y = y + rnorm(n)
        # Sys.sleep(pause)
        # i = i + 1
    }
} 
Brownian(xlim = c(-20, 20), ylim = c(-20, 20), 
    pch = 21, cex = 2, col = "cyan", bg = "lavender") 

Q5.

Here is an example of simulation using R. Figure out how it works and improve the code.

simSexHeight <- function(n)  {
   m <- matrix(c(runif(n), rnorm(n)), ncol=2)
   draw <- function(mr) {  # mr = one row of m
      if (mr[1] < 0.505) {
         sex <- 1
         cm <- 170 + 7*mr[2]
      } else {
         sex <- 0
         cm <- 160 + 5*mr[2] 
      }
      return(c(sex, cm))
   }
   person <- t(apply(m, 1, draw))
   person <- as.data.frame(person)
   person[, 1] <- ifelse(person[, 1]==1, "M", "F")
   names(person) <- c("Gender", "Height")
   return(person)
}
simSexHeight(5)
simSexHeight_M <- function(n)  {
  t <- data.frame(Gender = c(sample(c("F","M"), replace=TRUE, size=n)),
                  Height = c(rnorm(n)))
  t <- t %>% mutate(Height = ifelse(Gender == "F", 160 + 5*Height, 170 + 7*Height))
  return(t)
}
simSexHeight_M(5)

Exercises

Q1.

Download the data set consisting 9 different measurements on 1,597 faculty, including their salaries, gender, ranks, and etc, recorded over several years by executing the first few lines of the salary script. Note that there are different numbers of observations per faculty. Generate a plot (see below for example) the distributions of faculty’s salaries in their final years next to that for all years by rank (Assist, Assoc, Full). You will need to do a bit of programming for data manipulation to get the salary for the final year for each faculty.

fL <- "http://faculty.washington.edu/kenrice/rintro/salary.txt"
dta <- read.table(fL, header=TRUE)
dim(dta)
## [1] 19792    11
head(dta)
str(dta)
## 'data.frame':    19792 obs. of  11 variables:
##  $ case   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ id     : int  1 2 2 4 6 6 6 6 6 7 ...
##  $ gender : Factor w/ 2 levels "F","M": 1 2 2 2 2 2 2 2 2 2 ...
##  $ deg    : Factor w/ 3 levels "Other","PhD",..: 1 1 1 2 2 2 2 2 2 2 ...
##  $ yrdeg  : int  92 91 91 96 66 66 66 66 66 70 ...
##  $ field  : Factor w/ 3 levels "Arts","Other",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ startyr: int  95 94 94 95 91 91 91 91 91 71 ...
##  $ year   : int  95 94 95 95 91 92 93 94 95 76 ...
##  $ rank   : Factor w/ 3 levels "Assist","Assoc",..: 1 1 1 1 3 3 3 3 3 1 ...
##  $ admin  : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ salary : num  6684 4743 4881 NA 11182 ...
dtafinal <- dta %>% filter(!is.na(salary), year == max(year)) %>% 
  select(id, rank, salary) %>% mutate(tag = 'final')
dtafull <- dta %>% filter(!is.na(salary), !is.na(rank)) %>% 
  group_by(id, rank) %>% summarize(salary = mean(salary)) %>% mutate(tag = 'full')
dta_M <- bind_rows(dtafinal,dtafull)

ggplot(data = dta_M, aes(rank, salary)) + 
  geom_boxplot(aes(color = tag)) + 
  labs(x = "",
       y = "Salary") +
  theme_bw()

Q2.

The riffle function carries out the task of interlacing two lists of numbers, starting with the longer list, without repeating. The same objective could have been achieved with the script here. Revise the orginal riffle function with the approach taken in the latter.

# riffle
# interlace two lists of numbers, starting with the longer list,
# without repeating
riffle <- function(x, y) {
  z <- NULL
  count <- 1
  for (i in 1:max(length(x), length(y))) {
      if (i <= length(x)) {
          z[count] <- x[i]
          count <- count + 1
      }
      if (i <= length(y)) {
          z[count] <- y[i]
          count <- count + 1
      }
  }
  return(z)
}
riffle(1:10, 50:55)
##  [1]  1 50  2 51  3 52  4 53  5 54  6 55  7  8  9 10
# another method
x <- 1:10
y <- 50:55
dl <- length(x) - length(y)
y[(length(y)+1):(length(y)+dl)] <- rep(NA, dl)
c(na.omit(as.numeric(t(cbind(x,y)))))
##  [1]  1 50  2 51  3 52  4 53  5 54  6 55  7  8  9 10
# modified
riffle_M <- function(x, y) {
  dl <- length(x) - length(y)
  y[(length(y) + 1):(length(y) + dl)] <- rep(NA, dl)
  z <- na.omit(as.numeric(t(cbind(x, y))))
  return(z)
}
riffle_M(1:10, 50:55)
##  [1]  1 50  2 51  3 52  4 53  5 54  6 55  7  8  9 10
## attr(,"na.action")
## [1] 14 16 18 20
## attr(,"class")
## [1] "omit"

Q3.

The script produces the color strip below: Draw the color squares below by revising and vectorizing the code given. (Hint: ?rect)

plot.new()
plot.window(xlim = c(0, 6), ylim = c(0, 6), asp = 1)
my_cl <- c("indianred", "orange", "yellow", "forestgreen", "dodgerblue",
           "violet", "purple")
for(i in 1:6) rect(i-1, i-1, i, i, col = my_cl[i])

plot.new()
plot.window(xlim = c(0, 6), ylim = c(0, 6), asp = 1)
t <- 1
for(i in 1:6){
  for(j in 1:6){
    col <- ifelse(t %% 7 == 0, "purple", my_cl[t %% 7])
    rect(j-1, i-1, j, i, col = col)
    t  <- t + 1
  }
}

Q4.

Simulate sequences of 100 coin tosses and look for the length of the longest streak (run) in each sequence. Investigate the relationship between the probability of heads (or tails) and the maximum run length. (Hint: ?rle).

# 連續投擲出同一面的紀錄
sim_coin <- function(p) sample(c(0,1), replace=TRUE, size=100, prob=c(p,1-p))
sapply(seq(0,1,0.1), function(x) max(rle(sim_coin(x))$lengths))
##  [1] 100  26   9   9   7   8  10  14  13  22 100

Q5.

Use the geometrical properties of the following plot to write an R function to estimate the value of π (i.e., the area of a unit circle) through Monte Carlo simulation.

plot.new()
plot.window(xlim = c(0, 1), ylim = c(0, 1), asp = 1)
c <- 0
for(i in 1:10000){
  p   <- c(runif(1),runif(1))
  col <- ifelse(sqrt(p[1]^2+p[2]^2)>1, "red", "green")
  c   <- ifelse(sqrt(p[1]^2+p[2]^2)>1, c, c+1)
  points(p[1], p[2], pch = '.', col = col)
}
axis(1,at=0:1)
axis(2,at=0:1)

c/10000*4 # pi
## [1] 3.1288

Q6.

(Bonus) Sheu and O’Curry (1996) implemented several nonparametric multivariate statistics in the S language. Turn their scripts into R programs.