library(dplyr)
library(ggplot2)

HW exercise 1.

Download the data set consisting 9 different measurements on 1,597 faculty, including their salaries, gender, ranks, and etc, recorded over several years by executing the first few lines of the following script. Note that there are different numbers of observations per faculty. Generate a plot (see below for example) the distributions of faculty’s salaries in their final years next to that for all years by rank (Assist, Assoc, Full). You will need to do a bit of programming for data manipulation to get the salary for the final year for each faculty.

Target output

Download and check

fL <- "http://faculty.washington.edu/kenrice/rintro/salary.txt"
dta <- read.table(fL, header=TRUE)
str(dta)
'data.frame':   19792 obs. of  11 variables:
 $ case   : int  1 2 3 4 5 6 7 8 9 10 ...
 $ id     : int  1 2 2 4 6 6 6 6 6 7 ...
 $ gender : chr  "F" "M" "M" "M" ...
 $ deg    : chr  "Other" "Other" "Other" "PhD" ...
 $ yrdeg  : int  92 91 91 96 66 66 66 66 66 70 ...
 $ field  : chr  "Other" "Other" "Other" "Other" ...
 $ startyr: int  95 94 94 95 91 91 91 91 91 71 ...
 $ year   : int  95 94 95 95 91 92 93 94 95 76 ...
 $ rank   : chr  "Assist" "Assist" "Assist" "Assist" ...
 $ admin  : int  0 0 0 0 1 1 0 0 0 0 ...
 $ salary : num  6684 4743 4881 NA 11182 ...
dim(dta)
[1] 19792    11
names(dta)
 [1] "case"    "id"      "gender"  "deg"     "yrdeg"   "field"   "startyr"
 [8] "year"    "rank"    "admin"   "salary" 

Data transformation

Group the dataset by id and rank to compute salary mean of full years

dta1 <- dta %>% dplyr::filter(!is.na(salary)) %>%
  group_by(id, rank) %>%
  summarise(full = mean(salary)) 

Group the dataset by id only to find the salary of the final year

dta2 <- dta %>% dplyr::filter(!is.na(salary)) %>%
  group_by(id) %>%
  summarise(rank = rank[which.max(year)], final = salary[which.max(year)])

Join the data

dta_long <- full_join(dta1, dta2) %>% 
  reshape2::melt(id.vars = c('id', 'rank'), measure.vars = c('full', 'final'))
summary(dta_long)
       id             rank            variable        value      
 Min.   :   1.0   Length:5524        full :2762   Min.   : 1334  
 1st Qu.: 449.0   Class :character   final:2762   1st Qu.: 3828  
 Median : 868.5   Mode  :character                Median : 4876  
 Mean   : 877.3                                   Mean   : 5167  
 3rd Qu.:1305.0                                   3rd Qu.: 6273  
 Max.   :1770.0                                   Max.   :14464  
                                                  NA's   :1166   

Data visualization

par(oma = c(1, 0, 0, 0))
boxplot(value ~ variable + rank, data = dta_long, col = c('white', 'red'),
        xlab = '', ylab = 'salary', xaxt = 'n')
axis(1, at = 1:6, labels = rep(c('full', 'final'), 3))
mtext(c('Assist', 'Assoc', 'Full'), side = 1, line = -2.5,
      at = c(0.27, 0.53, 0.78), cex = 1, outer = TRUE)


HW exercise 2.

The riffle function carries out the task of interlacing two lists of numbers, starting with the longer list, without repeating. The same objective could have been achieved with the script here. Revise the orginal riffle function with the approach taken in the latter.

The original scipts

#
# riffle
# interlace two lists of numbers, starting with the longer list,
# without repeating
#

riffle <- function(x, y) {
  z <- NULL
  count <- 1
  for (i in 1:max(length(x), length(y))) {
      if (i <= length(x)) {
          z[count] <- x[i]
          count <- count + 1
      }
      if (i <= length(y)) {
          z[count] <- y[i]
          count <- count + 1
      }
  }
  return(z)
}
###
##
# test it
riffle(1:10, 50:55)
 [1]  1 50  2 51  3 52  4 53  5 54  6 55  7  8  9 10
###
x <- 1:10
y <- 50:55
dl <- length(x) - length(y)
y[(length(y)+1):(length(y)+dl)] <- rep(NA, dl)
c(na.omit(as.numeric(t(cbind(x,y)))))
 [1]  1 50  2 51  3 52  4 53  5 54  6 55  7  8  9 10

The modified script

riffle2 <- function(x, y) {
  dl <- length(x) - length(y)
  y[(length(y) + 1):(length(y) + dl)] <- rep(NA, dl)
  z <- na.omit(as.numeric(t(cbind(x, y))))
  return(z)
}

riffle(1:10, 50:55)
 [1]  1 50  2 51  3 52  4 53  5 54  6 55  7  8  9 10
riffle(-10:10, 5:15)
 [1] -10   5  -9   6  -8   7  -7   8  -6   9  -5  10  -4  11  -3  12  -2  13  -1
[20]  14   0  15   1   2   3   4   5   6   7   8   9  10
riffle(rnorm(6), rpois(5, lambda = 2))
 [1] -1.1819838  0.0000000  1.4574409  5.0000000  0.2713082  1.0000000
 [7]  0.8329461  3.0000000  1.0160099  2.0000000  1.8264172


HW exercise 3.

The following R script produces the color strip below:

#
# plot a color strip
#
plot.new()
plot.window(xlim = c(0, 6), ylim = c(0, 6), asp = 1)
my_cl <- c("indianred", "orange", "yellow", "forestgreen",
           "dodgerblue", "violet", "purple")
for(i in 1:6) rect(i-1, i-1, i, i, col = my_cl[i])

###

Draw the color squares below by revising and vectorizing the code given. (Hint: ?rect)

Target output

Method 1: Double for loop

The original senario

my_cl <- c("indianred", "orange", "yellow", "forestgreen", "dodgerblue", "violet", "purple")
k <- length(my_cl) - 1
plot.new()
plot.window(xlim = c(0, k), ylim = c(0, k), asp = 1)
for(i in 1:k) {
  for (j in 1:k) {
    rect(i-1, j-1, i, j, col = my_cl[(i + k+1 - j) %% (k+1) + 1])
  }
}

Turn into the function

color_squares <- function(cl = 1:7) {
  k <- length(cl) - 1
  par(mar = rep(1, 4))
  plot.new()
  plot.window(xlim = c(0, k), ylim = c(0, k), asp = 1)
  for(i in 1:k) {
    for (j in 1:k) {
      rect(i-1, j-1, i, j, col = cl[(i + k+1 - j) %% (k+1) + 1])
    }
  }
}

par(mfrow=c(2,3))
color_squares()
color_squares(1:5)
color_squares(my_cl)
color_squares(grey(seq(0, 1, by=.1)))
color_squares(c(paste0('slateblue', 1:4), paste0('turquoise', 1:4)))
color_squares(c(paste0('violetred', 1:4), paste0('palevioletred', 1:4)))

Method 2: Sapply (vectorization)

my_cl <- c("indianred", "orange", "yellow", "forestgreen", "dodgerblue", "violet", "purple")
k <- length(my_cl)
plot.new()
plot.window(xlim = c(0, k-1), ylim = c(0, k-1), asp = 1)
plt <- sapply(0:(k-2), function(i) {
  rect(i:(k-2), i, (i+1):(k-1), i+1, col = my_cl[-(k-i)])
  rect(0:(i-1)*(i>1), i, 1:i, i+1, col = my_cl[(k-i+1):k])
})

Turn into the function

color_squares <- function(cl = 0:6) {
  k <- length(cl)
  par(mar = rep(1, 4))
  plot.new()
  plot.window(xlim = c(0, k-1), ylim = c(0, k-1), asp = 1)
  plt <- sapply(0:(k-2), function(i) {
    rect(i:(k-2), i, (i+1):(k-1), i+1, col = cl[-(k-i)])
    rect(0:(i-1)*(i>1), i, 1:i, i+1, col = cl[(k-i+1):k])
  })
}

par(mfrow=c(2,3))
color_squares()
color_squares(seq(0, 8, 2))
color_squares(my_cl)
color_squares(grey(seq(0, 1, by=.05)))
color_squares(c(paste0('gold', 1:4), paste0('goldenrod', 1:4)))
color_squares(c(paste0('mediumorchid', 1:4), paste0('mediumpurple', 1:4)))


HW exercise 4.

Simulate sequences of 100 coin tosses and look for the length of the longest streak (run) in each sequence. Investigate the relationship between the probability of heads (or tails) and the maximum run length. (Hint: ?rle).

coin <- function(p =.5, n= 100) {ifelse(runif(n) < p, 'H', 'T')}
sapply(seq(0, 1, by = .1), function(p) c(p, max(rle(coin(p))[[1]]))) %>%
  t() %>% as.data.frame()
par(mfrow=c(2,2))
plts <- lapply(c(30, 100, 200, 1000), function(n) {
  df <- sapply(seq(0, 1, by = .01), 
               function(p) c(p, max(rle(coin(p, n))[[1]]))) %>%
    t() %>% as.data.frame()
  plot(df$V1, df$V2, type='l', xlab = 'Probability', ylab = 'The longest streak')
  text(.5, n*.9, paste0('Sample size = ', n))
})

As the probability get closer to 0.5, the longest streak is smaller. The longest streak equals the sample size (n) when the probability is 0 or 1. This pattern gets more obvious as the sample size increases.


HW exercise 5.

Use the geometrical properties of the following plot to write an R function to estimate the value of \(\pi\) (i.e., the area of a unit circle) through Monte Carlo simulation.

Target output

Let \(r\) be the radius of the unit circle, the green area (the one fourth of the unit circle) \(A\) should be \(\frac{1}{4}r^2 \pi\). The ratio of \(A\) and the whole square area is \(A:1 = A = \frac{1}{4}r^2 \pi\). Thus, \(\pi = 4A\).

x <- runif(100000)
y <- runif(100000)
mean(x^2 + y^2 <= 1)*4
[1] 3.13892

Turn into the function to do more investigation

estimate_pi <- function(n=10000, plt=FALSE) {
  x <- runif(n)
  y <- runif(n)
  inner_points <- x^2 + y^2 <= 1
  if (plt) {
    par(pty = 's', mar = c(4, 2, 1, 1))
    plot(x, y, pch = 19,
         col = c('steelblue3', 'sienna2')[inner_points*1 + 1])
  }
  else {return (mean(inner_points)*4)}
}

Find the relationship of sample size and the estimate value

ns <- seq(500, 500000, by=500)
estimates <- sapply(ns, function(n) estimate_pi(n))
summary(estimates)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  3.083   3.139   3.142   3.142   3.144   3.189 
plot(ns, estimates, type = 'l', ylim = c(3.1, 3.2))
abline(h = pi, col='red')

As the sample size increases, the estimate value gets more stably around the true value of \(\pi\).

Data visualization

par(mfrow=c(2,2), mar = rep(1, 4))
plts <- lapply(c(500, 1000, 2000, 10000),
               function(n) estimate_pi(n, plt = TRUE))


HW exercise 6.

(Bonus) Sheu and O’Curry (1996) implemented several nonparametric multivariate statistics in the S language. Turn their scripts into R programs.

Listing 1

The original S script

bdi <- matrix(scan("bdi.scores"), ncol=2, byrow=T)
estm <- matrix(scan("esteem.scores"), ncol=2, byrow=T)
# We assume the BDI and Self-Esteem scores are stored in two separate files under the
# current working directory. bdi and estm are 10 by 2 matrices of scores

dfBdi <- bdi[,2] - bdi[,1]
dtEstm <- estm[,2] - estm[, 1]
# This generates a vector o f difference scores for BDI and Self-Esteem scores

sign <- function(M) ifelse(M < 0, -1, 1)
# This creates a user-defined sign function

sgnDf <- cbind(sign(dfBdi), sign(dtEstm))
# The two vectors of signs are bound together column-wise

dfSgn <- cbind(sum(sgnDf[,1]), sum(sgnDf[,2]))
# This is a row vector of difference between the number of positive and negative signs

dfSgnV <- crossprod(sgnDf)
# This isnV

starD <- dfSgn %*% solve(dfSgnV) %*% t(dfSgn)
# This calculates the test statistic D*. In S, solve inverts a matrix. Note that dfSgn is a row vector

1 - pchisq(starD, df=2)
# pchisq is the cumulative chi-square distribution function

The modified R script

T1 <- read.table('../data/data_hw0511_6_Table1.txt', header = TRUE)
T1 
bdi <- T1 %>% dplyr::select(contains('BDI'))
estm <- T1 %>% dplyr::select(contains('esteem'))

sign <- function(M) ifelse(M < 0, -1, 1)
sgnDf <- sapply(list(bdi, estm), function(df) sign(df[,2] - df[,1]))
dfSgn <- matrix(colSums(sgnDf))
starD <- as.vector(t(dfSgn) %*% solve(crossprod(sgnDf)) %*% dfSgn)
p <- 1 - pchisq(starD, df=2)
list(stat = starD, p = p, significance = p < .05)
$stat
[1] 6.444444

$p
[1] 0.03986637

$significance
[1] TRUE

Listing 2

The original S script

rnkDfBdi <- rank(abs(dfBdi))
rnkDfEstm <- rank(abs(dfEstm))
# This ranks the difference scores in terms of absolute values

rnkDf <- cbind(rnkDfBdi, rnkDfEstm)
sgnDf <- cbind(sign(dfBdi), sign(dfEstm))
# Generates a matrix of 10 x 2 ranks and a corresponding matrix of signs

sgnRnkDf <- rnkDf * sgnDf
# Attaches signs to corresponding ranks

nObs <- nrow(bdi)
# Defines the size ofthe sample

sgnRnk <- sgnRnkDf / (nObs+1)
sgnRnkT <- cbind(sum(sgnRnk[,1]), sum(sgnRnk[,2]))
# The sign-rank statistic as a row vector

cvT <- crossprod(sgnRnk)
# This is nV

starT <- sgnRnkT %*% solve(cvT) %*% t(sgnRnkT)
# This is the test statistic

1 - pchisq(starT, df=2) 
# The corresponding P-value using chi-square approximation

The modified R script

rnkDf <- sapply(list(bdi, estm), function(df) rank(abs(df[,2] - df[,1])))
sgnDf <- sapply(list(bdi, estm), function(df) sign(df[,2] - df[,1]))
sgnRnk <- (rnkDf * sgnDf) / (nrow(bdi) + 1)
sgnRnkT <- matrix(colSums(sgnRnk))
starT <- as.vector(t(sgnRnkT) %*% solve(crossprod(sgnRnk)) %*% sgnRnkT)
p <- 1 - pchisq(starT, df=2)
list(stat = starT, p = p, significance = p < .05)
$stat
[1] 7.103517

$p
[1] 0.02867418

$significance
[1] TRUE

Listing 3

The original S script

cntrl <- matrix(scan("mice.cntrl.dat"), ncol=2, byrow=T)
trtmnt <- matrix(scan("mice.trtmnt.dat"), ncol=2, byrow=T) 
# Reads data in matrix form

nC <- 10; nT <- 12; N <- nC+nT
# Sets number o f observations in each group and total number of observations

rnkC1T1 <- rank(append(cntrl[,1], trtmnt[,1]))
rnkC2T2 <- rank(append(cntrl[,2], trtmnt[,2]))
# Assigns ranks to observations o f each compound in the pooled sample

rnkC1 <- rnkC1T1[1:nC]
rnkC2 <- rnkC2T2[1:nC]
# Picks out the ranks of the control group

sumRnkC1 <- sum((rnkC1/(N + 1)) - 0.5)
sumRnkC2 <- sum((rnkC2/(N + 1)) - 0.5)
U <- cbind(sumRnkC1, sumRnkC2)
# This is the rank-sum statistic as a row vector

rnkCT <- cbind(rnkC1T1 , rnkC2T2)
# A 22 by 2 matrix ofjoint rankings

coefM <- nC*nT / ((N+1)*(N+1)*(N-1)*N*N)
coefL <- (N*(N+1)*(N+1))/4
# Defines coefficients in covariance formula

cvU <- cqefM*((crossprod(rnkCT))-coefL)
# This is V.

starU <- U %*% solve(N*cvU) %*% t(U)

# This is the test statistic
1 - pchisq(starU, df=2) 

The modified R script

T2 <- read.table('../data/data_hw0511_6_Table2.txt', header = TRUE)
T2
C1T1 <- T2 %>% dplyr::select(contains('1'))
C2T2 <- T2 %>% dplyr::select(contains('2'))
nC <- length(rowMeans(T2[,1:2]))
nT <- length(rowMeans(T2[,3:4]))
N <- nC + nT

rnkCT <- sapply(list(C1T1, C2T2),
              function(df) rank(stack(df)$value)) %>% head(., nC)
U <- apply(rnkCT, 2, function(v) sum((v / (N + 1)) - 0.5)) %>% as.matrix()

coefM <- nC*nT / ((N+1)*(N+1)*(N-1)*N*N)
coefL <- (N*(N+1)*(N+1))/4
cvU <- coefM*((crossprod(rnkCT))-coefL)
starU <- t(U) %*% solve(N*cvU) %*% U
p <- 1 - pchisq(starU, df=2) 

list(stat = starU, p = p, significance = p<.05)
$stat
         [,1]
[1,] 17.41802

$p
            [,1]
[1,] 0.000165092

$significance
     [,1]
[1,] TRUE