DataM: Homework Exercise 0511 - Programming
HW exercise 1.
Download the data set consisting 9 different measurements on 1,597 faculty, including their salaries, gender, ranks, and etc, recorded over several years by executing the first few lines of the following script. Note that there are different numbers of observations per faculty. Generate a plot (see below for example) the distributions of faculty’s salaries in their final years next to that for all years by rank (Assist, Assoc, Full). You will need to do a bit of programming for data manipulation to get the salary for the final year for each faculty.
Target output
Download and check
fL <- "http://faculty.washington.edu/kenrice/rintro/salary.txt"
dta <- read.table(fL, header=TRUE)
str(dta)'data.frame': 19792 obs. of 11 variables:
$ case : int 1 2 3 4 5 6 7 8 9 10 ...
$ id : int 1 2 2 4 6 6 6 6 6 7 ...
$ gender : chr "F" "M" "M" "M" ...
$ deg : chr "Other" "Other" "Other" "PhD" ...
$ yrdeg : int 92 91 91 96 66 66 66 66 66 70 ...
$ field : chr "Other" "Other" "Other" "Other" ...
$ startyr: int 95 94 94 95 91 91 91 91 91 71 ...
$ year : int 95 94 95 95 91 92 93 94 95 76 ...
$ rank : chr "Assist" "Assist" "Assist" "Assist" ...
$ admin : int 0 0 0 0 1 1 0 0 0 0 ...
$ salary : num 6684 4743 4881 NA 11182 ...
[1] 19792 11
[1] "case" "id" "gender" "deg" "yrdeg" "field" "startyr"
[8] "year" "rank" "admin" "salary"
Data transformation
Group the dataset by id and rank to compute salary mean of full years
Group the dataset by id only to find the salary of the final year
Join the data
dta_long <- full_join(dta1, dta2) %>%
reshape2::melt(id.vars = c('id', 'rank'), measure.vars = c('full', 'final'))
summary(dta_long) id rank variable value
Min. : 1.0 Length:5524 full :2762 Min. : 1334
1st Qu.: 449.0 Class :character final:2762 1st Qu.: 3828
Median : 868.5 Mode :character Median : 4876
Mean : 877.3 Mean : 5167
3rd Qu.:1305.0 3rd Qu.: 6273
Max. :1770.0 Max. :14464
NA's :1166
Data visualization
par(oma = c(1, 0, 0, 0))
boxplot(value ~ variable + rank, data = dta_long, col = c('white', 'red'),
xlab = '', ylab = 'salary', xaxt = 'n')
axis(1, at = 1:6, labels = rep(c('full', 'final'), 3))
mtext(c('Assist', 'Assoc', 'Full'), side = 1, line = -2.5,
at = c(0.27, 0.53, 0.78), cex = 1, outer = TRUE)HW exercise 2.
The riffle function carries out the task of interlacing two lists of numbers, starting with the longer list, without repeating. The same objective could have been achieved with the script here. Revise the orginal riffle function with the approach taken in the latter.
The original scipts
#
# riffle
# interlace two lists of numbers, starting with the longer list,
# without repeating
#
riffle <- function(x, y) {
z <- NULL
count <- 1
for (i in 1:max(length(x), length(y))) {
if (i <= length(x)) {
z[count] <- x[i]
count <- count + 1
}
if (i <= length(y)) {
z[count] <- y[i]
count <- count + 1
}
}
return(z)
}
###
##
# test it
riffle(1:10, 50:55) [1] 1 50 2 51 3 52 4 53 5 54 6 55 7 8 9 10
x <- 1:10
y <- 50:55
dl <- length(x) - length(y)
y[(length(y)+1):(length(y)+dl)] <- rep(NA, dl)
c(na.omit(as.numeric(t(cbind(x,y))))) [1] 1 50 2 51 3 52 4 53 5 54 6 55 7 8 9 10
The modified script
riffle2 <- function(x, y) {
dl <- length(x) - length(y)
y[(length(y) + 1):(length(y) + dl)] <- rep(NA, dl)
z <- na.omit(as.numeric(t(cbind(x, y))))
return(z)
}
riffle(1:10, 50:55) [1] 1 50 2 51 3 52 4 53 5 54 6 55 7 8 9 10
[1] -10 5 -9 6 -8 7 -7 8 -6 9 -5 10 -4 11 -3 12 -2 13 -1
[20] 14 0 15 1 2 3 4 5 6 7 8 9 10
[1] -1.1819838 0.0000000 1.4574409 5.0000000 0.2713082 1.0000000
[7] 0.8329461 3.0000000 1.0160099 2.0000000 1.8264172
HW exercise 3.
The following R script produces the color strip below:
#
# plot a color strip
#
plot.new()
plot.window(xlim = c(0, 6), ylim = c(0, 6), asp = 1)
my_cl <- c("indianred", "orange", "yellow", "forestgreen",
"dodgerblue", "violet", "purple")
for(i in 1:6) rect(i-1, i-1, i, i, col = my_cl[i])Draw the color squares below by revising and vectorizing the code given. (Hint: ?rect)
Target output
Method 1: Double for loop
The original senario
my_cl <- c("indianred", "orange", "yellow", "forestgreen", "dodgerblue", "violet", "purple")
k <- length(my_cl) - 1
plot.new()
plot.window(xlim = c(0, k), ylim = c(0, k), asp = 1)
for(i in 1:k) {
for (j in 1:k) {
rect(i-1, j-1, i, j, col = my_cl[(i + k+1 - j) %% (k+1) + 1])
}
}Turn into the function
color_squares <- function(cl = 1:7) {
k <- length(cl) - 1
par(mar = rep(1, 4))
plot.new()
plot.window(xlim = c(0, k), ylim = c(0, k), asp = 1)
for(i in 1:k) {
for (j in 1:k) {
rect(i-1, j-1, i, j, col = cl[(i + k+1 - j) %% (k+1) + 1])
}
}
}
par(mfrow=c(2,3))
color_squares()
color_squares(1:5)
color_squares(my_cl)
color_squares(grey(seq(0, 1, by=.1)))
color_squares(c(paste0('slateblue', 1:4), paste0('turquoise', 1:4)))
color_squares(c(paste0('violetred', 1:4), paste0('palevioletred', 1:4)))Method 2: Sapply (vectorization)
my_cl <- c("indianred", "orange", "yellow", "forestgreen", "dodgerblue", "violet", "purple")
k <- length(my_cl)
plot.new()
plot.window(xlim = c(0, k-1), ylim = c(0, k-1), asp = 1)
plt <- sapply(0:(k-2), function(i) {
rect(i:(k-2), i, (i+1):(k-1), i+1, col = my_cl[-(k-i)])
rect(0:(i-1)*(i>1), i, 1:i, i+1, col = my_cl[(k-i+1):k])
})Turn into the function
color_squares <- function(cl = 0:6) {
k <- length(cl)
par(mar = rep(1, 4))
plot.new()
plot.window(xlim = c(0, k-1), ylim = c(0, k-1), asp = 1)
plt <- sapply(0:(k-2), function(i) {
rect(i:(k-2), i, (i+1):(k-1), i+1, col = cl[-(k-i)])
rect(0:(i-1)*(i>1), i, 1:i, i+1, col = cl[(k-i+1):k])
})
}
par(mfrow=c(2,3))
color_squares()
color_squares(seq(0, 8, 2))
color_squares(my_cl)
color_squares(grey(seq(0, 1, by=.05)))
color_squares(c(paste0('gold', 1:4), paste0('goldenrod', 1:4)))
color_squares(c(paste0('mediumorchid', 1:4), paste0('mediumpurple', 1:4)))HW exercise 4.
Simulate sequences of 100 coin tosses and look for the length of the longest streak (run) in each sequence. Investigate the relationship between the probability of heads (or tails) and the maximum run length. (Hint: ?rle).
coin <- function(p =.5, n= 100) {ifelse(runif(n) < p, 'H', 'T')}
sapply(seq(0, 1, by = .1), function(p) c(p, max(rle(coin(p))[[1]]))) %>%
t() %>% as.data.frame()par(mfrow=c(2,2))
plts <- lapply(c(30, 100, 200, 1000), function(n) {
df <- sapply(seq(0, 1, by = .01),
function(p) c(p, max(rle(coin(p, n))[[1]]))) %>%
t() %>% as.data.frame()
plot(df$V1, df$V2, type='l', xlab = 'Probability', ylab = 'The longest streak')
text(.5, n*.9, paste0('Sample size = ', n))
})As the probability get closer to 0.5, the longest streak is smaller. The longest streak equals the sample size (n) when the probability is 0 or 1. This pattern gets more obvious as the sample size increases.
HW exercise 5.
Use the geometrical properties of the following plot to write an R function to estimate the value of \(\pi\) (i.e., the area of a unit circle) through Monte Carlo simulation.
Target output
Let \(r\) be the radius of the unit circle, the green area (the one fourth of the unit circle) \(A\) should be \(\frac{1}{4}r^2 \pi\). The ratio of \(A\) and the whole square area is \(A:1 = A = \frac{1}{4}r^2 \pi\). Thus, \(\pi = 4A\).
[1] 3.13892
Turn into the function to do more investigation
estimate_pi <- function(n=10000, plt=FALSE) {
x <- runif(n)
y <- runif(n)
inner_points <- x^2 + y^2 <= 1
if (plt) {
par(pty = 's', mar = c(4, 2, 1, 1))
plot(x, y, pch = 19,
col = c('steelblue3', 'sienna2')[inner_points*1 + 1])
}
else {return (mean(inner_points)*4)}
}Find the relationship of sample size and the estimate value
ns <- seq(500, 500000, by=500)
estimates <- sapply(ns, function(n) estimate_pi(n))
summary(estimates) Min. 1st Qu. Median Mean 3rd Qu. Max.
3.083 3.139 3.142 3.142 3.144 3.189
As the sample size increases, the estimate value gets more stably around the true value of \(\pi\).
HW exercise 6.
(Bonus) Sheu and O’Curry (1996) implemented several nonparametric multivariate statistics in the S language. Turn their scripts into R programs.
Listing 1
The original S script
bdi <- matrix(scan("bdi.scores"), ncol=2, byrow=T)
estm <- matrix(scan("esteem.scores"), ncol=2, byrow=T)
# We assume the BDI and Self-Esteem scores are stored in two separate files under the
# current working directory. bdi and estm are 10 by 2 matrices of scores
dfBdi <- bdi[,2] - bdi[,1]
dtEstm <- estm[,2] - estm[, 1]
# This generates a vector o f difference scores for BDI and Self-Esteem scores
sign <- function(M) ifelse(M < 0, -1, 1)
# This creates a user-defined sign function
sgnDf <- cbind(sign(dfBdi), sign(dtEstm))
# The two vectors of signs are bound together column-wise
dfSgn <- cbind(sum(sgnDf[,1]), sum(sgnDf[,2]))
# This is a row vector of difference between the number of positive and negative signs
dfSgnV <- crossprod(sgnDf)
# This isnV
starD <- dfSgn %*% solve(dfSgnV) %*% t(dfSgn)
# This calculates the test statistic D*. In S, solve inverts a matrix. Note that dfSgn is a row vector
1 - pchisq(starD, df=2)
# pchisq is the cumulative chi-square distribution functionThe modified R script
bdi <- T1 %>% dplyr::select(contains('BDI'))
estm <- T1 %>% dplyr::select(contains('esteem'))
sign <- function(M) ifelse(M < 0, -1, 1)
sgnDf <- sapply(list(bdi, estm), function(df) sign(df[,2] - df[,1]))
dfSgn <- matrix(colSums(sgnDf))
starD <- as.vector(t(dfSgn) %*% solve(crossprod(sgnDf)) %*% dfSgn)
p <- 1 - pchisq(starD, df=2)
list(stat = starD, p = p, significance = p < .05)$stat
[1] 6.444444
$p
[1] 0.03986637
$significance
[1] TRUE
Listing 2
The original S script
rnkDfBdi <- rank(abs(dfBdi))
rnkDfEstm <- rank(abs(dfEstm))
# This ranks the difference scores in terms of absolute values
rnkDf <- cbind(rnkDfBdi, rnkDfEstm)
sgnDf <- cbind(sign(dfBdi), sign(dfEstm))
# Generates a matrix of 10 x 2 ranks and a corresponding matrix of signs
sgnRnkDf <- rnkDf * sgnDf
# Attaches signs to corresponding ranks
nObs <- nrow(bdi)
# Defines the size ofthe sample
sgnRnk <- sgnRnkDf / (nObs+1)
sgnRnkT <- cbind(sum(sgnRnk[,1]), sum(sgnRnk[,2]))
# The sign-rank statistic as a row vector
cvT <- crossprod(sgnRnk)
# This is nV
starT <- sgnRnkT %*% solve(cvT) %*% t(sgnRnkT)
# This is the test statistic
1 - pchisq(starT, df=2)
# The corresponding P-value using chi-square approximationThe modified R script
rnkDf <- sapply(list(bdi, estm), function(df) rank(abs(df[,2] - df[,1])))
sgnDf <- sapply(list(bdi, estm), function(df) sign(df[,2] - df[,1]))
sgnRnk <- (rnkDf * sgnDf) / (nrow(bdi) + 1)
sgnRnkT <- matrix(colSums(sgnRnk))
starT <- as.vector(t(sgnRnkT) %*% solve(crossprod(sgnRnk)) %*% sgnRnkT)
p <- 1 - pchisq(starT, df=2)
list(stat = starT, p = p, significance = p < .05)$stat
[1] 7.103517
$p
[1] 0.02867418
$significance
[1] TRUE
Listing 3
The original S script
cntrl <- matrix(scan("mice.cntrl.dat"), ncol=2, byrow=T)
trtmnt <- matrix(scan("mice.trtmnt.dat"), ncol=2, byrow=T)
# Reads data in matrix form
nC <- 10; nT <- 12; N <- nC+nT
# Sets number o f observations in each group and total number of observations
rnkC1T1 <- rank(append(cntrl[,1], trtmnt[,1]))
rnkC2T2 <- rank(append(cntrl[,2], trtmnt[,2]))
# Assigns ranks to observations o f each compound in the pooled sample
rnkC1 <- rnkC1T1[1:nC]
rnkC2 <- rnkC2T2[1:nC]
# Picks out the ranks of the control group
sumRnkC1 <- sum((rnkC1/(N + 1)) - 0.5)
sumRnkC2 <- sum((rnkC2/(N + 1)) - 0.5)
U <- cbind(sumRnkC1, sumRnkC2)
# This is the rank-sum statistic as a row vector
rnkCT <- cbind(rnkC1T1 , rnkC2T2)
# A 22 by 2 matrix ofjoint rankings
coefM <- nC*nT / ((N+1)*(N+1)*(N-1)*N*N)
coefL <- (N*(N+1)*(N+1))/4
# Defines coefficients in covariance formula
cvU <- cqefM*((crossprod(rnkCT))-coefL)
# This is V.
starU <- U %*% solve(N*cvU) %*% t(U)
# This is the test statistic
1 - pchisq(starU, df=2) The modified R script
C1T1 <- T2 %>% dplyr::select(contains('1'))
C2T2 <- T2 %>% dplyr::select(contains('2'))
nC <- length(rowMeans(T2[,1:2]))
nT <- length(rowMeans(T2[,3:4]))
N <- nC + nT
rnkCT <- sapply(list(C1T1, C2T2),
function(df) rank(stack(df)$value)) %>% head(., nC)
U <- apply(rnkCT, 2, function(v) sum((v / (N + 1)) - 0.5)) %>% as.matrix()
coefM <- nC*nT / ((N+1)*(N+1)*(N-1)*N*N)
coefL <- (N*(N+1)*(N+1))/4
cvU <- coefM*((crossprod(rnkCT))-coefL)
starU <- t(U) %*% solve(N*cvU) %*% U
p <- 1 - pchisq(starU, df=2)
list(stat = starU, p = p, significance = p<.05)$stat
[,1]
[1,] 17.41802
$p
[,1]
[1,] 0.000165092
$significance
[,1]
[1,] TRUE