library(tidyverse)
library(ggplot2)
library(dplyr)
library(animation)
library(gganimate)
library(gifski)What does the R script do? Revise the code so that it does the same job without resorting to the use of a nested ‘for’ loop.
ci_norm <- function(n=100, nci=50, nRun=3, level=0.95, pause=0.05) {
ci <- NULL
z0 = qnorm(1 - (1 - level)/2)
for ( j in 1:nRun ) {
zbar <- colMeans(replicate(nci, rnorm(n)))
zl <- zbar - z0 * 1/sqrt(n); zu <- zbar + z0 * 1/sqrt(n)
plot(1, xlim = c(0.5, nci + 0.5), ylim = c(min(zl), max(zu)),
type = "n", xlab = "Sample ID", ylab = "Average")
abline(h = 0, lty = 2)
for( i in 1:nci ) {
arrows(i, zl[i], i, zu[i], length = 0.05, angle = 90,
code = 3,
col = ifelse(0 > zl[i] & 0 < zu[i], "gray", "red"))
points(i, zbar[i], pch = 19, col = "gray")
Sys.sleep(pause)
}
Sys.sleep(3)
}
}
##
ci_norm(n = 36, nci = 51, nRun = 2)ci_norm_M <- function(n=100, nci=50, nRun=3, level=0.95, pause=0.05) {
ci <- NULL
z0 = qnorm(1 - (1 - level)/2)
sapply(1:nRun, function(x){
zbar <- colMeans(replicate(nci, rnorm(n)))
zl <- zbar - z0 * 1/sqrt(n); zu <- zbar + z0 * 1/sqrt(n)
plot(1, xlim = c(0.5, nci + 0.5), ylim = c(min(zl), max(zu)),
type = "n", xlab = "Sample ID", ylab = "Average")
abline(h = 0, lty = 2)
sapply(1:nci, function(x){
arrows(x, zl[x], x, zu[x], length = 0.05, angle = 90,
code = 3, col = ifelse(0 > zl[x] & 0 < zu[x], "gray", "red"))
points(x, zbar[x], pch = 19, col = "gray")
# Sys.sleep(pause)
})
# Sys.sleep(3)
})
return()
}
ci_norm_M(n = 36, nci = 51, nRun = 2)## NULL
Use the read and math variables from the high schools data example for this problem. First firgure out what this R script does and then eliminate the for loop in the code segment.
dta.asian <- subset(dta, race=="asian")
r0 <- cor(dta.asian$math, dta.asian$socst)
# Method 1
cnt <- 0
nIter <- 1001
for (i in 1:nIter) {
new <- sample(dta.asian$read) # sample 11 個值
r <- cor(new, dta.asian$math)
if ( r0 <= r ) cnt <- cnt+1
}
cnt/nIter## [1] 0.04295704
# Method 2
# replicate 對指定的運算式重複執行指定的次數
newread <- replicate(nIter, sample(dta.asian$read))
newread <- data.frame(unlist(newread))
unlist(head(with(newread, lapply(names(newread), function(x)
cor(dta.asian$math, eval(substitute(tmp, list(tmp=as.name(x))))))
)))## [1] -0.3066232 0.2957185 -0.1711931 -0.5194420 0.1564189 -0.4768783
##
## Pearson's product-moment correlation
##
## data: dta[dta$race == "asian", "math"] and dta[dta$race == "asian", "socst"]
## t = 1.9887, df = 9, p-value = 0.07796
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.07083501 0.86552255
## sample estimates:
## cor
## 0.5525177
corTest <- function(a,b,nIter=1001){
r0 <- cor(a, b)
t <- sapply(c(1:nIter), function(x){
s <- sample(a)
r <- cor(s,b)
return(r)
})
cat(length(t[t>=r0])/nIter)
}
corTest(dta.asian$math, dta.asian$read)## 0.02597403
An example of obtaining (nonparametric) bootstrap estimates of coefficients for a multiple linear regression model for the anorexia{MASS} data set is presented in the following script . Figure out how it works and improve the code.
## Treat Prewt Postwt
## CBT :29 Min. :70.00 Min. : 71.30
## Cont:26 1st Qu.:79.60 1st Qu.: 79.33
## FT :17 Median :82.30 Median : 84.05
## Mean :82.41 Mean : 85.17
## 3rd Qu.:86.00 3rd Qu.: 91.55
## Max. :94.90 Max. :103.60
## [1] 72 3
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
## [15] "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28"
## [29] "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42"
## [43] "43" "44" "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56"
## [57] "57" "58" "59" "60" "61" "62" "63" "64" "65" "66" "67" "68" "69" "70"
## [71] "71" "72"
##
## Call:
## lm(formula = Postwt ~ Prewt + Treat, data = dta)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.1083 -4.2773 -0.5484 5.4838 15.2922
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.7711 13.3910 3.717 0.00041 ***
## Prewt 0.4345 0.1612 2.695 0.00885 **
## TreatCont -4.0971 1.8935 -2.164 0.03400 *
## TreatFT 4.5631 2.1333 2.139 0.03604 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.978 on 68 degrees of freedom
## Multiple R-squared: 0.2777, Adjusted R-squared: 0.2458
## F-statistic: 8.713 on 3 and 68 DF, p-value: 5.719e-05
## (Intercept) Prewt TreatCont TreatFT
## 49.7711090 0.4344612 -4.0970655 4.5630627
bootstrap_function <- function(string_formula, nc, model_data, ndraws) {
coeff_mtx <- matrix(0, nrow = ndraws, ncol = nc)
for (i in 1:ndraws) {
bootstrap_ids <- sample(seq(nrow(model_data)), nrow(model_data), replace = TRUE)
bootstrap_data <- model_data[bootstrap_ids,]
bootstrap_model <- lm(as.formula(string_formula), bootstrap_data)
coeff_mtx[i,] <- bootstrap_model$coefficients
if (i == 1) {
print(bootstrap_model$coefficients)
}
}
return(coeff_mtx)
}
bootstrap_estimates <- bootstrap_function("Postwt ~ Prewt + Treat", nc=4, dta, 500)## (Intercept) Prewt TreatCont TreatFT
## 51.8422884 0.3881843 -2.0244971 5.4583632
bootstrap_CIs <- matrix(0, nrow = 4, ncol = 2)
for (i in 1:4) {
bootstrap_CIs[i,1] <- quantile(bootstrap_estimates[,i], 0.025)
bootstrap_CIs[i,2] <- quantile(bootstrap_estimates[,i], 0.975)
}
print(bootstrap_CIs)## [,1] [,2]
## [1,] 17.09660263 79.0948519
## [2,] 0.08239626 0.8169008
## [3,] -7.82788325 -0.7663607
## [4,] 0.17300893 8.9687487
## 2.5 % 97.5 %
## (Intercept) 23.0498681 76.4923499
## Prewt 0.1128268 0.7560955
## TreatCont -7.8754712 -0.3186599
## TreatFT 0.3060571 8.8200682
##
par(mfrow=c(2,2))
hist(bootstrap_estimates[,1])
hist(bootstrap_estimates[,2])
hist(bootstrap_estimates[,3])
hist(bootstrap_estimates[,4])# 全部打包在一起
bootstrap_CI_plot_M <- function(string_formula, nc, model_data, ndraws){
coeff_mtx <- matrix(0, nrow = ndraws, ncol = nc)
for (i in 1:ndraws) {
bootstrap_ids <- sample(seq(nrow(model_data)), nrow(model_data), replace = TRUE)
bootstrap_data <- model_data[bootstrap_ids,]
bootstrap_model <- lm(as.formula(string_formula), bootstrap_data)
coeff_mtx[i,] <- bootstrap_model$coefficients
}
bootstrap_CIs <- matrix(0, nrow = nc, ncol = 2)
for (i in 1:nc) bootstrap_CIs[i,] <- quantile(coeff_mtx[,i], c(0.025, 0.975))
cat("estimate:\n");print(bootstrap_CIs)
lm <- lm(as.formula(string_formula), model_data)
cat("lm:\n");print(confint(lm))
par(mfrow=c(nc/2,2))
for (i in 1:nc) hist(coeff_mtx[,i],main=names(coef(bootstrap_model))[i],xlab="")
}
bootstrap_CI_plot_M("Postwt ~ Prewt + Treat", nc=4, dta, 500)## estimate:
## [,1] [,2]
## [1,] 13.3493060 77.5083129
## [2,] 0.1082941 0.8886533
## [3,] -7.6072462 -0.4832734
## [4,] 0.2887894 8.9438489
## lm:
## 2.5 % 97.5 %
## (Intercept) 23.0498681 76.4923499
## Prewt 0.1128268 0.7560955
## TreatCont -7.8754712 -0.3186599
## TreatFT 0.3060571 8.8200682
## estimate:
## [,1] [,2]
## [1,] 17.78019346 78.0459616
## [2,] 0.09354962 0.8132488
## [3,] -7.47513934 -0.6040686
## [4,] -0.22859953 9.1089925
## lm:
## 2.5 % 97.5 %
## (Intercept) 23.0498681 76.4923499
## Prewt 0.1128268 0.7560955
## TreatCont -7.8754712 -0.3186599
## TreatFT 0.3060571 8.8200682
What does the R script do? Replace the “while” loop in the code with a different iteration control structure.
# n個點隨機亂動
Brownian <- function(n = 11, pause = 0.05, nIter = 512, ...) {
x = rnorm(n)
y = rnorm(n)
# i = 1
# while (i <= nIter) {
for(i in 1:nIter){
plot(x, y, ...)
text(x, y, cex = 0.5)
x = x + rnorm(n)
y = y + rnorm(n)
# Sys.sleep(pause)
# i = i + 1
}
}
Brownian(xlim = c(-20, 20), ylim = c(-20, 20),
pch = 21, cex = 2, col = "cyan", bg = "lavender") Here is an example of simulation using R. Figure out how it works and improve the code.
simSexHeight <- function(n) {
m <- matrix(c(runif(n), rnorm(n)), ncol=2)
draw <- function(mr) { # mr = one row of m
if (mr[1] < 0.505) {
sex <- 1
cm <- 170 + 7*mr[2]
} else {
sex <- 0
cm <- 160 + 5*mr[2]
}
return(c(sex, cm))
}
person <- t(apply(m, 1, draw))
person <- as.data.frame(person)
person[, 1] <- ifelse(person[, 1]==1, "M", "F")
names(person) <- c("Gender", "Height")
return(person)
}
simSexHeight(5)simSexHeight_M <- function(n) {
t <- data.frame(Gender = c(sample(c("F","M"), replace=TRUE, size=n)),
Height = c(rnorm(n)))
t <- t %>% mutate(Height = ifelse(Gender == "F", 160 + 5*Height, 170 + 7*Height))
return(t)
}
simSexHeight_M(5)Download the data set consisting 9 different measurements on 1,597 faculty, including their salaries, gender, ranks, and etc, recorded over several years by executing the first few lines of the salary script. Note that there are different numbers of observations per faculty. Generate a plot (see below for example) the distributions of faculty’s salaries in their final years next to that for all years by rank (Assist, Assoc, Full). You will need to do a bit of programming for data manipulation to get the salary for the final year for each faculty.
fL <- "http://faculty.washington.edu/kenrice/rintro/salary.txt"
dta <- read.table(fL, header=TRUE)
dim(dta)## [1] 19792 11
## 'data.frame': 19792 obs. of 11 variables:
## $ case : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : int 1 2 2 4 6 6 6 6 6 7 ...
## $ gender : Factor w/ 2 levels "F","M": 1 2 2 2 2 2 2 2 2 2 ...
## $ deg : Factor w/ 3 levels "Other","PhD",..: 1 1 1 2 2 2 2 2 2 2 ...
## $ yrdeg : int 92 91 91 96 66 66 66 66 66 70 ...
## $ field : Factor w/ 3 levels "Arts","Other",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ startyr: int 95 94 94 95 91 91 91 91 91 71 ...
## $ year : int 95 94 95 95 91 92 93 94 95 76 ...
## $ rank : Factor w/ 3 levels "Assist","Assoc",..: 1 1 1 1 3 3 3 3 3 1 ...
## $ admin : int 0 0 0 0 1 1 0 0 0 0 ...
## $ salary : num 6684 4743 4881 NA 11182 ...
dtafinal <- dta %>% filter(!is.na(salary), year == max(year)) %>%
select(id, rank, salary) %>% mutate(tag = 'final')
dtafull <- dta %>% filter(!is.na(salary), !is.na(rank)) %>%
group_by(id, rank) %>% summarize(salary = mean(salary)) %>% mutate(tag = 'full')
dta_M <- bind_rows(dtafinal,dtafull)
ggplot(data = dta_M, aes(rank, salary)) +
geom_boxplot(aes(color = tag)) +
labs(x = "",
y = "Salary") +
theme_bw()The riffle function carries out the task of interlacing two lists of numbers, starting with the longer list, without repeating. The same objective could have been achieved with the script here. Revise the orginal riffle function with the approach taken in the latter.
# riffle
# interlace two lists of numbers, starting with the longer list,
# without repeating
riffle <- function(x, y) {
z <- NULL
count <- 1
for (i in 1:max(length(x), length(y))) {
if (i <= length(x)) {
z[count] <- x[i]
count <- count + 1
}
if (i <= length(y)) {
z[count] <- y[i]
count <- count + 1
}
}
return(z)
}
riffle(1:10, 50:55)## [1] 1 50 2 51 3 52 4 53 5 54 6 55 7 8 9 10
# another method
x <- 1:10
y <- 50:55
dl <- length(x) - length(y)
y[(length(y)+1):(length(y)+dl)] <- rep(NA, dl)
c(na.omit(as.numeric(t(cbind(x,y)))))## [1] 1 50 2 51 3 52 4 53 5 54 6 55 7 8 9 10
# modified
riffle_M <- function(x, y) {
dl <- length(x) - length(y)
y[(length(y) + 1):(length(y) + dl)] <- rep(NA, dl)
z <- na.omit(as.numeric(t(cbind(x, y))))
return(z)
}
riffle_M(1:10, 50:55)## [1] 1 50 2 51 3 52 4 53 5 54 6 55 7 8 9 10
## attr(,"na.action")
## [1] 14 16 18 20
## attr(,"class")
## [1] "omit"
The script produces the color strip below: Draw the color squares below by revising and vectorizing the code given. (Hint: ?rect)
plot.new()
plot.window(xlim = c(0, 6), ylim = c(0, 6), asp = 1)
my_cl <- c("indianred", "orange", "yellow", "forestgreen", "dodgerblue",
"violet", "purple")
for(i in 1:6) rect(i-1, i-1, i, i, col = my_cl[i])plot.new()
plot.window(xlim = c(0, 6), ylim = c(0, 6), asp = 1)
t <- 1
for(i in 1:6){
for(j in 1:6){
col <- ifelse(t %% 7 == 0, "purple", my_cl[t %% 7])
rect(j-1, i-1, j, i, col = col)
t <- t + 1
}
}Simulate sequences of 100 coin tosses and look for the length of the longest streak (run) in each sequence. Investigate the relationship between the probability of heads (or tails) and the maximum run length. (Hint: ?rle).
# 連續投擲出同一面的紀錄
sim_coin <- function(p) sample(c(0,1), replace=TRUE, size=100, prob=c(p,1-p))
sapply(seq(0,1,0.1), function(x) max(rle(sim_coin(x))$lengths))## [1] 100 26 9 9 7 8 10 14 13 22 100
Use the geometrical properties of the following plot to write an R function to estimate the value of π (i.e., the area of a unit circle) through Monte Carlo simulation.
plot.new()
plot.window(xlim = c(0, 1), ylim = c(0, 1), asp = 1)
c <- 0
for(i in 1:10000){
p <- c(runif(1),runif(1))
col <- ifelse(sqrt(p[1]^2+p[2]^2)>1, "red", "green")
c <- ifelse(sqrt(p[1]^2+p[2]^2)>1, c, c+1)
points(p[1], p[2], pch = '.', col = col)
}
axis(1,at=0:1)
axis(2,at=0:1)## [1] 3.1288
(Bonus) Sheu and O’Curry (1996) implemented several nonparametric multivariate statistics in the S language. Turn their scripts into R programs.