programming exercise 1-5 HW 1-5
Exercise
Exercise 1 (GIF plot)
What does the R script do? Revise the code so that it does the same job without resorting to the use of a nested ‘for’ loop.
# original script
ci_norm <- function(n=100, nci=50, nRun=3, level=0.95, pause=0.05) {
ci <- NULL
z0 = qnorm(1 - (1 - level)/2)
for ( j in 1:nRun ) {
zbar <- colMeans(replicate(nci, rnorm(n)))
zl <- zbar - z0 * 1/sqrt(n);
zu <- zbar + z0 * 1/sqrt(n)
plot(1, xlim = c(0.5, nci + 0.5), ylim = c(min(zl), max(zu)), type = "n", xlab = "Sample ID", ylab = "Average")
abline(h = 0, lty = 2)
for( i in 1:nci ) { arrows(i, zl[i], i, zu[i], length = 0.05, angle = 90, code = 3,
col = ifelse(0 > zl[i] & 0 < zu[i], "gray", "red"))
points(i, zbar[i], pch = 19, col = "gray")
Sys.sleep(pause) }
Sys.sleep(3) }
} Create function
library(animation)
ci.line<-function(i, zl, zu, zbar, pause){
arrows(i, zl[i], i, zu[i], length = 0.05, angle = 90, code = 3,
col = ifelse(0 > zl[i] & 0 < zu[i], "gray", "red"))
points(i, zbar[i], pch = 19, col = "gray")
Sys.sleep(pause)}
ci.bar<-function(z0,n, pause, nci,nn ){
zbar <- colMeans(replicate(nci, rnorm(n)))
zl <- zbar - z0 * 1/sqrt(n);
zu <- zbar + z0 * 1/sqrt(n)
plot(1, xlim = c(0.5, nci + 0.5), ylim = c(min(zl), max(zu)), type = "n", xlab = "Sample ID", ylab = "Average")
abline(h = 0, lty = 2)
lapply(1:nci, function(m) ci.line(i=m, zl=zl, zu=zu, zbar=zbar, pause=pause))
}
ci.norm<-function(n=100, nci=50, nRun=3, level=0.95, pause=0.05){
ci <- NULL
z0 = qnorm(1 - (1 - level)/2)
lapply(1:nRun, function(r) ci.bar(nn=r, z0=z0,n=n, pause=pause, nci=nci))
print("") }
ci.norm(n=50, nci=21, nRun=4)## [1] ""
Exercise 2 (need explain)
Use the read and math variables from the high schools data example for this problem. First firgure out what this R script does and then eliminate the for loop in the code segment.
dta<-read.table("C:/Users/USER/Desktop/R_data management/0511/hs0.txt", header=T, sep="\t")
str(dta)## 'data.frame': 200 obs. of 11 variables:
## $ id : int 70 121 86 141 172 113 50 11 84 48 ...
## $ female : Factor w/ 2 levels "female","male": 2 1 2 2 2 2 2 2 2 2 ...
## $ race : Factor w/ 4 levels "african-amer",..: 4 4 4 4 4 4 1 3 4 1 ...
## $ ses : Factor w/ 3 levels "high","low","middle": 2 3 1 1 3 3 3 3 3 3 ...
## $ schtyp : Factor w/ 2 levels "private","public": 2 2 2 2 2 2 2 2 2 2 ...
## $ prog : Factor w/ 3 levels "academic","general",..: 2 3 2 3 1 1 2 1 2 1 ...
## $ read : int 57 68 44 63 47 44 50 34 63 57 ...
## $ write : int 52 59 33 44 52 52 59 46 57 55 ...
## $ math : int 41 53 54 47 57 51 42 45 54 52 ...
## $ science: int 47 63 58 53 53 63 53 39 58 NA ...
## $ socst : int 57 61 31 56 61 61 61 36 51 51 ...
## id female race ses schtyp prog read write math science socst
## 1 70 male white low public general 57 52 41 47 57
## 2 121 female white middle public vocation 68 59 53 63 61
## 3 86 male white high public general 44 33 54 58 31
## 4 141 male white high public vocation 63 44 47 53 56
## 5 172 male white middle public academic 47 52 57 53 61
## 6 113 male white middle public academic 44 52 51 63 61
dta.asian <- subset(dta, race=="asian")
r0 <- cor(dta.asian$math, dta.asian$socst)
cnt <- 0
nIter <- 1001
# Original script
#for (i in 1:nIter) {
# new <- sample(dta.asian$read)
# r <- cor(new, dta.asian$math)
# if ( r0 <= r ) cnt <- cnt+1 }
#cnt/nIter
i=1
while(i <=nIter) {
new <- sample(dta.asian$read)
r <- cor(new, dta.asian$math)
if ( r0 <= r ) cnt <- cnt+1
i=i+1
}
cnt/nIter ## [1] 0.03596404
#
newread <- replicate(nIter, sample(dta.asian$read))
newread <- data.frame(unlist(newread))
nn<-with(newread, lapply(names(newread), function(x)
cor(dta.asian$math, eval(substitute(tmp, list(tmp=as.name(x)))))) )
head(nn)## [[1]]
## [1] -0.1557154
##
## [[2]]
## [1] 0.0158295
##
## [[3]]
## [1] 0.07903023
##
## [[4]]
## [1] 0.2402566
##
## [[5]]
## [1] 0.1757661
##
## [[6]]
## [1] -0.2846964
##
## Pearson's product-moment correlation
##
## data: dta[dta$race == "asian", "math"] and dta[dta$race == "asian", "socst"]
## t = 1.9887, df = 9, p-value = 0.07796
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.07083501 0.86552255
## sample estimates:
## cor
## 0.5525177
Exercise 3 (unsolved)
An example of obtaining (nonparametric) bootstrap estimates of coefficients for a multiple linear regression model for the anorexia{MASS} data set is presented in the following script . Figure out how it works and improve the code.
## Treat Prewt Postwt
## CBT :29 Min. :70.00 Min. : 71.30
## Cont:26 1st Qu.:79.60 1st Qu.: 79.33
## FT :17 Median :82.30 Median : 84.05
## Mean :82.41 Mean : 85.17
## 3rd Qu.:86.00 3rd Qu.: 91.55
## Max. :94.90 Max. :103.60
## [1] 72 3
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
## [31] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45"
## [46] "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
## [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72"
##
## Call:
## lm(formula = Postwt ~ Prewt + Treat, data = dta)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.1083 -4.2773 -0.5484 5.4838 15.2922
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.7711 13.3910 3.717 0.00041 ***
## Prewt 0.4345 0.1612 2.695 0.00885 **
## TreatCont -4.0971 1.8935 -2.164 0.03400 *
## TreatFT 4.5631 2.1333 2.139 0.03604 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.978 on 68 degrees of freedom
## Multiple R-squared: 0.2777, Adjusted R-squared: 0.2458
## F-statistic: 8.713 on 3 and 68 DF, p-value: 5.719e-05
## (Intercept) Prewt TreatCont TreatFT
## 49.7711090 0.4344612 -4.0970655 4.5630627
##
bootstrap_function <- function(string_formula, nc, model_data, ndraws) {
coeff_mtx <- matrix(0, nrow = ndraws, ncol = nc)
for (i in 1:ndraws) {
bootstrap_ids <- sample(seq(nrow(model_data)), nrow(model_data), replace = TRUE)
bootstrap_data <- model_data[bootstrap_ids,]
bootstrap_model <- lm(as.formula(string_formula), bootstrap_data)
coeff_mtx[i,] <- bootstrap_model$coefficients
if (i == 1) {
print(bootstrap_model$coefficients)
}
}
return(coeff_mtx)
}
##
bootstrap_estimates <- bootstrap_function("Postwt ~ Prewt + Treat", nc=4, dta, 500)## (Intercept) Prewt TreatCont TreatFT
## 48.5911158 0.4626028 -3.1025295 3.0240144
##
bootstrap_CIs <- matrix(0, nrow = 4, ncol = 2)
for (i in 1:4) {
bootstrap_CIs[i,1] <- quantile(bootstrap_estimates[,i], 0.025)
bootstrap_CIs[i,2] <- quantile(bootstrap_estimates[,i], 0.975)
}
##
print(bootstrap_CIs)## [,1] [,2]
## [1,] 16.57250533 76.8546219
## [2,] 0.10622647 0.8242523
## [3,] -7.49270920 -0.6607540
## [4,] -0.09770998 9.0778937
## 2.5 % 97.5 %
## (Intercept) 23.0498681 76.4923499
## Prewt 0.1128268 0.7560955
## TreatCont -7.8754712 -0.3186599
## TreatFT 0.3060571 8.8200682
##
par(mfrow=c(2,2))
hist(bootstrap_estimates[,1])
hist(bootstrap_estimates[,2])
hist(bootstrap_estimates[,3])
hist(bootstrap_estimates[,4])Exercise 4
What does the R script do? Replace the “while” loop in the code with a different iteration control structure.
Original script
plot the random point with text label while i<=nIter
Brownian <- function(n = 11, pause = 0.05, nIter = 512, ...) {
x = rnorm(n)
y = rnorm(n)
i = 1
while (i <= nIter) { # only loop foe those i<=nIter
plot(x, y, ...) # plot the normal distrubution (x, y)
text(x, y, cex = 0.5) # label x,y
x = x + rnorm(n) # iterations
y = y + rnorm(n) # iterations
Sys.sleep(pause) # make 0.05 pause between each iterations
i = i + 1 }
}
###
## test it
Brownian(xlim = c(-20, 20), ylim = c(-20, 20), pch = 21, cex = 2, col = "cyan", bg = "lavender")Brownian <- function(n = 11, pause = 0.05, nIter = 512, ...) {
x = rnorm(n)
y = rnorm(n)
for (i in 1: nIter) { # alternative: ifelse
if (i<=nIter){
plot(x, y, ...)
text(x, y, cex = 0.5)
x = x + rnorm(n)
y = y + rnorm(n)
Sys.sleep(pause)
} else next # skip i>nIter
}}
###
## test it
Brownian(xlim = c(-20, 20), ylim = c(-20, 20), pch = 21, cex = 2, col = "cyan", bg = "lavender") Exercise 5
Here is an example of simulation using R. Figure out how it works and improve the code.
# original
simSexHeight <- function(n) {
m <- matrix(c(runif(n), rnorm(n)), ncol=2)
draw <- function(mr) {
# mr = one row of m
if (mr[1] < 0.505) {
sex <- 1
cm <- 170 + 7*mr[2] } else {
sex <- 0
cm <- 160 + 5*mr[2] }
return(c(sex, cm)) }
person <- t(apply(m, 1, draw))
person <- as.data.frame(person)
person[, 1] <- ifelse(person[, 1]==1, "M", "F")
names(person) <- c("Gender", "Height")
return(person) } ###simSexHeight <- function(n){
count<-sum(runif(n)<0.505)
data.frame(Gender=c(rep("M",count), rep("F", n-count)),
Height=c(rnorm(count, 170, 7), rnorm(n-count, 160, 5))
)}
simSexHeight(20)## Gender Height
## 1 M 162.1184
## 2 M 158.0372
## 3 M 169.4136
## 4 M 174.9727
## 5 M 174.9417
## 6 M 173.7682
## 7 M 174.7152
## 8 M 185.7050
## 9 M 170.5542
## 10 M 172.2646
## 11 F 160.2013
## 12 F 159.1887
## 13 F 155.0295
## 14 F 150.3822
## 15 F 159.1699
## 16 F 154.1548
## 17 F 155.1310
## 18 F 160.2239
## 19 F 168.4234
## 20 F 166.4881
HW
HW 1
Download the data set consisting 9 different measurements on 1,597 faculty, including their salaries, gender, ranks, and etc, recorded over several years by executing the first few lines of the salary script. Note that there are different numbers of observations per faculty. Generate a plot (see below for example) the distributions of faculty’s salaries in their final years next to that for all years by rank (Assist, Assoc, Full). You will need to do a bit of programming for data manipulation to get the salary for the final year for each faculty.…
HW 2
The riffle function carries out the task of interlacing two lists of numbers, starting with the longer list, without repeating. The same objective could have been achieved with the script here. Revise the orginal riffle function with the approach taken in the latter.
# riffle function
riffle <- function(x, y) {
z <- NULL
count <- 1
for (i in 1:max(length(x), length(y))) {
if (i <= length(x)) {
z[count] <- x[i]
count <- count + 1 }
if (i <= length(y)) {
z[count] <- y[i]
count <- count + 1 } }
return(z) } ### ##
# test it
riffle(1:10, 50:55)## [1] 1 50 2 51 3 52 4 53 5 54 6 55 7 8 9 10
# alternative approach
x <- 1:10
y <- 50:55
dl <- length(x) - length(y)
y[(length(y)+1):(length(y)+dl)] <- rep(NA, dl)
c(na.omit(as.numeric(t(cbind(x,y)))))## [1] 1 50 2 51 3 52 4 53 5 54 6 55 7 8 9 10
# revised riffle function
riffle <- function(x, y) {
dl <- length(x) - length(y)
y[(length(y)+1):(length(y)+dl)] <- rep(NA, dl)
z<-c(na.omit(as.numeric(t(cbind(x,y)))))
return(z) } ### ##
# test it
riffle(1:10, 50:55)## [1] 1 50 2 51 3 52 4 53 5 54 6 55 7 8 9 10
HW 3
Draw the color squares below by revising and vectorizing the code given. (Hint: ?rect)…
Original script
plot.new()
plot.window(xlim = c(0, 6), ylim = c(0, 6), asp = 1)
my_cl <- c("indianred", "orange", "yellow", "forestgreen", "dodgerblue", "violet", "purple")
for(i in 1:6) rect(i-1, i-1, i, i, col = my_cl[i])adapt script
# solution
my_cl <- c("indianred", "orange", "yellow", "forestgreen", "dodgerblue", "violet", "purple")
plot.new()
plot.window(xlim = c(0, 6), ylim = c(0, 6), asp = 1)
# lower part
for(x in 1:6){
for(m in -1:4){
if (x+m+1>6) break # border
rr=m+2 # to assign color
rect( x+m, x-1, x+m+1, x,col=my_cl[rr])
}}
# upper part
for(x in 1:5){
for (m in 0:4){
if (x+m+1>6) break
rr=7-m
rect(x-1,x+m, x,x+m+1,
col=my_cl[rr])
}}HW4
Simulate sequences of 100 coin tosses and look for the length of the longest streak (run) in each sequence. Investigate the relationship between the probability of heads (or tails) and the maximum run length. (Hint: ?rle).
HW5
Use the geometrical properties of the following plot to write an R function to estimate the value of π (i.e., the area of a unit circle) through Monte Carlo simulation.