Exercise

Exercise 1 (GIF plot)

What does the R script do? Revise the code so that it does the same job without resorting to the use of a nested ‘for’ loop.

# original script  
ci_norm <- function(n=100, nci=50, nRun=3, level=0.95, pause=0.05) { 
  ci <- NULL 
  z0 = qnorm(1 - (1 - level)/2) 
  for ( j in 1:nRun ) { 
    zbar <- colMeans(replicate(nci, rnorm(n))) 
    zl <- zbar - z0 * 1/sqrt(n); 
    zu <- zbar + z0 * 1/sqrt(n) 
    plot(1, xlim = c(0.5, nci + 0.5), ylim = c(min(zl), max(zu)), type = "n", xlab = "Sample ID", ylab = "Average") 
    abline(h = 0, lty = 2) 
    for( i in 1:nci ) { arrows(i, zl[i], i, zu[i], length = 0.05, angle = 90, code = 3, 
                                                                                                      col = ifelse(0 > zl[i] & 0 < zu[i], "gray", "red")) 
                                                      points(i, zbar[i], pch = 19, col = "gray") 
                                                      
                                                      Sys.sleep(pause) } 
                                                    Sys.sleep(3) } 
}

Create function

library(animation)
ci.line<-function(i, zl, zu, zbar, pause){   
  arrows(i, zl[i], i, zu[i], length = 0.05, angle = 90, code = 3, 
         col = ifelse(0 > zl[i] & 0 < zu[i], "gray", "red")) 
  points(i, zbar[i], pch = 19, col = "gray") 
  
  Sys.sleep(pause)}
  
ci.bar<-function(z0,n, pause, nci,nn ){  
  zbar <- colMeans(replicate(nci, rnorm(n))) 
  zl <- zbar - z0 * 1/sqrt(n); 
  zu <- zbar + z0 * 1/sqrt(n) 
  plot(1, xlim = c(0.5, nci + 0.5), ylim = c(min(zl), max(zu)), type = "n", xlab = "Sample ID", ylab = "Average") 
  abline(h = 0, lty = 2) 
  lapply(1:nci, function(m) ci.line(i=m, zl=zl, zu=zu, zbar=zbar, pause=pause))
  
  }  
  
ci.norm<-function(n=100, nci=50, nRun=3, level=0.95, pause=0.05){ 
  ci <- NULL 
  z0 = qnorm(1 - (1 - level)/2)
  lapply(1:nRun, function(r) ci.bar(nn=r, z0=z0,n=n, pause=pause, nci=nci))
   print("") }
ci.norm(n=50, nci=21, nRun=4)

## [1] ""

Exercise 2 (need explain)

Use the read and math variables from the high schools data example for this problem. First firgure out what this R script does and then eliminate the for loop in the code segment.

dta<-read.table("C:/Users/USER/Desktop/R_data management/0511/hs0.txt", header=T, sep="\t")
str(dta)

## 'data.frame':    200 obs. of  11 variables:
##  $ id     : int  70 121 86 141 172 113 50 11 84 48 ...
##  $ female : Factor w/ 2 levels "female","male": 2 1 2 2 2 2 2 2 2 2 ...
##  $ race   : Factor w/ 4 levels "african-amer",..: 4 4 4 4 4 4 1 3 4 1 ...
##  $ ses    : Factor w/ 3 levels "high","low","middle": 2 3 1 1 3 3 3 3 3 3 ...
##  $ schtyp : Factor w/ 2 levels "private","public": 2 2 2 2 2 2 2 2 2 2 ...
##  $ prog   : Factor w/ 3 levels "academic","general",..: 2 3 2 3 1 1 2 1 2 1 ...
##  $ read   : int  57 68 44 63 47 44 50 34 63 57 ...
##  $ write  : int  52 59 33 44 52 52 59 46 57 55 ...
##  $ math   : int  41 53 54 47 57 51 42 45 54 52 ...
##  $ science: int  47 63 58 53 53 63 53 39 58 NA ...
##  $ socst  : int  57 61 31 56 61 61 61 36 51 51 ...

head(dta)

##    id female  race    ses schtyp     prog read write math science socst
## 1  70   male white    low public  general   57    52   41      47    57
## 2 121 female white middle public vocation   68    59   53      63    61
## 3  86   male white   high public  general   44    33   54      58    31
## 4 141   male white   high public vocation   63    44   47      53    56
## 5 172   male white middle public academic   47    52   57      53    61
## 6 113   male white middle public academic   44    52   51      63    61

dta.asian <- subset(dta, race=="asian") 
r0 <- cor(dta.asian$math, dta.asian$socst) 
cnt <- 0 
nIter <- 1001 
# Original script
#for (i in 1:nIter) { 
#  new <- sample(dta.asian$read) 
#  r <- cor(new, dta.asian$math) 
#  if ( r0 <= r ) cnt <- cnt+1 } 
#cnt/nIter 
i=1
while(i <=nIter) { 
  new <- sample(dta.asian$read) 
  r <- cor(new, dta.asian$math) 
  if ( r0 <= r ) cnt <- cnt+1 
  i=i+1
  } 
cnt/nIter

## [1] 0.03596404

# 
newread <- replicate(nIter, sample(dta.asian$read)) 
newread <- data.frame(unlist(newread)) 
nn<-with(newread, lapply(names(newread), function(x) 
  cor(dta.asian$math, eval(substitute(tmp, list(tmp=as.name(x)))))) ) 
head(nn)

## [[1]]
## [1] -0.1557154
## 
## [[2]]
## [1] 0.0158295
## 
## [[3]]
## [1] 0.07903023
## 
## [[4]]
## [1] 0.2402566
## 
## [[5]]
## [1] 0.1757661
## 
## [[6]]
## [1] -0.2846964

cor.test(dta[dta$race=="asian", "math"], dta[dta$race=="asian", "socst"])

## 
##  Pearson's product-moment correlation
## 
## data:  dta[dta$race == "asian", "math"] and dta[dta$race == "asian", "socst"]
## t = 1.9887, df = 9, p-value = 0.07796
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.07083501  0.86552255
## sample estimates:
##       cor 
## 0.5525177

Exercise 3 (unsolved)

An example of obtaining (nonparametric) bootstrap estimates of coefficients for a multiple linear regression model for the anorexia{MASS} data set is presented in the following script . Figure out how it works and improve the code.

data(anorexia, package="MASS")

##
dta <- anorexia

##     
summary(dta)

##   Treat        Prewt           Postwt      
##  CBT :29   Min.   :70.00   Min.   : 71.30  
##  Cont:26   1st Qu.:79.60   1st Qu.: 79.33  
##  FT  :17   Median :82.30   Median : 84.05  
##            Mean   :82.41   Mean   : 85.17  
##            3rd Qu.:86.00   3rd Qu.: 91.55  
##            Max.   :94.90   Max.   :103.60

##
dim(dta)

## [1] 72  3

##    
row.names(dta)

##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
## [31] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45"
## [46] "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
## [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72"

##    
anrx_lm <- lm(Postwt ~ Prewt + Treat, data=dta)

##    
summary(anrx_lm)

## 
## Call:
## lm(formula = Postwt ~ Prewt + Treat, data = dta)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.1083  -4.2773  -0.5484   5.4838  15.2922 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  49.7711    13.3910   3.717  0.00041 ***
## Prewt         0.4345     0.1612   2.695  0.00885 ** 
## TreatCont    -4.0971     1.8935  -2.164  0.03400 *  
## TreatFT       4.5631     2.1333   2.139  0.03604 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.978 on 68 degrees of freedom
## Multiple R-squared:  0.2777, Adjusted R-squared:  0.2458 
## F-statistic: 8.713 on 3 and 68 DF,  p-value: 5.719e-05

##    
coef(anrx_lm)

## (Intercept)       Prewt   TreatCont     TreatFT 
##  49.7711090   0.4344612  -4.0970655   4.5630627

##    
bootstrap_function  <- function(string_formula, nc, model_data, ndraws) {
      coeff_mtx <- matrix(0, nrow = ndraws,  ncol = nc)    
      for (i in 1:ndraws) {
        bootstrap_ids   <- sample(seq(nrow(model_data)), nrow(model_data), replace = TRUE)
        bootstrap_data <- model_data[bootstrap_ids,]
        bootstrap_model <- lm(as.formula(string_formula), bootstrap_data)
        coeff_mtx[i,]   <- bootstrap_model$coefficients
        if (i == 1) {
          print(bootstrap_model$coefficients)
        }
      }     
      return(coeff_mtx)
    }
    
##     
bootstrap_estimates <- bootstrap_function("Postwt ~ Prewt + Treat", nc=4, dta, 500)

## (Intercept)       Prewt   TreatCont     TreatFT 
##  48.5911158   0.4626028  -3.1025295   3.0240144

##     
bootstrap_CIs <- matrix(0, nrow = 4, ncol = 2)
    for (i in 1:4) {
      bootstrap_CIs[i,1]  <- quantile(bootstrap_estimates[,i], 0.025)
      bootstrap_CIs[i,2]  <- quantile(bootstrap_estimates[,i], 0.975)
    }

## 
print(bootstrap_CIs)

##             [,1]       [,2]
## [1,] 16.57250533 76.8546219
## [2,]  0.10622647  0.8242523
## [3,] -7.49270920 -0.6607540
## [4,] -0.09770998  9.0778937

##
confint(anrx_lm)

##                  2.5 %     97.5 %
## (Intercept) 23.0498681 76.4923499
## Prewt        0.1128268  0.7560955
## TreatCont   -7.8754712 -0.3186599
## TreatFT      0.3060571  8.8200682

## 
par(mfrow=c(2,2))
hist(bootstrap_estimates[,1])
hist(bootstrap_estimates[,2])
hist(bootstrap_estimates[,3])
hist(bootstrap_estimates[,4])

Exercise 4

What does the R script do? Replace the “while” loop in the code with a different iteration control structure.

Original script
plot the random point with text label while i<=nIter

Brownian <- function(n = 11, pause = 0.05, nIter = 512, ...) { 
  x = rnorm(n) 
  y = rnorm(n) 
  i = 1 
  while (i <= nIter) {  # only loop foe those i<=nIter 
    plot(x, y, ...) # plot the normal distrubution (x, y)
    text(x, y, cex = 0.5) # label x,y 
    x = x + rnorm(n) # iterations 
    y = y + rnorm(n) # iterations
    Sys.sleep(pause) # make 0.05 pause between each iterations
    i = i + 1 } 
  
  } 

### 
## test it 
Brownian(xlim = c(-20, 20), ylim = c(-20, 20), pch = 21, cex = 2, col = "cyan", bg = "lavender")

Brownian <- function(n = 11, pause = 0.05, nIter = 512, ...) { 
  x = rnorm(n) 
  y = rnorm(n) 
 
  for (i in 1: nIter) {  # alternative: ifelse 
    if (i<=nIter){ 
    plot(x, y, ...) 
    text(x, y, cex = 0.5) 
    x = x + rnorm(n) 
    y = y + rnorm(n) 
    Sys.sleep(pause) 
  } else next # skip i>nIter
  }}
### 
## test it 
Brownian(xlim = c(-20, 20), ylim = c(-20, 20), pch = 21, cex = 2, col = "cyan", bg = "lavender")

Exercise 5

Here is an example of simulation using R. Figure out how it works and improve the code.

# original
simSexHeight <- function(n) { 
  m <- matrix(c(runif(n), rnorm(n)), ncol=2) 
  draw <- function(mr) { 
    # mr = one row of m 
    if (mr[1] < 0.505) { 
      sex <- 1 
      cm <- 170 + 7*mr[2] } else { 
        sex <- 0 
        cm <- 160 + 5*mr[2] } 
    return(c(sex, cm)) } 
  
  person <- t(apply(m, 1, draw)) 
  person <- as.data.frame(person) 
  person[, 1] <- ifelse(person[, 1]==1, "M", "F") 
  names(person) <- c("Gender", "Height") 
  return(person) } ###

simSexHeight <- function(n){
count<-sum(runif(n)<0.505)
data.frame(Gender=c(rep("M",count), rep("F", n-count)),
           Height=c(rnorm(count, 170, 7), rnorm(n-count, 160, 5))
           )}

simSexHeight(20)

##    Gender   Height
## 1       M 162.1184
## 2       M 158.0372
## 3       M 169.4136
## 4       M 174.9727
## 5       M 174.9417
## 6       M 173.7682
## 7       M 174.7152
## 8       M 185.7050
## 9       M 170.5542
## 10      M 172.2646
## 11      F 160.2013
## 12      F 159.1887
## 13      F 155.0295
## 14      F 150.3822
## 15      F 159.1699
## 16      F 154.1548
## 17      F 155.1310
## 18      F 160.2239
## 19      F 168.4234
## 20      F 166.4881

HW

HW 1

Download the data set consisting 9 different measurements on 1,597 faculty, including their salaries, gender, ranks, and etc, recorded over several years by executing the first few lines of the salary script. Note that there are different numbers of observations per faculty. Generate a plot (see below for example) the distributions of faculty’s salaries in their final years next to that for all years by rank (Assist, Assoc, Full). You will need to do a bit of programming for data manipulation to get the salary for the final year for each faculty.

…

HW 2

The riffle function carries out the task of interlacing two lists of numbers, starting with the longer list, without repeating. The same objective could have been achieved with the script here. Revise the orginal riffle function with the approach taken in the latter.

# riffle function
riffle <- function(x, y) { 
  z <- NULL 
  count <- 1 
  for (i in 1:max(length(x), length(y))) { 
    if (i <= length(x)) { 
      z[count] <- x[i] 
      count <- count + 1 } 
    if (i <= length(y)) { 
      z[count] <- y[i] 
      count <- count + 1 } } 
  return(z) } ### ## 
# test it 
riffle(1:10, 50:55)

##  [1]  1 50  2 51  3 52  4 53  5 54  6 55  7  8  9 10

# alternative approach
x <- 1:10 
y <- 50:55 
dl <- length(x) - length(y) 
y[(length(y)+1):(length(y)+dl)] <- rep(NA, dl) 
c(na.omit(as.numeric(t(cbind(x,y)))))

##  [1]  1 50  2 51  3 52  4 53  5 54  6 55  7  8  9 10

# revised riffle function
riffle <- function(x, y) { 
  dl <- length(x) - length(y) 
  y[(length(y)+1):(length(y)+dl)] <- rep(NA, dl) 
  z<-c(na.omit(as.numeric(t(cbind(x,y)))))
  return(z) } ### ## 
# test it 
riffle(1:10, 50:55)

##  [1]  1 50  2 51  3 52  4 53  5 54  6 55  7  8  9 10

HW 3

Draw the color squares below by revising and vectorizing the code given. (Hint: ?rect)

…

Original script

plot.new() 
plot.window(xlim = c(0, 6), ylim = c(0, 6), asp = 1) 
my_cl <- c("indianred", "orange", "yellow", "forestgreen", "dodgerblue", "violet", "purple") 

for(i in 1:6) rect(i-1, i-1, i, i, col = my_cl[i])

adapt script

# solution 
my_cl <- c("indianred", "orange", "yellow", "forestgreen", "dodgerblue", "violet", "purple") 

plot.new() 
plot.window(xlim = c(0, 6), ylim = c(0, 6), asp = 1) 


# lower part 

for(x in 1:6){
  for(m in -1:4){ 
    if (x+m+1>6) break # border 
    rr=m+2 # to assign color 
    rect( x+m, x-1, x+m+1, x,col=my_cl[rr])  
  }}    

# upper part 

for(x in 1:5){ 
  for (m in 0:4){   
    if (x+m+1>6) break
    rr=7-m
    rect(x-1,x+m, x,x+m+1,   
         
         col=my_cl[rr])
  }}

HW4

Simulate sequences of 100 coin tosses and look for the length of the longest streak (run) in each sequence. Investigate the relationship between the probability of heads (or tails) and the maximum run length. (Hint: ?rle).

HW5

Use the geometrical properties of the following plot to write an R function to estimate the value of π (i.e., the area of a unit circle) through Monte Carlo simulation.

programming exercise 1-5 HW 1-5