Based on the side-by-side box plots, the selected shelves numbered below 22 have similar average replacement costs, and shelves 23-40 have similar (higher) costs. Overall, though, the shelf means and variances vary widely and are clearly NOT the same throughout.
books <- read.csv(file="books.csv", header=TRUE, sep=",")
boxplot(books$replace~books$shelf, main = "Replacement Cost of Book by Shelf", xlab="Shelf Number", ylab="Replacement Cost")
\(\hat{t}_{unb} = 32637.73\).
The standard error of \(\hat{t}_{unb}\) is 5733.527.
The estimated coefficient of variation is 0.1756717.
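For reference, these are the two-stage cluster-sampling formulas the code below implements, with \(N = 44\) shelves, \(n = 12\) sampled shelves, \(M_i\) books on shelf \(i\), and \(m_i = 5\) books sampled from each selected shelf:
\[
\hat{t}_{unb} = \frac{N}{n}\sum_{i \in \mathcal{S}} \hat{t}_i = \frac{N}{n}\sum_{i \in \mathcal{S}} M_i \bar{y}_i,
\qquad
\widehat{SE}\!\left(\hat{t}_{unb}\right) = N\sqrt{\left(1 - \frac{n}{N}\right)\frac{s_t^2}{n} + \frac{1}{nN}\sum_{i \in \mathcal{S}}\left(1 - \frac{m_i}{M_i}\right)M_i^2\,\frac{s_i^2}{m_i}},
\]
where \(s_t^2\) is the sample variance of the estimated shelf totals \(\hat{t}_i\) and \(s_i^2\) is the sample variance of replacement costs within shelf \(i\).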
df <- data.frame(books)
#Add mi Column
df$mi <- ifelse(df$shelf > 0, 5, 0)   #five books were sampled from every selected shelf
#Add avg or yi_bar Column
yi <- aggregate(replace ~ shelf, data = df, FUN = mean)
y_i_bar <- yi$replace
shelves <- c(2, 4, 11, 14, 20, 22, 23, 31, 37, 38, 40, 43)
Mi <- c(26, 52, 70, 47, 5, 28, 27, 29, 21, 31, 14, 27)
mi <- rep(5, 12)
ti_hat <- Mi*y_i_bar
ti_hat
## [1] 249.6 322.4 644.0 338.4 209.0 834.4 1398.6 1774.8 1058.4 1134.6
## [11] 758.8 178.2
df$yi_bar <- y_i_bar[match(df$shelf, shelves)]   #attach each shelf's sample mean to its rows
#si^2 column: within-shelf sample variances (denominator m_i - 1 = 4), in shelf order
si2 <- as.numeric(tapply(df$replace, df$shelf, var))
#Estimate of Total Replacement Value of the Book Collection (N = 44 shelves, n = 12 sampled)
t_hat_unb <- (44/12)*sum(ti_hat)
t_hat_unb
## [1] 32637.73
#Standard Error of Estimate
k <- sum((1 - mi/Mi) * Mi^2 * si2/mi)   #within-shelf (second-stage) variance component
st_2 <- var(ti_hat)                     #sample variance of the estimated shelf totals
SE_t_hat <- 44*sqrt( (1-12/44)*(st_2/12) + (1/(12*44))*(372463.2))   #372463.2 is the precomputed within-shelf component
SE_t_hat
## [1] 5733.527
#Estimated Coefficient of Variation for t_hat_unb
SE_t_hat/t_hat_unb
## [1] 0.1756717
The average replacement cost per book is estimated to be $23.61.
The standard error of \(\hat{\bar{y}}_r\) is 5.476.
The estimated coefficient of variation of \(\hat{\bar{y}}_r\) is 0.2319271.
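For reference, the corresponding ratio-estimator formulas the code below uses, with \(\bar{M}\) the mean number of books per sampled shelf:
\[
\hat{\bar{y}}_r = \frac{\sum_{i \in \mathcal{S}} M_i \bar{y}_i}{\sum_{i \in \mathcal{S}} M_i},
\qquad
s_r^2 = \frac{1}{n-1}\sum_{i \in \mathcal{S}}\left(\hat{t}_i - M_i \hat{\bar{y}}_r\right)^2,
\qquad
\widehat{SE}\!\left(\hat{\bar{y}}_r\right) = \frac{1}{\bar{M}}\sqrt{\left(1 - \frac{n}{N}\right)\frac{s_r^2}{n} + \frac{1}{nN}\sum_{i \in \mathcal{S}}\left(1 - \frac{m_i}{M_i}\right)M_i^2\,\frac{s_i^2}{m_i}}.
\]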
#Average Cost per Book (ratio estimator)
y_hat_bar_r <- sum(ti_hat)/sum(Mi)
y_hat_bar_r
## [1] 23.61061
sr2 <- sum((ti_hat - (Mi*y_hat_bar_r))^2)/(12-1)
sr2
## [1] 476700.3
#Estimated variance of y_hat_bar_r
V_yhatbarr <- (1-(12/44))*(sr2/(12*mean(Mi)^2)) + (1/(12*44*mean(Mi)^2))*(372463.2)
#Standard Error
SE_y_bar_r <- 1/(mean(Mi)) * sqrt( (1-12/44) * sr2/12 + 705.4)   #705.4 is approximately 372463.2/(12*44), the within-shelf component
SE_y_bar_r
## [1] 5.475941
#Coefficient of Variation
SE_y_bar_r/y_hat_bar_r
## [1] 0.2319271
I basically used your code and changed avg_vol to avg_leng.
The estimate for average egg length is 48.65, with a standard error of 0.1293.
A note for future me reviewing this: N (the number of clutches in the population) is not given, but it must be very large, so the fpc in the first variance term is close to one, and the second (within-clutch) term carries a 1/(nN) factor and is negligible, so we can ignore it.
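In formula form, this is what the code further below computes once those terms are dropped, with \(M_i\) the clutch size, \(\bar{y}_i\) the mean egg length in clutch \(i\), \(n\) the number of clutches, and \(\bar{M}\) the mean clutch size:
\[
\hat{\bar{y}}_r = \frac{\sum_i M_i \bar{y}_i}{\sum_i M_i},
\qquad
s_r^2 = \frac{1}{n-1}\sum_i \left(M_i \bar{y}_i - M_i \hat{\bar{y}}_r\right)^2,
\qquad
\widehat{SE}\!\left(\hat{\bar{y}}_r\right) \approx \frac{1}{\bar{M}}\sqrt{\frac{s_r^2}{n}}.
\]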
library(SDaA)
##
## Attaching package: 'SDaA'
## The following object is masked _by_ '.GlobalEnv':
##
## books
data(coots)
head(coots)
## clutch csize length breadth volume tmt
## 1 1 13 44.30 31.10 3.7957569 yes
## 2 1 13 45.90 32.70 3.9328497 yes
## 3 2 13 49.20 34.40 4.2156036 yes
## 4 2 13 48.70 32.70 4.1727621 yes
## 5 3 6 51.05 34.25 0.9317646 no
## 6 3 6 49.35 34.40 0.9007362 no
plot(coots$clutch, coots$length, pch=16, col="maroon", main = "Scatterplot of Length Vs Clutch", xlab="clutch", ylab="length")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
mns <- group_by(coots, clutch) %>% summarise(avg_leng = mean(length))
mns$rank <- rank(mns$avg_leng)
head(coots)
## clutch csize length breadth volume tmt
## 1 1 13 44.30 31.10 3.7957569 yes
## 2 1 13 45.90 32.70 3.9328497 yes
## 3 2 13 49.20 34.40 4.2156036 yes
## 4 2 13 48.70 32.70 4.1727621 yes
## 5 3 6 51.05 34.25 0.9317646 no
## 6 3 6 49.35 34.40 0.9007362 no
coots <- merge(coots, mns, by.x = "clutch", by.y = "clutch", all.x = TRUE)
head(coots)
## clutch csize length breadth volume tmt avg_leng rank
## 1 1 13 44.30 31.10 3.7957569 yes 45.10 5.0
## 2 1 13 45.90 32.70 3.9328497 yes 45.10 5.0
## 3 2 13 49.20 34.40 4.2156036 yes 48.95 110.5
## 4 2 13 48.70 32.70 4.1727621 yes 48.95 110.5
## 5 3 6 51.05 34.25 0.9317646 no 50.20 146.5
## 6 3 6 49.35 34.40 0.9007362 no 50.20 146.5
#Compute per-clutch means and clutch sizes, then the ratio estimate of mean egg length
mns <- group_by(coots, clutch) %>% summarise(avg_leng= mean(length), M_i = mean(csize))
y_hat_bar_r <- sum(mns$M_i*mns$avg_leng)/sum(mns$M_i)
y_hat_bar_r
## [1] 48.64863
#Let's calculate the variance
M_bar <- mean(mns$M_i)
n <- nrow(mns)
s2_r <- sum((mns$M_i*mns$avg_leng - mns$M_i*y_hat_bar_r)^2)/(n-1)
#Standard error (fpc and within-clutch term dropped since N is very large; see note above)
se <- sqrt(1/M_bar^2*s2_r/n)
se
## [1] 0.129321
par(mfrow=c(1,2),pty="s",ask=T)
intervals <- function(groupcorr = 0, numintervals = 100, groupsize = 5,
sampgroups=10, popgroups=5000, mu = 0, sigma = 1) {
#Description:
# Simulate a population of clusters, then draw a simple random
# sample of clusters and construct interval estimates using incorrect
# SRS formulae and formulae appropriate for cluster samples
#Usage:
# intervals(groupcorr=0, numintervals = 100, groupsize = 5, sampgroups=10,
# popgroups=5000, mu = 0, sigma = 1)
#Arguments:
#groupcorr: The intracluster correlation coefficient rho.
#numintervals: Number of samples to be taken from population.
#groupsize: Number of elements in each population cluster.
#sampgroups: Number of clusters to be sampled
#popgroups: Number of clusters in population
# mu: Mean for generating population
# sigma: Standard deviation for generating population
#Value:
# Produces 3 graphs:
# (1) Dotplot of data from last sample, displaying similarity within clusters
# (2) Interval estimates for mean using SRS formulas. Red lines
# are the intervals that do not include the population mean
# (3) Confidence intervals using clusters as unit of analysis.
if (groupcorr < 0 | groupcorr > 1) stop("correlation must be between 0 and 1")
betweenvar <- groupcorr * sigma^2
withinvar <- sigma^2*(1 - groupcorr)
group <- rep(1:sampgroups,each=groupsize)
# Generate population with correlation structure
gpmeans <- rnorm(popgroups,mu,sqrt(betweenvar))
yy <- rep(gpmeans,each=groupsize) + rnorm(popgroups*groupsize,0,sqrt(withinvar))
yy <- matrix(yy,nrow=groupsize,ncol=popgroups)
indci <- matrix(0,ncol=2,nrow=numintervals)
clci <- indci
yy <- yy - mean(yy) + mu # Adjust population so it has mean mu
for (i in 1:numintervals) {
# Take cluster sample of size groupsize
sampindex <- sample(1:popgroups,sampgroups)
mysamp <- yy[,sampindex]
indci[i,] <- t.test(as.vector(mysamp))$conf.int
sampmeans <- apply(mysamp,2,mean)
clci[i,] <- t.test(sampmeans)$conf.int
}
print(paste("Data from sample",numintervals))
print(mysamp)
plot(group,as.vector(mysamp),main=paste("Data values from sample",numintervals),
xlab="group number",ylab="x")
plot(as.vector(mysamp),as.vector(mysamp),type="n",axes=F,xlab="",ylab="")
# Determine if true value inside intervals
indcover <- indci[,1] < mu & indci[,2] > mu
indcol <- rep("black",numintervals)
indcol[!indcover] <- "red"
clcover <- clci[,1] < mu & clci[,2] > mu
clcol <- rep("black",numintervals)
clcol[!clcover] <- "red"
# Draw confidence intervals
plot(indci[,1],1:numintervals,type="n",xlim=c(-2*sigma+mu,2*sigma+mu),
ylab="interval",xlab="",main = "assuming SRS")
abline(v=mu)
for ( i in 1:numintervals) lines( c(indci[i,1],indci[i,2]),c(i,i),col=indcol[i] )
plot(clci[,1],1:numintervals,type="n",xlim=c(-2*sigma+mu,2*sigma+mu),
ylab="interval",xlab="",main = "using sampling design")
abline(v=mu)
for ( i in 1:numintervals) lines( c(clci[i,1],clci[i,2]),c(i,i),col=clcol[i] )
}
The effect of ignoring the clustering when ICC = 0 is not an issue, because the sample is effectively an SRS.
The SRS plot simply treats each clustered observation as an individual SRS draw.
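As a minimal side check (not part of the required code, just re-deriving the variance split used inside intervals() when groupcorr = 0):
groupcorr <- 0; sigma <- 1
betweenvar <- groupcorr * sigma^2        #equals 0, so rnorm(popgroups, mu, 0) returns mu for every cluster mean
withinvar <- sigma^2 * (1 - groupcorr)   #equals sigma^2, so all of the variability is within clusters
With betweenvar = 0 the generated values are iid \(N(\mu, \sigma^2)\), which is exactly an SRS, so both analyses should achieve roughly 95% coverage.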
set.seed(1234)
intervals(0)
## [1] "Data from sample 100"
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.2330084 -0.1350343 0.6039789 -0.2570171 -0.5486294 0.6714340
## [2,] 0.3558845 -1.4001874 -1.6911135 1.4129490 -1.2120906 -0.3779769
## [3,] 0.1702957 0.1216958 1.0975348 0.2019594 1.4535386 0.8181927
## [4,] 0.3183312 0.4477785 1.8672250 0.7642093 1.0353036 2.5165197
## [5,] 1.7990276 -1.3217385 -0.9191481 -1.3368679 1.6204426 0.1652719
## [,7] [,8] [,9] [,10]
## [1,] 0.2172319 -0.5087818 1.7174299 2.08702675
## [2,] -0.3001348 2.3921469 -1.3621159 0.10898358
## [3,] -0.6160307 -0.4142567 0.5730560 0.06851685
## [4,] 0.7237702 0.8820747 0.6946531 -1.07005491
## [5,] 1.1418483 0.2009615 1.1931444 0.21070931
In comparison, when ICC = 0.5 the intervals are wider than when ICC = 0 but not as wide as when ICC = 1.
From the intervals2(0.5) output below, 76% of the SRS-based interval estimates contain the true mean, while 96% of the intervals calculated using the cluster-sample formulae include it.
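A rough way to see why the SRS-based intervals undercover as the ICC grows (this uses the standard design-effect approximation for equal-sized clusters, which the assignment code does not compute): with clusters of size \(m = 5\) and intraclass correlation \(\rho\), the variance of the overall sample mean is inflated by roughly \(1 + (m-1)\rho\) relative to an SRS of the same total size.
#Approximate design effect and SE inflation for clusters of size 5 (illustration only)
m <- 5
rho <- c(0, 0.5, 1)
deff <- 1 + (m - 1)*rho
data.frame(rho = rho, deff = deff, se_inflation = sqrt(deff))
The implied SE inflation factors (1, about 1.7, and about 2.2) line up with the coverage proportions that intervals2() prints below.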
par(mfrow=c(1,2),pty="s",ask=T)
#intervals2() is identical to intervals() above, except that it also prints the
#proportion of SRS-based and cluster-based intervals that cover the true mean.
intervals2 <- function(groupcorr = 0, numintervals = 100, groupsize = 5,
sampgroups=10, popgroups=5000, mu = 0, sigma = 1) {
if (groupcorr < 0 | groupcorr > 1) stop("correlation must be between 0 and 1")
betweenvar <- groupcorr * sigma^2
withinvar <- sigma^2*(1 - groupcorr)
group <- rep(1:sampgroups,each=groupsize)
# Generate population with correlation structure
gpmeans <- rnorm(popgroups,mu,sqrt(betweenvar))
yy <- rep(gpmeans,each=groupsize) + rnorm(popgroups*groupsize,0,sqrt(withinvar))
yy <- matrix(yy,nrow=groupsize,ncol=popgroups)
indci <- matrix(0,ncol=2,nrow=numintervals)
clci <- indci
yy <- yy - mean(yy) + mu # Adjust population so it has mean mu
for (i in 1:numintervals) {
# Take cluster sample of size groupsize
sampindex <- sample(1:popgroups,sampgroups)
mysamp <- yy[,sampindex]
indci[i,] <- t.test(as.vector(mysamp))$conf.int
sampmeans <- apply(mysamp,2,mean)
clci[i,] <- t.test(sampmeans)$conf.int
}
print(paste("Data from sample",numintervals))
print(mysamp)
plot(group,as.vector(mysamp),main=paste("Data values from sample",numintervals),
xlab="group number",ylab="x")
plot(as.vector(mysamp),as.vector(mysamp),type="n",axes=F,xlab="",ylab="")
# Determine if true value inside intervals
indcover <- indci[,1] < mu & indci[,2] > mu
indcol <- rep("black",numintervals)
indcol[!indcover] <- "red"
clcover <- clci[,1] < mu & clci[,2] > mu
clcol <- rep("black",numintervals)
clcol[!clcover] <- "red"
# Draw confidence intervals
plot(indci[,1],1:numintervals,type="n",xlim=c(-2*sigma+mu,2*sigma+mu),
ylab="interval",xlab="",main = "assuming SRS")
abline(v=mu)
for ( i in 1:numintervals) lines( c(indci[i,1],indci[i,2]),c(i,i),col=indcol[i] )
plot(clci[,1],1:numintervals,type="n",xlim=c(-2*sigma+mu,2*sigma+mu),
ylab="interval",xlab="",main = "using sampling design")
abline(v=mu)
for ( i in 1:numintervals) lines( c(clci[i,1],clci[i,2]),c(i,i),col=clcol[i] )
print(mean(indcover))
print(mean(clcover))
}
intervals(0.5)
## [1] "Data from sample 100"
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -1.0024549 1.2267683 0.2946263 -1.6289762 0.2917552 1.5028543
## [2,] -0.7303215 -0.2929515 -1.2425009 0.3564725 2.5286538 1.3064378
## [3,] -0.2194137 -0.4322320 -1.6849145 0.3845662 0.2223460 1.0784968
## [4,] -0.6329102 -0.7226171 -0.7180164 0.3141875 0.9756198 1.9229521
## [5,] -0.6013197 0.7853145 -1.3062367 -1.5108996 0.5597302 0.4590043
## [,7] [,8] [,9] [,10]
## [1,] 0.29447091 1.944273 -0.5932644 -0.35338645
## [2,] -1.29273415 2.947248 0.6409272 -0.55168311
## [3,] -0.02616879 2.486192 -0.5842212 -0.04855095
## [4,] -1.07356349 2.578674 -0.4710935 -1.07545177
## [5,] -0.79721211 2.845043 -2.3669475 -2.08680493
intervals2(0)
## [1] "Data from sample 100"
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -1.9000394 0.9557255 -0.2085279 0.02900536 -0.80310543 0.71956992
## [2,] -2.2867513 1.0146087 -0.9341731 -0.43017230 -0.60271386 -0.16057919
## [3,] -2.2623149 0.5917808 -0.6450523 0.78147753 1.15411206 -0.04835204
## [4,] 0.2399447 1.3233774 0.5727465 -0.32631639 -1.14898584 -0.98692163
## [5,] 0.7587396 -0.2899003 0.6899971 1.47432902 0.04425618 -1.30086588
## [,7] [,8] [,9] [,10]
## [1,] -0.09801864 0.1257906 0.323780528 -0.81971136
## [2,] 1.91609655 0.5458051 -0.002807239 -0.45006716
## [3,] -0.19067728 1.5916478 0.209970051 -0.04075734
## [4,] -1.17736948 0.3217231 0.107319998 0.13787678
## [5,] 1.57772090 -0.7346071 0.516078538 0.57699492
## [1] 0.93
## [1] 0.94
intervals2(0.5)
## [1] "Data from sample 100"
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -1.3961574 -0.5600964 -0.88498748 1.3975282 0.5604249 0.16642608
## [2,] -1.0274363 -0.2203076 1.24736220 1.6041660 1.0161729 -1.06430125
## [3,] -1.2774339 -0.2051571 0.62455414 1.3079740 -0.4367768 -0.50360805
## [4,] -1.4350511 -0.7770021 0.03118201 0.1450623 1.7530799 -0.02402023
## [5,] -0.9386758 0.2203689 0.80944188 -0.1691540 1.9562928 1.20660794
## [,7] [,8] [,9] [,10]
## [1,] -0.30378898 1.2922282 2.644890 -1.37295867
## [2,] -2.25533124 0.8930161 2.334817 -0.46593062
## [3,] -0.19012419 0.9489262 2.590199 0.08489339
## [4,] 0.89590193 0.5285770 1.618975 -0.30332097
## [5,] -0.03636114 0.7077515 2.137351 0.04270696
## [1] 0.76
## [1] 0.96
intervals2(1)
## [1] "Data from sample 100"
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 0.2607745 -0.7079134 -0.2975785 1.423688 1.321459 -0.7995306 2.055115
## [2,] 0.2607745 -0.7079134 -0.2975785 1.423688 1.321459 -0.7995306 2.055115
## [3,] 0.2607745 -0.7079134 -0.2975785 1.423688 1.321459 -0.7995306 2.055115
## [4,] 0.2607745 -0.7079134 -0.2975785 1.423688 1.321459 -0.7995306 2.055115
## [5,] 0.2607745 -0.7079134 -0.2975785 1.423688 1.321459 -0.7995306 2.055115
## [,8] [,9] [,10]
## [1,] -0.5834179 -0.304153 -1.712121
## [2,] -0.5834179 -0.304153 -1.712121
## [3,] -0.5834179 -0.304153 -1.712121
## [4,] -0.5834179 -0.304153 -1.712121
## [5,] -0.5834179 -0.304153 -1.712121
## [1] 0.46
## [1] 0.93
If we run the function with ICC = 1, I expect the intervals to be wider, the percentage of interval estimates containing the true mean to drop when the clustering is ignored, and the coverage to stay near the nominal level when the clustering is accounted for.
The output below shows my predictions were correct.
intervals(1)
## [1] "Data from sample 100"
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] -1.079695 0.1330153 -1.398384 -0.5671712 1.107137 -1.475429 0.2604933
## [2,] -1.079695 0.1330153 -1.398384 -0.5671712 1.107137 -1.475429 0.2604933
## [3,] -1.079695 0.1330153 -1.398384 -0.5671712 1.107137 -1.475429 0.2604933
## [4,] -1.079695 0.1330153 -1.398384 -0.5671712 1.107137 -1.475429 0.2604933
## [5,] -1.079695 0.1330153 -1.398384 -0.5671712 1.107137 -1.475429 0.2604933
## [,8] [,9] [,10]
## [1,] -0.8791541 0.2064434 -1.536629
## [2,] -0.8791541 0.2064434 -1.536629
## [3,] -0.8791541 0.2064434 -1.536629
## [4,] -0.8791541 0.2064434 -1.536629
## [5,] -0.8791541 0.2064434 -1.536629
intervals2(1)
## [1] "Data from sample 100"
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -0.1613274 0.7328168 0.8725767 -0.1966638 -0.3204468 0.1408637
## [2,] -0.1613274 0.7328168 0.8725767 -0.1966638 -0.3204468 0.1408637
## [3,] -0.1613274 0.7328168 0.8725767 -0.1966638 -0.3204468 0.1408637
## [4,] -0.1613274 0.7328168 0.8725767 -0.1966638 -0.3204468 0.1408637
## [5,] -0.1613274 0.7328168 0.8725767 -0.1966638 -0.3204468 0.1408637
## [,7] [,8] [,9] [,10]
## [1,] 0.5893156 0.7367724 -2.549818 -0.02110551
## [2,] 0.5893156 0.7367724 -2.549818 -0.02110551
## [3,] 0.5893156 0.7367724 -2.549818 -0.02110551
## [4,] 0.5893156 0.7367724 -2.549818 -0.02110551
## [5,] 0.5893156 0.7367724 -2.549818 -0.02110551
## [1] 0.57
## [1] 0.92