Based on the side-by-side box plots, the selected shelves numbered below 22 have similar average replacement costs, and shelves 23-40 have similar (higher) costs. Overall, though, the shelf means and variances vary widely and are clearly NOT the same throughout.
books <- read.csv(file="books.csv", header=TRUE, sep=",")
boxplot(books$replace~books$shelf, main = "Replacement Cost of Book by Shelf", xlab="Shelf Number", ylab="Replacement Cost")
\(\hat{t}_{unb} = 32637.73\).
The standard error of \(\hat{t}_{unb}\) is 5733.527.
The estimated coefficient of variation is 0.1756717.
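For reference, these are the two-stage cluster-sampling formulas the code below implements, with \(N = 44\) shelves, \(n = 12\) sampled shelves, \(M_i\) books on shelf \(i\), and \(m_i = 5\) books sampled from each selected shelf:
\[
\hat{t}_{unb} = \frac{N}{n}\sum_{i \in \mathcal{S}} \hat{t}_i = \frac{N}{n}\sum_{i \in \mathcal{S}} M_i \bar{y}_i,
\qquad
\widehat{SE}\!\left(\hat{t}_{unb}\right) = N\sqrt{\left(1 - \frac{n}{N}\right)\frac{s_t^2}{n} + \frac{1}{nN}\sum_{i \in \mathcal{S}}\left(1 - \frac{m_i}{M_i}\right)M_i^2\,\frac{s_i^2}{m_i}},
\]
where \(s_t^2\) is the sample variance of the estimated shelf totals \(\hat{t}_i\) and \(s_i^2\) is the sample variance of replacement costs within shelf \(i\).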
df <- data.frame(books)
#Add mi Column
df$mi <- ifelse(df$shelf > 0, 5, 0)   #five books were sampled from every selected shelf
#Add avg or yi_bar Column
yi <- aggregate(replace ~ shelf, data = df, FUN = mean)
y_i_bar <- yi$replace
shelves <- c(2, 4, 11, 14, 20, 22, 23, 31, 37, 38, 40, 43)
Mi <- c(26, 52, 70, 47, 5, 28, 27, 29, 21, 31, 14, 27)
mi <- rep(5, 12)
ti_hat <- Mi*y_i_bar
ti_hat
## [1] 249.6 322.4 644.0 338.4 209.0 834.4 1398.6 1774.8 1058.4 1134.6
## [11] 758.8 178.2
df$yi_bar <- y_i_bar[match(df$shelf, shelves)]   #attach each shelf's sample mean to its rows
#si^2 column: within-shelf sample variances (denominator m_i - 1 = 4), in shelf order
si2 <- as.numeric(tapply(df$replace, df$shelf, var))
#Estimate of Total Replacement Value of the Book Collection (N = 44 shelves, n = 12 sampled)
t_hat_unb <- (44/12)*sum(ti_hat)
t_hat_unb
## [1] 32637.73
#Standard Error of Estimate
k <- sum((1 - mi/Mi) * Mi^2 * si2/mi)   #within-shelf (second-stage) variance component
st_2 <- var(ti_hat)                     #sample variance of the estimated shelf totals
SE_t_hat <- 44*sqrt( (1-12/44)*(st_2/12) + (1/(12*44))*(372463.2))   #372463.2 is the precomputed within-shelf component
SE_t_hat
## [1] 5733.527
#Estimated Coefficient of Variation for t_hat_unb
SE_t_hat/t_hat_unb
## [1] 0.1756717
The average replacement cost per book is estimated to be $23.61.
The standard error of \(\hat{\bar{y}}_r\) is 5.476.
The estimated coefficient of variation of \(\hat{\bar{y}}_r\) is 0.2319271.
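For reference, the corresponding ratio-estimator formulas the code below uses, with \(\bar{M}\) the mean number of books per sampled shelf:
\[
\hat{\bar{y}}_r = \frac{\sum_{i \in \mathcal{S}} M_i \bar{y}_i}{\sum_{i \in \mathcal{S}} M_i},
\qquad
s_r^2 = \frac{1}{n-1}\sum_{i \in \mathcal{S}}\left(\hat{t}_i - M_i \hat{\bar{y}}_r\right)^2,
\qquad
\widehat{SE}\!\left(\hat{\bar{y}}_r\right) = \frac{1}{\bar{M}}\sqrt{\left(1 - \frac{n}{N}\right)\frac{s_r^2}{n} + \frac{1}{nN}\sum_{i \in \mathcal{S}}\left(1 - \frac{m_i}{M_i}\right)M_i^2\,\frac{s_i^2}{m_i}}.
\]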
#Average Cost per Book (ratio estimator)
y_hat_bar_r <- sum(ti_hat)/sum(Mi)
y_hat_bar_r
## [1] 23.61061
sr2 <- sum((ti_hat - (Mi*y_hat_bar_r))^2)/(12-1)
sr2
## [1] 476700.3
#Estimated variance of y_hat_bar_r
V_yhatbarr <- (1-(12/44))*(sr2/(12*mean(Mi)^2)) + (1/(12*44*mean(Mi)^2))*(372463.2)
#Standard Error
SE_y_bar_r <- 1/(mean(Mi)) * sqrt( (1-12/44) * sr2/12 + 705.4)   #705.4 is approximately 372463.2/(12*44), the within-shelf component
SE_y_bar_r
## [1] 5.475941
#Coefficient of Variation
SE_y_bar_r/y_hat_bar_r
## [1] 0.2319271
I basically used your code and changed avg_vol to avg_leng.
The estimate for average egg length is 48.65, with a standard error of 0.1293.
A note for future me reviewing this: N (the number of clutches in the population) is not given, but it must be very large, so the fpc in the first variance term is close to one, and the second (within-clutch) term carries a 1/(nN) factor and is negligible, so we can ignore it.
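In formula form, this is what the code further below computes once those terms are dropped, with \(M_i\) the clutch size, \(\bar{y}_i\) the mean egg length in clutch \(i\), \(n\) the number of clutches, and \(\bar{M}\) the mean clutch size:
\[
\hat{\bar{y}}_r = \frac{\sum_i M_i \bar{y}_i}{\sum_i M_i},
\qquad
s_r^2 = \frac{1}{n-1}\sum_i \left(M_i \bar{y}_i - M_i \hat{\bar{y}}_r\right)^2,
\qquad
\widehat{SE}\!\left(\hat{\bar{y}}_r\right) \approx \frac{1}{\bar{M}}\sqrt{\frac{s_r^2}{n}}.
\]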
library(SDaA)
##
## Attaching package: 'SDaA'
## The following object is masked _by_ '.GlobalEnv':
##
## books
data(coots)
head(coots)
## clutch csize length breadth volume tmt
## 1 1 13 44.30 31.10 3.7957569 yes
## 2 1 13 45.90 32.70 3.9328497 yes
## 3 2 13 49.20 34.40 4.2156036 yes
## 4 2 13 48.70 32.70 4.1727621 yes
## 5 3 6 51.05 34.25 0.9317646 no
## 6 3 6 49.35 34.40 0.9007362 no
plot(coots$clutch, coots$length, pch=16, col="maroon", main = "Scatterplot of Length Vs Clutch", xlab="clutch", ylab="length")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
mns <- group_by(coots, clutch) %>% summarise(avg_leng = mean(length))
mns$rank <- rank(mns$avg_leng)
head(coots)
## clutch csize length breadth volume tmt
## 1 1 13 44.30 31.10 3.7957569 yes
## 2 1 13 45.90 32.70 3.9328497 yes
## 3 2 13 49.20 34.40 4.2156036 yes
## 4 2 13 48.70 32.70 4.1727621 yes
## 5 3 6 51.05 34.25 0.9317646 no
## 6 3 6 49.35 34.40 0.9007362 no
coots <- merge(coots, mns, by.x = "clutch", by.y = "clutch", all.x = TRUE)
head(coots)
## clutch csize length breadth volume tmt avg_leng rank
## 1 1 13 44.30 31.10 3.7957569 yes 45.10 5.0
## 2 1 13 45.90 32.70 3.9328497 yes 45.10 5.0
## 3 2 13 49.20 34.40 4.2156036 yes 48.95 110.5
## 4 2 13 48.70 32.70 4.1727621 yes 48.95 110.5
## 5 3 6 51.05 34.25 0.9317646 no 50.20 146.5
## 6 3 6 49.35 34.40 0.9007362 no 50.20 146.5
#Compute per-clutch means and clutch sizes, then the ratio estimate of mean egg length
mns <- group_by(coots, clutch) %>% summarise(avg_leng= mean(length), M_i = mean(csize))
y_hat_bar_r <- sum(mns$M_i*mns$avg_leng)/sum(mns$M_i)
y_hat_bar_r
## [1] 48.64863
#Let's calculate the variance
M_bar <- mean(mns$M_i)
n <- nrow(mns)
s2_r <- sum((mns$M_i*mns$avg_leng - mns$M_i*y_hat_bar_r)^2)/(n-1)
#Standard error (fpc and within-clutch term dropped since N is very large; see note above)
se <- sqrt(1/M_bar^2*s2_r/n)
se
## [1] 0.129321
par(mfrow=c(1,2),pty="s",ask=T)
intervals <- function(groupcorr = 0, numintervals = 100, groupsize = 5,
sampgroups=10, popgroups=5000, mu = 0, sigma = 1) {
#Description:
# Simulate a population of clusters, then draw a simple random
# sample of clusters and construct interval estimates using incorrect
# SRS formulae and formulae appropriate for cluster samples
#Usage:
# intervals(groupcorr=0, numintervals = 100, groupsize = 5, sampgroups=10,
# popgroups=5000, mu = 0, sigma = 1)
#Arguments:
#groupcorr: The intracluster correlation coefficient rho.
#numintervals: Number of samples to be taken from population.
#groupsize: Number of elements in each population cluster.
#sampgroups: Number of clusters to be sampled
#popgroups: Number of clusters in population
# mu: Mean for generating population
# sigma: Standard deviation for generating population
#Value:
# Produces 3 graphs:
# (1) Dotplot of data from last sample, displaying similarity within clusters
# (2) Interval estimates for mean using SRS formulas. Red lines
# are the intervals that do not include the population mean
# (3) Confidence intervals using clusters as unit of analysis.
if (groupcorr < 0 | groupcorr > 1) stop("correlation must be between 0 and 1")
betweenvar <- groupcorr * sigma^2
withinvar <- sigma^2*(1 - groupcorr)
group <- rep(1:sampgroups,each=groupsize)
# Generate population with correlation structure
gpmeans <- rnorm(popgroups,mu,sqrt(betweenvar))
yy <- rep(gpmeans,each=groupsize) + rnorm(popgroups*groupsize,0,sqrt(withinvar))
yy <- matrix(yy,nrow=groupsize,ncol=popgroups)
indci <- matrix(0,ncol=2,nrow=numintervals)
clci <- indci
yy <- yy - mean(yy) + mu # Adjust population so it has mean mu
for (i in 1:numintervals) {
# Take cluster sample of size groupsize
sampindex <- sample(1:popgroups,sampgroups)
mysamp <- yy[,sampindex]
indci[i,] <- t.test(as.vector(mysamp))$conf.int
sampmeans <- apply(mysamp,2,mean)
clci[i,] <- t.test(sampmeans)$conf.int
}
print(paste("Data from sample",numintervals))
print(mysamp)
plot(group,as.vector(mysamp),main=paste("Data values from sample",numintervals),
xlab="group number",ylab="x")
plot(as.vector(mysamp),as.vector(mysamp),type="n",axes=F,xlab="",ylab="")
# Determine if true value inside intervals
indcover <- indci[,1] < mu & indci[,2] > mu
indcol <- rep("black",numintervals)
indcol[!indcover] <- "red"
clcover <- clci[,1] < mu & clci[,2] > mu
clcol <- rep("black",numintervals)
clcol[!clcover] <- "red"
# Draw confidence intervals
plot(indci[,1],1:numintervals,type="n",xlim=c(-2*sigma+mu,2*sigma+mu),
ylab="interval",xlab="",main = "assuming SRS")
abline(v=mu)
for ( i in 1:numintervals) lines( c(indci[i,1],indci[i,2]),c(i,i),col=indcol[i] )
plot(clci[,1],1:numintervals,type="n",xlim=c(-2*sigma+mu,2*sigma+mu),
ylab="interval",xlab="",main = "using sampling design")
abline(v=mu)
for ( i in 1:numintervals) lines( c(clci[i,1],clci[i,2]),c(i,i),col=clcol[i] )
}
The effect of ignoring the clustering when ICC = 0 is not an issue, because the sample is effectively an SRS.
The SRS plot simply treats each clustered observation as an individual SRS draw.
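As a minimal side check (not part of the required code, just re-deriving the variance split used inside intervals() when groupcorr = 0):
groupcorr <- 0; sigma <- 1
betweenvar <- groupcorr * sigma^2        #equals 0, so rnorm(popgroups, mu, 0) returns mu for every cluster mean
withinvar <- sigma^2 * (1 - groupcorr)   #equals sigma^2, so all of the variability is within clusters
With betweenvar = 0 the generated values are iid \(N(\mu, \sigma^2)\), which is exactly an SRS, so both analyses should achieve roughly 95% coverage.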
set.seed(1234)
intervals(0)
## [1] "Data from sample 100"
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.2330084 -0.1350343 0.6039789 -0.2570171 -0.5486294 0.6714340
## [2,] 0.3558845 -1.4001874 -1.6911135 1.4129490 -1.2120906 -0.3779769
## [3,] 0.1702957 0.1216958 1.0975348 0.2019594 1.4535386 0.8181927
## [4,] 0.3183312 0.4477785 1.8672250 0.7642093 1.0353036 2.5165197
## [5,] 1.7990276 -1.3217385 -0.9191481 -1.3368679 1.6204426 0.1652719
## [,7] [,8] [,9] [,10]
## [1,] 0.2172319 -0.5087818 1.7174299 2.08702675
## [2,] -0.3001348 2.3921469 -1.3621159 0.10898358
## [3,] -0.6160307 -0.4142567 0.5730560 0.06851685
## [4,] 0.7237702 0.8820747 0.6946531 -1.07005491
## [5,] 1.1418483 0.2009615 1.1931444 0.21070931
In comparison, when ICC = 0.5 the intervals are wider than when ICC = 0 but not as wide as when ICC = 1.
From the intervals2(0.5) output below, 76% of the SRS-based interval estimates contain the true mean, while 96% of the intervals calculated using the cluster-sample formulae include it.
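A rough way to see why the SRS-based intervals undercover as the ICC grows (this uses the standard design-effect approximation for equal-sized clusters, which the assignment code does not compute): with clusters of size \(m = 5\) and intraclass correlation \(\rho\), the variance of the overall sample mean is inflated by roughly \(1 + (m-1)\rho\) relative to an SRS of the same total size.
#Approximate design effect and SE inflation for clusters of size 5 (illustration only)
m <- 5
rho <- c(0, 0.5, 1)
deff <- 1 + (m - 1)*rho
data.frame(rho = rho, deff = deff, se_inflation = sqrt(deff))
The implied SE inflation factors (1, about 1.7, and about 2.2) line up with the coverage proportions that intervals2() prints below.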
par(mfrow=c(1,2),pty="s",ask=T)
#intervals2() is identical to intervals() above, except that it also prints the
#proportion of SRS-based and cluster-based intervals that cover the true mean.
intervals2 <- function(groupcorr = 0, numintervals = 100, groupsize = 5,
sampgroups=10, popgroups=5000, mu = 0, sigma = 1) {
if (groupcorr < 0 | groupcorr > 1) stop("correlation must be between 0 and 1")
betweenvar <- groupcorr * sigma^2
withinvar <- sigma^2*(1 - groupcorr)
group <- rep(1:sampgroups,each=groupsize)
# Generate population with correlation structure
gpmeans <- rnorm(popgroups,mu,sqrt(betweenvar))
yy <- rep(gpmeans,each=groupsize) + rnorm(popgroups*groupsize,0,sqrt(withinvar))
yy <- matrix(yy,nrow=groupsize,ncol=popgroups)
indci <- matrix(0,ncol=2,nrow=numintervals)
clci <- indci
yy <- yy - mean(yy) + mu # Adjust population so it has mean mu
for (i in 1:numintervals) {
# Take cluster sample of size groupsize
sampindex <- sample(1:popgroups,sampgroups)
mysamp <- yy[,sampindex]
indci[i,] <- t.test(as.vector(mysamp))$conf.int
sampmeans <- apply(mysamp,2,mean)
clci[i,] <- t.test(sampmeans)$conf.int
}
print(paste("Data from sample",numintervals))
print(mysamp)
plot(group,as.vector(mysamp),main=paste("Data values from sample",numintervals),
xlab="group number",ylab="x")
plot(as.vector(mysamp),as.vector(mysamp),type="n",axes=F,xlab="",ylab="")
# Determine if true value inside intervals
indcover <- indci[,1] < mu & indci[,2] > mu
indcol <- rep("black",numintervals)
indcol[!indcover] <- "red"
clcover <- clci[,1] < mu & clci[,2] > mu
clcol <- rep("black",numintervals)
clcol[!clcover] <- "red"
# Draw confidence intervals
plot(indci[,1],1:numintervals,type="n",xlim=c(-2*sigma+mu,2*sigma+mu),
ylab="interval",xlab="",main = "assuming SRS")
abline(v=mu)
for ( i in 1:numintervals) lines( c(indci[i,1],indci[i,2]),c(i,i),col=indcol[i] )
plot(clci[,1],1:numintervals,type="n",xlim=c(-2*sigma+mu,2*sigma+mu),
ylab="interval",xlab="",main = "using sampling design")
abline(v=mu)
for ( i in 1:numintervals) lines( c(clci[i,1],clci[i,2]),c(i,i),col=clcol[i] )
print(mean(indcover))
print(mean(clcover))
}
intervals(0.5)
## [1] "Data from sample 100"
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -1.0024549 1.2267683 0.2946263 -1.6289762 0.2917552 1.5028543
## [2,] -0.7303215 -0.2929515 -1.2425009 0.3564725 2.5286538 1.3064378
## [3,] -0.2194137 -0.4322320 -1.6849145 0.3845662 0.2223460 1.0784968
## [4,] -0.6329102 -0.7226171 -0.7180164 0.3141875 0.9756198 1.9229521
## [5,] -0.6013197 0.7853145 -1.3062367 -1.5108996 0.5597302 0.4590043
## [,7] [,8] [,9] [,10]
## [1,] 0.29447091 1.944273 -0.5932644 -0.35338645
## [2,] -1.29273415 2.947248 0.6409272 -0.55168311
## [3,] -0.02616879 2.486192 -0.5842212 -0.04855095
## [4,] -1.07356349 2.578674 -0.4710935 -1.07545177
## [5,] -0.79721211 2.845043 -2.3669475 -2.08680493
intervals2(0)
## [1] "Data from sample 100"
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -1.9000394 0.9557255 -0.2085279 0.02900536 -0.80310543 0.71956992
## [2,] -2.2867513 1.0146087 -0.9341731 -0.43017230 -0.60271386 -0.16057919
## [3,] -2.2623149 0.5917808 -0.6450523 0.78147753 1.15411206 -0.04835204
## [4,] 0.2399447 1.3233774 0.5727465 -0.32631639 -1.14898584 -0.98692163
## [5,] 0.7587396 -0.2899003 0.6899971 1.47432902 0.04425618 -1.30086588
## [,7] [,8] [,9] [,10]
## [1,] -0.09801864 0.1257906 0.323780528 -0.81971136
## [2,] 1.91609655 0.5458051 -0.002807239 -0.45006716
## [3,] -0.19067728 1.5916478 0.209970051 -0.04075734
## [4,] -1.17736948 0.3217231 0.107319998 0.13787678
## [5,] 1.57772090 -0.7346071 0.516078538 0.57699492
## [1] 0.93
## [1] 0.94
intervals2(0.5)
## [1] "Data from sample 100"
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -1.3961574 -0.5600964 -0.88498748 1.3975282 0.5604249 0.16642608
## [2,] -1.0274363 -0.2203076 1.24736220 1.6041660 1.0161729 -1.06430125
## [3,] -1.2774339 -0.2051571 0.62455414 1.3079740 -0.4367768 -0.50360805
## [4,] -1.4350511 -0.7770021 0.03118201 0.1450623 1.7530799 -0.02402023
## [5,] -0.9386758 0.2203689 0.80944188 -0.1691540 1.9562928 1.20660794
## [,7] [,8] [,9] [,10]
## [1,] -0.30378898 1.2922282 2.644890 -1.37295867
## [2,] -2.25533124 0.8930161 2.334817 -0.46593062
## [3,] -0.19012419 0.9489262 2.590199 0.08489339
## [4,] 0.89590193 0.5285770 1.618975 -0.30332097
## [5,] -0.03636114 0.7077515 2.137351 0.04270696
## [1] 0.76
## [1] 0.96
intervals2(1)
## [1] "Data from sample 100"
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 0.2607745 -0.7079134 -0.2975785 1.423688 1.321459 -0.7995306 2.055115
## [2,] 0.2607745 -0.7079134 -0.2975785 1.423688 1.321459 -0.7995306 2.055115
## [3,] 0.2607745 -0.7079134 -0.2975785 1.423688 1.321459 -0.7995306 2.055115
## [4,] 0.2607745 -0.7079134 -0.2975785 1.423688 1.321459 -0.7995306 2.055115
## [5,] 0.2607745 -0.7079134 -0.2975785 1.423688 1.321459 -0.7995306 2.055115
## [,8] [,9] [,10]
## [1,] -0.5834179 -0.304153 -1.712121
## [2,] -0.5834179 -0.304153 -1.712121
## [3,] -0.5834179 -0.304153 -1.712121
## [4,] -0.5834179 -0.304153 -1.712121
## [5,] -0.5834179 -0.304153 -1.712121
## [1] 0.46
## [1] 0.93
If we run the function with ICC = 1, I expect the intervals to be wider, the percentage of interval estimates containing the true mean to drop when the clustering is ignored, and the coverage to stay near the nominal level when the clustering is accounted for.
The output below shows my predictions were correct.
intervals(1)
## [1] "Data from sample 100"
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] -1.079695 0.1330153 -1.398384 -0.5671712 1.107137 -1.475429 0.2604933
## [2,] -1.079695 0.1330153 -1.398384 -0.5671712 1.107137 -1.475429 0.2604933
## [3,] -1.079695 0.1330153 -1.398384 -0.5671712 1.107137 -1.475429 0.2604933
## [4,] -1.079695 0.1330153 -1.398384 -0.5671712 1.107137 -1.475429 0.2604933
## [5,] -1.079695 0.1330153 -1.398384 -0.5671712 1.107137 -1.475429 0.2604933
## [,8] [,9] [,10]
## [1,] -0.8791541 0.2064434 -1.536629
## [2,] -0.8791541 0.2064434 -1.536629
## [3,] -0.8791541 0.2064434 -1.536629
## [4,] -0.8791541 0.2064434 -1.536629
## [5,] -0.8791541 0.2064434 -1.536629
intervals2(1)
## [1] "Data from sample 100"
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -0.1613274 0.7328168 0.8725767 -0.1966638 -0.3204468 0.1408637
## [2,] -0.1613274 0.7328168 0.8725767 -0.1966638 -0.3204468 0.1408637
## [3,] -0.1613274 0.7328168 0.8725767 -0.1966638 -0.3204468 0.1408637
## [4,] -0.1613274 0.7328168 0.8725767 -0.1966638 -0.3204468 0.1408637
## [5,] -0.1613274 0.7328168 0.8725767 -0.1966638 -0.3204468 0.1408637
## [,7] [,8] [,9] [,10]
## [1,] 0.5893156 0.7367724 -2.549818 -0.02110551
## [2,] 0.5893156 0.7367724 -2.549818 -0.02110551
## [3,] 0.5893156 0.7367724 -2.549818 -0.02110551
## [4,] 0.5893156 0.7367724 -2.549818 -0.02110551
## [5,] 0.5893156 0.7367724 -2.549818 -0.02110551
## [1] 0.57
## [1] 0.92