library(SDaA)
data('agsrs')
set.seed(1234)
sample <- replicate(500, sample(agsrs$acres92, 100, replace=TRUE))
str(sample)
## int [1:100, 1:500] 72626 268043 1209335 494304 531206 70404 56102 392639 127867 144858 ...
means <- apply(sample, 2, mean)
kajal <- hist(means, main="Means of 500 Samples", col = 'purple')
kajal
## $breaks
## [1] 200000 220000 240000 260000 280000 300000 320000 340000 360000 380000
## [11] 400000 420000
##
## $counts
## [1] 1 14 51 90 112 102 68 37 18 6 1
##
## $density
## [1] 1.00e-07 1.40e-06 5.10e-06 9.00e-06 1.12e-05 1.02e-05 6.80e-06
## [8] 3.70e-06 1.80e-06 6.00e-07 1.00e-07
##
## $mids
## [1] 210000 230000 250000 270000 290000 310000 330000 350000 370000 390000
## [11] 410000
##
## $xname
## [1] "means"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
Part A. 1*10^-5 Part B. 0.98019 Part C. 69315 samples are needed
n<-1000
sf<- 100000000
###Part A.
p <-n/sf
p
## [1] 1e-05
###Part B.
(1-p)^2000
## [1] 0.9801986
###Part C.
log(0.5)/log(1 - p)
## [1] 69314.37
It’s important to take an SRS instead of taking a convenience sample of the first 10 rows because each member of the audience would have an equal probability of being selected. One could argue very many people don’t like sitting in the front row because their necks may hurt from looking up at the screen, and this could falsely decrease the sample of the audience size. Another could argue that true fans of the film may want to sit in the front rows, which may not represent the size of the audience well.
Also, With the convenience sample, individuals who may be very invested in the film or have poor eyesight may be sitting in the front wrong which could skew the demographic of the sample.
Within strata 1, the SRSs of size 2 = (1,2), (1,3), (1,8), (2,3), (2,8), (3,8) The probability of each sample is 1/6 because they are simple random samples Within strata 2, the SRSs of size 2 = (4,5), (4,6), (4,7), (5,6), (5,7), (6,7) The probability of each sample is 1/6 because they are simple random samples
The sampling distribution is below:
1 1 1 2 1 3 1 4 1 5 1 6 2 1 2 2 2 3 2 4 2 5 2 6 3 1 3 2 3 3 3 4 3 5 3 6 4 1 4 2 4 3 4 4 4 5 30 36 36 28 28 34 34 40 40 32 32 38 40 46 46 38 38 44 36 42 42 34 34 4 6 5 1 5 2 5 3 5 4 5 5 5 6 6 1 6 2 6 3 6 4 6 5 6 6 40 42 48 48 40 40 46 46 52 52 44 44 50
unit_num <- c(1,2,3,8,4,5,6,7)
stratum <- c(1,1,1,1,2,2,2,2)
y <- c(1,2,4,8,4,7,7,7)
str1<-list(c(1,2),c(1,3),c(1,8),c(2,3),c(2,8),c(3,8))
str2<-list(c(4,5),c(4,6),c(4,7),c(5,6),c(5,7),c(6,7))
t_hat<-list()
#Part B. Create the sampling distribution
for (i in 1:6){
for (j in 1:6){
t_hat[paste(i,j)]<-4*mean(y[str1[[i]]]) + 4*mean(y[str2[[j]]])
}
}
sampdist<-unlist(t_hat)
hist(unlist(t_hat))
### If you want to do it by hand...
#(1,2)(4,5)
y1a <- (1+2)/2
y1b <- (4+7)/2
t_hat1 <- 4*y1a + 4*y1b
y1a <- (1+2)/2
y1b <- (4+7)/2
t_hat1 <- 4*y1a + 4*y1b
### Part C.
#Mean in example 2.2 = 40, Variance in example 2.2 = 54.86
mean(sampdist)
## [1] 40
var(sampdist)
## [1] 41.14286
n <-2
N <-4
s_sqr <- 1/(N-1)*sum((sampdist-mean(sampdist))^2)
y1 <- y[stratum == 1]
y2 <- y[stratum == 2]
S_sqr_1 <- var(y1)
S_sqr_2 <- var(y2)
#var_t_hat <- ((S_sqr_2)/n)*(1 - n/N)*N^2 + ((S_sqr_2)/n)*(1 - n/N)*N^2
#var_t_hat
var_t_hat <-4*var(y1) + 4*var(y2)
var_t_hat
## [1] 47.33333
The mean is 71.8333 and the variance is 86.1667
There are 15 SRS’s of size 4
The variable x lists all the possible SRS’s.
x [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15][1,] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 3 [2,] 2 2 2 2 2 2 3 3 3 4 3 3 3 4 4 [3,] 3 3 3 4 4 5 4 4 5 5 4 4 5 5 5 [4,] 4 5 6 5 6 6 5 6 6 6 5 6 6 6 6
Where 1 = 66 2 = 59 3 = 70 4 = 83 5 = 82 6 = 71
y_bar is the list of the means which is 69.50 69.25 66.50 72.50 69.75 69.50 75.25 72.50 72.25 75.50 73.50 70.75 70.50 73.75 76.50. The mean of y_bar is 71.333.
Variance of y_bar = 7.180556
There are 9 possible combinations.
I WORKED WITH GRADY F. ON THIS! HE SHOWED ME HOW TO WRITE THIS CODE!
The 9 possible combinations are listed below [,1] [,2] [,3] [,4] [1,] 66 59 83 82 [2,] 66 70 83 82 [3,] 59 70 83 82 [4,] 66 59 83 71 [5,] 66 70 83 71 [6,] 59 70 83 71 [7,] 66 59 82 71 [8,] 66 70 82 71 [9,] 59 70 82 71
The 6 samples from part c that contain 3 values from strata 1 and 1 value from stratum 2 or 3 values from strata 2 and 1 value from strata 1 are not included.
For example, 66,59,70,83 will not be included.
###Part A
score <-c(66,59,70,83,82,71)
mean(score)
## [1] 71.83333
var(score)
## [1] 86.16667
###Part B.
choose(6,4)
## [1] 15
###Part C.
y <- c(1,2,3,4,5,6)
#List of the possible SRS's
library(permutations)
##
## Attaching package: 'permutations'
## The following object is masked from 'package:stats':
##
## cycle
x <- combn(6,4)
x
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] 1 1 1 1 1 1 1 1 1 1 2 2 2
## [2,] 2 2 2 2 2 2 3 3 3 4 3 3 3
## [3,] 3 3 3 4 4 5 4 4 5 5 4 4 5
## [4,] 4 5 6 5 6 6 5 6 6 6 5 6 6
## [,14] [,15]
## [1,] 2 3
## [2,] 4 4
## [3,] 5 5
## [4,] 6 6
#List of means = y_bar
g <- x[,1]
getmean <- function(g){
return(mean(score[g]))
}
y_bar <- apply(x,2,getmean)
y_bar
## [1] 69.50 69.25 66.50 72.50 69.75 69.50 75.25 72.50 72.25 75.50 73.50
## [12] 70.75 70.50 73.75 76.50
mean(y_bar)
## [1] 71.83333
var(y_bar)
## [1] 7.693452
n <-4
N <-6
#Variance of y_bar
var_y_bar <- ((var(score))/n)*(1 - n/N)
var_y_bar
## [1] 7.180556
# Part D
#Worked with Grady
stratum_1 <- t(combn(score[1:3], m = 2))
stratum_2 <- t(combn(score[4:6], m = 2))
str <- cbind(stratum_1, rbind(stratum_2[1,], stratum_2[1,], stratum_2[1,]))
str <- rbind(str, cbind(stratum_1, rbind(stratum_2[2,], stratum_2[2,], stratum_2[2,])))
str<- rbind(str, cbind(stratum_1, rbind(stratum_2[3,], stratum_2[3,], stratum_2[3,])))
rowMeans(str)
## [1] 72.50 75.25 73.50 69.75 72.50 70.75 69.50 72.25 70.50
# Part E.
var_s1 <- var(c(66, 59, 70))
var_s2 <- var(c(83, 82, 71))
n <-2
N <-3
vs1 <- (1 - n/N)*((N/6)^2)*((var_s1)/n)
vs2 <- (1 - n/N)*((N/6)^2)*((var_s2)/n)
vs1 + vs2
## [1] 3.138889