Lab 6

Exercise 1

These percentages are sample statistics derived from the sample.

Exercise 2

To generalize the findings, we must assume that the respondents were sampled randomly and independently, and that each religious category has at least 10 respondents in each country (needed to test a difference in proportions).

Exercise 3

# Load the saved workspace image; the `atheism` data frame used below is
# presumably defined by this file.
load(file = "C:/Users/sogde/Downloads/atheism.RData")

Each row corresponds to a single survey respondent.

Exercise 4

# Keep only the United States respondents from the 2012 survey.
us12 <- atheism[
  which(atheism$nationality == "United States" & atheism$year == "2012"),
  ,
  drop = FALSE
]

# Share of those respondents whose response is "atheist".
sum(us12$response == "atheist", na.rm = TRUE) / nrow(us12)

## [1] 0.0499002

We get a result of about 5%.

Exercise 5

The conditions for inference are that the sample is random and independent and that there are at least 10 respondents in the category of interest. These conditions are met by the survey methodology and by the count of atheist respondents:

# Number of 2012 United States respondents who answered "atheist".
sum(us12$response == "atheist", na.rm = TRUE)

## [1] 50

Exercise 6

# Confidence interval (theoretical method) for the proportion of "atheist"
# responses among the 2012 United States respondents.
inference(
  us12$response,
  est = "proportion",
  type = "ci",
  method = "theoretical",
  success = "atheist"
)

## Warning: package 'lmPerm' was built under R version 3.3.2

## Warning: package 'BHH2' was built under R version 3.3.2

## Single proportion -- success: atheist 
## Summary statistics:

## p_hat = 0.0499 ;  n = 1002 
## Check conditions: number of successes = 50 ; number of failures = 952 
## Standard error = 0.0069 
## 95 % Confidence interval = ( 0.0364 , 0.0634 )

Margin of error

# Margin of error of the 95% interval: critical value times standard error.
se <- 0.0069           # standard error reported by inference() above
zstar <- qnorm(0.975)  # two-sided 95% critical value (about 1.96, not 1.966)
moe <- zstar * se
moe

## [1] 0.01352375

Exercise 7

Canada

ca12 <- subset(atheism, nationality == "Canada" & year == "2012")

# Success-failure condition: at least 10 atheists AND at least 10
# non-atheists (the threshold is ">= 10", and both counts must pass).
ca_successes <- sum(ca12$response == "atheist", na.rm = TRUE)
ca_failures <- sum(ca12$response != "atheist", na.rm = TRUE)
min(ca_successes, ca_failures) >= 10

## [1] TRUE

##Conditions met

# Confidence interval (theoretical method) for the proportion of "atheist"
# responses among the 2012 Canada respondents.
inference(
  ca12$response,
  est = "proportion",
  type = "ci",
  method = "theoretical",
  success = "atheist"
)

## Single proportion -- success: atheist 
## Summary statistics:

## p_hat = 0.0898 ;  n = 1002 
## Check conditions: number of successes = 90 ; number of failures = 912 
## Standard error = 0.009 
## 95 % Confidence interval = ( 0.0721 , 0.1075 )

##Does not contain zero

# Margin of error for Canada: 95% critical value times standard error.
# The critical value is computed here instead of reusing a hand-typed constant.
seca <- 0.009  # standard error reported by inference() above
moeca <- seca * qnorm(0.975)
moeca

## [1] 0.01763968

France

fr12 <- subset(atheism, nationality == "France" & year == "2012")

# Success-failure condition: at least 10 atheists AND at least 10
# non-atheists (the threshold is ">= 10", and both counts must pass).
fr_successes <- sum(fr12$response == "atheist", na.rm = TRUE)
fr_failures <- sum(fr12$response != "atheist", na.rm = TRUE)
min(fr_successes, fr_failures) >= 10

## [1] TRUE

##Conditions met

# Confidence interval (theoretical method) for the proportion of "atheist"
# responses among the 2012 France respondents.
inference(
  fr12$response,
  est = "proportion",
  type = "ci",
  method = "theoretical",
  success = "atheist"
)

## Single proportion -- success: atheist 
## Summary statistics:

## p_hat = 0.2873 ;  n = 1688 
## Check conditions: number of successes = 485 ; number of failures = 1203 
## Standard error = 0.011 
## 95 % Confidence interval = ( 0.2657 , 0.3089 )

##Does not contain zero

# Margin of error for France: 95% critical value times standard error.
# The critical value is computed here instead of reusing a hand-typed constant.
sefr <- 0.011  # standard error reported by inference() above
moefr <- sefr * qnorm(0.975)
moefr

## [1] 0.0215596

Exercise 8

# Margin of error (using 2 as the approximate critical value) as a function
# of the population proportion, for a fixed sample size of 1000.
n <- 1000
p <- seq(from = 0, to = 1, by = 0.01)
me <- 2 * sqrt((p * (1 - p)) / n)
plot(p, me, xlab = "Population Proportion", ylab = "Margin of Error")

As a function of the proportion, the margin of error follows a semi-circular (more precisely, semi-elliptical) arc, which is locally parabolic around p = 0.5 in a Taylor expansion. Its theoretical maximum is where dme/dp = 0, at p = 0.5. The location of this maximum is independent of n.

Exercise 9

# Simulate 5000 sample proportions: each draw takes a sample of size n from a
# population in which the true proportion of atheists is p.
p <- 0.1
n <- 1040

p_hats <- vapply(
  seq_len(5000),
  function(draw) {
    samp <- sample(
      c("atheist", "non_atheist"), n,
      replace = TRUE, prob = c(p, 1 - p)
    )
    sum(samp == "atheist") / n
  },
  numeric(1)
)

hist(p_hats, main = "p = 0.1, n = 1040", xlim = c(0, 0.18))

summary(p_hats)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.07019 0.09327 0.10000 0.09998 0.10580 0.13170

##Center
mean(p_hats)

## [1] 0.09997692

##Spread (standard deviation)
sd(p_hats)

## [1] 0.009275774

Exercise 10

# Same simulation with a smaller sample size: 5000 sample proportions from
# samples of size n, true proportion p.
p <- 0.1
n <- 400

p_hats1 <- vapply(
  seq_len(5000),
  function(draw) {
    samp <- sample(
      c("atheist", "non_atheist"), n,
      replace = TRUE, prob = c(p, 1 - p)
    )
    sum(samp == "atheist") / n
  },
  numeric(1)
)

summary(p_hats1)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0500  0.0900  0.1000  0.1002  0.1100  0.1575

##Center
mean(p_hats1)

## [1] 0.100152

##Spread (standard deviation)
sd(p_hats1)

## [1] 0.01483883

# Same simulation with a smaller true proportion: 5000 sample proportions
# from samples of size n, true proportion p.
p <- 0.02
n <- 1040

p_hats2 <- vapply(
  seq_len(5000),
  function(draw) {
    samp <- sample(
      c("atheist", "non_atheist"), n,
      replace = TRUE, prob = c(p, 1 - p)
    )
    sum(samp == "atheist") / n
  },
  numeric(1)
)

summary(p_hats2)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.005769 0.017310 0.020190 0.020010 0.023080 0.036540

##Center
mean(p_hats2)

## [1] 0.02001462

##Spread (standard deviation)
sd(p_hats2)

## [1] 0.004303382

# Same simulation with both the smaller true proportion and the smaller
# sample size: 5000 sample proportions from samples of size n.
p <- 0.02
n <- 400

p_hats3 <- vapply(
  seq_len(5000),
  function(draw) {
    samp <- sample(
      c("atheist", "non_atheist"), n,
      replace = TRUE, prob = c(p, 1 - p)
    )
    sum(samp == "atheist") / n
  },
  numeric(1)
)

summary(p_hats3)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00250 0.01500 0.02000 0.02009 0.02500 0.05000

##Center
mean(p_hats3)

## [1] 0.0200945

##Spread (standard deviation)
sd(p_hats3)

## [1] 0.007119161

# Draw the four simulated sampling distributions in a 2 x 2 grid, all on the
# same x-axis range so their centers and spreads can be compared directly.
par(mfrow = c(2, 2))
common_xlim <- c(0, 0.18)
hist(p_hats, xlim = common_xlim, main = "p = 0.1, n = 1040")
hist(p_hats1, xlim = common_xlim, main = "p = 0.1, n = 400")
hist(p_hats2, xlim = common_xlim, main = "p = 0.02, n = 1040")
hist(p_hats3, xlim = common_xlim, main = "p = 0.02, n = 400")

Based on this, increasing n decreases the spread, and changing p changes the center (the standard deviations above show that a smaller p also reduces the spread).

Exercise 11

It is sensible for the first case but probably not for the second. The samples are independent, but the expected numbers of atheists are 0.1 x 1040 = 104 and 0.02 x 400 = 8, so the minimum of 10 successes is not met for the second group.

Problem 1

Spain

# Spain respondents, one data frame per survey year.
sp12 <- atheism[
  which(atheism$nationality == "Spain" & atheism$year == "2012"),
  ,
  drop = FALSE
]
sp05 <- atheism[
  which(atheism$nationality == "Spain" & atheism$year == "2005"),
  ,
  drop = FALSE
]

# Confidence interval (theoretical method) for the 2012 proportion of
# "atheist" responses in Spain.
inference(
  sp12$response,
  est = "proportion",
  type = "ci",
  method = "theoretical",
  success = "atheist"
)

## Single proportion -- success: atheist 
## Summary statistics:

## p_hat = 0.09 ;  n = 1145 
## Check conditions: number of successes = 103 ; number of failures = 1042 
## Standard error = 0.0085 
## 95 % Confidence interval = ( 0.0734 , 0.1065 )

# Confidence interval (theoretical method) for the 2005 proportion of
# "atheist" responses in Spain.
inference(
  sp05$response,
  est = "proportion",
  type = "ci",
  method = "theoretical",
  success = "atheist"
)

## Single proportion -- success: atheist 
## Summary statistics:

## p_hat = 0.1003 ;  n = 1146 
## Check conditions: number of successes = 115 ; number of failures = 1031 
## Standard error = 0.0089 
## 95 % Confidence interval = ( 0.083 , 0.1177 )

The 2005 and 2012 confidence intervals for Spain overlap, so there is no convincing evidence of a change in the proportion of atheists.

# United States respondents, one data frame per survey year.
us12 <- atheism[
  which(atheism$nationality == "United States" & atheism$year == "2012"),
  ,
  drop = FALSE
]
us05 <- atheism[
  which(atheism$nationality == "United States" & atheism$year == "2005"),
  ,
  drop = FALSE
]

# Confidence interval (theoretical method) for the 2012 proportion of
# "atheist" responses in the United States.
inference(
  us12$response,
  est = "proportion",
  type = "ci",
  method = "theoretical",
  success = "atheist"
)

## Single proportion -- success: atheist 
## Summary statistics:

## p_hat = 0.0499 ;  n = 1002 
## Check conditions: number of successes = 50 ; number of failures = 952 
## Standard error = 0.0069 
## 95 % Confidence interval = ( 0.0364 , 0.0634 )

# Confidence interval (theoretical method) for the 2005 proportion of
# "atheist" responses in the United States.
inference(
  us05$response,
  est = "proportion",
  type = "ci",
  method = "theoretical",
  success = "atheist"
)

## Single proportion -- success: atheist 
## Summary statistics:

## p_hat = 0.01 ;  n = 1002 
## Check conditions: number of successes = 10 ; number of failures = 992 
## Standard error = 0.0031 
## 95 % Confidence interval = ( 0.0038 , 0.0161 )

The 2005 and 2012 confidence intervals for the United States do not overlap, so there is convincing evidence of a change in the proportion of atheists.

Problem 2

At the 5% significance level, we would expect to detect a change by chance alone (a Type I error) in about 0.05 x 39 ≈ 2 countries.

Problem 3

To ensure a margin of error no greater than 1%, we assume the proportion takes the value that maximizes the margin of error, i.e. p = 0.5. With zstar = 1.96, we solve for n in the margin-of-error equation:

# Sample size needed for a margin of error of at most 1% at 95% confidence.
# Starting from moe = zstar * sqrt(p * (1 - p) / n) and solving for n gives
# n = zstar^2 * p * (1 - p) / moe^2. The critical value must be included;
# leaving it out understates the required sample size.
moe <- 0.01
zstar <- 1.96
p <- 0.5  # worst case: p * (1 - p) is largest at p = 0.5

n <- zstar^2 * p * (1 - p) / moe^2
n

## [1] 9604

n=9604