download.file("http://www.openintro.org/stat/data/atheism.RData", destfile = "atheism.RData")
load("atheism.RData")
us12 <- subset(atheism, nationality == "United States" & year == "2012")
inference(us12$response, est = "proportion", type = "ci", method = "theoretical", success = "atheist")
## Warning: package 'BHH2' was built under R version 3.6.3
## Single proportion -- success: atheist
## Summary statistics:

## p_hat = 0.0499 ; n = 1002
## Check conditions: number of successes = 50 ; number of failures = 952
## Standard error = 0.0069
## 95 % Confidence interval = ( 0.0364 , 0.0634 )
n <- 1000
p <- seq(0, 1, 0.01)
me <- 2 * sqrt(p * (1 - p)/n)
plot(me ~ p, ylab = "Margin of Error", xlab = "Population Proportion")

p <- 0.1
n <- 1040
p_hats <- rep(0, 5000)
for(i in 1:5000){
samp <- sample(c("atheist", "non_atheist"), n, replace = TRUE, prob = c(p, 1-p))
p_hats[i] <- sum(samp == "atheist")/n
}
hist(p_hats, main = "p = 0.1, n = 1040", xlim = c(0, 0.18))

## OpenIntro
1.
## [1] 1
download.file("http://www.openintro.org/stat/data/atheism.RData", destfile = "atheism.RData")
load("atheism.Rdata")
## Is there convincing evidence that Spain has seen a change in its atheism index between 2005 and 2012?
spain05 <- subset(atheism, nationality == "Spain" & year == "2005")
spain05$nationality <- as.factor(as.character(spain05$nationality))
table(spain05$nationality, spain05$response)
##
## atheist non-atheist
## Spain 115 1031
spain12 <- subset(atheism, nationality == "Spain" & year == "2012")
spain12$nationality <- as.factor(as.character(spain12$nationality))
table(spain12$nationality, spain12$response)
##
## atheist non-atheist
## Spain 103 1042
# The number of atheists in 2005 is 115 and in 2012 it is 103. Both are greater than 10, so we can assume nearly normal distribution.
## Is there convincing evidence that the United States has seen a change in its atheism index between 2005 and 2012?
us05 <- subset(atheism, nationality == "United States" & year == "2005")
us05$nationality <- as.factor(as.character(us05$nationality))
table(us05$nationality, us05$response)
##
## atheist non-atheist
## United States 10 992
us12 <- subset(atheism, nationality == "United States" & year == "2012")
us12$nationality <- as.factor(as.character(us12$nationality))
table(us12$nationality, us12$response)
##
## atheist non-atheist
## United States 50 952
# The number of atheists in 2005 is 10 and in 2012 it is 50.
## If in fact there has been no change in the atheism index in the countries listed in Table 4, in how many of those countries would you expect to detect a change (at a significance level of 0.05) simply by chance?
#Hint: Look in the textbook index under Type 1 error.
#Since we reject a null hypothesis, that means we have made a Type 1 error. With the significant level of 0.05 and 39 countries, so the Type 1 error is about 2. (39*0.05)
##Suppose you're hired by the local government to estimate the proportion of residents that attend a religious service on a weekly basis. According to the guidelines, the estimate must have a margin of error no greater than 1% with 95% confidence. You have no idea what to expect for p. How many people would you have to sample to ensure that you are within the guidelines?
#If the margin of error is 0.01
MOE <- 0.01
z<- 1.96
SE <- MOE/z
p<- .5
n<- (p*(1-p))/SE^2
# Milestone Data
#1. Select a categorical variable of interest, and develop a research question about a population proportion for the variable of interest. Answer the question by conducting a hypothesis test. Use the Prepare-Check-Calculate-Conclude method modeled in OpenIntro Sections 5.3 and 6.1. If your dataset contains the population data, then generate a sample of size n=40, and use the sample to perform the test.
Pokemon <- read.csv("Pokemon.csv")
#Prepare
n = 40
p = .5
con_lvl = .95
#Ho : p =.5
#HA : p > .5
alpha <- 1-(con_lvl/100)
#Pokemon_Legend <- sample(Pokemon$Legendary, n) there is an error when publishing
#p_hat <- mean(Pokemon_Legend)
#mean(p_hat)
#Check
# this is independent because the sample was randomly sample.
#n*p >= 10
#n*(1-p) > 10
# it doesn't satisfy the check
# calculate
#table(Pokemon_Legend)
#inference(y=as.factor(Pokemon_Legend), est = "proportion", success = "TRUE", type = "ht", alternative = "less", method = "theoretical", null=0.5)
#Conclude
# since the p value is less than alpha = .05 we reject the null hypothesis.
#2. Select two categorical variables of interest (or split one of your categorical variables), and develop a research question about the relationship between the two corresponding population proportions. Answer the question by conducting a hypothesis test. Use the Prepare-Check-Calculate-Conclude method modeled in OpenIntro Section 6.2. If your dataset contains the population data, then generate a sample of size n=40, and use the sample to perform the test.
#Prepare
n = 40
con_lvl = .95
#Ho : The proportion Type1 Fire = The proportion Type2 Poison;
#HA : The proportion Type1 Fire =! The proportion Type2 Poison;
n = 40
type1 <- sample(Pokemon$Type.1,n);
set.seed(40)
type2 <- sample(Pokemon$Type.2,n);
str(Pokemon)
## 'data.frame': 800 obs. of 13 variables:
## $ X. : int 1 2 3 3 4 5 6 6 6 7 ...
## $ Name : Factor w/ 800 levels "Abomasnow","AbomasnowMega Abomasnow",..: 81 330 746 747 103 104 100 101 102 666 ...
## $ Type.1 : Factor w/ 18 levels "Bug","Dark","Dragon",..: 10 10 10 10 7 7 7 7 7 18 ...
## $ Type.2 : Factor w/ 19 levels "","Bug","Dark",..: 15 15 15 15 1 1 9 4 9 1 ...
## $ Total : int 318 405 525 625 309 405 534 634 634 314 ...
## $ HP : int 45 60 80 80 39 58 78 78 78 44 ...
## $ Attack : int 49 62 82 100 52 64 84 130 104 48 ...
## $ Defense : int 49 63 83 123 43 58 78 111 78 65 ...
## $ Sp..Atk : int 65 80 100 122 60 80 109 130 159 50 ...
## $ Sp..Def : int 65 80 100 120 50 65 85 85 115 64 ...
## $ Speed : int 45 60 80 80 65 80 100 100 100 43 ...
## $ Generation: int 1 1 1 1 1 1 1 1 1 1 ...
## $ Legendary : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
table(type1)
## type1
## Bug Dark Dragon Electric Fairy Fighting Fire Flying
## 4 1 2 2 2 0 3 0
## Ghost Grass Ground Ice Normal Poison Psychic Rock
## 1 4 1 0 2 1 4 2
## Steel Water
## 1 10
Fire1 = sum(type1 == "Fire");
Fire1
## [1] 3
table(type1)
## type1
## Bug Dark Dragon Electric Fairy Fighting Fire Flying
## 4 1 2 2 2 0 3 0
## Ghost Grass Ground Ice Normal Poison Psychic Rock
## 1 4 1 0 2 1 4 2
## Steel Water
## 1 10
p_hat1 = Fire1/n;
p_hat1
## [1] 0.075
Pois2 = sum(type2 == "Poison");
Pois2
## [1] 3
p_hat2 = Pois2/n;
p_hat2
## [1] 0.075
table(Pois2)
## Pois2
## 3
## 1
p_diff = p_hat1-p_hat2;
p_diff
## [1] 0
#Check
p_pool = (p_hat1+p_hat2)/(n+n);
p_pool
## [1] 0.001875
# this data is independent because I took a random sample from the data.
n*p_pool >= 10
## [1] FALSE
n*(1-p_pool) >= 10
## [1] TRUE
#DO not meet satisfication
# calculate
SE = sqrt(p_pool*(1-p_pool)*(1/n +1/n));
SE
## [1] 0.009673377
z = p_diff/SE; z
## [1] 0
p_value = pnorm(z)*2;
p_value
## [1] 1
#Conclude
# At the 95% cofidence level, the p_value is greater than alpha = .05 so we fail to reject the null hypothesis.