download.file("http://www.openintro.org/stat/data/atheism.RData", destfile = "atheism.RData")
load("atheism.RData")

us12 <- subset(atheism, nationality == "United States" & year == "2012")
inference(us12$response, est = "proportion", type = "ci", method = "theoretical", success = "atheist")
## Warning: package 'BHH2' was built under R version 3.6.3
## Single proportion -- success: atheist 
## Summary statistics:

## p_hat = 0.0499 ;  n = 1002 
## Check conditions: number of successes = 50 ; number of failures = 952 
## Standard error = 0.0069 
## 95 % Confidence interval = ( 0.0364 , 0.0634 )
n <- 1000
p <- seq(0, 1, 0.01)
me <- 2 * sqrt(p * (1 - p)/n)
plot(me ~ p, ylab = "Margin of Error", xlab = "Population Proportion")

p <- 0.1
n <- 1040
p_hats <- rep(0, 5000)

for(i in 1:5000){
  samp <- sample(c("atheist", "non_atheist"), n, replace = TRUE, prob = c(p, 1-p))
  p_hats[i] <- sum(samp == "atheist")/n
}

hist(p_hats, main = "p = 0.1, n = 1040", xlim = c(0, 0.18))

## OpenIntro

1. 
## [1] 1
download.file("http://www.openintro.org/stat/data/atheism.RData", destfile = "atheism.RData")
load("atheism.Rdata")

## Is there convincing evidence that Spain has seen a change in its atheism index between 2005 and 2012?

spain05 <- subset(atheism, nationality == "Spain" & year == "2005")
spain05$nationality <- as.factor(as.character(spain05$nationality))
table(spain05$nationality, spain05$response)
##        
##         atheist non-atheist
##   Spain     115        1031
spain12 <- subset(atheism, nationality == "Spain" & year == "2012")
spain12$nationality <- as.factor(as.character(spain12$nationality))
table(spain12$nationality, spain12$response)
##        
##         atheist non-atheist
##   Spain     103        1042
# The number of atheists in 2005 is 115 and in 2012 it is 103. Both are greater than 10, so we can assume nearly normal distribution.

## Is there convincing evidence that the United States has seen a change in its atheism index between 2005 and 2012?

us05 <- subset(atheism, nationality == "United States" & year == "2005")
us05$nationality <- as.factor(as.character(us05$nationality))
table(us05$nationality, us05$response)
##                
##                 atheist non-atheist
##   United States      10         992
us12 <- subset(atheism, nationality == "United States" & year == "2012")
us12$nationality <- as.factor(as.character(us12$nationality))
table(us12$nationality, us12$response)
##                
##                 atheist non-atheist
##   United States      50         952
# The number of atheists in 2005 is 10 and in 2012 it is 50.

## If in fact there has been no change in the atheism index in the countries listed in Table 4, in how many of those countries would you expect to detect a change (at a significance level of 0.05) simply by chance?
#Hint: Look in the textbook index under Type 1 error.

#Since we reject a null hypothesis, that means we have made a Type 1 error. With the significant level of 0.05 and 39 countries, so the Type 1 error is about 2. (39*0.05)

##Suppose you're hired by the local government to estimate the proportion of residents that attend a religious service on a weekly basis. According to the guidelines, the estimate must have a margin of error no greater than 1% with 95% confidence. You have no idea what to expect for p. How many people would you have to sample to ensure that you are within the guidelines?

#If the margin of error is 0.01

MOE <- 0.01

z<- 1.96

SE <- MOE/z

p<- .5

n<- (p*(1-p))/SE^2

# Milestone Data


#1. Select a categorical variable of interest, and develop a research question about a population proportion for the variable of interest. Answer the question by conducting a hypothesis test. Use the Prepare-Check-Calculate-Conclude method modeled in OpenIntro Sections 5.3 and 6.1. If your dataset contains the population data, then generate a sample of size n=40, and use the sample to perform the test.

Pokemon <- read.csv("Pokemon.csv")

#Prepare
n = 40
p = .5
con_lvl = .95
#Ho : p =.5
#HA : p > .5
alpha <- 1-(con_lvl/100)

#Pokemon_Legend <- sample(Pokemon$Legendary, n) there is an error when publishing
#p_hat <- mean(Pokemon_Legend)
#mean(p_hat)

#Check

# this is independent because the sample was randomly sample.


#n*p >= 10
#n*(1-p) > 10

# it doesn't satisfy the check

# calculate


#table(Pokemon_Legend)

#inference(y=as.factor(Pokemon_Legend), est = "proportion", success = "TRUE", type = "ht", alternative = "less", method = "theoretical", null=0.5)

#Conclude

# since the p value is less than alpha = .05 we reject the null hypothesis. 

#2. Select two categorical variables of interest (or split one of your categorical variables), and develop a research question about the relationship between the two corresponding population proportions. Answer the question by conducting a hypothesis test. Use the Prepare-Check-Calculate-Conclude method modeled in OpenIntro Section 6.2. If your dataset contains the population data, then generate a sample of size n=40, and use the sample to perform the test.

#Prepare

n = 40
con_lvl = .95
#Ho : The proportion Type1 Fire = The proportion Type2 Poison;  
#HA : The proportion Type1 Fire =! The proportion Type2 Poison;

n = 40
type1 <- sample(Pokemon$Type.1,n);
set.seed(40)
type2 <- sample(Pokemon$Type.2,n);

str(Pokemon)
## 'data.frame':    800 obs. of  13 variables:
##  $ X.        : int  1 2 3 3 4 5 6 6 6 7 ...
##  $ Name      : Factor w/ 800 levels "Abomasnow","AbomasnowMega Abomasnow",..: 81 330 746 747 103 104 100 101 102 666 ...
##  $ Type.1    : Factor w/ 18 levels "Bug","Dark","Dragon",..: 10 10 10 10 7 7 7 7 7 18 ...
##  $ Type.2    : Factor w/ 19 levels "","Bug","Dark",..: 15 15 15 15 1 1 9 4 9 1 ...
##  $ Total     : int  318 405 525 625 309 405 534 634 634 314 ...
##  $ HP        : int  45 60 80 80 39 58 78 78 78 44 ...
##  $ Attack    : int  49 62 82 100 52 64 84 130 104 48 ...
##  $ Defense   : int  49 63 83 123 43 58 78 111 78 65 ...
##  $ Sp..Atk   : int  65 80 100 122 60 80 109 130 159 50 ...
##  $ Sp..Def   : int  65 80 100 120 50 65 85 85 115 64 ...
##  $ Speed     : int  45 60 80 80 65 80 100 100 100 43 ...
##  $ Generation: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Legendary : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
table(type1)
## type1
##      Bug     Dark   Dragon Electric    Fairy Fighting     Fire   Flying 
##        4        1        2        2        2        0        3        0 
##    Ghost    Grass   Ground      Ice   Normal   Poison  Psychic     Rock 
##        1        4        1        0        2        1        4        2 
##    Steel    Water 
##        1       10
Fire1 =  sum(type1 == "Fire"); 
Fire1
## [1] 3
table(type1)
## type1
##      Bug     Dark   Dragon Electric    Fairy Fighting     Fire   Flying 
##        4        1        2        2        2        0        3        0 
##    Ghost    Grass   Ground      Ice   Normal   Poison  Psychic     Rock 
##        1        4        1        0        2        1        4        2 
##    Steel    Water 
##        1       10
p_hat1 = Fire1/n; 
p_hat1
## [1] 0.075
Pois2 =  sum(type2 == "Poison"); 
Pois2
## [1] 3
p_hat2 = Pois2/n; 
p_hat2
## [1] 0.075
table(Pois2)
## Pois2
## 3 
## 1
p_diff = p_hat1-p_hat2; 
p_diff
## [1] 0
#Check

p_pool = (p_hat1+p_hat2)/(n+n); 
p_pool
## [1] 0.001875
# this data is independent because I took a random sample from the data.

n*p_pool >= 10
## [1] FALSE
n*(1-p_pool) >= 10
## [1] TRUE
#DO not meet satisfication

# calculate

SE = sqrt(p_pool*(1-p_pool)*(1/n +1/n)); 

SE
## [1] 0.009673377
z = p_diff/SE; z
## [1] 0
p_value = pnorm(z)*2; 
p_value
## [1] 1
#Conclude

# At the 95% cofidence level, the p_value is greater than alpha = .05 so we fail to reject the null hypothesis.