# Difference in Population Proportion?
# A difference in population proportions tells us how two groups differ in the share of their members that have the same characteristic. For example, a survey could ask both vegans and non-vegans whether they hold opinions about the cruelty of harming animals, and the two proportions could then be compared.

# This is interesting because we can compare the two type slots of Pokemon and see whether the Pokemon designers were biased in deciding which type goes in which slot.

# Question: "Is the population proportion of Water Pokemon the same in type 1 and type 2?"

#Prepare

# H_0 : The population proportion of Water Pokemon in type 1 = The population proportion of Water Pokemon in type 2;  
# H_A : The population proportion of Water Pokemon in type 1 != The population proportion of Water Pokemon in type 2;
# Read the Pokemon data; the Type.1 and Type.2 columns are used below.
Pokemon <- read.csv("Pokemon.csv")

# Sample size per group, confidence level and the matching significance level.
n <- 800
con_lvl <- 0.95
alpha <- 1 - con_lvl

# Draw n values from each type column (two separate draws).
Poke_type1 <- sample(Pokemon[["Type.1"]], size = n)
Poke_type2 <- sample(Pokemon[["Type.2"]], size = n)

# Frequency of every type in each sample (recorded output kept below).
table(Poke_type1)
## Poke_type1
##      Bug     Dark   Dragon Electric    Fairy Fighting     Fire   Flying 
##       69       31       32       44       17       27       52        4 
##    Ghost    Grass   Ground      Ice   Normal   Poison  Psychic     Rock 
##       32       70       32       24       98       28       57       44 
##    Steel    Water 
##       27      112
table(Poke_type2)
## Poke_type2
##               Bug     Dark   Dragon Electric    Fairy Fighting     Fire 
##      386        3       20       18        6       23       26       12 
##   Flying    Ghost    Grass   Ground      Ice   Normal   Poison  Psychic 
##       97       14       25       35       14        4       34       33 
##     Rock    Steel    Water 
##       14       22       14

# Number of Water entries in each sample.
Poke_water1 <- sum(Poke_type1 == "Water")
Poke_water2 <- sum(Poke_type2 == "Water")

# Sample proportions of Water and their difference (type 1 minus type 2).
p_hat_water1 <- Poke_water1 / n
p_hat_water2 <- Poke_water2 / n
p_diff_water <- p_hat_water1 - p_hat_water2

#Check

# The observations are independent because they are randomly sampled from the known data.

# Pooled proportion under H_0: total number of Water entries across both
# samples divided by the total sample size. The counts must be pooled here;
# dividing the sum of the two sample proportions by n + n would make the
# estimate roughly n times too small.
p_pool_water <- (Poke_water1 + Poke_water2) / (n + n)

# Success-failure condition: each sample needs at least 10 expected
# successes and at least 10 expected failures for the normal approximation.
n * p_pool_water >= 10
## [1] TRUE
n * (1 - p_pool_water) >= 10
## [1] TRUE
# With the counts recorded above (112 and 14 Water out of 800 each) the
# pooled proportion is 126 / 1600 = 0.07875, i.e. 63 expected successes per
# sample, so both conditions hold. Blank Type.2 entries are simply counted
# as non-Water by the `== "Water"` comparison.

#Calculate

# Standard error of the difference under H_0, using the pooled proportion
# for both groups (both samples have size n).
SE <- sqrt(p_pool_water * (1 - p_pool_water) * (1 / n + 1 / n))

# Test statistic: observed difference in sample proportions in SE units.
z <- p_diff_water / SE

# Two-sided p-value: twice the tail area beyond |z|. Using pnorm(z) * 2
# directly is only valid for z <= 0 and yields values above 1 when z > 0.
p_Value <- 2 * pnorm(-abs(z))


# Now we repeat the same process as above, but change the sample size to 40.

# Smaller sample size per group; confidence and significance levels as before.
n <- 40
con_lvl <- 0.95
alpha <- 1 - con_lvl

# Draw n values from each type column (two separate draws).
Poke_type1 <- sample(Pokemon[["Type.1"]], size = n)
Poke_type2 <- sample(Pokemon[["Type.2"]], size = n)

# Frequency of every type in each sample (recorded output kept below).
table(Poke_type1)
## Poke_type1
##      Bug     Dark   Dragon Electric    Fairy Fighting     Fire   Flying 
##        3        0        0        4        1        2        3        0 
##    Ghost    Grass   Ground      Ice   Normal   Poison  Psychic     Rock 
##        2        3        1        1        5        3        2        4 
##    Steel    Water 
##        2        4
table(Poke_type2)
## Poke_type2
##               Bug     Dark   Dragon Electric    Fairy Fighting     Fire 
##       17        0        1        1        0        2        0        1 
##   Flying    Ghost    Grass   Ground      Ice   Normal   Poison  Psychic 
##        5        0        1        1        1        2        2        2 
##     Rock    Steel    Water 
##        1        1        2

# Number of Water entries in each sample.
Poke_water1 <- sum(Poke_type1 == "Water")
Poke_water2 <- sum(Poke_type2 == "Water")

# Sample proportions of Water and their difference (type 1 minus type 2).
p_hat_water1 <- Poke_water1 / n
p_hat_water2 <- Poke_water2 / n
p_diff_water <- p_hat_water1 - p_hat_water2

#Check

# The observations are independent because they are randomly sampled from the known data.

# Pooled proportion under H_0: total number of Water entries across both
# samples divided by the total sample size. The counts must be pooled here;
# dividing the sum of the two sample proportions by n + n would make the
# estimate roughly n times too small.
p_pool_water <- (Poke_water1 + Poke_water2) / (n + n)

# Success-failure condition: each sample needs at least 10 expected
# successes and at least 10 expected failures for the normal approximation.
n * p_pool_water >= 10
## [1] FALSE
n * (1 - p_pool_water) >= 10
## [1] TRUE
# In the run recorded above (4 and 2 Water out of 40 each) the pooled
# proportion is 6 / 80 = 0.075, i.e. only 3 expected successes per sample.
# The success condition fails because the sample is too small, so the
# normal approximation is unreliable and the z-test result for n = 40
# should be read with caution.

#Calculate

# Standard error of the difference under H_0, using the pooled proportion
# for both groups (both samples have size n).
SE <- sqrt(p_pool_water * (1 - p_pool_water) * (1 / n + 1 / n))

# Test statistic: observed difference in sample proportions in SE units.
z <- p_diff_water / SE

# Two-sided p-value: twice the tail area beyond |z|. Using pnorm(z) * 2
# directly is only valid for z <= 0 and yields values above 1 when z > 0.
p_Value <- 2 * pnorm(-abs(z))

#Conclusion

# In both cases (n = 40 and n = 800), the p-value for the difference in population proportions is less than alpha.