library(readxl)
# Read the Excel workbook at the path below into `mydata`.
mydata <- readxl::read_excel(
  path = "C:/Users/mattv/Desktop/ADEC 7301 Assignments/Week 1/Titanic/train.xlsx"
)
# Significance level used for the confidence intervals below.
alpha <- 0.05
# Two-sided standard-normal critical value, z at probability 1 - alpha / 2.
Zc <- qnorm(p = 1 - alpha / 2, mean = 0, sd = 1)
I loaded the dplyr package and used the filter function to isolate specific rows (for example, the female passengers).
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the rows whose Sex is "female", then count them.
female_data <- filter(mydata, Sex == "female")
n_female <- dim(female_data)[1L]
c("Number of total female passengers", n_female)
## [1] "Number of total female passengers" "314"
# Sum the Survived column over the female rows to get the survivor count.
female_survived <- sum(female_data[["Survived"]])
c("Number of female passengers who survived", female_survived)
## [1] "Number of female passengers who survived"
## [2] "233"
# Sample proportion of female passengers who survived.
phat_female <- female_survived / n_female
c("phat_female", phat_female)
## [1] "phat_female" "0.74203821656051"
# Row count of the full data set, reported as the passenger total.
Total_Passangers <- dim(mydata)[1L]
c("Total number of passangers", Total_Passangers)
## [1] "Total number of passangers" "891"
# Count the survivors in the full data set.
# Survived is numeric (it is summed and averaged elsewhere in this script),
# so compare it with the number 1 instead of the string "1", which only
# worked through implicit type coercion. Counting with sum() also avoids
# building a filtered copy of the data just to take its row count.
# na.rm = TRUE mirrors filter(), which drops rows where the test is NA.
Total_Survived <- sum(mydata$Survived == 1, na.rm = TRUE)
c("Total number of Survivors", Total_Survived)
## [1] "Total number of Survivors" "342"
# Overall sample survival proportion: the mean of the Survived column.
phat_total <- mean(mydata[["Survived"]])
c("phat_total", phat_total)
## [1] "phat_total" "0.383838383838384"
# Standard error of the female survival proportion: sqrt(p * (1 - p) / n).
Se_female <- sqrt((phat_female * (1 - phat_female)) / n_female)
c("Se_female", Se_female)
## [1] "Se_female" "0.0246902790203467"
# Standard error of the overall survival proportion: sqrt(p * (1 - p) / n),
# with n taken as the row count of the full data set.
Se_total <- sqrt((phat_total * (1 - phat_total)) / nrow(mydata))
c("Se_total", Se_total)
## [1] "Se_total" "0.01629231015825"
# Normal-approximation interval for the female survival proportion:
# point estimate minus/plus the critical value times the standard error.
lower_female <- phat_female - Zc * Se_female
upper_female <- phat_female + Zc * Se_female
c(lower_female, phat_female, upper_female)
## [1] 0.6936462 0.7420382 0.7904303
# One-sample proportion test for the female survival rate
# (233 survivors out of 314 female passengers).
# The confidence level is 0.95 so that it agrees with alpha = 0.05, with the
# manual interval computed from Zc, and with the test on the full data set;
# the previous value of 0.98 made the two female intervals non-comparable.
# NOTE: re-knit so the printed output below reflects this level.
female_prop_test <- prop.test(x = 233, n = 314, conf.level = 0.95)
female_prop_test
##
## 1-sample proportions test with continuity correction
##
## data: 233 out of 314, null probability 0.5
## X-squared = 72.615, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 98 percent confidence interval:
## 0.6791630 0.7964867
## sample estimates:
## p
## 0.7420382
# Normal-approximation interval for the overall survival proportion:
# point estimate minus/plus the critical value times the standard error.
lower_total <- phat_total - Zc * Se_total
upper_total <- phat_total + Zc * Se_total
c(lower_total, phat_total, upper_total)
## [1] 0.3519060 0.3838384 0.4157707
# One-sample proportion test for the overall survival rate
# (342 survivors out of 891 passengers), reporting a 95% confidence interval.
total_prop_test <- prop.test(
  x = 342,
  n = 891,
  conf.level = 0.95
)
total_prop_test
##
## 1-sample proportions test with continuity correction
##
## data: 342 out of 891, null probability 0.5
## X-squared = 47.627, df = 1, p-value = 5.154e-12
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.3519194 0.4167722
## sample estimates:
## p
## 0.3838384
Here I ran a two-sample prop.test to test the hypothesis that the female survival rate is higher than the survival rate of the entire population:
H0: There is no meaningful difference between the
survival rate of females and the total survival rate.
HA: The survival rate of females is higher than the
total survival rate.
The prop.test returned a p-value that is less than alpha.
Given that outcome, we can reject H0
and conclude that the female survival rate is higher than
the total survival rate.
# Two-sample proportion test: female survival (233 of 314) against overall
# survival (342 of 891).
# The stated alternative is that the female rate is HIGHER, so the test must
# be one-sided; the default alternative is "two.sided", which tests only for
# a difference in either direction.
# NOTE(review): the female passengers are also counted in the 891 total, so
# the two samples overlap; comparing females with males would give
# independent groups -- confirm which comparison is intended.
# NOTE: re-knit so the printed output below reflects the one-sided test.
hypothesis_test <- prop.test(
  x = c(233, 342),
  n = c(314, 891),
  alternative = "greater"
)
hypothesis_test
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(233, 342) out of c(314, 891)
## X-squared = 117.98, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.2980682 0.4183315
## sample estimates:
## prop 1 prop 2
## 0.7420382 0.3838384