1. Using the train.csv file from the Titanic dataset, build a two-sided $100(1-\alpha)\%$ confidence interval (using the $1-\alpha/2$ normal quantile) for the survival rate of females, and then a separate confidence interval for the entire sample. Choose the value of $\alpha$ yourself. Using this same confidence level, test the hypothesis that the female survival rate is higher than the survival rate of the entire population. Provide your R code in the discussion. NOTE: we would normally test females versus non-females, but that’s OK for now.
# Load the required libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stats)

# Import the Titanic training data from its CSV file into a data frame
mydata <- read.csv("C:/Users/Ruben/Desktop/Data Analytics/train.csv")

# Share of survivors among the rows whose Sex is "female".
# Survived is averaged directly, so this is a proportion provided the
# column is coded 0/1 -- TODO confirm the coding against the data.
female_survival_rate <- with(mydata, mean(Survived[Sex == "female"]))

# Share of survivors across every row of the data
overall_survival_rate <- mean(mydata[["Survived"]])

# Two-sided confidence level, i.e. 1 - alpha
confidence_level <- 0.95

# Standard-normal quantile shared by both intervals: the tail area alpha
# is split evenly between the two sides
alpha <- 1 - confidence_level
z_quantile <- qnorm(1 - alpha / 2)

# Female group: normal-approximation standard error of a proportion,
# sqrt(p * (1 - p) / n), with n the number of rows where Sex is "female"
n_female <- sum(mydata$Sex == "female")
female_se <- sqrt(female_survival_rate * (1 - female_survival_rate) / n_female)
margin_of_error_female <- z_quantile * female_se

# Interval is the point estimate minus/plus the margin of error
confidence_interval_female <- female_survival_rate +
  c(-1, 1) * margin_of_error_female

# Whole sample: same construction, with n the total number of rows
n_total <- nrow(mydata)
overall_se <- sqrt(overall_survival_rate * (1 - overall_survival_rate) / n_total)
margin_of_error_overall <- z_quantile * overall_se

confidence_interval_overall <- overall_survival_rate +
  c(-1, 1) * margin_of_error_overall

# Report both confidence intervals.
# `print_interval` writes a heading, the confidence level as a percentage,
# and then the lower and upper bound. `heading` is emitted verbatim, so any
# leading newlines used as a separator belong to the heading itself.
print_interval <- function(heading, level, bounds) {
  cat(heading, level * 100, "%):")
  cat("\nLower Bound:", bounds[1])
  cat("\nUpper Bound:", bounds[2])
  invisible(NULL)
}

print_interval("Confidence Interval for Female Survival Rate (",
               confidence_level, confidence_interval_female)
## Confidence Interval for Female Survival Rate ( 95 %):
## Lower Bound: 0.6936462
## Upper Bound: 0.7904303
print_interval("\n\nConfidence Interval for Overall Survival Rate (",
               confidence_level, confidence_interval_overall)
## 
## 
## Confidence Interval for Overall Survival Rate ( 95 %):
## Lower Bound: 0.351906
## Upper Bound: 0.4157707
# Hypothesis test (one-sided, upper tail)
#   H0: female survival rate <= overall survival rate
#   H1: female survival rate >  overall survival rate
# The alternative is directional, so both the critical value and the
# p-value use the upper tail only, at significance level
# alpha = 1 - confidence_level.
#
# NOTE(review): the standard error below adds the two variances as if the
# two estimates were independent, but the female rows are a subset of the
# overall sample. The statistic is therefore an approximation -- confirm
# this is acceptable for the exercise.
test_statistic <- (female_survival_rate - overall_survival_rate) /
  sqrt(female_se^2 + overall_se^2)

# Reject H0 when test_statistic exceeds this upper-tail quantile
critical_value <- qnorm(confidence_level)

# lower.tail = FALSE computes the upper-tail area directly instead of
# 1 - pnorm(z), which rounds to exactly 0 once z is large.
p_value <- pnorm(test_statistic, lower.tail = FALSE)

cat("\n\nHypothesis Test:")
## 
## 
## Hypothesis Test:
cat("\nTest Statistic:", test_statistic)
## 
## Test Statistic: 12.10902
cat("\nCritical Value:", critical_value)
## 
## Critical Value: 1.644854
cat("\nP-value:", p_value)
# The rendered output for this line must be regenerated by re-knitting: the
# previously recorded "P-value: 0" came from the two-sided 1 - pnorm()
# formula rounding to zero. For z = 12.10902 the upper-tail area is tiny
# but non-zero (roughly 5e-34).