# Load the Titanic passenger data; the first row of the CSV supplies the column names
titanic_data <- read_csv(file = "titanic_data.csv", col_names = TRUE)
## Rows: 891 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (7): PassengerId, Survived, Pclass, Age, SibSp, Parch, Fare
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Fare

# Mean ticket price (Fare) for each sex; missing fares are dropped by na.rm
avg_fare_sex <- group_by(titanic_data, Sex) %>%
  summarise(avg_fare = mean(Fare, na.rm = TRUE))

# Display the per-sex averages
avg_fare_sex
## # A tibble: 2 × 2
##   Sex    avg_fare
##   <chr>     <dbl>
## 1 female     44.5
## 2 male       25.5
# Bar chart of the mean fare for each sex (bar height is avg_fare itself)
ggplot(avg_fare_sex, aes(x = Sex, y = avg_fare, fill = Sex)) +
  geom_col() +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Average Fare by Sex",
    x = "Sex",
    y = "Average Fare"
  ) +
  theme_minimal()

# Regress Fare on Sex; with a single categorical predictor the Sex coefficient
# is the difference between the two group means
reg_fare_sex <- lm(Fare ~ Sex, data = titanic_data)
summary(reg_fare_sex)
## 
## Call:
## lm(formula = Fare ~ Sex, data = titanic_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -37.73 -18.40 -16.76   2.70 486.81 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   44.480      2.759  16.122  < 2e-16 ***
## Sexmale      -18.956      3.428  -5.529 4.23e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48.89 on 889 degrees of freedom
## Multiple R-squared:  0.03325,    Adjusted R-squared:  0.03216 
## F-statistic: 30.57 on 1 and 889 DF,  p-value: 4.231e-08
# Average ticket price (Fare) for each passenger class (Pclass)

# Store Pclass as a factor so that grouping and regression treat each class
# as its own category instead of a numeric value
titanic_data[["Pclass"]] <- as.factor(titanic_data[["Pclass"]])

avg_fare_pclass <- group_by(titanic_data, Pclass) %>%
  summarise(avg_fare = mean(Fare, na.rm = TRUE))

# Display the per-class averages
avg_fare_pclass
## # A tibble: 3 × 2
##   Pclass avg_fare
##   <fct>     <dbl>
## 1 1          84.2
## 2 2          20.7
## 3 3          13.7
# Bar chart of the mean fare for each passenger class (bar height is avg_fare)
ggplot(avg_fare_pclass, aes(x = Pclass, y = avg_fare, fill = Pclass)) +
  geom_col() +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Average Fare by Passenger Class",
    x = "Passenger Class",
    y = "Average Fare"
  ) +
  theme_minimal()

# Regress Fare on Pclass; because Pclass is a factor, each coefficient is the
# difference in mean fare between that class and the baseline class
reg_fare_pclass <- lm(Fare ~ Pclass, data = titanic_data)
summary(reg_fare_pclass)
## 
## Call:
## lm(formula = Fare ~ Pclass, data = titanic_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -84.15  -6.93  -5.75   5.03 428.17 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   84.155      2.723   30.91   <2e-16 ***
## Pclass2      -63.493      4.014  -15.82   <2e-16 ***
## Pclass3      -70.479      3.267  -21.57   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40.01 on 888 degrees of freedom
## Multiple R-squared:  0.3531, Adjusted R-squared:  0.3516 
## F-statistic: 242.3 on 2 and 888 DF,  p-value: < 2.2e-16
# Mean ticket price (Fare) for every combination of Sex and Pclass;
# .groups = "drop" returns an ungrouped tibble
avg_fare_sex_pclass <- group_by(titanic_data, Sex, Pclass) %>%
  summarise(avg_fare = mean(Fare, na.rm = TRUE), .groups = "drop")

# Display the averages for each sex/class combination
avg_fare_sex_pclass
## # A tibble: 6 × 3
##   Sex    Pclass avg_fare
##   <chr>  <fct>     <dbl>
## 1 female 1         106. 
## 2 female 2          22.0
## 3 female 3          16.1
## 4 male   1          67.2
## 5 male   2          19.7
## 6 male   3          12.7
# Bar chart of mean fare by passenger class, with one bar per sex placed
# side by side within each class
ggplot(avg_fare_sex_pclass, aes(x = Pclass, y = avg_fare, fill = Sex)) +
  geom_col(position = "dodge") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Average Fare by Sex and Passenger Class",
    x = "Passenger Class",
    y = "Average Fare"
  ) +
  theme_minimal()

Based on the data, there is a notable difference in ticket prices between men and women on the Titanic. On average, women paid $44.48 for their tickets, while men paid $25.52, a difference of almost $19. The linear regression of fare on sex shows a small p-value (4.23e-08), suggesting that the difference is statistically significant. However, the multiple R-squared value of 0.03325 indicates that sex explains only about 3.3% of the variation in ticket prices. This suggests that other factors likely played a much larger role in determining fares.

The data also highlights significant differences in ticket prices based on passenger class. On average, passengers in class 1 paid $84.15 for their tickets, passengers in class 2 paid $20.66, and passengers in class 3 paid $13.68. Passengers in class 2 and 3 paid considerably less than those in class 1. The linear regression of fare on passenger class shows small p-values for both class 2 and class 3, indicating that fares in each of these classes differ significantly from fares in class 1. The multiple R-squared value of 0.3531 suggests that passenger class accounts for 35.3% of the variation in ticket prices. This is a much higher proportion than for sex, meaning that passenger class had a greater impact on ticket prices than the passenger's sex.

Survival

# Survival rate for each sex, computed as the mean of Survived
avg_survival_sex <- group_by(titanic_data, Sex) %>%
  summarise(avg_survival = mean(Survived, na.rm = TRUE))

# Display the per-sex survival rates
avg_survival_sex
## # A tibble: 2 × 2
##   Sex    avg_survival
##   <chr>         <dbl>
## 1 female        0.742
## 2 male          0.189
# Bar chart of the survival rate for each sex (bar height is avg_survival)
ggplot(avg_survival_sex, aes(x = Sex, y = avg_survival, fill = Sex)) +
  geom_col() +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Average Survival Rate by Sex",
    x = "Sex",
    y = "Average Survival Rate"
  ) +
  theme_minimal()

# Regress Survived on Sex; the Sex coefficient is the difference in mean
# Survived between the two groups
# NOTE(review): Survived appears to be coded 0/1, which would make this a
# linear probability model - confirm the coding
reg3 <- lm(Survived ~ Sex, data = titanic_data)
summary(reg3)
## 
## Call:
## lm(formula = Survived ~ Sex, data = titanic_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7420 -0.1889 -0.1889  0.2580  0.8111 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.74204    0.02307   32.17   <2e-16 ***
## Sexmale     -0.55313    0.02866  -19.30   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4087 on 889 degrees of freedom
## Multiple R-squared:  0.2952, Adjusted R-squared:  0.2944 
## F-statistic: 372.4 on 1 and 889 DF,  p-value: < 2.2e-16
# Survival rate for each passenger class, computed as the mean of Survived
avg_survival_pclass <- group_by(titanic_data, Pclass) %>%
  summarise(avg_survival = mean(Survived, na.rm = TRUE))

# Display the per-class survival rates
avg_survival_pclass
## # A tibble: 3 × 2
##   Pclass avg_survival
##   <fct>         <dbl>
## 1 1             0.630
## 2 2             0.473
## 3 3             0.242
# Bar chart of the survival rate for each passenger class
ggplot(avg_survival_pclass, aes(x = Pclass, y = avg_survival, fill = Pclass)) +
  geom_col() +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Average Survival Rate by Passenger Class",
    x = "Passenger Class",
    y = "Average Survival Rate"
  ) +
  theme_minimal()

# Regress Survived on Pclass; each coefficient is the difference in mean
# Survived between that class and the baseline class
# NOTE(review): Survived appears to be coded 0/1, which would make this a
# linear probability model - confirm the coding
reg4 <- lm(Survived ~ Pclass, data = titanic_data)
summary(reg4)
## 
## Call:
## lm(formula = Survived ~ Pclass, data = titanic_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6296 -0.2424 -0.2424  0.3704  0.7576 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.62963    0.03117  20.198  < 2e-16 ***
## Pclass2     -0.15680    0.04596  -3.412 0.000675 ***
## Pclass3     -0.38727    0.03741 -10.353  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4582 on 888 degrees of freedom
## Multiple R-squared:  0.1155, Adjusted R-squared:  0.1135 
## F-statistic: 57.96 on 2 and 888 DF,  p-value: < 2.2e-16
# Survival rate for every combination of Sex and Pclass;
# .groups = "drop" returns an ungrouped tibble
avg_survival_sex_pclass <- group_by(titanic_data, Sex, Pclass) %>%
  summarise(avg_survival = mean(Survived, na.rm = TRUE), .groups = "drop")

# Display the survival rates for each sex/class combination
avg_survival_sex_pclass
## # A tibble: 6 × 3
##   Sex    Pclass avg_survival
##   <chr>  <fct>         <dbl>
## 1 female 1             0.968
## 2 female 2             0.921
## 3 female 3             0.5  
## 4 male   1             0.369
## 5 male   2             0.157
## 6 male   3             0.135
# Bar chart of survival rate by passenger class, with one bar per sex placed
# side by side within each class
ggplot(avg_survival_sex_pclass, aes(x = Pclass, y = avg_survival, fill = Sex)) +
  geom_col(position = "dodge") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Average Survival by Sex and Passenger Class",
    x = "Passenger Class",
    y = "Average Survival"
  ) +
  theme_minimal()

This data reveals a significant difference in survival rates on the Titanic between men and women. On average, the survival rate for women was 74.2%, compared to just 18.9% for men. This is a gap of about 55 percentage points, meaning that women were nearly four times as likely to survive as men. The linear regression of survival on sex shows a small p-value, suggesting that this difference is statistically significant. The multiple R-squared value of 0.2952 indicates that sex explains about 29.5% of the variation in survival.

The data also shows varying survival rates based on passenger class. On average, passengers in class 1 had a survival rate of about 63%, those in class 2 had a survival rate of 47.3%, and passengers in class 3 had the lowest survival rate at 24.2%. The linear regression of survival on passenger class shows small p-values for both class 2 and class 3, indicating that survival rates in each of these classes differ significantly from the rate in class 1. The multiple R-squared value of 0.1155 suggests that passenger class accounts for 11.55% of the variation in survival. Because this is well below the 29.5% explained by sex, sex appears to have had a greater impact on survival than passenger class did. Overall, this data highlights a significant gender and socio-economic divide on the Titanic.