# Start Session
# NOTE: the former `rm(list = ls())` call was removed on purpose. It only
# deletes user-created objects (attached packages and options are left as they
# were), so it does not produce a clean session, and it silently destroys the
# caller's workspace when this script is sourced. Restart R to get a fresh
# session instead. Nothing below depends on an empty workspace.
# Trigger a garbage collection and report current memory usage
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 544014 29.1 1211095 64.7 686460 36.7
## Vcells 993613 7.6 8388608 64.0 1876786 14.4
# Load Packages
library(readxl)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(sp)
library(tidyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ stringr 1.5.1
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(descr)
library(leaflet)
library(ggthemes)
library(writexl)
library(readr)
library(haven)
library(leaflet)
# Read the Titanic passenger records from the CSV file (path is relative to
# the current working directory)
titanic_data <- read.csv(file = "titanic_data.csv")
# Preview the first six rows of the imported data
head(titanic_data, n = 6L)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp Parch
## 1 Braund, Mr. Owen Harris male 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Heikkinen, Miss. Laina female 26 0 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
## 5 Allen, Mr. William Henry male 35 0 0
## 6 Moran, Mr. James male NA 0 0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.2500 S
## 2 PC 17599 71.2833 C85 C
## 3 STON/O2. 3101282 7.9250 S
## 4 113803 53.1000 C123 S
## 5 373450 8.0500 S
## 6 330877 8.4583 Q
# List the variables available in the Titanic data
colnames(titanic_data)
## [1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
## [6] "Age" "SibSp" "Parch" "Ticket" "Fare"
## [11] "Cabin" "Embarked"
Upon analyzing the average fare paid by passengers, it becomes clear that women ($44.48) on the Titanic, on average, paid more for their tickets than men ($25.52). This trend is especially noticeable when examining the fare distribution across different passenger classes. The overall higher average fare for women can likely be attributed to their overrepresentation in higher-class cabins, where ticket prices were significantly more expensive. When breaking down the data by both gender and class, it becomes evident that women in first class ($106.13) paid substantially more for their tickets compared to their male counterparts ($67.23). In contrast, the average fare for women in third class was somewhat comparable to that of men in the same class, which may have minimized the gender-based disparity at lower fare levels.
This finding is further supported by the passenger class distribution, which shows a higher proportion of women in first and second class, while men were more commonly found in the lower third class. This demographic skew likely explains the higher fares for women overall, as they were more frequently booked into cabins with higher ticket prices. Therefore, the disparity in ticket prices is not directly linked to gender-based pricing differences but rather to the structural distribution of women across the Titanic’s passenger classes. These insights underscore how social and economic factors, such as class and gender, intersected on the Titanic, shaping both the cost of travel and the overall experience.
# Comparing the average fare by Gender
# (missing fares are ignored via na.rm = TRUE)
average_fare_by_gender <- titanic_data %>%
  group_by(Sex) %>%
  summarise(average_fare = mean(Fare, na.rm = TRUE))
# Comparing the average fare by Passenger Class
average_fare_by_class <- titanic_data %>%
  group_by(Pclass) %>%
  summarise(average_fare = mean(Fare, na.rm = TRUE))
# Comparing the average fare by Gender and Passenger Class
# `.groups = "drop"` returns an ungrouped tibble, so any later verb applied to
# this table is not silently evaluated per Sex, and summarise() no longer
# emits its "grouped output" message.
average_fare_gender_class <- titanic_data %>%
  group_by(Sex, Pclass) %>%
  summarise(average_fare = mean(Fare, na.rm = TRUE), .groups = "drop")
# Viewing the results
average_fare_by_gender
## # A tibble: 2 × 2
## Sex average_fare
## <chr> <dbl>
## 1 female 44.5
## 2 male 25.5
average_fare_by_class
## # A tibble: 3 × 2
## Pclass average_fare
## <int> <dbl>
## 1 1 84.2
## 2 2 20.7
## 3 3 13.7
average_fare_gender_class
## # A tibble: 6 × 3
## Sex Pclass average_fare
## <chr> <int> <dbl>
## 1 female 1 106.
## 2 female 2 22.0
## 3 female 3 16.1
## 4 male 1 67.2
## 5 male 2 19.7
## 6 male 3 12.7
# Investigating why women paid more:
# Check the number of men and women in each class and view the data
# tally() leaves the result grouped by Sex; ungroup() hands back a plain
# tibble so later operations on the counts are not computed per Sex.
passenger_counts <- titanic_data %>%
  group_by(Sex, Pclass) %>%
  tally() %>%
  ungroup()
passenger_counts
## # A tibble: 6 × 3
## Sex Pclass n
## <chr> <int> <int>
## 1 female 1 94
## 2 female 2 76
## 3 female 3 144
## 4 male 1 122
## 5 male 2 108
## 6 male 3 347
Upon analyzing the average survival chances of passengers, it is evident that gender played a significant role in survival rates on the Titanic. The average survival rate for women was notably higher than for men, with women having a survival rate of approximately 74%, compared to only about 19% for men. This disparity highlights the well-documented phenomenon where women and children were given priority during the lifeboat loading, which increased their chances of survival.
When examining survival rates across different passenger classes, a clear pattern emerges. Passengers in first class had the highest survival rate, around 63%, while survival rates decreased as passenger class lowered. Second-class passengers had a survival rate of approximately 47%, and third-class passengers had the lowest survival rate at around 24%. This suggests that not only gender but also passenger class played a critical role in survival, with higher-class passengers having better access to lifeboats and more favorable treatment during the evacuation.
Further investigation into the combined effects of gender and class reveals that women in first class had the highest survival rates, with nearly 97% surviving. In contrast, women in third class had a significantly lower survival rate, around 50%. Men in first class had a survival rate of about 37%, while men in third class had only a 14% survival rate. These findings underscore the intersection of gender and class, illustrating that women, particularly those in first class, had the best chances of survival, while men—especially those in third class—had much lower survival rates.
# Comparing the average survival chance by gender
# (the mean of `Survived` within a group is that group's survival rate)
average_survival_gender <- titanic_data %>%
  group_by(Sex) %>%
  summarise(average_survival = mean(Survived, na.rm = TRUE))
# Comparing the average survival chance by class
average_survival_class <- titanic_data %>%
  group_by(Pclass) %>%
  summarise(average_survival = mean(Survived, na.rm = TRUE))
# Comparing the average survival chance by gender and class
# `.groups = "drop"` returns an ungrouped tibble, so any later verb applied to
# this table is not silently evaluated per Sex, and summarise() no longer
# emits its "grouped output" message.
average_survival_gender_class <- titanic_data %>%
  group_by(Sex, Pclass) %>%
  summarise(average_survival = mean(Survived, na.rm = TRUE), .groups = "drop")
# Viewing the results:
average_survival_gender
## # A tibble: 2 × 2
## Sex average_survival
## <chr> <dbl>
## 1 female 0.742
## 2 male 0.189
average_survival_class
## # A tibble: 3 × 2
## Pclass average_survival
## <int> <dbl>
## 1 1 0.630
## 2 2 0.473
## 3 3 0.242
average_survival_gender_class
## # A tibble: 6 × 3
## Sex Pclass average_survival
## <chr> <int> <dbl>
## 1 female 1 0.968
## 2 female 2 0.921
## 3 female 3 0.5
## 4 male 1 0.369
## 5 male 2 0.157
## 6 male 3 0.135
In conclusion, the analysis of ticket fares and survival chances on the Titanic reveals significant insights into the influence of gender and class on passengers’ experiences. Women, on average, paid more for their tickets primarily due to their over-representation in higher-class cabins, where fares were significantly more expensive. While the fare disparity between men and women was most pronounced in first class, it was less noticeable in third class, where ticket prices were more similar across genders. This underscores how class, rather than gender-based pricing, contributed to the higher ticket costs for women.
When examining survival rates, gender and class once again played pivotal roles. Women had a much higher chance of survival than men, with women in first class having the best survival rates. However, survival rates for women in third class were lower, reflecting the intersection of class and gender in the evacuation process. Similarly, passengers in higher classes had better chances of survival overall, further emphasizing the influence of socio-economic factors on survival outcomes. The combined analysis of ticket prices and survival rates highlights how gender and class intersected to shape passengers’ experiences on the Titanic, illustrating how deeply entrenched social structures influenced both the cost of travel and the chances of survival.
# Bring the built-in mtcars data set into the workspace
data("mtcars")
# Preview the first six rows
head(mtcars, n = 6L)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# List the variables recorded for each car
colnames(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
# Descriptive statistics (min, quartiles, mean, max) for every variable
summary(object = mtcars)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
The mtcars (Motor Trend Car Road Tests) dataset is a built-in R dataset extracted from the 1974 *Motor Trend* US magazine.
In my analysis of the mtcars dataset, I set out to understand what factors drive fuel efficiency (measured in MPG) in 1973-1974 model cars. My initial focus was on the relationship between a car’s weight and its MPG. The scatter plot I created—with weight on the x-axis (in 1000 lbs) and MPG on the y-axis—revealed a clear negative relationship, and the orange regression line confirmed that as a car’s weight increases, its fuel efficiency tends to drop. This finding makes intuitive sense since heavier vehicles require more energy to move, which naturally reduces their miles per gallon.
To dive deeper into other influences on MPG, I recoded the transmission variable from a numeric indicator (0 for Automatic and 1 for Manual) into a factor with descriptive labels, enhancing the clarity of my analysis. I then built a linear regression model to predict MPG using weight, horsepower, and transmission type. The model’s summary showed that both weight and horsepower have significant negative effects on MPG, indicating that heavier cars and those with higher horsepower tend to be less fuel-efficient. Although the coefficient for manual transmissions was positive (about 2.08 MPG), suggesting a slight improvement in fuel efficiency, it was not statistically significant (p ≈ 0.14), so transmission type plays a relatively minor role compared to weight and horsepower.
I further illustrated these findings with a boxplot comparing MPG across automatic and manual transmission vehicles, which highlighted the distribution differences between the two groups. Additionally, by identifying the car with the highest MPG—namely, the Toyota Corolla with 33.9 mpg—I spotlighted an exceptional case in fuel economy within the dataset. Overall, these analyses tell a compelling story: vehicle weight and horsepower are the dominant factors influencing fuel efficiency, while transmission type plays a secondary role. This view deepens my understanding of automotive performance and the complex interplay of design and engineering choices in these classic cars.
The description of the variables in this dataset can be found in R's built-in documentation (run `?mtcars` in the console).
# Visualizing the relationship between MPG and car weight:
# a scatter plot of the raw points with a fitted linear-model line in orange
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "orange") +
  ggtitle("Relationship between MPG and Car Weight") +
  xlab("Weight (1000 lbs)") +
  ylab("Miles per Gallon")
## `geom_smooth()` using formula = 'y ~ x'
# Recode the transmission indicator `am` (0/1) as a labelled factor:
# the first level is labelled "Automatic" and the second "Manual"
mtcars[["am"]] <- factor(mtcars[["am"]], labels = c("Automatic", "Manual"))
# Linear regression of MPG on weight, horsepower, and transmission type,
# followed by the coefficient table and fit statistics
lmmpgwthpam <- lm(formula = mpg ~ wt + hp + am, data = mtcars)
summary(lmmpgwthpam)
##
## Call:
## lm(formula = mpg ~ wt + hp + am, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4221 -1.7924 -0.3788 1.2249 5.5317
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.002875 2.642659 12.867 2.82e-13 ***
## wt -2.878575 0.904971 -3.181 0.003574 **
## hp -0.037479 0.009605 -3.902 0.000546 ***
## amManual 2.083710 1.376420 1.514 0.141268
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.538 on 28 degrees of freedom
## Multiple R-squared: 0.8399, Adjusted R-squared: 0.8227
## F-statistic: 48.96 on 3 and 28 DF, p-value: 2.908e-11
# Box plots of MPG for each value of `am`, with the fill colour also mapped
# to `am`
ggplot(data = mtcars, mapping = aes(x = am, y = mpg, fill = am)) +
  geom_boxplot() +
  ggtitle("MPG by Transmission Type") +
  xlab("Transmission Type") +
  ylab("Miles per Gallon")
# Extract the row of the car with the largest MPG value
# (which.max() picks the first position if several rows share the maximum)
best_mpg <- mtcars[with(mtcars, which.max(mpg)), ]
print(best_mpg)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.9 1 Manual 4 1