Import & Load Data

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(conflicted)

Analyzing Titanic Data

# Read the Titanic passenger file (column-type message suppressed), then
# show a compact column-by-column overview of what was loaded.
titanic <- "~/Documents/Data 712/titanic_data.csv" %>%
  read_csv(show_col_types = FALSE)
titanic %>% glimpse()
## Rows: 891
## Columns: 12
## $ PassengerId <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ Survived    <dbl> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1…
## $ Pclass      <dbl> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3…
## $ Name        <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl…
## $ Sex         <chr> "male", "female", "female", "female", "male", "male", "mal…
## $ Age         <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, …
## $ SibSp       <dbl> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0…
## $ Parch       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0…
## $ Ticket      <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37…
## $ Fare        <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,…
## $ Cabin       <chr> NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, "G6", "C…
## $ Embarked    <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"…
# Preview the first six passengers.
titanic %>% head(n = 6)
## # A tibble: 6 × 12
##   PassengerId Survived Pclass Name    Sex     Age SibSp Parch Ticket  Fare Cabin
##         <dbl>    <dbl>  <dbl> <chr>   <chr> <dbl> <dbl> <dbl> <chr>  <dbl> <chr>
## 1           1        0      3 Braund… male     22     1     0 A/5 2…  7.25 <NA> 
## 2           2        1      1 Cuming… fema…    38     1     0 PC 17… 71.3  C85  
## 3           3        1      3 Heikki… fema…    26     0     0 STON/…  7.92 <NA> 
## 4           4        1      1 Futrel… fema…    35     1     0 113803 53.1  C123 
## 5           5        0      3 Allen,… male     35     0     0 373450  8.05 <NA> 
## 6           6        0      3 Moran,… male     NA     0     0 330877  8.46 <NA> 
## # ℹ 1 more variable: Embarked <chr>

Compare Average Fare Between Men and Women

# Mean ticket fare for each sex (missing fares are ignored).
# Stored under its own name because the passenger-class summary further down
# also assigns to `Average_Fare` and would otherwise overwrite this result.
Average_Fare_Sex <- titanic %>%
  group_by(Sex) %>%
  summarize(Average_Fare = mean(Fare, na.rm = TRUE))

# Assignment alone produces no output; print the table so the comparison
# discussed in the findings is visible in the report.
print(Average_Fare_Sex)

Compare Average Fare Across Passenger Classes

# Mean ticket fare for each passenger class (missing fares are ignored).
Average_Fare <- titanic %>%
  group_by(Pclass) %>%
  summarize(Average_Fare = mean(Fare, na.rm = TRUE))

# Assignment alone produces no output; print the table so the comparison
# discussed in the findings is visible in the report.
print(Average_Fare)

Compare Average Survival Rate Between Men and Women

# Survival rate for each sex. Survived appears as 0/1 in the glimpse output,
# so its mean is the proportion of passengers who survived.
# Stored under its own name because the passenger-class summary further down
# also assigns to `Average_Survival` and would otherwise overwrite this result.
Average_Survival_Sex <- titanic %>%
  group_by(Sex) %>%
  summarize(SurvivalRate = mean(Survived, na.rm = TRUE))

# Assignment alone produces no output; print the table so the rates
# discussed in the findings are visible in the report.
print(Average_Survival_Sex)

Compare Average Survival Rate Across Passenger Classes

# Survival rate for each passenger class. Survived appears as 0/1 in the
# glimpse output, so its mean is the proportion of passengers who survived.
Average_Survival <- titanic %>%
  group_by(Pclass) %>%
  summarize(SurvivalRate = mean(Survived, na.rm = TRUE))

# Assignment alone produces no output; print the table so the percentages
# quoted in the findings can be checked against it.
print(Average_Survival)

Visualization

Average Fare by Gender

# Bar chart of mean fare per sex: the "summary" stat applies mean() to Fare
# within each bar, so no pre-aggregated table is needed.
titanic %>%
  ggplot(mapping = aes(x = Sex, y = Fare, fill = Sex)) +
  geom_bar(stat = "summary", fun = "mean") +
  ggtitle("Average Fare by Gender") +
  ylab("Average Fare")

Average Fare by Passenger Class

# Bar chart of mean fare per passenger class. Pclass is numeric in the data,
# so it is wrapped in factor() to get one discrete bar and colour per class.
ggplot(titanic, aes(x = factor(Pclass), y = Fare, fill = factor(Pclass))) +
  geom_bar(stat = "summary", fun = "mean") +
  labs(
    title = "Average Fare by Passenger Class",
    x = "Passenger Class",
    y = "Average Fare",
    # Without an explicit label the legend is titled "factor(Pclass)".
    fill = "Passenger Class"
  )

Survival Rate by Gender

# 100% stacked bars: within each sex, the share of passengers who did and did
# not survive (position = "fill" rescales every bar to a height of 1).
# Survived is recoded from 0/1 to No/Yes so the legend is readable; this
# assumes Survived only takes the values 0 and 1, as in the glimpse output.
ggplot(
  titanic,
  aes(
    x = Sex,
    fill = factor(Survived, levels = c(0, 1), labels = c("No", "Yes"))
  )
) +
  geom_bar(position = "fill") +
  labs(
    title = "Survival Rate by Gender",
    y = "Proportion Survived",
    # Without an explicit label the legend is titled "factor(Survived)".
    fill = "Survived"
  )

Survival Rate by Passenger Class

# 100% stacked bars: within each passenger class, the share of passengers who
# did and did not survive (position = "fill" rescales every bar to height 1).
# Survived is recoded from 0/1 to No/Yes so the legend is readable; this
# assumes Survived only takes the values 0 and 1, as in the glimpse output.
ggplot(
  titanic,
  aes(
    x = factor(Pclass),
    fill = factor(Survived, levels = c(0, 1), labels = c("No", "Yes"))
  )
) +
  geom_bar(position = "fill") +
  labs(
    title = "Survival Rate by Passenger Class",
    x = "Passenger Class",
    y = "Proportion Survived",
    # Without an explicit label the legend is titled "factor(Survived)".
    fill = "Survived"
  )

Titanic Data Findings

On average, women paid higher fares than men. This could be due to the distribution of women across different passenger classes, as well as social roles at the time that might have led to women traveling in higher classes where fares were higher.

On average, First Class passengers paid more than Second and Third Class passengers. This is consistent with the luxurious accommodations offered in First Class, which were significantly more expensive than those in lower classes.

Women had a higher survival rate than men, which may be attributed to the “women and children first” evacuation policy that was in effect during the disaster. This suggests that women were prioritized for lifeboats and safety.

First Class passengers had a higher survival rate compared to Second and Third Class passengers. This may be due to better access to lifeboats and preferential treatment during the evacuation, as well as their proximity to areas of the ship that were more easily accessible during the emergency.

Passenger class was a significant determinant of survival, with First Class passengers having the highest survival rate (approximately 63%), followed by Second Class (approximately 47%) and Third Class passengers (approximately 24%). This illustrates the role of socioeconomic status in determining survival chances.

Analyzing Air Quality Data

# Load the built-in airquality data set, convert it to a tibble,
# and list its columns.
data(list = "airquality")
airquality <- airquality %>% as_tibble()
airquality %>% glimpse()
## Rows: 153
## Columns: 6
## $ Ozone   <int> 41, 36, 12, 18, NA, 28, 23, 19, 8, NA, 7, 16, 11, 14, 18, 14, …
## $ Solar.R <int> 190, 118, 149, 313, NA, NA, 299, 99, 19, 194, NA, 256, 290, 27…
## $ Wind    <dbl> 7.4, 8.0, 12.6, 11.5, 14.3, 14.9, 8.6, 13.8, 20.1, 8.6, 6.9, 9…
## $ Temp    <int> 67, 72, 74, 62, 56, 66, 65, 59, 61, 69, 74, 69, 66, 68, 58, 64…
## $ Month   <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ Day     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
# Count missing values in the two columns used by the analysis.
sum(is.na(airquality[["Ozone"]]))
## [1] 37
sum(is.na(airquality[["Temp"]]))
## [1] 0
# Keep only the rows where both Ozone and Temp are present.
airquality <- drop_na(airquality, Ozone, Temp)

Compare average ozone levels by month and temperature

Average ozone levels by month

# Mean ozone reading per month (NA readings ignored); one row per Month.
Average_Ozone_Month <- summarize(
  group_by(airquality, Month),
  Average_Ozone = mean(Ozone, na.rm = TRUE)
)

Create temperature groups (Low: below 70, Medium: 70–79, High: 80 and above)

# Bin each observation's temperature into Low / Medium / High.
# cut() with right = FALSE uses left-closed intervals, giving the same bins
# as before:
#   Low: Temp < 70, Medium: 70 <= Temp < 80, High: Temp >= 80.
# The result is a factor whose levels follow that order, so grouped summaries
# and plot legends list Low, Medium, High instead of the alphabetical
# High, Low, Medium that a character column produces.
airquality <- airquality %>%
  mutate(Temp_Group = cut(
    Temp,
    breaks = c(-Inf, 70, 80, Inf),
    labels = c("Low", "Medium", "High"),
    right = FALSE
  ))

Average ozone by temperature group

# Mean ozone reading per temperature group (NA readings ignored).
Average_Ozone_Temp <- airquality %>%
  group_by(Temp_Group) %>%
  summarize(Average_Ozone = mean(Ozone, na.rm = TRUE))

# This table is not used by any plot below, so print it here; otherwise the
# summary is computed but never shown.
print(Average_Ozone_Temp)

Visualization

Ozone by month - Box Plot

# Distribution of ozone readings per month; Month is converted to a factor
# so each month gets its own box.
airquality %>%
  ggplot(mapping = aes(x = factor(Month), y = Ozone)) +
  geom_boxplot(fill = "lightblue") +
  ggtitle("Ozone Levels by Month") +
  xlab("Month") +
  ylab("Ozone")

Ozone by month - Bar Graph (Average Ozone Levels)

# Bar chart of the pre-computed monthly means. geom_col() draws bar heights
# directly from the y values (the idiomatic form of geom_bar(stat = "identity")).
ggplot(
  Average_Ozone_Month,
  aes(x = factor(Month), y = Average_Ozone, fill = factor(Month))
) +
  geom_col() +
  labs(
    title = "Average Ozone Levels by Month",
    x = "Month",
    y = "Average Ozone",
    # Without an explicit label the legend is titled "factor(Month)".
    fill = "Month"
  )

Ozone by temperature group - Scatter Plot

# Scatter plot of ozone against temperature, one point per observation,
# coloured by the Temp_Group category.
airquality %>%
  ggplot(mapping = aes(x = Temp, y = Ozone, colour = Temp_Group)) +
  geom_point(size = 2) +
  ggtitle("Ozone Levels by Temperature") +
  xlab("Temperature") +
  ylab("Ozone")

Air Quality Data Findings

Our analysis of the airquality dataset reveals that ozone levels fluctuate throughout the months, peaking in July and August and reaching their lowest point in May. This pattern suggests that summer weather conditions — such as increased sunlight and higher temperatures — play a significant role in ozone formation.

Furthermore, our examination of the relationship between temperature and ozone levels indicates a positive correlation, meaning that warmer days tend to experience higher ozone pollution. This aligns with expectations, as ozone forms more readily in warm, sunny environments. These findings highlight the importance of monitoring air quality, especially during the summer months when ozone pollution peaks, as elevated ozone levels can pose both environmental and health risks.