library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(conflicted)
# Read the Titanic passenger CSV (column-type message suppressed), then print
# a compact column-by-column overview of what was loaded.
titanic <- "~/Documents/Data 712/titanic_data.csv" %>%
  read_csv(show_col_types = FALSE)
titanic %>% glimpse()
## Rows: 891
## Columns: 12
## $ PassengerId <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ Survived <dbl> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1…
## $ Pclass <dbl> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3…
## $ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl…
## $ Sex <chr> "male", "female", "female", "female", "male", "male", "mal…
## $ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, …
## $ SibSp <dbl> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0…
## $ Parch <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0…
## $ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37…
## $ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,…
## $ Cabin <chr> NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, "G6", "C…
## $ Embarked <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"…
# Preview the first rows of the passenger data.
titanic %>% head()
## # A tibble: 6 × 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
## 1 1 0 3 Braund… male 22 1 0 A/5 2… 7.25 <NA>
## 2 2 1 1 Cuming… fema… 38 1 0 PC 17… 71.3 C85
## 3 3 1 3 Heikki… fema… 26 0 0 STON/… 7.92 <NA>
## 4 4 1 1 Futrel… fema… 35 1 0 113803 53.1 C123
## 5 5 0 3 Allen,… male 35 0 0 373450 8.05 <NA>
## 6 6 0 3 Moran,… male NA 0 0 330877 8.46 <NA>
## # ℹ 1 more variable: Embarked <chr>
# Summary tables: mean fare and survival rate, by sex and by passenger class.
# Each table is stored under its own name so that no result overwrites another.
# `Survived` is coded 0/1, so its mean is the proportion of survivors.

# Mean fare paid, by sex.
Average_Fare_Sex <- titanic %>%
  group_by(Sex) %>%
  summarize(Average_Fare = mean(Fare, na.rm = TRUE))

# Mean fare paid, by passenger class.
Average_Fare_Class <- titanic %>%
  group_by(Pclass) %>%
  summarize(Average_Fare = mean(Fare, na.rm = TRUE))

# Survival rate, by sex.
Average_Survival_Sex <- titanic %>%
  group_by(Sex) %>%
  summarize(SurvivalRate = mean(Survived, na.rm = TRUE))

# Survival rate, by passenger class.
Average_Survival_Class <- titanic %>%
  group_by(Pclass) %>%
  summarize(SurvivalRate = mean(Survived, na.rm = TRUE))

# `Average_Fare` and `Average_Survival` hold the by-class tables, so any code
# that reads those names continues to see the same values.
Average_Fare <- Average_Fare_Class
Average_Survival <- Average_Survival_Class
# Bar chart: mean fare for each sex (the mean is computed by the summary stat).
titanic %>%
  ggplot(aes(x = Sex, y = Fare, fill = Sex)) +
  geom_bar(stat = "summary", fun = mean) +
  ggtitle("Average Fare by Gender") +
  ylab("Average Fare")

# Bar chart: mean fare for each passenger class, with class treated as discrete.
titanic %>%
  ggplot(aes(x = factor(Pclass), y = Fare, fill = factor(Pclass))) +
  geom_bar(stat = "summary", fun = mean) +
  ggtitle("Average Fare by Passenger Class") +
  xlab("Passenger Class") +
  ylab("Average Fare")
# Proportion bars: share of each sex by survival outcome (bars scaled to 1).
titanic %>%
  ggplot(aes(x = Sex, fill = factor(Survived))) +
  geom_bar(position = position_fill()) +
  ggtitle("Survival Rate by Gender") +
  ylab("Proportion Survived")

# Proportion bars: share of each passenger class by survival outcome.
titanic %>%
  ggplot(aes(x = factor(Pclass), fill = factor(Survived))) +
  geom_bar(position = position_fill()) +
  ggtitle("Survival Rate by Passenger Class") +
  xlab("Passenger Class") +
  ylab("Proportion Survived")
On average, women paid higher fares than men. One likely explanation is that a larger share of women traveled in the higher passenger classes, where fares were more expensive, possibly reflecting the social roles of the time.
On average, First Class passengers paid more than Second and Third Class passengers. This is consistent with the luxurious accommodations offered in First Class, which were significantly more expensive than those in lower classes.
Women had a higher survival rate than men, which may be attributed to the “women and children first” evacuation policy that was in effect during the disaster. This suggests that women were prioritized for lifeboats and safety.
First Class passengers had a higher survival rate compared to Second and Third Class passengers. This may be due to better access to lifeboats and preferential treatment during the evacuation, as well as their proximity to areas of the ship that were more easily accessible during the emergency.
Passenger class was strongly associated with survival: First Class passengers had the highest survival rate (approximately 63%), followed by Second Class (about 47%) and Third Class (about 24%). This illustrates the role of socioeconomic status in determining survival chances.
# Load the `airquality` data set, convert it to a tibble, and print a compact
# column-by-column overview.
data("airquality")
airquality <- airquality %>% as_tibble()
airquality %>% glimpse()
## Rows: 153
## Columns: 6
## $ Ozone <int> 41, 36, 12, 18, NA, 28, 23, 19, 8, NA, 7, 16, 11, 14, 18, 14, …
## $ Solar.R <int> 190, 118, 149, 313, NA, NA, 299, 99, 19, 194, NA, 256, 290, 27…
## $ Wind <dbl> 7.4, 8.0, 12.6, 11.5, 14.3, 14.9, 8.6, 13.8, 20.1, 8.6, 6.9, 9…
## $ Temp <int> 67, 72, 74, 62, 56, 66, 65, 59, 61, 69, 74, 69, 66, 68, 58, 64…
## $ Month <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,…
## $ Day <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
# Count missing values in the two columns used by the analysis below.
airquality[["Ozone"]] %>% is.na() %>% sum()
## [1] 37
airquality[["Temp"]] %>% is.na() %>% sum()
## [1] 0
# Keep only the rows where both Ozone and Temp are present.
airquality <- drop_na(airquality, Ozone, Temp)
# Mean ozone reading for each month.
Average_Ozone_Month <- summarise(
  group_by(airquality, Month),
  Average_Ozone = mean(Ozone, na.rm = TRUE)
)
# Bucket each observation by temperature:
#   Low:    Temp < 70
#   Medium: 70 <= Temp < 80
#   High:   Temp >= 80
# The result is a factor with levels Low, Medium, High so that grouped
# summaries and plot legends follow temperature order instead of the
# alphabetical order (High, Low, Medium) a character column would produce.
airquality <- airquality %>%
  mutate(
    Temp_Group = factor(
      case_when(
        Temp < 70 ~ "Low",
        Temp >= 70 & Temp < 80 ~ "Medium",
        Temp >= 80 ~ "High"
      ),
      levels = c("Low", "Medium", "High")
    )
  )
# Mean ozone reading within each temperature group.
Average_Ozone_Temp <- summarise(
  group_by(airquality, Temp_Group),
  Average_Ozone = mean(Ozone, na.rm = TRUE)
)
# Box plot: distribution of ozone readings within each month.
airquality %>%
  ggplot(aes(x = factor(Month), y = Ozone)) +
  geom_boxplot(fill = "lightblue") +
  ggtitle("Ozone Levels by Month") +
  xlab("Month") +
  ylab("Ozone")

# Bar chart: the precomputed monthly mean ozone, one bar per month.
Average_Ozone_Month %>%
  ggplot(aes(x = factor(Month), y = Average_Ozone, fill = factor(Month))) +
  geom_col() +
  ggtitle("Average Ozone Levels by Month") +
  xlab("Month") +
  ylab("Average Ozone")

# Scatter plot: ozone against temperature, coloured by temperature group.
airquality %>%
  ggplot(aes(x = Temp, y = Ozone, color = Temp_Group)) +
  geom_point(size = 2) +
  ggtitle("Ozone Levels by Temperature") +
  xlab("Temperature") +
  ylab("Ozone")
Our analysis of the airquality dataset reveals that ozone levels fluctuate throughout the months, peaking in July and August and reaching their lowest point in May. This pattern suggests that summer weather conditions — such as increased sunlight and higher temperatures — play a significant role in ozone formation.
Furthermore, our examination of the relationship between temperature and ozone levels indicates a positive correlation, meaning that warmer days tend to experience higher ozone pollution. This aligns with expectations, as ozone forms more readily in warm, sunny environments. These findings highlight the importance of monitoring air quality, especially during the summer months when ozone pollution peaks, as elevated ozone levels can pose both environmental and health risks.