library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#turn every dataset into a csv
store_sales <- tribble(
~Date, ~Store, ~ProductA, ~ProductB, ~ProductC,
"2024-01-01", "Store1", 50, 30, 20,
"2024-01-01", "Store2", 55, 20, 45,
"2024-01-02", "Store1", 60, 25, 35,
"2024-01-02", "Store2", 65, 35, 50
)
write_csv(store_sales, "store_sales.csv")
flights_wide <- tribble(
~Airline, ~Status, ~`Los Angeles`, ~Phoenix, ~`San Diego`, ~`San Francisco`, ~Seattle,
"Alaska", "On Time", 497, 221, 212, 503, 1841,
"Alaska", "Delayed", 62, 12, 20, 102, 305,
"AM WEST", "On Time", 694, 4840, 383, 320, 201,
"AM WEST", "Delayed", 117, 415, 65, 129, 61
)
write_csv(flights_wide, "flights.csv")
student_scores <- tribble(
~Student, ~Math_2019, ~Science_2019, ~English_2019, ~Math_2020, ~Science_2020, ~English_2020,
"Alice", 85, 90, 78, 88, 92, 81,
"Bob", 75, 80, 70, 78, 83, 72,
"Carol", 92, 88, 95, 94, 90, 97,
"David", 68, 70, 65, 70, 72, 68
)
write_csv(student_scores, "student_scores.csv")
#read csv files
sales_wide <- readr::read_csv("store_sales.csv", show_col_types = FALSE)
flights_wide <- readr::read_csv("flights.csv", show_col_types = FALSE)
scores_wide <- readr::read_csv("student_scores.csv", show_col_types = FALSE)
# Show tables to confirm
head(sales_wide)
## # A tibble: 4 × 5
## Date Store ProductA ProductB ProductC
## <date> <chr> <dbl> <dbl> <dbl>
## 1 2024-01-01 Store1 50 30 20
## 2 2024-01-01 Store2 55 20 45
## 3 2024-01-02 Store1 60 25 35
## 4 2024-01-02 Store2 65 35 50
head(flights_wide)
## # A tibble: 4 × 7
## Airline Status `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Alaska On Time 497 221 212 503 1841
## 2 Alaska Delayed 62 12 20 102 305
## 3 AM WEST On Time 694 4840 383 320 201
## 4 AM WEST Delayed 117 415 65 129 61
head(scores_wide)
## # A tibble: 4 × 7
## Student Math_2019 Science_2019 English_2019 Math_2020 Science_2020
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Alice 85 90 78 88 92
## 2 Bob 75 80 70 78 83
## 3 Carol 92 88 95 94 90
## 4 David 68 70 65 70 72
## # ℹ 1 more variable: English_2020 <dbl>
#pivot to create long forms for all datasets
#Store Sales
sales_long <- sales_wide %>%
pivot_longer(
cols = starts_with("Product"),
names_to = "Product",
values_to = "Units"
)
# Flights
flights_long <- flights_wide %>%
pivot_longer(
cols = -c(Airline, Status),
names_to = "City",
values_to = "Flights"
)
#Student Scores
scores_long <- scores_wide %>%
pivot_longer(
cols = -Student,
names_to = "Subject_Year",
values_to = "Score"
) %>%
tidyr::separate(Subject_Year, into = c("Subject", "Year"), sep = "_", remove = TRUE)
# Show tables
head(sales_long)
## # A tibble: 6 × 4
## Date Store Product Units
## <date> <chr> <chr> <dbl>
## 1 2024-01-01 Store1 ProductA 50
## 2 2024-01-01 Store1 ProductB 30
## 3 2024-01-01 Store1 ProductC 20
## 4 2024-01-01 Store2 ProductA 55
## 5 2024-01-01 Store2 ProductB 20
## 6 2024-01-01 Store2 ProductC 45
head(flights_long)
## # A tibble: 6 × 4
## Airline Status City Flights
## <chr> <chr> <chr> <dbl>
## 1 Alaska On Time Los Angeles 497
## 2 Alaska On Time Phoenix 221
## 3 Alaska On Time San Diego 212
## 4 Alaska On Time San Francisco 503
## 5 Alaska On Time Seattle 1841
## 6 Alaska Delayed Los Angeles 62
head(scores_long)
## # A tibble: 6 × 4
## Student Subject Year Score
## <chr> <chr> <chr> <dbl>
## 1 Alice Math 2019 85
## 2 Alice Science 2019 90
## 3 Alice English 2019 78
## 4 Alice Math 2020 88
## 5 Alice Science 2020 92
## 6 Alice English 2020 81
#analysis for every dataset
#Store Sales
sales_avg_by_product <- sales_long %>%
group_by(Product) %>%
summarise(avg_units = mean(Units), .groups = "drop")
sales_total_by_date_store <- sales_long %>%
group_by(Date, Store) %>%
summarise(total_units = sum(Units), .groups = "drop")
sales_avg_by_product
## # A tibble: 3 × 2
## Product avg_units
## <chr> <dbl>
## 1 ProductA 57.5
## 2 ProductB 27.5
## 3 ProductC 37.5
sales_total_by_date_store
## # A tibble: 4 × 3
## Date Store total_units
## <date> <chr> <dbl>
## 1 2024-01-01 Store1 100
## 2 2024-01-01 Store2 120
## 3 2024-01-02 Store1 120
## 4 2024-01-02 Store2 150
#Flights
flights_totals <- flights_long %>%
group_by(Airline, City) %>%
summarise(total_flights = sum(Flights), .groups = "drop")
flights_delay_rate <- flights_long %>%
group_by(Airline, City) %>%
mutate(total_city_airline = sum(Flights)) %>%
ungroup() %>%
group_by(Airline, City, Status) %>%
summarise(Flights = sum(Flights),
total_city_airline = first(total_city_airline),
.groups = "drop_last") %>%
mutate(pct = Flights / total_city_airline) %>%
ungroup() %>%
filter(Status == "Delayed") %>%
arrange(desc(pct))
flights_totals
## # A tibble: 10 × 3
## Airline City total_flights
## <chr> <chr> <dbl>
## 1 AM WEST Los Angeles 811
## 2 AM WEST Phoenix 5255
## 3 AM WEST San Diego 448
## 4 AM WEST San Francisco 449
## 5 AM WEST Seattle 262
## 6 Alaska Los Angeles 559
## 7 Alaska Phoenix 233
## 8 Alaska San Diego 232
## 9 Alaska San Francisco 605
## 10 Alaska Seattle 2146
flights_delay_rate
## # A tibble: 10 × 6
## Airline City Status Flights total_city_airline pct
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 AM WEST San Francisco Delayed 129 449 0.287
## 2 AM WEST Seattle Delayed 61 262 0.233
## 3 Alaska San Francisco Delayed 102 605 0.169
## 4 AM WEST San Diego Delayed 65 448 0.145
## 5 AM WEST Los Angeles Delayed 117 811 0.144
## 6 Alaska Seattle Delayed 305 2146 0.142
## 7 Alaska Los Angeles Delayed 62 559 0.111
## 8 Alaska San Diego Delayed 20 232 0.0862
## 9 AM WEST Phoenix Delayed 415 5255 0.0790
## 10 Alaska Phoenix Delayed 12 233 0.0515
#Student Scores
scores_avg_subject <- scores_long %>%
group_by(Subject) %>%
summarise(avg_score = mean(Score), .groups = "drop")
scores_avg_year <- scores_long %>%
group_by(Year) %>%
summarise(avg_score = mean(Score), .groups = "drop")
scores_avg_subject
## # A tibble: 3 × 2
## Subject avg_score
## <chr> <dbl>
## 1 English 78.2
## 2 Math 81.2
## 3 Science 83.1
scores_avg_year
## # A tibble: 2 × 2
## Year avg_score
## <chr> <dbl>
## 1 2019 79.7
## 2 2020 82.1
#analysis
# Store Sales product per store/date
ggplot(sales_long, aes(x = Date, y = Units, fill = Product)) +
geom_col(position = "dodge") +
facet_wrap(~Store) +
labs(title = "Store Sales by Product and Date", y = "Units")

# Flights: delayed
ggplot(flights_delay_rate, aes(x = reorder(paste(Airline, City, sep = " — "), pct), y = pct)) +
geom_col() +
coord_flip() +
labs(title = "Delayed Flight Share by Airline & City", x = "Airline — City", y = "Delayed Share")

# Scores across years
ggplot(scores_long, aes(x = Year, y = Score, group = Subject, color = Subject)) +
stat_summary(fun = mean, geom = "line", linewidth = 1) +
stat_summary(fun = mean, geom = "point", size = 2) +
labs(title = "Average Student Scores by Subject and Year")
