project 2

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#turn every dataset into a csv

store_sales <- tribble(
  ~Date,        ~Store,   ~ProductA, ~ProductB, ~ProductC,
  "2024-01-01", "Store1", 50, 30, 20,
  "2024-01-01", "Store2", 55, 20, 45,
  "2024-01-02", "Store1", 60, 25, 35,
  "2024-01-02", "Store2", 65, 35, 50
)

write_csv(store_sales, "store_sales.csv")


flights_wide <- tribble(
  ~Airline, ~Status, ~`Los Angeles`, ~Phoenix, ~`San Diego`, ~`San Francisco`, ~Seattle,
  "Alaska", "On Time", 497, 221, 212, 503, 1841,
  "Alaska", "Delayed", 62, 12, 20, 102, 305,
  "AM WEST", "On Time", 694, 4840, 383, 320, 201,
  "AM WEST", "Delayed", 117, 415, 65, 129, 61
)
write_csv(flights_wide, "flights.csv")




student_scores <- tribble(
  ~Student, ~Math_2019, ~Science_2019, ~English_2019, ~Math_2020, ~Science_2020, ~English_2020,
  "Alice", 85, 90, 78, 88, 92, 81,
  "Bob",   75, 80, 70, 78, 83, 72,
  "Carol", 92, 88, 95, 94, 90, 97,
  "David", 68, 70, 65, 70, 72, 68
)

write_csv(student_scores, "student_scores.csv")

#read csv files
sales_wide   <- readr::read_csv("store_sales.csv",    show_col_types = FALSE)
flights_wide <- readr::read_csv("flights.csv",        show_col_types = FALSE)
scores_wide  <- readr::read_csv("student_scores.csv", show_col_types = FALSE)

# Show tables to confirm
head(sales_wide)

## # A tibble: 4 × 5
##   Date       Store  ProductA ProductB ProductC
##   <date>     <chr>     <dbl>    <dbl>    <dbl>
## 1 2024-01-01 Store1       50       30       20
## 2 2024-01-01 Store2       55       20       45
## 3 2024-01-02 Store1       60       25       35
## 4 2024-01-02 Store2       65       35       50

head(flights_wide)

## # A tibble: 4 × 7
##   Airline Status  `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>   <chr>           <dbl>   <dbl>       <dbl>           <dbl>   <dbl>
## 1 Alaska  On Time           497     221         212             503    1841
## 2 Alaska  Delayed            62      12          20             102     305
## 3 AM WEST On Time           694    4840         383             320     201
## 4 AM WEST Delayed           117     415          65             129      61

head(scores_wide)

## # A tibble: 4 × 7
##   Student Math_2019 Science_2019 English_2019 Math_2020 Science_2020
##   <chr>       <dbl>        <dbl>        <dbl>     <dbl>        <dbl>
## 1 Alice          85           90           78        88           92
## 2 Bob            75           80           70        78           83
## 3 Carol          92           88           95        94           90
## 4 David          68           70           65        70           72
## # ℹ 1 more variable: English_2020 <dbl>

#pivot to create long forms for all datasets

#Store Sales
sales_long <- sales_wide %>%
  pivot_longer(
    cols = starts_with("Product"),
    names_to = "Product",
    values_to = "Units"
  )

# Flights
flights_long <- flights_wide %>%
  pivot_longer(
    cols = -c(Airline, Status),
    names_to = "City",
    values_to = "Flights"
  )
#Student Scores
scores_long <- scores_wide %>%
  pivot_longer(
    cols = -Student,
    names_to = "Subject_Year",
    values_to = "Score"
  ) %>%
  tidyr::separate(Subject_Year, into = c("Subject", "Year"), sep = "_", remove = TRUE)

# Show tables
head(sales_long)

## # A tibble: 6 × 4
##   Date       Store  Product  Units
##   <date>     <chr>  <chr>    <dbl>
## 1 2024-01-01 Store1 ProductA    50
## 2 2024-01-01 Store1 ProductB    30
## 3 2024-01-01 Store1 ProductC    20
## 4 2024-01-01 Store2 ProductA    55
## 5 2024-01-01 Store2 ProductB    20
## 6 2024-01-01 Store2 ProductC    45

head(flights_long)

## # A tibble: 6 × 4
##   Airline Status  City          Flights
##   <chr>   <chr>   <chr>           <dbl>
## 1 Alaska  On Time Los Angeles       497
## 2 Alaska  On Time Phoenix           221
## 3 Alaska  On Time San Diego         212
## 4 Alaska  On Time San Francisco     503
## 5 Alaska  On Time Seattle          1841
## 6 Alaska  Delayed Los Angeles        62

head(scores_long)

## # A tibble: 6 × 4
##   Student Subject Year  Score
##   <chr>   <chr>   <chr> <dbl>
## 1 Alice   Math    2019     85
## 2 Alice   Science 2019     90
## 3 Alice   English 2019     78
## 4 Alice   Math    2020     88
## 5 Alice   Science 2020     92
## 6 Alice   English 2020     81

#analysis for every dataset

#Store Sales
sales_avg_by_product <- sales_long %>%
  group_by(Product) %>%
  summarise(avg_units = mean(Units), .groups = "drop")

sales_total_by_date_store <- sales_long %>%
  group_by(Date, Store) %>%
  summarise(total_units = sum(Units), .groups = "drop")

sales_avg_by_product

## # A tibble: 3 × 2
##   Product  avg_units
##   <chr>        <dbl>
## 1 ProductA      57.5
## 2 ProductB      27.5
## 3 ProductC      37.5

sales_total_by_date_store

## # A tibble: 4 × 3
##   Date       Store  total_units
##   <date>     <chr>        <dbl>
## 1 2024-01-01 Store1         100
## 2 2024-01-01 Store2         120
## 3 2024-01-02 Store1         120
## 4 2024-01-02 Store2         150

#Flights 
flights_totals <- flights_long %>%
  group_by(Airline, City) %>%
  summarise(total_flights = sum(Flights), .groups = "drop")

flights_delay_rate <- flights_long %>%
  group_by(Airline, City) %>%
  mutate(total_city_airline = sum(Flights)) %>%
  ungroup() %>%
  group_by(Airline, City, Status) %>%
  summarise(Flights = sum(Flights),
            total_city_airline = first(total_city_airline),
            .groups = "drop_last") %>%
  mutate(pct = Flights / total_city_airline) %>%
  ungroup() %>%
  filter(Status == "Delayed") %>%
  arrange(desc(pct))

flights_totals

## # A tibble: 10 × 3
##    Airline City          total_flights
##    <chr>   <chr>                 <dbl>
##  1 AM WEST Los Angeles             811
##  2 AM WEST Phoenix                5255
##  3 AM WEST San Diego               448
##  4 AM WEST San Francisco           449
##  5 AM WEST Seattle                 262
##  6 Alaska  Los Angeles             559
##  7 Alaska  Phoenix                 233
##  8 Alaska  San Diego               232
##  9 Alaska  San Francisco           605
## 10 Alaska  Seattle                2146

flights_delay_rate

## # A tibble: 10 × 6
##    Airline City          Status  Flights total_city_airline    pct
##    <chr>   <chr>         <chr>     <dbl>              <dbl>  <dbl>
##  1 AM WEST San Francisco Delayed     129                449 0.287 
##  2 AM WEST Seattle       Delayed      61                262 0.233 
##  3 Alaska  San Francisco Delayed     102                605 0.169 
##  4 AM WEST San Diego     Delayed      65                448 0.145 
##  5 AM WEST Los Angeles   Delayed     117                811 0.144 
##  6 Alaska  Seattle       Delayed     305               2146 0.142 
##  7 Alaska  Los Angeles   Delayed      62                559 0.111 
##  8 Alaska  San Diego     Delayed      20                232 0.0862
##  9 AM WEST Phoenix       Delayed     415               5255 0.0790
## 10 Alaska  Phoenix       Delayed      12                233 0.0515

#Student Scores
scores_avg_subject <- scores_long %>%
  group_by(Subject) %>%
  summarise(avg_score = mean(Score), .groups = "drop")

scores_avg_year <- scores_long %>%
  group_by(Year) %>%
  summarise(avg_score = mean(Score), .groups = "drop")

scores_avg_subject

## # A tibble: 3 × 2
##   Subject avg_score
##   <chr>       <dbl>
## 1 English      78.2
## 2 Math         81.2
## 3 Science      83.1

scores_avg_year

## # A tibble: 2 × 2
##   Year  avg_score
##   <chr>     <dbl>
## 1 2019       79.7
## 2 2020       82.1

#analysis

# Store Sales product per store/date
ggplot(sales_long, aes(x = Date, y = Units, fill = Product)) +
  geom_col(position = "dodge") +
  facet_wrap(~Store) +
  labs(title = "Store Sales by Product and Date", y = "Units")

# Flights: delayed
ggplot(flights_delay_rate, aes(x = reorder(paste(Airline, City, sep = " — "), pct), y = pct)) +
  geom_col() +
  coord_flip() +
  labs(title = "Delayed Flight Share by Airline & City", x = "Airline — City", y = "Delayed Share")

# Scores across years
ggplot(scores_long, aes(x = Year, y = Score, group = Subject, color = Subject)) +
  stat_summary(fun = mean, geom = "line", linewidth = 1) +
  stat_summary(fun = mean, geom = "point", size = 2) +
  labs(title = "Average Student Scores by Subject and Year")

project 2

2025-10-06