Week 5 Assignment

dplyr and tidyr are part of tidyverse

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Get the untidy airlines data

untidy_data <- read.csv("flights.csv")
head(untidy_data)

##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

#Rename the columns
names(untidy_data) <- c("Airline", "Departure", "Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle")
head(untidy_data)

##   Airline Departure Los Angeles Phoenix San Diego San Francisco Seattle
## 1  ALASKA   on time         497     221       212           503    1841
## 2           delayed          62      12        20           102     305
## 3                            NA      NA        NA            NA      NA
## 4 AM WEST   on time         694    4840       383           320     201
## 5           delayed         117     415        65           129      61

Tidy it up

# Apologies for doing this. RStudio would work correctly but suddenly decides it doesn't know how to count. It gave the result TRUE TRUE TRUE TRUE TRUE for nchar(untidy_data$Airline) > 0
# nchar(untidy$Airline) also gave the incorrect result of 6 6 6 6 6 
untidy_data$Airline[2] = untidy_data$Airline[1]
untidy_data$Airline[5] = untidy_data$Airline[4]

airlines <- untidy_data |>
  pivot_longer(`Los Angeles`:`Seattle`, names_to = "Destination", values_to = "Times", values_drop_na = TRUE)

str(airlines)

## tibble [20 × 4] (S3: tbl_df/tbl/data.frame)
##  $ Airline    : chr [1:20] "ALASKA" "ALASKA" "ALASKA" "ALASKA" ...
##  $ Departure  : chr [1:20] "on time" "on time" "on time" "on time" ...
##  $ Destination: chr [1:20] "Los Angeles" "Phoenix" "San Diego" "San Francisco" ...
##  $ Times      : int [1:20] 497 221 212 503 1841 62 12 20 102 305 ...

head(airlines)

## # A tibble: 6 × 4
##   Airline Departure Destination   Times
##   <chr>   <chr>     <chr>         <int>
## 1 ALASKA  on time   Los Angeles     497
## 2 ALASKA  on time   Phoenix         221
## 3 ALASKA  on time   San Diego       212
## 4 ALASKA  on time   San Francisco   503
## 5 ALASKA  on time   Seattle        1841
## 6 ALASKA  delayed   Los Angeles      62

Comparing Arrival Delays

#Delayed Flights
## AM West have more delayed flights on average along with having a higher median of delayed flights
airlines |> 
  group_by(Airline) |>
  filter(Departure == "delayed") |>
  summarize(average = mean(Times), median = median(Times), standard_deviation = sd(Times))

## # A tibble: 2 × 4
##   Airline average median standard_deviation
##   <chr>     <dbl>  <int>              <dbl>
## 1 ALASKA     100.     62               120.
## 2 AM WEST    157.    117               147.

delayed_flights <- airlines |>
  filter(Departure == "delayed")
ggplot(delayed_flights, aes(x = Airline, y = Times)) + geom_point(aes(color = Destination))

# On Time Flights
## AM West also have a higher average of being on time for flights than Alaska Airlines, it have a lower median though
airlines |> 
  group_by(Airline) |>
  filter(Departure == "on time") |>
  summarize(average = mean(Times), median = median(Times), standard_deviation = sd(Times))

## # A tibble: 2 × 4
##   Airline average median standard_deviation
##   <chr>     <dbl>  <int>              <dbl>
## 1 ALASKA     655.    497               678.
## 2 AM WEST   1288.    383              1994.

ontime_flights <- airlines |>
  filter(Departure == "on time")
ggplot(ontime_flights, aes(x = Airline, y = Times)) + geom_point(aes(color = Destination))

# Delayed Flights be Destination
airlines |> 
  group_by(Destination) |>
  filter(Departure == "delayed") |>
  summarize(average = mean(Times), median = median(Times))

## # A tibble: 5 × 3
##   Destination   average median
##   <chr>           <dbl>  <dbl>
## 1 Los Angeles      89.5   89.5
## 2 Phoenix         214.   214. 
## 3 San Diego        42.5   42.5
## 4 San Francisco   116.   116. 
## 5 Seattle         183    183

# On Time Flights be Destination
airlines |> 
  group_by(Destination) |>
  filter(Departure == "on time") |>
  summarize(average = mean(Times), median = median(Times))

## # A tibble: 5 × 3
##   Destination   average median
##   <chr>           <dbl>  <dbl>
## 1 Los Angeles      596.   596.
## 2 Phoenix         2530.  2530.
## 3 San Diego        298.   298.
## 4 San Francisco    412.   412.
## 5 Seattle         1021   1021

Week 5 Assignment

Kelly Eng

2023-10-01