library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(stringr)

Created a csv file and uploaded in the github. The structure of the csv is little unconvetional and hard to grab any data. I have replaced all the empty spaces with NA so that it is easier to work later.

untidy_flight_data <- read.csv('https://raw.githubusercontent.com/karmaggyatso/CUNY_SPS/main/Github_data607/assignment_4/untidy_flight_dataset.csv', na.strings = c('', 'NA'))

untidy_flight_data
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  Alaska on time         497     221       212           503    1841
## 2    <NA> delayed          62      12        20           102     305
## 3    <NA>    <NA>          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5    <NA> delayed         117     415        65           129     601

In this section we are filling the X column with Value of Either Alaska or AM West and dropping all the NA values. tidyr::pivot_longer is used to change the structure of the dataset and create a new variable name Arrival so that the dataset is arranged according to the airport destination. tidyr::pivot_wider is used to add column on.Time and delayed and grap the data from temp column Time

tidy_data <- untidy_flight_data %>%
  fill('X') %>%
  drop_na() %>%
  pivot_longer(cols = c('Los.Angeles', 'Phoenix', 'San.Diego', 'San.Francisco', 'Seattle'), names_to = 'Arrival', values_to = 'Time' ) %>%
  pivot_wider(names_from = 'X.1', values_from = 'Time')
tidy_data
## # A tibble: 10 × 4
##    X       Arrival       `on time` delayed
##    <chr>   <chr>             <int>   <int>
##  1 Alaska  Los.Angeles         497      62
##  2 Alaska  Phoenix             221      12
##  3 Alaska  San.Diego           212      20
##  4 Alaska  San.Francisco       503     102
##  5 Alaska  Seattle            1841     305
##  6 AM WEST Los.Angeles         694     117
##  7 AM WEST Phoenix            4840     415
##  8 AM WEST San.Diego           383      65
##  9 AM WEST San.Francisco       320     129
## 10 AM WEST Seattle             201     601

In the section, the column name of X is changed to Departure and renamed the column name on time to on.time for consistency.

colnames(tidy_data)[names(tidy_data) == 'X'] <- "Departure"
names(tidy_data) <- str_replace_all(names(tidy_data), c(" " = "."))
tidy_data
## # A tibble: 10 × 4
##    Departure Arrival       on.time delayed
##    <chr>     <chr>           <int>   <int>
##  1 Alaska    Los.Angeles       497      62
##  2 Alaska    Phoenix           221      12
##  3 Alaska    San.Diego         212      20
##  4 Alaska    San.Francisco     503     102
##  5 Alaska    Seattle          1841     305
##  6 AM WEST   Los.Angeles       694     117
##  7 AM WEST   Phoenix          4840     415
##  8 AM WEST   San.Diego         383      65
##  9 AM WEST   San.Francisco     320     129
## 10 AM WEST   Seattle           201     601

The analysis I performed was to determine the average on time and delay .

tidy_data %>%
  filter(Arrival == 'Los.Angeles') %>%
  summarise(on_time_mean = mean(on.time), delay_mean = mean(delayed))
## # A tibble: 1 × 2
##   on_time_mean delay_mean
##          <dbl>      <dbl>
## 1         596.       89.5