library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
Created a csv file and uploaded in the github. The structure of the csv is little unconvetional and hard to grab any data. I have replaced all the empty spaces with NA so that it is easier to work later.
untidy_flight_data <- read.csv('https://raw.githubusercontent.com/karmaggyatso/CUNY_SPS/main/Github_data607/assignment_4/untidy_flight_dataset.csv', na.strings = c('', 'NA'))
untidy_flight_data
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 601
In this section we are filling the X column with Value of Either Alaska or AM West and dropping all the NA values. tidyr::pivot_longer is used to change the structure of the dataset and create a new variable name Arrival so that the dataset is arranged according to the airport destination. tidyr::pivot_wider is used to add column on.Time and delayed and grap the data from temp column Time
tidy_data <- untidy_flight_data %>%
fill('X') %>%
drop_na() %>%
pivot_longer(cols = c('Los.Angeles', 'Phoenix', 'San.Diego', 'San.Francisco', 'Seattle'), names_to = 'Arrival', values_to = 'Time' ) %>%
pivot_wider(names_from = 'X.1', values_from = 'Time')
tidy_data
## # A tibble: 10 × 4
## X Arrival `on time` delayed
## <chr> <chr> <int> <int>
## 1 Alaska Los.Angeles 497 62
## 2 Alaska Phoenix 221 12
## 3 Alaska San.Diego 212 20
## 4 Alaska San.Francisco 503 102
## 5 Alaska Seattle 1841 305
## 6 AM WEST Los.Angeles 694 117
## 7 AM WEST Phoenix 4840 415
## 8 AM WEST San.Diego 383 65
## 9 AM WEST San.Francisco 320 129
## 10 AM WEST Seattle 201 601
In the section, the column name of X is changed to Departure and renamed the column name on time to on.time for consistency.
colnames(tidy_data)[names(tidy_data) == 'X'] <- "Departure"
names(tidy_data) <- str_replace_all(names(tidy_data), c(" " = "."))
tidy_data
## # A tibble: 10 × 4
## Departure Arrival on.time delayed
## <chr> <chr> <int> <int>
## 1 Alaska Los.Angeles 497 62
## 2 Alaska Phoenix 221 12
## 3 Alaska San.Diego 212 20
## 4 Alaska San.Francisco 503 102
## 5 Alaska Seattle 1841 305
## 6 AM WEST Los.Angeles 694 117
## 7 AM WEST Phoenix 4840 415
## 8 AM WEST San.Diego 383 65
## 9 AM WEST San.Francisco 320 129
## 10 AM WEST Seattle 201 601
The analysis I performed was to determine the average on time and delay .
tidy_data %>%
filter(Arrival == 'Los.Angeles') %>%
summarise(on_time_mean = mean(on.time), delay_mean = mean(delayed))
## # A tibble: 1 × 2
## on_time_mean delay_mean
## <dbl> <dbl>
## 1 596. 89.5