Read in necessary libraries:

library(tidyr)  
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)

Read in csv and look at it:

flights <- read.csv(file = 'flight_tidying.csv') 
flights 
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  Alaska on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

Look at structure of csv:

str(flights) 
## 'data.frame':    5 obs. of  7 variables:
##  $ X            : chr  "Alaska" "" "" "AM WEST" ...
##  $ X.1          : chr  "on time" "delayed" "" "on time" ...
##  $ Los.Angeles  : int  497 62 NA 694 117
##  $ Phoenix      : int  221 12 NA 4840 415
##  $ San.Diego    : int  212 20 NA 383 65
##  $ San.Francisco: int  503 102 NA 320 129
##  $ Seattle      : int  1841 305 NA 201 61

Change columnn names to ‘better’ names

flights <- flights %>% 
    rename('Airline' = 1, 'Flight.Status' = 2)
flights
##   Airline Flight.Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  Alaska       on time         497     221       212           503    1841
## 2               delayed          62      12        20           102     305
## 3                                NA      NA        NA            NA      NA
## 4 AM WEST       on time         694    4840       383           320     201
## 5               delayed         117     415        65           129      61

Remove NAs and empty rows in csv:

flights <- flights %>% #
            filter(! is.na(Flight.Status) &  
                     str_length(Flight.Status) > 0) 
flights
##   Airline Flight.Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  Alaska       on time         497     221       212           503    1841
## 2               delayed          62      12        20           102     305
## 3 AM WEST       on time         694    4840       383           320     201
## 4               delayed         117     415        65           129      61

Turn csv into dataframe:

df <- data.frame(flights)
head(df)
##   Airline Flight.Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  Alaska       on time         497     221       212           503    1841
## 2               delayed          62      12        20           102     305
## 3 AM WEST       on time         694    4840       383           320     201
## 4               delayed         117     415        65           129      61

Add missing airline information:

df[2, "Airline"] <- "Alaska"
df[4, "Airline"] <- "AM WEST"
df
##   Airline Flight.Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  Alaska       on time         497     221       212           503    1841
## 2  Alaska       delayed          62      12        20           102     305
## 3 AM WEST       on time         694    4840       383           320     201
## 4 AM WEST       delayed         117     415        65           129      61

Lengthen data by increasing the number of rows and decreasing the number of columns:

df <- df %>%
        pivot_longer(!c("Airline", "Flight.Status"),
                     names_to = "Destination",               
                     values_to = "Count")      
df
## # A tibble: 20 × 4
##    Airline Flight.Status Destination   Count
##    <chr>   <chr>         <chr>         <int>
##  1 Alaska  on time       Los.Angeles     497
##  2 Alaska  on time       Phoenix         221
##  3 Alaska  on time       San.Diego       212
##  4 Alaska  on time       San.Francisco   503
##  5 Alaska  on time       Seattle        1841
##  6 Alaska  delayed       Los.Angeles      62
##  7 Alaska  delayed       Phoenix          12
##  8 Alaska  delayed       San.Diego        20
##  9 Alaska  delayed       San.Francisco   102
## 10 Alaska  delayed       Seattle         305
## 11 AM WEST on time       Los.Angeles     694
## 12 AM WEST on time       Phoenix        4840
## 13 AM WEST on time       San.Diego       383
## 14 AM WEST on time       San.Francisco   320
## 15 AM WEST on time       Seattle         201
## 16 AM WEST delayed       Los.Angeles     117
## 17 AM WEST delayed       Phoenix         415
## 18 AM WEST delayed       San.Diego        65
## 19 AM WEST delayed       San.Francisco   129
## 20 AM WEST delayed       Seattle          61

Write to CSV:

write.csv(df, "./clean_flight_tidying.csv", row.names=FALSE)

Create two new dataframes of delayed and non time flights to graph them:

delayed_flights <- df %>%
                    filter(df$Flight.Status == "delayed")
delayed_flights
## # A tibble: 10 × 4
##    Airline Flight.Status Destination   Count
##    <chr>   <chr>         <chr>         <int>
##  1 Alaska  delayed       Los.Angeles      62
##  2 Alaska  delayed       Phoenix          12
##  3 Alaska  delayed       San.Diego        20
##  4 Alaska  delayed       San.Francisco   102
##  5 Alaska  delayed       Seattle         305
##  6 AM WEST delayed       Los.Angeles     117
##  7 AM WEST delayed       Phoenix         415
##  8 AM WEST delayed       San.Diego        65
##  9 AM WEST delayed       San.Francisco   129
## 10 AM WEST delayed       Seattle          61
on_time_flights <- df %>%
                    filter(df$Flight.Status == "on time")
on_time_flights
## # A tibble: 10 × 4
##    Airline Flight.Status Destination   Count
##    <chr>   <chr>         <chr>         <int>
##  1 Alaska  on time       Los.Angeles     497
##  2 Alaska  on time       Phoenix         221
##  3 Alaska  on time       San.Diego       212
##  4 Alaska  on time       San.Francisco   503
##  5 Alaska  on time       Seattle        1841
##  6 AM WEST on time       Los.Angeles     694
##  7 AM WEST on time       Phoenix        4840
##  8 AM WEST on time       San.Diego       383
##  9 AM WEST on time       San.Francisco   320
## 10 AM WEST on time       Seattle         201

Graph Delayed Flights:

 delayed_bar_graph <- ggplot(data=delayed_flights, aes(x=Destination, y=Count, fill=Airline))
    delayed_bar_graph <- delayed_bar_graph +  ggtitle('Delayed Flights') +
            geom_bar(stat="identity", position=position_dodge()) 
delayed_bar_graph

AM WEST has more delayed flights than Alaska in ever city but Seattle.

Graph On-Time Flights:

 ontime_bar_graph <- ggplot(data=on_time_flights, aes(x=Destination, y=Count, fill=Airline))
    ontime_bar_graph <- ontime_bar_graph +  ggtitle('On-Time Flights') +
            geom_bar(stat="identity", position=position_dodge()) 
ontime_bar_graph

AM WEST has more on-time flights than Alaska in Los Angeles, Phoenix, and San Diego. Alaska has more on-time flights than AM WEST in San Francisco and Seattle.