Assignment – Tidying and Transforming Data


The chart above describes arrival delays for two airlines across five destinations. Your task is to:
(1) Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.
(2) Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.
(3) Perform analysis to compare the arrival delays for the two airlines.
(4) Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative descriptions of your data cleanup work, analysis, and conclusions. Please include in your homework submission:
 The URL to the .Rmd file in your GitHub repository. and  The URL for your rpubs.com web page.

Import Data from Github

Flight_Data <- read_csv('https://raw.githubusercontent.com/BeshkiaKvarnstrom/MSDS-DATA607/main/Flight_Details.csv', 
                          show_col_types = FALSE)
## New names:
## • `` -> `...1`
## • `` -> `...2`
Flight_Data
## # A tibble: 5 × 7
##   ...1    ...2    los_angeles phoenix san_diego san_francisco seattle
##   <chr>   <chr>         <dbl>   <dbl>     <dbl>         <dbl>   <dbl>
## 1 ALASKA  on time         497     221       212           503    1841
## 2 <NA>    delayed          62      12        20           102     305
## 3 <NA>    <NA>             NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5 <NA>    delayed         117     415        65           129      61

Remove empty row

Flight_data <- Flight_Data[!apply(is.na(Flight_Data[1:7]),1,all), ]

Create dataframe to store the data and rename columns

Flight_data <- data.frame(Flight_data)

colnames(Flight_data) <- c("AIRLINE", "FLIGHT STATUS", "LOS ANGELES", "PHOENIX", "SAN DIEGO", "SAN FRANCISCO", "SEATTLE")

Flight_data
##   AIRLINE FLIGHT STATUS LOS ANGELES PHOENIX SAN DIEGO SAN FRANCISCO SEATTLE
## 1  ALASKA       on time         497     221       212           503    1841
## 2    <NA>       delayed          62      12        20           102     305
## 3 AM WEST       on time         694    4840       383           320     201
## 4    <NA>       delayed         117     415        65           129      61

Replace the NA in rows 2 and row 4

Flight_data[Flight_data==""]<-NA

Flight_data <- fill(Flight_data, AIRLINE)
Flight_data
##   AIRLINE FLIGHT STATUS LOS ANGELES PHOENIX SAN DIEGO SAN FRANCISCO SEATTLE
## 1  ALASKA       on time         497     221       212           503    1841
## 2  ALASKA       delayed          62      12        20           102     305
## 3 AM WEST       on time         694    4840       383           320     201
## 4 AM WEST       delayed         117     415        65           129      61

Use the pivot_longer function to transform data in long format

new_flight_df <- Flight_data %>%
pivot_longer(!c("AIRLINE", "FLIGHT STATUS"),
                     names_to = "DESTINATION", values_to = "FLIGHT TIMES")      
new_flight_df
## # A tibble: 20 × 4
##    AIRLINE `FLIGHT STATUS` DESTINATION   `FLIGHT TIMES`
##    <chr>   <chr>           <chr>                  <dbl>
##  1 ALASKA  on time         LOS ANGELES              497
##  2 ALASKA  on time         PHOENIX                  221
##  3 ALASKA  on time         SAN DIEGO                212
##  4 ALASKA  on time         SAN FRANCISCO            503
##  5 ALASKA  on time         SEATTLE                 1841
##  6 ALASKA  delayed         LOS ANGELES               62
##  7 ALASKA  delayed         PHOENIX                   12
##  8 ALASKA  delayed         SAN DIEGO                 20
##  9 ALASKA  delayed         SAN FRANCISCO            102
## 10 ALASKA  delayed         SEATTLE                  305
## 11 AM WEST on time         LOS ANGELES              694
## 12 AM WEST on time         PHOENIX                 4840
## 13 AM WEST on time         SAN DIEGO                383
## 14 AM WEST on time         SAN FRANCISCO            320
## 15 AM WEST on time         SEATTLE                  201
## 16 AM WEST delayed         LOS ANGELES              117
## 17 AM WEST delayed         PHOENIX                  415
## 18 AM WEST delayed         SAN DIEGO                 65
## 19 AM WEST delayed         SAN FRANCISCO            129
## 20 AM WEST delayed         SEATTLE                   61

Use spread function to transform data and display delayed or Ontime by city as well as the mutate function was used to creae a new total column.

flight_delay <- spread(new_flight_df, `FLIGHT STATUS` , `FLIGHT TIMES`)

flight_delay <- flight_delay|>
mutate(`Total Flights`= (delayed + `on time`))|>
  select(AIRLINE,DESTINATION,delayed,`on time`, `Total Flights`)|>
  group_by(AIRLINE)
 

flight_delay
## # A tibble: 10 × 5
## # Groups:   AIRLINE [2]
##    AIRLINE DESTINATION   delayed `on time` `Total Flights`
##    <chr>   <chr>           <dbl>     <dbl>           <dbl>
##  1 ALASKA  LOS ANGELES        62       497             559
##  2 ALASKA  PHOENIX            12       221             233
##  3 ALASKA  SAN DIEGO          20       212             232
##  4 ALASKA  SAN FRANCISCO     102       503             605
##  5 ALASKA  SEATTLE           305      1841            2146
##  6 AM WEST LOS ANGELES       117       694             811
##  7 AM WEST PHOENIX           415      4840            5255
##  8 AM WEST SAN DIEGO          65       383             448
##  9 AM WEST SAN FRANCISCO     129       320             449
## 10 AM WEST SEATTLE            61       201             262

Perform analysis to compare the arrival delays for the two airlines. This was done by filtering the flight status to show delayed

Delayed_DF <- new_flight_df%>% filter(str_detect(`FLIGHT STATUS`,"delayed"))
                                      
Delayed_DF
## # A tibble: 10 × 4
##    AIRLINE `FLIGHT STATUS` DESTINATION   `FLIGHT TIMES`
##    <chr>   <chr>           <chr>                  <dbl>
##  1 ALASKA  delayed         LOS ANGELES               62
##  2 ALASKA  delayed         PHOENIX                   12
##  3 ALASKA  delayed         SAN DIEGO                 20
##  4 ALASKA  delayed         SAN FRANCISCO            102
##  5 ALASKA  delayed         SEATTLE                  305
##  6 AM WEST delayed         LOS ANGELES              117
##  7 AM WEST delayed         PHOENIX                  415
##  8 AM WEST delayed         SAN DIEGO                 65
##  9 AM WEST delayed         SAN FRANCISCO            129
## 10 AM WEST delayed         SEATTLE                   61
Plot_Delayed <- ggplot(data=Delayed_DF, aes(x=DESTINATION, y=`FLIGHT TIMES`, fill=AIRLINE)) +   labs(title = "AIRLINE FLIGHTS DELAYED", x= "DESTINATION", y = "DELAYED TIMES") + theme(plot.title = element_text(hjust = 0.5)) + scale_fill_brewer(palette="Paired") +
geom_bar(stat="identity", position=position_dodge()) +
      theme(axis.text.x = element_text(angle = 60, hjust = 1))
Plot_Delayed 

Perform analysis to compare the on time arrivals for the two airlines. This was done by filtering the flight status to show on-time

ontime_DF <- new_flight_df%>% filter(str_detect(`FLIGHT STATUS`,"on time"))
                                      
ontime_DF
## # A tibble: 10 × 4
##    AIRLINE `FLIGHT STATUS` DESTINATION   `FLIGHT TIMES`
##    <chr>   <chr>           <chr>                  <dbl>
##  1 ALASKA  on time         LOS ANGELES              497
##  2 ALASKA  on time         PHOENIX                  221
##  3 ALASKA  on time         SAN DIEGO                212
##  4 ALASKA  on time         SAN FRANCISCO            503
##  5 ALASKA  on time         SEATTLE                 1841
##  6 AM WEST on time         LOS ANGELES              694
##  7 AM WEST on time         PHOENIX                 4840
##  8 AM WEST on time         SAN DIEGO                383
##  9 AM WEST on time         SAN FRANCISCO            320
## 10 AM WEST on time         SEATTLE                  201
Plot_ontime <- ggplot(data=ontime_DF, aes(x=DESTINATION, y=`FLIGHT TIMES`, fill=AIRLINE)) +   labs(title = "AIRLINE FLIGHTS THAT ARRIVED ONTIME", x= "DESTINATION", y = "ON-TIME TIMES") + theme(plot.title = element_text(hjust = 0.5)) + scale_fill_brewer(palette="Paired") +
geom_bar(stat="identity", position=position_dodge()) +
      theme(axis.text.x = element_text(angle = 60, hjust = 1))
Plot_ontime 

### Perform analysis to compare the delay and on time arrivals for the Alaska airlines.

Alaska <- new_flight_df%>% filter(str_detect(`AIRLINE`,"ALASKA"))

compare_plot <- ggplot(NULL, aes(x, y)) + 
geom_line(data=Alaska, aes(x=DESTINATION, y=`FLIGHT TIMES`, group=1), col="green", size = 1) +
geom_line(data=ontime_DF, aes(x=DESTINATION, y=`FLIGHT TIMES`, group=1), col="purple", size = 1)+
  labs(title = "DELAYED & ON-TIME FLIGHT - ALASKA", x= "DESTINATION", y = "FLIGHT TIMES", ylim=c(100,1000), breaks = 10) + 
  theme(axis.text.x = element_text(angle = 60, hjust = 1))+
  scale_color_manual(name = "FLIGHT STATUS")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
# display the plot
compare_plot

### Perform analysis to compare the delay and on time arrivals for the AM West airlines.

Alaska <- new_flight_df%>% filter(str_detect(`AIRLINE`,"AM WEST"))

compare_plot <- ggplot(NULL, aes(x, y)) + 
geom_line(data=Alaska, aes(x=DESTINATION, y=`FLIGHT TIMES`, group=1), col="green", size = 1) +
geom_line(data=ontime_DF, aes(x=DESTINATION, y=`FLIGHT TIMES`, group=1), col="purple", size = 1)+
  labs(title = "DELAYED & ON-TIME FLIGHT  - AM WEST", x= "DESTINATION", y = "FLIGHT TIMES", ylim=c(100,1000), breaks = 10) + 
  theme(axis.text.x = element_text(angle = 60, hjust = 1))+
  scale_color_manual(name = "FLIGHT STATUS")

# display the plot
compare_plot

CONCLUSION

The Airline AM West have both more on-time and delayed flights than Alaska airline.

Phoenix Airport had a considerable amount of flights over the other airlines. This airport also had more on-time and delayed flights.