DATA607Assignment 5

Create a CSV file, read the file and tidy the data

library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(knitr)
library(stringr)

#read the csv file without any header row
load <- read.csv('https://raw.githubusercontent.com/datanerddhanya/DATA607/main/Arrival_delays.csv',sep = ",", header= FALSE,stringsAsFactors=FALSE, quote = "\"")

# provide a name to the columns
load <-    rename(load,
    Airline  = V1 ,
    Arrivaltype = V2,
    Los_Angeles = V3,
    Phoenix = V4,
    San_Diego = V5 ,   
    San_Francisco = V6,
    Seattle = V7
  )
   

#delete the blank row and the row with text showing destination names
 load<- load[grep("^[a-z]" , load$Arrivaltype),]
  
#update the two rows with missing airlines
load$Airline[2]= "ALASKA" 
load$Airline[4]= "AMWEST"

Transform the data

#pivot the data.frame longer
load_final <- load |> 
  pivot_longer(
     cols = !(Airline:Arrivaltype) ,
     names_to = "Destinations", 
     values_to = "flight_count" ,
    values_drop_na = TRUE
   ) 

#remove comma from the flight count values
 load_final$flight_count<- as.numeric(gsub(",","",load_final$flight_count))

compare the arrival delays for the two airlines

AMWEST has more delayed flights than ALASKA Airlines. However for Seattle destination, the arrival delays are more for Alaska Airlines than AMWEST.

    load_final %>%
    group_by(Airline) %>%
    filter(Arrivaltype == "delayed")  %>%
   summarize(flight_count = sum(flight_count) )

## # A tibble: 2 × 2
##   Airline flight_count
##   <chr>          <dbl>
## 1 ALASKA           501
## 2 AMWEST           787

load_final %>%
    group_by(Destinations,Airline) %>%
    filter(Arrivaltype == "delayed")  %>%
   summarize(flight_count = sum(flight_count), )

## `summarise()` has grouped output by 'Destinations'. You can override using the
## `.groups` argument.

## # A tibble: 10 × 3
## # Groups:   Destinations [5]
##    Destinations  Airline flight_count
##    <chr>         <chr>          <dbl>
##  1 Los_Angeles   ALASKA            62
##  2 Los_Angeles   AMWEST           117
##  3 Phoenix       ALASKA            12
##  4 Phoenix       AMWEST           415
##  5 San_Diego     ALASKA            20
##  6 San_Diego     AMWEST            65
##  7 San_Francisco ALASKA           102
##  8 San_Francisco AMWEST           129
##  9 Seattle       ALASKA           305
## 10 Seattle       AMWEST            61

By plotting the data, i see that Alaska has recorded greater proportion of delayed flights

library(ggplot2)


  ggplot( data = load_final , aes( x=Airline  , y=flight_count, fill = Arrivaltype )) + 
    geom_bar(stat="identity")

 #   geom_line(aes(y= girl_ratio), color = "BLACK") +
 # labs( title = "Arrival delays for 2 Airlines", subtitle = "proportion of boys(blue) / girls(black) over time " ,x= "years", y= "propotion of boys/girls")

DATA607Assignment 5

Dhanya Nair

2024-02-23