Loading the data from a CSV file on the local machine.

# Load the raw flights table from a CSV file on the local disk.
# NOTE(review): the path is absolute and specific to one machine.
data <- read.csv(file = "/Users/WN/Desktop/flights.csv")

# Show the table exactly as it was imported.
data
##         X     X.1 Los.Angeles Phoenix San.Diego San.Fransisco Seattle
## 1  ALASKA on time         497     221       212           503   1,841
## 2         delayed          62      12        20           102     305
## 3                          NA                NA            NA        
## 4 AM WEST on time         694   4,840       383           320     201
## 5         delayed         117     415        65           129      61

I don’t know what the numbers under each “City” column represent, so I’m going to move forward with the assumption that they are arrival delays in minutes.

# Reshape from wide to long: the five city columns are stacked into a single
# name column (Arrivals) and a single value column (Arr_Delay).
# NOTE(review): gather() is superseded by pivot_longer(); it is kept here
# because later steps rely on the row order that gather() produces.
data <- gather(
  data,
  key = "Arrivals",
  value = "Arr_Delay",
  "Los.Angeles", "Phoenix", "San.Diego", "San.Fransisco", "Seattle"
)

data
##          X     X.1      Arrivals Arr_Delay
## 1   ALASKA on time   Los.Angeles       497
## 2          delayed   Los.Angeles        62
## 3                    Los.Angeles      <NA>
## 4  AM WEST on time   Los.Angeles       694
## 5          delayed   Los.Angeles       117
## 6   ALASKA on time       Phoenix       221
## 7          delayed       Phoenix        12
## 8                        Phoenix          
## 9  AM WEST on time       Phoenix     4,840
## 10         delayed       Phoenix       415
## 11  ALASKA on time     San.Diego       212
## 12         delayed     San.Diego        20
## 13                     San.Diego      <NA>
## 14 AM WEST on time     San.Diego       383
## 15         delayed     San.Diego        65
## 16  ALASKA on time San.Fransisco       503
## 17         delayed San.Fransisco       102
## 18                 San.Fransisco      <NA>
## 19 AM WEST on time San.Fransisco       320
## 20         delayed San.Fransisco       129
## 21  ALASKA on time       Seattle     1,841
## 22         delayed       Seattle       305
## 23                       Seattle          
## 24 AM WEST on time       Seattle       201
## 25         delayed       Seattle        61

Dropping rows with NA cells. Rows whose Arr_Delay cell is an empty string (rather than NA) survive this step and are filtered out in the next one.

# Remove every row that holds at least one NA. Empty strings are not NA, so
# rows with a blank Arr_Delay are still present after this call.
data <- drop_na(data)

data
##          X     X.1      Arrivals Arr_Delay
## 1   ALASKA on time   Los.Angeles       497
## 2          delayed   Los.Angeles        62
## 3  AM WEST on time   Los.Angeles       694
## 4          delayed   Los.Angeles       117
## 5   ALASKA on time       Phoenix       221
## 6          delayed       Phoenix        12
## 7                        Phoenix          
## 8  AM WEST on time       Phoenix     4,840
## 9          delayed       Phoenix       415
## 10  ALASKA on time     San.Diego       212
## 11         delayed     San.Diego        20
## 12 AM WEST on time     San.Diego       383
## 13         delayed     San.Diego        65
## 14  ALASKA on time San.Fransisco       503
## 15         delayed San.Fransisco       102
## 16 AM WEST on time San.Fransisco       320
## 17         delayed San.Fransisco       129
## 18  ALASKA on time       Seattle     1,841
## 19         delayed       Seattle       305
## 20                       Seattle          
## 21 AM WEST on time       Seattle       201
## 22         delayed       Seattle        61

Removing the remaining rows with an empty Arr_Delay cell, filling in the missing cells of the airline column, renaming the first two columns of the dataframe, and converting Arr_Delay to numeric.

# Drop the rows whose Arr_Delay is an empty string. These are left over from
# the blank separator row; drop_na() does not remove them because "" is not NA.
data <- data %>% filter(Arr_Delay != "")

# The airline name appears only on the first row of each on time/delayed pair.
# Mark the blank cells as missing and carry the last seen name downward, so the
# fill is driven by which cells are actually blank rather than by row parity.
data <- data %>%
  mutate(X = na_if(X, "")) %>%
  fill(X, .direction = "down")

data
##          X     X.1      Arrivals Arr_Delay
## 1   ALASKA on time   Los.Angeles       497
## 2   ALASKA delayed   Los.Angeles        62
## 3  AM WEST on time   Los.Angeles       694
## 4  AM WEST delayed   Los.Angeles       117
## 5   ALASKA on time       Phoenix       221
## 6   ALASKA delayed       Phoenix        12
## 7  AM WEST on time       Phoenix     4,840
## 8  AM WEST delayed       Phoenix       415
## 9   ALASKA on time     San.Diego       212
## 10  ALASKA delayed     San.Diego        20
## 11 AM WEST on time     San.Diego       383
## 12 AM WEST delayed     San.Diego        65
## 13  ALASKA on time San.Fransisco       503
## 14  ALASKA delayed San.Fransisco       102
## 15 AM WEST on time San.Fransisco       320
## 16 AM WEST delayed San.Fransisco       129
## 17  ALASKA on time       Seattle     1,841
## 18  ALASKA delayed       Seattle       305
## 19 AM WEST on time       Seattle       201
## 20 AM WEST delayed       Seattle        61
# Give the two unnamed leading columns descriptive names.
data <- data %>% rename(Airlines = X, Status = X.1)

# Strip every thousands separator before converting to numeric. gsub() is used
# rather than sub() because sub() removes only the first comma, so a value such
# as "1,234,567" would otherwise be converted to NA.
data$Arr_Delay <- as.numeric(gsub(",", "", data$Arr_Delay, fixed = TRUE))

data
##    Airlines  Status      Arrivals Arr_Delay
## 1    ALASKA on time   Los.Angeles       497
## 2    ALASKA delayed   Los.Angeles        62
## 3   AM WEST on time   Los.Angeles       694
## 4   AM WEST delayed   Los.Angeles       117
## 5    ALASKA on time       Phoenix       221
## 6    ALASKA delayed       Phoenix        12
## 7   AM WEST on time       Phoenix      4840
## 8   AM WEST delayed       Phoenix       415
## 9    ALASKA on time     San.Diego       212
## 10   ALASKA delayed     San.Diego        20
## 11  AM WEST on time     San.Diego       383
## 12  AM WEST delayed     San.Diego        65
## 13   ALASKA on time San.Fransisco       503
## 14   ALASKA delayed San.Fransisco       102
## 15  AM WEST on time San.Fransisco       320
## 16  AM WEST delayed San.Fransisco       129
## 17   ALASKA on time       Seattle      1841
## 18   ALASKA delayed       Seattle       305
## 19  AM WEST on time       Seattle       201
## 20  AM WEST delayed       Seattle        61

Calculating the mean of Arr_Delay for each of the two airlines. Note that this mean is taken over both the “on time” and the “delayed” rows. Judging from the means, we can conclude that AM WEST has a higher average when it comes to arrival delays in minutes.

# One row per airline holding the mean of Arr_Delay. No filter on Status is
# applied, so both the "on time" and the "delayed" rows enter the average.
summarise(
  group_by(data, Airlines),
  mean_delay = mean(Arr_Delay)
)
## # A tibble: 2 × 2
##   Airlines mean_delay
##   <chr>         <dbl>
## 1 ALASKA         378.
## 2 AM WEST        722.