Getting data from local machine
# Load the raw flights table from a CSV on the local machine, then print it.
data <- read.csv(file = "/Users/WN/Desktop/flights.csv")
data
## X X.1 Los.Angeles Phoenix San.Diego San.Fransisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA
## 4 AM WEST on time 694 4,840 383 320 201
## 5 delayed 117 415 65 129 61
I don’t have any idea what the numbers represent under each “City”
column, but I’m going to move forward with the assumption that they are
for arrival delays in minutes.
# Reshape from wide to long: the five city columns are stacked into a single
# pair of columns, "Arrivals" (the city name) and "Arr_Delay" (the cell value).
# The remaining columns (X, X.1) are repeated for every city.
data <- gather(
  data,
  key = "Arrivals",
  value = "Arr_Delay",
  "Los.Angeles", "Phoenix", "San.Diego", "San.Fransisco", "Seattle"
)
data
## X X.1 Arrivals Arr_Delay
## 1 ALASKA on time Los.Angeles 497
## 2 delayed Los.Angeles 62
## 3 Los.Angeles <NA>
## 4 AM WEST on time Los.Angeles 694
## 5 delayed Los.Angeles 117
## 6 ALASKA on time Phoenix 221
## 7 delayed Phoenix 12
## 8 Phoenix
## 9 AM WEST on time Phoenix 4,840
## 10 delayed Phoenix 415
## 11 ALASKA on time San.Diego 212
## 12 delayed San.Diego 20
## 13 San.Diego <NA>
## 14 AM WEST on time San.Diego 383
## 15 delayed San.Diego 65
## 16 ALASKA on time San.Fransisco 503
## 17 delayed San.Fransisco 102
## 18 San.Fransisco <NA>
## 19 AM WEST on time San.Fransisco 320
## 20 delayed San.Fransisco 129
## 21 ALASKA on time Seattle 1,841
## 22 delayed Seattle 305
## 23 Seattle
## 24 AM WEST on time Seattle 201
## 25 delayed Seattle 61
Dropping rows with NA cells. Rows whose Arr_Delay cell is an empty string
(rather than NA) are not removed by drop_na(); they are filtered out in the next step.
# Remove every row that holds an NA in any column. Cells that are empty
# strings are not NA, so those rows are still present after this step.
data <- drop_na(data)
data
## X X.1 Arrivals Arr_Delay
## 1 ALASKA on time Los.Angeles 497
## 2 delayed Los.Angeles 62
## 3 AM WEST on time Los.Angeles 694
## 4 delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 delayed Phoenix 12
## 7 Phoenix
## 8 AM WEST on time Phoenix 4,840
## 9 delayed Phoenix 415
## 10 ALASKA on time San.Diego 212
## 11 delayed San.Diego 20
## 12 AM WEST on time San.Diego 383
## 13 delayed San.Diego 65
## 14 ALASKA on time San.Fransisco 503
## 15 delayed San.Fransisco 102
## 16 AM WEST on time San.Fransisco 320
## 17 delayed San.Fransisco 129
## 18 ALASKA on time Seattle 1,841
## 19 delayed Seattle 305
## 20 Seattle
## 21 AM WEST on time Seattle 201
## 22 delayed Seattle 61
Removing the rows with an empty Arr_Delay cell, filling in the missing cells
for the airline column, renaming the first two columns of the data frame, and
converting Arr_Delay to numeric.
# Drop the spacer rows whose Arr_Delay is an empty string; drop_na() only
# removes true NA values, so these rows are still present here.
data <- data %>%
  filter(Arr_Delay != "")

# Carry each airline name down into the blank cells beneath it. Blank strings
# are first turned into NA so that fill() can propagate the last seen name.
# Unlike copying "row - 1" into every even row, this does not depend on the
# rows strictly alternating between a named row and a blank row.
data <- data %>%
  mutate(X = na_if(X, "")) %>%
  fill(X, .direction = "down")
data
## X X.1 Arrivals Arr_Delay
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AM WEST on time Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4,840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AM WEST on time San.Diego 383
## 12 AM WEST delayed San.Diego 65
## 13 ALASKA on time San.Fransisco 503
## 14 ALASKA delayed San.Fransisco 102
## 15 AM WEST on time San.Fransisco 320
## 16 AM WEST delayed San.Fransisco 129
## 17 ALASKA on time Seattle 1,841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
# Give the first two columns descriptive names.
data <- data %>%
  rename(Airlines = X, Status = X.1)

# Strip every thousands separator before converting to numeric. gsub() is
# used instead of sub() because sub() removes only the first comma, which
# would leave a value such as "1,234,567" unparseable (NA).
data$Arr_Delay <- as.numeric(gsub(",", "", data$Arr_Delay, fixed = TRUE))
data
## Airlines Status Arrivals Arr_Delay
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AM WEST on time Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AM WEST on time San.Diego 383
## 12 AM WEST delayed San.Diego 65
## 13 ALASKA on time San.Fransisco 503
## 14 ALASKA delayed San.Fransisco 102
## 15 AM WEST on time San.Fransisco 320
## 16 AM WEST delayed San.Fransisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
Calculating the mean of Arr_Delay for each of the two airlines. Judging
from the mean, we can conclude that AM WEST has a higher average when
it comes to arrival delays in minutes. Note that this mean pools the
"on time" and "delayed" rows of each airline.
# Mean of Arr_Delay per airline. Rows are grouped by Airlines only, so the
# "on time" and "delayed" rows of each airline are averaged together.
summarise(
  group_by(data, Airlines),
  mean_delay = mean(Arr_Delay)
)
## # A tibble: 2 × 2
## Airlines mean_delay
## <chr> <dbl>
## 1 ALASKA 378.
## 2 AM WEST 722.