#install.packages('dplyr')
#install.packages('tidyr')
#load dplyr and tidyr libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
When working with data, you should expect to spend a good amount of time in the clean-up process, but it is not always ‘messy’ or unreadable. For example, data can still be organized in a data-frame in a way that is readable, but at the same time it may not be useful in such presented formats. In these cases, we may have to transpose the data-frame and re-organize to fit our needs.
#instantiate columns
Los_Angeles <- c(497, 62, NA, 694, 117)
Phoenix <- c(221, 12, NA, 4840, 415)
San_Diego <- c(212, 20, NA, 383, 65)
San_Francisco <- c(503, 102, NA, 320, 129)
Seattle <- c(1841, 305, NA, 201, 61)
Timing <- c('on_time', 'delayed', NA, 'on_time', 'delayed')
Airline <- c('Alaska', NA, NA, 'AM West', NA)
#create data-frame
flights_df <- data.frame(Airline, Timing, Los_Angeles, Phoenix, San_Diego, San_Francisco, Seattle)
#write to csv
write.csv(flights_df, 'flights.csv')
flights_csv <- read.csv('flights.csv')
flights_csv
## X Airline Timing Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 1 Alaska on_time 497 221 212 503 1841
## 2 2 <NA> delayed 62 12 20 102 305
## 3 3 <NA> <NA> NA NA NA NA NA
## 4 4 AM West on_time 694 4840 383 320 201
## 5 5 <NA> delayed 117 415 65 129 61
The data is loaded into a data-frame below, however, we need to adjust some data values.
#only use rows with data
flights <- flights_csv[c(1, 2, 4, 5), ]
flights
## X Airline Timing Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 1 Alaska on_time 497 221 212 503 1841
## 2 2 <NA> delayed 62 12 20 102 305
## 4 4 AM West on_time 694 4840 383 320 201
## 5 5 <NA> delayed 117 415 65 129 61
#fill in missing values
flights$Airline[2] = 'Alaska'
flights$Airline[4] = 'AM West'
flights
## X Airline Timing Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 1 Alaska on_time 497 221 212 503 1841
## 2 2 Alaska delayed 62 12 20 102 305
## 4 4 AM West on_time 694 4840 383 320 201
## 5 5 AM West delayed 117 415 65 129 61
Now that all of the required data is loaded, we need to transform what we have so that it is readable to our liking. In this case, this means pivoting the matrix and aggregating values and columns.
#pivot the data-frame to count number of (on time) vs (delayed) flights by airline and a new city column
t_flights1 <- pivot_longer(flights,
cols = c(Los_Angeles, Phoenix, San_Diego, San_Francisco, Seattle),
names_to = "City",
values_to = "Flight_Count")
t_flights1
## # A tibble: 20 × 5
## X Airline Timing City Flight_Count
## <int> <chr> <chr> <chr> <int>
## 1 1 Alaska on_time Los_Angeles 497
## 2 1 Alaska on_time Phoenix 221
## 3 1 Alaska on_time San_Diego 212
## 4 1 Alaska on_time San_Francisco 503
## 5 1 Alaska on_time Seattle 1841
## 6 2 Alaska delayed Los_Angeles 62
## 7 2 Alaska delayed Phoenix 12
## 8 2 Alaska delayed San_Diego 20
## 9 2 Alaska delayed San_Francisco 102
## 10 2 Alaska delayed Seattle 305
## 11 4 AM West on_time Los_Angeles 694
## 12 4 AM West on_time Phoenix 4840
## 13 4 AM West on_time San_Diego 383
## 14 4 AM West on_time San_Francisco 320
## 15 4 AM West on_time Seattle 201
## 16 5 AM West delayed Los_Angeles 117
## 17 5 AM West delayed Phoenix 415
## 18 5 AM West delayed San_Diego 65
## 19 5 AM West delayed San_Francisco 129
## 20 5 AM West delayed Seattle 61
Now that we’ve pivoted our matrix, we can continue to split up the ‘Timing’ column to match the destination city and include the ‘delayed’ vs ‘on_time’ on the same row, which makes it easier to read.
#Aggregate data by city and airline
aggregated_flights <- t_flights1 %>%
group_by(Airline, City, Timing) %>%
summarize(Flight_Count = sum(Flight_Count, na.rm = TRUE), .groups = 'drop')
aggregated_flights
## # A tibble: 20 × 4
## Airline City Timing Flight_Count
## <chr> <chr> <chr> <int>
## 1 AM West Los_Angeles delayed 117
## 2 AM West Los_Angeles on_time 694
## 3 AM West Phoenix delayed 415
## 4 AM West Phoenix on_time 4840
## 5 AM West San_Diego delayed 65
## 6 AM West San_Diego on_time 383
## 7 AM West San_Francisco delayed 129
## 8 AM West San_Francisco on_time 320
## 9 AM West Seattle delayed 61
## 10 AM West Seattle on_time 201
## 11 Alaska Los_Angeles delayed 62
## 12 Alaska Los_Angeles on_time 497
## 13 Alaska Phoenix delayed 12
## 14 Alaska Phoenix on_time 221
## 15 Alaska San_Diego delayed 20
## 16 Alaska San_Diego on_time 212
## 17 Alaska San_Francisco delayed 102
## 18 Alaska San_Francisco on_time 503
## 19 Alaska Seattle delayed 305
## 20 Alaska Seattle on_time 1841
#present corresponding values for (on time) and (delayed) side by side
wide_flights <- aggregated_flights %>%
pivot_wider(names_from = Timing,
values_from = Flight_Count)
wide_flights
## # A tibble: 10 × 4
## Airline City delayed on_time
## <chr> <chr> <int> <int>
## 1 AM West Los_Angeles 117 694
## 2 AM West Phoenix 415 4840
## 3 AM West San_Diego 65 383
## 4 AM West San_Francisco 129 320
## 5 AM West Seattle 61 201
## 6 Alaska Los_Angeles 62 497
## 7 Alaska Phoenix 12 221
## 8 Alaska San_Diego 20 212
## 9 Alaska San_Francisco 102 503
## 10 Alaska Seattle 305 1841
Now that we have data that is readable to our liking, we can create new columns to use for our analysis. Here, I created a column for the percentage of delayed flights relative to all flights at the given destination city and corresponding to a specific airline.
#create new column to show percentage of delayed flights for each row
wide_flights$pct_delayed <- wide_flights$delayed / (wide_flights$delayed + wide_flights$on_time)
wide_flights
## # A tibble: 10 × 5
## Airline City delayed on_time pct_delayed
## <chr> <chr> <int> <int> <dbl>
## 1 AM West Los_Angeles 117 694 0.144
## 2 AM West Phoenix 415 4840 0.0790
## 3 AM West San_Diego 65 383 0.145
## 4 AM West San_Francisco 129 320 0.287
## 5 AM West Seattle 61 201 0.233
## 6 Alaska Los_Angeles 62 497 0.111
## 7 Alaska Phoenix 12 221 0.0515
## 8 Alaska San_Diego 20 212 0.0862
## 9 Alaska San_Francisco 102 503 0.169
## 10 Alaska Seattle 305 1841 0.142
I decided to create two more columns for the mean of delayed flights as well as the total number of flights for each airline, regardless of destination. This is to compute a percentage for the mean in the same way that I computed a percentage for the individual cities.
mAir_delay <- wide_flights %>%
group_by(Airline) %>%
summarize(mean_delayed = mean(delayed))
mAir_delay
## # A tibble: 2 × 2
## Airline mean_delayed
## <chr> <dbl>
## 1 AM West 157.
## 2 Alaska 100.
mDelay_AMWest <- mAir_delay$mean_delayed[1]
mDelay_Alaska <- mAir_delay$mean_delayed[2]
wide_flights$mean_delay <- NA
wide_flights
## # A tibble: 10 × 6
## Airline City delayed on_time pct_delayed mean_delay
## <chr> <chr> <int> <int> <dbl> <lgl>
## 1 AM West Los_Angeles 117 694 0.144 NA
## 2 AM West Phoenix 415 4840 0.0790 NA
## 3 AM West San_Diego 65 383 0.145 NA
## 4 AM West San_Francisco 129 320 0.287 NA
## 5 AM West Seattle 61 201 0.233 NA
## 6 Alaska Los_Angeles 62 497 0.111 NA
## 7 Alaska Phoenix 12 221 0.0515 NA
## 8 Alaska San_Diego 20 212 0.0862 NA
## 9 Alaska San_Francisco 102 503 0.169 NA
## 10 Alaska Seattle 305 1841 0.142 NA
wide_flights$mean_delay <- ifelse(wide_flights$Airline == "AM West", mDelay_AMWest, wide_flights$mean_delay)
wide_flights$mean_delay <- ifelse(wide_flights$Airline == "Alaska", mDelay_Alaska, wide_flights$mean_delay)
wide_flights
## # A tibble: 10 × 6
## Airline City delayed on_time pct_delayed mean_delay
## <chr> <chr> <int> <int> <dbl> <dbl>
## 1 AM West Los_Angeles 117 694 0.144 157.
## 2 AM West Phoenix 415 4840 0.0790 157.
## 3 AM West San_Diego 65 383 0.145 157.
## 4 AM West San_Francisco 129 320 0.287 157.
## 5 AM West Seattle 61 201 0.233 157.
## 6 Alaska Los_Angeles 62 497 0.111 100.
## 7 Alaska Phoenix 12 221 0.0515 100.
## 8 Alaska San_Diego 20 212 0.0862 100.
## 9 Alaska San_Francisco 102 503 0.169 100.
## 10 Alaska Seattle 305 1841 0.142 100.
mAir_total <- wide_flights %>%
group_by(Airline) %>%
summarize(total = sum(delayed, on_time))
mAir_total
## # A tibble: 2 × 2
## Airline total
## <chr> <int>
## 1 AM West 7225
## 2 Alaska 3775
mTot_AMWest <- mAir_total$total[1]
mTot_Alaska <- mAir_total$total[2]
wide_flights$total <- NA
wide_flights
## # A tibble: 10 × 7
## Airline City delayed on_time pct_delayed mean_delay total
## <chr> <chr> <int> <int> <dbl> <dbl> <lgl>
## 1 AM West Los_Angeles 117 694 0.144 157. NA
## 2 AM West Phoenix 415 4840 0.0790 157. NA
## 3 AM West San_Diego 65 383 0.145 157. NA
## 4 AM West San_Francisco 129 320 0.287 157. NA
## 5 AM West Seattle 61 201 0.233 157. NA
## 6 Alaska Los_Angeles 62 497 0.111 100. NA
## 7 Alaska Phoenix 12 221 0.0515 100. NA
## 8 Alaska San_Diego 20 212 0.0862 100. NA
## 9 Alaska San_Francisco 102 503 0.169 100. NA
## 10 Alaska Seattle 305 1841 0.142 100. NA
wide_flights$total <- ifelse(wide_flights$Airline == "AM West", mTot_AMWest, wide_flights$total)
wide_flights$total <- ifelse(wide_flights$Airline == "Alaska", mTot_Alaska, wide_flights$total)
wide_flights
## # A tibble: 10 × 7
## Airline City delayed on_time pct_delayed mean_delay total
## <chr> <chr> <int> <int> <dbl> <dbl> <int>
## 1 AM West Los_Angeles 117 694 0.144 157. 7225
## 2 AM West Phoenix 415 4840 0.0790 157. 7225
## 3 AM West San_Diego 65 383 0.145 157. 7225
## 4 AM West San_Francisco 129 320 0.287 157. 7225
## 5 AM West Seattle 61 201 0.233 157. 7225
## 6 Alaska Los_Angeles 62 497 0.111 100. 3775
## 7 Alaska Phoenix 12 221 0.0515 100. 3775
## 8 Alaska San_Diego 20 212 0.0862 100. 3775
## 9 Alaska San_Francisco 102 503 0.169 100. 3775
## 10 Alaska Seattle 305 1841 0.142 100. 3775
wide_flights$pct_mean_delayed <- NA
wide_flights
## # A tibble: 10 × 8
## Airline City delayed on_time pct_delayed mean_delay total pct_mean_delayed
## <chr> <chr> <int> <int> <dbl> <dbl> <int> <lgl>
## 1 AM West Los_An… 117 694 0.144 157. 7225 NA
## 2 AM West Phoenix 415 4840 0.0790 157. 7225 NA
## 3 AM West San_Di… 65 383 0.145 157. 7225 NA
## 4 AM West San_Fr… 129 320 0.287 157. 7225 NA
## 5 AM West Seattle 61 201 0.233 157. 7225 NA
## 6 Alaska Los_An… 62 497 0.111 100. 3775 NA
## 7 Alaska Phoenix 12 221 0.0515 100. 3775 NA
## 8 Alaska San_Di… 20 212 0.0862 100. 3775 NA
## 9 Alaska San_Fr… 102 503 0.169 100. 3775 NA
## 10 Alaska Seattle 305 1841 0.142 100. 3775 NA
wide_flights$pct_mean_delayed <- wide_flights$mean_delay / wide_flights$total
wide_flights
## # A tibble: 10 × 8
## Airline City delayed on_time pct_delayed mean_delay total pct_mean_delayed
## <chr> <chr> <int> <int> <dbl> <dbl> <int> <dbl>
## 1 AM West Los_An… 117 694 0.144 157. 7225 0.0218
## 2 AM West Phoenix 415 4840 0.0790 157. 7225 0.0218
## 3 AM West San_Di… 65 383 0.145 157. 7225 0.0218
## 4 AM West San_Fr… 129 320 0.287 157. 7225 0.0218
## 5 AM West Seattle 61 201 0.233 157. 7225 0.0218
## 6 Alaska Los_An… 62 497 0.111 100. 3775 0.0265
## 7 Alaska Phoenix 12 221 0.0515 100. 3775 0.0265
## 8 Alaska San_Di… 20 212 0.0862 100. 3775 0.0265
## 9 Alaska San_Fr… 102 503 0.169 100. 3775 0.0265
## 10 Alaska Seattle 305 1841 0.142 100. 3775 0.0265
In my analysis, I decided to find the sample standard deviation based on the percentage points that I computed for the individual cities and the percentages of the means. I did this to compare the delays according variation in the data set.
delayed_df <- data.frame(Airline = wide_flights$Airline,
City = wide_flights$City,
Delayed = wide_flights$delayed,
Pct_Delay = wide_flights$pct_delayed,
Mean_Pct_Delay = wide_flights$pct_mean_delayed,
Pct_SD = round(sqrt(((wide_flights$pct_delayed - wide_flights$pct_mean_delayed)^2) / (wide_flights$total - 1)), 5),
Avg_Pct_SD = NA
)
delayed_mSD <- delayed_df %>%
group_by(Airline) %>%
summarize(dmean_SD = mean(Pct_SD))
delayed_mSD
## # A tibble: 2 × 2
## Airline dmean_SD
## <chr> <dbl>
## 1 AM West 0.00183
## 2 Alaska 0.00139
d_SD_AMWest <- delayed_mSD$dmean_SD[1]
d_SD_Alaska <- delayed_mSD$dmean_SD[2]
delayed_df$Avg_Pct_SD <- ifelse(wide_flights$Airline == "AM West", d_SD_AMWest, delayed_df$Avg_Pct_SD)
delayed_df$Avg_Pct_SD <- ifelse(wide_flights$Airline == "Alaska", d_SD_Alaska, delayed_df$Avg_Pct_SD)
delayed_df
## Airline City Delayed Pct_Delay Mean_Pct_Delay Pct_SD Avg_Pct_SD
## 1 AM West Los_Angeles 117 0.14426634 0.02178547 0.00144 0.001832
## 2 AM West Phoenix 415 0.07897241 0.02178547 0.00067 0.001832
## 3 AM West San_Diego 65 0.14508929 0.02178547 0.00145 0.001832
## 4 AM West San_Francisco 129 0.28730512 0.02178547 0.00312 0.001832
## 5 AM West Seattle 61 0.23282443 0.02178547 0.00248 0.001832
## 6 Alaska Los_Angeles 62 0.11091234 0.02654305 0.00137 0.001388
## 7 Alaska Phoenix 12 0.05150215 0.02654305 0.00041 0.001388
## 8 Alaska San_Diego 20 0.08620690 0.02654305 0.00097 0.001388
## 9 Alaska San_Francisco 102 0.16859504 0.02654305 0.00231 0.001388
## 10 Alaska Seattle 305 0.14212488 0.02654305 0.00188 0.001388
#write to csv
write.csv(delayed_df, 'flights.csv')
flights_csv <- read.csv('flights.csv')
flights_csv
## X Airline City Delayed Pct_Delay Mean_Pct_Delay Pct_SD
## 1 1 AM West Los_Angeles 117 0.14426634 0.02178547 0.00144
## 2 2 AM West Phoenix 415 0.07897241 0.02178547 0.00067
## 3 3 AM West San_Diego 65 0.14508929 0.02178547 0.00145
## 4 4 AM West San_Francisco 129 0.28730512 0.02178547 0.00312
## 5 5 AM West Seattle 61 0.23282443 0.02178547 0.00248
## 6 6 Alaska Los_Angeles 62 0.11091234 0.02654305 0.00137
## 7 7 Alaska Phoenix 12 0.05150215 0.02654305 0.00041
## 8 8 Alaska San_Diego 20 0.08620690 0.02654305 0.00097
## 9 9 Alaska San_Francisco 102 0.16859504 0.02654305 0.00231
## 10 10 Alaska Seattle 305 0.14212488 0.02654305 0.00188
## Avg_Pct_SD
## 1 0.001832
## 2 0.001832
## 3 0.001832
## 4 0.001832
## 5 0.001832
## 6 0.001388
## 7 0.001388
## 8 0.001388
## 9 0.001388
## 10 0.001388
My analysis focused mainly on comparing the spread of the data according to airline. In comparing the standard deviations of delayed flight percentages for the two airlines, we observe that AM West has a standard deviation of 0.001832, while Alaska has a lower standard deviation of 0.001388. This indicates that Alaska has more consistent performance in terms of flight delays, with less variability in the percentage of delayed flights compared to AM West. Although both standard deviations are quite small, suggesting that delays for both airlines are fairly stable, Alaska’s slightly smaller standard deviation suggests greater reliability in maintaining a predictable schedule despite having a slightly greater percentage for the mean of flight delays.