Load the required Library
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Read data from Github
original_df <- read.csv("https://raw.githubusercontent.com/topkelama/DATA607/207dae685eaed5546a9aa9315b22d50b954efaca/airlines.csv", header = TRUE, na.strings = "")
overview of the dataframe
glimpse(original_df)
## Rows: 5
## Columns: 7
## $ X <chr> "ALASKA", NA, NA, "AM WEST", NA
## $ X.1 <chr> "on time", "delayed", NA, "on time", "delayed"
## $ Los.Angeles <int> 497, 62, NA, 694, 117
## $ Phoenix <int> 221, 12, NA, 4840, 415
## $ San.Diego <int> 212, 20, NA, 383, 65
## $ San.Francisco <int> 503, 102, NA, 320, 129
## $ Seattle <int> 1841, 305, NA, 301, 61
Rename 1st and 2nd columns as the existing names do not make sense.
#made some changes on columns names along with 1st and 2nd columns
names(original_df) <- c("Airline", "Arrival_Status", "Los_Angeles", "Phoenix", "San_Diego", "San_Francisco", "Seattle")
dataframe before cleaning and tidying
head(original_df)
## Airline Arrival_Status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 301
## 5 <NA> delayed 117 415 65 129 61
Remove the row if all columns in a row has NA
original_df <- original_df %>%
# this will keep if any column in a row has a NA otherwise remove
filter(rowSums(is.na(.)) != ncol(.))
original_df
## Airline Arrival_Status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 301
## 4 <NA> delayed 117 415 65 129 61
Change the dataframe to long format
long_df <- original_df %>%
pivot_longer(
cols = -c(Airline, Arrival_Status),
names_to = "Destination",
values_to = "Flight_Count"
)
#long_df
Fill NA value with respective Airline values
long_df <- long_df %>% mutate(Airline = as.character(na_if(Airline, 'NA'))) %>% fill(Airline, .direction = 'down')
#long_df
Transform the value of Arrival_status into 2 new columns for analysis.
airline_data <- long_df %>% spread(Arrival_Status, Flight_Count)
# 2 New columns
colnames(airline_data) [3] <- "Delayed"
colnames(airline_data) [4] <- "On_Time"
#airline_data
Perform analysis of delayed arrival for two airlines.
summary_data <- airline_data %>%
group_by(Airline) %>%
summarise(Delayed_mean = mean(Delayed, na.rm = TRUE),
Delayed_median = median(Delayed, na.rm = TRUE),
Delayed_sd = sd(Delayed, na.rm = TRUE),
Delayed_min = min(Delayed, na.rm = TRUE),
Delayed_max = max(Delayed, na.rm = TRUE))
summary_data
## # A tibble: 2 × 6
## Airline Delayed_mean Delayed_median Delayed_sd Delayed_min Delayed_max
## <chr> <dbl> <int> <dbl> <int> <int>
## 1 ALASKA 100. 62 120. 12 305
## 2 AM WEST 157. 117 147. 61 415
Total flight count grouping by Ariline
# Group by Airline and summarise the Flight_Count column
total_flight_counts <- long_df %>%
group_by(Airline) %>%
summarise(Total_Flight_Count = sum(Flight_Count, na.rm = TRUE))
# Print the result
print(total_flight_counts)
## # A tibble: 2 × 2
## Airline Total_Flight_Count
## <chr> <int>
## 1 ALASKA 3775
## 2 AM WEST 7325
Conclusion
To conclude, on average, AM WEST has 157.4 delayed flights, which is approximately 57.14% more than ALASKA’s average of 100.2 delayed flights. However, AM WEST operated 7325 flights in total, which is approximately 93.44% more than ALASKA’s 3775 flights. It’s important to note that these percentages are based on the data obtained and may vary depending on the time period or sample size. Therefore, it’s essential to consider the reliability of the data sources when drawing conclusions.