Assignment5

Load the required Library

library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Read data from Github

original_df <- read.csv("https://raw.githubusercontent.com/topkelama/DATA607/207dae685eaed5546a9aa9315b22d50b954efaca/airlines.csv", header = TRUE, na.strings = "")

overview of the dataframe

glimpse(original_df)

## Rows: 5
## Columns: 7
## $ X             <chr> "ALASKA", NA, NA, "AM WEST", NA
## $ X.1           <chr> "on time", "delayed", NA, "on time", "delayed"
## $ Los.Angeles   <int> 497, 62, NA, 694, 117
## $ Phoenix       <int> 221, 12, NA, 4840, 415
## $ San.Diego     <int> 212, 20, NA, 383, 65
## $ San.Francisco <int> 503, 102, NA, 320, 129
## $ Seattle       <int> 1841, 305, NA, 301, 61

Rename 1st and 2nd columns as the existing names do not make sense.

#made some changes on columns names along with 1st and 2nd columns
names(original_df) <- c("Airline", "Arrival_Status", "Los_Angeles", "Phoenix", "San_Diego", "San_Francisco", "Seattle")

dataframe before cleaning and tidying

head(original_df)

##   Airline Arrival_Status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1  ALASKA        on time         497     221       212           503    1841
## 2    <NA>        delayed          62      12        20           102     305
## 3    <NA>           <NA>          NA      NA        NA            NA      NA
## 4 AM WEST        on time         694    4840       383           320     301
## 5    <NA>        delayed         117     415        65           129      61

Remove the row if all columns in a row has NA

original_df <- original_df %>%
    # this will keep if any column in a row has a NA otherwise remove
  filter(rowSums(is.na(.)) != ncol(.))
original_df

##   Airline Arrival_Status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1  ALASKA        on time         497     221       212           503    1841
## 2    <NA>        delayed          62      12        20           102     305
## 3 AM WEST        on time         694    4840       383           320     301
## 4    <NA>        delayed         117     415        65           129      61

Change the dataframe to long format

long_df <- original_df %>%
  pivot_longer(
    cols = -c(Airline, Arrival_Status),
    names_to = "Destination",
    values_to = "Flight_Count"
  )
#long_df

Fill NA value with respective Airline values

long_df <- long_df %>% mutate(Airline = as.character(na_if(Airline, 'NA'))) %>%  fill(Airline, .direction = 'down')
#long_df

Transform the value of Arrival_status into 2 new columns for analysis.

airline_data <- long_df %>% spread(Arrival_Status, Flight_Count)

# 2 New columns

colnames(airline_data) [3] <- "Delayed"
colnames(airline_data) [4] <- "On_Time"
#airline_data

Perform analysis of delayed arrival for two airlines.

summary_data <- airline_data %>%
  group_by(Airline) %>%
  summarise(Delayed_mean = mean(Delayed, na.rm = TRUE),
            Delayed_median = median(Delayed, na.rm = TRUE),
            Delayed_sd = sd(Delayed, na.rm = TRUE),
            Delayed_min = min(Delayed, na.rm = TRUE),
            Delayed_max = max(Delayed, na.rm = TRUE))
summary_data

## # A tibble: 2 × 6
##   Airline Delayed_mean Delayed_median Delayed_sd Delayed_min Delayed_max
##   <chr>          <dbl>          <int>      <dbl>       <int>       <int>
## 1 ALASKA          100.             62       120.          12         305
## 2 AM WEST         157.            117       147.          61         415

Total flight count grouping by Ariline

# Group by Airline and summarise the Flight_Count column
total_flight_counts <- long_df %>%
  group_by(Airline) %>%
  summarise(Total_Flight_Count = sum(Flight_Count, na.rm = TRUE))

# Print the result
print(total_flight_counts)

## # A tibble: 2 × 2
##   Airline Total_Flight_Count
##   <chr>                <int>
## 1 ALASKA                3775
## 2 AM WEST               7325

Conclusion

To conclude, on average, AM WEST has 157.4 delayed flights, which is approximately 57.14% more than ALASKA’s average of 100.2 delayed flights. However, AM WEST operated 7325 flights in total, which is approximately 93.44% more than ALASKA’s 3775 flights. It’s important to note that these percentages are based on the data obtained and may vary depending on the time period or sample size. Therefore, it’s essential to consider the reliability of the data sources when drawing conclusions.

Assignment5

Chhiring Lama

2024-02-19