Loading in the data from an excel sheet which I converted as csv. The key part of this assignment was to pass the blanks in as "NA".

flights<-read.csv("https://raw.githubusercontent.com/Sangeetha-007/R-Practice/master/607/Assignments/Assignment%204/arrival%20delays%20.csv",na.strings=c("","NA"))
print(flights)
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  Alaska on time         497     221       212           503    1841
## 2    <NA> delayed          62      12        20           102     305
## 3    <NA>    <NA>          NA      NA        NA            NA      NA
## 4 AM West on time         694    4840       383           320     201
## 5    <NA> delayed         117     415        65           129      61

I am listing the column names.

colnames(flights)
## [1] "X"             "X.1"           "Los.Angeles"   "Phoenix"      
## [5] "San.Diego"     "San.Francisco" "Seattle"
la <-pull(flights)  #dyplyr 
print(la) #this line of code is just me playing around with dyplyr, it has nothing to do with the actual task!
## [1] 1841  305   NA  201   61

This code in this chunk is wrong, and thus commented out, but I kept it for me to sort of look back upon and see the wrong code I tried.

#flights_long <- flights %>% gather(Los.Angeles, Phoenix, San.Diego, San.Francisco, Seattle, -c("X", "X.1"))
#flights_long <- flights %>% gather(Los.Angeles -c(X, X.1))

Here, I am pivoting the wide data into long format. I renamed 2 columns, and filled in the "NA" values with a top-down approach to filling data.

flights_long<-pivot_longer(flights, -c(X, X.1), values_to = "Flight_Count", names_to = "Destination")
head(flights_long)
## # A tibble: 6 × 4
##   X      X.1     Destination   Flight_Count
##   <chr>  <chr>   <chr>                <int>
## 1 Alaska on time Los.Angeles            497
## 2 Alaska on time Phoenix                221
## 3 Alaska on time San.Diego              212
## 4 Alaska on time San.Francisco          503
## 5 Alaska on time Seattle               1841
## 6 <NA>   delayed Los.Angeles             62
flights_long2 <- fill(flights_long, X, .direction = c("down"))
head(flights_long2)
## # A tibble: 6 × 4
##   X      X.1     Destination   Flight_Count
##   <chr>  <chr>   <chr>                <int>
## 1 Alaska on time Los.Angeles            497
## 2 Alaska on time Phoenix                221
## 3 Alaska on time San.Diego              212
## 4 Alaska on time San.Francisco          503
## 5 Alaska on time Seattle               1841
## 6 Alaska delayed Los.Angeles             62

I dropped the "NA" values and renamed the first 2 columns.

flights_long2<-flights_long2 %>% drop_na() 
colnames(flights_long2)[1] = "Airline"
colnames(flights_long2)[2] = "Time"
print(n=25,flights_long2)
## # A tibble: 20 × 4
##    Airline Time    Destination   Flight_Count
##    <chr>   <chr>   <chr>                <int>
##  1 Alaska  on time Los.Angeles            497
##  2 Alaska  on time Phoenix                221
##  3 Alaska  on time San.Diego              212
##  4 Alaska  on time San.Francisco          503
##  5 Alaska  on time Seattle               1841
##  6 Alaska  delayed Los.Angeles             62
##  7 Alaska  delayed Phoenix                 12
##  8 Alaska  delayed San.Diego               20
##  9 Alaska  delayed San.Francisco          102
## 10 Alaska  delayed Seattle                305
## 11 AM West on time Los.Angeles            694
## 12 AM West on time Phoenix               4840
## 13 AM West on time San.Diego              383
## 14 AM West on time San.Francisco          320
## 15 AM West on time Seattle                201
## 16 AM West delayed Los.Angeles            117
## 17 AM West delayed Phoenix                415
## 18 AM West delayed San.Diego               65
## 19 AM West delayed San.Francisco          129
## 20 AM West delayed Seattle                 61

I took a summary of all the data present.

summary(flights_long2)
##    Airline              Time           Destination         Flight_Count    
##  Length:20          Length:20          Length:20          Min.   :  12.00  
##  Class :character   Class :character   Class :character   1st Qu.:  92.75  
##  Mode  :character   Mode  :character   Mode  :character   Median : 216.50  
##                                                           Mean   : 550.00  
##                                                           3rd Qu.: 435.50  
##                                                           Max.   :4840.00

I calculated the interquartile range (IQR) of Flight_Count. The IQR measures the spread of the middle half of the data.

print(IQR(flights_long2$`Flight_Count`))
## [1] 342.75

I went on to calculate IQR and median grouped by airline.

flights_long2 %>%
  group_by(Airline) %>%
  summarise(median_arrivalDelay = median(`Flight_Count`), iqr_arrivalDelay = IQR(`Flight_Count`))
## # A tibble: 2 × 3
##   Airline median_arrivalDelay iqr_arrivalDelay
##   <chr>                 <dbl>            <dbl>
## 1 Alaska                 216.              377
## 2 AM West                260.              287

Then for each destination I analyzed it's summary.

angeles<-flights_long2%>%group_by(Destination)%>%filter(Destination=="Los.Angeles")
summary(angeles)
##    Airline              Time           Destination         Flight_Count  
##  Length:4           Length:4           Length:4           Min.   : 62.0  
##  Class :character   Class :character   Class :character   1st Qu.:103.2  
##  Mode  :character   Mode  :character   Mode  :character   Median :307.0  
##                                                           Mean   :342.5  
##                                                           3rd Qu.:546.2  
##                                                           Max.   :694.0
phoenix<-flights_long2%>%group_by(Destination)%>%filter(Destination=="Phoenix")
summary(phoenix)
##    Airline              Time           Destination         Flight_Count   
##  Length:4           Length:4           Length:4           Min.   :  12.0  
##  Class :character   Class :character   Class :character   1st Qu.: 168.8  
##  Mode  :character   Mode  :character   Mode  :character   Median : 318.0  
##                                                           Mean   :1372.0  
##                                                           3rd Qu.:1521.2  
##                                                           Max.   :4840.0
diego<-flights_long2%>%group_by(Destination)%>%filter(Destination=="San.Diego")
summary(diego)
##    Airline              Time           Destination         Flight_Count   
##  Length:4           Length:4           Length:4           Min.   : 20.00  
##  Class :character   Class :character   Class :character   1st Qu.: 53.75  
##  Mode  :character   Mode  :character   Mode  :character   Median :138.50  
##                                                           Mean   :170.00  
##                                                           3rd Qu.:254.75  
##                                                           Max.   :383.00
francisco<-flights_long2%>%group_by(Destination)%>%filter(Destination=="San.Francisco")
summary(francisco)
##    Airline              Time           Destination         Flight_Count  
##  Length:4           Length:4           Length:4           Min.   :102.0  
##  Class :character   Class :character   Class :character   1st Qu.:122.2  
##  Mode  :character   Mode  :character   Mode  :character   Median :224.5  
##                                                           Mean   :263.5  
##                                                           3rd Qu.:365.8  
##                                                           Max.   :503.0
seattle <-flights_long2%>%group_by(Destination)%>%filter(Destination=="Seattle")
summary(seattle)
##    Airline              Time           Destination         Flight_Count 
##  Length:4           Length:4           Length:4           Min.   :  61  
##  Class :character   Class :character   Class :character   1st Qu.: 166  
##  Mode  :character   Mode  :character   Mode  :character   Median : 253  
##                                                           Mean   : 602  
##                                                           3rd Qu.: 689  
##                                                           Max.   :1841

Finally, I made bar graphs filtering the data between "on time" and "delayed".

onTime<-flights_long2%>%filter(Time=="on time")
onTime
## # A tibble: 10 × 4
##    Airline Time    Destination   Flight_Count
##    <chr>   <chr>   <chr>                <int>
##  1 Alaska  on time Los.Angeles            497
##  2 Alaska  on time Phoenix                221
##  3 Alaska  on time San.Diego              212
##  4 Alaska  on time San.Francisco          503
##  5 Alaska  on time Seattle               1841
##  6 AM West on time Los.Angeles            694
##  7 AM West on time Phoenix               4840
##  8 AM West on time San.Diego              383
##  9 AM West on time San.Francisco          320
## 10 AM West on time Seattle                201
ggplot(onTime, aes(fill=`Airline`, y=`Flight_Count`, x=Destination)) + 
    geom_bar(position="dodge", stat="identity")

delayed<-flights_long2%>%filter(Time=="delayed")
delayed
## # A tibble: 10 × 4
##    Airline Time    Destination   Flight_Count
##    <chr>   <chr>   <chr>                <int>
##  1 Alaska  delayed Los.Angeles             62
##  2 Alaska  delayed Phoenix                 12
##  3 Alaska  delayed San.Diego               20
##  4 Alaska  delayed San.Francisco          102
##  5 Alaska  delayed Seattle                305
##  6 AM West delayed Los.Angeles            117
##  7 AM West delayed Phoenix                415
##  8 AM West delayed San.Diego               65
##  9 AM West delayed San.Francisco          129
## 10 AM West delayed Seattle                 61
ggplot(delayed, aes(fill=`Airline`, y=`Flight_Count`, x=Destination)) + 
    geom_bar(position="dodge", stat="identity")