Loading in the data from an excel sheet which I converted as csv. The key part of this assignment was to pass the blanks in as "NA".
flights<-read.csv("https://raw.githubusercontent.com/Sangeetha-007/R-Practice/master/607/Assignments/Assignment%204/arrival%20delays%20.csv",na.strings=c("","NA"))
print(flights)
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM West on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
I am listing the column names.
colnames(flights)
## [1] "X" "X.1" "Los.Angeles" "Phoenix"
## [5] "San.Diego" "San.Francisco" "Seattle"
la <-pull(flights) #dyplyr
print(la) #this line of code is just me playing around with dyplyr, it has nothing to do with the actual task!
## [1] 1841 305 NA 201 61
I dropped the "NA" values and renamed the first 2 columns.
flights_long2<-flights_long2 %>% drop_na()
colnames(flights_long2)[1] = "Airline"
colnames(flights_long2)[2] = "Time"
print(n=25,flights_long2)
## # A tibble: 20 × 4
## Airline Time Destination Flight_Count
## <chr> <chr> <chr> <int>
## 1 Alaska on time Los.Angeles 497
## 2 Alaska on time Phoenix 221
## 3 Alaska on time San.Diego 212
## 4 Alaska on time San.Francisco 503
## 5 Alaska on time Seattle 1841
## 6 Alaska delayed Los.Angeles 62
## 7 Alaska delayed Phoenix 12
## 8 Alaska delayed San.Diego 20
## 9 Alaska delayed San.Francisco 102
## 10 Alaska delayed Seattle 305
## 11 AM West on time Los.Angeles 694
## 12 AM West on time Phoenix 4840
## 13 AM West on time San.Diego 383
## 14 AM West on time San.Francisco 320
## 15 AM West on time Seattle 201
## 16 AM West delayed Los.Angeles 117
## 17 AM West delayed Phoenix 415
## 18 AM West delayed San.Diego 65
## 19 AM West delayed San.Francisco 129
## 20 AM West delayed Seattle 61
I took a summary of all the data present.
summary(flights_long2)
## Airline Time Destination Flight_Count
## Length:20 Length:20 Length:20 Min. : 12.00
## Class :character Class :character Class :character 1st Qu.: 92.75
## Mode :character Mode :character Mode :character Median : 216.50
## Mean : 550.00
## 3rd Qu.: 435.50
## Max. :4840.00
I calculated the interquartile range (IQR) of Flight_Count. The IQR measures the spread of the middle half of the data.
print(IQR(flights_long2$`Flight_Count`))
## [1] 342.75
Then for each destination I analyzed it's summary.
angeles<-flights_long2%>%group_by(Destination)%>%filter(Destination=="Los.Angeles")
summary(angeles)
## Airline Time Destination Flight_Count
## Length:4 Length:4 Length:4 Min. : 62.0
## Class :character Class :character Class :character 1st Qu.:103.2
## Mode :character Mode :character Mode :character Median :307.0
## Mean :342.5
## 3rd Qu.:546.2
## Max. :694.0
phoenix<-flights_long2%>%group_by(Destination)%>%filter(Destination=="Phoenix")
summary(phoenix)
## Airline Time Destination Flight_Count
## Length:4 Length:4 Length:4 Min. : 12.0
## Class :character Class :character Class :character 1st Qu.: 168.8
## Mode :character Mode :character Mode :character Median : 318.0
## Mean :1372.0
## 3rd Qu.:1521.2
## Max. :4840.0
diego<-flights_long2%>%group_by(Destination)%>%filter(Destination=="San.Diego")
summary(diego)
## Airline Time Destination Flight_Count
## Length:4 Length:4 Length:4 Min. : 20.00
## Class :character Class :character Class :character 1st Qu.: 53.75
## Mode :character Mode :character Mode :character Median :138.50
## Mean :170.00
## 3rd Qu.:254.75
## Max. :383.00
francisco<-flights_long2%>%group_by(Destination)%>%filter(Destination=="San.Francisco")
summary(francisco)
## Airline Time Destination Flight_Count
## Length:4 Length:4 Length:4 Min. :102.0
## Class :character Class :character Class :character 1st Qu.:122.2
## Mode :character Mode :character Mode :character Median :224.5
## Mean :263.5
## 3rd Qu.:365.8
## Max. :503.0
seattle <-flights_long2%>%group_by(Destination)%>%filter(Destination=="Seattle")
summary(seattle)
## Airline Time Destination Flight_Count
## Length:4 Length:4 Length:4 Min. : 61
## Class :character Class :character Class :character 1st Qu.: 166
## Mode :character Mode :character Mode :character Median : 253
## Mean : 602
## 3rd Qu.: 689
## Max. :1841
Finally, I made bar graphs filtering the data between "on time" and "delayed".
onTime<-flights_long2%>%filter(Time=="on time")
onTime
## # A tibble: 10 × 4
## Airline Time Destination Flight_Count
## <chr> <chr> <chr> <int>
## 1 Alaska on time Los.Angeles 497
## 2 Alaska on time Phoenix 221
## 3 Alaska on time San.Diego 212
## 4 Alaska on time San.Francisco 503
## 5 Alaska on time Seattle 1841
## 6 AM West on time Los.Angeles 694
## 7 AM West on time Phoenix 4840
## 8 AM West on time San.Diego 383
## 9 AM West on time San.Francisco 320
## 10 AM West on time Seattle 201
ggplot(onTime, aes(fill=`Airline`, y=`Flight_Count`, x=Destination)) +
geom_bar(position="dodge", stat="identity")

delayed<-flights_long2%>%filter(Time=="delayed")
delayed
## # A tibble: 10 × 4
## Airline Time Destination Flight_Count
## <chr> <chr> <chr> <int>
## 1 Alaska delayed Los.Angeles 62
## 2 Alaska delayed Phoenix 12
## 3 Alaska delayed San.Diego 20
## 4 Alaska delayed San.Francisco 102
## 5 Alaska delayed Seattle 305
## 6 AM West delayed Los.Angeles 117
## 7 AM West delayed Phoenix 415
## 8 AM West delayed San.Diego 65
## 9 AM West delayed San.Francisco 129
## 10 AM West delayed Seattle 61
ggplot(delayed, aes(fill=`Airline`, y=`Flight_Count`, x=Destination)) +
geom_bar(position="dodge", stat="identity")
