| Title: “R Bridge Course Wk-2” |
| Output: html_document |
| Author: Arun Reddy |
# Data set Source
urlfile<-'https://raw.githubusercontent.com/selva86/datasets/master/hflights.csv'
# Ready csv
dsin<-read.csv(urlfile)
suppressMessages(library(dplyr))
# Conversion to tibble data frame
flights <- tbl_df(dsin)
flights
## # A tibble: 227,496 x 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <fct> <int>
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## 7 2011 1 7 5 1359 1509 AA 428
## 8 2011 1 8 6 1355 1454 AA 428
## 9 2011 1 9 7 1443 1554 AA 428
## 10 2011 1 10 1 1443 1553 AA 428
## # ... with 227,486 more rows, and 13 more variables: TailNum <fct>,
## # ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <fct>, Dest <fct>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <fct>,
## # Diverted <int>
# Summary information of dataset
summary(flights)
## Year Month DayofMonth DayOfWeek
## Min. :2011 Min. : 1.000 Min. : 1.00 Min. :1.000
## 1st Qu.:2011 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.000
## Median :2011 Median : 7.000 Median :16.00 Median :4.000
## Mean :2011 Mean : 6.514 Mean :15.74 Mean :3.948
## 3rd Qu.:2011 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:6.000
## Max. :2011 Max. :12.000 Max. :31.00 Max. :7.000
##
## DepTime ArrTime UniqueCarrier FlightNum
## Min. : 1 Min. : 1 XE :73053 Min. : 1
## 1st Qu.:1021 1st Qu.:1215 CO :70032 1st Qu.: 855
## Median :1416 Median :1617 WN :45343 Median :1696
## Mean :1396 Mean :1578 OO :16061 Mean :1962
## 3rd Qu.:1801 3rd Qu.:1953 MQ : 4648 3rd Qu.:2755
## Max. :2400 Max. :2400 US : 4082 Max. :7290
## NA's :2905 NA's :3066 (Other):14277
## TailNum ActualElapsedTime AirTime ArrDelay
## N14945 : 971 Min. : 34.0 Min. : 11.0 Min. :-70.000
## N15926 : 960 1st Qu.: 77.0 1st Qu.: 58.0 1st Qu.: -8.000
## N16927 : 951 Median :128.0 Median :107.0 Median : 0.000
## N12946 : 948 Mean :129.3 Mean :108.1 Mean : 7.094
## N14937 : 946 3rd Qu.:165.0 3rd Qu.:141.0 3rd Qu.: 11.000
## N14942 : 946 Max. :575.0 Max. :549.0 Max. :978.000
## (Other):221774 NA's :3622 NA's :3622 NA's :3622
## DepDelay Origin Dest Distance
## Min. :-33.000 HOU: 52299 DAL : 9820 Min. : 79.0
## 1st Qu.: -3.000 IAH:175197 ATL : 7886 1st Qu.: 376.0
## Median : 0.000 MSY : 6823 Median : 809.0
## Mean : 9.445 DFW : 6653 Mean : 787.8
## 3rd Qu.: 9.000 LAX : 6064 3rd Qu.:1042.0
## Max. :981.000 DEN : 5920 Max. :3904.0
## NA's :2905 (Other):184330
## TaxiIn TaxiOut Cancelled CancellationCode
## Min. : 1.000 Min. : 1.00 Min. :0.00000 :224523
## 1st Qu.: 4.000 1st Qu.: 10.00 1st Qu.:0.00000 A: 1202
## Median : 5.000 Median : 14.00 Median :0.00000 B: 1652
## Mean : 6.099 Mean : 15.09 Mean :0.01307 C: 118
## 3rd Qu.: 7.000 3rd Qu.: 18.00 3rd Qu.:0.00000 D: 1
## Max. :165.000 Max. :163.00 Max. :1.00000
## NA's :3066 NA's :2947
## Diverted
## Min. :0.000000
## 1st Qu.:0.000000
## Median :0.000000
## Mean :0.002853
## 3rd Qu.:0.000000
## Max. :1.000000
##
# Mean and Median for Destination and UniqueCarriers
flights %>%
group_by(Dest,UniqueCarrier) %>%
summarise(mean.arrival.delay = mean(ArrDelay, na.rm=TRUE),median.arrival.day = median(ArrDelay, na.rm=TRUE))
## # A tibble: 241 x 4
## # Groups: Dest [?]
## Dest UniqueCarrier mean.arrival.delay median.arrival.day
## <fct> <fct> <dbl> <dbl>
## 1 ABQ CO 4.03 -2
## 2 ABQ OO 6.64 -1
## 3 ABQ WN 6.00 -1
## 4 ABQ XE 9.25 3
## 5 AEX XE 5.84 -2
## 6 AGS CO 4 4
## 7 AMA XE 6.84 0
## 8 ANC CO 26.1 13
## 9 ASE OO 6.79 -3
## 10 ATL CO 3.66 -4
## # ... with 231 more rows
# Full Data set
flights
## # A tibble: 227,496 x 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <fct> <int>
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## 7 2011 1 7 5 1359 1509 AA 428
## 8 2011 1 8 6 1355 1454 AA 428
## 9 2011 1 9 7 1443 1554 AA 428
## 10 2011 1 10 1 1443 1553 AA 428
## # ... with 227,486 more rows, and 13 more variables: TailNum <fct>,
## # ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <fct>, Dest <fct>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <fct>,
## # Diverted <int>
# Subsetting the dataset, showing 50 rows with selected columns
select( flights, Year:DayOfWeek,UniqueCarrier,Dest,ArrDelay)
## # A tibble: 227,496 x 7
## Year Month DayofMonth DayOfWeek UniqueCarrier Dest ArrDelay
## <int> <int> <int> <int> <fct> <fct> <int>
## 1 2011 1 1 6 AA DFW -10
## 2 2011 1 2 7 AA DFW -9
## 3 2011 1 3 1 AA DFW -8
## 4 2011 1 4 2 AA DFW 3
## 5 2011 1 5 3 AA DFW -3
## 6 2011 1 6 4 AA DFW -7
## 7 2011 1 7 5 AA DFW -1
## 8 2011 1 8 6 AA DFW -16
## 9 2011 1 9 7 AA DFW 44
## 10 2011 1 10 1 AA DFW 43
## # ... with 227,486 more rows
flights.subset<- head(select( flights, Year:DayOfWeek,UniqueCarrier,Dest,ArrDelay), n=50)
# Subset showing 50 rows and 7 columns
flights.subset
## # A tibble: 50 x 7
## Year Month DayofMonth DayOfWeek UniqueCarrier Dest ArrDelay
## <int> <int> <int> <int> <fct> <fct> <int>
## 1 2011 1 1 6 AA DFW -10
## 2 2011 1 2 7 AA DFW -9
## 3 2011 1 3 1 AA DFW -8
## 4 2011 1 4 2 AA DFW 3
## 5 2011 1 5 3 AA DFW -3
## 6 2011 1 6 4 AA DFW -7
## 7 2011 1 7 5 AA DFW -1
## 8 2011 1 8 6 AA DFW -16
## 9 2011 1 9 7 AA DFW 44
## 10 2011 1 10 1 AA DFW 43
## # ... with 40 more rows
# Rename of columns
names(flights.subset) <- c("DateDim.Year","DateDim.Month","DateDim.Day","DateDim.Week","Flight.Carrier","Flight.Destination","Flight.ArrDelay")
flights.subset
## # A tibble: 50 x 7
## DateDim.Year DateDim.Month DateDim.Day DateDim.Week Flight.Carrier
## <int> <int> <int> <int> <fct>
## 1 2011 1 1 6 AA
## 2 2011 1 2 7 AA
## 3 2011 1 3 1 AA
## 4 2011 1 4 2 AA
## 5 2011 1 5 3 AA
## 6 2011 1 6 4 AA
## 7 2011 1 7 5 AA
## 8 2011 1 8 6 AA
## 9 2011 1 9 7 AA
## 10 2011 1 10 1 AA
## # ... with 40 more rows, and 2 more variables: Flight.Destination <fct>,
## # Flight.ArrDelay <int>
# Summary overview
summary(flights.subset)
## DateDim.Year DateDim.Month DateDim.Day DateDim.Week
## Min. :2011 Min. :1 Min. : 1.00 Min. :1.00
## 1st Qu.:2011 1st Qu.:1 1st Qu.: 7.00 1st Qu.:2.00
## Median :2011 Median :1 Median :13.00 Median :4.00
## Mean :2011 Mean :1 Mean :13.72 Mean :4.02
## 3rd Qu.:2011 3rd Qu.:1 3rd Qu.:19.00 3rd Qu.:6.00
## Max. :2011 Max. :1 Max. :31.00 Max. :7.00
##
## Flight.Carrier Flight.Destination Flight.ArrDelay
## AA :50 DFW :50 Min. :-30.00
## AS : 0 ABQ : 0 1st Qu.:-13.75
## B6 : 0 AEX : 0 Median : -7.00
## CO : 0 AGS : 0 Mean : -0.46
## DL : 0 AMA : 0 3rd Qu.: 2.00
## EV : 0 ANC : 0 Max. : 84.00
## (Other): 0 (Other): 0
# Mean and Median of new subset of dataframe
summary.ArrDelayMeanMedian <-flights.subset %>%
group_by(Flight.Carrier,Flight.Destination) %>%
summarise(Summary.mean = mean(Flight.ArrDelay),Summary.median= median(Flight.ArrDelay) )
summary.ArrDelayMeanMedian
## # A tibble: 1 x 4
## # Groups: Flight.Carrier [?]
## Flight.Carrier Flight.Destination Summary.mean Summary.median
## <fct> <fct> <dbl> <dbl>
## 1 AA DFW -0.46 -7
# Dataset
flights
## # A tibble: 227,496 x 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## <int> <int> <int> <int> <int> <int> <fct> <int>
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## 7 2011 1 7 5 1359 1509 AA 428
## 8 2011 1 8 6 1355 1454 AA 428
## 9 2011 1 9 7 1443 1554 AA 428
## 10 2011 1 10 1 1443 1553 AA 428
## # ... with 227,486 more rows, and 13 more variables: TailNum <fct>,
## # ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <fct>, Dest <fct>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <fct>,
## # Diverted <int>
# Look up table to change the values of the column "UniqueCarrier
carrierLookup <- c("AA" = "American", "AS" = "Alaska", "B6" = "JetBlue", "CO" = "Continental",
"DL" = "Delta", "OO" = "SkyWest", "UA" = "United", "US" = "US_Airways",
"WN" = "Southwest", "EV" = "Atlantic_Southeast", "F9" = "Frontier",
"FL" = "AirTran", "MQ" = "American_Eagle", "XE" = "ExpressJet", "YV" = "Mesa")
# change the values of the column
flights$CarrierFullName <- carrierLookup [flights$UniqueCarrier]
# unique(select (flights,CarrierFullName) )
# The following code shows the column value change: UniqueCarrier is old and CarrierFullName is New
flights %>%
group_by(UniqueCarrier,CarrierFullName) %>%
summarise(FlightCount= n())
## # A tibble: 15 x 3
## # Groups: UniqueCarrier [?]
## UniqueCarrier CarrierFullName FlightCount
## <fct> <chr> <int>
## 1 AA American 3244
## 2 AS Alaska 365
## 3 B6 JetBlue 695
## 4 CO Continental 70032
## 5 DL Delta 2641
## 6 EV SkyWest 2204
## 7 F9 United 838
## 8 FL US_Airways 2139
## 9 MQ Southwest 4648
## 10 OO Atlantic_Southeast 16061
## 11 UA Frontier 2072
## 12 US AirTran 4082
## 13 WN American_Eagle 45343
## 14 XE ExpressJet 73053
## 15 YV Mesa 79
# 1 through 5 shows enough rows.
[GitHub Link for DataSet]: [https://raw.githubusercontent.com/selva86/datasets/master/hflights.csv] Data set used hflights