count(tornado[tornado$st=="WA" | tornado$st=="MS",])
## n
## 1 530
count(tornado[tornado$st=="WA" & tornado$yr<2012,])
## n
## 1 13
tornado[tornado$st=="WA" & tornado$yr %in% c(2011,2013,2014),]
## yr mo dy date time tz st stf stn f
## 6636 2011 5 27 2011-05-27 14:50:00 3 WA 53 1 EF-0
## 7040 2011 10 5 2011-10-05 16:19:00 3 WA 53 2 EF-0
## 8217 2013 3 21 2013-03-21 18:00:00 3 WA 53 0 EF-0
## 8851 2013 9 30 2013-09-30 08:20:00 3 WA 53 0 EF-1
## 9150 2014 4 27 2014-04-27 18:30:00 3 WA 53 0 EF-0
## 9747 2014 8 13 2014-08-13 19:30:00 3 WA 53 0 EF-0
## 9884 2014 10 23 2014-10-23 14:40:00 3 WA 53 0 EF-1
When the number of tornadoes is counted based on months, mo==2 has the most number of tornadoes. Therefore, the answer is February.
month.name[max((tornado[tornado$st=="HI",] %>% group_by(mo) %>% count())$n)]
## [1] "February"
tornado %>% group_by(yr) %>% count()
## # A tibble: 9 × 2
## # Groups: yr [9]
## yr n
## <int> <int>
## 1 2007 1116
## 2 2008 1738
## 3 2009 1182
## 4 2010 1315
## 5 2011 1777
## 6 2012 956
## 7 2013 939
## 8 2014 906
## 9 2015 1212
sqldf("select DepTime, Dest, UniqueCarrier, FlightNum, Origin from airline where DepTime >= 2100 and Dest='BNA'")
## DepTime Dest UniqueCarrier FlightNum Origin
## 1 2105 BNA WN 2181 BWI
## 2 2100 BNA WN 1918 MCI
## 3 2312 BNA OH 5421 CVG
## 4 2157 BNA DL 938 ATL
## 5 2217 BNA AA 2435 ORD
## 6 2317 BNA DH 7332 ORD
# na.omit(subset(airline[airline$DepTime >= 2100 & airline$Dest =="BNA",],select=c("UniqueCarrier","FlightNum","Origin")))
# na.omit(subset(airline[airline$DepTime >= 2100 & (airline$Dest == "MEM" | airline$Origin=="BNA"),],select=c("UniqueCarrier","FlightNum","Origin")))
sqldf("select DepTime, Dest, UniqueCarrier, FlightNum, Origin from airline where DepTime >= 2100 and (Dest='MEM' or origin='BNA')")
## DepTime Dest UniqueCarrier FlightNum Origin
## 1 2218 AUS WN 1447 BNA
## 2 2150 BWI WN 2658 BNA
## 3 2150 MDW WN 1511 BNA
## 4 2125 RDU WN 1732 BNA
## 5 2110 SAT WN 2769 BNA
## 6 2125 TPA WN 459 BNA
## 7 2201 MEM OO 6838 ORD
## 8 2217 MEM DL 579 ATL
## 9 2110 MEM FL 1723 ATL
## 10 2103 MEM NW 279 DTW
## 11 2313 MEM DH 6270 CVG
x <- sqldf("select UniqueCarrier, FlightNum, Origin from airline where ArrDelay < 0")
#y <- na.omit(subset(airline[airline$ArrDelay<0,],select=c("UniqueCarrier","FlightNum","Origin")))
# if(all_equal(x,y)){
paged_table(x)
nrow(x)
## [1] 10778
#}
# sqldf("select ArrDelay, DepDelay, UniqueCarrier, FlightNum, Origin from airline where ArrDelay > 120 and DepDelay <= 0")
na.omit(subset(airline[airline$ArrDelay>120 & airline$DepDelay<=0, ], select = c( "UniqueCarrier", "FlightNum", "Origin")))
## UniqueCarrier FlightNum Origin
## 9495 TZ 505 PIE
subset(head(airline[order(airline$DepDelay),],5), select = c( "DepDelay", "UniqueCarrier", "FlightNum", "Origin"))
## DepDelay UniqueCarrier FlightNum Origin
## 14202 -26 MQ 3647 DFW
## 6293 -24 OH 5260 RIC
## 5109 -22 NW 707 GTF
## 16874 -21 AA 1685 HOU
## 5983 -20 NW 1926 MDW
subset(head(airline[order(airline$ArrDelay),],5), select = c( "ArrDelay", "UniqueCarrier", "FlightNum", "Origin"))
## ArrDelay UniqueCarrier FlightNum Origin
## 6128 -52 OH 5071 BUF
## 6099 -49 OH 5710 BGM
## 6688 -49 OH 5082 BUF
## 8118 -48 OO 6704 MFR
## 10935 -48 DL 1181 BOS
Bonus marks: Experts Only. 1. Transform using the Airline data: create a new data frame with only the columns ’DepDelay’, ’ArrDelay’, ’Origin’, ’Dest’, AirTime’, and ’Distance’. In addition keep only the observation for flights that were delayed (DepDelay) by more than 1 hour. (Use the Airline data) (a) Add a new column reporting departure delays minus the median departure delay to the new data frame (b) Convert the departure and arrival delays columns from minutes into hours (c) Add a column with the average flight speed (in mph) (d) Report your new table using the function summary()
# bonus <- airline[airline$DepDelay>60,] %>% select(DepDelay,ArrDelay,Origin,Dest,AirTime,Distance) %>% mutate( DepDelay = DepDelay/60 , ArrDelay = ArrDelay/60, DepDelayMinusMedian=DepDelay-median(airline$DepDelay,na.rm = TRUE), SpeedInMph=round((Distance/AirTime)*60,2)) %>% rename(DepDelayInHours=DepDelay, ArrDelayInHours=ArrDelay)
# head(bonus,20)
summary(airline[airline$DepDelay>60,] %>% select (DepDelay, ArrDelay, Origin, Dest, AirTime, Distance) %>% mutate( DepDelay = DepDelay/60 , ArrDelay = ArrDelay/60, DepDelayMinusMedian = DepDelay - median( airline$DepDelay, na.rm = TRUE), SpeedInMph = round((Distance/AirTime) * 60,2 )) %>% rename( DepDelayInHours = DepDelay, ArrDelayInHours = ArrDelay))
## DepDelayInHours ArrDelayInHours Origin Dest
## Min. : 1.017 Min. : 0.3333 Length:968 Length:968
## 1st Qu.: 1.233 1st Qu.: 1.2833 Class :character Class :character
## Median : 1.583 Median : 1.7167 Mode :character Mode :character
## Mean : 1.765 Mean : 1.8423
## 3rd Qu.: 2.067 3rd Qu.: 2.2000
## Max. :18.000 Max. :17.7833
## NA's :183 NA's :185
## AirTime Distance DepDelayMinusMedian SpeedInMph
## Min. :-1395.0 Min. : 56.0 Min. : 1.017 Min. :-11.35
## 1st Qu.: 55.0 1st Qu.: 316.0 1st Qu.: 1.233 1st Qu.:328.80
## Median : 90.0 Median : 590.0 Median : 1.583 Median :393.05
## Mean : 101.1 Mean : 677.9 Mean : 1.765 Mean :378.69
## 3rd Qu.: 134.0 3rd Qu.: 867.0 3rd Qu.: 2.067 3rd Qu.:442.68
## Max. : 346.0 Max. :2704.0 Max. :18.000 Max. :543.11
## NA's :185 NA's :183 NA's :183 NA's :185
Notes: mutate() is used to add new columns of calculated values to the data. Speed in miles per hour (SpeedInMph) is rounded off to two decimal points for readability.