Dataset 1: Tornado dataset


- Using Tornadoes Data

Step 1: Load data from local directory


#Load Data using read function
df_1 <- read.csv(file = "../Downloads/tornado.csv")
#head(df_1)

# To view interactive Table 
library(DT)
datatable(head(df_1,20))


Step 2: Check unique attributes of Dataset


# Check dimension of dataset
paste("Number of unique rows", dim(df_1)[1], ", Number of unique cols", dim(df_1)[2])
## [1] "Number of unique rows 11141 , Number of unique cols 10"


- Data Manipulation

  1. How many tornadoes occurred in WA (Washington) or MS (Mississippi)?
df_a = subset(df_1,st == "WA" | st == "MS")
total_tornadoes = length(df_a$st)

paste("Total Tornadoes occurred in WA (Washington) or MS (Mississippi) are",total_tornadoes)
## [1] "Total Tornadoes occurred in WA (Washington) or MS (Mississippi) are 530"


  1. How many tornadoes occurred in WA after 2012?
df_b = subset(df_1, st == "WA" & yr > 2012)
total_tornadoes <- nrow(df_b)

paste("Total Tornadoes occurred in WA after 2012 are",total_tornadoes)
## [1] "Total Tornadoes occurred in WA after 2012 are 7"


  1. Get all tornadoes that occurred in WA in 2012, 2013, and 2014.
df_c = subset(df_1, st == "WA" & (yr == 2012 | yr == 2013 | yr == 2014))
total_tornadoes <- nrow(df_c)

paste("All tornadoes that occurred in WA in 2012, 2013, and 2014 are", total_tornadoes)
## [1] "All tornadoes that occurred in WA in 2012, 2013, and 2014 are 5"


  1. Return the ’mo’ (month), ’yr’ (year), and ’f’ (F-scale) for all tornadoes that occurred in HI (Hawaii).
df_d = subset(df_1, st == "HI", select=c(mo, yr, f))
df_d
##       mo   yr    f
## 2766   9 2008 EF-0
## 2842  12 2008 EF-0
## 2878   2 2009 EF-0
## 5371   2 2011 EF-0
## 7359   3 2012 EF-0
## 10093  4 2015 EF-0


  1. Arrange the tornadoes by date and time. What state had the most recent tornado?
# Arranging tornadoes by datetime
sorted_df <- df_1[order(df_1$date, df_1$time),]
head(sorted_df,20)
##      yr mo dy       date     time tz st stf stn    f
## 1  2007  1  4 2007-01-04 15:45:00  3 LA  22   1 EF-1
## 2  2007  1  4 2007-01-04 16:35:00  3 LA  22   2 EF-1
## 3  2007  1  5 2007-01-05 00:27:00  3 MS  28   1 EF-1
## 4  2007  1  5 2007-01-05 00:40:00  3 MS  28   2 EF-0
## 5  2007  1  5 2007-01-05 00:57:00  3 MS  28   3 EF-1
## 6  2007  1  5 2007-01-05 01:07:00  3 MS  28   4 EF-1
## 7  2007  1  5 2007-01-05 01:25:00  3 MS  28   5 EF-2
## 8  2007  1  5 2007-01-05 01:40:00  3 MS  28   6 EF-2
## 9  2007  1  5 2007-01-05 02:05:00  3 MS  28   7 EF-1
## 10 2007  1  5 2007-01-05 09:05:00  3 GA  13   1 EF-1
## 11 2007  1  5 2007-01-05 10:00:00  3 GA  13   2 EF-0
## 12 2007  1  5 2007-01-05 13:24:00  3 SC  45   1 EF-1
## 13 2007  1  5 2007-01-05 14:11:00  3 SC  45   2 EF-0
## 14 2007  1  5 2007-01-05 15:39:00  3 NC  37   1 EF-0
## 15 2007  1  5 2007-01-05 16:10:00  3 GA  13   3 EF-0
## 16 2007  1  7 2007-01-07 16:10:00  3 AL   1   1 EF-1
## 17 2007  1  7 2007-01-07 17:39:00  3 GA  13   4 EF-2
## 18 2007  1  7 2007-01-07 19:50:00  3 AL   1   2 EF-0
## 19 2007  1  7 2007-01-07 20:29:00  3 AL   1   3 EF-1
## 20 2007  1 12 2007-01-12 16:35:00  3 TX  48   1 EF-0
# Finding state which had the most recent tornado
recent_tornadoes <- df_1[order(rev(df_1$date), rev(df_1$time)),]
recent_tornado_state <- recent_tornadoes[1,"st"]

paste("State which had the most recent tornado is",recent_tornado_state)
## [1] "State which had the most recent tornado is NC"


  1. Get the count of tornadoes by months.
tornado_count <- table(df_1$mo)
tornado_count_by_month <- as.data.frame(t(tornado_count))[,-1]
colnames(tornado_count_by_month) <- c("Month number", "Count of Tornadoes")
tornado_count_by_month
##    Month number Count of Tornadoes
## 1             1                355
## 2             2                463
## 3             3                751
## 4             4               2171
## 5             5               2500
## 6             6               1899
## 7             7                861
## 8             8                517
## 9             9                401
## 10           10                525
## 11           11                350
## 12           12                348



Dataset 2: Airline dataset


- Using Airline Data

Step 1: Load data from local directory


#Load Data using read function
df_2 <- read.csv(file = "../Downloads/airline.csv")
#head(df_1)

# To view interactive Table 
library(DT)
datatable(head(df_2,20))


Step 2: Check unique attributes of Dataset


# Check dimension of dataset
paste("Number of unique rows", dim(df_2)[1], ", Number of unique cols", dim(df_2)[2])
## [1] "Number of unique rows 19928 , Number of unique cols 29"


- Data Manipulation

  1. Subset: Find the carrier (UniqueCarrier), the flight number (FlightNum), and origin (origin) of all the flights that:

i. Departed (DepTime) after 10pm and flew (Dest) to Nashville (’BNA’).

subset(df_2, DepTime > 2200 & Dest == "BNA", select=c(UniqueCarrier, FlightNum, Origin))
##       UniqueCarrier FlightNum Origin
## 7021             OH      5421    CVG
## 17401            AA      2435    ORD
## 19478            DH      7332    ORD


ii. departed (DepTime) after 10pm, but Originated (Origin) from Nashville ’BNA’ or landed (Dest) in Memphis ’MEM’. If you don’t get a list of four flight, something is wrong.

subset(df_2, DepTime > 2200 & (Origin == "BNA" | Dest == "MEM"), select=c(UniqueCarrier, FlightNum, Origin))
##       UniqueCarrier FlightNum Origin
## 2392             WN      1447    BNA
## 8169             OO      6838    ORD
## 10405            DL       579    ATL
## 19352            DH      6270    CVG


iii. were delayed (ArrDelay) by more two hours. Note that a negative delay is a flight that arrived early.

df_iii <- subset(df_2, ArrDelay > 120, select=c(UniqueCarrier, FlightNum, Origin))
head(df_iii,10)
##     UniqueCarrier FlightNum Origin
## 27             UA       441    ORD
## 65             UA       471    BUF
## 66             UA       471    ORD
## 78             UA       481    ORD
## 86             UA       489    CMH
## 126            UA       518    ORD
## 149            UA       538    ORD
## 154            UA       542    ORD
## 155            UA       543    BOS
## 158            UA       544    SNA
paste("Dataframe rows:", dim(df_iii)[1],", Columns:",dim(df_iii)[2])
## [1] "Dataframe rows: 280 , Columns: 3"


iv. arrived (ArrDelay) more than two hours late, but didn’t leave late (DepDelay).

df_iv <- subset(df_2, ArrDelay > 120 & DepDelay <= 0, select=c(UniqueCarrier, FlightNum, Origin))
head(df_iv,20)
##      UniqueCarrier FlightNum Origin
## 9495            TZ       505    PIE
paste("Dataframe rows:",dim(df_iv)[1], ", Columns:",dim(df_iv)[2])
## [1] "Dataframe rows: 1 , Columns: 3"


  1. Arrange: make use of minus sign in front of the function to reverse order() and the function head() to get the relevant part: (Use the Airline data)

i. Sort flights to find the 5 most delayed (DepDelay) flights

head(df_2[order(-df_2$DepDelay),],5)
##       Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime
## 5884  2004     3         25         4     745       1345     954       1607
## 11220 2004     3         25         4    2031       1507    2319       1759
## 14475 2004     3         25         4    2005       1501    2230       1717
## 15132 2004     3         25         4    1807       1306    2336       1830
## 16220 2004     3         25         4    1218        721    1411        912
##       UniqueCarrier FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime
## 5884             NW      1816  N608NW                69             82      48
## 11220            DL      1603  N978DL               108            112      82
## 14475            MQ      4072  N514MQ               145            136      93
## 15132            NW       180  N352NW               209            204     188
## 16220            AA      1035  N5FKAA               173            171     155
##       ArrDelay DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled
## 5884      1067     1080    MDW  DTW      229     10      11         0
## 11220      320      324    MSP  CVG      596      7      19         0
## 14475      313      304    ORD  OKC      693      6      46         0
## 15132      306      301    LAX  MEM     1619      7      14         0
## 16220      299      297    MCO  ORD     1005      6      12         0
##       CancellationCode Diverted CarrierDelay WeatherDelay NASDelay
## 5884                          0         1067            0        0
## 11220                         0            0            0        0
## 14475                         0          254            0        9
## 15132                         0          301            0        5
## 16220                         0          297            0        2
##       SecurityDelay LateAircraftDelay
## 5884              0                 0
## 11220             0               320
## 14475             0                50
## 15132             0                 0
## 16220             0                 0


ii. Sort flights to find the 5 least delayed (DepDelay) flights

head(df_2[order(df_2$DepDelay),],5)
##       Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime
## 14202 2004     3         25         4    2047       2113    2225       2227
## 6293  2004     3         25         4     911        935    1053       1109
## 5109  2004     3         25         4    1053       1115    1129       1159
## 16874 2004     3         25         4    2109       2130    2159       2218
## 5983  2004     3         25         4    1015       1035    1226       1243
##       UniqueCarrier FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime
## 14202            MQ      3647  N817MQ                98             74      52
## 6293             OH      5260  N933CA               102             94      75
## 5109             NW       707  N986US                36             44      28
## 16874            AA      1685  N4XGAA                50             48      36
## 5983             NW      1926   N9346                71             68      46
##       ArrDelay DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled
## 14202       -2      -26    DFW  SGF      364      3      43         0
## 6293       -16      -24    RIC  CVG      413      1      26         0
## 5109       -30      -22    GTF  FCA      146      2       6         0
## 16874      -19      -21    HOU  AUS      148      5       9         0
## 5983       -17      -20    MDW  DTW      229     12      13         0
##       CancellationCode Diverted CarrierDelay WeatherDelay NASDelay
## 14202                         0            0            0        0
## 6293                          0            0            0        0
## 5109                          0            0            0        0
## 16874                         0            0            0        0
## 5983                          0            0            0        0
##       SecurityDelay LateAircraftDelay
## 14202             0                 0
## 6293              0                 0
## 5109              0                 0
## 16874             0                 0
## 5983              0                 0


iii. Sort flights by destination (Dest) and break ties by descending arrival delay (ArrDelay)

sorted_flights_breaking_by_arrival <- df_2[order(df_2$Dest, -df_2$ArrDelay),]
head(sorted_flights_breaking_by_arrival,5)
##       Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime
## 19583 2004     3         25         4    1744       1615    2110       1910
## 8513  2004     3         25         4    2015       1910    2124       2029
## 19679 2004     3         25         4    1342       1350    1712       1645
## 6398  2004     3         25         4    1116       1055    1240       1228
## 8331  2004     3         25         4    1139       1140    1300       1255
##       UniqueCarrier FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime
## 19583            DH      7442  N676BR               146            115      85
## 8513             XE      2460  N14930                69             79      53
## 19679            DH      7546  N662BR               150            115      86
## 6398             OH      5533  N709CA                84             93      73
## 8331             XE      2431  N28518                81             75      58
##       ArrDelay DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled
## 19583      120       89    ORD  ABE      654      4      56         0
## 8513        55       65    CLE  ABE      339      3      13         0
## 19679       27       -8    ORD  ABE      654      4      60         0
## 6398        12       21    CVG  ABE      503      2       9         0
## 8331         5       -1    CLE  ABE      339     11      12         0
##       CancellationCode Diverted CarrierDelay WeatherDelay NASDelay
## 19583                         0            0            0       30
## 8513                          0            0            0        0
## 19679                         0            0            0       27
## 6398                          0            0            0        0
## 8331                          0            0            0        0
##       SecurityDelay LateAircraftDelay
## 19583             0                90
## 8513              0                55
## 19679             0                 0
## 6398              0                 0
## 8331              0                 0
paste("Dataframe rows:",dim(sorted_flights_breaking_by_arrival)[1], ", Columns :",dim(sorted_flights_breaking_by_arrival)[2])
## [1] "Dataframe rows: 19928 , Columns : 29"


Bonus Questions:


  1. Transform using the Airline data: create a new data frame with only the columns ’DepDelay’, ’ArrDelay’, ’Origin’, ’Dest’, AirTime’, and ’Distance’. In addition keep only the observation for flights that were delayed (DepDelay) by more than 1 hour. (Use the Airline data)
new_dataframe <- as.data.frame(subset(df_2, DepDelay > 60, select=c(DepDelay, ArrDelay, Origin, Dest, AirTime, Distance)))
head(new_dataframe,10)
##     DepDelay ArrDelay Origin Dest AirTime Distance
## 8        112      102    DEN  SNA     116      846
## 23        65       78    ORD  ROC      72      528
## 44        73       72    IAH  ORD     139      925
## 65       164      144    BUF  ORD      79      473
## 66       130      138    ORD  MSP      57      334
## 78       122      144    ORD  SAN     238     1723
## 86        65      180    CMH  ORD      71      296
## 107       86       79    LAX  DEN     109      862
## 111       89       79    DEN  OKC      64      495
## 121      130      113    ALB  ORD     107      723
paste("New dataframe rows:", dim(new_dataframe)[1], ", COlumns:", dim(new_dataframe)[2])
## [1] "New dataframe rows: 785 , COlumns: 6"

(a) Add a new column reporting departure delays minus the mean departure delay to the new data frame

#Creating new column with name 'DepDelayMinusMeanDepDelay'
new_dataframe$DepDelayMinusMeanDepDelay <- new_dataframe$DepDelay - mean(new_dataframe$DepDelay)
head(new_dataframe,10)
##     DepDelay ArrDelay Origin Dest AirTime Distance DepDelayMinusMeanDepDelay
## 8        112      102    DEN  SNA     116      846                  6.073885
## 23        65       78    ORD  ROC      72      528                -40.926115
## 44        73       72    IAH  ORD     139      925                -32.926115
## 65       164      144    BUF  ORD      79      473                 58.073885
## 66       130      138    ORD  MSP      57      334                 24.073885
## 78       122      144    ORD  SAN     238     1723                 16.073885
## 86        65      180    CMH  ORD      71      296                -40.926115
## 107       86       79    LAX  DEN     109      862                -19.926115
## 111       89       79    DEN  OKC      64      495                -16.926115
## 121      130      113    ALB  ORD     107      723                 24.073885


(b) Convert the departure and arrival delays columns from minutes into hours

new_dataframe$DepDelay <- round(new_dataframe$DepDelay/60,2)
new_dataframe$ArrDelay <- round(new_dataframe$ArrDelay/60,2)
head(new_dataframe,10)
##     DepDelay ArrDelay Origin Dest AirTime Distance DepDelayMinusMeanDepDelay
## 8       1.87     1.70    DEN  SNA     116      846                  6.073885
## 23      1.08     1.30    ORD  ROC      72      528                -40.926115
## 44      1.22     1.20    IAH  ORD     139      925                -32.926115
## 65      2.73     2.40    BUF  ORD      79      473                 58.073885
## 66      2.17     2.30    ORD  MSP      57      334                 24.073885
## 78      2.03     2.40    ORD  SAN     238     1723                 16.073885
## 86      1.08     3.00    CMH  ORD      71      296                -40.926115
## 107     1.43     1.32    LAX  DEN     109      862                -19.926115
## 111     1.48     1.32    DEN  OKC      64      495                -16.926115
## 121     2.17     1.88    ALB  ORD     107      723                 24.073885


(c) Add a column with the average flight speed (in mph)

#Added new column with name 'AvgFlightSpeed' which denotes speed in mph
new_dataframe$AvgFlightSpeed <- new_dataframe$Distance/(new_dataframe$AirTime/60)
head(new_dataframe,10)
##     DepDelay ArrDelay Origin Dest AirTime Distance DepDelayMinusMeanDepDelay
## 8       1.87     1.70    DEN  SNA     116      846                  6.073885
## 23      1.08     1.30    ORD  ROC      72      528                -40.926115
## 44      1.22     1.20    IAH  ORD     139      925                -32.926115
## 65      2.73     2.40    BUF  ORD      79      473                 58.073885
## 66      2.17     2.30    ORD  MSP      57      334                 24.073885
## 78      2.03     2.40    ORD  SAN     238     1723                 16.073885
## 86      1.08     3.00    CMH  ORD      71      296                -40.926115
## 107     1.43     1.32    LAX  DEN     109      862                -19.926115
## 111     1.48     1.32    DEN  OKC      64      495                -16.926115
## 121     2.17     1.88    ALB  ORD     107      723                 24.073885
##     AvgFlightSpeed
## 8         437.5862
## 23        440.0000
## 44        399.2806
## 65        359.2405
## 66        351.5789
## 78        434.3697
## 86        250.1408
## 107       474.4954
## 111       464.0625
## 121       405.4206


(d) Report your new table using the function summary()

summary(new_dataframe)
##     DepDelay         ArrDelay         Origin              Dest          
##  Min.   : 1.020   Min.   : 0.330   Length:785         Length:785        
##  1st Qu.: 1.230   1st Qu.: 1.280   Class :character   Class :character  
##  Median : 1.580   Median : 1.720   Mode  :character   Mode  :character  
##  Mean   : 1.766   Mean   : 1.842                                        
##  3rd Qu.: 2.070   3rd Qu.: 2.200                                        
##  Max.   :18.000   Max.   :17.780                                        
##                   NA's   :2                                             
##     AirTime           Distance      DepDelayMinusMeanDepDelay AvgFlightSpeed  
##  Min.   :-1395.0   Min.   :  56.0   Min.   :-44.93            Min.   :-11.35  
##  1st Qu.:   55.0   1st Qu.: 316.0   1st Qu.:-31.93            1st Qu.:328.80  
##  Median :   90.0   Median : 590.0   Median :-10.93            Median :393.05  
##  Mean   :  101.1   Mean   : 677.9   Mean   :  0.00            Mean   :378.69  
##  3rd Qu.:  134.0   3rd Qu.: 867.0   3rd Qu.: 18.07            3rd Qu.:442.68  
##  Max.   :  346.0   Max.   :2704.0   Max.   :974.07            Max.   :543.11  
##  NA's   :2                                                    NA's   :2