Step 1: Load data from local directory
#Load Data using read function
df_1 <- read.csv(file = "../Downloads/tornado.csv")
#head(df_1)
# To view interactive Table
library(DT)
datatable(head(df_1,20))
Step 2: Check unique attributes of Dataset
# Check dimension of dataset
paste("Number of unique rows", dim(df_1)[1], ", Number of unique cols", dim(df_1)[2])
## [1] "Number of unique rows 11141 , Number of unique cols 10"
- How many tornadoes occurred in WA (Washington) or MS (Mississippi)?
df_a = subset(df_1,st == "WA" | st == "MS")
total_tornadoes = length(df_a$st)
paste("Total Tornadoes occurred in WA (Washington) or MS (Mississippi) are",total_tornadoes)
## [1] "Total Tornadoes occurred in WA (Washington) or MS (Mississippi) are 530"
- How many tornadoes occurred in WA after 2012?
df_b = subset(df_1, st == "WA" & yr > 2012)
total_tornadoes <- nrow(df_b)
paste("Total Tornadoes occurred in WA after 2012 are",total_tornadoes)
## [1] "Total Tornadoes occurred in WA after 2012 are 7"
- Get all tornadoes that occurred in WA in 2012, 2013, and 2014.
df_c = subset(df_1, st == "WA" & (yr == 2012 | yr == 2013 | yr == 2014))
total_tornadoes <- nrow(df_c)
paste("All tornadoes that occurred in WA in 2012, 2013, and 2014 are", total_tornadoes)
## [1] "All tornadoes that occurred in WA in 2012, 2013, and 2014 are 5"
- Return the ’mo’ (month), ’yr’ (year), and ’f’ (F-scale) for all tornadoes that occurred in HI (Hawaii).
df_d = subset(df_1, st == "HI", select=c(mo, yr, f))
df_d
## mo yr f
## 2766 9 2008 EF-0
## 2842 12 2008 EF-0
## 2878 2 2009 EF-0
## 5371 2 2011 EF-0
## 7359 3 2012 EF-0
## 10093 4 2015 EF-0
- Arrange the tornadoes by date and time. What state had the most recent tornado?
# Arranging tornadoes by datetime
sorted_df <- df_1[order(df_1$date, df_1$time),]
head(sorted_df,20)
## yr mo dy date time tz st stf stn f
## 1 2007 1 4 2007-01-04 15:45:00 3 LA 22 1 EF-1
## 2 2007 1 4 2007-01-04 16:35:00 3 LA 22 2 EF-1
## 3 2007 1 5 2007-01-05 00:27:00 3 MS 28 1 EF-1
## 4 2007 1 5 2007-01-05 00:40:00 3 MS 28 2 EF-0
## 5 2007 1 5 2007-01-05 00:57:00 3 MS 28 3 EF-1
## 6 2007 1 5 2007-01-05 01:07:00 3 MS 28 4 EF-1
## 7 2007 1 5 2007-01-05 01:25:00 3 MS 28 5 EF-2
## 8 2007 1 5 2007-01-05 01:40:00 3 MS 28 6 EF-2
## 9 2007 1 5 2007-01-05 02:05:00 3 MS 28 7 EF-1
## 10 2007 1 5 2007-01-05 09:05:00 3 GA 13 1 EF-1
## 11 2007 1 5 2007-01-05 10:00:00 3 GA 13 2 EF-0
## 12 2007 1 5 2007-01-05 13:24:00 3 SC 45 1 EF-1
## 13 2007 1 5 2007-01-05 14:11:00 3 SC 45 2 EF-0
## 14 2007 1 5 2007-01-05 15:39:00 3 NC 37 1 EF-0
## 15 2007 1 5 2007-01-05 16:10:00 3 GA 13 3 EF-0
## 16 2007 1 7 2007-01-07 16:10:00 3 AL 1 1 EF-1
## 17 2007 1 7 2007-01-07 17:39:00 3 GA 13 4 EF-2
## 18 2007 1 7 2007-01-07 19:50:00 3 AL 1 2 EF-0
## 19 2007 1 7 2007-01-07 20:29:00 3 AL 1 3 EF-1
## 20 2007 1 12 2007-01-12 16:35:00 3 TX 48 1 EF-0
# Finding state which had the most recent tornado
recent_tornadoes <- df_1[order(rev(df_1$date), rev(df_1$time)),]
recent_tornado_state <- recent_tornadoes[1,"st"]
paste("State which had the most recent tornado is",recent_tornado_state)
## [1] "State which had the most recent tornado is NC"
- Get the count of tornadoes by months.
tornado_count <- table(df_1$mo)
tornado_count_by_month <- as.data.frame(t(tornado_count))[,-1]
colnames(tornado_count_by_month) <- c("Month number", "Count of Tornadoes")
tornado_count_by_month
## Month number Count of Tornadoes
## 1 1 355
## 2 2 463
## 3 3 751
## 4 4 2171
## 5 5 2500
## 6 6 1899
## 7 7 861
## 8 8 517
## 9 9 401
## 10 10 525
## 11 11 350
## 12 12 348
Step 1: Load data from local directory
#Load Data using read function
df_2 <- read.csv(file = "../Downloads/airline.csv")
#head(df_1)
# To view interactive Table
library(DT)
datatable(head(df_2,20))
Step 2: Check unique attributes of Dataset
# Check dimension of dataset
paste("Number of unique rows", dim(df_2)[1], ", Number of unique cols", dim(df_2)[2])
## [1] "Number of unique rows 19928 , Number of unique cols 29"
- Subset: Find the carrier (UniqueCarrier), the flight number (FlightNum), and origin (origin) of all the flights that:
subset(df_2, DepTime > 2200 & Dest == "BNA", select=c(UniqueCarrier, FlightNum, Origin))
## UniqueCarrier FlightNum Origin
## 7021 OH 5421 CVG
## 17401 AA 2435 ORD
## 19478 DH 7332 ORD
subset(df_2, DepTime > 2200 & (Origin == "BNA" | Dest == "MEM"), select=c(UniqueCarrier, FlightNum, Origin))
## UniqueCarrier FlightNum Origin
## 2392 WN 1447 BNA
## 8169 OO 6838 ORD
## 10405 DL 579 ATL
## 19352 DH 6270 CVG
df_iii <- subset(df_2, ArrDelay > 120, select=c(UniqueCarrier, FlightNum, Origin))
head(df_iii,10)
## UniqueCarrier FlightNum Origin
## 27 UA 441 ORD
## 65 UA 471 BUF
## 66 UA 471 ORD
## 78 UA 481 ORD
## 86 UA 489 CMH
## 126 UA 518 ORD
## 149 UA 538 ORD
## 154 UA 542 ORD
## 155 UA 543 BOS
## 158 UA 544 SNA
paste("Dataframe rows:", dim(df_iii)[1],", Columns:",dim(df_iii)[2])
## [1] "Dataframe rows: 280 , Columns: 3"
df_iv <- subset(df_2, ArrDelay > 120 & DepDelay <= 0, select=c(UniqueCarrier, FlightNum, Origin))
head(df_iv,20)
## UniqueCarrier FlightNum Origin
## 9495 TZ 505 PIE
paste("Dataframe rows:",dim(df_iv)[1], ", Columns:",dim(df_iv)[2])
## [1] "Dataframe rows: 1 , Columns: 3"
- Arrange: make use of minus sign in front of the function to reverse order() and the function head() to get the relevant part: (Use the Airline data)
head(df_2[order(-df_2$DepDelay),],5)
## Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime
## 5884 2004 3 25 4 745 1345 954 1607
## 11220 2004 3 25 4 2031 1507 2319 1759
## 14475 2004 3 25 4 2005 1501 2230 1717
## 15132 2004 3 25 4 1807 1306 2336 1830
## 16220 2004 3 25 4 1218 721 1411 912
## UniqueCarrier FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime
## 5884 NW 1816 N608NW 69 82 48
## 11220 DL 1603 N978DL 108 112 82
## 14475 MQ 4072 N514MQ 145 136 93
## 15132 NW 180 N352NW 209 204 188
## 16220 AA 1035 N5FKAA 173 171 155
## ArrDelay DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled
## 5884 1067 1080 MDW DTW 229 10 11 0
## 11220 320 324 MSP CVG 596 7 19 0
## 14475 313 304 ORD OKC 693 6 46 0
## 15132 306 301 LAX MEM 1619 7 14 0
## 16220 299 297 MCO ORD 1005 6 12 0
## CancellationCode Diverted CarrierDelay WeatherDelay NASDelay
## 5884 0 1067 0 0
## 11220 0 0 0 0
## 14475 0 254 0 9
## 15132 0 301 0 5
## 16220 0 297 0 2
## SecurityDelay LateAircraftDelay
## 5884 0 0
## 11220 0 320
## 14475 0 50
## 15132 0 0
## 16220 0 0
head(df_2[order(df_2$DepDelay),],5)
## Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime
## 14202 2004 3 25 4 2047 2113 2225 2227
## 6293 2004 3 25 4 911 935 1053 1109
## 5109 2004 3 25 4 1053 1115 1129 1159
## 16874 2004 3 25 4 2109 2130 2159 2218
## 5983 2004 3 25 4 1015 1035 1226 1243
## UniqueCarrier FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime
## 14202 MQ 3647 N817MQ 98 74 52
## 6293 OH 5260 N933CA 102 94 75
## 5109 NW 707 N986US 36 44 28
## 16874 AA 1685 N4XGAA 50 48 36
## 5983 NW 1926 N9346 71 68 46
## ArrDelay DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled
## 14202 -2 -26 DFW SGF 364 3 43 0
## 6293 -16 -24 RIC CVG 413 1 26 0
## 5109 -30 -22 GTF FCA 146 2 6 0
## 16874 -19 -21 HOU AUS 148 5 9 0
## 5983 -17 -20 MDW DTW 229 12 13 0
## CancellationCode Diverted CarrierDelay WeatherDelay NASDelay
## 14202 0 0 0 0
## 6293 0 0 0 0
## 5109 0 0 0 0
## 16874 0 0 0 0
## 5983 0 0 0 0
## SecurityDelay LateAircraftDelay
## 14202 0 0
## 6293 0 0
## 5109 0 0
## 16874 0 0
## 5983 0 0
sorted_flights_breaking_by_arrival <- df_2[order(df_2$Dest, -df_2$ArrDelay),]
head(sorted_flights_breaking_by_arrival,5)
## Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime
## 19583 2004 3 25 4 1744 1615 2110 1910
## 8513 2004 3 25 4 2015 1910 2124 2029
## 19679 2004 3 25 4 1342 1350 1712 1645
## 6398 2004 3 25 4 1116 1055 1240 1228
## 8331 2004 3 25 4 1139 1140 1300 1255
## UniqueCarrier FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime
## 19583 DH 7442 N676BR 146 115 85
## 8513 XE 2460 N14930 69 79 53
## 19679 DH 7546 N662BR 150 115 86
## 6398 OH 5533 N709CA 84 93 73
## 8331 XE 2431 N28518 81 75 58
## ArrDelay DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled
## 19583 120 89 ORD ABE 654 4 56 0
## 8513 55 65 CLE ABE 339 3 13 0
## 19679 27 -8 ORD ABE 654 4 60 0
## 6398 12 21 CVG ABE 503 2 9 0
## 8331 5 -1 CLE ABE 339 11 12 0
## CancellationCode Diverted CarrierDelay WeatherDelay NASDelay
## 19583 0 0 0 30
## 8513 0 0 0 0
## 19679 0 0 0 27
## 6398 0 0 0 0
## 8331 0 0 0 0
## SecurityDelay LateAircraftDelay
## 19583 0 90
## 8513 0 55
## 19679 0 0
## 6398 0 0
## 8331 0 0
paste("Dataframe rows:",dim(sorted_flights_breaking_by_arrival)[1], ", Columns :",dim(sorted_flights_breaking_by_arrival)[2])
## [1] "Dataframe rows: 19928 , Columns : 29"
- Transform using the Airline data: create a new data frame with only the columns ’DepDelay’, ’ArrDelay’, ’Origin’, ’Dest’, AirTime’, and ’Distance’. In addition keep only the observation for flights that were delayed (DepDelay) by more than 1 hour. (Use the Airline data)
new_dataframe <- as.data.frame(subset(df_2, DepDelay > 60, select=c(DepDelay, ArrDelay, Origin, Dest, AirTime, Distance)))
head(new_dataframe,10)
## DepDelay ArrDelay Origin Dest AirTime Distance
## 8 112 102 DEN SNA 116 846
## 23 65 78 ORD ROC 72 528
## 44 73 72 IAH ORD 139 925
## 65 164 144 BUF ORD 79 473
## 66 130 138 ORD MSP 57 334
## 78 122 144 ORD SAN 238 1723
## 86 65 180 CMH ORD 71 296
## 107 86 79 LAX DEN 109 862
## 111 89 79 DEN OKC 64 495
## 121 130 113 ALB ORD 107 723
paste("New dataframe rows:", dim(new_dataframe)[1], ", COlumns:", dim(new_dataframe)[2])
## [1] "New dataframe rows: 785 , COlumns: 6"
#Creating new column with name 'DepDelayMinusMeanDepDelay'
new_dataframe$DepDelayMinusMeanDepDelay <- new_dataframe$DepDelay - mean(new_dataframe$DepDelay)
head(new_dataframe,10)
## DepDelay ArrDelay Origin Dest AirTime Distance DepDelayMinusMeanDepDelay
## 8 112 102 DEN SNA 116 846 6.073885
## 23 65 78 ORD ROC 72 528 -40.926115
## 44 73 72 IAH ORD 139 925 -32.926115
## 65 164 144 BUF ORD 79 473 58.073885
## 66 130 138 ORD MSP 57 334 24.073885
## 78 122 144 ORD SAN 238 1723 16.073885
## 86 65 180 CMH ORD 71 296 -40.926115
## 107 86 79 LAX DEN 109 862 -19.926115
## 111 89 79 DEN OKC 64 495 -16.926115
## 121 130 113 ALB ORD 107 723 24.073885
new_dataframe$DepDelay <- round(new_dataframe$DepDelay/60,2)
new_dataframe$ArrDelay <- round(new_dataframe$ArrDelay/60,2)
head(new_dataframe,10)
## DepDelay ArrDelay Origin Dest AirTime Distance DepDelayMinusMeanDepDelay
## 8 1.87 1.70 DEN SNA 116 846 6.073885
## 23 1.08 1.30 ORD ROC 72 528 -40.926115
## 44 1.22 1.20 IAH ORD 139 925 -32.926115
## 65 2.73 2.40 BUF ORD 79 473 58.073885
## 66 2.17 2.30 ORD MSP 57 334 24.073885
## 78 2.03 2.40 ORD SAN 238 1723 16.073885
## 86 1.08 3.00 CMH ORD 71 296 -40.926115
## 107 1.43 1.32 LAX DEN 109 862 -19.926115
## 111 1.48 1.32 DEN OKC 64 495 -16.926115
## 121 2.17 1.88 ALB ORD 107 723 24.073885
#Added new column with name 'AvgFlightSpeed' which denotes speed in mph
new_dataframe$AvgFlightSpeed <- new_dataframe$Distance/(new_dataframe$AirTime/60)
head(new_dataframe,10)
## DepDelay ArrDelay Origin Dest AirTime Distance DepDelayMinusMeanDepDelay
## 8 1.87 1.70 DEN SNA 116 846 6.073885
## 23 1.08 1.30 ORD ROC 72 528 -40.926115
## 44 1.22 1.20 IAH ORD 139 925 -32.926115
## 65 2.73 2.40 BUF ORD 79 473 58.073885
## 66 2.17 2.30 ORD MSP 57 334 24.073885
## 78 2.03 2.40 ORD SAN 238 1723 16.073885
## 86 1.08 3.00 CMH ORD 71 296 -40.926115
## 107 1.43 1.32 LAX DEN 109 862 -19.926115
## 111 1.48 1.32 DEN OKC 64 495 -16.926115
## 121 2.17 1.88 ALB ORD 107 723 24.073885
## AvgFlightSpeed
## 8 437.5862
## 23 440.0000
## 44 399.2806
## 65 359.2405
## 66 351.5789
## 78 434.3697
## 86 250.1408
## 107 474.4954
## 111 464.0625
## 121 405.4206
summary(new_dataframe)
## DepDelay ArrDelay Origin Dest
## Min. : 1.020 Min. : 0.330 Length:785 Length:785
## 1st Qu.: 1.230 1st Qu.: 1.280 Class :character Class :character
## Median : 1.580 Median : 1.720 Mode :character Mode :character
## Mean : 1.766 Mean : 1.842
## 3rd Qu.: 2.070 3rd Qu.: 2.200
## Max. :18.000 Max. :17.780
## NA's :2
## AirTime Distance DepDelayMinusMeanDepDelay AvgFlightSpeed
## Min. :-1395.0 Min. : 56.0 Min. :-44.93 Min. :-11.35
## 1st Qu.: 55.0 1st Qu.: 316.0 1st Qu.:-31.93 1st Qu.:328.80
## Median : 90.0 Median : 590.0 Median :-10.93 Median :393.05
## Mean : 101.1 Mean : 677.9 Mean : 0.00 Mean :378.69
## 3rd Qu.: 134.0 3rd Qu.: 867.0 3rd Qu.: 18.07 3rd Qu.:442.68
## Max. : 346.0 Max. :2704.0 Max. :974.07 Max. :543.11
## NA's :2 NA's :2