This project is where you show off your ability to
(1) use R packages
(2) change the shape of data in a data frame
(3) provide basic summary statistics and graphics as part of your exploratory data analysis.
. You should start by installing and loading the R package hflights, which is available on CRAN and described here: https://cran.r-project.org/web/packages/hflights/hflights.pdf
. You should perform some basic transformations on the data frame, such as including only a subset of columns, created one or more new derived columns (such as flight date), and/or filtering out rows.
. You should then create basic summary statistics. You should create at least one interesting question for analysis, and answer the question using two or more graphics in R.
#install.packages("hflights")
library(ggplot2)
library(gcookbook)
library(hflights)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
nrow(hflights)
## [1] 227496
ncol(hflights)
## [1] 21
colnames(hflights)
## [1] "Year" "Month" "DayofMonth"
## [4] "DayOfWeek" "DepTime" "ArrTime"
## [7] "UniqueCarrier" "FlightNum" "TailNum"
## [10] "ActualElapsedTime" "AirTime" "ArrDelay"
## [13] "DepDelay" "Origin" "Dest"
## [16] "Distance" "TaxiIn" "TaxiOut"
## [19] "Cancelled" "CancellationCode" "Diverted"
head(hflights, n=10)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## 5430 2011 1 7 5 1359 1509 AA
## 5431 2011 1 8 6 1355 1454 AA
## 5432 2011 1 9 7 1443 1554 AA
## 5433 2011 1 10 1 1443 1553 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## 5427 428 N403AA 70 39 3 3 IAH
## 5428 428 N492AA 62 44 -3 5 IAH
## 5429 428 N262AA 64 45 -7 -1 IAH
## 5430 428 N493AA 70 43 -1 -1 IAH
## 5431 428 N477AA 59 40 -16 -5 IAH
## 5432 428 N476AA 71 41 44 43 IAH
## 5433 428 N504AA 70 45 43 43 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
## 5427 DFW 224 9 22 0 0
## 5428 DFW 224 9 9 0 0
## 5429 DFW 224 6 13 0 0
## 5430 DFW 224 12 15 0 0
## 5431 DFW 224 7 12 0 0
## 5432 DFW 224 8 22 0 0
## 5433 DFW 224 6 19 0 0
hflights_df = hflights[c("UniqueCarrier","Month","DayofMonth", "Year",
"Origin","Dest","DepTime", "ArrTime", "DepDelay", "ArrDelay",
"Cancelled","CancellationCode")]
head(hflights_df, 10)
## UniqueCarrier Month DayofMonth Year Origin Dest DepTime ArrTime
## 5424 AA 1 1 2011 IAH DFW 1400 1500
## 5425 AA 1 2 2011 IAH DFW 1401 1501
## 5426 AA 1 3 2011 IAH DFW 1352 1502
## 5427 AA 1 4 2011 IAH DFW 1403 1513
## 5428 AA 1 5 2011 IAH DFW 1405 1507
## 5429 AA 1 6 2011 IAH DFW 1359 1503
## 5430 AA 1 7 2011 IAH DFW 1359 1509
## 5431 AA 1 8 2011 IAH DFW 1355 1454
## 5432 AA 1 9 2011 IAH DFW 1443 1554
## 5433 AA 1 10 2011 IAH DFW 1443 1553
## DepDelay ArrDelay Cancelled CancellationCode
## 5424 0 -10 0
## 5425 1 -9 0
## 5426 -8 -8 0
## 5427 3 3 0
## 5428 5 -3 0
## 5429 -1 -7 0
## 5430 -1 -1 0
## 5431 -5 -16 0
## 5432 43 44 0
## 5433 43 43 0
carrier_df = hflights_df [c("UniqueCarrier")]
nrow(carrier_df)
## [1] 227496
ggplot(carrier_df, aes(x=UniqueCarrier), stat="count") + geom_bar(fill="#AFFAAA", color="black")
origins_df=sqldf("select Origin, count(*) As NumberOfFlights from hflights_df group by Origin")
## Loading required package: tcltk
print(origins_df)
## Origin NumberOfFlights
## 1 HOU 52299
## 2 IAH 175197
ggplot(origins_df, aes(x=Origin, y=NumberOfFlights)) + geom_bar(fill="#9999ff", color="black", stat="identity")
destinations_df = sqldf("select Dest, NumberOfFlights from (select Dest, count(*) As NumberOfFlights from hflights_df group by Dest) order by NumberOfFlights desc")
nrow(destinations_df)
## [1] 116
destinations_df = head(destinations_df, 20)
print (destinations_df)
## Dest NumberOfFlights
## 1 DAL 9820
## 2 ATL 7886
## 3 MSY 6823
## 4 DFW 6653
## 5 LAX 6064
## 6 DEN 5920
## 7 ORD 5748
## 8 PHX 5096
## 9 AUS 5022
## 10 SAT 4893
## 11 CRP 4813
## 12 CLT 4735
## 13 EWR 4314
## 14 LAS 4082
## 15 HRL 3983
## 16 MCO 3687
## 17 BNA 3481
## 18 MCI 3174
## 19 OKC 3170
## 20 TPA 3085
ggplot(destinations_df, aes(x=Dest, y=NumberOfFlights)) + geom_bar(fill="#ff6666", color="black", stat="identity")
hflights_df$Date = paste( hflights_df$Month , hflights_df$DayofMonth , hflights_df$Year, sep = "/" )
hflights_df$MonthStr [hflights_df$Month=="1"] = "01"
hflights_df$MonthStr [hflights_df$Month=="2"] = "02"
hflights_df$MonthStr [hflights_df$Month=="3"] = "03"
hflights_df$MonthStr [hflights_df$Month=="4"] = "04"
hflights_df$MonthStr [hflights_df$Month=="5"] = "05"
hflights_df$MonthStr [hflights_df$Month=="6"] = "06"
hflights_df$MonthStr [hflights_df$Month=="7"] = "07"
hflights_df$MonthStr [hflights_df$Month=="8"] = "08"
hflights_df$MonthStr [hflights_df$Month=="9"] = "09"
hflights_df$MonthStr [hflights_df$Month=="10"] = "10"
hflights_df$MonthStr [hflights_df$Month=="11"] = "11"
hflights_df$MonthStr [hflights_df$Month=="12"] = "12"
monthly_df=sqldf("select MonthStr, count(*) As NumberOfFlights from hflights_df group by MonthStr")
print(monthly_df)
## MonthStr NumberOfFlights
## 1 01 18910
## 2 02 17128
## 3 03 19470
## 4 04 18593
## 5 05 19172
## 6 06 19600
## 7 07 20548
## 8 08 20176
## 9 09 18065
## 10 10 18696
## 11 11 18021
## 12 12 19117
ggplot(monthly_df, aes(x=MonthStr, y=NumberOfFlights)) + geom_bar(fill="#8877aa", color="black", stat="identity")
monthly_df_sorted = sqldf("select MonthStr As Month, NumberOfFlights from monthly_df order by NumberOfFlights desc")
head(monthly_df_sorted, 12)
## Month NumberOfFlights
## 1 07 20548
## 2 08 20176
## 3 06 19600
## 4 03 19470
## 5 05 19172
## 6 12 19117
## 7 01 18910
## 8 10 18696
## 9 04 18593
## 10 09 18065
## 11 11 18021
## 12 02 17128
head(hflights_df, n=10)
## UniqueCarrier Month DayofMonth Year Origin Dest DepTime ArrTime
## 5424 AA 1 1 2011 IAH DFW 1400 1500
## 5425 AA 1 2 2011 IAH DFW 1401 1501
## 5426 AA 1 3 2011 IAH DFW 1352 1502
## 5427 AA 1 4 2011 IAH DFW 1403 1513
## 5428 AA 1 5 2011 IAH DFW 1405 1507
## 5429 AA 1 6 2011 IAH DFW 1359 1503
## 5430 AA 1 7 2011 IAH DFW 1359 1509
## 5431 AA 1 8 2011 IAH DFW 1355 1454
## 5432 AA 1 9 2011 IAH DFW 1443 1554
## 5433 AA 1 10 2011 IAH DFW 1443 1553
## DepDelay ArrDelay Cancelled CancellationCode Date MonthStr
## 5424 0 -10 0 1/1/2011 01
## 5425 1 -9 0 1/2/2011 01
## 5426 -8 -8 0 1/3/2011 01
## 5427 3 3 0 1/4/2011 01
## 5428 5 -3 0 1/5/2011 01
## 5429 -1 -7 0 1/6/2011 01
## 5430 -1 -1 0 1/7/2011 01
## 5431 -5 -16 0 1/8/2011 01
## 5432 43 44 0 1/9/2011 01
## 5433 43 43 0 1/10/2011 01