1.Load packages - dplyr, hflights
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(hflights)
#explore
data(hflights)
dim(hflights)
## [1] 227496 21
str(hflights)
## 'data.frame': 227496 obs. of 21 variables:
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 6 7 1 2 3 4 5 6 7 1 ...
## $ DepTime : int 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
## $ ArrTime : int 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
## $ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
## $ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
## $ TailNum : chr "N576AA" "N557AA" "N541AA" "N403AA" ...
## $ ActualElapsedTime: int 60 60 70 70 62 64 70 59 71 70 ...
## $ AirTime : int 40 45 48 39 44 45 43 40 41 45 ...
## $ ArrDelay : int -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
## $ DepDelay : int 0 1 -8 3 5 -1 -1 -5 43 43 ...
## $ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
## $ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
## $ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
## $ TaxiIn : int 7 6 5 9 9 6 12 7 8 6 ...
## $ TaxiOut : int 13 9 17 22 9 13 15 12 22 19 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr "" "" "" "" ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
2.Combine data from the flights database with data from the hflights dataset
Identify the data elements needed for the flights table in Postgres flights database. And export the complete cases into a csv file, and import the data into the postgres db.
#Fetch the data into a dataframe.
hflights_df <- select(tbl_df(hflights),Year,Month, DayofMonth,DepTime,DepDelay,ArrTime,ArrDelay,UniqueCarrier,TailNum,FlightNum,Origin,Dest,AirTime,Distance)
#hour and minute are 'not nullable' fields in flights db.So just initializing these to -1s
hflights_df[, "hour"] <- -1
hflights_df[, "minute"] <- -1
#just need to grab the complete cases only. [NAs for deptime, depdelay etc..need to be ommitted]
hflights_df <- hflights_df[complete.cases(hflights_df),]
#Note down the number of observations[/rows]
dim(hflights_df)
## [1] 223874 16
#Now write this data frame into a csv
write.table(hflights_df, file='hflights_exported_data.csv',row.names=FALSE, col.names=FALSE, sep=",")
In Postgres - Import the data into the table flights
COPY flights FROM ‘C:/hflights_exported_data.csv’ WITH DELIMITER ‘,’ CSV;
So, the number of rows exported is matched with the rows imported.
3.Review the combined data in flights database and determine which city would you rather fly out of?
Assumption: Houston flight data is from 2011 , and NY flight data is from 2013. So, we are not comparing data from same time periods.
#Dep Delay, Arr Delay flights origniated from NewYork Vs Houston
data <- structure(list(NewYork= c(39L,38L,40L,39L),
Houston = c(22L, 48L,24L,47L)),
.Names = c("New York", "Houston"),
class = "data.frame", row.names=c(NA,-4L))
colors <- c("blue", "orange","yellow","green")
barplot(as.matrix(data), main="Origin - NewYork Vs Houston - Delays", ylab = "Delay", beside=TRUE, col=colors)
legend("topleft", c("Dep Delay Mins","Dep Delay %","Arr Delay Mins","Arr Delay %"), cex=0.65, bty="n", fill=colors)
Avg dep delay time in Houston is 17 minutes lower than NewYork. However, the delay percentages in Houston is 10% more.
Also NewYork has got 20% more departures (more number of flights from NY), compared to Houston.
Conclusion - NY is slightly better option to fly from.
4.Is it easier to get from Houton to New York than it is to get from New York to Houston? Explain
Houston to NewYork is slightly better option due to minimum delay minutes.
5.Find the carriers with dep delays greater than the ‘overall average dep delay time’
delayCarriers <- hflights_df %>%
select(UniqueCarrier, DepDelay) %>%
filter(DepDelay > 0) %>%
filter(DepDelay > mean(DepDelay, na.rm=TRUE)) %>%
group_by(UniqueCarrier) %>%
summarise(avg_delay = round(mean(DepDelay, na.rm=TRUE))) %>%
arrange(desc(avg_delay))
#plot the above.
library(ggplot2)
qplot(UniqueCarrier, data=delayCarriers, geom="bar", weight=avg_delay, ylab="Avg Delay", main="Carriers with Dep Delay > Ave Dep Delay", fill = I("grey50"))
6.Fetch the combined records directly from POSTGRES flights db & determine if old flights are causing delays.
#install.packages("RODBC",echo=FALSE)
#install.packages("RPostgreSQL",echo=FALSE)
library(RODBC)
library(RPostgreSQL)
## Loading required package: DBI
con <- dbConnect(PostgreSQL(), user= "postgres", password="test123", dbname="flights")
rs <- dbSendQuery(con,
"select p.year as manufactured_year, round(sum(f.arr_delay)) as total_arr_delay from planes p join flights f on p.tailnum = f.tailnum where f.arr_delay > 0 and p.year is not null group by manufactured_year order by manufactured_year desc, total_arr_delay")
#fetch observations
df <- fetch(rs, n=-1)
# Clearing the result set
dbClearResult(rs)
## [1] TRUE
# Disconnect
dbDisconnect(con)
## [1] TRUE
# Plot the graph
plot(df)
#lm() function to estimate a linear regression model
fit <- lm(df$total_arr_delay~df$manufactured_year, data=faithful)
#lines(df$manufactured_year, fitted(fit), col="blue") , or use abline.
abline(fit, col = "blue")
The above concludes that the old flights might not be causing delays.
7.Fetch the combined records directly from POSTGRES flights db & for each Origin , calculate which two days of the year they had their longest departure delays.
con <- dbConnect(PostgreSQL(), user= "postgres", password="test123", dbname="flights")
rs <- dbSendQuery(con, "select year, month, day, origin, dep_delay from flights where dep_delay > 0 ")
#fetch observations
df <- fetch(rs, n=-1)
dim(df)
## [1] 238006 5
# Clearing the result set
dbClearResult(rs)
## [1] TRUE
# Disconnect
dbDisconnect(con)
## [1] TRUE
#Now analyse with the df
df %>%
group_by(origin) %>%
select(month,day,dep_delay) %>%
top_n(2) %>%
arrange(origin, desc(dep_delay))
## Selecting by dep_delay
## Source: local data frame [10 x 4]
## Groups: origin
##
## origin month day dep_delay
## 1 EWR 1 10 1126
## 2 EWR 12 5 896
## 3 HOU 5 20 803
## 4 HOU 10 25 730
## 5 IAH 8 1 981
## 6 IAH 12 12 970
## 7 JFK 1 9 1301
## 8 JFK 6 15 1137
## 9 LGA 3 17 911
## 10 LGA 7 22 898
8.Fetch the combined records directly from POSTGRES flights db & for each origin , calculate the min and max arrival and departure delays.
con <- dbConnect(PostgreSQL(), user= "postgres", password="test123", dbname="flights")
rs <- dbSendQuery(con, "select origin, arr_delay, dep_delay from flights")
#fetch observations
df <- fetch(rs, n=-1)
dim(df)
## [1] 560650 3
# Clearing the result set
dbClearResult(rs)
## [1] TRUE
# Disconnect
dbDisconnect(con)
## [1] TRUE
#Now analyse with the df
df %>%
group_by(origin) %>%
summarise_each(funs(min(., na.rm=TRUE), max(., na.rm=TRUE)), matches("delay"))
## Source: local data frame [5 x 5]
##
## origin arr_delay_min dep_delay_min arr_delay_max dep_delay_max
## 1 EWR -86 -25 1109 1126
## 2 HOU -44 -23 822 803
## 3 IAH -70 -33 978 981
## 4 JFK -79 -43 1272 1301
## 5 LGA -68 -33 915 911