# load data
#install.packages("dplyr")
#install.packages("nycflights13", repos='http://cran.us.r-project.org')
library(nycflights13)
## Warning: package 'nycflights13' was built under R version 3.3.3
# Keep a working copy of the flights table provided by nycflights13 and
# print summary statistics for each of its columns.
fl_data <- nycflights13::flights
summary(fl_data)
## year month day dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907
## Median :2013 Median : 7.000 Median :16.00 Median :1401
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400
## NA's :8255
## sched_dep_time dep_delay arr_time sched_arr_time
## Min. : 106 Min. : -43.00 Min. : 1 Min. : 1
## 1st Qu.: 906 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124
## Median :1359 Median : -2.00 Median :1535 Median :1556
## Mean :1344 Mean : 12.64 Mean :1502 Mean :1536
## 3rd Qu.:1729 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945
## Max. :2359 Max. :1301.00 Max. :2400 Max. :2359
## NA's :8255 NA's :8713
## arr_delay carrier flight tailnum
## Min. : -86.000 Length:336776 Min. : 1 Length:336776
## 1st Qu.: -17.000 Class :character 1st Qu.: 553 Class :character
## Median : -5.000 Mode :character Median :1496 Mode :character
## Mean : 6.895 Mean :1972
## 3rd Qu.: 14.000 3rd Qu.:3465
## Max. :1272.000 Max. :8500
## NA's :9430
## origin dest air_time distance
## Length:336776 Length:336776 Min. : 20.0 Min. : 17
## Class :character Class :character 1st Qu.: 82.0 1st Qu.: 502
## Mode :character Mode :character Median :129.0 Median : 872
## Mean :150.7 Mean :1040
## 3rd Qu.:192.0 3rd Qu.:1389
## Max. :695.0 Max. :4983
## NA's :9430
## hour minute time_hour
## Min. : 1.00 Min. : 0.00 Min. :2013-01-01 05:00:00
## 1st Qu.: 9.00 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00
## Median :13.00 Median :29.00 Median :2013-07-03 10:00:00
## Mean :13.18 Mean :26.23 Mean :2013-07-03 05:02:36
## 3rd Qu.:17.00 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00
## Max. :23.00 Max. :59.00 Max. :2013-12-31 23:00:00
##
# Extract only the columns relevant to the delay analysis and drop every row
# that has a missing value in any of them.
origin <- fl_data$origin
dest <- fl_data$dest
dep_delay <- fl_data$dep_delay
arr_delay <- fl_data$arr_delay

# stringsAsFactors is passed explicitly because its default changed from TRUE
# to FALSE in R 4.0.0. The per-level counts that summary() prints for origin
# and dest only appear when those columns are factors.
fl_rsrch_data <- na.omit(
  data.frame(origin, dest, dep_delay, arr_delay, stringsAsFactors = TRUE)
)

# One data frame per departure airport.
fl_data_EWR <- subset(fl_rsrch_data, origin == "EWR")
fl_data_LGA <- subset(fl_rsrch_data, origin == "LGA")
fl_data_JFK <- subset(fl_rsrch_data, origin == "JFK")

# Summaries are printed in the order EWR, LGA, JFK.
summary(fl_data_EWR)
summary(fl_data_LGA)
summary(fl_data_JFK)
## origin dest dep_delay arr_delay
## EWR:117127 ORD : 5828 Min. : -25.00 Min. : -86.000
## JFK: 0 BOS : 5247 1st Qu.: -4.00 1st Qu.: -16.000
## LGA: 0 SFO : 5064 Median : -1.00 Median : -4.000
## CLT : 4893 Mean : 15.01 Mean : 9.107
## MCO : 4893 3rd Qu.: 15.00 3rd Qu.: 16.000
## ATL : 4876 Max. :1126.00 Max. :1109.000
## (Other):86326
## origin dest dep_delay arr_delay
## EWR: 0 ATL :10041 Min. :-33.00 Min. :-68.000
## JFK: 0 ORD : 8507 1st Qu.: -6.00 1st Qu.:-17.000
## LGA:101140 CLT : 5961 Median : -3.00 Median : -5.000
## MIA : 5702 Mean : 10.29 Mean : 5.784
## DTW : 4908 3rd Qu.: 7.00 3rd Qu.: 12.000
## DFW : 4682 Max. :911.00 Max. :915.000
## (Other):61339
## origin dest dep_delay arr_delay
## EWR: 0 LAX :11159 Min. : -43.00 Min. : -79.000
## JFK:109079 SFO : 8109 1st Qu.: -5.00 1st Qu.: -18.000
## LGA: 0 BOS : 5773 Median : -1.00 Median : -6.000
## MCO : 5429 Mean : 12.02 Mean : 5.551
## SJU : 4710 3rd Qu.: 10.00 3rd Qu.: 13.000
## FLL : 4210 Max. :1301.00 Max. :1272.000
## (Other):69689
You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
Analyse and verify whether the “origin location” of a given flight route has a significant effect on the “delay in departure time” that is observed upon a flight’s departure to its designated destination.
A flight departs from its origin airport, so we need to verify whether dep_delay depends on origin.
# Departure delays recorded for each origin airport.
EWR_ORIG <- fl_data_EWR$dep_delay
JFK_ORIG <- fl_data_JFK$dep_delay
LGA_ORIG <- fl_data_LGA$dep_delay

# Draw equally sized random samples (without replacement) so that every group
# has the size of the smallest one. The common size is computed once.
n_per_group <- min(length(EWR_ORIG), length(JFK_ORIG), length(LGA_ORIG))

# Fix the RNG seed so the samples, and every statistic derived from them,
# are identical from run to run.
set.seed(2013)

# Sample by index rather than calling sample(x, n) directly: sample() treats a
# single positive number x as the range 1:x, which would silently return
# values that are not in the data if a group ever had only one observation.
EWR <- EWR_ORIG[sample.int(length(EWR_ORIG), n_per_group)]
JFK <- JFK_ORIG[sample.int(length(JFK_ORIG), n_per_group)]
LGA <- LGA_ORIG[sample.int(length(LGA_ORIG), n_per_group)]
What are the cases, and how many are there?
Each case is one flight. There are 101140 cases of departure delay from each of the 3 airports (EWR, JFK and LGA).
Describe the method of data collection.
Airline on-time data for all flights departing NYC in 2013. Also includes useful ‘metadata’ on airlines, airports, weather, and planes.
https://cran.r-project.org/web/packages/nycflights13/nycflights13.pdf
https://github.com/hadley/nycflights13/blob/master/data/flights.rda?raw=true
What type of study is this (observational/experiment)?
Observational
If you collected the data, state self-collected. If not, provide a citation/link.
Airline on-time data for all flights departing NYC in 2013. Also includes useful ‘metadata’ on airlines, airports, weather, and planes.
https://cran.r-project.org/web/packages/nycflights13/nycflights13.pdf
https://github.com/hadley/nycflights13/blob/master/data/flights.rda?raw=true
What is the response variable, and what type is it (numerical/categorical)?
Numerical (Delay)
What is the explanatory variable, and what type is it (numerical/categorical)?
categorical (Airports)
Provide summary statistics relevant to your research question. For example, if you’re comparing means across groups provide means, SDs, sample sizes of each group. This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
# There are departure delays from 3 airports (EWR, JFK and LGA).
# We need to test whether the variability of the group means is so large that
# it seems unlikely to be due to chance alone. ANOVA with an F test is suitable
# in this situation because there are more than 2 groups of data.
# The summary details of the 3 data sets are shown below.
summary(EWR)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -25.00 -4.00 -1.00 14.96 15.00 1126.00
summary(JFK)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -43.00 -5.00 -1.00 12.03 10.00 1301.00
summary(LGA)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -33.00 -6.00 -3.00 10.29 7.00 911.00