# Loading packages and dataset
library(dplyr)
library(tidyverse)
library(ggplot2)
library(GGally)
library(stringr)
library(lubridate)
library(readr)
flights1801 <- read_csv("stat650 dataset/ONTIME_1801.csv")
flights1802 <- read_csv("stat650 dataset/ONTIME_1802.csv")
flights1803 <- read_csv("stat650 dataset/ONTIME_1803.csv")
flights1804 <- read_csv("stat650 dataset/ONTIME_1804.csv")
flights1805 <- read_csv("stat650 dataset/ONTIME_1805.csv")
# To merge dataset
flights18 <- data.frame(rbind(flights1801, flights1802, flights1803, flights1804, flights1805))
# Remove single dataset to free-up spaces
rm(flights1801, flights1802, flights1803, flights1804, flights1805)
# Bay Area only dataset
sfoflights18 <- flights18 %>%
select(-24) %>%
rename(CARRIER = OP_UNIQUE_CARRIER,
AIRLINE_ID = OP_CARRIER_AIRLINE_ID) %>%
filter(ORIGIN %in% c("SFO", "OAK", "SJC") )
# CRS_DEP_TIME: scheduled departure time
** At the time I started my project, [OST_R | On-Time Performance] only has data till May 2018
sfoflights18 %>%
filter(MONTH == 1, CANCELLED == 0) %>%
select(MONTH, ORIGIN, DEP_TIME) %>%
group_by(MONTH) %>%
summarise(n = n())
library(nycflights13)
var_nyc <- names(flights)
var_sfo <- tolower(names(sfoflights18))
sort(var_nyc)
## [1] "air_time" "arr_delay" "arr_time" "carrier"
## [5] "day" "dep_delay" "dep_time" "dest"
## [9] "distance" "flight" "hour" "minute"
## [13] "month" "origin" "sched_arr_time" "sched_dep_time"
## [17] "tailnum" "time_hour" "year"
sort(var_sfo)
## [1] "airline_id" "arr_delay"
## [3] "arr_time" "cancelled"
## [5] "carrier" "crs_arr_time"
## [7] "crs_dep_time" "day_of_month"
## [9] "day_of_week" "dep_delay"
## [11] "dep_time" "dest_airport_id"
## [13] "dest_airport_seq_id" "dest_city_market_id"
## [15] "fl_date" "month"
## [17] "origin" "origin_airport_id"
## [19] "origin_airport_seq_id" "origin_city_market_id"
## [21] "origin_city_name" "tail_num"
## [23] "year"
| Variables | Description |
|---|---|
| arr_delay | Arrival Delay in minutes. Early arrival show negative rs. |
| arr_time | Arrival time (local time: hhmm) |
| crs_arr_time | Scheduled arrival time (local time: hhmm). |
| carrier | Carrier Code. |
| day_of_month | Day of Month. |
| crs_dep_time | Scheduled Departures time (local time: hhmm). |
| dep_delay | Departures Delay in minutes. Early departures show ive |
| dep_time | Departures time (local time: hhmm) |
| dest_airport_id | Destination Airport (SFOflights shows the ID) |
| month | Flight date (Month) |
| origin | Origin Airport |
| tail_num | Tail Number |
| fl_date | Flight Date (SFO data only shows yyyymmdd) |
| year | Flight date (year) |
| New Variables | Description |
|---|---|
| cancelled | Variable that show if the flight was cancelled or not. |
| dest_city_name | Destination city name |
| dest_airport_id | Destination Airport ID number |
| origin_city_market_id | Origin City ID number |
| ———————- | ——————————————- |
| Imprtant Vairbales | Description |
|---|---|
| cancelled | Variable that show if the flight was cancelled or not. |
| origin | Origin Airport |
| tailnum | Tail Number |
| fl_date | Flight Date (SFO data only shows yyyymmdd) |
| dep_time | Departures time (local time: hhmm) |
| arr_time | Arrival time (local time: hhmm) |
| arr_delay | Arrival Delay in minutes. Early arrival show negative numbers. |
| dep_delay | Departures Delay in minutes. Early departures show negative numbers. |
Answers:
Highest cancelled rate is in March 2018.
Lowest cancelled rate is in Feburary 2018.
There is no significant seasonal patterns since we only have five months of data, the sample size is too small. Especially all data is in one season (Quarter 1).
cnl_rate <- sfoflights18 %>%
select(MONTH, CANCELLED) %>%
group_by(MONTH) %>%
summarise(n = n(), cnl_n = sum(CANCELLED == 1), cnl_prop = paste(round(cnl_n / n * 100, 2), "%") ); cnl_rate
ggplot(data = cnl_rate, aes(x = MONTH, y = cnl_n)) +
geom_point() +
geom_line() +
ggtitle("SFO Flights Historical Cancellation Proportion")
# sfoflights18 has data in Bay Area only
sfoflights18 %>%
filter(CANCELLED == 0) %>%
group_by(TAIL_NUM) %>%
summarise(n = n()) %>%
arrange(desc(n) )
sfoflights18 %>%
select(FL_DATE, CANCELLED, TAIL_NUM) %>%
mutate( weeks = format(as.Date(FL_DATE), "%U") ) %>%
filter(TAIL_NUM == "N633VA", CANCELLED == 0) %>%
group_by(weeks) %>%
summarise(n = n()) %>%
ggplot(aes(x = weeks, y = n, group = 1)) +
geom_point() +
geom_line() +
geom_smooth() +
ggtitle("N633VA trips/week")
# Using "Planes" dataset from nycflights13 packages
planes2 <- planes %>%
rename(plane_year = year, TAIL_NUM = tailnum)
# Oldest plane that flew from SF airports in 2018: made in 1989
sfoflights18 %>%
inner_join(planes2, by = c("TAIL_NUM" = "TAIL_NUM")) %>%
filter(!is.na(plane_year), CANCELLED == 0) %>%
group_by(plane_year, TAIL_NUM) %>%
summarise(n = n()) %>%
arrange(plane_year)
# Number of airplanes that flew from SF are included in the planes table (old dataset):
sfoflights18 %>%
inner_join(planes2, by = c("TAIL_NUM" = "TAIL_NUM")) %>%
filter(!is.na(plane_year), CANCELLED == 0) %>%
summarise(n = n())
planes2 %>% filter(is.na(plane_year)) %>%
summarise(n = n())
# SFOFlights18 have missing date of manufacture: 1352
sfoflights18 %>%
inner_join(planes2, by = c("TAIL_NUM" = "TAIL_NUM")) %>%
filter(is.na(plane_year)) %>%
summarise(n = n())
# Top 5 most common manufacturers:
planes2 %>%
group_by(manufacturer) %>%
summarise(n = n()) %>%
arrange(desc(n))
planes3 <- planes2 %>%
mutate( manufacturer2 = ifelse(manufacturer %in% c("BOEING", "AIRBUS INDUSTRIE", "BOMBARDIER INC", "AIRBUS", "EMBRAER"), manufacturer, "Others"))
# Distribution of manufactureres over time
sfoflights18 %>%
inner_join(planes3, by = c("TAIL_NUM" = "TAIL_NUM")) %>%
select(manufacturer2, FL_DATE, CANCELLED) %>%
filter(CANCELLED == 0) %>%
group_by(FL_DATE, manufacturer2) %>%
summarise(n = n()) %>%
ggplot(aes(x = FL_DATE, y = n, color = manufacturer2)) +
geom_line() +
ggtitle("Distribution of Top 5 Most Common Manufacturers", subtitle = "Airplanes flying from SF in 2018")
# Import new (SF Bay Area) weather data
sfweather <- read.csv("stat650 dataset/sfoflights_weather.csv", stringsAsFactors = FALSE)
### Data wrangling (weather)
# REPORTTPYE = FM-15 for hourly data
sfweather_hour0 <- sfweather %>%
select(-c(1, 3:5, 8, 10, 12, 14, 16, 22:24, 26:90)) %>%
filter(!is.na(STATION_NAME), REPORTTPYE == "FM-15") %>%
mutate(station = substr(STATION_NAME, 1, 5),
ORIGIN = ifelse(station == "SAN J", "SJC",
ifelse(station == "SAN F", "SFO",
ifelse(station == "OAKLA", "OAK", ""))),
TEMP = as.numeric(substr(HOURLYDRYBULBTEMPF, 1, 2)) ,
VISIB = as.numeric(substr(HOURLYVISIBILITY, 1, 4)) ,
DEWP = as.numeric(substr(HOURLYDewPointTempF, 1, 2)) ,
WIND_SPEED = as.numeric(substr(HOURLYWindSpeed, 1, 2)) ,
WIND_DIR = as.numeric(ifelse(HOURLYWindDirection == "VRB", NA, HOURLYWindDirection)) ,
PRECIP = as.numeric(ifelse(HOURLYPrecip == "T", NA,
substr(HOURLYPrecip, 1, 4))) ,
HUMID = as.numeric(HOURLYRelativeHumidity),
WIND_GUST = as.numeric(HOURLYWindGustSpeed),
Date2 = mdy_hm(DATE),
YEAR = year(Date2),
MONTH = month(Date2),
DAY_OF_MONTH = day(Date2),
FL_DATE = date(Date2),
CRS_DEP_HR = hour(Date2),
CRS_DEP_MIN = minute(Date2))
## Warning in evalq(as.numeric(substr(HOURLYVISIBILITY, 1, 4)),
## <environment>): NAs introduced by coercion
# Total obs: 9452
# Keep only unique data (based on DATE) - total obs: 9372
sfweather_hour1 <- distinct(sfweather_hour0, DATE, .keep_all = TRUE)
# Simplify the weather dataset, keep variables that are useful for merging:
# Break down minutes into 5 intervals:
sfweather_hour <- sfweather_hour1 %>%
select(ORIGIN, YEAR, MONTH, DAY_OF_MONTH, FL_DATE, CRS_DEP_HR, CRS_DEP_MIN,
TEMP, DEWP, HUMID, WIND_DIR, WIND_SPEED, WIND_GUST,
PRECIP, HOURLYStationPressure, VISIB) %>%
mutate(CRS_DEP_TIME_H = ifelse(CRS_DEP_HR < 10,
paste("0", CRS_DEP_HR, sep=""), CRS_DEP_HR),
CRS_DEP_TIME_M = ifelse(CRS_DEP_MIN <= 11, "first",
ifelse(CRS_DEP_MIN <= 23, "second",
ifelse(CRS_DEP_MIN <= 35, "third",
ifelse(CRS_DEP_MIN <= 47, "fourth", "fifth")))) )
### Data wrangling (sfoflights)
# Break down minutes into 5 intervals:
sfoflights18_merge <- sfoflights18 %>%
select(-c(7, 9:11, 13, 15:16 ) ) %>%
mutate(CRS_DEP_TIME_Hr = substr(CRS_DEP_TIME, 1, 2),
CRS_DEP_TIME_MIN = as.numeric(substr(CRS_DEP_TIME, 3, 4))) %>%
mutate(CRS_DEP_TIME_H = ifelse(CRS_DEP_TIME_Hr == "24", "23", CRS_DEP_TIME_Hr),
CRS_DEP_TIME_M = ifelse(CRS_DEP_TIME_MIN <= 11, "first",
ifelse(CRS_DEP_TIME_MIN <= 23, "second",
ifelse(CRS_DEP_TIME_MIN <= 35, "third",
ifelse(CRS_DEP_TIME_MIN <= 47, "fourth", "fifth")))) )
### Merge sfweather with sfoflights
sfo_data <- left_join(sfoflights18_merge, sfweather_hour,
by = c("YEAR", "MONTH", "DAY_OF_MONTH", "FL_DATE",
"ORIGIN", "CRS_DEP_TIME_H", "CRS_DEP_TIME_M") )
I have tried my best to reduced the dublicated data when mergging the SFweather data to SFOFlights data. I breakdown the Scheduled depearture time (CRS_DEP_TIME_MIN) from SFOFlights dataset and the Time variable (CRS_DEP_TIME_MIN, modified from DATE/TIME variables) from the SFWeather dataset into 5 intervals, 12 minutes apart. There is now only 15 extra data points were created after mergging, all from SJC location. If I breakdown the time varibales further more, the dataset will then create even more extra/duplicated points. Given the fact that there were 113058 observations from the original SFOFlights dataset, 15 extra is insignificant and less impactful to the analysis.
sfweather_hr <- sfweather_hour %>%
select(ORIGIN, YEAR, MONTH, DAY_OF_MONTH, FL_DATE,
CRS_DEP_HR, TEMP, DEWP, HUMID, WIND_DIR, WIND_SPEED,
WIND_GUST, PRECIP, HOURLYStationPressure, VISIB)
################# Hourly Distribution
# Using SFweather_hr dataset:
# DEP_TIME = "hours" variable extract from the original sfweather dataset.
# ORIGIN = three differnt location (SFO, OAK, and SJC).
sfweather_hr %>%
filter(TEMP != "", MONTH == 5 ) %>%
ggplot(aes(x = DAY_OF_MONTH, y = TEMP, color = ORIGIN)) +
geom_point() +
geom_smooth() +
ggtitle("Distribution of Temperature", subtitle = "May 2018") +
labs(y = "Temp (F)", x = "Day", caption = "Source: NOAA | Climate Data Online")
# Wind Speed over 25 seems to be outliers.
summary(sfweather_hr$WIND_SPEED)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 5.000 8.000 8.613 13.000 36.000 15
boxplot(sfweather_hr$WIND_SPEED)
# Breakdown by month:
sfweather_hr %>%
filter(!is.na(WIND_SPEED)) %>%
#group_by(ORIGIN) %>%
ggplot(aes(x = WIND_SPEED, y = WIND_SPEED, group = MONTH)) +
geom_boxplot(aes(outlier.alpha = 0.1) ) + facet_grid(rows = vars(ORIGIN))
sfweather_hr2 <- sfweather_hr %>%
select(FL_DATE, DEWP, HUMID) %>%
filter(!is.na(DEWP), !is.na(HUMID)) %>%
gather(key = grp, val = statistics, DEWP, HUMID)
ggplot(sfweather_hr2, aes(x = FL_DATE, y = statistics, color = grp)) +
geom_line() +
geom_smooth()
# Scatter Plot
sfweather_hr %>%
select(FL_DATE, DEWP, HUMID) %>%
filter(!is.na(DEWP), !is.na(HUMID)) %>%
ggplot(aes(x = DEWP, y = HUMID)) +
geom_point() +
geom_smooth()
# Correlation between DEWP and HUMID, corr = 0.504
sfweather_hr %>%
select(DEWP, HUMID) %>%
filter(!is.na(DEWP), !is.na(HUMID)) %>%
ggpairs()
sfweather_hr %>%
select(FL_DATE, PRECIP, VISIB) %>%
filter(!is.na(PRECIP), !is.na(VISIB)) %>%
ggplot(aes(x = PRECIP, y = VISIB)) +
geom_point() +
geom_smooth()
# Correlation between PRECIP and VISIB, corr = -0.466
sfweather_hr %>%
select(PRECIP, VISIB) %>%
filter(!is.na(PRECIP), !is.na(VISIB)) %>%
ggpairs()
sfweather_hr %>%
select(FL_DATE, PRECIP) %>%
filter(PRECIP > 0) %>%
summarise(count = n_distinct(FL_DATE))
# Day of the week (Monday - Sunday)
sfo_data %>%
select(DAY_OF_WEEK, VISIB ) %>%
filter(!is.na(VISIB)) %>%
group_by(DAY_OF_WEEK) %>%
summarise(Avg_day = mean(VISIB))
# Month of the Year: (Jan - May)
sfo_data %>%
select(DAY_OF_MONTH, VISIB) %>%
filter(!is.na(VISIB)) %>%
group_by(DAY_OF_MONTH) %>%
summarise(Avg_month = mean(VISIB))