library(ggplot2)
library(nycflights13)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
#q1
# Build a date-time from calendar parts plus a clock time packed into a single
# number: `time %/% 100` is passed on as the hour and `time %% 100` as the
# minute (so 517 becomes 05:17).
#
# year, month, day: calendar components, forwarded unchanged to make_datetime().
# time: numeric clock value in the packed form described above.
# Returns whatever make_datetime() returns for those components.
make_datetime_100 <- function(year, month, day, time) {
  hour_part <- time %/% 100
  minute_part <- time %% 100
  make_datetime(year, month, day, hour_part, minute_part)
}
# Working copy of `flights`: rows without a departure time are dropped,
# character versions of month/day are added, and the packed numeric `dep_time`
# is replaced by a full date-time built with make_datetime_100().
mydata <- flights %>%
  filter(!is.na(dep_time)) %>%
  mutate(
    months = as.character(month),
    days = as.character(day),
    dep_time = make_datetime_100(year, month, day, dep_time)
  )

# December-only subset of the prepared data.
data2 <- mydata %>%
  filter(month == 12)
# Q1 figures: distribution of departure date-times across all of `mydata`.
# The bin width is given in seconds; 24 * 60 * 60 is one day per bin.

# Fig 1.1 (a): frequency polygon of all departures.
plot1 <- ggplot(mydata) +
  geom_freqpoly(aes(x = dep_time), binwidth = 24 * 60 * 60)
plot1 +
  labs(
    title = "Fig 1.1(a)",
    subtitle = " Freq Polygon: Represents freq of flights for different departure times"
  )

# Fig 1.1 (b): box plot of departure date-times.
plot <- ggplot(mydata, aes(y = dep_time)) +
  geom_boxplot()
plot +
  labs(
    title = "Fig 1.1 (b)",
    subtitle = " Box Plot: Represents flights dataset using departure times"
  )

# Fig 1.1 (c): one frequency polygon per carrier.
plot2 <- ggplot(mydata, aes(x = dep_time, colour = carrier)) +
  geom_freqpoly(binwidth = 24 * 60 * 60)
plot2 +
  labs(
    title = "Fig 1.1 (c)",
    subtitle = "Freq Polygon: Represents freq of flights for different dep times carrier wise"
  )

# Fig 1.1 (d): one frequency polygon per origin airport.
plot3 <- ggplot(mydata, aes(x = dep_time, colour = origin)) +
  geom_freqpoly(binwidth = 24 * 60 * 60)
plot3 +
  labs(
    title = "Fig 1.1 (d)",
    subtitle = "Frequency Polygon: Represents frequencies of flights for different dep times origin wise"
  )
#Q2
# Q2 figures: the same views restricted to `data2` (the December subset).
# Bin width is in seconds; 24 * 60 * 60 is one day per bin.

# Fig 1.2 (a): histogram of December departures.
plot4 <- ggplot(data2) +
  geom_histogram(aes(x = dep_time), binwidth = 24 * 60 * 60)
plot4 +
  labs(
    title = "Fig 1.2 (a)",
    subtitle = "Histogram : Represents freq of flights for Dec for different dep times"
  )

# Fig 1.2 (b): box plot of December departure date-times.
plot5 <- ggplot(data2, aes(y = dep_time)) +
  geom_boxplot()
plot5 +
  labs(
    title = "Fig 1.2 (b)",
    subtitle = "Box Plot: Represents Dec flights using dep times"
  )

# Fig 1.2 (c): frequency polygon of December departures.
plot6 <- ggplot(data2, aes(x = dep_time)) +
  geom_freqpoly(binwidth = 24 * 60 * 60)
plot6 +
  labs(
    title = "Fig 1.2 (c)",
    subtitle = "Freq Polygon: Represents frequencies of flights for different dep times"
  )
# Subsets used for the 31 Dec vs 1 Jan comparison.

# Both days together.
data3 <- mydata %>%
  filter((month == 12 & day == 31) | (month == 1 & day == 1))

# Same rows with a "day/month/2013" text label to tell the two days apart.
data4 <- data3 %>%
  mutate(date = paste(as.character(day), as.character(month), "2013", sep = "/"))

# Each day on its own.
data5 <- mydata %>%
  filter(month == 12, day == 31)
data6 <- mydata %>%
  filter(month == 1, day == 1)
# Fig 1.3 (a) and (b): departures on 31 Dec (`data5`) and on 1 Jan (`data6`),
# each drawn as a frequency polygon. Bin width is in seconds, so 3600 groups
# departures into one-hour bins.
#
# The subtitles name the geom that is actually drawn (geom_freqpoly); they
# previously said "Histogram", which did not match the figures.
plot7 <- ggplot(data = data5) +
  geom_freqpoly(mapping = aes(x = dep_time), binwidth = 3600)
plot7 +
  labs(
    title = "Fig 1.3 (a)",
    subtitle = "Freq Polygon : Represents frequencies of flights for 31 Dec for different dep times"
  )

plot8 <- ggplot(data = data6) +
  geom_freqpoly(mapping = aes(x = dep_time), binwidth = 3600)
plot8 +
  labs(
    title = "Fig 1.3 (b)",
    subtitle = "Freq Polygon : Represents frequencies of flights for 1 Jan for different dep times"
  )
# Fig 1.3 (c): histogram over `data4` with bars filled by the `date` label,
# using one-day bins (bin width is in seconds).
plot9 <- ggplot(data4) +
  geom_histogram(aes(x = dep_time, fill = date), binwidth = 24 * 60 * 60)
plot9 +
  labs(
    title = "Fig 1.3 (c)",
    subtitle = "Histogram : Represents frequencies of flights for 31 Dec and 1 Jan for different dep times"
  )
#plot8 <- ggplot(data = data4 , mapping = aes(x=date, y=dep_time, fill=date)) +
# geom_boxplot()
#plot8 + labs(title="Box Plot: Represents the flights on 31 Dec and 1 Jan using dep times")
#plot9 <- ggplot(data = data4, mapping = aes(x = dep_time, colour = date)) +
# geom_freqpoly(binwidth = 60)
#plot9 + labs(title="Freq Polygon: Represents freq of flights on 31 Dec and 1 Jan for different dep times ")
From the frequency polygon in Fig 1.1 (a), we can infer that about 700-1000 flights depart at most departure times on most days. However, some departure times were observed to have a lower count of around 500 flights, while a particular departure time on a day between Jan 2013 and April 2013 was noted to correspond to only 125 flights. From the box plot in Fig 1.1 (b), we can infer that about a quarter of the total flights in 2013 departed before April. As the median lies around July, we can infer that roughly half of the total flights in 2013 departed in the second half of the year. From Fig 1.1 (c), we can infer that at most departure times, the maximum number of departing flights belong to the carrier UA. From Fig 1.1 (d), we can infer that most flights at most departure times depart from EWR. Also, we can see that before September, a roughly equal number of flights departed from both JFK and LGA at most departure times. However, after September there was a rise in the number of flights departing from LGA.
We can observe from the histogram in Fig 1.2 (a) and the frequency polygon in Fig 1.2 (c) that in December, about 800-1000 flights depart at most departure times on most days. The maximum observed was about 1000 on 2 Dec 2013, and the minimum observed was around 500, slightly before mid-December. From the box plot in Fig 1.2 (b), we can observe that about a quarter of the total flights in December departed before 9th December. As the median lies close to 16th December, we can assume that an approximately equal number of flights depart in the two halves of the month.
By comparing Fig 1.3 (a) and Fig 1.3 (b), we can infer that the plots look somewhat similar in the first half of the day for both 31 Dec and 1 Jan. However, we can notice that between 3 PM and 8 PM, the number of flights on 31 Dec seems to lie mostly in the range of 30-60, while the number on 1 Jan seems to lie in the range of 40-70. From the histogram in Fig 1.3 (c), we can infer that relatively more flights departed on 1 Jan than on 31 December. This could be because several people return home after celebrating New Year’s Eve at a holiday destination or their native place, and because the day generally marks the end of vacations.
# Scatter plot of mean departure delay against mean distance, with one point
# per carrier/hour combination, coloured by carrier. Missing values are
# ignored when averaging.
mydata %>%
  group_by(carrier, hour) %>%
  summarise(
    delay = mean(dep_delay, na.rm = TRUE),
    dist = mean(distance, na.rm = TRUE)
  ) %>%
  ggplot(aes(x = delay, y = dist, colour = carrier)) +
  geom_point()
After grouping the data by carrier and hour, we observe that there is no well-defined correlation between average delay and average distance. The maximum average delay was observed for carrier YV, at an average distance of 250 miles.