# Loading the data packages
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(openintro)
## Warning: package 'openintro' was built under R version 4.1.2
## Loading required package: airports
## Warning: package 'airports' was built under R version 4.1.2
## Loading required package: cherryblossom
## Warning: package 'cherryblossom' was built under R version 4.1.2
## Loading required package: usdata
## Warning: package 'usdata' was built under R version 4.1.2
# Make the nycflights data frame available in the session, then print it.
data("nycflights")
print(nycflights)
## # A tibble: 32,735 x 16
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## <int> <int> <int> <int> <dbl> <int> <dbl> <chr> <chr>
## 1 2013 6 30 940 15 1216 -4 VX N626VA
## 2 2013 5 7 1657 -3 2104 10 DL N3760C
## 3 2013 12 8 859 -1 1238 11 DL N712TW
## 4 2013 5 14 1841 -4 2122 -34 DL N914DL
## 5 2013 7 21 1102 -3 1230 -8 9E N823AY
## 6 2013 1 1 1817 -3 2008 3 AA N3AXAA
## 7 2013 12 9 1259 14 1617 22 WN N218WN
## 8 2013 8 13 1920 85 2032 71 B6 N284JB
## 9 2013 9 26 725 -10 1027 -8 AA N3FSAA
## 10 2013 4 30 1323 62 1549 60 EV N12163
## # ... with 32,725 more rows, and 7 more variables: flight <int>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>
# List the names of all columns in the data frame.
names(nycflights)
## [1] "year" "month" "day" "dep_time" "dep_delay" "arr_time"
## [7] "arr_delay" "carrier" "tailnum" "flight" "origin" "dest"
## [13] "air_time" "distance" "hour" "minute"
# Open the help page for the nycflights dataset.
help("nycflights")
## starting httpd help server ... done
# Compact column-by-column preview of the data (name, type, first values).
nycflights %>%
  glimpse()
## Rows: 32,735
## Columns: 16
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, ~
## $ month <int> 6, 5, 12, 5, 7, 1, 12, 8, 9, 4, 6, 11, 4, 3, 10, 1, 2, 8, 10~
## $ day <int> 30, 7, 8, 14, 21, 1, 9, 13, 26, 30, 17, 22, 26, 25, 21, 23, ~
## $ dep_time <int> 940, 1657, 859, 1841, 1102, 1817, 1259, 1920, 725, 1323, 940~
## $ dep_delay <dbl> 15, -3, -1, -4, -3, -3, 14, 85, -10, 62, 5, 5, -2, 115, -4, ~
## $ arr_time <int> 1216, 2104, 1238, 2122, 1230, 2008, 1617, 2032, 1027, 1549, ~
## $ arr_delay <dbl> -4, 10, 11, -34, -8, 3, 22, 71, -8, 60, -4, -2, 22, 91, -6, ~
## $ carrier <chr> "VX", "DL", "DL", "DL", "9E", "AA", "WN", "B6", "AA", "EV", ~
## $ tailnum <chr> "N626VA", "N3760C", "N712TW", "N914DL", "N823AY", "N3AXAA", ~
## $ flight <int> 407, 329, 422, 2391, 3652, 353, 1428, 1407, 2279, 4162, 20, ~
## $ origin <chr> "JFK", "JFK", "JFK", "JFK", "LGA", "LGA", "EWR", "JFK", "LGA~
## $ dest <chr> "LAX", "SJU", "LAX", "TPA", "ORF", "ORD", "HOU", "IAD", "MIA~
## $ air_time <dbl> 313, 216, 376, 135, 50, 138, 240, 48, 148, 110, 50, 161, 87,~
## $ distance <dbl> 2475, 1598, 2475, 1005, 296, 733, 1411, 228, 1096, 820, 264,~
## $ hour <dbl> 9, 16, 8, 18, 11, 18, 12, 19, 7, 13, 9, 13, 8, 20, 12, 20, 6~
## $ minute <dbl> 40, 57, 59, 41, 2, 17, 59, 20, 25, 23, 40, 20, 9, 54, 17, 24~
# Distinct destination codes found in the dest column.
unique(nycflights[["dest"]])
## [1] "LAX" "SJU" "TPA" "ORF" "ORD" "HOU" "IAD" "MIA" "JAX" "ROC" "RSW" "DAY"
## [13] "ATL" "BTV" "BUF" "DCA" "FLL" "SFO" "PIT" "PBI" "DEN" "CLT" "CMH" "LAS"
## [25] "DTW" "BNA" "PHL" "MKE" "DFW" "SNA" "CLE" "MCO" "BQN" "ABQ" "BOS" "IAH"
## [37] "OMA" "SYR" "EGE" "PWM" "AUS" "STT" "MSY" "CVG" "RDU" "MDW" "IND" "TYS"
## [49] "STL" "TUL" "JAC" "SEA" "MSP" "BWI" "SAT" "CRW" "BUR" "SLC" "CHS" "RIC"
## [61] "SAN" "XNA" "MEM" "SRQ" "PHX" "MCI" "CAK" "SAV" "SDF" "TVC" "OAK" "GSP"
## [73] "ALB" "BDL" "DSM" "LGB" "PDX" "MSN" "SMF" "GRR" "GSO" "BGR" "ACK" "SJC"
## [85] "AVL" "OKC" "PVD" "MHT" "HNL" "MTJ" "BHM" "PSE" "ILM" "MVY" "HDN" "BZN"
## [97] "CHO" "CAE" "EYW" "ANC" "MYR" "PSP"
# Keep only the rows whose destination code is "LAX" (Los Angeles).
nycflights[nycflights[["dest"]] == "LAX", ]
## # A tibble: 1,583 x 16
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## <int> <int> <int> <int> <dbl> <int> <dbl> <chr> <chr>
## 1 2013 6 30 940 15 1216 -4 VX N626VA
## 2 2013 12 8 859 -1 1238 11 DL N712TW
## 3 2013 7 5 920 5 1204 -6 AA N328AA
## 4 2013 8 22 1108 -7 1352 -12 UA N597UA
## 5 2013 3 27 1158 -2 1455 -16 DL N721TW
## 6 2013 6 9 1914 -2 2234 9 UA N26208
## 7 2013 11 26 1545 0 1900 -20 AA N324AA
## 8 2013 3 24 2005 5 2248 -37 UA N39726
## 9 2013 9 17 1437 -8 1736 -22 UA N505UA
## 10 2013 3 5 1153 -7 1526 15 DL N717TW
## # ... with 1,573 more rows, and 7 more variables: flight <int>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>
# Add a month_name column by looking up each numeric month in the built-in
# month.name vector (1 -> "January", ..., 12 -> "December").
nycflights <- mutate(nycflights, month_name = month.name[month])
print(nycflights)
## # A tibble: 32,735 x 17
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## <int> <int> <int> <int> <dbl> <int> <dbl> <chr> <chr>
## 1 2013 6 30 940 15 1216 -4 VX N626VA
## 2 2013 5 7 1657 -3 2104 10 DL N3760C
## 3 2013 12 8 859 -1 1238 11 DL N712TW
## 4 2013 5 14 1841 -4 2122 -34 DL N914DL
## 5 2013 7 21 1102 -3 1230 -8 9E N823AY
## 6 2013 1 1 1817 -3 2008 3 AA N3AXAA
## 7 2013 12 9 1259 14 1617 22 WN N218WN
## 8 2013 8 13 1920 85 2032 71 B6 N284JB
## 9 2013 9 26 725 -10 1027 -8 AA N3FSAA
## 10 2013 4 30 1323 62 1549 60 EV N12163
## # ... with 32,725 more rows, and 8 more variables: flight <int>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # month_name <chr>
library(dplyr)
# Show flight number, departure delay and month name, sorted by month name in
# descending order. month_name is a character column, so the sort is
# alphabetical (September first), not calendar order.
nycflights %>%
  arrange(desc(month_name)) %>%
  select(flight, dep_delay, month_name)
## # A tibble: 32,735 x 3
## flight dep_delay month_name
## <int> <dbl> <chr>
## 1 2279 -10 September
## 2 317 -7 September
## 3 1416 8 September
## 4 3370 -5 September
## 5 3604 -2 September
## 6 1053 4 September
## 7 775 -10 September
## 8 1410 32 September
## 9 1580 -5 September
## 10 4333 -2 September
## # ... with 32,725 more rows
# Departure delay by month with a linear trend line.
# The numeric `month` column is mapped to x (instead of the character
# `month_name`) so that the months appear in calendar order and geom_smooth()
# can fit a single regression across the whole year; with a discrete x each
# month forms its own group and no overall trend can be fitted. The axis is
# still labelled with month names via the built-in month.abb vector.
ggplot(data = nycflights, aes(x = month, y = dep_delay)) +
  geom_point() +
  geom_smooth(method = "lm") + # add linear trend line
  scale_x_continuous(breaks = 1:12, labels = month.abb)
## `geom_smooth()` using formula 'y ~ x'
# Flights that departed from JFK, reduced to the columns used in the delay
# analysis below.
JFK_airport <- nycflights %>%
  filter(origin == "JFK") %>%
  select(flight, origin, dep_time, dep_delay, air_time)
print(JFK_airport)
## # A tibble: 10,897 x 5
## flight origin dep_time dep_delay air_time
## <int> <chr> <int> <dbl> <dbl>
## 1 407 JFK 940 15 313
## 2 329 JFK 1657 -3 216
## 3 422 JFK 859 -1 376
## 4 2391 JFK 1841 -4 135
## 5 1407 JFK 1920 85 48
## 6 20 JFK 940 5 50
## 7 34 JFK 1217 -4 46
## 8 1271 JFK 757 -3 131
## 9 27 JFK 1638 8 334
## 10 97 JFK 2310 105 223
## # ... with 10,887 more rows
# Percentage of JFK departures that left late (dep_delay > 0).
# Rows with a missing delay are not counted as late, matching subset().
JFK_percentage_delay <- sum(JFK_airport$dep_delay > 0, na.rm = TRUE) /
  nrow(JFK_airport) * 100
print(JFK_percentage_delay)
## [1] 38.15729
# Flights that departed from LGA, reduced to the columns used in the delay
# analysis below.
LGA_airport <- nycflights %>%
  filter(origin == "LGA") %>%
  select(flight, origin, dep_time, dep_delay, air_time)
print(LGA_airport)
## # A tibble: 10,067 x 5
## flight origin dep_time dep_delay air_time
## <int> <chr> <int> <dbl> <dbl>
## 1 3652 LGA 1102 -3 50
## 2 353 LGA 1817 -3 138
## 3 2279 LGA 725 -10 148
## 4 1639 LGA 1320 5 161
## 5 645 LGA 2054 115 104
## 6 5273 LGA 1126 11 58
## 7 369 LGA 1626 -3 150
## 8 1433 LGA 626 -4 105
## 9 3388 LGA 1251 -4 83
## 10 3478 LGA 821 -8 77
## # ... with 10,057 more rows
# Percentage of LGA departures that left late (dep_delay > 0).
# Rows with a missing delay are not counted as late, matching subset().
LGA_percentage_delay <- sum(LGA_airport$dep_delay > 0, na.rm = TRUE) /
  nrow(LGA_airport) * 100
print(LGA_percentage_delay)
## [1] 33.07837
# Flights that departed from EWR, reduced to the columns used in the delay
# analysis below.
EWR_airport <- nycflights %>%
  filter(origin == "EWR") %>%
  select(flight, origin, dep_time, dep_delay, air_time)
print(EWR_airport)
## # A tibble: 11,771 x 5
## flight origin dep_time dep_delay air_time
## <int> <chr> <int> <dbl> <dbl>
## 1 1428 EWR 1259 14 240
## 2 4162 EWR 1323 62 110
## 3 5790 EWR 809 -2 87
## 4 4412 EWR 2024 37 53
## 5 4241 EWR 644 -1 45
## 6 1030 EWR 859 -1 121
## 7 1724 EWR 729 9 154
## 8 3852 EWR 2253 123 53
## 9 3709 EWR 752 -3 103
## 10 4224 EWR 1944 15 117
## # ... with 11,761 more rows
# Percentage of EWR departures that left late (dep_delay > 0).
# Rows with a missing delay are not counted as late, matching subset().
EWR_percentage_delay <- sum(EWR_airport$dep_delay > 0, na.rm = TRUE) /
  nrow(EWR_airport) * 100
print(EWR_percentage_delay)
## [1] 45.11936
# Collect the three delay percentages in the order JFK, LGA, EWR.
delay_percentage <- c(
  JFK_percentage_delay,
  LGA_percentage_delay,
  EWR_percentage_delay
)
print(delay_percentage)
## [1] 38.15729 33.07837 45.11936
# Smallest of the three delay percentages.
delay_percentage %>%
  min()
## [1] 33.07837
# LGA has the lowest percentage of delayed departures, which makes it the best
# airport for on-time departures.
# Histogram of the departure delays of all flights, using the default binning.
nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Same histogram with an explicit bin width of 15 (in units of dep_delay).
nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram(binwidth = 15)
# Exercise 1: Look carefully at these three histograms. How do they compare?
# Are features revealed in one that are obscured in another?
# Answer: A smaller binwidth presents more detail of the data, while an
# unnecessarily large binwidth hides the changes in the data. The first
# histogram is the most balanced presentation.
# Same histogram with a much wider bin width of 150.
nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram(binwidth = 150)
# Restrict to flights headed to Los Angeles (dest == "LAX") and plot the
# distribution of their departure delays.
LAX_flights <- filter(nycflights, dest == "LAX")
LAX_flights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Numerical summaries of departure delay for the LAX flights: centre (mean,
# median), number of flights, spread (standard deviation, interquartile range)
# and the extremes (minimum, maximum).
LAX_flights %>%
  summarise(
    mean_LAX = mean(dep_delay),
    median_LAX = median(dep_delay),
    count = n(),
    stdev = sd(dep_delay),
    Inter_quart_range = IQR(dep_delay),
    min_value = min(dep_delay),
    max_value = max(dep_delay)
  )
## # A tibble: 1 x 7
## mean_LAX median_LAX count stdev Inter_quart_range min_value max_value
## <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 9.78 -1 1583 33.5 11 -13 345
# Flights headed to San Francisco (dest == "SFO") in February (month == 2).
SFO_airport <- nycflights %>%
  filter(dest == "SFO" & month == 2)
print(SFO_airport)
## # A tibble: 68 x 17
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## <int> <int> <int> <int> <dbl> <int> <dbl> <chr> <chr>
## 1 2013 2 18 1527 57 1903 48 DL N711ZX
## 2 2013 2 3 613 14 1008 38 UA N502UA
## 3 2013 2 15 955 -5 1313 -28 DL N717TW
## 4 2013 2 18 1928 15 2239 -6 UA N24212
## 5 2013 2 24 1340 2 1644 -21 UA N76269
## 6 2013 2 25 1415 -10 1737 -13 UA N532UA
## 7 2013 2 7 1032 1 1352 -10 B6 N627JB
## 8 2013 2 15 1805 20 2122 2 AA N335AA
## 9 2013 2 13 1056 -4 1412 -13 UA N532UA
## 10 2013 2 8 656 -4 1039 -6 DL N710TW
## # ... with 58 more rows, and 8 more variables: flight <int>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # month_name <chr>
# Number of flights (rows) in the February SFO subset.
SFO_airport %>%
  nrow()
## [1] 68
# Distribution of arrival delays for the February SFO flights.
SFO_airport %>%
  ggplot(aes(x = arr_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Numerical summaries of arrival delay for the February SFO flights: centre
# (mean, median), number of flights, spread (standard deviation, interquartile
# range) and the extremes (minimum, maximum).
SFO_airport %>%
  summarise(
    mean_SFO = mean(arr_delay),
    median_SFO = median(arr_delay),
    count = n(),
    stdev = sd(arr_delay),
    Inter_quart_range = IQR(arr_delay),
    min_value = min(arr_delay),
    max_value = max(arr_delay)
  )
## # A tibble: 1 x 7
## mean_SFO median_SFO count stdev Inter_quart_range min_value max_value
## <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 -4.5 -11 68 36.3 23.2 -66 196
# Median and IQR of arrival delay per carrier for the February SFO flights,
# listed from the carrier with the most flights to the one with the fewest.
# UA and DL have the most variable arrival delays: their IQR is the widest.
group_by(SFO_airport, carrier) %>%
  summarise(
    median_sfo = median(arr_delay),
    Inter_quart_range = IQR(arr_delay),
    count = n()
  ) %>%
  arrange(desc(count))
## # A tibble: 5 x 4
## carrier median_sfo Inter_quart_range count
## <chr> <dbl> <dbl> <int>
## 1 UA -10 22 21
## 2 DL -15 22 19
## 3 VX -22.5 21.2 12
## 4 AA 5 17.5 10
## 5 B6 -10.5 12.2 6
# Mean and median departure delay per month, highest mean first.
# Grouping the flights by month shows that July has the highest average
# departure delay from an NYC airport.
group_by(nycflights, month) %>%
  summarise(
    mean_nyc = mean(dep_delay),
    median_nyc = median(dep_delay)
  ) %>%
  arrange(desc(mean_nyc))
## # A tibble: 12 x 3
## month mean_nyc median_nyc
## <int> <dbl> <dbl>
## 1 7 20.8 0
## 2 6 20.4 0
## 3 12 17.4 1
## 4 4 14.6 -2
## 5 3 13.5 -1
## 6 5 13.3 -1
## 7 8 12.6 -1
## 8 2 10.7 -2
## 9 1 10.2 -2
## 10 9 6.87 -3
## 11 11 6.10 -2
## 12 10 5.88 -3
# Pros and cons: mean values are sometimes affected by outliers, which can lead
# to a wrong judgement, so it's better to consider the median value.
# Label every flight as "on time" or "delayed" in a new dep_type column:
# a departure delay below 5 counts as on time, anything else as delayed.
nycflights <- mutate(
  nycflights,
  dep_type = ifelse(dep_delay >= 5, "delayed", "on time")
)
print(nycflights)
## # A tibble: 32,735 x 18
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## <int> <int> <int> <int> <dbl> <int> <dbl> <chr> <chr>
## 1 2013 6 30 940 15 1216 -4 VX N626VA
## 2 2013 5 7 1657 -3 2104 10 DL N3760C
## 3 2013 12 8 859 -1 1238 11 DL N712TW
## 4 2013 5 14 1841 -4 2122 -34 DL N914DL
## 5 2013 7 21 1102 -3 1230 -8 9E N823AY
## 6 2013 1 1 1817 -3 2008 3 AA N3AXAA
## 7 2013 12 9 1259 14 1617 22 WN N218WN
## 8 2013 8 13 1920 85 2032 71 B6 N284JB
## 9 2013 9 26 725 -10 1027 -8 AA N3FSAA
## 10 2013 4 30 1323 62 1549 60 EV N12163
## # ... with 32,725 more rows, and 9 more variables: flight <int>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # month_name <chr>, dep_type <chr>
# Share of on-time departures per origin airport, best airport first.
group_by(nycflights, origin) %>%
  summarise(
    dep_rate = sum(dep_type == "on time") / length(dep_type)
  ) %>%
  arrange(desc(dep_rate))
## # A tibble: 3 x 2
## origin dep_rate
## <chr> <dbl>
## 1 LGA 0.728
## 2 JFK 0.694
## 3 EWR 0.637
# Bar chart of flight counts per origin airport, split by dep_type. Together
# with the rates above it shows that LGA has the best on-time departure rate.
nycflights %>%
  ggplot(aes(x = origin, fill = dep_type)) +
  geom_bar()
# More Practice, Exercise 7: add a new variable avg_speed holding the average
# speed travelled by the plane for each flight (in mph).
# air_time is divided by 60 before dividing the distance by it
# (presumably converting minutes to hours; confirm with ?nycflights).
nycflights <- mutate(nycflights, avg_speed = distance / (air_time / 60))
print(nycflights)
## # A tibble: 32,735 x 19
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## <int> <int> <int> <int> <dbl> <int> <dbl> <chr> <chr>
## 1 2013 6 30 940 15 1216 -4 VX N626VA
## 2 2013 5 7 1657 -3 2104 10 DL N3760C
## 3 2013 12 8 859 -1 1238 11 DL N712TW
## 4 2013 5 14 1841 -4 2122 -34 DL N914DL
## 5 2013 7 21 1102 -3 1230 -8 9E N823AY
## 6 2013 1 1 1817 -3 2008 3 AA N3AXAA
## 7 2013 12 9 1259 14 1617 22 WN N218WN
## 8 2013 8 13 1920 85 2032 71 B6 N284JB
## 9 2013 9 26 725 -10 1027 -8 AA N3FSAA
## 10 2013 4 30 1323 62 1549 60 EV N12163
## # ... with 32,725 more rows, and 10 more variables: flight <int>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # month_name <chr>, dep_type <chr>, avg_speed <dbl>
# Average speed against distance, coloured by carrier, with a linear trend
# line; colour is set in the top-level aes(), so geom_smooth() inherits it.
nycflights %>%
  ggplot(aes(x = distance, y = avg_speed, color = carrier)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# Exercise 9: replicate the plot of arrival delay against departure delay for
# the carriers AA, DL and UA, coloured by carrier.
filtered_carrier <- nycflights %>%
  filter(carrier %in% c("AA", "DL", "UA"))
# Now draw the plot
filtered_carrier %>%
  ggplot(aes(x = dep_delay, y = arr_delay, color = carrier)) +
  geom_point()
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(datasets::cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.