Load Packages:

# Loading the data packages

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.1.2

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1

## Warning: package 'ggplot2' was built under R version 4.1.2

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(openintro)

## Warning: package 'openintro' was built under R version 4.1.2

## Loading required package: airports

## Warning: package 'airports' was built under R version 4.1.2

## Loading required package: cherryblossom

## Warning: package 'cherryblossom' was built under R version 4.1.2

## Loading required package: usdata

## Warning: package 'usdata' was built under R version 4.1.2

The Data:

# Load the nycflights data set into the workspace, then print it to get a
# first look at its rows and columns.
data("nycflights")
nycflights

## # A tibble: 32,735 x 16
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    <int> <int> <int>    <int>     <dbl>    <int>     <dbl> <chr>   <chr>  
##  1  2013     6    30      940        15     1216        -4 VX      N626VA 
##  2  2013     5     7     1657        -3     2104        10 DL      N3760C 
##  3  2013    12     8      859        -1     1238        11 DL      N712TW 
##  4  2013     5    14     1841        -4     2122       -34 DL      N914DL 
##  5  2013     7    21     1102        -3     1230        -8 9E      N823AY 
##  6  2013     1     1     1817        -3     2008         3 AA      N3AXAA 
##  7  2013    12     9     1259        14     1617        22 WN      N218WN 
##  8  2013     8    13     1920        85     2032        71 B6      N284JB 
##  9  2013     9    26      725       -10     1027        -8 AA      N3FSAA 
## 10  2013     4    30     1323        62     1549        60 EV      N12163 
## # ... with 32,725 more rows, and 7 more variables: flight <int>, origin <chr>,
## #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>

# List the column (variable) names of the data frame.

colnames(nycflights)

##  [1] "year"      "month"     "day"       "dep_time"  "dep_delay" "arr_time" 
##  [7] "arr_delay" "carrier"   "tailnum"   "flight"    "origin"    "dest"     
## [13] "air_time"  "distance"  "hour"      "minute"

# Open the data set's help page only in an interactive session. Calling `?`
# while the report is being knitted starts the help server as a side effect
# and contributes nothing to the rendered document.
if (interactive()) {
  help("nycflights")
}

# Compact overview of the data: one line per column with its type and the
# first few values.
glimpse(nycflights)

## Rows: 32,735
## Columns: 16
## $ year      <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, ~
## $ month     <int> 6, 5, 12, 5, 7, 1, 12, 8, 9, 4, 6, 11, 4, 3, 10, 1, 2, 8, 10~
## $ day       <int> 30, 7, 8, 14, 21, 1, 9, 13, 26, 30, 17, 22, 26, 25, 21, 23, ~
## $ dep_time  <int> 940, 1657, 859, 1841, 1102, 1817, 1259, 1920, 725, 1323, 940~
## $ dep_delay <dbl> 15, -3, -1, -4, -3, -3, 14, 85, -10, 62, 5, 5, -2, 115, -4, ~
## $ arr_time  <int> 1216, 2104, 1238, 2122, 1230, 2008, 1617, 2032, 1027, 1549, ~
## $ arr_delay <dbl> -4, 10, 11, -34, -8, 3, 22, 71, -8, 60, -4, -2, 22, 91, -6, ~
## $ carrier   <chr> "VX", "DL", "DL", "DL", "9E", "AA", "WN", "B6", "AA", "EV", ~
## $ tailnum   <chr> "N626VA", "N3760C", "N712TW", "N914DL", "N823AY", "N3AXAA", ~
## $ flight    <int> 407, 329, 422, 2391, 3652, 353, 1428, 1407, 2279, 4162, 20, ~
## $ origin    <chr> "JFK", "JFK", "JFK", "JFK", "LGA", "LGA", "EWR", "JFK", "LGA~
## $ dest      <chr> "LAX", "SJU", "LAX", "TPA", "ORF", "ORD", "HOU", "IAD", "MIA~
## $ air_time  <dbl> 313, 216, 376, 135, 50, 138, 240, 48, 148, 110, 50, 161, 87,~
## $ distance  <dbl> 2475, 1598, 2475, 1005, 296, 733, 1411, 228, 1096, 820, 264,~
## $ hour      <dbl> 9, 16, 8, 18, 11, 18, 12, 19, 7, 13, 9, 13, 8, 20, 12, 20, 6~
## $ minute    <dbl> 40, 57, 59, 41, 2, 17, 59, 20, 25, 23, 40, 20, 9, 54, 17, 24~

# Distinct destination airport codes, in order of first appearance.
nycflights %>%
  pull(dest) %>%
  unique()

##   [1] "LAX" "SJU" "TPA" "ORF" "ORD" "HOU" "IAD" "MIA" "JAX" "ROC" "RSW" "DAY"
##  [13] "ATL" "BTV" "BUF" "DCA" "FLL" "SFO" "PIT" "PBI" "DEN" "CLT" "CMH" "LAS"
##  [25] "DTW" "BNA" "PHL" "MKE" "DFW" "SNA" "CLE" "MCO" "BQN" "ABQ" "BOS" "IAH"
##  [37] "OMA" "SYR" "EGE" "PWM" "AUS" "STT" "MSY" "CVG" "RDU" "MDW" "IND" "TYS"
##  [49] "STL" "TUL" "JAC" "SEA" "MSP" "BWI" "SAT" "CRW" "BUR" "SLC" "CHS" "RIC"
##  [61] "SAN" "XNA" "MEM" "SRQ" "PHX" "MCI" "CAK" "SAV" "SDF" "TVC" "OAK" "GSP"
##  [73] "ALB" "BDL" "DSM" "LGB" "PDX" "MSN" "SMF" "GRR" "GSO" "BGR" "ACK" "SJC"
##  [85] "AVL" "OKC" "PVD" "MHT" "HNL" "MTJ" "BHM" "PSE" "ILM" "MVY" "HDN" "BZN"
##  [97] "CHO" "CAE" "EYW" "ANC" "MYR" "PSP"

How delayed were flights that were headed to Los Angeles?

# Keep only the rows whose destination code is "LAX" (Los Angeles).
nycflights[nycflights[["dest"]] == "LAX", ]

## # A tibble: 1,583 x 16
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    <int> <int> <int>    <int>     <dbl>    <int>     <dbl> <chr>   <chr>  
##  1  2013     6    30      940        15     1216        -4 VX      N626VA 
##  2  2013    12     8      859        -1     1238        11 DL      N712TW 
##  3  2013     7     5      920         5     1204        -6 AA      N328AA 
##  4  2013     8    22     1108        -7     1352       -12 UA      N597UA 
##  5  2013     3    27     1158        -2     1455       -16 DL      N721TW 
##  6  2013     6     9     1914        -2     2234         9 UA      N26208 
##  7  2013    11    26     1545         0     1900       -20 AA      N324AA 
##  8  2013     3    24     2005         5     2248       -37 UA      N39726 
##  9  2013     9    17     1437        -8     1736       -22 UA      N505UA 
## 10  2013     3     5     1153        -7     1526        15 DL      N717TW 
## # ... with 1,573 more rows, and 7 more variables: flight <int>, origin <chr>,
## #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>

# Add a month_name column by using the numeric month as an index into the
# built-in month.name vector, then print the updated data frame.
nycflights <- mutate(nycflights, month_name = month.name[month])
nycflights

## # A tibble: 32,735 x 17
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    <int> <int> <int>    <int>     <dbl>    <int>     <dbl> <chr>   <chr>  
##  1  2013     6    30      940        15     1216        -4 VX      N626VA 
##  2  2013     5     7     1657        -3     2104        10 DL      N3760C 
##  3  2013    12     8      859        -1     1238        11 DL      N712TW 
##  4  2013     5    14     1841        -4     2122       -34 DL      N914DL 
##  5  2013     7    21     1102        -3     1230        -8 9E      N823AY 
##  6  2013     1     1     1817        -3     2008         3 AA      N3AXAA 
##  7  2013    12     9     1259        14     1617        22 WN      N218WN 
##  8  2013     8    13     1920        85     2032        71 B6      N284JB 
##  9  2013     9    26      725       -10     1027        -8 AA      N3FSAA 
## 10  2013     4    30     1323        62     1549        60 EV      N12163 
## # ... with 32,725 more rows, and 8 more variables: flight <int>, origin <chr>,
## #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   month_name <chr>

library(dplyr)

How do departure delays vary by month?

# Flight number, departure delay and month name, sorted by month name in
# descending order. month_name is a character column, so this order is
# reverse-alphabetical (September first), not calendar order.
nycflights %>%
  arrange(desc(month_name)) %>%
  select(flight, dep_delay, month_name)

## # A tibble: 32,735 x 3
##    flight dep_delay month_name
##     <int>     <dbl> <chr>     
##  1   2279       -10 September 
##  2    317        -7 September 
##  3   1416         8 September 
##  4   3370        -5 September 
##  5   3604        -2 September 
##  6   1053         4 September 
##  7    775       -10 September 
##  8   1410        32 September 
##  9   1580        -5 September 
## 10   4333        -2 September 
## # ... with 32,725 more rows

# Departure delay by calendar month.
# The numeric `month` column is used on the x-axis so that the months appear
# in calendar order and the linear trend line can be fitted. With the
# character `month_name` column the axis is discrete and alphabetical, and
# geom_smooth() has no numeric x to regress on.
ggplot(data = nycflights, aes(x = month, y = dep_delay)) +
  geom_point() +
  geom_smooth(method = "lm") + # linear trend of delay across the year
  scale_x_continuous(breaks = 1:12, labels = month.abb) +
  labs(x = "Month", y = "Departure delay")

## `geom_smooth()` using formula 'y ~ x'

Which of the three major NYC airports has the best on time percentage for departing flights?

# Flights that departed from JFK, reduced to the columns used in this section.
JFK_airport <- nycflights %>%
  filter(origin == "JFK") %>%
  select(flight, origin, dep_time, dep_delay, air_time)
JFK_airport

## # A tibble: 10,897 x 5
##    flight origin dep_time dep_delay air_time
##     <int> <chr>     <int>     <dbl>    <dbl>
##  1    407 JFK         940        15      313
##  2    329 JFK        1657        -3      216
##  3    422 JFK         859        -1      376
##  4   2391 JFK        1841        -4      135
##  5   1407 JFK        1920        85       48
##  6     20 JFK         940         5       50
##  7     34 JFK        1217        -4       46
##  8   1271 JFK         757        -3      131
##  9     27 JFK        1638         8      334
## 10     97 JFK        2310       105      223
## # ... with 10,887 more rows

# Percentage of JFK departures that left late (dep_delay > 0): number of late
# flights divided by the number of all JFK flights, times 100.
JFK_percentage_delay <-
  sum(JFK_airport$dep_delay > 0, na.rm = TRUE) / nrow(JFK_airport) * 100

JFK_percentage_delay

## [1] 38.15729

# Flights that departed from LGA, reduced to the columns used in this section.
LGA_airport <- nycflights %>%
  filter(origin == "LGA") %>%
  select(flight, origin, dep_time, dep_delay, air_time)

LGA_airport

## # A tibble: 10,067 x 5
##    flight origin dep_time dep_delay air_time
##     <int> <chr>     <int>     <dbl>    <dbl>
##  1   3652 LGA        1102        -3       50
##  2    353 LGA        1817        -3      138
##  3   2279 LGA         725       -10      148
##  4   1639 LGA        1320         5      161
##  5    645 LGA        2054       115      104
##  6   5273 LGA        1126        11       58
##  7    369 LGA        1626        -3      150
##  8   1433 LGA         626        -4      105
##  9   3388 LGA        1251        -4       83
## 10   3478 LGA         821        -8       77
## # ... with 10,057 more rows

# Percentage of LGA departures that left late (dep_delay > 0): number of late
# flights divided by the number of all LGA flights, times 100.
LGA_percentage_delay <-
  sum(LGA_airport$dep_delay > 0, na.rm = TRUE) / nrow(LGA_airport) * 100

LGA_percentage_delay

## [1] 33.07837

# Flights that departed from EWR, reduced to the columns used in this section.
EWR_airport <- nycflights %>%
  filter(origin == "EWR") %>%
  select(flight, origin, dep_time, dep_delay, air_time)

EWR_airport

## # A tibble: 11,771 x 5
##    flight origin dep_time dep_delay air_time
##     <int> <chr>     <int>     <dbl>    <dbl>
##  1   1428 EWR        1259        14      240
##  2   4162 EWR        1323        62      110
##  3   5790 EWR         809        -2       87
##  4   4412 EWR        2024        37       53
##  5   4241 EWR         644        -1       45
##  6   1030 EWR         859        -1      121
##  7   1724 EWR         729         9      154
##  8   3852 EWR        2253       123       53
##  9   3709 EWR         752        -3      103
## 10   4224 EWR        1944        15      117
## # ... with 11,761 more rows

# Percentage of EWR departures that left late (dep_delay > 0): number of late
# flights divided by the number of all EWR flights, times 100.
EWR_percentage_delay <-
  sum(EWR_airport$dep_delay > 0, na.rm = TRUE) / nrow(EWR_airport) * 100

EWR_percentage_delay

## [1] 45.11936

# Collect the three percentages in a named vector so that every value stays
# labelled with the airport it belongs to.
delay_percentage <- c(
  JFK = JFK_percentage_delay,
  LGA = LGA_percentage_delay,
  EWR = EWR_percentage_delay
)
delay_percentage

##      JFK      LGA      EWR
## 38.15729 33.07837 45.11936

# Select the smallest percentage together with its airport name; a bare min()
# would return the number without saying which airport it belongs to.
delay_percentage[which.min(delay_percentage)]

##      LGA
## 33.07837

# LGA has the lowest percentage of delayed departures, so it has the best
# on-time percentage for departing flights.

Analysis:

Departure Delays:

# Distribution of the departure delays of all flights (default of 30 bins).
nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram with an explicit bin width of 15.
nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram(binwidth = 15)

# Exercise 1: Look carefully at these three histograms. How do they compare?
# Are features revealed in one that are obscured in another?

# A smaller bin width shows more detail of the data, while an unnecessarily
# large bin width (150 below) hides the changes in the data. The first
# histogram is the most balanced presentation.
nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram(binwidth = 150)

# Restrict the data to flights headed to Los Angeles (LAX) and plot the
# distribution of their departure delays.
LAX_flights <- filter(nycflights, dest == "LAX")

LAX_flights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Numerical summaries of the departure delay for the LAX flights: centre
# (mean, median), number of flights, spread (standard deviation, IQR) and the
# smallest and largest delay.

LAX_flights %>%
  summarise(mean_LAX   = mean(dep_delay),
            median_LAX  = median(dep_delay),
            count       = n(),
            stdev       = sd(dep_delay),
            Inter_quart_range = IQR(dep_delay),
            min_value = min(dep_delay),
            max_value = max(dep_delay))

## # A tibble: 1 x 7
##   mean_LAX median_LAX count stdev Inter_quart_range min_value max_value
##      <dbl>      <dbl> <int> <dbl>             <dbl>     <dbl>     <dbl>
## 1     9.78         -1  1583  33.5                11       -13       345

Exercise 2:

Create a new data frame that includes flights headed to SFO in February,

and save this data frame as sfo_feb_flights. How many flights meet these criteria?

# Flights headed to SFO in February (month == 2). The result is stored as
# SFO_airport, the name used by the following exercises.
SFO_airport <- filter(nycflights, dest == "SFO" & month == 2)
SFO_airport

## # A tibble: 68 x 17
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    <int> <int> <int>    <int>     <dbl>    <int>     <dbl> <chr>   <chr>  
##  1  2013     2    18     1527        57     1903        48 DL      N711ZX 
##  2  2013     2     3      613        14     1008        38 UA      N502UA 
##  3  2013     2    15      955        -5     1313       -28 DL      N717TW 
##  4  2013     2    18     1928        15     2239        -6 UA      N24212 
##  5  2013     2    24     1340         2     1644       -21 UA      N76269 
##  6  2013     2    25     1415       -10     1737       -13 UA      N532UA 
##  7  2013     2     7     1032         1     1352       -10 B6      N627JB 
##  8  2013     2    15     1805        20     2122         2 AA      N335AA 
##  9  2013     2    13     1056        -4     1412       -13 UA      N532UA 
## 10  2013     2     8      656        -4     1039        -6 DL      N710TW 
## # ... with 58 more rows, and 8 more variables: flight <int>, origin <chr>,
## #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   month_name <chr>

# Number of flights that meet the criteria (one row per flight).
SFO_airport %>%
  nrow()

## [1] 68

Exercise 3:

Describe the distribution of the arrival delays of these flights using a histogram

and appropriate summary statistics

# Histogram of the arrival delays of the February SFO flights.
SFO_airport %>%
  ggplot(aes(x = arr_delay)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Numerical summaries of the arrival delay for the February SFO flights:
# centre (mean, median), number of flights, spread (standard deviation, IQR)
# and the smallest and largest delay.

SFO_airport %>%
  summarise(mean_SFO   = mean(arr_delay),
            median_SFO  = median(arr_delay),
            count       = n(),
            stdev       = sd(arr_delay),
            Inter_quart_range = IQR(arr_delay),
            min_value = min(arr_delay),
            max_value = max(arr_delay))

## # A tibble: 1 x 7
##   mean_SFO median_SFO count stdev Inter_quart_range min_value max_value
##      <dbl>      <dbl> <int> <dbl>             <dbl>     <dbl>     <dbl>
## 1     -4.5        -11    68  36.3              23.2       -66       196

Exercise 4:

Calculate the median and interquartile range for arr_delays

of flights in the sfo_airport data frame, grouped by carrier.

Which carrier has the most variable arrival delays?

# The UA and DL carriers have the most variable arrival delays because their
# interquartile range is the widest.
# Median and IQR of arrival delay per carrier, with the number of flights,
# listed from the carrier with the most flights to the one with the fewest.
SFO_airport %>%
  group_by(carrier) %>%
  summarise(
    median_sfo = median(arr_delay),
    Inter_quart_range = IQR(arr_delay),
    count = n()
  ) %>%
  arrange(-count)

## # A tibble: 5 x 4
##   carrier median_sfo Inter_quart_range count
##   <chr>        <dbl>             <dbl> <int>
## 1 UA           -10                22      21
## 2 DL           -15                22      19
## 3 VX           -22.5              21.2    12
## 4 AA             5                17.5    10
## 5 B6           -10.5              12.2     6

Departure Delays by Month:

Which month would you expect to have the highest average delay departing from an NYC airport?

# Grouping the flights by month shows that July has the highest average
# departure delay from an NYC airport.

# Mean and median departure delay per month, highest mean first.
nycflights %>%
  group_by(month) %>%
  summarise(
    mean_nyc = mean(dep_delay),
    median_nyc = median(dep_delay)
  ) %>%
  arrange(-mean_nyc)

## # A tibble: 12 x 3
##    month mean_nyc median_nyc
##    <int>    <dbl>      <dbl>
##  1     7    20.8           0
##  2     6    20.4           0
##  3    12    17.4           1
##  4     4    14.6          -2
##  5     3    13.5          -1
##  6     5    13.3          -1
##  7     8    12.6          -1
##  8     2    10.7          -2
##  9     1    10.2          -2
## 10     9     6.87         -3
## 11    11     6.10         -2
## 12    10     5.88         -3

Exercise 5:

Suppose you really dislike departure delays and you want to schedule your travel in a month that minimizes your potential departure delay leaving NYC. One option is to choose the month with the lowest mean departure delay. Another option is to choose the month with the lowest median departure delay.

What are the pros and cons of these two choices?

# Pros and cons: the mean is affected by outliers, which can lead to a wrong judgement,
# so it is better to consider the median value.

On Time Departure rate for NYC Airports:

Suppose you will be flying out of NYC and want to know which of the three major NYC airports has the best on-time

departure rate for departing flights.

# Label every flight "on time" when its departure delay is below 5 and
# "delayed" otherwise; the label is stored in the new dep_type column.
nycflights <- mutate(
  nycflights,
  dep_type = ifelse(dep_delay < 5, "on time", "delayed")
)

nycflights

## # A tibble: 32,735 x 18
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    <int> <int> <int>    <int>     <dbl>    <int>     <dbl> <chr>   <chr>  
##  1  2013     6    30      940        15     1216        -4 VX      N626VA 
##  2  2013     5     7     1657        -3     2104        10 DL      N3760C 
##  3  2013    12     8      859        -1     1238        11 DL      N712TW 
##  4  2013     5    14     1841        -4     2122       -34 DL      N914DL 
##  5  2013     7    21     1102        -3     1230        -8 9E      N823AY 
##  6  2013     1     1     1817        -3     2008         3 AA      N3AXAA 
##  7  2013    12     9     1259        14     1617        22 WN      N218WN 
##  8  2013     8    13     1920        85     2032        71 B6      N284JB 
##  9  2013     9    26      725       -10     1027        -8 AA      N3FSAA 
## 10  2013     4    30     1323        62     1549        60 EV      N12163 
## # ... with 32,725 more rows, and 9 more variables: flight <int>, origin <chr>,
## #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   month_name <chr>, dep_type <chr>

# On-time departure rate per origin airport (share of flights labelled
# "on time"), listed from the best rate to the worst.

nycflights %>%
  group_by(origin) %>%
  summarise(dep_rate = sum(dep_type == "on time") / length(dep_type)) %>%
  arrange(-dep_rate)

## # A tibble: 3 x 2
##   origin dep_rate
##   <chr>     <dbl>
## 1 LGA       0.728
## 2 JFK       0.694
## 3 EWR       0.637

Exercise 6:

If you were selecting an airport simply based on on time departure percentage,

which NYC airport would you choose to fly out of?

# Visualise the share of on-time and delayed departures per origin airport.
# position = "fill" scales every bar to the same height so the on-time
# proportions can be compared directly; with stacked counts the comparison is
# distorted by the airports' different numbers of flights. The chart confirms
# the table above: LGA has the best on-time departure percentage.

ggplot(data = nycflights, aes(x = origin, fill = dep_type)) +
  geom_bar(position = "fill") +
  labs(y = "proportion")

# More Practice
# Exercise 7: Mutate the data frame so that it includes a new variable that
# contains the average speed, avg_speed, traveled by the plane for each
# flight (in mph).

# Distance divided by air_time / 60 (presumably air time in minutes converted
# to hours, which matches the mph requested by the exercise).
nycflights <- mutate(nycflights, avg_speed = distance / (air_time / 60))

nycflights

## # A tibble: 32,735 x 19
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    <int> <int> <int>    <int>     <dbl>    <int>     <dbl> <chr>   <chr>  
##  1  2013     6    30      940        15     1216        -4 VX      N626VA 
##  2  2013     5     7     1657        -3     2104        10 DL      N3760C 
##  3  2013    12     8      859        -1     1238        11 DL      N712TW 
##  4  2013     5    14     1841        -4     2122       -34 DL      N914DL 
##  5  2013     7    21     1102        -3     1230        -8 9E      N823AY 
##  6  2013     1     1     1817        -3     2008         3 AA      N3AXAA 
##  7  2013    12     9     1259        14     1617        22 WN      N218WN 
##  8  2013     8    13     1920        85     2032        71 B6      N284JB 
##  9  2013     9    26      725       -10     1027        -8 AA      N3FSAA 
## 10  2013     4    30     1323        62     1549        60 EV      N12163 
## # ... with 32,725 more rows, and 10 more variables: flight <int>, origin <chr>,
## #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   month_name <chr>, dep_type <chr>, avg_speed <dbl>

Exercise 8:

Make a scatterplot of avg_speed vs. distance. Describe the relationship between average speed and distance.

# Scatterplot of average speed against distance, coloured by carrier. The
# colour mapping sits in the top-level aes(), so geom_smooth() inherits the
# per-carrier grouping as well.
nycflights %>%
  ggplot(aes(x = distance, y = avg_speed, color = carrier)) +
  geom_point() +
  geom_smooth(method = "lm") # add linear trend line

## `geom_smooth()` using formula 'y ~ x'

# Exercise 9: Replicate the following plot

# Keep only the flights operated by the carriers AA, DL or UA.
filtered_carrier <- nycflights %>%
  filter(carrier %in% c("AA", "DL", "UA"))
# Arrival delay against departure delay, one colour per carrier.
filtered_carrier %>%
  ggplot(aes(x = dep_delay, y = arr_delay, color = carrier)) +
  geom_point()

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Per-column summary statistics of the built-in cars data set (template
# example chunk).
summary(datasets::cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Week1 Assignment2 - Introduction to Data