Homework 3

# Take the flights table shipped with the nycflights13 package and preview
# its first five rows.
data <- nycflights13::flights
head(x = data, n = 5L)

## # A tibble: 5 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Q1:

LGA has the smallest average departure delay (mean dep_delay of 10.3, versus 12.1 for JFK and 15.1 for EWR).

# Departure-delay summary per origin airport. The mean, median and variance
# skip missing delays; `observations` counts every row in the group,
# including rows whose dep_delay is NA.
return1 <- data %>%
  group_by(origin) %>%
  summarise(
    mean = mean(dep_delay, na.rm = TRUE),
    median = median(dep_delay, na.rm = TRUE),
    var = var(dep_delay, na.rm = TRUE),
    observations = n()
  )

print(return1)

## # A tibble: 3 × 5
##   origin  mean median   var observations
##   <chr>  <dbl>  <dbl> <dbl>        <int>
## 1 EWR     15.1     -1 1708.       120835
## 2 JFK     12.1     -1 1524.       111279
## 3 LGA     10.3     -3 1599.       104662

Q2:

Yes if the dep_delay is <= 0.

Q3:

# Add a calendar-date column built from the year/month/day components. The
# column is stored back on `data` because later steps read `date` from it.
data <- data %>%
  mutate(date = make_date(year, month, day))

# Draw 10 random flights and keep the columns of interest. slice_sample()
# replaces the superseded sample_n(). The draw is not seeded, so the rows
# selected differ from run to run.
return <- data %>%
  slice_sample(n = 10) %>%
  select(carrier, flight, arr_delay, dep_delay, air_time, date)

print(return)

## # A tibble: 10 × 6
##    carrier flight arr_delay dep_delay air_time date      
##    <chr>    <int>     <dbl>     <dbl>    <dbl> <date>    
##  1 B6        1176       -24        -5       36 2013-04-21
##  2 EV        4471         3        -5      106 2013-07-12
##  3 AS           7       -16        -3      323 2013-06-05
##  4 WN        3421       -10        -5      118 2013-04-21
##  5 EV        4671        -7        -1       72 2013-02-07
##  6 DL        1616         9        -3      342 2013-09-30
##  7 B6         305         2         0      142 2013-07-17
##  8 MQ        4479       -19        -8       70 2013-03-09
##  9 EV        4297       -21        -3       78 2013-10-01
## 10 US        2156       -21        -5       40 2013-08-01

Q4:

# Within each origin airport, keep the six rows with the largest arrival
# delay; with_ties = FALSE caps every group at six rows even when delays tie.
# The result is left grouped by origin.
return <- data %>%
  group_by(origin) %>%
  slice_max(arr_delay, n = 6, with_ties = FALSE) %>%
  select(origin, carrier, flight, arr_delay, dep_delay, air_time)

print(return)

## # A tibble: 18 × 6
## # Groups:   origin [3]
##    origin carrier flight arr_delay dep_delay air_time
##    <chr>  <chr>    <int>     <dbl>     <dbl>    <dbl>
##  1 EWR    MQ        3695      1109      1126      111
##  2 EWR    AA         172       878       896      149
##  3 EWR    MQ        3744       875       878      112
##  4 EWR    DL        1223       847       849      290
##  5 EWR    AA         172       846       845      145
##  6 EWR    DL        2042       796       798      109
##  7 JFK    HA          51      1272      1301      640
##  8 JFK    MQ        3535      1127      1137       74
##  9 JFK    AA         177      1007      1014      354
## 10 JFK    MQ        3075       989      1005       96
## 11 JFK    DL        2391       931       960      139
## 12 JFK    DL        2391       856       825      173
## 13 LGA    DL        2119       915       911      167
## 14 LGA    DL        2047       895       898      109
## 15 LGA    F9         835       834       853      233
## 16 LGA    DL        1435       821       812      174
## 17 LGA    AA        2019       802       803      134
## 18 LGA    DL        1715       780       787      160

Q5:

UA

# Tabulate the carriers of flights departing from EWR and report the one with
# the highest count (the first one in table order if several share it).
ewr_carrier_counts <- with(data, table(carrier[origin == "EWR"]))
return <- names(which.max(ewr_carrier_counts))

cat(return)

## UA

Q6:

166

# Count flights operated by AA, DL, WN, UA or AS whose departure delay and
# arrival delay are both exactly zero. Rows where the combined condition is
# NA (a missing delay) are dropped by na.rm = TRUE and so are not counted.
is_listed_carrier <- data$carrier %in% c("AA", "DL", "WN", "UA", "AS")
is_exactly_on_time <- data$dep_delay == 0 & data$arr_delay == 0
return <- sum(is_exactly_on_time & is_listed_carrier, na.rm = TRUE)
cat(return)

## 166

Q7:

LAX with 2580 flights and SFO with 2197 flights.

# Number of flights per destination for carrier VX.
vx_dest_counts <- table(data$dest[data$carrier == "VX"])

# Most frequent destination and its flight count.
top_idx <- which.max(vx_dest_counts)
check1 <- names(top_idx)
count1 <- max(vx_dest_counts)

# Runner-up: repeat the lookup with the top destination removed.
remaining <- vx_dest_counts[-top_idx]
check2 <- names(which.max(remaining))
count2 <- max(remaining)

cat(check1, count1)

## LAX 2580

cat(check2, count2)

## SFO 2197

Q8:

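342 flights are tied for the longest distance (4983), on the JFK to HNL route. The dates in 2013 on which no such flight appears in the data are: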

[1] “2013-09-12” “2013-09-17” “2013-09-19” “2013-09-24” “2013-09-26”

[6] “2013-10-01” “2013-10-03” “2013-10-08” “2013-10-10” “2013-10-15”

[11] “2013-10-17” “2013-10-22” “2013-10-24” “2013-10-29” “2013-10-31”

[16] “2013-11-05” “2013-11-07” “2013-11-14” “2013-11-19” “2013-11-28”

[21] “2013-12-03” “2013-12-10” “2013-12-12”

# Give the flight-number and tail-number columns descriptive names. `data`
# itself is overwritten, so every later step sees the renamed columns.
data <- rename(
  data,
  `Flight Number` = flight,
  `Plane Tail Number` = tailnum
)

# Sort by distance, longest first, and keep the first ten rows with the
# identifying columns only.
return1 <- data %>%
  arrange(desc(distance)) %>%
  slice_head(n = 10) %>%
  select(
    carrier, `Flight Number`, `Plane Tail Number`,
    distance, origin, dest, year, month, day
  )

print(return1)

## # A tibble: 10 × 9
##    carrier `Flight Number` `Plane Tail Number` distance origin dest   year month
##    <chr>             <int> <chr>                  <dbl> <chr>  <chr> <int> <int>
##  1 HA                   51 N380HA                  4983 JFK    HNL    2013     1
##  2 HA                   51 N380HA                  4983 JFK    HNL    2013     1
##  3 HA                   51 N380HA                  4983 JFK    HNL    2013     1
##  4 HA                   51 N384HA                  4983 JFK    HNL    2013     1
##  5 HA                   51 N381HA                  4983 JFK    HNL    2013     1
##  6 HA                   51 N385HA                  4983 JFK    HNL    2013     1
##  7 HA                   51 N385HA                  4983 JFK    HNL    2013     1
##  8 HA                   51 N389HA                  4983 JFK    HNL    2013     1
##  9 HA                   51 N384HA                  4983 JFK    HNL    2013     1
## 10 HA                   51 N388HA                  4983 JFK    HNL    2013     1
## # ℹ 1 more variable: day <int>

# Keep every flight whose distance equals the largest distance in the table
# (missing distances are ignored when taking the maximum).
return2 <- data %>%
  filter(distance == max(distance, na.rm = TRUE))

print(return2)

## # A tibble: 342 × 20
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      857            900        -3     1516           1530
##  2  2013     1     2      909            900         9     1525           1530
##  3  2013     1     3      914            900        14     1504           1530
##  4  2013     1     4      900            900         0     1516           1530
##  5  2013     1     5      858            900        -2     1519           1530
##  6  2013     1     6     1019            900        79     1558           1530
##  7  2013     1     7     1042            900       102     1620           1530
##  8  2013     1     8      901            900         1     1504           1530
##  9  2013     1     9      641            900      1301     1242           1530
## 10  2013     1    10      859            900        -1     1449           1530
## # ℹ 332 more rows
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, `Flight Number` <int>,
## #   `Plane Tail Number` <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>, date <date>

# Every calendar day of 2013.
all_days <- seq(as.Date("2013-01-01"), as.Date("2013-12-31"), by = "day")

# Days on which at least one of the longest-distance flights in `return2`
# is recorded.
flown_days <- return2$date

# Days of 2013 with no such flight. Logical subsetting is used instead of
# setdiff() so the result keeps its Date class regardless of which setdiff()
# (base, dplyr or lubridate) is in scope and how it treats classed vectors.
return3 <- all_days[!(all_days %in% flown_days)]

print(return3)

##  [1] "2013-09-12" "2013-09-17" "2013-09-19" "2013-09-24" "2013-09-26"
##  [6] "2013-10-01" "2013-10-03" "2013-10-08" "2013-10-10" "2013-10-15"
## [11] "2013-10-17" "2013-10-22" "2013-10-24" "2013-10-29" "2013-10-31"
## [16] "2013-11-05" "2013-11-07" "2013-11-14" "2013-11-19" "2013-11-28"
## [21] "2013-12-03" "2013-12-10" "2013-12-12"

Q9:

# CSV export link of the Google Sheet to load; the parsed table is stored in
# `read` for the reshaping step that follows.
url <- "https://docs.google.com/spreadsheets/d/1EjWi2CkoEX1aMtK7cl8mcDhfuQqRl_0n-fAM2C78_OA/export?format=csv"
read <- read_csv(file = url)

## Rows: 3 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Name
## dbl (6): Q1, Q5, Q10, Q90, Q95, Q99
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Reshape from wide to long: every column except Name becomes a
# (quantile, value) pair, Name is renamed to distribution, and the rows are
# sorted by distribution.
return <- read %>%
  pivot_longer(
    cols = !Name,
    names_to = "quantile",
    values_to = "value"
  ) %>%
  rename(distribution = Name) %>%
  arrange(distribution)

print(return)

## # A tibble: 18 × 3
##    distribution quantile   value
##    <chr>        <chr>      <dbl>
##  1 Exponential  Q1        0.0100
##  2 Exponential  Q5        0.0513
##  3 Exponential  Q10       0.105 
##  4 Exponential  Q90       2.30  
##  5 Exponential  Q95       3.00  
##  6 Exponential  Q99       4.61  
##  7 Normal       Q1       -2.33  
##  8 Normal       Q5       -1.64  
##  9 Normal       Q10      -1.28  
## 10 Normal       Q90       1.28  
## 11 Normal       Q95       1.64  
## 12 Normal       Q99       2.33  
## 13 Uniform      Q1        0.01  
## 14 Uniform      Q5        0.05  
## 15 Uniform      Q10       0.1   
## 16 Uniform      Q90       0.9   
## 17 Uniform      Q95       0.95  
## 18 Uniform      Q99       0.99

Q10:

# Read the tidy zinc-concentration table. NOTE: despite its name, `url` holds
# the parsed table from here on, not a URL string.
url <- read_csv(file = "https://moderndive.com/data/zinc_tidy.csv")

## Rows: 20 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): location
## dbl (2): loc_id, concentration
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Spread the location values into one column each (bottom, surface), then
# derive per-row comparisons between the two concentrations:
#   difference - numeric bottom minus surface
#   character  - the same subtraction written out as text
#   logical    - whether bottom exceeds surface
return <- url %>%
  pivot_wider(names_from = location, values_from = concentration) %>%
  mutate(
    difference = bottom - surface,
    character = paste(bottom, surface, sep = " - "),
    logical = bottom > surface
  )

print(return)

## # A tibble: 10 × 6
##    loc_id bottom surface difference character     logical
##     <dbl>  <dbl>   <dbl>      <dbl> <chr>         <lgl>  
##  1      1  0.43    0.415     0.0150 0.43 - 0.415  TRUE   
##  2      2  0.266   0.238     0.0280 0.266 - 0.238 TRUE   
##  3      3  0.567   0.39      0.177  0.567 - 0.39  TRUE   
##  4      4  0.531   0.41      0.121  0.531 - 0.41  TRUE   
##  5      5  0.707   0.605     0.102  0.707 - 0.605 TRUE   
##  6      6  0.716   0.609     0.107  0.716 - 0.609 TRUE   
##  7      7  0.651   0.632     0.0190 0.651 - 0.632 TRUE   
##  8      8  0.589   0.523     0.0660 0.589 - 0.523 TRUE   
##  9      9  0.469   0.411     0.058  0.469 - 0.411 TRUE   
## 10     10  0.723   0.612     0.111  0.723 - 0.612 TRUE

Homework 3

2025-10-04

Q1:

LGA has the smallest average departure delay (mean dep_delay of 10.3, versus 12.1 for JFK and 15.1 for EWR).

Q2:

Yes if the dep_delay is <= 0.

Q3:

Q4:

Q5:

UA

Q6:

166

Q7:

LAX with 2580 flights and SFO with 2197 flights.

Q8:

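342 flights are tied for the longest distance (4983), on the JFK to HNL route. The dates in 2013 on which no such flight appears in the data are: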

[1] “2013-09-12” “2013-09-17” “2013-09-19” “2013-09-24” “2013-09-26”

[6] “2013-10-01” “2013-10-03” “2013-10-08” “2013-10-10” “2013-10-15”

[11] “2013-10-17” “2013-10-22” “2013-10-24” “2013-10-29” “2013-10-31”

[16] “2013-11-05” “2013-11-07” “2013-11-14” “2013-11-19” “2013-11-28”

[21] “2013-12-03” “2013-12-10” “2013-12-12”

Q9:

Q10: