# Load the flights table from the nycflights13 package and preview the
# first five rows.
data <- nycflights13::flights
head(x = data, n = 5L)
## # A tibble: 5 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
Q1:
LGA has the smallest average departure delay (mean of about 10.3, versus 12.1 for JFK and 15.1 for EWR).
# Per-origin summary of departure delay: mean, median and variance are
# computed with missing delays dropped; `observations` counts every row of
# the group, including rows whose delay is missing.
return1 <- data %>%
  group_by(origin) %>%
  summarise(
    mean = mean(dep_delay, na.rm = TRUE),
    median = median(dep_delay, na.rm = TRUE),
    var = var(dep_delay, na.rm = TRUE),
    observations = n()
  )
print(return1)
## # A tibble: 3 × 5
## origin mean median var observations
## <chr> <dbl> <dbl> <dbl> <int>
## 1 EWR 15.1 -1 1708. 120835
## 2 JFK 12.1 -1 1524. 111279
## 3 LGA 10.3 -3 1599. 104662
Q2:
Yes, if `dep_delay` is <= 0.
Q3:
# Add a Date column assembled from the separate year/month/day columns.
data <- data %>%
  mutate(date = make_date(year, month, day))

# Draw 10 random flights and keep only the columns of interest.
# slice_sample() is the current replacement for the superseded sample_n().
# The draw is random, so the printed rows change from run to run unless a
# seed is set beforehand.
return <- data %>%
  slice_sample(n = 10) %>%
  select(carrier, flight, arr_delay, dep_delay, air_time, date)
print(return)
## # A tibble: 10 × 6
## carrier flight arr_delay dep_delay air_time date
## <chr> <int> <dbl> <dbl> <dbl> <date>
## 1 B6 1176 -24 -5 36 2013-04-21
## 2 EV 4471 3 -5 106 2013-07-12
## 3 AS 7 -16 -3 323 2013-06-05
## 4 WN 3421 -10 -5 118 2013-04-21
## 5 EV 4671 -7 -1 72 2013-02-07
## 6 DL 1616 9 -3 342 2013-09-30
## 7 B6 305 2 0 142 2013-07-17
## 8 MQ 4479 -19 -8 70 2013-03-09
## 9 EV 4297 -21 -3 78 2013-10-01
## 10 US 2156 -21 -5 40 2013-08-01
Q4:
# For each origin airport, keep the six flights with the largest arrival
# delay. with_ties = FALSE caps every group at exactly six rows even when
# delays tie. The result is left grouped by origin.
return <- data %>%
  select(origin, carrier, flight, arr_delay, dep_delay, air_time) %>%
  group_by(origin) %>%
  slice_max(order_by = arr_delay, n = 6, with_ties = FALSE)
print(return)
## # A tibble: 18 × 6
## # Groups: origin [3]
## origin carrier flight arr_delay dep_delay air_time
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 EWR MQ 3695 1109 1126 111
## 2 EWR AA 172 878 896 149
## 3 EWR MQ 3744 875 878 112
## 4 EWR DL 1223 847 849 290
## 5 EWR AA 172 846 845 145
## 6 EWR DL 2042 796 798 109
## 7 JFK HA 51 1272 1301 640
## 8 JFK MQ 3535 1127 1137 74
## 9 JFK AA 177 1007 1014 354
## 10 JFK MQ 3075 989 1005 96
## 11 JFK DL 2391 931 960 139
## 12 JFK DL 2391 856 825 173
## 13 LGA DL 2119 915 911 167
## 14 LGA DL 2047 895 898 109
## 15 LGA F9 835 834 853 233
## 16 LGA DL 1435 821 812 174
## 17 LGA AA 2019 802 803 134
## 18 LGA DL 1715 780 787 160
Q5:
UA
# Tabulate flights per carrier among departures from EWR, then report the
# carrier with the highest count (first one wins on a tie).
EWR <- table(data$carrier[data$origin == "EWR"])
return <- names(which.max(EWR))
cat(return)
## UA
Q6:
166
# Flag flights operated by one of the five carriers of interest.
# %in% replaces the chained `==`/`|` comparisons and never yields NA.
big5 <- data$carrier %in% c("AA", "DL", "WN", "UA", "AS")

# Flights that both departed and arrived exactly on schedule. Rows with a
# missing delay compare to NA (or FALSE when the other delay is non-zero).
search <- data$dep_delay == 0 & data$arr_delay == 0

# Count flights satisfying both conditions; NA entries are dropped.
return <- sum(search & big5, na.rm = TRUE)
cat(return)
## 166
Q7:
LAX with 2580 flights and SFO with 2197 flights.
# Count VX flights per destination.
aeroporte <- table(data$dest[data$carrier == "VX"])

# Most frequent destination and its flight count.
first_pos <- which.max(aeroporte)
check1 <- names(first_pos)
count1 <- aeroporte[[first_pos]]

# Drop the winner and repeat to get the runner-up.
cond <- aeroporte[-first_pos]
second_pos <- which.max(cond)
check2 <- names(second_pos)
count2 <- cond[[second_pos]]

cat(check1, count1)
## LAX 2580
cat(check2, count2)
## SFO 2197
Q8:
342 flights are tied for the longest distance (4983), all from JFK to HNL. The dates in 2013 with no such flight are:
[1] "2013-09-12" "2013-09-17" "2013-09-19" "2013-09-24" "2013-09-26"
[6] "2013-10-01" "2013-10-03" "2013-10-08" "2013-10-10" "2013-10-15"
[11] "2013-10-17" "2013-10-22" "2013-10-24" "2013-10-29" "2013-10-31"
[16] "2013-11-05" "2013-11-07" "2013-11-14" "2013-11-19" "2013-11-28"
[21] "2013-12-03" "2013-12-10" "2013-12-12"
# Give the flight and tail-number columns descriptive names. This overwrites
# `data`, so later code must use the new back-ticked names; re-running this
# line on the already-renamed table fails because the old names are gone.
data <- rename(
  data,
  `Flight Number` = flight,
  `Plane Tail Number` = tailnum
)

# Ten rows with the greatest distance, longest first.
return1 <- data %>%
  arrange(desc(distance)) %>%
  slice_head(n = 10) %>%
  select(
    carrier, `Flight Number`, `Plane Tail Number`, distance,
    origin, dest, year, month, day
  )
print(return1)
## # A tibble: 10 × 9
## carrier `Flight Number` `Plane Tail Number` distance origin dest year month
## <chr> <int> <chr> <dbl> <chr> <chr> <int> <int>
## 1 HA 51 N380HA 4983 JFK HNL 2013 1
## 2 HA 51 N380HA 4983 JFK HNL 2013 1
## 3 HA 51 N380HA 4983 JFK HNL 2013 1
## 4 HA 51 N384HA 4983 JFK HNL 2013 1
## 5 HA 51 N381HA 4983 JFK HNL 2013 1
## 6 HA 51 N385HA 4983 JFK HNL 2013 1
## 7 HA 51 N385HA 4983 JFK HNL 2013 1
## 8 HA 51 N389HA 4983 JFK HNL 2013 1
## 9 HA 51 N384HA 4983 JFK HNL 2013 1
## 10 HA 51 N388HA 4983 JFK HNL 2013 1
## # ℹ 1 more variable: day <int>
# Largest distance in the table, ignoring missing values. Note that this
# variable shares its name with base::ceiling(); function calls still
# resolve to the base function.
ceiling <- data %>%
  pull(distance) %>%
  max(na.rm = TRUE)

# Every flight whose distance equals that maximum.
return2 <- filter(data, distance == ceiling)
print(return2)
## # A tibble: 342 × 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 857 900 -3 1516 1530
## 2 2013 1 2 909 900 9 1525 1530
## 3 2013 1 3 914 900 14 1504 1530
## 4 2013 1 4 900 900 0 1516 1530
## 5 2013 1 5 858 900 -2 1519 1530
## 6 2013 1 6 1019 900 79 1558 1530
## 7 2013 1 7 1042 900 102 1620 1530
## 8 2013 1 8 901 900 1 1504 1530
## 9 2013 1 9 641 900 1301 1242 1530
## 10 2013 1 10 859 900 -1 1449 1530
## # ℹ 332 more rows
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, `Flight Number` <int>,
## # `Plane Tail Number` <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>, date <date>
# Every calendar day of 2013.
date <- seq.Date(from = as.Date("2013-01-01"), to = as.Date("2013-12-31"), by = "day")

# Days on which at least one maximum-distance flight appears in return2.
op <- return2$date

# Days of 2013 with no such flight. Logical indexing with %in% keeps the
# Date class on every R version, whereas setdiff() passes its arguments
# through as.vector() on older versions and returns bare day counts.
# `date` comes from seq.Date() and is already unique, so no de-duplication
# is needed.
return3 <- date[!(date %in% op)]
print(return3)
## [1] "2013-09-12" "2013-09-17" "2013-09-19" "2013-09-24" "2013-09-26"
## [6] "2013-10-01" "2013-10-03" "2013-10-08" "2013-10-10" "2013-10-15"
## [11] "2013-10-17" "2013-10-22" "2013-10-24" "2013-10-29" "2013-10-31"
## [16] "2013-11-05" "2013-11-07" "2013-11-14" "2013-11-19" "2013-11-28"
## [21] "2013-12-03" "2013-12-10" "2013-12-12"
Q9:
# CSV export link of the Google Sheet holding the quantile table.
url <- "https://docs.google.com/spreadsheets/d/1EjWi2CkoEX1aMtK7cl8mcDhfuQqRl_0n-fAM2C78_OA/export?format=csv"
# Download and parse the sheet (requires network access).
read <- read_csv(file = url)
## Rows: 3 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Name
## dbl (6): Q1, Q5, Q10, Q90, Q95, Q99
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Reshape the wide quantile table to long form: one row per
# (distribution, quantile) pair, sorted by distribution name.
return <- read %>%
  rename(distribution = Name) %>%
  pivot_longer(
    cols = -distribution,
    names_to = "quantile",
    values_to = "value"
  ) %>%
  arrange(distribution)
print(return)
## # A tibble: 18 × 3
## distribution quantile value
## <chr> <chr> <dbl>
## 1 Exponential Q1 0.0100
## 2 Exponential Q5 0.0513
## 3 Exponential Q10 0.105
## 4 Exponential Q90 2.30
## 5 Exponential Q95 3.00
## 6 Exponential Q99 4.61
## 7 Normal Q1 -2.33
## 8 Normal Q5 -1.64
## 9 Normal Q10 -1.28
## 10 Normal Q90 1.28
## 11 Normal Q95 1.64
## 12 Normal Q99 2.33
## 13 Uniform Q1 0.01
## 14 Uniform Q5 0.05
## 15 Uniform Q10 0.1
## 16 Uniform Q90 0.9
## 17 Uniform Q95 0.95
## 18 Uniform Q99 0.99
Q10:
# Download the tidy zinc-concentration data (requires network access).
# Note: `url` now holds the parsed table, not an address string.
url <- "https://moderndive.com/data/zinc_tidy.csv" %>%
  read_csv()
## Rows: 20 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): location
## dbl (2): loc_id, concentration
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Spread the `location` values into their own columns (one row per loc_id),
# then derive three comparison columns from `bottom` and `surface`:
#   difference - numeric bottom minus surface
#   character  - the same subtraction written out as text
#   logical    - whether bottom exceeds surface
return <- url %>%
  pivot_wider(names_from = location, values_from = concentration) %>%
  mutate(
    difference = bottom - surface,
    character = paste(bottom, surface, sep = " - "),
    logical = bottom > surface
  )
print(return)
## # A tibble: 10 × 6
## loc_id bottom surface difference character logical
## <dbl> <dbl> <dbl> <dbl> <chr> <lgl>
## 1 1 0.43 0.415 0.0150 0.43 - 0.415 TRUE
## 2 2 0.266 0.238 0.0280 0.266 - 0.238 TRUE
## 3 3 0.567 0.39 0.177 0.567 - 0.39 TRUE
## 4 4 0.531 0.41 0.121 0.531 - 0.41 TRUE
## 5 5 0.707 0.605 0.102 0.707 - 0.605 TRUE
## 6 6 0.716 0.609 0.107 0.716 - 0.609 TRUE
## 7 7 0.651 0.632 0.0190 0.651 - 0.632 TRUE
## 8 8 0.589 0.523 0.0660 0.589 - 0.523 TRUE
## 9 9 0.469 0.411 0.058 0.469 - 0.411 TRUE
## 10 10 0.723 0.612 0.111 0.723 - 0.612 TRUE