Question 1A:
Turing: 2049-05-15; von Neumann: 2040-11-18
# Birth dates of the two subjects, plus the fixed reference date used as "today".
turing <- as.Date("1912-06-23")
neuman <- as.Date("1903-12-28")
today <- as.Date("2025-10-02")

# Gap between the two birthdays, expressed in days.
diff <- as.numeric(turing - neuman, units = "days")
print(diff)
## [1] 3100

# Turing: age in days as of `today`, then the next multiple of 10000 days
# strictly above that age, added back onto the birth date.
td <- as.numeric(today - turing, units = "days")
tnm <- 10000 * (td %/% 10000 + 1)
turingresult <- turing + days(tnm)
print(turingresult)
## [1] "2049-05-15"

# Same computation for the second birth date.
nd <- as.numeric(today - neuman, units = "days")
nnm <- 10000 * (nd %/% 10000 + 1)
neumanresult <- neuman + days(nnm)
print(neumanresult)
## [1] "2040-11-18"
Question 2:
Moon landing in New York time: 1969-07-20 16:17:00
Moon landing in Hawaii time : 1969-07-20 10:17:00
Moon step in New York time : 1969-07-20 22:56:00
Moon step in Hawaii time : 1969-07-20 16:56:00
# The two event instants, parsed as UTC timestamps.
v1 <- mdy_hms("07/20/1969 20:17:00", tz = "UTC") # landing
v2 <- mdy_hms("07/21/1969 02:56:00", tz = "UTC") # first step

# Render each instant as text in the two target time zones, one zone at a time.
ny_zone <- "America/New_York"
v1ny <- format(v1, tz = ny_zone)
v2ny <- format(v2, tz = ny_zone)

hi_zone <- "Pacific/Honolulu"
v1ha <- format(v1, tz = hi_zone)
v2ha <- format(v2, tz = hi_zone)

# Report the landing first, then the first step.
cat("Moon landing in New York time:", v1ny, "\n")
## Moon landing in New York time: 1969-07-20 16:17:00
cat("Moon landing in Hawaii time :", v1ha, "\n\n")
## Moon landing in Hawaii time : 1969-07-20 10:17:00
cat("Moon step in New York time :", v2ny, "\n")
## Moon step in New York time : 1969-07-20 22:56:00
cat("Moon step in Hawaii time :", v2ha, "\n")
## Moon step in Hawaii time : 1969-07-20 16:56:00
# Take the flights table from the nycflights13 package and preview its first
# five rows.
data <- nycflights13::flights
head(data, 5L)
## # A tibble: 5 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
Question 3
a: No, 48
# Candidate key: carrier + flight number + calendar date.
index <- c("carrier", "flight", "year", "month", "day")

# Keep every row whose key occurs more than once. The forward scan flags the
# later copies of a key and the backward scan (fromLast = TRUE) flags the
# first copy, so together they mark all members of each repeated group.
is_repeated <- duplicated(data[index]) |
  duplicated(data[index], fromLast = TRUE)
cond <- data[is_repeated, ]

# Distinct key combinations that repeat. The key names must select COLUMNS;
# the earlier `cond[index, ]` passed them as a row index, so the result never
# contained the key columns.
repeating <- distinct(cond[, index])

# All rows involved in a repeated key, with exact duplicate rows collapsed.
return <- distinct(cond)
print(return)
## # A tibble: 48 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 6 8 600 600 0 725 735
## 2 2013 6 8 1551 1540 11 1748 1750
## 3 2013 6 15 604 600 4 731 735
## 4 2013 6 15 1549 1540 9 1809 1750
## 5 2013 6 22 609 600 9 719 735
## 6 2013 6 22 1603 1540 23 1801 1750
## 7 2013 6 29 602 600 2 728 735
## 8 2013 6 29 1546 1540 6 1802 1750
## 9 2013 7 6 600 600 0 726 735
## 10 2013 7 6 1537 1540 -3 1726 1750
## # ℹ 38 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
head(return[, c("carrier", "flight", "year", "month", "day")])
## # A tibble: 6 × 5
## carrier flight year month day
## <chr> <int> <int> <int> <int>
## 1 WN 2269 2013 6 8
## 2 WN 2269 2013 6 8
## 3 WN 2269 2013 6 15
## 4 WN 2269 2013 6 15
## 5 WN 2269 2013 6 22
## 6 WN 2269 2013 6 22
b: No, 14
# Candidate key: carrier + flight number + tail number + calendar date.
index <- c("carrier", "flight", "tailnum", "year", "month", "day")

# Keep every row whose key occurs more than once. The forward scan flags the
# later copies of a key and the backward scan (fromLast = TRUE) flags the
# first copy, so together they mark all members of each repeated group.
is_repeated <- duplicated(data[index]) |
  duplicated(data[index], fromLast = TRUE)
cond <- data[is_repeated, ]

# Distinct key combinations that repeat. The key names must select COLUMNS;
# the earlier `cond[index, ]` passed them as a row index, so the result never
# contained the key columns.
repeating <- distinct(cond[, index])

# All rows involved in a repeated key, with exact duplicate rows collapsed.
return <- distinct(cond)
print(return)
## # A tibble: 14 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 6 8 600 600 0 725 735
## 2 2013 6 8 1551 1540 11 1748 1750
## 3 2013 6 15 604 600 4 731 735
## 4 2013 6 15 1549 1540 9 1809 1750
## 5 2013 6 22 609 600 9 719 735
## 6 2013 6 22 1603 1540 23 1801 1750
## 7 2013 6 29 602 600 2 728 735
## 8 2013 6 29 1546 1540 6 1802 1750
## 9 2013 7 6 600 600 0 726 735
## 10 2013 7 6 1537 1540 -3 1726 1750
## 11 2013 8 3 619 600 19 748 735
## 12 2013 8 3 1609 1540 29 2105 1750
## 13 2013 8 10 605 600 5 732 735
## 14 2013 8 10 1614 1540 34 1839 1750
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
head(return[, c("carrier", "flight", "tailnum", "year", "month", "day")])
## # A tibble: 6 × 6
## carrier flight tailnum year month day
## <chr> <int> <chr> <int> <int> <int>
## 1 WN 2269 N487WN 2013 6 8
## 2 WN 2269 N487WN 2013 6 8
## 3 WN 2269 N230WN 2013 6 15
## 4 WN 2269 N230WN 2013 6 15
## 5 WN 2269 N440LV 2013 6 22
## 6 WN 2269 N440LV 2013 6 22
c: 6.895377 minutes
# Mean of the arr_delay column, ignoring missing values.
avg <- mean(data[["arr_delay"]], na.rm = TRUE)
cat(avg, "minutes")
## 6.895377 minutes
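d: Because table() has no na.rm argument. Anything passed to it through `...`
is treated as another variable to cross-tabulate, so table() does not
interpret na.rm = TRUE as a request to remove “NA” values. (table() already
leaves NA out by default; useNA = "ifany" shows them.)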
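e: 8650 rows are returned, but many of them are all-NA rows produced by NA comparisons in the filter (see the printed output), so 8650 overstates the number of flights that actually satisfy the condition.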
# Flights from 2013 whose departure delay and arrival delay are both exactly 0.
#
# Comparing a missing delay with 0 yields NA, and `[` turns every NA in a
# logical row index into a row made entirely of NA. Those rows are not flights
# and must not be counted, so which() is used to keep only the positions where
# the condition is TRUE.
on_schedule <- data$year == 2013 & data$dep_delay == 0 & data$arr_delay == 0
return <- data[which(on_schedule), ]
print(return)
## NOTE(review): the output previously recorded here (a 8,650 × 19 tibble that
## consisted mostly of all-NA rows) came from the NA-contaminated subset.
## Re-knit to refresh the printed table; nrow(return) is the corrected answer.
f: LGA has the largest proportion at .6479811
# Per origin: flights with dep_delay of at most 0, as a share of all flights
# listed for that origin.
time <- !(data[["dep_delay"]] > 0)

# Counts per origin: every flight, then only the flagged ones. Entries whose
# flag is NA become NA after subsetting and table() leaves them out.
torigin <- table(data$origin)
otorigin <- table(data$origin[time])

porigin <- otorigin / torigin
print(porigin)
##
## EWR JFK LGA
## 0.5369719 0.6055500 0.6479811
g: 0.0165238
# Share of flights with a recorded arr_delay whose arr_delay is exactly 0.
delay <- data[["arr_delay"]] == 0
# `delay` is NA exactly where arr_delay is NA, so counting its non-missing
# entries gives the number of flights with a recorded arrival delay.
exact_hits <- sum(delay, na.rm = TRUE)
recorded <- sum(!is.na(delay))
proportion <- exact_hits / recorded
print(proportion)
## [1] 0.0165238
h: No. AS has very few flights in the dataset, so its results could be
skewed, which makes the comparison unreliable and inconsistent.
# Number of rows in the flights table for each carrier code.
table(data[["carrier"]])
##
## 9E AA AS B6 DL EV F9 FL HA MQ OO UA US
## 18460 32729 714 54635 48110 54173 685 3260 342 26397 32 58665 20536
## VX WN YV
## 5162 12275 601
i: Yes, 63.47809% of Big 5 flights arrive on time, compared with
55.8875% of flights on the other carriers.
# Compare the on-time arrival share (arr_delay of at most 0) of the five
# listed carriers against all remaining carriers.
big5 <- c("AA", "DL", "UA", "WN", "AS")
checkbig5 <- is.element(data$carrier, big5)
delay <- !(data$arr_delay > 0)

# Each share = on-time flights / flights with a recorded arr_delay, within the
# group. `delay` is NA exactly where arr_delay is NA.
in_big5 <- delay[checkbig5]
in_rest <- delay[!checkbig5]
proportionb5 <- sum(in_big5, na.rm = TRUE) / sum(!is.na(in_big5))
oproportion <- sum(in_rest, na.rm = TRUE) / sum(!is.na(in_rest))

proportionb5
## [1] 0.6347809
oproportion
## [1] 0.558875
j: 77.51401%
# A flight passes when its arr_delay is within a distance-dependent allowance:
# at most 30 when distance is 2000 or more, at most 15 otherwise.
allowance <- ifelse(data$distance >= 2000, 30, 15)
anna <- data$arr_delay <= allowance

# Passing flights as a share of flights with a recorded arr_delay.
recorded <- sum(!is.na(data$arr_delay))
return <- sum(anna, na.rm = TRUE) / recorded
print(return)
## [1] 0.7751401