Question 1A:

3100 days

Question 1B:

Turing: 2049-05-15; von Neumann: 2040-11-18

# The two dates being compared, plus the fixed date treated as "today" by the
# calculations that follow.
neuman <- as.Date("1903-12-28")
turing <- as.Date("1912-06-23")

today <- as.Date("2025-10-02")

# Subtracting two Dates gives a difftime; convert it to a plain number of days.
diff <- as.numeric(turing - neuman, units = "days")
diff

## [1] 3100

# Days elapsed from `turing` to `today`, then the next multiple of 10,000 days
# strictly greater than that count.
td <- as.numeric(today - turing, units = "days")
tnm <- 10000 * (td %/% 10000 + 1)
# NOTE(review): days() is presumably lubridate::days(); no library() call is
# visible here, so confirm the package is attached.
turingresult <- turing + days(tnm)
turingresult

## [1] "2049-05-15"

# Days elapsed from `neuman` to `today`, then the next multiple of 10,000 days
# strictly greater than that count.
nd <- as.numeric(today - neuman, units = "days")
nnm <- 10000 * (nd %/% 10000 + 1)
# NOTE(review): days() is presumably lubridate::days(); no library() call is
# visible here, so confirm the package is attached.
neumanresult <- neuman + days(nnm)
neumanresult

## [1] "2040-11-18"

Question 2:

Moon landing in New York time: 1969-07-20 16:17:00

Moon landing in Hawaii time : 1969-07-20 10:17:00

Moon step in New York time : 1969-07-20 22:56:00

Moon step in Hawaii time : 1969-07-20 16:56:00

# Both events are entered as UTC timestamps.
# NOTE(review): mdy_hms() is presumably lubridate's parser; confirm it is loaded.
v1 <- mdy_hms("07/20/1969 20:17:00", tz = "UTC")  # landing
v2 <- mdy_hms("07/21/1969 02:56:00", tz = "UTC")  # first step

# Time zones the two instants are rendered in.
tz_ny <- "America/New_York"
tz_ha <- "Pacific/Honolulu"

# format() returns each instant as text expressed in the requested zone.
v1ny <- format(v1, tz = tz_ny)
v1ha <- format(v1, tz = tz_ha)
v2ny <- format(v2, tz = tz_ny)
v2ha <- format(v2, tz = tz_ha)

cat("Moon landing in New York time:", v1ny, "\n")

## Moon landing in New York time: 1969-07-20 16:17:00

cat("Moon landing in Hawaii time :", v1ha, "\n\n")

## Moon landing in Hawaii time : 1969-07-20 10:17:00

cat("Moon step in New York time  :", v2ny, "\n")

## Moon step in New York time  : 1969-07-20 22:56:00

cat("Moon step in Hawaii time    :", v2ha, "\n")

## Moon step in Hawaii time    : 1969-07-20 16:56:00

# Work on the flights table shipped with the nycflights13 package.
data <- nycflights13::flights
# Preview the first five rows.
head(data, 5L)

## # A tibble: 5 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Question 3

a: No, 48

# Candidate key for Question 3a: do carrier + flight + year/month/day pick out
# a single row of `data`?
index <- c("carrier", "flight", "year", "month", "day")

# Keep every row whose key occurs more than once. duplicated() on its own does
# not flag the first occurrence of a key, so the fromLast = TRUE pass is OR-ed
# in to catch that one too.
cond <- data[duplicated(data[index]) | duplicated(data[index], fromLast = TRUE), ]

# Distinct key combinations that repeat. The key *columns* are selected with
# cond[index]; the previous cond[index, ] used the column names as a row index,
# so it never returned the keys.
repeating <- distinct(cond[index])

# All rows involved in a repeated key. The name `return` is kept because the
# preview that follows the printed table reads it.
return <- distinct(cond)

print(return)

## # A tibble: 48 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     6     8      600            600         0      725            735
##  2  2013     6     8     1551           1540        11     1748           1750
##  3  2013     6    15      604            600         4      731            735
##  4  2013     6    15     1549           1540         9     1809           1750
##  5  2013     6    22      609            600         9      719            735
##  6  2013     6    22     1603           1540        23     1801           1750
##  7  2013     6    29      602            600         2      728            735
##  8  2013     6    29     1546           1540         6     1802           1750
##  9  2013     7     6      600            600         0      726            735
## 10  2013     7     6     1537           1540        -3     1726           1750
## # ℹ 38 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

# Show only the key columns (held in `index`) for the first few repeated rows.
head(return[, index])

## # A tibble: 6 × 5
##   carrier flight  year month   day
##   <chr>    <int> <int> <int> <int>
## 1 WN        2269  2013     6     8
## 2 WN        2269  2013     6     8
## 3 WN        2269  2013     6    15
## 4 WN        2269  2013     6    15
## 5 WN        2269  2013     6    22
## 6 WN        2269  2013     6    22

b: No, 14

# Candidate key for Question 3b: the same columns as before plus the tail
# number.
index <- c("carrier", "flight", "tailnum", "year", "month", "day")

# Keep every row whose key occurs more than once. duplicated() on its own does
# not flag the first occurrence of a key, so the fromLast = TRUE pass is OR-ed
# in to catch that one too.
cond <- data[duplicated(data[index]) | duplicated(data[index], fromLast = TRUE), ]

# Distinct key combinations that repeat. The key *columns* are selected with
# cond[index]; the previous cond[index, ] used the column names as a row index,
# so it never returned the keys.
repeating <- distinct(cond[index])

# All rows involved in a repeated key. The name `return` is kept because the
# preview that follows the printed table reads it.
return <- distinct(cond)

print(return)

## # A tibble: 14 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     6     8      600            600         0      725            735
##  2  2013     6     8     1551           1540        11     1748           1750
##  3  2013     6    15      604            600         4      731            735
##  4  2013     6    15     1549           1540         9     1809           1750
##  5  2013     6    22      609            600         9      719            735
##  6  2013     6    22     1603           1540        23     1801           1750
##  7  2013     6    29      602            600         2      728            735
##  8  2013     6    29     1546           1540         6     1802           1750
##  9  2013     7     6      600            600         0      726            735
## 10  2013     7     6     1537           1540        -3     1726           1750
## 11  2013     8     3      619            600        19      748            735
## 12  2013     8     3     1609           1540        29     2105           1750
## 13  2013     8    10      605            600         5      732            735
## 14  2013     8    10     1614           1540        34     1839           1750
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

# Show only the key columns (held in `index`) for the first few repeated rows.
head(return[, index])

## # A tibble: 6 × 6
##   carrier flight tailnum  year month   day
##   <chr>    <int> <chr>   <int> <int> <int>
## 1 WN        2269 N487WN   2013     6     8
## 2 WN        2269 N487WN   2013     6     8
## 3 WN        2269 N230WN   2013     6    15
## 4 WN        2269 N230WN   2013     6    15
## 5 WN        2269 N440LV   2013     6    22
## 6 WN        2269 N440LV   2013     6    22

c: 6.895377 minutes

# Mean of arr_delay across all rows, with missing values dropped.
avg <- mean(data[["arr_delay"]], na.rm = TRUE)

cat(avg, "minutes")

## 6.895377 minutes

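d: Because table() has no na.rm argument. Anything passed through its `...` is treated as another object to tabulate, so na.rm = TRUE is not read as a request to remove “NA” values; table() handles missing values through its useNA and exclude arguments instead.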

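e: 8650 rows are returned, but that total includes the all-NA rows produced wherever the filter evaluates to NA because dep_delay or arr_delay is missing, so the number of flights that actually match is smaller.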

# Rows from 2013 whose departure delay and arrival delay are both exactly zero.
# The mask is NA wherever a delay is missing, and indexing rows with NA yields
# an all-NA row for each one, which inflates the row count. which() keeps only
# the positions where the condition is TRUE.
exact <- data$year == 2013 & data$dep_delay == 0 & data$arr_delay == 0
return <- data[which(exact), ]
print(return)

## # A tibble: 8,650 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1    NA    NA    NA       NA             NA        NA       NA             NA
##  2    NA    NA    NA       NA             NA        NA       NA             NA
##  3    NA    NA    NA       NA             NA        NA       NA             NA
##  4    NA    NA    NA       NA             NA        NA       NA             NA
##  5  2013     1     2      600            600         0      846            846
##  6  2013     1     2     1437           1437         0     1742           1742
##  7    NA    NA    NA       NA             NA        NA       NA             NA
##  8    NA    NA    NA       NA             NA        NA       NA             NA
##  9    NA    NA    NA       NA             NA        NA       NA             NA
## 10    NA    NA    NA       NA             NA        NA       NA             NA
## # ℹ 8,640 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

f: LGA has the largest proportion at .6479811

# TRUE where dep_delay is zero or negative; NA where dep_delay is missing.
time <- data$dep_delay <= 0

# Row counts per origin: every row first, then only rows flagged TRUE above.
# which() drops the NA positions up front (table() would discard them anyway).
torigin <- table(data$origin)
otorigin <- table(data$origin[which(time)])

# Per-origin share of rows with dep_delay <= 0. The denominator counts every
# row, including those whose dep_delay is missing.
porigin <- otorigin / torigin

porigin

## 
##       EWR       JFK       LGA 
## 0.5369719 0.6055500 0.6479811

g: 0.0165238

# TRUE where arr_delay is exactly zero; NA where arr_delay is missing.
delay <- data$arr_delay == 0

# Rows with a zero delay, divided by rows that have any recorded arr_delay.
n_zero <- sum(delay, na.rm = TRUE)
n_recorded <- sum(!is.na(data$arr_delay))
proportion <- n_zero / n_recorded

proportion

## [1] 0.0165238

h: No. AS has very few flights in this dataset (714), so its results would be skewed, making the comparison unreliable and inconsistent.

# Number of rows per carrier code.
table(data[["carrier"]])

## 
##    9E    AA    AS    B6    DL    EV    F9    FL    HA    MQ    OO    UA    US 
## 18460 32729   714 54635 48110 54173   685  3260   342 26397    32 58665 20536 
##    VX    WN    YV 
##  5162 12275   601

i: Yes, 63.47809% of Big 5 flights arrive on time, compared with 55.8875% of flights on the other carriers.

# Carrier codes treated as the "big 5" group.
big5 <- c("AA", "DL", "UA", "WN", "AS")

# Row mask for the big-5 carriers, and a flag for arr_delay <= 0 (NA where
# arr_delay is missing).
checkbig5 <- data$carrier %in% big5
delay <- data$arr_delay <= 0

# Among the rows selected by `rows`: count with arr_delay <= 0 divided by the
# count with a recorded arr_delay.
on_time_share <- function(rows) {
  sum(delay[rows], na.rm = TRUE) / sum(!is.na(data$arr_delay[rows]))
}

proportionb5 <- on_time_share(checkbig5)
oproportion <- on_time_share(!checkbig5)

print(proportionb5)

## [1] 0.6347809

print(oproportion)

## [1] 0.558875

j: 77.51401%

# Allowed arrival delay per row: 30 when distance is at least 2000, else 15.
allowed <- ifelse(data$distance >= 2000, 30, 15)
# TRUE where arr_delay is within the allowance; NA where arr_delay is missing.
anna <- data$arr_delay <= allowed

# Rows within the allowance divided by rows with a recorded arr_delay.
return <- sum(anna, na.rm = TRUE) / sum(!is.na(data$arr_delay))

return

## [1] 0.7751401

Homework 2

Dev Amin

2025-09-30

Question 1A:

3100 days

Question 1B:

Turing: 2049-05-15; von Neumann: 2040-11-18

Question 2:

Moon landing in New York time: 1969-07-20 16:17:00

Moon landing in Hawaii time : 1969-07-20 10:17:00

Moon step in New York time : 1969-07-20 22:56:00

Moon step in Hawaii time : 1969-07-20 16:56:00

Question 3

a: No, 48

b: No, 14

c: 6.895377 minutes

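d: Because table() has no na.rm argument. Anything passed through its `...` is treated as another object to tabulate, so na.rm = TRUE is not read as a request to remove “NA” values; table() handles missing values through its useNA and exclude arguments instead.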

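e: 8650 rows are returned, but that total includes the all-NA rows produced wherever the filter evaluates to NA because dep_delay or arr_delay is missing, so the number of flights that actually match is smaller.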

f: LGA has the largest proportion at .6479811

g: 0.0165238

h: No. AS has very few flights in this dataset (714), so its results would be skewed, making the comparison unreliable and inconsistent.

i: Yes, 63.47809% of Big 5 flights arrive on time, compared with 55.8875% of flights on the other carriers.

j: 77.51401%