Question 1: How many flights arrived late each month? A late arrival is a flight arriving more than 5 minutes after its scheduled arrival time.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(nycflights13)
# Late arrivals by month: a flight is late when arr_delay exceeds 5 minutes.
# Flights with a missing arr_delay are never counted as late.
late_arrivals_by_month <- summarise(
  group_by(flights, month),
  lateflights = sum(!is.na(arr_delay) & arr_delay > 5)
)
print(late_arrivals_by_month)
## # A tibble: 12 × 2
##    month lateflights
##    <int>       <int>
##  1     1        8988
##  2     2        8119
##  3     3        9033
##  4     4       10544
##  5     5        8490
##  6     6       10739
##  7     7       11518
##  8     8        9649
##  9     9        5347
## 10    10        7628
## 11    11        7485
## 12    12       12291
# Total number of flights per month (one row per month, count in `total`).
# Question 2 rebuilds this same table before using it.
total_flights_per_month <- count(flights, month, name = "total")

Question 2 What percentage of traffic did each carrier represent, by month?

# Total number of flights per month (one row per month, count in `total`)
total_flights_per_month <- count(flights, month, name = "total")

# Number of flights per carrier per month. `.groups` is intentionally left
# unset, so the result remains grouped by carrier.
carrier_flights_per_month <- summarise(
  group_by(flights, carrier, month),
  count = n()
)
## `summarise()` has grouped output by 'carrier'. You can override using the
## `.groups` argument.
# Attach each month's total to the per-carrier counts, then express the
# carrier's share of that month's flights as a percentage string rounded to
# 3 decimal places.
carrier_percentage <- carrier_flights_per_month %>%
  left_join(total_flights_per_month, by = "month") %>%
  mutate(percentage = paste0(round((count / total) * 100, 3), "%"))

# Reshape to wide format for easier viewing: one row per carrier, one column
# per month. pivot_wider() replaces the superseded spread(); names_sort = TRUE
# orders the month columns by month value, as spread() did. Carrier/month
# combinations with no flights are left as NA.
spread_data <- carrier_percentage %>%
  select(-count, -total) %>%
  pivot_wider(
    names_from = month,
    values_from = percentage,
    names_sort = TRUE
  )

print(spread_data)
## # A tibble: 16 × 13
## # Groups:   carrier [16]
##    carrier `1`     `2`     `3`   `4`   `5`   `6`   `7`   `8`   `9`   `10`  `11` 
##    <chr>   <chr>   <chr>   <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 9E      5.825%  5.847%  5.64… 5.33… 5.07… 5.08… 5.07… 4.96… 5.58… 5.79… 5.84…
##  2 AA      10.347% 10.088% 9.66… 9.60… 9.73… 9.76… 9.79… 9.73… 9.48% 9.39… 9.45…
##  3 AS      0.23%   0.224%  0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.19…
##  4 B6      16.394% 16.444% 16.5… 15.9… 15.8… 16.3… 16.9… 16.8… 15.5… 15.0… 15.7…
##  5 DL      13.665% 13.803% 14.5… 14.4… 14.1… 14.6… 14.4… 14.7… 14.0… 14.1… 14.1…
##  6 EV      15.446% 15.338% 16.3… 16.1% 16.7… 15.7… 15.7… 15.5… 17.1… 16.9… 16.3…
##  7 F9      0.218%  0.196%  0.19… 0.20… 0.20… 0.19… 0.19… 0.18… 0.21% 0.19… 0.22…
##  8 FL      1.215%  1.186%  1.09… 1.09… 1.12… 0.89… 0.89… 0.89… 0.92… 0.81… 0.74…
##  9 HA      0.115%  0.112%  0.10… 0.10… 0.10… 0.10… 0.10… 0.10… 0.09… 0.07… 0.09…
## 10 MQ      8.41%   8.192%  7.82… 7.80… 7.93… 7.71… 7.68… 7.71… 8%    7.71… 7.54%
## 11 OO      0.004%  <NA>    <NA>  <NA>  <NA>  0.00… <NA>  0.01… 0.07… <NA>  0.01…
## 12 UA      17.172% 17.418% 17.2… 17.8… 17.2… 17.6… 17.2… 17.4… 17.0… 17.5… 17.8…
## 13 US      5.932%  6.22%   5.96… 6.09… 6.19… 6.14… 6.07% 6.06… 6.15… 6.39% 6.23…
## 14 VX      1.17%   1.086%  1.05… 1.64… 1.72… 1.7%  1.66… 1.66… 1.64… 1.63… 1.65…
## 15 WN      3.688%  3.651%  3.46… 3.45… 3.49… 3.64% 3.65… 3.57% 3.66… 3.77… 3.78…
## 16 YV      0.17%   0.192%  0.06… 0.13… 0.17% 0.17… 0.27… 0.22… 0.15… 0.22… 0.18%
## # ℹ 1 more variable: `12` <chr>

Question 3 What was the latest flight to depart each month?

# Expose dep_delay under the shorter name `delay` on the flights table
flights <- mutate(flights, delay = dep_delay)

# For each month keep the single flight with the largest departure delay.
# which.max() ignores missing delays and returns the first maximum, so a tie
# resolves to the earliest matching row.
most_delayed_flights <- flights %>%
  group_by(month) %>%
  slice(which.max(delay))

print(most_delayed_flights)
## # A tibble: 12 × 20
## # Groups:   month [12]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     2    10     2243            830       853      100           1106
##  3  2013     3    17     2321            810       911      135           1020
##  4  2013     4    10     1100           1900       960     1342           2211
##  5  2013     5     3     1133           2055       878     1250           2215
##  6  2013     6    15     1432           1935      1137     1607           2120
##  7  2013     7    22      845           1600      1005     1044           1815
##  8  2013     8     8     2334           1454       520      120           1710
##  9  2013     9    20     1139           1845      1014     1457           2210
## 10  2013    10    14     2042            900       702     2255           1127
## 11  2013    11     3      603           1645       798      829           1913
## 12  2013    12     5      756           1700       896     1058           2020
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, delay <dbl>

Question 4 Count the usefulness by learning platform.

# Read the multiple-choice responses CSV; the path is relative to the current
# working directory. Column types are guessed by readr.
data <- read_csv("multipleChoiceResponses1.csv")
## Rows: 16716 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): LearningPlatformUsefulnessArxiv, LearningPlatformUsefulnessBlogs, ...
## dbl  (1): Age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Stack every LearningPlatformUsefulness* column into long format: one row per
# non-missing answer, with the platform name (column prefix stripped) in
# `learning_platform` and the rating in `usefulness`.
# pivot_longer() replaces the superseded gather(); cols_vary = "slowest" keeps
# gather()'s row order (all rows of the first column, then the second, ...).
usefulness_data <- data %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    values_drop_na = TRUE,
    cols_vary = "slowest"
  )

# Number of responses for each platform / rating combination
count_data <- usefulness_data %>%
  count(learning_platform, usefulness)
count_data
## # A tibble: 54 × 3
##    learning_platform usefulness          n
##    <chr>             <chr>           <int>
##  1 Arxiv             Not Useful         37
##  2 Arxiv             Somewhat useful  1038
##  3 Arxiv             Very useful      1316
##  4 Blogs             Not Useful         45
##  5 Blogs             Somewhat useful  2406
##  6 Blogs             Very useful      2314
##  7 College           Not Useful        101
##  8 College           Somewhat useful  1405
##  9 College           Very useful      1853
## 10 Communities       Not Useful         16
## # ℹ 44 more rows

Question 5

library(tidyverse)
library(lubridate)
library(dplyr)
library(scales)
library(ggplot2)
library(tidyquant)
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(zoo)
# Responses whose rating is anything other than "Not Useful"
usefulness_data_filtered <- filter(usefulness_data, usefulness != "Not Useful")

# Number of responses per platform, all ratings included (`tot`)
total_responses <- count(usefulness_data, learning_platform, name = "tot")

# Number of useful responses per platform (`n`)
useful_counts <- count(usefulness_data_filtered, learning_platform)

# Put both counts side by side and derive the useful share as n / tot
joined_data <- useful_counts %>%
  left_join(total_responses, by = "learning_platform")
perc_usefulness <- mutate(joined_data, perc_usefulness = n / tot)
perc_usefulness
## # A tibble: 18 × 4
##    learning_platform     n   tot perc_usefulness
##    <chr>             <int> <int>           <dbl>
##  1 Arxiv              2354  2391           0.985
##  2 Blogs              4720  4765           0.991
##  3 College            3258  3359           0.970
##  4 Communities        1126  1142           0.986
##  5 Company             940   981           0.958
##  6 Conferences        2063  2182           0.945
##  7 Courses            5945  5992           0.992
##  8 Documentation      2279  2321           0.982
##  9 Friends            1530  1581           0.968
## 10 Kaggle             6527  6583           0.991
## 11 Newsletters        1033  1089           0.949
## 12 Podcasts           1090  1214           0.898
## 13 Projects           4755  4794           0.992
## 14 SO                 5576  5640           0.989
## 15 Textbook           4112  4181           0.983
## 16 TradeBook           324   333           0.973
## 17 Tutoring           1394  1426           0.978
## 18 YouTube            5125  5229           0.980
# Order the platform factor by descending share of useful responses. A single
# fct_reorder() call fully determines the level order used by the plot.
perc_usefulness <- perc_usefulness %>%
  mutate(
    learning_platform = fct_reorder(
      learning_platform,
      perc_usefulness,
      .desc = TRUE
    )
  )

# Horizontal bar chart of the useful share per platform. geom_col() draws bars
# from the values as given; the value axis is formatted as a percentage so it
# matches its "Percentage of Usefulness" label (the data are 0-1 proportions).
ggplot(perc_usefulness, aes(x = learning_platform, y = perc_usefulness)) +
  geom_col() +
  scale_y_continuous(labels = scales::label_percent()) +
  theme_minimal() +
  coord_flip() +
  labs(
    x = "Learning Platform",
    y = "Percentage of Usefulness",
    title = "Usefulness of Learning Platforms"
  )