# Load libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(nycflights13)


# Question 1
data("flights")
late_flights_by_month <- flights %>%
  mutate(late = arr_delay > 5) %>%  
  group_by(month) %>%
  summarise(lateflights = sum(late, na.rm = TRUE))
late_flights_by_month
## # A tibble: 12 × 2
##    month lateflights
##    <int>       <int>
##  1     1        8988
##  2     2        8119
##  3     3        9033
##  4     4       10544
##  5     5        8490
##  6     6       10739
##  7     7       11518
##  8     8        9649
##  9     9        5347
## 10    10        7628
## 11    11        7485
## 12    12       12291
# Question 2
traffic_percentage <- flights %>%
  group_by(month, carrier) %>%
  summarise(n_flights = n(), .groups = 'drop') %>%
  group_by(month) %>%
  mutate(total_flights = sum(n_flights)) %>%
  ungroup() %>%
  mutate(percentage = n_flights / total_flights * 100) %>%
  select(month, carrier, percentage) %>%
  pivot_wider(names_from = month, values_from = percentage, names_prefix = '')
traffic_percentage
## # A tibble: 16 × 13
##    carrier      `1`    `2`     `3`    `4`    `5`      `6`    `7`     `8`     `9`
##    <chr>      <dbl>  <dbl>   <dbl>  <dbl>  <dbl>    <dbl>  <dbl>   <dbl>   <dbl>
##  1 9E       5.83     5.85   5.64    5.33   5.08   5.09     5.08   4.96    5.58  
##  2 AA      10.3     10.1    9.67    9.61   9.73   9.76     9.79   9.74    9.48  
##  3 AS       0.230    0.224  0.215   0.212  0.215  0.212    0.211  0.211   0.218 
##  4 B6      16.4     16.4   16.5    15.9   15.9   16.4     16.9   16.9    15.6   
##  5 DL      13.7     13.8   14.5    14.4   14.2   14.6     14.4   14.7    14.1   
##  6 EV      15.4     15.3   16.4    16.1   16.7   15.8     15.8   15.6    17.1   
##  7 F9       0.218    0.196  0.198   0.201  0.201  0.195    0.197  0.188   0.210 
##  8 FL       1.21     1.19   1.10    1.10   1.13   0.892    0.894  0.897   0.925 
##  9 HA       0.115    0.112  0.108   0.106  0.108  0.106    0.105  0.106   0.0907
## 10 MQ       8.41     8.19   7.82    7.80   7.93   7.71     7.68   7.72    8.00  
## 11 OO       0.00370 NA     NA      NA     NA      0.00708 NA      0.0136  0.0725
## 12 UA      17.2     17.4   17.2    17.8   17.2   17.6     17.2   17.5    17.0   
## 13 US       5.93     6.22   5.97    6.10   6.20   6.15     6.07   6.07    6.16  
## 14 VX       1.17     1.09   1.05    1.64   1.72   1.70     1.66   1.67    1.64  
## 15 WN       3.69     3.65   3.46    3.46   3.49   3.64     3.66   3.57    3.66  
## 16 YV       0.170    0.192  0.0624  0.134  0.170  0.173    0.275  0.222   0.152 
## # ℹ 3 more variables: `10` <dbl>, `11` <dbl>, `12` <dbl>
# Question 3
latest_flight_each_month <- flights %>%
  group_by(month) %>%
  filter(dep_time == max(dep_time, na.rm = TRUE)) %>%
  ungroup()
latest_flight_each_month
## # A tibble: 35 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     7     2359           2359         0      506            437
##  2  2013     1    12     2359           2359         0      429            437
##  3  2013     1    13     2359           2130       149      435            218
##  4  2013     1    18     2359           2359         0      439            437
##  5  2013     1    19     2359           2359         0      437            444
##  6  2013     1    25     2359           2359         0      500            444
##  7  2013    10    30     2400           2359         1      327            337
##  8  2013    11    27     2400           2359         1      515            445
##  9  2013    12     5     2400           2359         1      427            440
## 10  2013    12     9     2400           2359         1      432            440
## # ℹ 25 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Question 4
data <- read_csv("multipleChoiceResponses1.csv")
## Rows: 16716 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): LearningPlatformUsefulnessArxiv, LearningPlatformUsefulnessBlogs, ...
## dbl  (1): Age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
usefulness_data <- data %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  gather(key = "learning_platform", value = "usefulness") %>%
  filter(!is.na(usefulness)) %>%
  mutate(learning_platform = str_replace(learning_platform, "LearningPlatformUsefulness", ""))
count_data <- usefulness_data %>%
  count(learning_platform, usefulness)
count_data
## # A tibble: 54 × 3
##    learning_platform usefulness          n
##    <chr>             <chr>           <int>
##  1 Arxiv             Not Useful         37
##  2 Arxiv             Somewhat useful  1038
##  3 Arxiv             Very useful      1316
##  4 Blogs             Not Useful         45
##  5 Blogs             Somewhat useful  2406
##  6 Blogs             Very useful      2314
##  7 College           Not Useful        101
##  8 College           Somewhat useful  1405
##  9 College           Very useful      1853
## 10 Communities       Not Useful         16
## # ℹ 44 more rows
# Question 5
usefulness_data_filtered <- usefulness_data %>%
  filter(usefulness != "Not Useful")
total_responses <- usefulness_data %>%
  group_by(learning_platform) %>%
  summarise(tot = n())
useful_counts <- usefulness_data_filtered %>%
  count(learning_platform)
joined_data <- left_join(useful_counts, total_responses, by = "learning_platform")
perc_usefulness <- joined_data %>%
  mutate(perc_usefulness = n / tot)
perc_usefulness
## # A tibble: 18 × 4
##    learning_platform     n   tot perc_usefulness
##    <chr>             <int> <int>           <dbl>
##  1 Arxiv              2354  2391           0.985
##  2 Blogs              4720  4765           0.991
##  3 College            3258  3359           0.970
##  4 Communities        1126  1142           0.986
##  5 Company             940   981           0.958
##  6 Conferences        2063  2182           0.945
##  7 Courses            5945  5992           0.992
##  8 Documentation      2279  2321           0.982
##  9 Friends            1530  1581           0.968
## 10 Kaggle             6527  6583           0.991
## 11 Newsletters        1033  1089           0.949
## 12 Podcasts           1090  1214           0.898
## 13 Projects           4755  4794           0.992
## 14 SO                 5576  5640           0.989
## 15 Textbook           4112  4181           0.983
## 16 TradeBook           324   333           0.973
## 17 Tutoring           1394  1426           0.978
## 18 YouTube            5125  5229           0.980
# Question 6
perc_usefulness <- perc_usefulness %>%
  mutate(learning_platform = fct_reorder(learning_platform, perc_usefulness)) %>%
  mutate(learning_platform = fct_rev(learning_platform))
perc_usefulness <- perc_usefulness %>%
  mutate(learning_platform = fct_reorder(learning_platform, perc_usefulness, .desc = TRUE))  
ggplot(perc_usefulness, aes(x = perc_usefulness*100, y = learning_platform)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  coord_flip() +  
  labs(x = "percentage", y = "learning platform")