## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Load and inspect the flights data

## # A tibble: 6 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Calculate the number of late arrivals each month

# Per month, count the flights whose arr_delay is greater than 5.
# Rows with a missing arr_delay do not contribute to the count (na.rm = TRUE).
flights %>%
  group_by(month) %>%
  summarise(late_arrivals = sum(arr_delay > 5, na.rm = TRUE))
## # A tibble: 12 × 2
##    month late_arrivals
##    <int>         <int>
##  1     1          8988
##  2     2          8119
##  3     3          9033
##  4     4         10544
##  5     5          8490
##  6     6         10739
##  7     7         11518
##  8     8          9649
##  9     9          5347
## 10    10          7628
## 11    11          7485
## 12    12         12291

## Calculate the percentage of traffic by carrier and month

## Find the latest departure for each month

# For every month keep the one row with the largest dep_time. When several
# flights share that value the first of them is kept, and rows with a missing
# dep_time are never selected. The result stays grouped by month.
flights %>%
  group_by(month) %>%
  slice_max(dep_time, n = 1, with_ties = FALSE, na_rm = TRUE)
## # A tibble: 12 × 19
## # Groups:   month [12]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     7     2359           2359         0      506            437
##  2  2013     2     7     2400           2359         1      432            436
##  3  2013     3    15     2400           2359         1      324            338
##  4  2013     4     2     2400           2359         1      339            343
##  5  2013     5    21     2400           2359         1      339            350
##  6  2013     6    17     2400           2145       135      102           2315
##  7  2013     7     7     2400           1950       250      107           2130
##  8  2013     8    10     2400           2245        75      110              1
##  9  2013     9     2     2400           2359         1      411            340
## 10  2013    10    30     2400           2359         1      327            337
## 11  2013    11    27     2400           2359         1      515            445
## 12  2013    12     5     2400           2359         1      427            440
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

## Load and inspect the dataset

library(readr)

# Read the survey responses from the CSV file in the working directory;
# column types are guessed by readr.
multipleChoiceResponses <- read_csv(file = "multipleChoiceResponses1.csv")
## Rows: 16716 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): LearningPlatformUsefulnessArxiv, LearningPlatformUsefulnessBlogs, ...
## dbl  (1): Age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Reshape the LearningPlatformUsefulness* columns into long format: one row
# per non-missing answer, with the platform name (column name minus the
# "LearningPlatformUsefulness" prefix) in `learning_platform` and the answer
# in `usefulness`. Row order is not significant; later steps only count rows.
usefulness_by_platform <- multipleChoiceResponses %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    values_drop_na = TRUE
  )

# Number of answers for every (learning_platform, usefulness) combination.
# count() returns an ungrouped tibble with the tally in column `n`.
usefulness_by_platform %>%
  count(learning_platform, usefulness)
## # A tibble: 54 × 3
##    learning_platform usefulness          n
##    <chr>             <chr>           <int>
##  1 Arxiv             Not Useful         37
##  2 Arxiv             Somewhat useful  1038
##  3 Arxiv             Very useful      1316
##  4 Blogs             Not Useful         45
##  5 Blogs             Somewhat useful  2406
##  6 Blogs             Very useful      2314
##  7 College           Not Useful        101
##  8 College           Somewhat useful  1405
##  9 College           Very useful      1853
## 10 Communities       Not Useful         16
## # ℹ 44 more rows
# Calculate the number of total responses by learning platform
# (one row per platform, tally stored in `tot`).

total_usefulness_by_platform <- usefulness_by_platform %>%
  count(learning_platform, name = "tot")

# Calculate the number of useful responses by learning platform.
# A response counts as useful when its text does not contain "Not Useful"
# (case-insensitive). Summing the condition per platform, instead of filtering
# first, keeps a platform with no useful responses in the result with a
# count of 0 rather than dropping it.

usefulness_count <- usefulness_by_platform %>%
  group_by(learning_platform) %>%
  summarise(
    count = sum(!grepl("Not Useful", usefulness, ignore.case = TRUE))
  )

# Calculate the share of useful responses for each learning platform:
# join the useful counts to the totals and store count / tot, rounded to
# three decimals, in `perc_usefulness` (a proportion between 0 and 1).

perc_usefulness <- left_join(
  usefulness_count,
  total_usefulness_by_platform,
  by = "learning_platform"
) %>%
  mutate(perc_usefulness = round(count / tot, digits = 3))

# Print the result (top-level expression, so the tibble is auto-printed)

perc_usefulness
## # A tibble: 18 × 4
##    learning_platform count   tot perc_usefulness
##    <chr>             <int> <int>           <dbl>
##  1 Arxiv              2354  2391           0.985
##  2 Blogs              4720  4765           0.991
##  3 College            3258  3359           0.97 
##  4 Communities        1126  1142           0.986
##  5 Company             940   981           0.958
##  6 Conferences        2063  2182           0.945
##  7 Courses            5945  5992           0.992
##  8 Documentation      2279  2321           0.982
##  9 Friends            1530  1581           0.968
## 10 Kaggle             6527  6583           0.991
## 11 Newsletters        1033  1089           0.949
## 12 Podcasts           1090  1214           0.898
## 13 Projects           4755  4794           0.992
## 14 SO                 5576  5640           0.989
## 15 Textbook           4112  4181           0.983
## 16 TradeBook           324   333           0.973
## 17 Tutoring           1394  1426           0.978
## 18 YouTube            5125  5229           0.98