Libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(nycflights13)

Use the flights data from package nycflights13 to answer the following questions

How many flights arrived late each month? The late arrival is the flight arriving more than 5 minutes after its scheduled arrival time.

lateflights <- flights %>% 
  filter(arr_delay>5) %>% 
  group_by(month) %>% 
  summarise(lateflights = n()) %>% 
  ungroup()

lateflights
## # A tibble: 12 × 2
##    month lateflights
##    <int>       <int>
##  1     1        8988
##  2     2        8119
##  3     3        9033
##  4     4       10544
##  5     5        8490
##  6     6       10739
##  7     7       11518
##  8     8        9649
##  9     9        5347
## 10    10        7628
## 11    11        7485
## 12    12       12291

What percentage of traffic did each carrier represent, by month?

#Calculate total number of flights by carrier and month
total_flights_per_carrier <- flights %>%
  group_by(month, carrier) %>%
  summarise(total_flights_by_carrier = n())
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
#Calculate total number of flights per month
total_flights_per_month <- flights %>%
  group_by(month) %>%
  summarise(total_flights_per_month = n())

#Join the two dataset and calculate the percentage
percentage_traffic_per_carrier <- total_flights_per_carrier %>%
  left_join(total_flights_per_month, by = "month") %>%
  mutate(percentage_traffic = (total_flights_by_carrier / total_flights_per_month)) %>%
  mutate(percentage_traffic = scales::percent(percentage_traffic)) %>% 
  select(month, carrier, percentage_traffic)

#Transpose and print the result
percentage_traffic_per_carrier %>% spread(key = month, value = percentage_traffic)
## # A tibble: 16 × 13
##    carrier `1`     `2`     `3`   `4`   `5`   `6`   `7`   `8`   `9`   `10`  `11` 
##    <chr>   <chr>   <chr>   <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 9E      5.825%  5.8475% 5.64… 5.33… 5.07… 5.08… 5.07… 4.96… 5.58… 5.79… 5.84…
##  2 AA      10.347% 10.087… 9.66… 9.60… 9.73… 9.76… 9.79… 9.73… 9.47… 9.39… 9.45…
##  3 AS      0.230%  0.2244% 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.19…
##  4 B6      16.394% 16.444… 16.5… 15.9… 15.8… 16.3… 16.9… 16.8… 15.5… 15.0… 15.7…
##  5 DL      13.665% 13.803… 14.5… 14.4… 14.1… 14.6… 14.4… 14.7… 14.0… 14.1… 14.1…
##  6 EV      15.446% 15.338… 16.3… 16.1… 16.7… 15.7… 15.7… 15.5… 17.1… 16.9… 16.3…
##  7 F9      0.218%  0.1964% 0.19… 0.20… 0.20… 0.19… 0.19… 0.18… 0.21… 0.19… 0.22…
##  8 FL      1.215%  1.1863% 1.09… 1.09… 1.12… 0.89… 0.89… 0.89… 0.92… 0.81… 0.74…
##  9 HA      0.115%  0.1122% 0.10… 0.10… 0.10… 0.10… 0.10… 0.10… 0.09… 0.07… 0.09…
## 10 MQ      8.410%  8.1921% 7.82… 7.80… 7.93… 7.71… 7.68… 7.71… 8.00… 7.71… 7.54…
## 11 OO      0.004%  <NA>    <NA>  <NA>  <NA>  0.00… <NA>  0.01… 0.07… <NA>  0.01…
## 12 UA      17.172% 17.418… 17.2… 17.8… 17.2… 17.6… 17.2… 17.4… 17.0… 17.5… 17.8…
## 13 US      5.932%  6.2202% 5.96… 6.09… 6.19… 6.14… 6.07… 6.06… 6.15… 6.39… 6.23…
## 14 VX      1.170%  1.0861% 1.05… 1.64… 1.72… 1.70… 1.66… 1.66… 1.64… 1.63… 1.65…
## 15 WN      3.688%  3.6512% 3.46… 3.45… 3.49… 3.64… 3.65… 3.57… 3.66… 3.77… 3.78…
## 16 YV      0.170%  0.1924% 0.06… 0.13… 0.17… 0.17… 0.27… 0.22… 0.15… 0.22… 0.18…
## # ℹ 1 more variable: `12` <chr>

What was the latest flight to depart each month?

flights %>%
  group_by(month) %>%
  filter(dep_delay == max(dep_delay, na.rm = TRUE)) %>% 
  arrange(-desc(month))
## # A tibble: 12 × 19
## # Groups:   month [12]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     2    10     2243            830       853      100           1106
##  3  2013     3    17     2321            810       911      135           1020
##  4  2013     4    10     1100           1900       960     1342           2211
##  5  2013     5     3     1133           2055       878     1250           2215
##  6  2013     6    15     1432           1935      1137     1607           2120
##  7  2013     7    22      845           1600      1005     1044           1815
##  8  2013     8     8     2334           1454       520      120           1710
##  9  2013     9    20     1139           1845      1014     1457           2210
## 10  2013    10    14     2042            900       702     2255           1127
## 11  2013    11     3      603           1645       798      829           1913
## 12  2013    12     5      756           1700       896     1058           2020
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Use the multipleChoiceResponses1.cvs to answer the following questions:

Import dataset

library(readr)
multipleChoiceResponses <- read_csv("C:/Users/Admin/OneDrive - 亞洲大學[Asia University]/Financial Database Mana & Application/Data/multipleChoiceResponses1.csv")
## Rows: 16716 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): LearningPlatformUsefulnessArxiv, LearningPlatformUsefulnessBlogs, ...
## dbl  (1): Age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Count the usefulness by learning platform

#Select only the columns with LearningPlatformUsefulness and remove rows where usefulness is NA.
#Remove LearningPlatformUsefulness from each string in learning_platform
usefulness_by_platform <- multipleChoiceResponses %>% select(starts_with("LearningPlatformUsefulness")) %>%  
  set_names(names(.) %>% str_replace("LearningPlatformUsefulness", "")) %>% 
  gather(key = "learning_platform", value = "usefulness",convert = FALSE, na.rm = TRUE)

#Use count() to change the dataset to have one row per learning_platform usefulness pair with a cilmn that is the number of entries with that pairing
usefulness_by_platform %>% group_by(learning_platform, usefulness) %>% 
  summarise(n = n()) %>% 
  ungroup()
## `summarise()` has grouped output by 'learning_platform'. You can override using
## the `.groups` argument.
## # A tibble: 54 × 3
##    learning_platform usefulness          n
##    <chr>             <chr>           <int>
##  1 Arxiv             Not Useful         37
##  2 Arxiv             Somewhat useful  1038
##  3 Arxiv             Very useful      1316
##  4 Blogs             Not Useful         45
##  5 Blogs             Somewhat useful  2406
##  6 Blogs             Very useful      2314
##  7 College           Not Useful        101
##  8 College           Somewhat useful  1405
##  9 College           Very useful      1853
## 10 Communities       Not Useful         16
## # ℹ 44 more rows

Compute the number of total reponses and number of responses which are at least useful

#Calculate the number of total responses by learning platform
total_usefulness_by_platform <- usefulness_by_platform %>% group_by(learning_platform) %>% summarise(tot = n())

#Calculate the number of useful responses by learning platform
usefulness_count <- usefulness_by_platform %>% filter(!grepl("Not Useful",usefulness,ignore.case = TRUE)) %>% 
  group_by(learning_platform) %>% summarise(count = n())

#Calculate the percentage of usefulness for each learning platform
perc_usefulness <- usefulness_count %>% left_join(total_usefulness_by_platform, by = "learning_platform") %>% 
  mutate(perc_usefulness = count/tot) %>% 
  mutate(perc_usefulness = round(perc_usefulness, digits = 3))

#Print the result
perc_usefulness
## # A tibble: 18 × 4
##    learning_platform count   tot perc_usefulness
##    <chr>             <int> <int>           <dbl>
##  1 Arxiv              2354  2391           0.985
##  2 Blogs              4720  4765           0.991
##  3 College            3258  3359           0.97 
##  4 Communities        1126  1142           0.986
##  5 Company             940   981           0.958
##  6 Conferences        2063  2182           0.945
##  7 Courses            5945  5992           0.992
##  8 Documentation      2279  2321           0.982
##  9 Friends            1530  1581           0.968
## 10 Kaggle             6527  6583           0.991
## 11 Newsletters        1033  1089           0.949
## 12 Podcasts           1090  1214           0.898
## 13 Projects           4755  4794           0.992
## 14 SO                 5576  5640           0.989
## 15 Textbook           4112  4181           0.983
## 16 TradeBook           324   333           0.973
## 17 Tutoring           1394  1426           0.978
## 18 YouTube            5125  5229           0.98

Visualization

#Change platforms into factors
perc_usefulness_fct <- perc_usefulness %>% 
  mutate(learning_platform = fct(learning_platform) %>% 
           fct_reorder(perc_usefulness) %>% 
           fct_rev() %>% 
           fct_relevel("Courses", after = 0))

#Plot the percentage of usefulness of each platform
perc_usefulness_fct %>% ggplot(aes(x = learning_platform, y = perc_usefulness))+
  geom_segment(aes(xend = learning_platform, yend=0))+
  geom_point()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  labs(
    x = "Learning Platform",
    y = "Percent finding at least somewhat useful"
  )+
  scale_y_continuous(labels = scales::percent_format(scale = 100, suffix = "%"))