# Load libraries
library(tidyverse) 
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate) 
library(nycflights13)
## Part 1 ----

# Question 1: How many flights arrived late each month?
# A flight is flagged late when its arrival delay is greater than 5; rows
# whose arr_delay is NA get is_late = NA and do not contribute to the tally.
flights <- mutate(nycflights13::flights, is_late = arr_delay > 5)
# Weighting the count by the logical flag adds up the TRUE values per month.
late_flights_by_month <- flights %>%
  count(month, wt = is_late, name = "lateflights")
print(late_flights_by_month)
## # A tibble: 12 × 2
##    month lateflights
##    <int>       <int>
##  1     1        8988
##  2     2        8119
##  3     3        9033
##  4     4       10544
##  5     5        8490
##  6     6       10739
##  7     7       11518
##  8     8        9649
##  9     9        5347
## 10    10        7628
## 11    11        7485
## 12    12       12291
# Question 2: What percentage of traffic did each carrier represent, by month?
# Each carrier's flight count is divided by the number of flights in the SAME
# month, so the shares within a month add up to 100. Dividing by nrow(flights)
# instead yields each carrier-month's share of the whole year's traffic.
# NOTE: the printed table recorded below in this file was produced with the
# whole-year denominator and predates this correction.
percentage_traffic_by_carrier <- flights %>%
  count(month, carrier, name = "n_flights") %>%
  group_by(month) %>%
  mutate(percentage_traffic = n_flights / sum(n_flights) * 100) %>%
  ungroup() %>%
  select(month, carrier, percentage_traffic)
# One row per month and one column per carrier; a carrier with no flights in
# a given month is filled with NA by pivot_wider().
percentage_traffic_pivoted <- percentage_traffic_by_carrier %>%
  pivot_wider(names_from = carrier, values_from = percentage_traffic)
print(percentage_traffic_pivoted)
## # A tibble: 12 × 17
## # Groups:   month [12]
##    month  `9E`    AA     AS    B6    DL    EV     F9     FL      HA    MQ
##    <int> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>   <dbl> <dbl>
##  1     1 0.467 0.830 0.0184  1.31  1.10  1.24 0.0175 0.0974 0.00920 0.674
##  2     2 0.433 0.747 0.0166  1.22  1.02  1.14 0.0145 0.0879 0.00831 0.607
##  3     3 0.483 0.828 0.0184  1.42  1.24  1.40 0.0169 0.0938 0.00920 0.670
##  4     4 0.449 0.808 0.0178  1.34  1.22  1.35 0.0169 0.0923 0.00891 0.657
##  5     5 0.434 0.832 0.0184  1.36  1.21  1.43 0.0172 0.0965 0.00920 0.678
##  6     6 0.427 0.819 0.0178  1.37  1.23  1.32 0.0163 0.0748 0.00891 0.647
##  7     7 0.444 0.856 0.0184  1.48  1.26  1.38 0.0172 0.0781 0.00920 0.671
##  8     8 0.432 0.848 0.0184  1.47  1.28  1.35 0.0163 0.0781 0.00920 0.672
##  9     9 0.457 0.776 0.0178  1.27  1.15  1.40 0.0172 0.0757 0.00742 0.655
## 10    10 0.497 0.806 0.0184  1.29  1.22  1.46 0.0169 0.0701 0.00624 0.662
## 11    11 0.474 0.765 0.0154  1.27  1.14  1.33 0.0181 0.0600 0.00742 0.610
## 12    12 0.485 0.803 0.0160  1.41  1.22  1.28 0.0181 0.0632 0.00831 0.635
## # ℹ 6 more variables: OO <dbl>, UA <dbl>, US <dbl>, VX <dbl>, WN <dbl>,
## #   YV <dbl>
# Question 3: What was the latest flight to depart each month?
# dep_time is an HHMM-style integer (nycflights13 documents it as HHMM or
# HMM), so split it into hours and minutes arithmetically and build the
# timestamp with make_datetime(). Pasting it into a string for ymd_hm()
# failed to parse 106157 rows -- presumably the values with fewer than four
# digits plus the missing ones. Rows with a missing dep_time stay NA.
flights <- flights %>%
  mutate(
    dep_time = make_datetime(
      year, month, day,
      hour = dep_time %/% 100L,
      min = dep_time %% 100L
    )
  )
# Within each month keep the row(s) whose departure timestamp equals the
# month's maximum; ties are all retained, so a month can yield several rows.
latest_departure_by_month <- flights %>%
  group_by(month) %>%
  filter(dep_time == max(dep_time, na.rm = TRUE))
# Keep the original flight columns (this leaves out the is_late helper flag).
latest_departure_by_month <- select(
  latest_departure_by_month,
  year, month, day, dep_time, sched_dep_time, dep_delay,
  arr_time, sched_arr_time, arr_delay, carrier, flight, tailnum,
  origin, dest, air_time, distance, hour, minute, time_hour
)
print(latest_departure_by_month)
## # A tibble: 14 × 19
## # Groups:   month [12]
##     year month   day dep_time            sched_dep_time dep_delay arr_time
##    <int> <int> <int> <dttm>                       <int>     <dbl>    <int>
##  1  2013     1    31 2013-01-31 23:54:00           2055       179      144
##  2  2013    10    31 2013-10-31 23:57:00           2359        -2      345
##  3  2013    11    30 2013-11-30 23:54:00           2359        -5      430
##  4  2013    12    31 2013-12-31 23:56:00           2359        -3      436
##  5  2013     2    28 2013-02-28 23:59:00           2359         0      443
##  6  2013     3    31 2013-03-31 23:58:00           2359        -1      332
##  7  2013     4    30 2013-04-30 23:51:00           2359        -8      345
##  8  2013     5    31 2013-05-31 23:55:00           2359        -4      338
##  9  2013     5    31 2013-05-31 23:55:00           2359        -4      335
## 10  2013     6    30 2013-06-30 23:59:00           2110       169      118
## 11  2013     7    31 2013-07-31 23:52:00           2245        67       49
## 12  2013     8    31 2013-08-31 23:59:00           2359         0      345
## 13  2013     8    31 2013-08-31 23:59:00           2359         0      346
## 14  2013     9    30 2013-09-30 23:49:00           2359       -10      325
## # ℹ 12 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
## Part 2 ----

library(readr)

# Load the survey responses. The relative path is resolved against the
# working directory, and readr guesses the column types from the data.
multipleChoiceResponses <- read_csv(file = "multipleChoiceResponses1.csv")
## Rows: 16716 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): LearningPlatformUsefulnessArxiv, LearningPlatformUsefulnessBlogs, ...
## dbl  (1): Age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Reshape the LearningPlatformUsefulness* columns into long form: one row per
# non-missing answer, with the platform name (column name minus the common
# prefix) in `learning_platform` and the answer text in `usefulness`.
# pivot_longer() replaces the superseded gather(); rows now come out
# respondent-by-respondent rather than column-by-column, which does not
# matter for the per-platform counts computed from this table.
usefulness_by_platform <- multipleChoiceResponses %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  rename_with(~ str_remove(.x, "LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    values_to = "usefulness",
    values_drop_na = TRUE
  )

# Tally how many responses fall into each answer for each platform. count()
# returns an ungrouped tibble, so no separate ungroup() step is needed and no
# grouping message is emitted.
usefulness_by_platform %>%
  count(learning_platform, usefulness)
## `summarise()` has grouped output by 'learning_platform'. You can override using
## the `.groups` argument.
## # A tibble: 54 × 3
##    learning_platform usefulness          n
##    <chr>             <chr>           <int>
##  1 Arxiv             Not Useful         37
##  2 Arxiv             Somewhat useful  1038
##  3 Arxiv             Very useful      1316
##  4 Blogs             Not Useful         45
##  5 Blogs             Somewhat useful  2406
##  6 Blogs             Very useful      2314
##  7 College           Not Useful        101
##  8 College           Somewhat useful  1405
##  9 College           Very useful      1853
## 10 Communities       Not Useful         16
## # ℹ 44 more rows
# Number of responses of any kind recorded for each learning platform.

total_usefulness_by_platform <- usefulness_by_platform %>%
  count(learning_platform, name = "tot")

# Number of responses per platform that are anything other than "Not Useful"
# (the match is case-insensitive).

usefulness_count <- usefulness_by_platform %>%
  filter(!grepl("Not Useful", usefulness, ignore.case = TRUE)) %>%
  count(learning_platform, name = "count")

# Share of useful responses per platform, rounded to three decimals. The join
# starts from usefulness_count, so a platform with no useful responses at all
# would be absent from the result.

perc_usefulness <- left_join(
  usefulness_count,
  total_usefulness_by_platform,
  by = "learning_platform"
) %>%
  mutate(perc_usefulness = round(count / tot, digits = 3))

# Show the resulting table.

perc_usefulness
## # A tibble: 18 × 4
##    learning_platform count   tot perc_usefulness
##    <chr>             <int> <int>           <dbl>
##  1 Arxiv              2354  2391           0.985
##  2 Blogs              4720  4765           0.991
##  3 College            3258  3359           0.97 
##  4 Communities        1126  1142           0.986
##  5 Company             940   981           0.958
##  6 Conferences        2063  2182           0.945
##  7 Courses            5945  5992           0.992
##  8 Documentation      2279  2321           0.982
##  9 Friends            1530  1581           0.968
## 10 Kaggle             6527  6583           0.991
## 11 Newsletters        1033  1089           0.949
## 12 Podcasts           1090  1214           0.898
## 13 Projects           4755  4794           0.992
## 14 SO                 5576  5640           0.989
## 15 Textbook           4112  4181           0.983
## 16 TradeBook           324   333           0.973
## 17 Tutoring           1394  1426           0.978
## 18 YouTube            5125  5229           0.98
# Turn the platform names into a factor whose levels run from the most to the
# least useful platform.
# The ordering uses the unrounded ratio count / tot. perc_usefulness is
# rounded to three decimals, which creates ties (Courses and Projects both
# show 0.992); ordering on the rounded value needed a hard-coded
# fct_relevel("Courses") to put the true top platform first.

perc_usefulness_fct <- perc_usefulness %>%
  mutate(
    learning_platform = fct(learning_platform) %>%
      fct_reorder(count / tot, .desc = TRUE)
  )

# Lollipop chart of the share of useful responses per learning platform: a
# vertical segment from 0 up to each value, with a point drawn on top.

ggplot(
  data = perc_usefulness_fct,
  mapping = aes(x = learning_platform, y = perc_usefulness)
) +
  geom_segment(mapping = aes(xend = learning_platform, yend = 0)) +
  geom_point() +
  # Proportions are shown as percentages on the y axis.
  scale_y_continuous(
    labels = scales::percent_format(scale = 100, suffix = "%")
  ) +
  labs(
    x = "Learning Platform",
    y = "Percent finding at least somewhat useful"
  ) +
  # Slant the platform names so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Per-column summary statistics of the built-in `cars` data set.
# NOTE(review): looks like a leftover from the default R Markdown template,
# unrelated to the flight and survey analysis above -- confirm before removing.
summary(object = cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00