library('tidyr')
library('readr')
library('dplyr')
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('ggplot2')
library('lubridate')
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library('tidyverse')
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.1
## ✔ purrr   1.0.2     ✔ tibble  3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.0
## ✔ purrr   1.0.2     ✔ tibble  3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library('nycflights13')
# Flights whose arrival delay exceeds five minutes (rows with a missing
# arr_delay are dropped by filter()).
late_flights <- filter(flights, arr_delay > 5)

# Tally those late arrivals per month; the tally column is named `count`.
monthly_late_flights <- count(late_flights, month, name = "count")
print(monthly_late_flights)
## # A tibble: 12 × 2
##    month count
##    <int> <int>
##  1     1  8988
##  2     2  8119
##  3     3  9033
##  4     4 10544
##  5     5  8490
##  6     6 10739
##  7     7 11518
##  8     8  9649
##  9     9  5347
## 10    10  7628
## 11    11  7485
## 12    12 12291
# Share of each month's flights operated by each carrier, reshaped to one row
# per carrier and one column per month.

# Total number of flights in each month (denominator of the share).
total_flights_per_month <- flights %>%
  group_by(month) %>%
  summarise(total = n())

# Number of flights per carrier and month (numerator of the share).
# `.groups = "drop_last"` spells out the default regrouping, so the result is
# still grouped by carrier but summarise() no longer emits its
# "has grouped output" message.
carrier_flights_per_month <- flights %>%
  group_by(carrier, month) %>%
  summarise(count = n(), .groups = "drop_last")

# Attach the monthly totals and format the share as a percentage string
# rounded to two decimals, e.g. "5.83%".
carrier_percentage <- carrier_flights_per_month %>%
  left_join(total_flights_per_month, by = "month") %>%
  mutate(percentage = paste0(round((count / total) * 100, 2), "%"))

# Wide layout: carriers in rows, months in columns. pivot_wider() replaces the
# superseded spread(); names_sort = TRUE keeps the month columns in numeric
# order regardless of row order. Carrier/month pairs with no flights are NA.
spread_data <- carrier_percentage %>%
  select(-count, -total) %>%
  pivot_wider(
    names_from = month,
    values_from = percentage,
    names_sort = TRUE
  )

print(spread_data)
## # A tibble: 16 × 13
## # Groups:   carrier [16]
##    carrier `1`    `2`    `3`    `4`    `5`   `6`   `7`   `8`   `9`   `10`  `11` 
##    <chr>   <chr>  <chr>  <chr>  <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 9E      5.83%  5.85%  5.64%  5.33%  5.08% 5.09% 5.08% 4.96% 5.58% 5.79% 5.85%
##  2 AA      10.35% 10.09% 9.67%  9.61%  9.73% 9.76% 9.79% 9.74% 9.48% 9.4%  9.45%
##  3 AS      0.23%  0.22%  0.22%  0.21%  0.22% 0.21% 0.21% 0.21% 0.22% 0.21% 0.19%
##  4 B6      16.39% 16.44% 16.55% 15.94% 15.8… 16.3… 16.9… 16.8… 15.5… 15.1% 15.7…
##  5 DL      13.66% 13.8%  14.53% 14.44% 14.1… 14.6… 14.4… 14.7… 14.0… 14.1… 14.1…
##  6 EV      15.45% 15.34% 16.39% 16.1%  16.7… 15.7… 15.7… 15.5… 17.1… 16.9… 16.4%
##  7 F9      0.22%  0.2%   0.2%   0.2%   0.2%  0.19% 0.2%  0.19% 0.21% 0.2%  0.22%
##  8 FL      1.21%  1.19%  1.1%   1.1%   1.13% 0.89% 0.89% 0.9%  0.92% 0.82% 0.74%
##  9 HA      0.11%  0.11%  0.11%  0.11%  0.11% 0.11% 0.11% 0.11% 0.09% 0.07% 0.09%
## 10 MQ      8.41%  8.19%  7.82%  7.8%   7.93% 7.71% 7.68% 7.72% 8%    7.71% 7.54%
## 11 OO      0%     <NA>   <NA>   <NA>   <NA>  0.01% <NA>  0.01% 0.07% <NA>  0.02%
## 12 UA      17.17% 17.42% 17.24% 17.82% 17.2… 17.6… 17.2… 17.4… 17.0… 17.5… 17.8%
## 13 US      5.93%  6.22%  5.97%  6.1%   6.2%  6.15% 6.07% 6.07% 6.16% 6.39% 6.23%
## 14 VX      1.17%  1.09%  1.05%  1.64%  1.72% 1.7%  1.66% 1.67% 1.64% 1.63% 1.65%
## 15 WN      3.69%  3.65%  3.46%  3.46%  3.49% 3.64% 3.66% 3.57% 3.66% 3.78% 3.79%
## 16 YV      0.17%  0.19%  0.06%  0.13%  0.17% 0.17% 0.28% 0.22% 0.15% 0.23% 0.18%
## # ℹ 1 more variable: `12` <chr>
# Expose the departure delay under the name `delay`.
# NOTE(review): this assignment shadows the nycflights13 `flights` dataset
# with a local copy that carries the extra column; it is kept so that any
# later code relying on `flights$delay` continues to work.
flights <- flights %>%
  mutate(delay = dep_delay)

# For every month, keep the single flight with the largest departure delay.
# slice_max() replaces the filter(delay == max(...)) + slice(1) pair:
#   * with_ties = FALSE returns exactly one row per month even when several
#     flights share the maximum;
#   * na_rm = TRUE ignores missing delays, so a month containing only NA
#     delays yields no row instead of a max() warning.
# The result stays grouped by month.
most_delayed_flights <- flights %>%
  group_by(month) %>%
  slice_max(delay, n = 1, with_ties = FALSE, na_rm = TRUE)

print(most_delayed_flights)
## # A tibble: 12 × 20
## # Groups:   month [12]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     2    10     2243            830       853      100           1106
##  3  2013     3    17     2321            810       911      135           1020
##  4  2013     4    10     1100           1900       960     1342           2211
##  5  2013     5     3     1133           2055       878     1250           2215
##  6  2013     6    15     1432           1935      1137     1607           2120
##  7  2013     7    22      845           1600      1005     1044           1815
##  8  2013     8     8     2334           1454       520      120           1710
##  9  2013     9    20     1139           1845      1014     1457           2210
## 10  2013    10    14     2042            900       702     2255           1127
## 11  2013    11     3      603           1645       798      829           1913
## 12  2013    12     5      756           1700       896     1058           2020
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, delay <dbl>
# QUESTION 2
library(tidyr)

# Read the multiple-choice responses CSV into a tibble. The path is relative
# to the current working directory and column types are guessed by readr.
respons <- readr::read_csv(file = "multipleChoiceResponses1.csv")
## Rows: 16716 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): LearningPlatformUsefulnessArxiv, LearningPlatformUsefulnessBlogs, ...
## dbl  (1): Age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Count how often each usefulness rating was given for each learning platform.
# pivot_longer() replaces the superseded gather():
#   * names_prefix strips the shared "LearningPlatformUsefulness" prefix, so
#     the separate gsub() step is no longer needed;
#   * values_drop_na = TRUE discards unanswered cells, as na.rm = TRUE did.
# count() sorts its output, so the result does not depend on the row order
# produced by the reshape.
result <- respons %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    values_drop_na = TRUE
  ) %>%
  count(learning_platform, usefulness)

print(result)
## # A tibble: 54 × 3
##    learning_platform usefulness          n
##    <chr>             <chr>           <int>
##  1 Arxiv             Not Useful         37
##  2 Arxiv             Somewhat useful  1038
##  3 Arxiv             Very useful      1316
##  4 Blogs             Not Useful         45
##  5 Blogs             Somewhat useful  2406
##  6 Blogs             Very useful      2314
##  7 College           Not Useful        101
##  8 College           Somewhat useful  1405
##  9 College           Very useful      1853
## 10 Communities       Not Useful         16
## # ℹ 44 more rows
# Per learning platform: how many respondents rated it (`count`), how many of
# those rated it anything other than "Not Useful" (`tot`), and the resulting
# fraction (`perc_usefulness`, between 0 and 1).
#
# Both tallies are computed in a single pass over the long-format data. The
# earlier version built the two counts separately and joined them, falling
# back to `count` when a platform had no "at least useful" answers -- which
# reported such a platform as 100% useful instead of 0%. Summing the logical
# condition directly yields 0 in that case.
atleastuseful <- respons %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    values_drop_na = TRUE # unanswered cells are not counted as ratings
  ) %>%
  group_by(learning_platform) %>%
  summarise(
    tot = sum(usefulness != "Not Useful"),
    count = n(),
    perc_usefulness = tot / count
  )

atleastuseful
## # A tibble: 18 × 4
##    learning_platform   tot count perc_usefulness
##    <chr>             <int> <int>           <dbl>
##  1 Arxiv              2354  2391           0.985
##  2 Blogs              4720  4765           0.991
##  3 College            3258  3359           0.970
##  4 Communities        1126  1142           0.986
##  5 Company             940   981           0.958
##  6 Conferences        2063  2182           0.945
##  7 Courses            5945  5992           0.992
##  8 Documentation      2279  2321           0.982
##  9 Friends            1530  1581           0.968
## 10 Kaggle             6527  6583           0.991
## 11 Newsletters        1033  1089           0.949
## 12 Podcasts           1090  1214           0.898
## 13 Projects           4755  4794           0.992
## 14 SO                 5576  5640           0.989
## 15 Textbook           4112  4181           0.983
## 16 TradeBook           324   333           0.973
## 17 Tutoring           1394  1426           0.978
## 18 YouTube            5125  5229           0.980
# Lollipop chart of the share of respondents who found each learning platform
# at least somewhat useful. Platforms are ordered by descending share; each
# layer carries its own aesthetics, and coord_flip() swaps the axes so the
# platforms run along the horizontal axis.
atleastuseful %>%
  mutate(
    learning_platform = fct_reorder(
      learning_platform,
      perc_usefulness,
      .desc = TRUE
    ),
    perc_usefulness = as.numeric(perc_usefulness)
  ) %>%
  ggplot() +
  # Stem of the lollipop: from 0 up to the platform's share.
  geom_segment(
    aes(
      x = 0,
      xend = perc_usefulness,
      y = learning_platform,
      yend = learning_platform
    ),
    color = "pink"
  ) +
  # Head of the lollipop at the platform's share.
  geom_point(
    aes(x = perc_usefulness, y = learning_platform),
    color = "pink",
    size = 3
  ) +
  scale_x_continuous(labels = scales::percent_format()) +
  coord_flip() +
  labs(
    title = "Percentage of Usefulness by Learning Platform",
    x = "Percent findings at least somewhat useful",
    y = "Learning platform"
  )