# Load libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(nycflights13)
# Question 1
data("flights")
late_flights_by_month <- flights %>%
mutate(late = arr_delay > 5) %>%
group_by(month) %>%
summarise(lateflights = sum(late, na.rm = TRUE))
late_flights_by_month
## # A tibble: 12 × 2
## month lateflights
## <int> <int>
## 1 1 8988
## 2 2 8119
## 3 3 9033
## 4 4 10544
## 5 5 8490
## 6 6 10739
## 7 7 11518
## 8 8 9649
## 9 9 5347
## 10 10 7628
## 11 11 7485
## 12 12 12291
# Question 2
traffic_percentage <- flights %>%
group_by(month, carrier) %>%
summarise(n_flights = n(), .groups = 'drop') %>%
group_by(month) %>%
mutate(total_flights = sum(n_flights)) %>%
ungroup() %>%
mutate(percentage = n_flights / total_flights * 100) %>%
select(month, carrier, percentage) %>%
pivot_wider(names_from = month, values_from = percentage, names_prefix = '')
traffic_percentage
## # A tibble: 16 × 13
## carrier `1` `2` `3` `4` `5` `6` `7` `8` `9`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9E 5.83 5.85 5.64 5.33 5.08 5.09 5.08 4.96 5.58
## 2 AA 10.3 10.1 9.67 9.61 9.73 9.76 9.79 9.74 9.48
## 3 AS 0.230 0.224 0.215 0.212 0.215 0.212 0.211 0.211 0.218
## 4 B6 16.4 16.4 16.5 15.9 15.9 16.4 16.9 16.9 15.6
## 5 DL 13.7 13.8 14.5 14.4 14.2 14.6 14.4 14.7 14.1
## 6 EV 15.4 15.3 16.4 16.1 16.7 15.8 15.8 15.6 17.1
## 7 F9 0.218 0.196 0.198 0.201 0.201 0.195 0.197 0.188 0.210
## 8 FL 1.21 1.19 1.10 1.10 1.13 0.892 0.894 0.897 0.925
## 9 HA 0.115 0.112 0.108 0.106 0.108 0.106 0.105 0.106 0.0907
## 10 MQ 8.41 8.19 7.82 7.80 7.93 7.71 7.68 7.72 8.00
## 11 OO 0.00370 NA NA NA NA 0.00708 NA 0.0136 0.0725
## 12 UA 17.2 17.4 17.2 17.8 17.2 17.6 17.2 17.5 17.0
## 13 US 5.93 6.22 5.97 6.10 6.20 6.15 6.07 6.07 6.16
## 14 VX 1.17 1.09 1.05 1.64 1.72 1.70 1.66 1.67 1.64
## 15 WN 3.69 3.65 3.46 3.46 3.49 3.64 3.66 3.57 3.66
## 16 YV 0.170 0.192 0.0624 0.134 0.170 0.173 0.275 0.222 0.152
## # ℹ 3 more variables: `10` <dbl>, `11` <dbl>, `12` <dbl>
# Question 3
latest_flight_each_month <- flights %>%
group_by(month) %>%
filter(dep_time == max(dep_time, na.rm = TRUE)) %>%
ungroup()
latest_flight_each_month
## # A tibble: 35 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 7 2359 2359 0 506 437
## 2 2013 1 12 2359 2359 0 429 437
## 3 2013 1 13 2359 2130 149 435 218
## 4 2013 1 18 2359 2359 0 439 437
## 5 2013 1 19 2359 2359 0 437 444
## 6 2013 1 25 2359 2359 0 500 444
## 7 2013 10 30 2400 2359 1 327 337
## 8 2013 11 27 2400 2359 1 515 445
## 9 2013 12 5 2400 2359 1 427 440
## 10 2013 12 9 2400 2359 1 432 440
## # ℹ 25 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Question 4
data <- read_csv("multipleChoiceResponses1.csv")
## Rows: 16716 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): LearningPlatformUsefulnessArxiv, LearningPlatformUsefulnessBlogs, ...
## dbl (1): Age
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
usefulness_data <- data %>%
select(starts_with("LearningPlatformUsefulness")) %>%
gather(key = "learning_platform", value = "usefulness") %>%
filter(!is.na(usefulness)) %>%
mutate(learning_platform = str_replace(learning_platform, "LearningPlatformUsefulness", ""))
count_data <- usefulness_data %>%
count(learning_platform, usefulness)
count_data
## # A tibble: 54 × 3
## learning_platform usefulness n
## <chr> <chr> <int>
## 1 Arxiv Not Useful 37
## 2 Arxiv Somewhat useful 1038
## 3 Arxiv Very useful 1316
## 4 Blogs Not Useful 45
## 5 Blogs Somewhat useful 2406
## 6 Blogs Very useful 2314
## 7 College Not Useful 101
## 8 College Somewhat useful 1405
## 9 College Very useful 1853
## 10 Communities Not Useful 16
## # ℹ 44 more rows
# Question 5
usefulness_data_filtered <- usefulness_data %>%
filter(usefulness != "Not Useful")
total_responses <- usefulness_data %>%
group_by(learning_platform) %>%
summarise(tot = n())
useful_counts <- usefulness_data_filtered %>%
count(learning_platform)
joined_data <- left_join(useful_counts, total_responses, by = "learning_platform")
perc_usefulness <- joined_data %>%
mutate(perc_usefulness = n / tot)
perc_usefulness
## # A tibble: 18 × 4
## learning_platform n tot perc_usefulness
## <chr> <int> <int> <dbl>
## 1 Arxiv 2354 2391 0.985
## 2 Blogs 4720 4765 0.991
## 3 College 3258 3359 0.970
## 4 Communities 1126 1142 0.986
## 5 Company 940 981 0.958
## 6 Conferences 2063 2182 0.945
## 7 Courses 5945 5992 0.992
## 8 Documentation 2279 2321 0.982
## 9 Friends 1530 1581 0.968
## 10 Kaggle 6527 6583 0.991
## 11 Newsletters 1033 1089 0.949
## 12 Podcasts 1090 1214 0.898
## 13 Projects 4755 4794 0.992
## 14 SO 5576 5640 0.989
## 15 Textbook 4112 4181 0.983
## 16 TradeBook 324 333 0.973
## 17 Tutoring 1394 1426 0.978
## 18 YouTube 5125 5229 0.980
# Question 6
perc_usefulness <- perc_usefulness %>%
mutate(learning_platform = fct_reorder(learning_platform, perc_usefulness)) %>%
mutate(learning_platform = fct_rev(learning_platform))
perc_usefulness <- perc_usefulness %>%
mutate(learning_platform = fct_reorder(learning_platform, perc_usefulness, .desc = TRUE))
ggplot(perc_usefulness, aes(x = perc_usefulness*100, y = learning_platform)) +
geom_bar(stat = "identity") +
theme_minimal() +
coord_flip() +
labs(x = "percentage", y = "learning platform")
