## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Count, for every month, the flights that arrived more than 5 minutes late.
# Rows with a missing arr_delay are ignored by na.rm = TRUE.
flights %>%
  group_by(month) %>%
  summarise(late_arrivals = sum(arr_delay > 5, na.rm = TRUE))
## # A tibble: 12 × 2
## month late_arrivals
## <int> <int>
## 1 1 8988
## 2 2 8119
## 3 3 9033
## 4 4 10544
## 5 5 8490
## 6 6 10739
## 7 7 11518
## 8 8 9649
## 9 9 5347
## 10 10 7628
## 11 11 7485
## 12 12 12291
## TODO: Calculate the percentage of traffic by carrier and month (no code for this step yet)
## Find the latest departure for each month
# dep_time appears to be an HHMM-style integer (e.g. 517, 2400 in the printed
# rows), so the per-month maximum is the latest clock-time departure.
# which.max() keeps only the first row that reaches the maximum and skips
# missing dep_time values.
flights %>%
  group_by(month) %>%
  slice(which.max(dep_time)) %>%
  # Drop the grouping so the result is a plain, ungrouped tibble.
  ungroup()
## # A tibble: 12 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 7 2359 2359 0 506 437
## 2 2013 2 7 2400 2359 1 432 436
## 3 2013 3 15 2400 2359 1 324 338
## 4 2013 4 2 2400 2359 1 339 343
## 5 2013 5 21 2400 2359 1 339 350
## 6 2013 6 17 2400 2145 135 102 2315
## 7 2013 7 7 2400 1950 250 107 2130
## 8 2013 8 10 2400 2245 75 110 1
## 9 2013 9 2 2400 2359 1 411 340
## 10 2013 10 30 2400 2359 1 327 337
## 11 2013 11 27 2400 2359 1 515 445
## 12 2013 12 5 2400 2359 1 427 440
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
## Load and inspect the dataset
library(readr)
# Read the survey responses from the working directory; readr guesses the
# column types and reports them in the column specification message.
multipleChoiceResponses <- read_csv(file = "multipleChoiceResponses1.csv")
## Rows: 16716 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): LearningPlatformUsefulnessArxiv, LearningPlatformUsefulnessBlogs, ...
## dbl (1): Age
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Reshape the LearningPlatformUsefulness* columns into long format: one row per
# non-missing answer, with the platform name (column name minus the common
# prefix) in `learning_platform` and the answer text in `usefulness`.
# pivot_longer() replaces the superseded gather(); it emits rows respondent by
# respondent rather than column by column, which does not matter for the
# order-independent counts computed from this table.
usefulness_by_platform <- multipleChoiceResponses %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    values_drop_na = TRUE
  )
# Tally the responses for every (learning_platform, usefulness) pair.
# count() groups, counts into `n`, and returns an ungrouped tibble in one step,
# so no `.groups` message is emitted.
usefulness_by_platform %>%
  count(learning_platform, usefulness)
## # A tibble: 54 × 3
## learning_platform usefulness n
## <chr> <chr> <int>
## 1 Arxiv Not Useful 37
## 2 Arxiv Somewhat useful 1038
## 3 Arxiv Very useful 1316
## 4 Blogs Not Useful 45
## 5 Blogs Somewhat useful 2406
## 6 Blogs Very useful 2314
## 7 College Not Useful 101
## 8 College Somewhat useful 1405
## 9 College Very useful 1853
## 10 Communities Not Useful 16
## # ℹ 44 more rows
# Total number of responses recorded for each learning platform (column `tot`).
total_usefulness_by_platform <- count(
  usefulness_by_platform,
  learning_platform,
  name = "tot"
)
# Number of responses per learning platform that are not "Not Useful"
# (matched case-insensitively); stored in column `count`.
usefulness_count <- usefulness_by_platform %>%
  filter(!grepl("Not Useful", usefulness, ignore.case = TRUE)) %>%
  count(learning_platform, name = "count")
# Calculate the share of useful responses for each learning platform.
# The join starts from the per-platform totals so that a platform whose
# responses were all "Not Useful" (and is therefore absent from
# usefulness_count) is kept with a count of 0 instead of being dropped.
perc_usefulness <- total_usefulness_by_platform %>%
  left_join(usefulness_count, by = "learning_platform") %>%
  mutate(
    count = coalesce(count, 0L),
    perc_usefulness = round(count / tot, digits = 3)
  ) %>%
  # Keep the column order: platform, useful count, total, share.
  select(learning_platform, count, tot, perc_usefulness)

# Print the result
perc_usefulness
## # A tibble: 18 × 4
## learning_platform count tot perc_usefulness
## <chr> <int> <int> <dbl>
## 1 Arxiv 2354 2391 0.985
## 2 Blogs 4720 4765 0.991
## 3 College 3258 3359 0.97
## 4 Communities 1126 1142 0.986
## 5 Company 940 981 0.958
## 6 Conferences 2063 2182 0.945
## 7 Courses 5945 5992 0.992
## 8 Documentation 2279 2321 0.982
## 9 Friends 1530 1581 0.968
## 10 Kaggle 6527 6583 0.991
## 11 Newsletters 1033 1089 0.949
## 12 Podcasts 1090 1214 0.898
## 13 Projects 4755 4794 0.992
## 14 SO 5576 5640 0.989
## 15 Textbook 4112 4181 0.983
## 16 TradeBook 324 333 0.973
## 17 Tutoring 1394 1426 0.978
## 18 YouTube 5125 5229 0.98