# Load libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(nycflights13)
## Part 1 ----
# Question 1: How many flights arrived late each month?
flights <- nycflights13::flights
# Treat a flight as late if it arrived more than 5 minutes behind schedule
flights <- flights %>%
  mutate(is_late = arr_delay > 5)
late_flights_by_month <- flights %>%
  group_by(month) %>%
  summarise(lateflights = sum(is_late, na.rm = TRUE))
print(late_flights_by_month)
## # A tibble: 12 × 2
## month lateflights
## <int> <int>
## 1 1 8988
## 2 2 8119
## 3 3 9033
## 4 4 10544
## 5 5 8490
## 6 6 10739
## 7 7 11518
## 8 8 9649
## 9 9 5347
## 10 10 7628
## 11 11 7485
## 12 12 12291
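# The same monthly counts can be computed in a single pipeline without the helper
# column; a minimal sketch, assuming the same > 5 minute lateness cutoff as above:
flights %>%
  group_by(month) %>%
  summarise(lateflights = sum(arr_delay > 5, na.rm = TRUE))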
# Question 2: What percentage of traffic did each carrier represent, by month?
percentage_traffic_by_carrier <- flights %>%
  group_by(month, carrier) %>%
  summarise(percentage_traffic = n() / nrow(flights) * 100, .groups = "drop_last")
percentage_traffic_pivoted <- percentage_traffic_by_carrier %>%
  pivot_wider(names_from = carrier, values_from = percentage_traffic)
print(percentage_traffic_pivoted)
## # A tibble: 12 × 17
## # Groups: month [12]
## month `9E` AA AS B6 DL EV F9 FL HA MQ
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.467 0.830 0.0184 1.31 1.10 1.24 0.0175 0.0974 0.00920 0.674
## 2 2 0.433 0.747 0.0166 1.22 1.02 1.14 0.0145 0.0879 0.00831 0.607
## 3 3 0.483 0.828 0.0184 1.42 1.24 1.40 0.0169 0.0938 0.00920 0.670
## 4 4 0.449 0.808 0.0178 1.34 1.22 1.35 0.0169 0.0923 0.00891 0.657
## 5 5 0.434 0.832 0.0184 1.36 1.21 1.43 0.0172 0.0965 0.00920 0.678
## 6 6 0.427 0.819 0.0178 1.37 1.23 1.32 0.0163 0.0748 0.00891 0.647
## 7 7 0.444 0.856 0.0184 1.48 1.26 1.38 0.0172 0.0781 0.00920 0.671
## 8 8 0.432 0.848 0.0184 1.47 1.28 1.35 0.0163 0.0781 0.00920 0.672
## 9 9 0.457 0.776 0.0178 1.27 1.15 1.40 0.0172 0.0757 0.00742 0.655
## 10 10 0.497 0.806 0.0184 1.29 1.22 1.46 0.0169 0.0701 0.00624 0.662
## 11 11 0.474 0.765 0.0154 1.27 1.14 1.33 0.0181 0.0600 0.00742 0.610
## 12 12 0.485 0.803 0.0160 1.41 1.22 1.28 0.0181 0.0632 0.00831 0.635
## # ℹ 6 more variables: OO <dbl>, UA <dbl>, US <dbl>, VX <dbl>, WN <dbl>,
## # YV <dbl>
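# Note: the denominator above is nrow(flights), i.e. every flight in 2013, so the
# percentages sum to 100% across all months and carriers combined. If the intent is
# each carrier's share of that month's traffic instead, a sketch of the alternative:
flights %>%
  count(month, carrier) %>%
  group_by(month) %>%
  mutate(percentage_traffic = n / sum(n) * 100) %>%
  ungroup() %>%
  select(-n) %>%
  pivot_wider(names_from = carrier, values_from = percentage_traffic)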
# Question 3: What was the latest flight to depart each month?
# Build a departure datetime from the integer HHMM clock time
# (note: this overwrites the original dep_time column)
flights$dep_time <- ymd_hm(paste(flights$year, flights$month, flights$day, flights$dep_time, sep = " "))
## Warning: 106157 failed to parse.
latest_departure_by_month <- flights %>%
  group_by(month) %>%
  filter(dep_time == max(dep_time, na.rm = TRUE))
latest_departure_by_month <- latest_departure_by_month %>%
  select(year, month, day, dep_time, sched_dep_time, dep_delay, arr_time,
         sched_arr_time, arr_delay, carrier, flight, tailnum, origin, dest,
         air_time, distance, hour, minute, time_hour)
print(latest_departure_by_month)
## # A tibble: 14 × 19
## # Groups: month [12]
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <dttm> <int> <dbl> <int>
## 1 2013 1 31 2013-01-31 23:54:00 2055 179 144
## 2 2013 10 31 2013-10-31 23:57:00 2359 -2 345
## 3 2013 11 30 2013-11-30 23:54:00 2359 -5 430
## 4 2013 12 31 2013-12-31 23:56:00 2359 -3 436
## 5 2013 2 28 2013-02-28 23:59:00 2359 0 443
## 6 2013 3 31 2013-03-31 23:58:00 2359 -1 332
## 7 2013 4 30 2013-04-30 23:51:00 2359 -8 345
## 8 2013 5 31 2013-05-31 23:55:00 2359 -4 338
## 9 2013 5 31 2013-05-31 23:55:00 2359 -4 335
## 10 2013 6 30 2013-06-30 23:59:00 2110 169 118
## 11 2013 7 31 2013-07-31 23:52:00 2245 67 49
## 12 2013 8 31 2013-08-31 23:59:00 2359 0 345
## 13 2013 8 31 2013-08-31 23:59:00 2359 0 346
## 14 2013 9 30 2013-09-30 23:49:00 2359 -10 325
## # ℹ 12 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
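# Caveat: ymd_hm() cannot parse three-digit clock times such as 517 (5:17 a.m.),
# which, together with missing departure times, accounts for the 106157 parse
# failures; the monthly maxima are unaffected here because every latest departure
# is after 23:00. A sketch that builds the datetime directly from the integer HHMM
# encoding instead, assuming a fresh copy of the data:
latest_departure_alt <- nycflights13::flights %>%
  mutate(dep_dt = make_datetime(year, month, day, dep_time %/% 100, dep_time %% 100)) %>%
  group_by(month) %>%
  filter(dep_dt == max(dep_dt, na.rm = TRUE)) %>%
  ungroup()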
## Part 2 ----
library(readr)
multipleChoiceResponses <- read_csv("multipleChoiceResponses1.csv")
## Rows: 16716 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): LearningPlatformUsefulnessArxiv, LearningPlatformUsefulnessBlogs, ...
## dbl (1): Age
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
usefulness_by_platform <- multipleChoiceResponses %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  set_names(names(.) %>% str_replace("LearningPlatformUsefulness", "")) %>%
  gather(key = "learning_platform", value = "usefulness", convert = FALSE, na.rm = TRUE)
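# gather() works but is superseded; an equivalent reshape with pivot_longer(),
# which strips the common prefix and drops NAs in one step (a sketch, writing to a
# separate illustrative object):
usefulness_long <- multipleChoiceResponses %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(everything(),
               names_to = "learning_platform",
               names_prefix = "LearningPlatformUsefulness",
               values_to = "usefulness",
               values_drop_na = TRUE)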
usefulness_by_platform %>%
  group_by(learning_platform, usefulness) %>%
  summarise(n = n()) %>%
  ungroup()
## `summarise()` has grouped output by 'learning_platform'. You can override using
## the `.groups` argument.
## # A tibble: 54 × 3
## learning_platform usefulness n
## <chr> <chr> <int>
## 1 Arxiv Not Useful 37
## 2 Arxiv Somewhat useful 1038
## 3 Arxiv Very useful 1316
## 4 Blogs Not Useful 45
## 5 Blogs Somewhat useful 2406
## 6 Blogs Very useful 2314
## 7 College Not Useful 101
## 8 College Somewhat useful 1405
## 9 College Very useful 1853
## 10 Communities Not Useful 16
## # ℹ 44 more rows
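# count() collapses the group_by()/summarise()/ungroup() above into a single call:
usefulness_by_platform %>%
  count(learning_platform, usefulness)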
# Calculate the number of total responses by learning platform
total_usefulness_by_platform <- usefulness_by_platform %>%
  group_by(learning_platform) %>%
  summarise(tot = n())
# Calculate the number of useful responses by learning platform
usefulness_count <- usefulness_by_platform %>%
  filter(!grepl("Not Useful", usefulness, ignore.case = TRUE)) %>%
  group_by(learning_platform) %>%
  summarise(count = n())
# Calculate the percentage of useful responses for each learning platform
perc_usefulness <- usefulness_count %>%
  left_join(total_usefulness_by_platform, by = "learning_platform") %>%
  mutate(perc_usefulness = count / tot) %>%
  mutate(perc_usefulness = round(perc_usefulness, digits = 3))
# Print the result
perc_usefulness
## # A tibble: 18 × 4
## learning_platform count tot perc_usefulness
## <chr> <int> <int> <dbl>
## 1 Arxiv 2354 2391 0.985
## 2 Blogs 4720 4765 0.991
## 3 College 3258 3359 0.97
## 4 Communities 1126 1142 0.986
## 5 Company 940 981 0.958
## 6 Conferences 2063 2182 0.945
## 7 Courses 5945 5992 0.992
## 8 Documentation 2279 2321 0.982
## 9 Friends 1530 1581 0.968
## 10 Kaggle 6527 6583 0.991
## 11 Newsletters 1033 1089 0.949
## 12 Podcasts 1090 1214 0.898
## 13 Projects 4755 4794 0.992
## 14 SO 5576 5640 0.989
## 15 Textbook 4112 4181 0.983
## 16 TradeBook 324 333 0.973
## 17 Tutoring 1394 1426 0.978
## 18 YouTube 5125 5229 0.98
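# The same percentages can be computed in one pass, assuming "Not Useful" is the
# only non-useful response level (as in the counts above):
usefulness_by_platform %>%
  group_by(learning_platform) %>%
  summarise(perc_usefulness = round(mean(usefulness != "Not Useful"), digits = 3))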
# Convert learning platforms into factors, ordered by descending usefulness with
# Courses pinned to the front
perc_usefulness_fct <- perc_usefulness %>%
  mutate(learning_platform = fct(learning_platform) %>%
           fct_reorder(perc_usefulness) %>%
           fct_rev() %>%
           fct_relevel("Courses", after = 0))
# Plot the percentage of usefulness of each platform as a lollipop chart
perc_usefulness_fct %>%
  ggplot(aes(x = learning_platform, y = perc_usefulness)) +
  geom_segment(aes(xend = learning_platform, yend = 0)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    x = "Learning Platform",
    y = "Percent finding at least somewhat useful"
  ) +
  scale_y_continuous(labels = scales::percent_format(scale = 100, suffix = "%"))
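# A horizontal variant can make the platform labels easier to read; a sketch using
# the same data, with the percentages on the x axis:
perc_usefulness_fct %>%
  ggplot(aes(x = perc_usefulness, y = learning_platform)) +
  geom_segment(aes(xend = 0, yend = learning_platform)) +
  geom_point() +
  labs(
    x = "Percent finding at least somewhat useful",
    y = "Learning Platform"
  ) +
  scale_x_continuous(labels = scales::percent)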

