install.packages("nycflights13")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(nycflights13)
data(flights)
# Question 1: How many flights arrived late each month?
late_arrivals <- flights[flights$arr_delay > 5, ]
late_arrivals_by_month <- table(format(late_arrivals$time_hour, "%Y-%m"))
result_df <- as.data.frame(matrix(c(names(late_arrivals_by_month), as.vector(late_arrivals_by_month)), ncol = 2, byrow = FALSE))
colnames(result_df) <- c("Month", "LateFlights")
print(result_df)
## Month LateFlights
## 1 2013-01 8988
## 2 2013-02 8119
## 3 2013-03 9033
## 4 2013-04 10544
## 5 2013-05 8490
## 6 2013-06 10739
## 7 2013-07 11518
## 8 2013-08 9649
## 9 2013-09 5347
## 10 2013-10 7628
## 11 2013-11 7485
## 12 2013-12 12291
# Question 2: What percentage of traffic did each carrier represent by month?
traffic_percentage_by_carrier <- tapply(flights$arr_delay, list(flights$carrier, format(flights$time_hour, "%Y-%m")), function(x) length(x) / length(flights$arr_delay) * 100)
print(traffic_percentage_by_carrier)
## 2013-01 2013-02 2013-03 2013-04 2013-05 2013-06
## 9E 0.4670760387 0.433225646 0.483110435 0.448666176 0.434116445 0.4266931135
## AA 0.8296315652 0.747381049 0.827553032 0.808252370 0.832303965 0.8186450341
## AS 0.0184098629 0.016628263 0.018409863 0.017815996 0.018409863 0.0178159964
## B6 1.3145236003 1.218317220 1.416965579 1.341247595 1.358766658 1.3724255885
## DL 1.0956837779 1.022638193 1.243853481 1.215050954 1.212081621 1.2251466850
## EV 1.2385086823 1.136363636 1.403306649 1.354312659 1.430327577 1.3231346652
## F9 0.0175190631 0.014549730 0.016925197 0.016925197 0.017222130 0.0163313300
## FL 0.0973941136 0.087892249 0.093830914 0.092346248 0.096503314 0.0748271848
## HA 0.0092049315 0.008314132 0.009204931 0.008907998 0.009204931 0.0089079982
## MQ 0.6743354633 0.606931610 0.669881464 0.656519467 0.678195596 0.6467206689
## OO 0.0002969333 NA NA NA NA 0.0005938665
## UA 1.3768795876 1.290472005 1.476055301 1.498622230 1.472789035 1.4772430339
## US 0.4756871036 0.460840440 0.511022163 0.512803763 0.530025893 0.5154761622
## VX 0.0938309143 0.080468917 0.089970782 0.138370905 0.147278903 0.1425279711
## WN 0.2957455401 0.270506212 0.296339407 0.290994608 0.298714873 0.3052474048
## YV 0.0136589306 0.014252797 0.005344799 0.011283464 0.014549730 0.0145497304
## 2013-07 2013-08 2013-09 2013-10 2013-11 2013-12
## 9E 0.443618310 0.432334846 0.457277241 0.496769366 0.473608571 0.484892035
## AA 0.855761693 0.848041428 0.776183576 0.806173837 0.765197045 0.803204504
## AS 0.018409863 0.018409863 0.017815996 0.018409863 0.015440530 0.016034397
## B6 1.479915433 1.470413569 1.274140675 1.294926004 1.273546809 1.407760648
## DL 1.262263344 1.282157873 1.152991900 1.215347887 1.142896168 1.215347887
## EV 1.378067321 1.354906525 1.403009716 1.457348505 1.327588664 1.278891607
## F9 0.017222130 0.016331330 0.017222130 0.016925197 0.018112930 0.018112930
## FL 0.078093451 0.078093451 0.075717985 0.070076252 0.059980521 0.063246787
## HA 0.009204931 0.009204931 0.007423332 0.006235599 0.007423332 0.008314132
## MQ 0.671366131 0.671959997 0.655034801 0.661567333 0.610494810 0.635140271
## OO NA 0.001187733 0.005938665 NA 0.001484666 NA
## UA 1.504263962 1.521486092 1.393804784 1.502482362 1.441314108 1.464177970
## US 0.530322826 0.528244293 0.504192698 0.548138822 0.504489631 0.476577903
## VX 0.145200371 0.145200371 0.134510773 0.140152505 0.133916906 0.141340238
## WN 0.319500202 0.310889137 0.299902606 0.323954201 0.306732071 0.326329667
## YV 0.024051595 0.019300663 0.012471197 0.019597596 0.014549730 0.014846664
# Question 3: What was the latest flight to depart each month?
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
latest_departure_by_month <- flights %>%
group_by(year, month) %>%
arrange(desc(time_hour)) %>%
slice(1)
print(latest_departure_by_month)
## # A tibble: 12 × 19
## # Groups: year, month [12]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 31 4 2359 5 455 444
## 2 2013 2 28 2342 2352 -10 413 437
## 3 2013 3 31 2346 2355 -9 336 345
## 4 2013 4 30 2348 2355 -7 328 345
## 5 2013 5 31 33 2359 34 408 341
## 6 2013 6 30 21 2300 81 116 8
## 7 2013 7 31 10 2359 11 344 340
## 8 2013 8 31 2351 2305 46 44 13
## 9 2013 9 30 2349 2359 -10 325 350
## 10 2013 10 31 2352 2359 -7 327 340
## 11 2013 11 30 11 2359 12 457 445
## 12 2013 12 31 13 2359 14 439 437
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
data <- read.csv("multipleChoiceResponses1.csv", header = TRUE)
# 2. Count the usefulness by learning platform
usefulness_columns <- colnames(data)[1:18]
usefulness_counts <- data %>%
select(all_of(usefulness_columns)) %>%
pivot_longer(cols = everything(), names_to = "LearningPlatform", values_to = "Usefulness") %>%
filter(!is.na(Usefulness)) %>%
mutate(Usefulness = gsub("Not Useful", "Least Useful", Usefulness),
LearningPlatform = gsub("LearningPlatformUsefulness", "", LearningPlatform)) %>%
count(LearningPlatform, Usefulness)
print(usefulness_counts)
## # A tibble: 54 × 3
## LearningPlatform Usefulness n
## <chr> <chr> <int>
## 1 Arxiv Least Useful 37
## 2 Arxiv Somewhat useful 1038
## 3 Arxiv Very useful 1316
## 4 Blogs Least Useful 45
## 5 Blogs Somewhat useful 2406
## 6 Blogs Very useful 2314
## 7 College Least Useful 101
## 8 College Somewhat useful 1405
## 9 College Very useful 1853
## 10 Communities Least Useful 16
## # ℹ 44 more rows
# 3. Compute the number of total responses and the number of responses which are at least useful
total_responses <- nrow(data)
useful_responses <- nrow(filter(data, rowSums(!is.na(select(data, all_of(usefulness_columns)))) > 0))
print(paste("Total Responses:", total_responses))
## [1] "Total Responses: 16716"
print(paste("Useful Responses:", useful_responses))
## [1] "Useful Responses: 11105"
# 4. Generate the plot based on previous results
plot_data <- usefulness_counts %>%
filter(Usefulness != "Not Useful") %>%
mutate(LearningPlatform = fct_reorder(LearningPlatform, desc(Usefulness)))
ggplot(plot_data, aes(x = LearningPlatform, y = n, fill = Usefulness)) +
geom_bar(stat = "identity") +
labs(title = "Usefulness by Learning Platform",
x = "Learning Platform",
y = "Number of Responses",
fill = "Usefulness")
