finalexam

# Install nycflights13 only when it is not already available, so re-running
# the script does not reinstall the package on every execution.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)

library(nycflights13)

data(flights)

# Question 1: How many flights arrived late each month?
# A flight is treated as late when arr_delay exceeds 5 (the threshold this
# analysis uses). which() keeps only the TRUE comparisons, so flights with a
# missing arr_delay do not turn into all-NA rows as they do with plain logical
# subsetting.
late_arrivals <- flights[which(flights$arr_delay > 5), ]

# Tabulate late arrivals per "YYYY-MM" of the scheduled time_hour.
late_arrivals_by_month <- table(format(late_arrivals$time_hour, "%Y-%m"))

# Build the data frame column by column. Going through a character matrix
# would coerce the counts to strings; here LateFlights stays an integer.
result_df <- data.frame(
  Month = names(late_arrivals_by_month),
  LateFlights = as.integer(late_arrivals_by_month)
)

print(result_df)

##      Month LateFlights
## 1  2013-01        8988
## 2  2013-02        8119
## 3  2013-03        9033
## 4  2013-04       10544
## 5  2013-05        8490
## 6  2013-06       10739
## 7  2013-07       11518
## 8  2013-08        9649
## 9  2013-09        5347
## 10 2013-10        7628
## 11 2013-11        7485
## 12 2013-12       12291

# Question 2: What percentage of traffic did each carrier represent by month?
# Cross-tabulate flights by carrier (rows) and scheduled month (columns), then
# convert every column to percentages of that month's total (margin = 2).
# Dividing by the number of flights in the whole data set instead would give
# each cell's share of the full year, and a month's column would not sum to
# 100. A carrier with no flights in a month shows as 0 rather than NA.
# NOTE(review): any echoed output following this chunk must be regenerated by
# re-knitting, since the figures change with the per-month denominator.
flights_by_carrier_month <- table(
  flights$carrier,
  format(flights$time_hour, "%Y-%m")
)
traffic_percentage_by_carrier <-
  prop.table(flights_by_carrier_month, margin = 2) * 100
print(traffic_percentage_by_carrier)

##         2013-01     2013-02     2013-03     2013-04     2013-05      2013-06
## 9E 0.4670760387 0.433225646 0.483110435 0.448666176 0.434116445 0.4266931135
## AA 0.8296315652 0.747381049 0.827553032 0.808252370 0.832303965 0.8186450341
## AS 0.0184098629 0.016628263 0.018409863 0.017815996 0.018409863 0.0178159964
## B6 1.3145236003 1.218317220 1.416965579 1.341247595 1.358766658 1.3724255885
## DL 1.0956837779 1.022638193 1.243853481 1.215050954 1.212081621 1.2251466850
## EV 1.2385086823 1.136363636 1.403306649 1.354312659 1.430327577 1.3231346652
## F9 0.0175190631 0.014549730 0.016925197 0.016925197 0.017222130 0.0163313300
## FL 0.0973941136 0.087892249 0.093830914 0.092346248 0.096503314 0.0748271848
## HA 0.0092049315 0.008314132 0.009204931 0.008907998 0.009204931 0.0089079982
## MQ 0.6743354633 0.606931610 0.669881464 0.656519467 0.678195596 0.6467206689
## OO 0.0002969333          NA          NA          NA          NA 0.0005938665
## UA 1.3768795876 1.290472005 1.476055301 1.498622230 1.472789035 1.4772430339
## US 0.4756871036 0.460840440 0.511022163 0.512803763 0.530025893 0.5154761622
## VX 0.0938309143 0.080468917 0.089970782 0.138370905 0.147278903 0.1425279711
## WN 0.2957455401 0.270506212 0.296339407 0.290994608 0.298714873 0.3052474048
## YV 0.0136589306 0.014252797 0.005344799 0.011283464 0.014549730 0.0145497304
##        2013-07     2013-08     2013-09     2013-10     2013-11     2013-12
## 9E 0.443618310 0.432334846 0.457277241 0.496769366 0.473608571 0.484892035
## AA 0.855761693 0.848041428 0.776183576 0.806173837 0.765197045 0.803204504
## AS 0.018409863 0.018409863 0.017815996 0.018409863 0.015440530 0.016034397
## B6 1.479915433 1.470413569 1.274140675 1.294926004 1.273546809 1.407760648
## DL 1.262263344 1.282157873 1.152991900 1.215347887 1.142896168 1.215347887
## EV 1.378067321 1.354906525 1.403009716 1.457348505 1.327588664 1.278891607
## F9 0.017222130 0.016331330 0.017222130 0.016925197 0.018112930 0.018112930
## FL 0.078093451 0.078093451 0.075717985 0.070076252 0.059980521 0.063246787
## HA 0.009204931 0.009204931 0.007423332 0.006235599 0.007423332 0.008314132
## MQ 0.671366131 0.671959997 0.655034801 0.661567333 0.610494810 0.635140271
## OO          NA 0.001187733 0.005938665          NA 0.001484666          NA
## UA 1.504263962 1.521486092 1.393804784 1.502482362 1.441314108 1.464177970
## US 0.530322826 0.528244293 0.504192698 0.548138822 0.504489631 0.476577903
## VX 0.145200371 0.145200371 0.134510773 0.140152505 0.133916906 0.141340238
## WN 0.319500202 0.310889137 0.299902606 0.323954201 0.306732071 0.326329667
## YV 0.024051595 0.019300663 0.012471197 0.019597596 0.014549730 0.014846664

# Question 3: What was the latest flight to depart each month?
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Pick, for every (year, month), the flight with the latest actual departure.
# time_hour is only precise to the hour, so sorting on it alone leaves ties
# between all flights scheduled in a month's final hour and resolves them by
# row order. The ranking key is therefore the actual departure instant:
# time_hour plus the scheduled minute and the departure delay, converted from
# minutes to seconds for POSIXct arithmetic.
# NOTE(review): assumes `minute` is the scheduled-departure minute and
# `dep_delay` is in minutes, as documented for nycflights13::flights.
latest_departure_by_month <- flights %>%
  # Flights without a recorded departure never left and cannot be the latest.
  filter(!is.na(dep_time), !is.na(dep_delay)) %>%
  group_by(year, month) %>%
  slice_max(time_hour + (minute + dep_delay) * 60, n = 1, with_ties = FALSE) %>%
  # Drop the grouping so later verbs on this result are not silently grouped.
  ungroup()

print(latest_departure_by_month)

## # A tibble: 12 × 19
## # Groups:   year, month [12]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1    31        4           2359         5      455            444
##  2  2013     2    28     2342           2352       -10      413            437
##  3  2013     3    31     2346           2355        -9      336            345
##  4  2013     4    30     2348           2355        -7      328            345
##  5  2013     5    31       33           2359        34      408            341
##  6  2013     6    30       21           2300        81      116              8
##  7  2013     7    31       10           2359        11      344            340
##  8  2013     8    31     2351           2305        46       44             13
##  9  2013     9    30     2349           2359       -10      325            350
## 10  2013    10    31     2352           2359        -7      327            340
## 11  2013    11    30       11           2359        12      457            445
## 12  2013    12    31       13           2359        14      439            437
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.4     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)


# Load the survey responses from the CSV file in the working directory.
data <- read.csv(file = "multipleChoiceResponses1.csv", header = TRUE)

# 2. Count the usefulness by learning platform
# The first 18 columns are taken as the rating columns
# (presumably one per learning platform -- confirm against the CSV layout).
usefulness_columns <- names(data)[seq_len(18)]

# Reshape to one row per (platform, rating), discard unanswered cells, tidy
# the labels, and tally each platform/rating combination.
usefulness_counts <- data |>
  select(all_of(usefulness_columns)) |>
  pivot_longer(
    cols = everything(),
    names_to = "LearningPlatform",
    values_to = "Usefulness"
  ) |>
  drop_na(Usefulness) |>
  mutate(
    # Relabel the lowest rating.
    Usefulness = gsub("Not Useful", "Least Useful", Usefulness),
    # Strip the shared column-name prefix so only the platform name remains.
    LearningPlatform = gsub("LearningPlatformUsefulness", "", LearningPlatform)
  ) |>
  count(LearningPlatform, Usefulness)

print(usefulness_counts)

## # A tibble: 54 × 3
##    LearningPlatform Usefulness          n
##    <chr>            <chr>           <int>
##  1 Arxiv            Least Useful       37
##  2 Arxiv            Somewhat useful  1038
##  3 Arxiv            Very useful      1316
##  4 Blogs            Least Useful       45
##  5 Blogs            Somewhat useful  2406
##  6 Blogs            Very useful      2314
##  7 College          Least Useful      101
##  8 College          Somewhat useful  1405
##  9 College          Very useful      1853
## 10 Communities      Least Useful       16
## # ℹ 44 more rows

# 3. Compute the number of total responses and the number of responses which are at least useful
total_responses <- nrow(data)

# A respondent counts as "at least useful" when one or more of their platform
# ratings is present and is something other than "Not Useful". Counting every
# respondent who merely answered (any non-NA rating) would also include those
# who rated every platform "Not Useful".
# NOTE(review): any echoed count following this chunk must be regenerated by
# re-knitting, since the value may change.
ratings <- as.matrix(data[usefulness_columns])
is_useful_rating <- !is.na(ratings) & ratings != "Not Useful"
useful_responses <- sum(rowSums(is_useful_rating) > 0)

print(paste("Total Responses:", total_responses))

## [1] "Total Responses: 16716"

print(paste("Useful Responses:", useful_responses))

## [1] "Useful Responses: 11105"

# 4. Generate the plot based on previous results
# Order the platforms by their total number of ratings, largest first.
# Reordering by the rating label itself has no effect: every platform carries
# the same set of labels, so all platforms tie. No filter on "Not Useful" is
# applied because that label has already been renamed to "Least Useful"
# upstream, so such a filter would never match a row.
plot_data <- usefulness_counts %>%
  mutate(
    LearningPlatform = fct_reorder(LearningPlatform, n, .fun = sum, .desc = TRUE)
  )

# Stacked bars: one bar per platform, segments coloured by rating.
# geom_col() is the direct form of geom_bar(stat = "identity").
ggplot(plot_data, aes(x = LearningPlatform, y = n, fill = Usefulness)) +
  geom_col() +
  labs(title = "Usefulness by Learning Platform",
       x = "Learning Platform",
       y = "Number of Responses",
       fill = "Usefulness")

finalexam

Leo Tzang

2024-01-02