This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(nycflights13)
## Warning: package 'nycflights13' was built under R version 4.3.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(scales)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#1
# Work on a local copy of the nycflights13 flights table
flights_data <- flights
# For each month, count the flights whose arrival delay exceeds 5 minutes.
# Flights with a missing arr_delay are left out of the count (na.rm = TRUE).
late_flights_summary <- flights_data %>%
  group_by(month) %>%
  summarize(lateflights = sum(arr_delay > 5, na.rm = TRUE))
# Show the per-month summary
late_flights_summary
## # A tibble: 12 × 2
## month lateflights
## <int> <int>
## 1 1 8988
## 2 2 8119
## 3 3 9033
## 4 4 10544
## 5 5 8490
## 6 6 10739
## 7 7 11518
## 8 8 9649
## 9 9 5347
## 10 10 7628
## 11 11 7485
## 12 12 12291
#2
# Total number of flights per month (the denominator of each carrier's share)
total_flights_per_month <- flights %>%
  group_by(month) %>%
  summarise(total = n())
# Number of flights per carrier per month.
# `.groups = "drop_last"` is what summarise() does by default here; stating it
# explicitly silences the "has grouped output" message and makes it clear
# that the result stays grouped by carrier.
carrier_flights_per_month <- flights %>%
  group_by(carrier, month) %>%
  summarise(count = n(), .groups = "drop_last")
# Attach the monthly totals and express each carrier's share as a
# percentage string rounded to 3 decimals (e.g. "5.825%")
carrier_percentage <- carrier_flights_per_month %>%
  left_join(total_flights_per_month, by = "month") %>%
  mutate(percentage = paste0(round((count / total) * 100, 3), "%"))
# Reshape to one row per carrier and one column per month.
# pivot_wider() replaces the superseded spread(); carrier/month combinations
# with no flights have no source row and therefore appear as NA.
spread_data <- carrier_percentage %>%
  select(-count, -total) %>%
  pivot_wider(names_from = month, values_from = percentage)
print(spread_data)
## # A tibble: 16 × 13
## # Groups: carrier [16]
## carrier `1` `2` `3` `4` `5` `6` `7` `8` `9` `10` `11`
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 9E 5.825% 5.847% 5.64… 5.33… 5.07… 5.08… 5.07… 4.96… 5.58… 5.79… 5.84…
## 2 AA 10.347% 10.088% 9.66… 9.60… 9.73… 9.76… 9.79… 9.73… 9.48% 9.39… 9.45…
## 3 AS 0.23% 0.224% 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.19…
## 4 B6 16.394% 16.444% 16.5… 15.9… 15.8… 16.3… 16.9… 16.8… 15.5… 15.0… 15.7…
## 5 DL 13.665% 13.803% 14.5… 14.4… 14.1… 14.6… 14.4… 14.7… 14.0… 14.1… 14.1…
## 6 EV 15.446% 15.338% 16.3… 16.1% 16.7… 15.7… 15.7… 15.5… 17.1… 16.9… 16.3…
## 7 F9 0.218% 0.196% 0.19… 0.20… 0.20… 0.19… 0.19… 0.18… 0.21% 0.19… 0.22…
## 8 FL 1.215% 1.186% 1.09… 1.09… 1.12… 0.89… 0.89… 0.89… 0.92… 0.81… 0.74…
## 9 HA 0.115% 0.112% 0.10… 0.10… 0.10… 0.10… 0.10… 0.10… 0.09… 0.07… 0.09…
## 10 MQ 8.41% 8.192% 7.82… 7.80… 7.93… 7.71… 7.68… 7.71… 8% 7.71… 7.54%
## 11 OO 0.004% <NA> <NA> <NA> <NA> 0.00… <NA> 0.01… 0.07… <NA> 0.01…
## 12 UA 17.172% 17.418% 17.2… 17.8… 17.2… 17.6… 17.2… 17.4… 17.0… 17.5… 17.8…
## 13 US 5.932% 6.22% 5.96… 6.09… 6.19… 6.14… 6.07% 6.06… 6.15… 6.39% 6.23…
## 14 VX 1.17% 1.086% 1.05… 1.64… 1.72… 1.7% 1.66… 1.66… 1.64… 1.63… 1.65…
## 15 WN 3.688% 3.651% 3.46… 3.45… 3.49… 3.64% 3.65… 3.57% 3.66… 3.77… 3.78…
## 16 YV 0.17% 0.192% 0.06… 0.13… 0.17% 0.17… 0.27… 0.22… 0.15… 0.22… 0.18%
## # ℹ 1 more variable: `12` <chr>
#3
# Add `delay` as a copy of dep_delay. Note that this rebinds the name
# `flights` in the workspace, so any later code sees the extra column.
flights <- mutate(flights, delay = dep_delay)
# For every month keep the single flight with the largest departure delay.
# which.max() skips NA delays and returns the first position of the maximum,
# so ties resolve to the earliest row and each month contributes one row.
most_delayed_flights <- flights %>%
  group_by(month) %>%
  slice(which.max(delay))
print(most_delayed_flights)
## # A tibble: 12 × 20
## # Groups: month [12]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 2 10 2243 830 853 100 1106
## 3 2013 3 17 2321 810 911 135 1020
## 4 2013 4 10 1100 1900 960 1342 2211
## 5 2013 5 3 1133 2055 878 1250 2215
## 6 2013 6 15 1432 1935 1137 1607 2120
## 7 2013 7 22 845 1600 1005 1044 1815
## 8 2013 8 8 2334 1454 520 120 1710
## 9 2013 9 20 1139 1845 1014 1457 2210
## 10 2013 10 14 2042 900 702 2255 1127
## 11 2013 11 3 603 1645 798 829 1913
## 12 2013 12 5 756 1700 896 1058 2020
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, delay <dbl>
#4
# Read the survey responses.
# NOTE(review): this is a machine-specific absolute path; it will only resolve
# on the author's machine -- confirm the intended location before knitting
# elsewhere.
responses <- read.csv("C:/Users/daavka/Desktop/multipleChoiceResponses1.csv")
# Count how often each usefulness answer was given for each learning platform.
# - pivot_longer() replaces the superseded gather() and stacks every
#   LearningPlatformUsefulness* column into (learning_platform, usefulness)
#   pairs; names_prefix strips the shared column-name prefix in the same step.
# - Missing answers are dropped before counting.
# - count() returns one ungrouped row per (platform, answer) pair.
# NOTE(review): the rendered output shows comma-joined values such as
# "NA,NA,NA,..." in `usefulness`, which suggests some rows of the CSV are
# malformed -- confirm the file's quoting/delimiters.
usefulness_count <- responses %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness"
  ) %>%
  filter(!is.na(usefulness)) %>%
  count(learning_platform, usefulness, name = "count")
print(usefulness_count)
## # A tibble: 340 × 3
## learning_platform usefulness count
## <chr> <chr> <int>
## 1 Arxiv "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,… 1
## 2 Arxiv "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,… 1
## 3 Arxiv "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,… 1
## 4 Arxiv "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,… 1
## 5 Arxiv "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,… 1
## 6 Arxiv "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,… 1
## 7 Arxiv "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,Somewhat … 1
## 8 Arxiv "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,Very usef… 1
## 9 Arxiv "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,Very usef… 1
## 10 Arxiv "NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,Very useful,… 1
## # ℹ 330 more rows
#5
# Keep only the LearningPlatformUsefulness* survey columns
selected_data <- select(responses, starts_with("LearningPlatformUsefulness"))
# Stack those columns into (learning_platform, usefulness) pairs, drop the
# missing answers, and strip the shared "LearningPlatformUsefulness" prefix
# from the platform names
long_data <- selected_data %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    values_to = "usefulness"
  ) %>%
  filter(!is.na(usefulness)) %>%
  mutate(
    learning_platform = sub("LearningPlatformUsefulness", "", learning_platform)
  )
# Per platform: number of answers (count), number of answers other than
# "Not Useful" (tot), and the ratio of the two (perc_usefulness)
result <- long_data %>%
  group_by(learning_platform) %>%
  summarise(
    count = n(),
    tot = sum(usefulness != "Not Useful"),
    perc_usefulness = tot / count
  )
print(result)
## # A tibble: 18 × 4
## learning_platform count tot perc_usefulness
## <chr> <int> <int> <dbl>
## 1 Arxiv 2603 2566 0.986
## 2 Blogs 4895 4850 0.991
## 3 College 3547 3448 0.972
## 4 Communities 1378 1362 0.988
## 5 Company 1218 1177 0.966
## 6 Conferences 2400 2284 0.952
## 7 Courses 6130 6084 0.992
## 8 Documentation 2540 2500 0.984
## 9 Friends 1822 1771 0.972
## 10 Kaggle 6675 6619 0.992
## 11 Newsletters 1329 1274 0.959
## 12 Podcasts 1456 1333 0.916
## 13 Projects 4945 4907 0.992
## 14 SO 5761 5697 0.989
## 15 Textbook 4363 4294 0.984
## 16 TradeBook 597 588 0.985
## 17 Tutoring 1663 1631 0.981
## 18 YouTube 5392 5290 0.981
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.