library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(nycflights13)
# Number of flights per month that arrived more than 5 minutes late.
# Flights with a missing arr_delay fail the filter condition and are dropped.
late_flights <- flights |>
  filter(arr_delay > 5) |>
  count(month, name = "late_flights")
print(late_flights)
## # A tibble: 12 × 2
## month late_flights
## <int> <int>
## 1 1 8988
## 2 2 8119
## 3 3 9033
## 4 4 10544
## 5 5 8490
## 6 6 10739
## 7 7 11518
## 8 8 9649
## 9 9 5347
## 10 10 7628
## 11 11 7485
## 12 12 12291
# Share of each month's flights operated by each carrier, laid out as one row
# per carrier and one column per month.

# Monthly totals (denominator) and carrier-by-month counts (numerator).
# count() returns ungrouped tibbles, so no grouping leaks into later steps and
# summarise() no longer emits its "grouped output" message.
total_flights_per_month <- flights %>%
  count(month, name = "total")
carrier_flights_per_month <- flights %>%
  count(carrier, month, name = "count")

# Each carrier-month row must match exactly one monthly total; the
# relationship check makes the join fail loudly if that ever stops holding.
carrier_percentage <- carrier_flights_per_month %>%
  left_join(
    total_flights_per_month,
    by = join_by(month),
    relationship = "many-to-one"
  ) %>%
  mutate(percentage = paste0(round((count / total) * 100, 2), "%"))

# pivot_wider() replaces the superseded spread(). Carrier-months with no
# flights stay NA; names_sort keeps the month columns in numeric order.
spread_data <- carrier_percentage %>%
  select(-count, -total) %>%
  pivot_wider(names_from = month, values_from = percentage, names_sort = TRUE)
print(spread_data)
## # A tibble: 16 × 13
## carrier `1` `2` `3` `4` `5` `6` `7` `8` `9` `10` `11`
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 9E 5.83% 5.85% 5.64% 5.33% 5.08% 5.09% 5.08% 4.96% 5.58% 5.79% 5.85%
## 2 AA 10.35% 10.09% 9.67% 9.61% 9.73% 9.76% 9.79% 9.74% 9.48% 9.4% 9.45%
## 3 AS 0.23% 0.22% 0.22% 0.21% 0.22% 0.21% 0.21% 0.21% 0.22% 0.21% 0.19%
## 4 B6 16.39% 16.44% 16.55% 15.94% 15.8… 16.3… 16.9… 16.8… 15.5… 15.1% 15.7…
## 5 DL 13.66% 13.8% 14.53% 14.44% 14.1… 14.6… 14.4… 14.7… 14.0… 14.1… 14.1…
## 6 EV 15.45% 15.34% 16.39% 16.1% 16.7… 15.7… 15.7… 15.5… 17.1… 16.9… 16.4%
## 7 F9 0.22% 0.2% 0.2% 0.2% 0.2% 0.19% 0.2% 0.19% 0.21% 0.2% 0.22%
## 8 FL 1.21% 1.19% 1.1% 1.1% 1.13% 0.89% 0.89% 0.9% 0.92% 0.82% 0.74%
## 9 HA 0.11% 0.11% 0.11% 0.11% 0.11% 0.11% 0.11% 0.11% 0.09% 0.07% 0.09%
## 10 MQ 8.41% 8.19% 7.82% 7.8% 7.93% 7.71% 7.68% 7.72% 8% 7.71% 7.54%
## 11 OO 0% <NA> <NA> <NA> <NA> 0.01% <NA> 0.01% 0.07% <NA> 0.02%
## 12 UA 17.17% 17.42% 17.24% 17.82% 17.2… 17.6… 17.2… 17.4… 17.0… 17.5… 17.8%
## 13 US 5.93% 6.22% 5.97% 6.1% 6.2% 6.15% 6.07% 6.07% 6.16% 6.39% 6.23%
## 14 VX 1.17% 1.09% 1.05% 1.64% 1.72% 1.7% 1.66% 1.67% 1.64% 1.63% 1.65%
## 15 WN 3.69% 3.65% 3.46% 3.46% 3.49% 3.64% 3.66% 3.57% 3.66% 3.78% 3.79%
## 16 YV 0.17% 0.19% 0.06% 0.13% 0.17% 0.17% 0.28% 0.22% 0.15% 0.23% 0.18%
## # ℹ 1 more variable: `12` <chr>
# Add `delay` as a copy of dep_delay. NOTE: this rebinds `flights` in the
# current environment, so later code sees the widened copy rather than the
# dataset exported by nycflights13.
flights <- flights |>
  mutate(delay = dep_delay)

# The single most-delayed departure of each month. slice_max() returns exactly
# one row per month even when several flights tie for the maximum. The earlier
# filter() + slice_head(n = 12) kept the first 12 matching rows overall, so a
# tie in one month could push a later month out of the result. Missing delays
# sort last, so they are never chosen over a recorded delay.
most_delayed_flights <- flights |>
  slice_max(delay, n = 1, with_ties = FALSE, by = month)
print(most_delayed_flights)
## # A tibble: 12 × 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 10 14 2042 900 702 2255 1127
## 3 2013 11 3 603 1645 798 829 1913
## 4 2013 12 5 756 1700 896 1058 2020
## 5 2013 2 10 2243 830 853 100 1106
## 6 2013 3 17 2321 810 911 135 1020
## 7 2013 4 10 1100 1900 960 1342 2211
## 8 2013 5 3 1133 2055 878 1250 2215
## 9 2013 6 15 1432 1935 1137 1607 2120
## 10 2013 7 22 845 1600 1005 1044 1815
## 11 2013 8 8 2334 1454 520 120 1710
## 12 2013 9 20 1139 1845 1014 1457 2210
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, delay <dbl>
# Load the multiple-choice responses table; no col_types are supplied, so
# readr guesses the column types and reports them.
responses <- read_csv(file = "multipleChoiceResponses1.csv")
## Rows: 16716 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): LearningPlatformUsefulnessArxiv, LearningPlatformUsefulnessBlogs, ...
## dbl (1): Age
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Count how often each usefulness rating was given to each learning platform.
# The LearningPlatformUsefulness* columns are stacked into long form, the
# shared prefix is stripped from the platform names, and unanswered (NA)
# cells are dropped before counting.
usefulness_counts <- responses |>
  select(starts_with("LearningPlatformUsefulness")) |>
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    values_drop_na = TRUE
  ) |>
  count(learning_platform, usefulness)
print(usefulness_counts)
## # A tibble: 54 × 3
## learning_platform usefulness n
## <chr> <chr> <int>
## 1 Arxiv Not Useful 37
## 2 Arxiv Somewhat useful 1038
## 3 Arxiv Very useful 1316
## 4 Blogs Not Useful 45
## 5 Blogs Somewhat useful 2406
## 6 Blogs Very useful 2314
## 7 College Not Useful 101
## 8 College Somewhat useful 1405
## 9 College Very useful 1853
## 10 Communities Not Useful 16
## # ℹ 44 more rows
# Per-platform share of answers rating the platform "Somewhat useful" or
# "Very useful". Derived from usefulness_counts (one row per platform and
# rating, NA answers already excluded) instead of re-selecting and re-pivoting
# the raw responses, so the two tables cannot drift apart.
usefulness_summary <- usefulness_counts %>%
  group_by(learning_platform) %>%
  summarise(
    # `n` is the count column of usefulness_counts, not dplyr::n().
    total_responses = sum(n),
    useful_responses = sum(
      n[usefulness %in% c("Somewhat useful", "Very useful")]
    ),
    perc_usefulness = useful_responses / total_responses * 100
  )
print(usefulness_summary)
## # A tibble: 18 × 4
## learning_platform total_responses useful_responses perc_usefulness
## <chr> <int> <int> <dbl>
## 1 Arxiv 2391 2354 98.5
## 2 Blogs 4765 4720 99.1
## 3 College 3359 3258 97.0
## 4 Communities 1142 1126 98.6
## 5 Company 981 940 95.8
## 6 Conferences 2182 2063 94.5
## 7 Courses 5992 5945 99.2
## 8 Documentation 2321 2279 98.2
## 9 Friends 1581 1530 96.8
## 10 Kaggle 6583 6527 99.1
## 11 Newsletters 1089 1033 94.9
## 12 Podcasts 1214 1090 89.8
## 13 Projects 4794 4755 99.2
## 14 SO 5640 5576 98.9
## 15 Textbook 4181 4112 98.3
## 16 TradeBook 333 324 97.3
## 17 Tutoring 1426 1394 97.8
## 18 YouTube 5229 5125 98.0
# Follower-count summary (mean, minimum, maximum) for each complaint label.
# Missing follower counts are ignored by every statistic.
twitter_data <- readRDS(file = "twitter_data.rds")
complaint_stats <- twitter_data |>
  group_by(complaint_label) |>
  summarise(
    across(
      usr_followers_count,
      list(
        avg = \(x) mean(x, na.rm = TRUE),
        min = \(x) min(x, na.rm = TRUE),
        max = \(x) max(x, na.rm = TRUE)
      ),
      # Yields avg_followers, min_followers, max_followers.
      .names = "{.fn}_followers"
    )
  )
print(complaint_stats)
## # A tibble: 2 × 4
## complaint_label avg_followers min_followers max_followers
## <chr> <dbl> <dbl> <dbl>
## 1 Complaint 3234. 0 1259803
## 2 Non-Complaint 4487. 0 2200851
# Install tidytext only when it is missing. An unconditional install.packages()
# re-downloads the package on every run, needs network access, and modifies
# the user's library as a side effect of running the analysis.
if (!requireNamespace("tidytext", quietly = TRUE)) {
  install.packages("tidytext")
}
library(tidytext)
# Split every tweet into word tokens with unnest_tokens()' default settings
# and tally the tokens, most frequent first.
word_counts <- twitter_data |>
  unnest_tokens(output = word, input = tweet_text) |>
  count(word, sort = TRUE)
print(word_counts)
## # A tibble: 18,601 × 2
## word n
## <chr> <int>
## 1 to 2834
## 2 the 2212
## 3 a 1989
## 4 i 1752
## 5 t.co 1405
## 6 http 1361
## 7 for 1356
## 8 you 1345
## 9 on 1289
## 10 and 1153
## # ℹ 18,591 more rows