# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
# Install nycflights13 (this only needs to be run once)
install.packages("nycflights13")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(nycflights13)
# Count flights that arrived more than 5 minutes late, by month
flights %>%
  filter(arr_delay > 5) %>%
  group_by(month) %>%
  summarise(lateflights = n())
## # A tibble: 12 × 2
## month lateflights
## <int> <int>
## 1 1 8988
## 2 2 8119
## 3 3 9033
## 4 4 10544
## 5 5 8490
## 6 6 10739
## 7 7 11518
## 8 8 9649
## 9 9 5347
## 10 10 7628
## 11 11 7485
## 12 12 12291
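# The same monthly table can be produced more compactly with count(), which wraps
# the group_by() + summarise(n()) pattern; the name argument just matches the
# column name used above (equivalent alternative, not part of the original run)
flights %>%
  filter(arr_delay > 5) %>%
  count(month, name = "lateflights")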
# Calculate percentage of traffic by carrier and month
flights %>%
  group_by(month, carrier) %>%
  summarise(count = n()) %>%
  group_by(month) %>%
  mutate(percentage = count / sum(count) * 100) %>%
  pivot_wider(names_from = month, values_from = percentage)
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
## # A tibble: 160 × 14
## carrier count `1` `2` `3` `4` `5` `6` `7` `8` `9`
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9E 1573 5.83 NA NA NA NA NA NA NA NA
## 2 AA 2794 10.3 NA NA NA NA NA NA NA NA
## 3 AS 62 0.230 NA 0.215 NA 0.215 NA 0.211 0.211 NA
## 4 B6 4427 16.4 NA NA NA NA NA NA NA NA
## 5 DL 3690 13.7 NA NA NA NA NA NA NA NA
## 6 EV 4171 15.4 NA NA NA NA NA NA NA NA
## 7 F9 59 0.218 NA NA NA NA NA NA NA NA
## 8 FL 328 1.21 NA NA NA NA NA NA NA NA
## 9 HA 31 0.115 NA 0.108 NA 0.108 NA 0.105 0.106 NA
## 10 MQ 2271 8.41 NA NA NA NA NA NA NA NA
## # ℹ 150 more rows
## # ℹ 3 more variables: `10` <dbl>, `11` <dbl>, `12` <dbl>
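# Note: the wide table above still carries the per-month count column, so each
# carrier appears once per month with NAs everywhere else. Dropping the count
# before pivoting gives one row per carrier with a column per month
# (a possible fix, not part of the original run)
flights %>%
  count(month, carrier) %>%
  group_by(month) %>%
  mutate(percentage = n / sum(n) * 100) %>%
  select(-n) %>%
  pivot_wider(names_from = month, values_from = percentage)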
# Find the flight(s) with the latest departure time in each month
flights %>%
  group_by(month) %>%
  filter(dep_time == max(dep_time, na.rm = TRUE))
## # A tibble: 35 × 19
## # Groups: month [12]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 7 2359 2359 0 506 437
## 2 2013 1 12 2359 2359 0 429 437
## 3 2013 1 13 2359 2130 149 435 218
## 4 2013 1 18 2359 2359 0 439 437
## 5 2013 1 19 2359 2359 0 437 444
## 6 2013 1 25 2359 2359 0 500 444
## 7 2013 10 30 2400 2359 1 327 337
## 8 2013 11 27 2400 2359 1 515 445
## 9 2013 12 5 2400 2359 1 427 440
## 10 2013 12 9 2400 2359 1 432 440
## # ℹ 25 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
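# slice_max() expresses the same idea more directly; dep_time is stored as an HHMM
# integer, so 2400 sorts above 2359, and with_ties = TRUE keeps every flight that
# shares the month's latest departure time (sketch, not in the original run)
flights %>%
  group_by(month) %>%
  slice_max(dep_time, n = 1, with_ties = TRUE) %>%
  ungroup()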
# Load data
responses <- read.csv("multipleChoiceResponses.csv")
# Clean and count usefulness responses per learning platform
responses %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(cols = everything(), names_to = "platform", values_to = "usefulness") %>%
  drop_na(usefulness) %>%
  mutate(platform = str_remove(platform, "LearningPlatformUsefulness")) %>%
  count(platform, usefulness)
## # A tibble: 72 × 3
## platform usefulness n
## <chr> <chr> <int>
## 1 Arxiv "" 14325
## 2 Arxiv "Not Useful" 37
## 3 Arxiv "Somewhat useful" 1038
## 4 Arxiv "Very useful" 1316
## 5 Blogs "" 11951
## 6 Blogs "Not Useful" 45
## 7 Blogs "Somewhat useful" 2406
## 8 Blogs "Very useful" 2314
## 9 College "" 13357
## 10 College "Not Useful" 101
## # ℹ 62 more rows
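# read.csv() keeps blank survey answers as "" rather than NA, which is why an empty
# usefulness category shows up above and drop_na() removes nothing. Reading the file
# with na.strings = "" turns those blanks into real NAs (alternative object name
# responses_na is hypothetical, not part of the original run)
responses_na <- read.csv("multipleChoiceResponses.csv", na.strings = "")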
responses %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(cols = everything(), names_to = "platform", values_to = "usefulness") %>%
  drop_na(usefulness) %>%
  mutate(platform = str_remove(platform, "LearningPlatformUsefulness")) %>%
  group_by(platform) %>%
  summarise(count = sum(usefulness != "Not Useful"), tot = n(), perc_usefulness = count / tot)
## # A tibble: 18 × 4
## platform count tot perc_usefulness
## <chr> <int> <int> <dbl>
## 1 Arxiv 16679 16716 0.998
## 2 Blogs 16671 16716 0.997
## 3 College 16615 16716 0.994
## 4 Communities 16700 16716 0.999
## 5 Company 16675 16716 0.998
## 6 Conferences 16597 16716 0.993
## 7 Courses 16669 16716 0.997
## 8 Documentation 16674 16716 0.997
## 9 Friends 16665 16716 0.997
## 10 Kaggle 16660 16716 0.997
## 11 Newsletters 16660 16716 0.997
## 12 Podcasts 16592 16716 0.993
## 13 Projects 16677 16716 0.998
## 14 SO 16652 16716 0.996
## 15 Textbook 16647 16716 0.996
## 16 TradeBook 16707 16716 0.999
## 17 Tutoring 16684 16716 0.998
## 18 YouTube 16612 16716 0.994
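# Because the blank "" answers are never removed, every platform ends up with the
# same total (16716) and perc_usefulness is inflated: "" counts as "useful" in the
# comparison above. Filtering out the blanks first bases the percentage only on
# respondents who actually rated the platform (sketch, not part of the original run)
responses %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(cols = everything(), names_to = "platform", values_to = "usefulness") %>%
  filter(usefulness != "") %>%
  mutate(platform = str_remove(platform, "LearningPlatformUsefulness")) %>%
  group_by(platform) %>%
  summarise(perc_usefulness = mean(usefulness != "Not Useful"))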
# Load tidytext and the Twitter data
library(tidytext)
twitter_data <- readRDS("twitter_data.rds")
# Compute follower-count statistics by complaint label
twitter_data %>%
  group_by(complaint_label) %>%
  summarise(
    avg_followers = mean(usr_followers_count, na.rm = TRUE),
    min_followers = min(usr_followers_count, na.rm = TRUE),
    max_followers = max(usr_followers_count, na.rm = TRUE)
  )
## # A tibble: 2 × 4
## complaint_label avg_followers min_followers max_followers
## <chr> <dbl> <dbl> <dbl>
## 1 Complaint 3234. 0 1259803
## 2 Non-Complaint 4487. 0 2200851
colnames(twitter_data)
## [1] "tweet_id" "date" "complaint_label"
## [4] "tweet_text" "usr_followers_count" "usr_verified"
# Rename the tweet text column to "text" for easier reference
twitter_data <- twitter_data %>%
  rename(text = tweet_text)
# Tokenize the tweets into words and count word frequencies
twitter_data %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)
## # A tibble: 18,536 × 2
## word n
## <chr> <int>
## 1 to 2834
## 2 the 2212
## 3 a 1996
## 4 i 1754
## 5 t 1430
## 6 co 1411
## 7 http 1364
## 8 for 1356
## 9 you 1345
## 10 on 1289
## # ℹ 18,526 more rows
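# The high-ranking tokens "http", "t", and "co" above come from shortened t.co
# links in the tweets; stripping URLs before tokenizing removes them
# (hypothetical cleaning step, not in the original analysis)
twitter_data %>%
  mutate(text = str_remove_all(text, "https?://\\S+")) %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)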
# Remove common stop words; the stop_words data frame ships with tidytext, while
# the stopwords package supplies additional lexicons via tidytext::get_stopwords()
library(stopwords)
twitter_data %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)
## # A tibble: 17,933 × 2
## word n
## <chr> <int>
## 1 http 1364
## 2 flight 996
## 3 klm 912
## 4 united 908
## 5 americanair 830
## 6 delta 595
## 7 de 476
## 8 southwestair 474
## 9 usairways 473
## 10 british_airways 381
## # ℹ 17,923 more rows
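# Non-English words such as "de" survive the English stop-word list; adding a
# second anti_join with one of get_stopwords()'s lexicons (here Dutch, as an
# example) filters them too (sketch, not in the original analysis)
twitter_data %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  anti_join(get_stopwords(language = "nl"), by = "word") %>%
  count(word, sort = TRUE)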