Final Exam

# Load required libraries
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
# Install nycflights13 only when it is missing, so that re-running the script
# does not reinstall the package (and require network access) every time.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

library(nycflights13)

# Per-month number of flights whose arr_delay exceeds 5.
flights |>
  filter(arr_delay > 5) |>
  count(month, name = "lateflights")

## # A tibble: 12 × 2
##    month lateflights
##    <int>       <int>
##  1     1        8988
##  2     2        8119
##  3     3        9033
##  4     4       10544
##  5     5        8490
##  6     6       10739
##  7     7       11518
##  8     8        9649
##  9     9        5347
## 10    10        7628
## 11    11        7485
## 12    12       12291

# Percentage of each month's flights operated by each carrier, laid out as a
# table with one row per carrier and one column per month.
flights |>
  count(month, carrier, name = "count") |>
  mutate(percentage = count / sum(count) * 100, .by = month) |>
  # Pivot on carrier only. If `count` is left in as an identifying column,
  # every carrier/month pair lands on its own row and the remaining month
  # columns are padded with NA.
  pivot_wider(
    id_cols = carrier,
    names_from = month,
    values_from = percentage
  )

## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

## # A tibble: 160 × 14
##    carrier count    `1`   `2`    `3`   `4`    `5`   `6`    `7`    `8`   `9`
##    <chr>   <int>  <dbl> <dbl>  <dbl> <dbl>  <dbl> <dbl>  <dbl>  <dbl> <dbl>
##  1 9E       1573  5.83     NA NA        NA NA        NA NA     NA        NA
##  2 AA       2794 10.3      NA NA        NA NA        NA NA     NA        NA
##  3 AS         62  0.230    NA  0.215    NA  0.215    NA  0.211  0.211    NA
##  4 B6       4427 16.4      NA NA        NA NA        NA NA     NA        NA
##  5 DL       3690 13.7      NA NA        NA NA        NA NA     NA        NA
##  6 EV       4171 15.4      NA NA        NA NA        NA NA     NA        NA
##  7 F9         59  0.218    NA NA        NA NA        NA NA     NA        NA
##  8 FL        328  1.21     NA NA        NA NA        NA NA     NA        NA
##  9 HA         31  0.115    NA  0.108    NA  0.108    NA  0.105  0.106    NA
## 10 MQ       2271  8.41     NA NA        NA NA        NA NA     NA        NA
## # ℹ 150 more rows
## # ℹ 3 more variables: `10` <dbl>, `11` <dbl>, `12` <dbl>

# Within each month, keep every flight whose dep_time equals that month's
# maximum dep_time (missing values ignored). Ties are all retained and the
# result remains grouped by month.
flights |>
  group_by(month) |>
  filter(dep_time == max(dep_time, na.rm = TRUE))

## # A tibble: 35 × 19
## # Groups:   month [12]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     7     2359           2359         0      506            437
##  2  2013     1    12     2359           2359         0      429            437
##  3  2013     1    13     2359           2130       149      435            218
##  4  2013     1    18     2359           2359         0      439            437
##  5  2013     1    19     2359           2359         0      437            444
##  6  2013     1    25     2359           2359         0      500            444
##  7  2013    10    30     2400           2359         1      327            337
##  8  2013    11    27     2400           2359         1      515            445
##  9  2013    12     5     2400           2359         1      427            440
## 10  2013    12     9     2400           2359         1      432            440
## # ℹ 25 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

# Load the multiple-choice responses.
responses <- read.csv("multipleChoiceResponses.csv")

# Tally the answers recorded for each learning platform.
responses |>
  select(starts_with("LearningPlatformUsefulness")) |>
  pivot_longer(
    cols = everything(),
    names_to = "platform",
    values_to = "usefulness"
  ) |>
  # read.csv() keeps unanswered questions as "" rather than NA, so dropping
  # NA alone lets them through; remove both kinds of missing answer.
  filter(!is.na(usefulness), usefulness != "") |>
  mutate(platform = str_remove(platform, "LearningPlatformUsefulness")) |>
  count(platform, usefulness)

## # A tibble: 72 × 3
##    platform usefulness            n
##    <chr>    <chr>             <int>
##  1 Arxiv    ""                14325
##  2 Arxiv    "Not Useful"         37
##  3 Arxiv    "Somewhat useful"  1038
##  4 Arxiv    "Very useful"      1316
##  5 Blogs    ""                11951
##  6 Blogs    "Not Useful"         45
##  7 Blogs    "Somewhat useful"  2406
##  8 Blogs    "Very useful"      2314
##  9 College  ""                13357
## 10 College  "Not Useful"        101
## # ℹ 62 more rows

# For each learning platform: how many answers were anything other than
# "Not Useful" (`count`), how many answers were given (`tot`), and the ratio
# of the two (`perc_usefulness`).
responses |>
  select(starts_with("LearningPlatformUsefulness")) |>
  pivot_longer(
    cols = everything(),
    names_to = "platform",
    values_to = "usefulness"
  ) |>
  # Unanswered questions are stored as "" (not NA) by read.csv(). Without this
  # filter they would be counted as useful answers and inflate `tot`.
  filter(!is.na(usefulness), usefulness != "") |>
  mutate(platform = str_remove(platform, "LearningPlatformUsefulness")) |>
  group_by(platform) |>
  summarise(
    count = sum(usefulness != "Not Useful"),
    tot = n(),
    perc_usefulness = count / tot
  )

## # A tibble: 18 × 4
##    platform      count   tot perc_usefulness
##    <chr>         <int> <int>           <dbl>
##  1 Arxiv         16679 16716           0.998
##  2 Blogs         16671 16716           0.997
##  3 College       16615 16716           0.994
##  4 Communities   16700 16716           0.999
##  5 Company       16675 16716           0.998
##  6 Conferences   16597 16716           0.993
##  7 Courses       16669 16716           0.997
##  8 Documentation 16674 16716           0.997
##  9 Friends       16665 16716           0.997
## 10 Kaggle        16660 16716           0.997
## 11 Newsletters   16660 16716           0.997
## 12 Podcasts      16592 16716           0.993
## 13 Projects      16677 16716           0.998
## 14 SO            16652 16716           0.996
## 15 Textbook      16647 16716           0.996
## 16 TradeBook     16707 16716           0.999
## 17 Tutoring      16684 16716           0.998
## 18 YouTube       16612 16716           0.994

# Attach tidytext (used for unnest_tokens() further down) and read the saved
# tweet data from the RDS file.
library(tidytext)
twitter_data <- readRDS("twitter_data.rds")

# Mean, minimum and maximum of usr_followers_count for each complaint label,
# ignoring missing values.
twitter_data |>
  group_by(complaint_label) |>
  summarise(
    across(
      usr_followers_count,
      list(
        avg = \(x) mean(x, na.rm = TRUE),
        min = \(x) min(x, na.rm = TRUE),
        max = \(x) max(x, na.rm = TRUE)
      ),
      # Yields avg_followers, min_followers and max_followers.
      .names = "{.fn}_followers"
    )
  )

## # A tibble: 2 × 4
##   complaint_label avg_followers min_followers max_followers
##   <chr>                   <dbl>         <dbl>         <dbl>
## 1 Complaint               3234.             0       1259803
## 2 Non-Complaint           4487.             0       2200851

# Print the column names of the tweet data.
colnames(twitter_data)

## [1] "tweet_id"            "date"                "complaint_label"    
## [4] "tweet_text"          "usr_followers_count" "usr_verified"

# Rename tweet_text to `text`, the column name the tokenising steps below use.
twitter_data <- rename(twitter_data, text = tweet_text)

# Break the tweet text into tokens (one per row, in `word`) and list the
# tokens from most to least frequent.
twitter_data |>
  unnest_tokens(output = word, input = text) |>
  count(word, sort = TRUE)

## # A tibble: 18,536 × 2
##    word      n
##    <chr> <int>
##  1 to     2834
##  2 the    2212
##  3 a      1996
##  4 i      1754
##  5 t      1430
##  6 co     1411
##  7 http   1364
##  8 for    1356
##  9 you    1345
## 10 on     1289
## # ℹ 18,526 more rows

library(stopwords)

# Token frequencies again, this time discarding any token that appears in the
# `word` column of `stop_words` before counting.
twitter_data |>
  unnest_tokens(word, text) |>
  filter(!word %in% stop_words$word) |>
  count(word, sort = TRUE)

## # A tibble: 17,933 × 2
##    word                n
##    <chr>           <int>
##  1 http             1364
##  2 flight            996
##  3 klm               912
##  4 united            908
##  5 americanair       830
##  6 delta             595
##  7 de                476
##  8 southwestair      474
##  9 usairways         473
## 10 british_airways   381
## # ℹ 17,923 more rows

# Tokenise the tweet text and rank every token by how often it occurs
# (stop words included).
twitter_data |>
  unnest_tokens(word, text) |>
  group_by(word) |>
  summarise(n = n()) |>
  arrange(desc(n))

## # A tibble: 18,536 × 2
##    word      n
##    <chr> <int>
##  1 to     2834
##  2 the    2212
##  3 a      1996
##  4 i      1754
##  5 t      1430
##  6 co     1411
##  7 http   1364
##  8 for    1356
##  9 you    1345
## 10 on     1289
## # ℹ 18,526 more rows

Final Exam

Vicky

2024-12-31