Final Exam

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(nycflights13)

# Monthly tally of flights that arrived more than five minutes late.
# filter() drops rows whose `arr_delay` is NA, so flights without a recorded
# arrival delay are not counted.
late_flights <- flights |>
  filter(arr_delay > 5) |>
  count(month, name = "late_flights")

print(late_flights)

## # A tibble: 12 × 2
##    month late_flights
##    <int>        <int>
##  1     1         8988
##  2     2         8119
##  3     3         9033
##  4     4        10544
##  5     5         8490
##  6     6        10739
##  7     7        11518
##  8     8         9649
##  9     9         5347
## 10    10         7628
## 11    11         7485
## 12    12        12291

# Share of each month's flights operated by each carrier, shown as a
# carrier-by-month table of percentage strings.

# Denominator: total number of flights per month.
total_flights_per_month <- flights %>%
  group_by(month) %>%
  summarise(total = n())

# Numerator: flights per carrier and month. `.groups = "drop_last"` states
# explicitly what summarise() did by default (the result stays grouped by
# carrier) and therefore no longer triggers the regrouping message.
carrier_flights_per_month <- flights %>%
  group_by(carrier, month) %>%
  summarise(count = n(), .groups = "drop_last")

# Attach the monthly total to every carrier/month row and format the share as
# a percentage string rounded to two decimals.
carrier_percentage <- carrier_flights_per_month %>%
  left_join(total_flights_per_month, by = "month") %>%
  mutate(percentage = paste0(round((count / total) * 100, 2), "%"))

# One row per carrier, one column per month. pivot_wider() replaces the
# superseded spread(); names_sort = TRUE keeps the month columns in ascending
# order regardless of row order. Carrier/month pairs with no flights stay NA.
spread_data <- carrier_percentage %>%
  select(-count, -total) %>%
  pivot_wider(
    names_from = month,
    values_from = percentage,
    names_sort = TRUE
  )

print(spread_data)

## # A tibble: 16 × 13
## # Groups:   carrier [16]
##    carrier `1`    `2`    `3`    `4`    `5`   `6`   `7`   `8`   `9`   `10`  `11` 
##    <chr>   <chr>  <chr>  <chr>  <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 9E      5.83%  5.85%  5.64%  5.33%  5.08% 5.09% 5.08% 4.96% 5.58% 5.79% 5.85%
##  2 AA      10.35% 10.09% 9.67%  9.61%  9.73% 9.76% 9.79% 9.74% 9.48% 9.4%  9.45%
##  3 AS      0.23%  0.22%  0.22%  0.21%  0.22% 0.21% 0.21% 0.21% 0.22% 0.21% 0.19%
##  4 B6      16.39% 16.44% 16.55% 15.94% 15.8… 16.3… 16.9… 16.8… 15.5… 15.1% 15.7…
##  5 DL      13.66% 13.8%  14.53% 14.44% 14.1… 14.6… 14.4… 14.7… 14.0… 14.1… 14.1…
##  6 EV      15.45% 15.34% 16.39% 16.1%  16.7… 15.7… 15.7… 15.5… 17.1… 16.9… 16.4%
##  7 F9      0.22%  0.2%   0.2%   0.2%   0.2%  0.19% 0.2%  0.19% 0.21% 0.2%  0.22%
##  8 FL      1.21%  1.19%  1.1%   1.1%   1.13% 0.89% 0.89% 0.9%  0.92% 0.82% 0.74%
##  9 HA      0.11%  0.11%  0.11%  0.11%  0.11% 0.11% 0.11% 0.11% 0.09% 0.07% 0.09%
## 10 MQ      8.41%  8.19%  7.82%  7.8%   7.93% 7.71% 7.68% 7.72% 8%    7.71% 7.54%
## 11 OO      0%     <NA>   <NA>   <NA>   <NA>  0.01% <NA>  0.01% 0.07% <NA>  0.02%
## 12 UA      17.17% 17.42% 17.24% 17.82% 17.2… 17.6… 17.2… 17.4… 17.0… 17.5… 17.8%
## 13 US      5.93%  6.22%  5.97%  6.1%   6.2%  6.15% 6.07% 6.07% 6.16% 6.39% 6.23%
## 14 VX      1.17%  1.09%  1.05%  1.64%  1.72% 1.7%  1.66% 1.67% 1.64% 1.63% 1.65%
## 15 WN      3.69%  3.65%  3.46%  3.46%  3.49% 3.64% 3.66% 3.57% 3.66% 3.78% 3.79%
## 16 YV      0.17%  0.19%  0.06%  0.13%  0.17% 0.17% 0.28% 0.22% 0.15% 0.23% 0.18%
## # ℹ 1 more variable: `12` <chr>

# Expose departure delay under the shorter name `delay`. This assignment
# creates a workspace copy of `flights` (with the extra column) that shadows
# the nycflights13 dataset for the rest of the script.
flights <- flights |>
  mutate(delay = dep_delay)

# For every month keep the flight(s) whose departure delay equals that month's
# maximum. Grouping is per-call via `.by`, so the result is ungrouped and rows
# keep their original order; rows with a missing delay are dropped by filter().
# The former `ungroup() |> slice_head(n = 12)` was removed: it did nothing when
# each month had a single maximum, and silently cut off later months whenever
# a tie produced more than twelve rows.
most_delayed_flights <- flights |>
  filter(delay == max(delay, na.rm = TRUE), .by = month)

print(most_delayed_flights)

## # A tibble: 12 × 20
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013    10    14     2042            900       702     2255           1127
##  3  2013    11     3      603           1645       798      829           1913
##  4  2013    12     5      756           1700       896     1058           2020
##  5  2013     2    10     2243            830       853      100           1106
##  6  2013     3    17     2321            810       911      135           1020
##  7  2013     4    10     1100           1900       960     1342           2211
##  8  2013     5     3     1133           2055       878     1250           2215
##  9  2013     6    15     1432           1935      1137     1607           2120
## 10  2013     7    22      845           1600      1005     1044           1815
## 11  2013     8     8     2334           1454       520      120           1710
## 12  2013     9    20     1139           1845      1014     1457           2210
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, delay <dbl>

# Read the multiple-choice responses CSV into a tibble; the path is relative
# to the current working directory.
responses <- read_csv("multipleChoiceResponses1.csv")

## Rows: 16716 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): LearningPlatformUsefulnessArxiv, LearningPlatformUsefulnessBlogs, ...
## dbl  (1): Age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Count how many times each usefulness rating was given to each learning
# platform. The platform name is the column name with its
# "LearningPlatformUsefulness" prefix stripped; unanswered (NA) cells are
# discarded while reshaping.
usefulness_counts <- responses |>
  select(starts_with("LearningPlatformUsefulness")) |>
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    values_to = "usefulness",
    values_drop_na = TRUE
  ) |>
  mutate(
    learning_platform = str_remove(
      learning_platform,
      "LearningPlatformUsefulness"
    )
  ) |>
  count(learning_platform, usefulness)

print(usefulness_counts)

## # A tibble: 54 × 3
##    learning_platform usefulness          n
##    <chr>             <chr>           <int>
##  1 Arxiv             Not Useful         37
##  2 Arxiv             Somewhat useful  1038
##  3 Arxiv             Very useful      1316
##  4 Blogs             Not Useful         45
##  5 Blogs             Somewhat useful  2406
##  6 Blogs             Very useful      2314
##  7 College           Not Useful        101
##  8 College           Somewhat useful  1405
##  9 College           Very useful      1853
## 10 Communities       Not Useful         16
## # ℹ 44 more rows

# Per learning platform: number of answers, number of answers rated
# "Somewhat useful" or "Very useful", and the latter as a percentage of the
# former. Unanswered (NA) cells are discarded while reshaping.
usefulness_summary <- responses |>
  select(starts_with("LearningPlatformUsefulness")) |>
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    values_to = "usefulness",
    values_drop_na = TRUE
  ) |>
  mutate(
    learning_platform = str_remove(
      learning_platform,
      "LearningPlatformUsefulness"
    )
  ) |>
  group_by(learning_platform) |>
  summarise(
    total_responses = n(),
    useful_responses = sum(
      usefulness %in% c("Somewhat useful", "Very useful")
    ),
    perc_usefulness = useful_responses / total_responses * 100
  )

print(usefulness_summary)

## # A tibble: 18 × 4
##    learning_platform total_responses useful_responses perc_usefulness
##    <chr>                       <int>            <int>           <dbl>
##  1 Arxiv                        2391             2354            98.5
##  2 Blogs                        4765             4720            99.1
##  3 College                      3359             3258            97.0
##  4 Communities                  1142             1126            98.6
##  5 Company                       981              940            95.8
##  6 Conferences                  2182             2063            94.5
##  7 Courses                      5992             5945            99.2
##  8 Documentation                2321             2279            98.2
##  9 Friends                      1581             1530            96.8
## 10 Kaggle                       6583             6527            99.1
## 11 Newsletters                  1089             1033            94.9
## 12 Podcasts                     1214             1090            89.8
## 13 Projects                     4794             4755            99.2
## 14 SO                           5640             5576            98.9
## 15 Textbook                     4181             4112            98.3
## 16 TradeBook                     333              324            97.3
## 17 Tutoring                     1426             1394            97.8
## 18 YouTube                      5229             5125            98.0

# Load the serialized tweet data from the working directory.
twitter_data <- readRDS("twitter_data.rds")

# Mean, minimum and maximum follower count for each complaint label, ignoring
# missing follower counts. across() applies the three summaries to the one
# column and `.names` yields avg_followers / min_followers / max_followers.
complaint_stats <- twitter_data |>
  group_by(complaint_label) |>
  summarise(
    across(
      usr_followers_count,
      list(
        avg = \(x) mean(x, na.rm = TRUE),
        min = \(x) min(x, na.rm = TRUE),
        max = \(x) max(x, na.rm = TRUE)
      ),
      .names = "{.fn}_followers"
    )
  )

print(complaint_stats)

## # A tibble: 2 × 4
##   complaint_label avg_followers min_followers max_followers
##   <chr>                   <dbl>         <dbl>         <dbl>
## 1 Complaint               3234.             0       1259803
## 2 Non-Complaint           4487.             0       2200851

# Install tidytext only when it is not already available, so re-running the
# script does not reinstall the package (and hit the network) every time.
if (!requireNamespace("tidytext", quietly = TRUE)) {
  install.packages("tidytext")
}

library(tidytext)

# Break each tweet's text into one-word tokens (one row per token), then
# count occurrences of every word, most frequent first.
word_counts <- twitter_data |>
  unnest_tokens(output = word, input = tweet_text) |>
  count(word, sort = TRUE)

print(word_counts)

## # A tibble: 18,601 × 2
##    word      n
##    <chr> <int>
##  1 to     2834
##  2 the    2212
##  3 a      1989
##  4 i      1752
##  5 t.co   1405
##  6 http   1361
##  7 for    1356
##  8 you    1345
##  9 on     1289
## 10 and    1153
## # ℹ 18,591 more rows

Final Exam

113035131 Baljingarav

2024-12-31