library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)   # already attached as part of tidyverse 2.0.0
library(nycflights13)
library(dplyr)       # already attached as part of tidyverse 2.0.0
# Count flights arriving more than 5 minutes late, by month
lateflights <- flights %>% 
  filter(arr_delay > 5) %>% 
  group_by(month) %>% 
  summarise(lateflights = n()) %>% 
  ungroup()

lateflights
## # A tibble: 12 × 2
##    month lateflights
##    <int>       <int>
##  1     1        8988
##  2     2        8119
##  3     3        9033
##  4     4       10544
##  5     5        8490
##  6     6       10739
##  7     7       11518
##  8     8        9649
##  9     9        5347
## 10    10        7628
## 11    11        7485
## 12    12       12291
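The same monthly counts can be written more compactly with count(), which wraps group_by() plus summarise(n()); a minimal equivalent sketch:

# Equivalent one-liner using count()
flights %>% 
  filter(arr_delay > 5) %>% 
  count(month, name = "lateflights")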
# Flights per carrier within each month
total_flights_per_carrier <- flights %>%
  group_by(month, carrier) %>%
  summarise(total_flights_by_carrier = n())
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
# Total flights per month, across all carriers
total_flights_per_month <- flights %>%
  group_by(month) %>%
  summarise(total_flights_per_month = n())
# Each carrier's share of the month's traffic, formatted as a percentage
percentage_traffic_per_carrier <- total_flights_per_carrier %>%
  left_join(total_flights_per_month, by = "month") %>%
  mutate(percentage_traffic = total_flights_by_carrier / total_flights_per_month) %>%
  mutate(percentage_traffic = scales::percent(percentage_traffic)) %>% 
  select(month, carrier, percentage_traffic)
# Widen: one row per carrier, one column per month
percentage_traffic_per_carrier %>% spread(key = month, value = percentage_traffic)
## # A tibble: 16 × 13
##    carrier `1`     `2`     `3`   `4`   `5`   `6`   `7`   `8`   `9`   `10`  `11` 
##    <chr>   <chr>   <chr>   <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 9E      5.825%  5.8475% 5.64… 5.33… 5.07… 5.08… 5.07… 4.96… 5.58… 5.79… 5.84…
##  2 AA      10.347% 10.087… 9.66… 9.60… 9.73… 9.76… 9.79… 9.73… 9.47… 9.39… 9.45…
##  3 AS      0.230%  0.2244% 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.19…
##  4 B6      16.394% 16.444… 16.5… 15.9… 15.8… 16.3… 16.9… 16.8… 15.5… 15.0… 15.7…
##  5 DL      13.665% 13.803… 14.5… 14.4… 14.1… 14.6… 14.4… 14.7… 14.0… 14.1… 14.1…
##  6 EV      15.446% 15.338… 16.3… 16.1… 16.7… 15.7… 15.7… 15.5… 17.1… 16.9… 16.3…
##  7 F9      0.218%  0.1964% 0.19… 0.20… 0.20… 0.19… 0.19… 0.18… 0.21… 0.19… 0.22…
##  8 FL      1.215%  1.1863% 1.09… 1.09… 1.12… 0.89… 0.89… 0.89… 0.92… 0.81… 0.74…
##  9 HA      0.115%  0.1122% 0.10… 0.10… 0.10… 0.10… 0.10… 0.10… 0.09… 0.07… 0.09…
## 10 MQ      8.410%  8.1921% 7.82… 7.80… 7.93… 7.71… 7.68… 7.71… 8.00… 7.71… 7.54…
## 11 OO      0.004%  <NA>    <NA>  <NA>  <NA>  0.00… <NA>  0.01… 0.07… <NA>  0.01…
## 12 UA      17.172% 17.418… 17.2… 17.8… 17.2… 17.6… 17.2… 17.4… 17.0… 17.5… 17.8…
## 13 US      5.932%  6.2202% 5.96… 6.09… 6.19… 6.14… 6.07… 6.06… 6.15… 6.39… 6.23…
## 14 VX      1.170%  1.0861% 1.05… 1.64… 1.72… 1.70… 1.66… 1.66… 1.64… 1.63… 1.65…
## 15 WN      3.688%  3.6512% 3.46… 3.45… 3.49… 3.64… 3.65… 3.57… 3.66… 3.77… 3.78…
## 16 YV      0.170%  0.1924% 0.06… 0.13… 0.17… 0.17… 0.27… 0.22… 0.15… 0.22… 0.18…
## # ℹ 1 more variable: `12` <chr>
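spread() still works but has been superseded in tidyr; pivot_wider() produces the same table. A sketch (ungroup() just drops the month grouping left over from summarise()):

# Equivalent with the newer tidyr interface
percentage_traffic_per_carrier %>% 
  ungroup() %>% 
  pivot_wider(names_from = month, values_from = percentage_traffic)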
# For each month, keep the flight(s) with the largest departure delay
flights %>%
  group_by(month) %>%
  filter(dep_delay == max(dep_delay, na.rm = TRUE)) %>% 
  arrange(month)
## # A tibble: 12 × 19
## # Groups:   month [12]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     2    10     2243            830       853      100           1106
##  3  2013     3    17     2321            810       911      135           1020
##  4  2013     4    10     1100           1900       960     1342           2211
##  5  2013     5     3     1133           2055       878     1250           2215
##  6  2013     6    15     1432           1935      1137     1607           2120
##  7  2013     7    22      845           1600      1005     1044           1815
##  8  2013     8     8     2334           1454       520      120           1710
##  9  2013     9    20     1139           1845      1014     1457           2210
## 10  2013    10    14     2042            900       702     2255           1127
## 11  2013    11     3      603           1645       798      829           1913
## 12  2013    12     5      756           1700       896     1058           2020
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
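An alternative that states the per-group maximum more directly is slice_max(); a sketch (ties are kept unless with_ties = FALSE, matching the filter above):

# One row per month with the largest dep_delay
flights %>% 
  group_by(month) %>% 
  slice_max(dep_delay, n = 1) %>% 
  arrange(month)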
library(readr)  # already attached as part of tidyverse 2.0.0
multipleChoiceResponses <- read_csv("/cloud/project/multipleChoiceResponses1.csv")
## Rows: 16716 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): LearningPlatformUsefulnessArxiv, LearningPlatformUsefulnessBlogs, ...
## dbl  (1): Age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Reshape the LearningPlatformUsefulness* columns into long (platform, usefulness) pairs
usefulness_by_platform <- multipleChoiceResponses %>% 
  select(starts_with("LearningPlatformUsefulness")) %>%  
  set_names(names(.) %>% str_replace("LearningPlatformUsefulness", "")) %>% 
  gather(key = "learning_platform", value = "usefulness", convert = FALSE, na.rm = TRUE)
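gather() is likewise superseded; a pivot_longer() version of the same reshaping would be:

# Equivalent with pivot_longer(); values_drop_na mirrors na.rm = TRUE
usefulness_by_platform <- multipleChoiceResponses %>% 
  select(starts_with("LearningPlatformUsefulness")) %>% 
  rename_with(~ str_replace(.x, "LearningPlatformUsefulness", "")) %>% 
  pivot_longer(everything(), names_to = "learning_platform",
               values_to = "usefulness", values_drop_na = TRUE)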
usefulness_by_platform %>% group_by(learning_platform, usefulness) %>% 
  summarise(n = n()) %>% 
  ungroup()
## `summarise()` has grouped output by 'learning_platform'. You can override using
## the `.groups` argument.
## # A tibble: 54 × 3
##    learning_platform usefulness          n
##    <chr>             <chr>           <int>
##  1 Arxiv             Not Useful         37
##  2 Arxiv             Somewhat useful  1038
##  3 Arxiv             Very useful      1316
##  4 Blogs             Not Useful         45
##  5 Blogs             Somewhat useful  2406
##  6 Blogs             Very useful      2314
##  7 College           Not Useful        101
##  8 College           Somewhat useful  1405
##  9 College           Very useful      1853
## 10 Communities       Not Useful         16
## # ℹ 44 more rows
# Total responses per platform
total_usefulness_by_platform <- usefulness_by_platform %>% group_by(learning_platform) %>% summarise(tot = n())
# Responses rating the platform at least somewhat useful
usefulness_count <- usefulness_by_platform %>% filter(!grepl("Not Useful", usefulness, ignore.case = TRUE)) %>% 
  group_by(learning_platform) %>% summarise(count = n())
# Share of respondents finding each platform at least somewhat useful
perc_usefulness <- usefulness_count %>% left_join(total_usefulness_by_platform, by = "learning_platform") %>% 
  mutate(perc_usefulness = round(count / tot, digits = 3))
perc_usefulness
## # A tibble: 18 × 4
##    learning_platform count   tot perc_usefulness
##    <chr>             <int> <int>           <dbl>
##  1 Arxiv              2354  2391           0.985
##  2 Blogs              4720  4765           0.991
##  3 College            3258  3359           0.97 
##  4 Communities        1126  1142           0.986
##  5 Company             940   981           0.958
##  6 Conferences        2063  2182           0.945
##  7 Courses            5945  5992           0.992
##  8 Documentation      2279  2321           0.982
##  9 Friends            1530  1581           0.968
## 10 Kaggle             6527  6583           0.991
## 11 Newsletters        1033  1089           0.949
## 12 Podcasts           1090  1214           0.898
## 13 Projects           4755  4794           0.992
## 14 SO                 5576  5640           0.989
## 15 Textbook           4112  4181           0.983
## 16 TradeBook           324   333           0.973
## 17 Tutoring           1394  1426           0.978
## 18 YouTube            5125  5229           0.98
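The count-then-join steps could also be collapsed into one grouped summary; a sketch, assuming "Not Useful" is the only not-useful label in the data:

# One-pass alternative: flag not-useful responses and summarise directly
usefulness_by_platform %>% 
  group_by(learning_platform) %>% 
  summarise(
    count = sum(!str_detect(usefulness, regex("Not Useful", ignore_case = TRUE))),
    tot = n(),
    perc_usefulness = round(count / tot, digits = 3)
  )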
# Order platforms by usefulness share (highest first), then move Courses to the front
perc_usefulness_fct <- perc_usefulness %>% 
  mutate(learning_platform = fct(learning_platform) %>% 
           fct_reorder(perc_usefulness) %>% 
           fct_rev() %>% 
           fct_relevel("Courses", after = 0))
# Lollipop chart of the share finding each platform at least somewhat useful
perc_usefulness_fct %>% 
  ggplot(aes(x = learning_platform, y = perc_usefulness)) +
  geom_segment(aes(xend = learning_platform, yend = 0)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    x = "Learning Platform",
    y = "Percent finding at least somewhat useful"
  ) +
  scale_y_continuous(labels = scales::percent_format(scale = 100, suffix = "%"))
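With this many platforms the angled x labels can still be hard to read; flipping the axes is a common alternative, sketched here:

# Horizontal lollipop variant of the same plot
perc_usefulness_fct %>% 
  ggplot(aes(x = learning_platform, y = perc_usefulness)) +
  geom_segment(aes(xend = learning_platform, yend = 0)) +
  geom_point() +
  coord_flip() +
  labs(x = "Learning Platform", y = "Percent finding at least somewhat useful") +
  scale_y_continuous(labels = scales::percent_format())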

library(tidytext)
twitter_data <- readRDS("twitter_data.rds")
# Follower-count summary by complaint label
twitter_data %>%
  group_by(complaint_label) %>%
  summarise(
    avg_followers = mean(usr_followers_count, na.rm = TRUE),
    min_followers = min(usr_followers_count, na.rm = TRUE),
    max_followers = max(usr_followers_count, na.rm = TRUE)
  )
## # A tibble: 2 × 4
##   complaint_label avg_followers min_followers max_followers
##   <chr>                   <dbl>         <dbl>         <dbl>
## 1 Complaint               3234.             0       1259803
## 2 Non-Complaint           4487.             0       2200851
colnames(twitter_data)
## [1] "tweet_id"            "date"                "complaint_label"    
## [4] "tweet_text"          "usr_followers_count" "usr_verified"
# Rename the tweet text column for convenience before tokenising
twitter_data <- twitter_data %>%
  rename(text = tweet_text)

# Most frequent tokens, stop words still included
twitter_data %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)
## # A tibble: 18,536 × 2
##    word      n
##    <chr> <int>
##  1 to     2834
##  2 the    2212
##  3 a      1996
##  4 i      1754
##  5 t      1430
##  6 co     1411
##  7 http   1364
##  8 for    1356
##  9 you    1345
## 10 on     1289
## # ℹ 18,526 more rows
# stop_words is a data frame shipped with tidytext (already loaded),
# so no separate stopwords package is needed here
twitter_data %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)
## # A tibble: 17,933 × 2
##    word                n
##    <chr>           <int>
##  1 http             1364
##  2 flight            996
##  3 klm               912
##  4 united            908
##  5 americanair       830
##  6 delta             595
##  7 de                476
##  8 southwestair      474
##  9 usairways         473
## 10 british_airways   381
## # ℹ 17,923 more rows
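Tokens such as http, t, co, and de survive the standard English stop word list; one way to handle them is to append a small custom list before the anti_join. A sketch (the custom words are illustrative, not part of the original analysis):

# Hypothetical custom stop words for URL fragments and non-English filler
custom_stop_words <- tribble(
  ~word,   ~lexicon,
  "http",  "custom",
  "https", "custom",
  "t",     "custom",
  "co",    "custom",
  "de",    "custom"
)
twitter_data %>%
  unnest_tokens(word, text) %>%
  anti_join(bind_rows(stop_words, custom_stop_words), by = "word") %>%
  count(word, sort = TRUE)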