library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(nycflights13)
library(dplyr)
# Number of flights per month that arrived more than 5 minutes late.
# Rows with a missing arr_delay are dropped by the filter.
lateflights <- flights %>%
  filter(arr_delay > 5) %>%
  count(month, name = "lateflights")

lateflights
## # A tibble: 12 × 2
## month lateflights
## <int> <int>
## 1 1 8988
## 2 2 8119
## 3 3 9033
## 4 4 10544
## 5 5 8490
## 6 6 10739
## 7 7 11518
## 8 8 9649
## 9 9 5347
## 10 10 7628
## 11 11 7485
## 12 12 12291
# Share of each month's flights operated by each carrier, printed as a
# carrier x month table.

# Flights per (month, carrier). `.groups = "drop"` returns an ungrouped tibble,
# so the formatting step below runs once over the whole column instead of once
# per month.
total_flights_per_carrier <- flights %>%
  group_by(month, carrier) %>%
  summarise(total_flights_by_carrier = n(), .groups = "drop")

# Flights per month: the denominator of each carrier's share.
total_flights_per_month <- flights %>%
  group_by(month) %>%
  summarise(total_flights_per_month = n())

# Long table with one row per (month, carrier) and the share as a string.
# `accuracy` is fixed because scales::percent() otherwise picks the number of
# decimals from the data it is given, which produced a different precision in
# every month. Three decimals keep very small shares (such as 0.004%) from
# rounding to zero.
percentage_traffic_per_carrier <- total_flights_per_carrier %>%
  left_join(total_flights_per_month, by = "month") %>%
  mutate(
    percentage_traffic = scales::percent(
      total_flights_by_carrier / total_flights_per_month,
      accuracy = 0.001
    )
  ) %>%
  select(month, carrier, percentage_traffic)

# One row per carrier, one column per month; carriers with no flights in a
# month get NA. Sorting rows and columns reproduces the layout spread() gave.
percentage_traffic_per_carrier %>%
  pivot_wider(
    names_from = month,
    values_from = percentage_traffic,
    names_sort = TRUE
  ) %>%
  arrange(carrier)
## # A tibble: 16 × 13
## carrier `1` `2` `3` `4` `5` `6` `7` `8` `9` `10` `11`
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 9E 5.825% 5.8475% 5.64… 5.33… 5.07… 5.08… 5.07… 4.96… 5.58… 5.79… 5.84…
## 2 AA 10.347% 10.087… 9.66… 9.60… 9.73… 9.76… 9.79… 9.73… 9.47… 9.39… 9.45…
## 3 AS 0.230% 0.2244% 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.21… 0.19…
## 4 B6 16.394% 16.444… 16.5… 15.9… 15.8… 16.3… 16.9… 16.8… 15.5… 15.0… 15.7…
## 5 DL 13.665% 13.803… 14.5… 14.4… 14.1… 14.6… 14.4… 14.7… 14.0… 14.1… 14.1…
## 6 EV 15.446% 15.338… 16.3… 16.1… 16.7… 15.7… 15.7… 15.5… 17.1… 16.9… 16.3…
## 7 F9 0.218% 0.1964% 0.19… 0.20… 0.20… 0.19… 0.19… 0.18… 0.21… 0.19… 0.22…
## 8 FL 1.215% 1.1863% 1.09… 1.09… 1.12… 0.89… 0.89… 0.89… 0.92… 0.81… 0.74…
## 9 HA 0.115% 0.1122% 0.10… 0.10… 0.10… 0.10… 0.10… 0.10… 0.09… 0.07… 0.09…
## 10 MQ 8.410% 8.1921% 7.82… 7.80… 7.93… 7.71… 7.68… 7.71… 8.00… 7.71… 7.54…
## 11 OO 0.004% <NA> <NA> <NA> <NA> 0.00… <NA> 0.01… 0.07… <NA> 0.01…
## 12 UA 17.172% 17.418… 17.2… 17.8… 17.2… 17.6… 17.2… 17.4… 17.0… 17.5… 17.8…
## 13 US 5.932% 6.2202% 5.96… 6.09… 6.19… 6.14… 6.07… 6.06… 6.15… 6.39… 6.23…
## 14 VX 1.170% 1.0861% 1.05… 1.64… 1.72… 1.70… 1.66… 1.66… 1.64… 1.63… 1.65…
## 15 WN 3.688% 3.6512% 3.46… 3.45… 3.49… 3.64… 3.65… 3.57… 3.66… 3.77… 3.78…
## 16 YV 0.170% 0.1924% 0.06… 0.13… 0.17… 0.17… 0.27… 0.22… 0.15… 0.22… 0.18…
## # ℹ 1 more variable: `12` <chr>
# For each month, keep the flight(s) with the largest departure delay.
# Rows with a missing dep_delay are dropped because the comparison yields NA.
# The result stays grouped by month and is sorted by month ascending
# (`-desc(month)` was a double negation of exactly that ordering).
flights %>%
  group_by(month) %>%
  filter(dep_delay == max(dep_delay, na.rm = TRUE)) %>%
  arrange(month)
## # A tibble: 12 × 19
## # Groups: month [12]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 2 10 2243 830 853 100 1106
## 3 2013 3 17 2321 810 911 135 1020
## 4 2013 4 10 1100 1900 960 1342 2211
## 5 2013 5 3 1133 2055 878 1250 2215
## 6 2013 6 15 1432 1935 1137 1607 2120
## 7 2013 7 22 845 1600 1005 1044 1815
## 8 2013 8 8 2334 1454 520 120 1710
## 9 2013 9 20 1139 1845 1014 1457 2210
## 10 2013 10 14 2042 900 702 2255 1127
## 11 2013 11 3 603 1645 798 829 1913
## 12 2013 12 5 756 1700 896 1058 2020
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
library(readr)
# Load the survey responses; column types are guessed by readr.
# NOTE(review): absolute path to a project directory; confirm it exists where
# this script is run.
multipleChoiceResponses <- readr::read_csv(
  file = "/cloud/project/multipleChoiceResponses1.csv"
)
## Rows: 16716 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (46): LearningPlatformUsefulnessArxiv, LearningPlatformUsefulnessBlogs, ...
## dbl (1): Age
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Reshape the LearningPlatformUsefulness* columns into long form: one row per
# non-missing answer, with the platform name (column name minus the common
# prefix) in `learning_platform` and the answer in `usefulness`.
# pivot_longer() replaces the superseded gather(); `cols_vary = "slowest"`
# keeps gather()'s column-by-column row order.
usefulness_by_platform <- multipleChoiceResponses %>%
  select(starts_with("LearningPlatformUsefulness")) %>%
  pivot_longer(
    cols = everything(),
    names_to = "learning_platform",
    names_prefix = "LearningPlatformUsefulness",
    values_to = "usefulness",
    values_drop_na = TRUE,
    cols_vary = "slowest"
  )
# Number of responses for every (platform, usefulness level) pair.
# count() returns the same ungrouped, sorted tibble as
# group_by() + summarise(n = n()) + ungroup(), without the
# "grouped output" message.
usefulness_by_platform %>%
  count(learning_platform, usefulness)
## # A tibble: 54 × 3
## learning_platform usefulness n
## <chr> <chr> <int>
## 1 Arxiv Not Useful 37
## 2 Arxiv Somewhat useful 1038
## 3 Arxiv Very useful 1316
## 4 Blogs Not Useful 45
## 5 Blogs Somewhat useful 2406
## 6 Blogs Very useful 2314
## 7 College Not Useful 101
## 8 College Somewhat useful 1405
## 9 College Very useful 1853
## 10 Communities Not Useful 16
## # ℹ 44 more rows
# Share of respondents per platform whose answer is anything other than
# "Not Useful" (matched case-insensitively).

# All answers per platform (denominator).
total_usefulness_by_platform <- usefulness_by_platform %>%
  count(learning_platform, name = "tot")

# Answers per platform that do not contain "Not Useful" (numerator).
usefulness_count <- usefulness_by_platform %>%
  filter(!grepl("Not Useful", usefulness, ignore.case = TRUE)) %>%
  count(learning_platform, name = "count")

# Join numerator and denominator and round the ratio to three decimals.
perc_usefulness <- usefulness_count %>%
  left_join(total_usefulness_by_platform, by = "learning_platform") %>%
  mutate(perc_usefulness = round(count / tot, digits = 3))

perc_usefulness
## # A tibble: 18 × 4
## learning_platform count tot perc_usefulness
## <chr> <int> <int> <dbl>
## 1 Arxiv 2354 2391 0.985
## 2 Blogs 4720 4765 0.991
## 3 College 3258 3359 0.97
## 4 Communities 1126 1142 0.986
## 5 Company 940 981 0.958
## 6 Conferences 2063 2182 0.945
## 7 Courses 5945 5992 0.992
## 8 Documentation 2279 2321 0.982
## 9 Friends 1530 1581 0.968
## 10 Kaggle 6527 6583 0.991
## 11 Newsletters 1033 1089 0.949
## 12 Podcasts 1090 1214 0.898
## 13 Projects 4755 4794 0.992
## 14 SO 5576 5640 0.989
## 15 Textbook 4112 4181 0.983
## 16 TradeBook 324 333 0.973
## 17 Tutoring 1394 1426 0.978
## 18 YouTube 5125 5229 0.98
# Turn `learning_platform` into a factor ordered for plotting: levels sorted by
# perc_usefulness, reversed so the highest share comes first, then "Courses"
# moved to the very front.
perc_usefulness_fct <- perc_usefulness %>%
  mutate(
    learning_platform = fct_relevel(
      fct_rev(fct_reorder(fct(learning_platform), perc_usefulness)),
      "Courses",
      after = 0
    )
  )
# Lollipop chart: one stem (segment from 0) and one dot per learning platform,
# in the factor order set on `learning_platform`.
ggplot(
  data = perc_usefulness_fct,
  mapping = aes(x = learning_platform, y = perc_usefulness)
) +
  geom_segment(mapping = aes(xend = learning_platform, yend = 0)) +
  geom_point() +
  # Show the 0-1 proportions as percentages on the y axis.
  scale_y_continuous(
    labels = scales::percent_format(scale = 100, suffix = "%")
  ) +
  labs(
    x = "Learning Platform",
    y = "Percent finding at least somewhat useful"
  ) +
  # Tilt the platform names so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(tidytext)
# Load the tweet data set (path is relative to the working directory).
twitter_data <- readRDS(file = "twitter_data.rds")

# Mean, minimum and maximum follower count for each complaint label,
# ignoring missing follower counts.
twitter_data %>%
  group_by(complaint_label) %>%
  summarise(
    avg_followers = mean(usr_followers_count, na.rm = TRUE),
    min_followers = min(usr_followers_count, na.rm = TRUE),
    max_followers = max(usr_followers_count, na.rm = TRUE)
  )
## # A tibble: 2 × 4
## complaint_label avg_followers min_followers max_followers
## <chr> <dbl> <dbl> <dbl>
## 1 Complaint 3234. 0 1259803
## 2 Non-Complaint 4487. 0 2200851
# List the column names of the tweet data.
names(twitter_data)
## [1] "tweet_id" "date" "complaint_label"
## [4] "tweet_text" "usr_followers_count" "usr_verified"
# Rename the tweet body column to `text` for the tokenising steps.
twitter_data <- rename(twitter_data, text = tweet_text)

# Split every tweet into one token per row and count how often each token
# occurs, most frequent first.
twitter_data %>%
  unnest_tokens(output = word, input = text) %>%
  count(word, sort = TRUE)
## # A tibble: 18,536 × 2
## word n
## <chr> <int>
## 1 to 2834
## 2 the 2212
## 3 a 1996
## 4 i 1754
## 5 t 1430
## 6 co 1411
## 7 http 1364
## 8 for 1356
## 9 you 1345
## 10 on 1289
## # ℹ 18,526 more rows
library(stopwords)
# Token frequencies after dropping every token that appears in the `word`
# column of `stop_words` (the stop-word table shipped with tidytext).
twitter_data %>%
  unnest_tokens(output = word, input = text) %>%
  anti_join(y = stop_words, by = "word") %>%
  count(word, sort = TRUE)
## # A tibble: 17,933 × 2
## word n
## <chr> <int>
## 1 http 1364
## 2 flight 996
## 3 klm 912
## 4 united 908
## 5 americanair 830
## 6 delta 595
## 7 de 476
## 8 southwestair 474
## 9 usairways 473
## 10 british_airways 381
## # ℹ 17,923 more rows
# Unfiltered token frequencies (stop words included), most frequent first.
# This repeats the count computed earlier, before stop words were removed.
twitter_data %>%
  unnest_tokens(output = word, input = text) %>%
  count(word, sort = TRUE)
## # A tibble: 18,536 × 2
## word n
## <chr> <int>
## 1 to 2834
## 2 the 2212
## 3 a 1996
## 4 i 1754
## 5 t 1430
## 6 co 1411
## 7 http 1364
## 8 for 1356
## 9 you 1345
## 10 on 1289
## # ℹ 18,526 more rows