- Data activity (10 min)
- Intro to text mining (40 min)
- Break (5 min)
- Discuss Katrina, class readings (15 min)
- Intro to ProQuest TDM Studio (20 min)
- Problem set questions (Remainder)
2023-07-10
print.data.frame(groups)
##                  group 1         group 2                 group 3
## 1 Cortez, Hugo Alexander   Cai, Qingyuan          Somyurek, Ecem
## 2  Widodo, Ignazio Marco    Gupta, Umang      Jun, Ernest Ng Wei
## 3  Leong, Wen Hou Lester Knutson, Blue C Spindler, Laine Addison
## 4        Gnanam, Akash Y Tan, Zheng Yang     Premkrishna, Shrish
##                              group 4       group 5             group 6
## 1          Saccone, Alexander Connor           Albertini, Federico
## 2 Ramos, Jessica Andria Potestades    Ng, Michelle        Shah, Jainam
## 3                      Ning, Zhi Yan Dotson, Bianca               Ciara
## 4            Alsayegh, Aisha E H M I      Su, Barry       Lim, Fang Jan
##                    group 7
## 1              Tian, Zerui
## 2         Wan Rosli, Nadia
## 3 Huynh Le Hue Tam, Vivian
## 4      Andrew Yu Ming Xin,
| Row | Person | Birthday | Occupation |
|---|---|---|---|
| 1 | Joe | 12/3/1963 | Carpenter |
| 2 | Malik | 6/8/1978 | Architect |
| 3 | Suzanna | 4/3/2001 | Student |
| Row | County | Temperature | PM2.5 |
|---|---|---|---|
| 1 | Santa Clara | 78.1 | 12.1 |
| 2 | San Mateo | 82.3 | 32.1 |
| 3 | San Francisco | 65.4 | 44.7 |
| Row | Paper | Article | Text |
|---|---|---|---|
| 1 | New York Times | Study Compares Gas Stove Pollu… | Using |
| 2 | New York Times | Study Compares Gas Stove Pollu… | a |
| 3 | New York Times | Study Compares Gas Stove Pollu… | single |
| Row | Paper | Article | Text |
|---|---|---|---|
| 1 | New York Times | Study Compares Gas Stove Pollution to Secondhand Cigarette Smoke | Using a single gas-stove burner can raise indoor concentrations of benzene, … |
| 2 | New York Times | Study Compares Gas Stove Pollution to Secondhand Cigarette Smoke | For the peer-reviewed study, researchers at Stanford’s Doerr School of Sustainability … |
| 3 | New York Times | Study Compares Gas Stove Pollution to Secondhand Cigarette Smoke | In about a third of the homes, a single gas burner … |
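The jump from the second table to the first is what tidytext's unnest_tokens() performs: one row per document becomes one row per token, with the other columns repeated. A toy illustration (the articles data frame below is made up for demonstration):

library(dplyr)
library(tidytext)

# one row per document
articles <- data.frame(
  paper = "New York Times",
  text = "Using a single gas stove burner can raise indoor concentrations of benzene"
)

# one row per word; paper is repeated for each token
articles %>%
  unnest_tokens(word, text)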
library(readr)
ca_wf <- read_csv("ca_wf.csv")
What does %>% do?

ca_wf %>% mutate(new_var = "")

What does %<>% do? What does filter() do? What is the tidytext format?

library(tidytext)
library(dplyr)
library(magrittr) # for the %<>% assignment pipe

# first, set up liveblog dataframe
tidy_blogs <- ca_wf %>%
  filter(type == "liveblog")
# unnest tokens
tidy_blogs %<>%
  unnest_tokens(word, body_text) %>%
  anti_join(stop_words)
What is unnest_tokens() doing? What is anti_join(stop_words) doing?
What are the stop_words? You can run View(stop_words) to look at these. Why do we anti_join the stop_words?

# look at examples
tidy_blogs %>%
  select(type, word) %>%
  head()
## # A tibble: 6 × 2
##   type     word
##   <chr>    <chr>
## 1 liveblog 6pm
## 2 liveblog york
## 3 liveblog city
## 4 liveblog skies
## 5 liveblog shrouded
## 6 liveblog thick
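On the anti_join question: anti_join() keeps only rows of the left table whose word has no match in the right table, which is how the stop words get dropped. A tiny made-up demonstration:

# "the" and "of" appear in stop_words and are removed;
# "smoke" and "wildfires" survive
tokens <- data.frame(word = c("the", "smoke", "of", "wildfires"))
tokens %>%
  anti_join(stop_words, by = "word")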
We can use the count() function to get word frequencies:

# look at blog word frequencies
tidy_blogs %>%
  count(word, sort = TRUE)
## # A tibble: 2,379 × 2
##    word          n
##    <chr>     <int>
##  1 air         111
##  2 quality      69
##  3 smoke        68
##  4 wildfires    64
##  5 trump        59
##  6 pence        58
##  7 york         58
##  8 canada       55
##  9 president    55
## 10 city         46
## # ℹ 2,369 more rows
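tidy_articles is not built on this page; presumably it mirrors tidy_blogs. A minimal sketch, assuming the article rows of ca_wf are tagged type == "article":

# set up article dataframe (assumes type == "article" labels the articles)
tidy_articles <- ca_wf %>%
  filter(type == "article") %>%
  unnest_tokens(word, body_text) %>%
  anti_join(stop_words)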
# look at article frequencies
tidy_articles %>%
  count(word, sort = TRUE)
## # A tibble: 1,808 × 2
##    word          n
##    <chr>     <int>
##  1 air          96
##  2 smoke        58
##  3 quality      50
##  4 york         40
##  5 canada       37
##  6 wildfires    34
##  7 climate      33
##  8 fires        32
##  9 city         31
## 10 wednesday    30
## # ℹ 1,798 more rows
library(tidyr)
frequency <- bind_rows(tidy_blogs,
tidy_articles) %>%
count(type, word) %>%
group_by(type) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
pivot_wider(names_from = type, values_from = proportion)
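The resulting frequency tibble has one proportion column per type. A quick way to inspect it and compare the two sources; this check is an addition, and it assumes the two type values are "liveblog" and "article":

# peek at the most common liveblog words alongside their article proportions
frequency %>%
  arrange(desc(liveblog)) %>%
  head()

# correlate the two proportion columns
# (cor.test drops words that appear in only one source)
cor.test(frequency$liveblog, frequency$article)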
library(pdftools)
# read hearing into R
levees_hearing <- pdf_text("G:/My Drive/Data_Disasters/Course_site/Data/Katrina_hearings/katrina_hearing_levees.pdf") %>%
as.data.frame()
# set column names
colnames(levees_hearing) <- c("text")
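pdf_text() returns one character string per page, so each row of levees_hearing is one page of the hearing transcript. A quick sanity check (not part of the original code):

# number of rows should equal the page count of the PDF
nrow(levees_hearing)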
# unnest tokens
tidy_hearing <- levees_hearing %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)

# take a look
head(tidy_hearing)
##        word
## 1       hrg
## 2       109
## 3       526
## 4 hurricane
## 5   katrina
## 6    levees
# look at top words
tidy_hearing_counts <- tidy_hearing %>%
  count(word, sort = TRUE)

# now we can look at the top 20
head(tidy_hearing_counts, 20)
##        word   n
## 1        09 726
## 2      6601 617
## 3      2006 365
## 4        37 365
## 5      2002 364
## 6        31 364
## 7     00000 363
## 8    024446 363
## 9       0ct 363
## 10    24446 363
## 11      aug 363
## 12     docs 363
## 13      fmt 363
## 14      frm 363
## 15      jkt 363
## 16      pat 363
## 17       po 363
## 18      psn 363
## 19 saffairs 363
## 20     sfmt 363
library(magrittr)

# vector for additional stop words
addl_stop_words <- tidy_hearing_counts %>%
  filter(n > 300) %>%
  select(word)

# include 6633 as well
addl_stop_words %<>%
  bind_rows(data.frame(word = "6633"))
# add additional stop words
custom_stop_words <- bind_rows(data.frame(word = addl_stop_words,
lexicon = c("custom")),
stop_words)
# examine new stop words dataset
head(custom_stop_words)
##   word lexicon
## 1   09  custom
## 2 6601  custom
## 3 2006  custom
## 4   37  custom
## 5 2002  custom
## 6   31  custom
# remove custom stop words
tidy_hearing %<>%
  anti_join(custom_stop_words)

# new top words
tidy_hearing_counts <- tidy_hearing %>%
  count(word, sort = TRUE)
# now we can look at the top 20
head(tidy_hearing_counts, 20)
##         word   n
## 1     levees 127
## 2    senator 126
## 3      corps 121
## 4      levee  92
## 5    orleans  91
## 6      slide  86
## 7         dr  80
## 8       seed  76
## 9  hurricane  72
## 10  chairman  62
## 11     water  62
## 12 engineers  60
## 13       van  57
## 14   heerden  56
## 15     level  56
## 16     canal  55
## 17     storm  54
## 18     surge  48
## 19 lieberman  47
## 20 nicholson  44
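With the PDF boilerplate tokens removed, the top words are substantive. If you want to visualize them, a bar chart is a common next step; a sketch using ggplot2 (an addition, not part of the class code):

library(ggplot2)

# plot the 20 most frequent words, largest at the top
tidy_hearing_counts %>%
  slice_max(n, n = 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col() +
  labs(x = "count", y = NULL)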
- Hint: bind_rows() and group_by() might be useful (see the sketch below).
- Replace “SAMPLEDATA” with the name of your dataset.
- You can use tidytext inside ProQuest TDM Studio.
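For instance, combining two tidied datasets and counting words by source might look like this (dataset_a, dataset_b, and the source column are hypothetical placeholders, not course objects):

# hypothetical: stack two tidy datasets, then count words within each source
combined <- bind_rows(dataset_a, dataset_b)
combined %>%
  group_by(source) %>%
  count(word, sort = TRUE)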