- Data activity (10 min)
- Introduction to Topic Modelling (40 min)
- Break (5 min)
- Working with ggplot2 (35 min)
- Final project time (Remainder)
2023-07-27
# ggplot2 (35 min)
# print the group assignments as a plain data frame
# (groups is not defined in the visible part of this file)
print.data.frame(groups)
## group 1 group 2 group 3 ## 1 Alsayegh, Aisha E H M I Shah, Jainam ## 2 Knutson, Blue C Huynh Le Hue Tam, Vivian Andrew Yu Ming Xin, ## 3 Wan Rosli, Nadia Spindler, Laine Addison Dotson, Bianca Ciara ## 4 Ning, Zhi Yan Gnanam, Akash Y Widodo, Ignazio Marco ## group 4 group 5 group 6 ## 1 Cortez, Hugo Alexander Tian, Zerui ## 2 Leong, Wen Hou Lester Tan, Zheng Yang Gupta, Umang ## 3 Jun, Ernest Ng Wei Saccone, Alexander Connor Somyurek, Ecem ## 4 Premkrishna, Shrish Lim, Fang Jan Albertini, Federico ## group 7 ## 1 Su, Barry ## 2 Cai, Qingyuan ## 3 Ng, Michelle ## 4 Ramos, Jessica Andria Potestades
# search for the exact phrase "Colorado River" (inner double quotes are part
# of the query string)
# NOTE(review): gu_content() is presumably from the guardianapi package, which
# is not loaded in the visible code - confirm
co_river <- gu_content('"Colorado River"')
library(readr)
# read co_river from a CSV file instead; this overwrites the result of the
# gu_content() call above
co_river <- read_csv("guardian_co_river.csv")
# take a look at the first rows
head(co_river)
## # A tibble: 6 × 49 ## id type section_id section_name web_publication_date web_title web_url ## <chr> <chr> <chr> <chr> <dttm> <chr> <chr> ## 1 us-news/… arti… us-news US news 2023-06-22 22:46:32 Supreme … https:… ## 2 global/2… arti… global Global 2023-05-31 17:00:22 The farm… https:… ## 3 us-news/… arti… us-news US news 2023-05-22 22:15:09 US state… https:… ## 4 environm… arti… environme… Environment 2023-04-12 06:20:06 US consi… https:… ## 5 us-news/… arti… us-news US news 2023-04-18 11:01:34 Colorado… https:… ## 6 environm… arti… us-news US news 2023-05-23 17:52:41 Historic… https:… ## # ℹ 42 more variables: api_url <chr>, tags <lgl>, is_hosted <lgl>, ## # pillar_id <chr>, pillar_name <chr>, headline <chr>, standfirst <chr>, ## # trail_text <chr>, byline <chr>, main <chr>, body <chr>, wordcount <dbl>, ## # first_publication_date <dttm>, is_inappropriate_for_sponsorship <lgl>, ## # is_premoderated <lgl>, last_modified <dttm>, production_office <chr>, ## # publication <chr>, short_url <chr>, should_hide_adverts <lgl>, ## # show_in_related_content <lgl>, thumbnail <chr>, legally_sensitive <lgl>, …
library(dplyr)
library(tidytext)

# create a tidytext dataset: split each article's body_text into a `word`
# column, then drop every word that appears in stop_words
tidy_co <- co_river %>%
  unnest_tokens(word, body_text) %>%
  anti_join(stop_words)
# create counts of each word - we will include id in the count()
# function to also get n values for words in each article
tidy_co_counts <- tidy_co %>%
  count(id, word, sort = TRUE)
| Article | nuclear | bomb | fallout | marks | dawn | of | new | epoch | in | which | … |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Canadian Lake Chosen to Represent Start of Anthropocene | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | |
| This quiet lake could mark the start of a new Anthropocene epoch | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 1 | 2 | 0 | |
# recast as dtm: id identifies the document, word the term, n the count
co_dtm <- tidy_co_counts %>%
  cast_dtm(id, word, n)
library(topicmodels)

# lda: fit a model with 2 topics; the seed is fixed so the fit is repeatable
co_lda <- LDA(co_dtm, k = 2, control = list(seed = 1))
# colorado topics: extract the "beta" matrix of the fitted model as a tidy table
co_topics <- tidy(co_lda, matrix = "beta")
# format for plot: keep the 10 highest-beta terms of each topic, sort them,
# and reorder `term` within each topic so the facets plot in beta order
co_top_terms <- co_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(term = reorder_within(term, beta, topic))
library(ggplot2)

# plot our topics! one facet of horizontal bars per topic, with free scales
# so each facet shows only its own terms
ggplot(co_top_terms, aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()
# ggplot2
library(ggplot2)

# a blank canvas
# (template only: the empty slots are where the data and the x / y mappings go)
ggplot( , aes( , ))
# ggplot2
library(ggplot2)

# create df
df <- data.frame(x = c(1.5, 3), y = c(1.5, 1))

# ggplot: data and x / y mappings only, no geom layer yet
ggplot(df, aes(x, y))
# ggplot2
# create df
df <- data.frame(x = c(1.5, 3), y = c(1.5, 1))

# ggplot: add a point layer
ggplot(df, aes(x, y)) +
  geom_point()
# ggplot2
# create df
df <- data.frame(x = c(1.5, 3), y = c(1.5, 1))

# ggplot: add a bar layer
# NOTE(review): geom_bar() counts rows (stat_count) and does not accept both
# x and y mapped - presumably shown on purpose as a contrast with geom_col();
# confirm
ggplot(df, aes(x, y)) +
  geom_bar()
# ggplot2
# create df
df <- data.frame(x = c(1.5, 3), y = c(1.5, 1))

# ggplot: add a column layer (bar heights taken from y)
ggplot(df, aes(x, y)) +
  geom_col()
# ggplot2
# create df
df <- data.frame(x = c(1.5, 3), y = c(1.5, 1))

# ggplot: add a line layer
ggplot(df, aes(x, y)) +
  geom_line()
# ggplot2
# create df: z labels which line each point belongs to
df <- data.frame(
  x = c(1.5, 3, 1.5, 3),
  y = c(1.5, 1, 2.4, 3.5),
  z = c("a", "a", "b", "b")
)

# ggplot: group = z draws one line per value of z
ggplot(df, aes(x, y, group = z)) +
  geom_line()
# ggplot2: aes within geom_line() to give lines different aesthetics
# create df: z labels which line each point belongs to
df <- data.frame(
  x = c(1.5, 3, 1.5, 3),
  y = c(1.5, 1, 2.4, 3.5),
  z = c("a", "a", "b", "b")
)

# ggplot: colour each line by its z value
ggplot(df, aes(x, y, group = z)) +
  geom_line(aes(color = z))
# ggplot2
# create df: z labels which line each point belongs to
df <- data.frame(
  x = c(1.5, 3, 1.5, 3),
  y = c(1.5, 1, 2.4, 3.5),
  z = c("a", "a", "b", "b")
)

# ggplot: vary both colour and line type (lty) by z, on the classic theme
ggplot(df, aes(x, y, group = z)) +
  geom_line(aes(color = z, lty = z)) +
  theme_classic()
# ggplot2
library(readr)

# read in water data
lakepowell <- read_csv("LAKEPOWELL.csv")

# take a look
head(lakepowell)
## # A tibble: 6 × 2 ## Date `Elevation (feet)` ## <dttm> <dbl> ## 1 1964-07-01 00:00:00 3484. ## 2 1965-07-01 00:00:00 3531. ## 3 1966-07-01 00:00:00 3540. ## 4 1967-07-01 00:00:00 3533. ## 5 1968-07-01 00:00:00 3546. ## 6 1969-07-01 00:00:00 3581.
# ggplot2
# plot lake powell: elevation over time as points joined by a line
# (backticks are needed because the column name contains spaces and parentheses)
ggplot(lakepowell, aes(x = Date, y = `Elevation (feet)`)) +
  geom_point() +
  geom_line()
# ggplot2 (35 min)
# print the group assignments as a plain data frame
# (groups is not defined in the visible part of this file)
print.data.frame(groups)
## group 1 group 2 group 3 ## 1 Alsayegh, Aisha E H M I Shah, Jainam ## 2 Knutson, Blue C Huynh Le Hue Tam, Vivian Andrew Yu Ming Xin, ## 3 Wan Rosli, Nadia Spindler, Laine Addison Dotson, Bianca Ciara ## 4 Ning, Zhi Yan Gnanam, Akash Y Widodo, Ignazio Marco ## group 4 group 5 group 6 ## 1 Cortez, Hugo Alexander Tian, Zerui ## 2 Leong, Wen Hou Lester Tan, Zheng Yang Gupta, Umang ## 3 Jun, Ernest Ng Wei Saccone, Alexander Connor Somyurek, Ecem ## 4 Premkrishna, Shrish Lim, Fang Jan Albertini, Federico ## group 7 ## 1 Su, Barry ## 2 Cai, Qingyuan ## 3 Ng, Michelle ## 4 Ramos, Jessica Andria Potestades
# read in guardian data from a local CSV file
# (read_csv() is from readr, which must already be loaded)
co_river <- read_csv("guardian_co_river.csv")
## Rows: 351 Columns: 49 ## ── Column specification ──────────────────────────────────────────────────────── ## Delimiter: "," ## chr (25): id, type, section_id, section_name, web_title, web_url, api_url, ... ## dbl (4): wordcount, char_count, newspaper_page_number, star_rating ## lgl (15): tags, is_hosted, is_inappropriate_for_sponsorship, is_premoderate... ## dttm (4): web_publication_date, first_publication_date, last_modified, comm... ## date (1): newspaper_edition_date ## ## ℹ Use `spec()` to retrieve the full column specification for this data. ## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# create a tidytext dataset: split each article's body_text into a `word`
# column, then drop every word that appears in stop_words
# (anti_join() is left without `by`, so it reports the join column it picked)
tidy_co <- co_river %>%
  unnest_tokens(word, body_text) %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`
# create counts of each word - we will include id in the count()
# function to also get n values for words in each article
tidy_co_counts <- tidy_co %>%
  count(id, word, sort = TRUE)
# recast as dtm: id identifies the document, word the term, n the count
co_dtm <- tidy_co_counts %>%
  cast_dtm(id, word, n)

# lda: fit a model with 2 topics; the seed is fixed so the fit is repeatable
co_lda <- LDA(co_dtm, k = 2, control = list(seed = 1))

# colorado topics: extract the "beta" matrix of the fitted model as a tidy table
co_topics <- tidy(co_lda, matrix = "beta")
# format for plot: keep the 10 highest-beta terms of each topic, sort them,
# and reorder `term` within each topic so the facets plot in beta order
co_top_terms <- co_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(term = reorder_within(term, beta, topic))
library(ggplot2)

# plot our topics! one facet of horizontal bars per topic, with free scales
# so each facet shows only its own terms
ggplot(co_top_terms, aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()
library(tidyr)

# Spread beta into one column per topic (topic1, topic2), keep terms whose
# beta exceeds 0.001 in at least one topic, and compare the two topics with
# a log2 ratio (positive = relatively more weight in topic2).
beta_wide <- co_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  pivot_wider(
    names_from = topic,
    values_from = beta
  ) %>%
  filter(topic1 > 0.001 | topic2 > 0.001) %>%
  mutate(log_ratio = log2(topic2 / topic1))

# limit to 10 largest and smallest log ratios, labelling each half with the
# topic it leans towards
beta_wide <- bind_rows(
  beta_wide %>%
    slice_min(log_ratio, n = 10) %>%
    mutate(topic = "topic1"),
  beta_wide %>%
    slice_max(log_ratio, n = 10) %>%
    mutate(topic = "topic2")
)
# calculate gamma for all documents: one row per document-topic pair
co_documents <- tidy(co_lda, matrix = "gamma")

# view documents
head(co_documents)
## # A tibble: 6 × 3 ## document topic gamma ## <chr> <int> <dbl> ## 1 global-development/2013/jul/06/water-supplies-shrinking-threat-… 1 1.00e+0 ## 2 world/live/2023/jun/20/russia-ukraine-war-live-attacks-reported… 1 2.70e-5 ## 3 film/2023/feb/21/herzog-swinton-rushdie-cinema-tom-luddy-tellur… 1 8.57e-1 ## 4 environment/2018/may/25/best-us-national-parks-escape-crowds 1 1.00e+0 ## 5 us-news/2016/apr/25/drought-water-rights-wet-asset-buying-snake… 1 9.39e-1 ## 6 sustainable-business/blog/us-water-paradox-demand-infrastructure 1 1.00e+0
# examine one article: show its gamma for each topic
co_documents %>%
  filter(document == "world/uselectionroadtrip/2008/oct/17/uselections2008")
## # A tibble: 2 × 3 ## document topic gamma ## <chr> <int> <dbl> ## 1 world/uselectionroadtrip/2008/oct/17/uselections2008 1 0.478 ## 2 world/uselectionroadtrip/2008/oct/17/uselections2008 2 0.522
# examine one article: show its gamma for each topic
co_documents %>%
  filter(document == "world/uselectionroadtrip/2008/oct/17/uselections2008")
## # A tibble: 2 × 3 ## document topic gamma ## <chr> <int> <dbl> ## 1 world/uselectionroadtrip/2008/oct/17/uselections2008 1 0.478 ## 2 world/uselectionroadtrip/2008/oct/17/uselections2008 2 0.522
# ggplot2
# read in water data
lakepowell <- read_csv("LAKEPOWELL.csv")
# ggplot2
# plot lake powell: elevation over time as points joined by a line
# (backticks are needed because the column name contains spaces and parentheses)
ggplot(lakepowell, aes(x = Date, y = `Elevation (feet)`)) +
  geom_point() +
  geom_line()
# ggplot2
# plot lake powell: elevation over time as points joined by a line
# (backticks are needed because the column name contains spaces and parentheses)
ggplot(lakepowell, aes(x = Date, y = `Elevation (feet)`)) +
  geom_point() +
  geom_line()
# ggplot2
# plot lake powell: line plus a filled band underneath it
ggplot(lakepowell, aes(x = Date, y = `Elevation (feet)`)) +
  geom_line() +
  # band runs from the lowest elevation in the data up to each observation
  geom_ribbon(
    aes(
      ymin = min(`Elevation (feet)`),
      ymax = `Elevation (feet)`
    ),
    fill = "lightblue3"
  )
# ggplot2
# plot lake powell: filled band on a fixed y range with a reference line
ggplot(lakepowell, aes(x = Date, y = `Elevation (feet)`)) +
  geom_line() +
  # band starts at 3300, the bottom of the y limits set below
  geom_ribbon(
    aes(
      ymin = 3300,
      ymax = `Elevation (feet)`
    ),
    fill = "lightblue3"
  ) +
  lims(y = c(3300, 3750)) +
  # horizontal reference line at 3490 feet
  geom_hline(yintercept = 3490)
# ggplot2
# plot lake powell: filled band, reference line, and a label for that line
ggplot(lakepowell, aes(x = Date, y = `Elevation (feet)`)) +
  geom_line() +
  # band starts at 3300, the bottom of the y limits set below
  geom_ribbon(
    aes(
      ymin = 3300,
      ymax = `Elevation (feet)`
    ),
    fill = "lightblue3"
  ) +
  lims(y = c(3300, 3750)) +
  geom_hline(yintercept = 3490) +
  # label sits just above the 3490 line; tz = "UTC" matches how read_csv()
  # parses date-times by default, so the x position does not depend on the
  # machine's local time zone
  annotate(
    "text",
    x = as.POSIXct("1980-07-01", tz = "UTC"),
    y = 3500,
    label = "Minimum Power Pool (3490')"
  )
# ggplot2
# plot lake powell: filled band, reference line, and text / segment annotations
# All annotation dates use tz = "UTC" to match how read_csv() parses date-times
# by default, so positions do not depend on the machine's local time zone.
ggplot(lakepowell, aes(x = Date, y = `Elevation (feet)`)) +
  geom_line() +
  # band starts at 3300, the bottom of the y limits set below
  geom_ribbon(
    aes(
      ymin = 3300,
      ymax = `Elevation (feet)`
    ),
    fill = "lightblue3"
  ) +
  lims(y = c(3300, 3750)) +
  geom_hline(yintercept = 3490) +
  # label just above the 3490 reference line
  annotate(
    "text",
    x = as.POSIXct("1980-07-01", tz = "UTC"),
    y = 3500,
    label = "Minimum Power Pool (3490')"
  ) +
  # label for the early years of the series
  annotate(
    "text",
    x = as.POSIXct("1970-07-01", tz = "UTC"),
    y = 3700,
    label = "Fill-up Period"
  ) +
  # short pointer segment drawn below the "Fill-up Period" label
  annotate(
    "segment",
    x = as.POSIXct("1970-07-01", tz = "UTC"),
    y = 3675,
    xend = as.POSIXct("1970-12-01", tz = "UTC"),
    yend = 3625
  )
# ggplot2
# read in ozone data
ozone <- read_csv("ozone-depleting-substance-consumption.csv")

# take a look
head(ozone)
## # A tibble: 6 × 9 ## Entity Code Year Consumption of controlled sub…¹ Consumption of contr…² ## <chr> <chr> <dbl> <dbl> <dbl> ## 1 Afghanistan AFG 1986 0 0 ## 2 Afghanistan AFG 1989 0 0 ## 3 Afghanistan AFG 1995 0 0 ## 4 Afghanistan AFG 1996 0 0 ## 5 Afghanistan AFG 1997 0 0 ## 6 Afghanistan AFG 1998 0 0 ## # ℹ abbreviated names: ## # ¹​`Consumption of controlled substance (zero-filled) - Chemical: Methyl Chloroform (TCA)`, ## # ²​`Consumption of controlled substance (zero-filled) - Chemical: Methyl Bromide (MB)` ## # ℹ 4 more variables: ## # `Consumption of controlled substance (zero-filled) - Chemical: Hydrochlorofluorocarbons (HCFCs)` <dbl>, ## # `Consumption of controlled substance (zero-filled) - Chemical: Carbon Tetrachloride (CTC)` <dbl>, ## # `Consumption of controlled substance (zero-filled) - Chemical: Halons` <dbl>, …
# ggplot2
library(magrittr)
library(dplyr)

# filter to world ozone
# (%<>% pipes ozone into filter() and assigns the result back to ozone)
ozone %<>%
  filter(Entity == "World")

# take a look
head(ozone)
## # A tibble: 6 × 9 ## Entity Code Year Consumption of controlled subst…¹ Consumption of contr…² ## <chr> <chr> <dbl> <dbl> <dbl> ## 1 World OWID_WRL 1986 4.6 12.3 ## 2 World OWID_WRL 1989 65667. 128. ## 3 World OWID_WRL 1990 16754 426. ## 4 World OWID_WRL 1991 35873. 38665. ## 5 World OWID_WRL 1992 56688. 3622. ## 6 World OWID_WRL 1993 38084. 5301. ## # ℹ abbreviated names: ## # ¹​`Consumption of controlled substance (zero-filled) - Chemical: Methyl Chloroform (TCA)`, ## # ²​`Consumption of controlled substance (zero-filled) - Chemical: Methyl Bromide (MB)` ## # ℹ 4 more variables: ## # `Consumption of controlled substance (zero-filled) - Chemical: Hydrochlorofluorocarbons (HCFCs)` <dbl>, ## # `Consumption of controlled substance (zero-filled) - Chemical: Carbon Tetrachloride (CTC)` <dbl>, ## # `Consumption of controlled substance (zero-filled) - Chemical: Halons` <dbl>, …
# ggplot2
library(stringr)

# shorten variable names: strip everything up to and including the last ": "
# in each column name; names without ": " are left unchanged
ozone %<>%
  rename_with(~str_remove_all(., ".*: "))
# ggplot2
# plot cfcs: a single line of the CFC column over time
ggplot(ozone, aes(x = Year, y = `Chlorofluorocarbons (CFCs)`)) +
  geom_line()
# ggplot2
library(tidyr)

# pivot to long: every column except Entity, Code and Year becomes a row,
# with the old column name in Substance and its value in Tons
ozone %<>%
  pivot_longer(
    -c(Entity, Code, Year),
    names_to = "Substance",
    values_to = "Tons"
  )
# ggplot2
# plot all substances: one facet per substance, each with its own line colour;
# the legend is hidden
ggplot(ozone, aes(x = Year, y = Tons)) +
  geom_line(aes(col = Substance), lwd = 1.5) +
  facet_wrap(~Substance) +
  theme_dark() +
  theme(legend.position = "none")
# ggplot2
# plot all substances: one facet per substance, each with its own line colour
# and its own axis scales (scales = "free"); the legend is hidden
ggplot(ozone, aes(x = Year, y = Tons)) +
  geom_line(aes(col = Substance), lwd = 1.5) +
  facet_wrap(~Substance, scales = "free") +
  theme_dark() +
  theme(legend.position = "none")