library(rmarkdown)
library(RedditExtractoR)
library(jsonlite)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x purrr::flatten() masks jsonlite::flatten()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(httr)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:httr':
##
## content
## The following object is masked from 'package:ggplot2':
##
## annotate
#install.packages("textclean")
#install.packages("tmap")
#library(tmap)
library(corpus)
library(quanteda)
## Package version: 3.2.0
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:tm':
##
## stopwords
## The following objects are masked from 'package:NLP':
##
## meta, meta<-
library(textclean)
library(knitr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(cleanNLP)
library(quanteda.textstats)
# Replace the default base-graphics palette with six "Set1" colours
# (the values come from RColorBrewer::brewer.pal(6, "Set1")).
palette(c(
  "#E41A1C", # red
  "#377EB8", # blue
  "#4DAF4A", # green
  "#984EA3", # purple
  "#FF7F00", # orange
  "#FFFF33"  # yellow
))
# Load
library("SnowballC")
library("wordcloud")
## Loading required package: RColorBrewer
library("RColorBrewer")
library("stopwords")
##
## Attaching package: 'stopwords'
## The following object is masked from 'package:tm':
##
## stopwords
library(tidyverse)
library(rvest)
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
library(tidytext)
Question: For your second blog post, consider the following points. You can write about all of them or select just one, depending on where you are now.
Answer and Explanation: My second blog post concerns the first of the two research ideas I had decided on. The characteristics include (1) its ‘content’ and/or (2) how it can be scraped.
- Summon up your knowledge of some useful packages we’ve reviewed and/or NLP tools in relation to your research project.
  - Sorting out adjectives?
  - Extracting major verbs or named entities? …
- Specify a research paper or two regarding your research domain/topic. Focus on their ‘data,’ analytic strategy, and findings. If they use text-as-data methods, use them as a friendly example. If they have nothing to do with text-as-data methods, imagine what you can do differently.
- Grab partial data of your project, if you’re ready to do so. Report your success and failure!
# Thread listing for r/guns. The live scrape is kept for reference only;
# the saved copy is loaded from disk instead.
# top_guns_urls <- find_thread_urls(subreddit = "guns", sort_by = "top")
load("/Users/noahmilstein/Desktop/Spring 2022/Textasdata/text_as_data_work/df_guns.RData")

# load() restores objects under whatever names they were saved with, so fail
# loudly here if the file did not provide the table and columns used below.
stopifnot(
  exists("top_guns_urls"),
  all(c("title", "date_utc", "comments") %in% names(top_guns_urls))
)
str(top_guns_urls)

# Keep only the columns used in the analysis: title text, posting date and
# the comments column.
top_guns_urls_df <- top_guns_urls[, c("title", "date_utc", "comments")]

# Optional: fetch full thread contents. The subset above keeps no `url`
# column, so the URLs must come from the full listing
# (assumes `top_guns_urls` has a `url` column -- TODO confirm).
# guns_contents <- get_thread_content(top_guns_urls$url[1:1000])
# str(guns_contents$threads)
# Initialise the udpipe backend that cleanNLP uses for annotation.
cnlp_init_udpipe()

# Corpus with one document per post title.
top_guns_corpus <- corpus(top_guns_urls_df$title)

# Build the annotation input: start from the corpus document variables and
# attach the raw title text as a `text` column.
text_for_top_guns <- as.character(top_guns_corpus)
top_guns_corpus_2 <- docvars(top_guns_corpus)
top_guns_corpus_2$text <- text_for_top_guns

# Annotate the titles and preview the token-level table.
annotated.guns_corpus <- cnlp_annotate(top_guns_corpus_2)
annotated.guns_corpus$token %>%
  head()
## # A tibble: 6 × 11
## doc_id sid tid token token_with_ws lemma upos xpos feats tid_source
## <int> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 1 1 Smith "Smith " Smith PROPN NNP Numb… 4
## 2 1 1 2 and "and " and CCONJ CC <NA> 3
## 3 1 1 3 Wesson "Wesson " Wesson PROPN NNP Numb… 1
## 4 1 1 4 Saturday "Saturday " Saturd… PROPN NNP Numb… 0
## 5 1 1 5 anyone "anyone" anyone PRON NN Numb… 4
## 6 1 1 6 ? "?" ? PUNCT . <NA> 4
## # … with 1 more variable: relation <chr>
# Preview the document-level table produced by the annotation.
annotated.guns_corpus$document %>%
  head()
## doc_id
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
# Document table with the posting date attached. Dates are assigned by
# position, so this relies on the document rows being in the same order as
# `top_guns_urls_df` -- NOTE(review): confirm the annotation keeps row order.
doc_id_guns <- annotated.guns_corpus$document
doc_id_guns$date <- top_guns_urls_df$date_utc

# One row per token, carrying its document's date as a Date.
annoData <- doc_id_guns %>%
  left_join(annotated.guns_corpus$token, by = "doc_id") %>%
  mutate(date = as.Date(date))

# Highest sentence id (`sid`) observed on each date, plotted over time with
# a smoothed trend.
annoData %>%
  group_by(date) %>%
  summarise(Sentences = max(sid)) %>%
  ggplot(aes(x = date, y = Sentences)) +
  geom_line() +
  geom_smooth() +
  theme_bw()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Flesch-Kincaid readability score for each post title in the corpus.
readability <- textstat_readability(
  top_guns_corpus,
  measure = "Flesch.Kincaid"
)

# Sequential post index for the x-axis. The documents are Reddit post
# titles, so the column is called `post`. seq_len() also stays valid for an
# empty result, where 1:nrow() would produce c(1, 0).
readability$post <- seq_len(nrow(readability))

# Readability by post index, with a smoothed trend.
ggplot(readability, aes(x = post, y = Flesch.Kincaid)) +
  geom_line() +
  geom_smooth() +
  theme_bw()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Recompute readability with three measures so they can be compared.
readability <- textstat_readability(
  top_guns_corpus,
  measure = c("Flesch.Kincaid", "FOG", "Coleman.Liau.grade")
)

# Sequential post index for the x-axis. seq_len() stays valid for an empty
# result, where 1:nrow() would produce c(1, 0).
readability$post <- seq_len(nrow(readability))

# One line per measure. The colours are fixed constants, so no legend is
# drawn: black = Flesch-Kincaid, red = FOG, blue = Coleman-Liau grade.
ggplot(readability, aes(x = post)) +
  geom_line(aes(y = Flesch.Kincaid), color = "black") +
  geom_line(aes(y = FOG), color = "red") +
  geom_line(aes(y = Coleman.Liau.grade), color = "blue") +
  theme_bw()
# Attach each post's date to its readability scores. Dates are assigned by
# position, so this assumes `readability` has one row per row of
# `top_guns_urls_df`, in the same order -- NOTE(review): confirm.
readability$added_dates <- as.Date(top_guns_urls_df$date_utc)

# Smoothed trend of each measure over posting date. The colours are fixed
# constants, so no legend is drawn: black = Flesch-Kincaid, red = FOG,
# blue = Coleman-Liau grade.
ggplot(readability, aes(x = added_dates)) +
  geom_smooth(aes(y = Flesch.Kincaid), color = "black") +
  geom_smooth(aes(y = FOG), color = "red") +
  geom_smooth(aes(y = Coleman.Liau.grade), color = "blue") +
  theme_bw()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Correlation between the Flesch-Kincaid and FOG scores, using only posts
# where both values are present.
with(readability, cor(Flesch.Kincaid, FOG, use = "complete.obs"))
## [1] 0.8007756
# Wikipedia list of wars, 1000-1499. The page is downloaded and parsed once;
# `wars_1000s_df` holds the parsed HTML document (not a data frame).
list_of_wars_1000 <- "https://en.wikipedia.org/wiki/List_of_wars:_1000%E2%80%931499"
wars_1000s_df <- read_html(list_of_wars_1000)

# The second <table> on the page is the one whose rows start at year 1000.
wars_1000s <- wars_1000s_df %>%
  html_nodes("table") %>%
  .[[2]] %>%
  html_table()

# Keep the first five columns: start, finish, conflict name and the two
# belligerent columns.
# NOTE(review): the printed result repeats the header labels as its first
# data row and has two columns both named `Belligerents`; later steps may
# need to drop row 1 and give the columns unique names.
wars_1000s_subset <- wars_1000s[, seq_len(5)]
wars_1000s_subset
## # A tibble: 58 × 5
## Start Finish `Name of conflict` Belligerents Belligerents
## <chr> <chr> <chr> <chr> <chr>
## 1 Start Finish Name of conflict Victorious … "Defeated p…
## 2 1000 1139 Norman conquest of southern Italy County of A… "Principali…
## 3 1001 1001 Battle of Peshawar (1001) Ghaznavids "Kabul Shah…
## 4 1002 1018 German–Polish War Kingdom of … "Holy Roman…
## 5 1008 1008 Hungarian–Ahtum War Kingdom of … "Voivodeshi…
## 6 1008 1008 Battle of Chach Ghaznavids "Kabul Shah…
## 7 1007–8 1007–8 Battle at Herdaler Kingdom of … "Finnish tr…
## 8 1009 1031 Fitna of al-Andalus Hammudid dy… "Caliphate …
## 9 1010 1011 Second conflict in the Goryeo–Khitan… Liao dynasty "Goryeo"
## 10 1014 1014 Battle of Clontarf High King o… "LeinsterDu…
## # … with 48 more rows
# Third <table> of the already-parsed 1000-1499 page: conflicts whose rows
# start at year 1101.
wars_1100s <- wars_1000s_df %>%
  html_nodes("table") %>%
  .[[3]] %>%
  html_table()

# First five columns only.
# NOTE(review): as printed, row 1 repeats the header labels and two columns
# share the name `Belligerents`.
wars_1100s_subset <- wars_1100s[, seq_len(5)]
wars_1100s_subset
## # A tibble: 39 × 5
## Start Finish `Name of Conflict` Belligerents Belligerents
## <chr> <chr> <chr> <chr> <chr>
## 1 Start Finish Name of Conflict "Victorious… "Defeated p…
## 2 Summer of 1101 Summer of 1101 Crusade of 1101Part … "Sultanate … "Crusaders\…
## 3 1101 1101 Battle of Ramla (110… "Kingdom of… "Fatimid Ca…
## 4 1102 1102 Battle of Ramla (110… "Fatimids o… "Kingdom of…
## 5 1107 1110 Norwegian CrusadePar… "Kingdom of… "Muslim Kin…
## 6 1110 1110 Chola invasion of Ka… "Chola Empi… "Kalinga"
## 7 1113 1115 1113–15 Balearic Isl… "Republic o… "Taifa of M…
## 8 1122 1124 Venetian CrusadePart… "Republic o… "Fatimid Ca…
## 9 1107 1119 Muhammad Tapar's ant… "Seljuq Emp… "Nizari Ism…
## 10 1123 1123 Kalmare ledungPart o… "Kingdom of… "Swedish Pa…
## # … with 29 more rows
# NOTE(review): this repeats the `wars_1100s` extraction already performed
# earlier in the file (same table index, same result); it looks like a
# leftover copy that could be removed.
wars_1100s <- wars_1000s_df %>%
  html_nodes("table") %>%
  .[[3]] %>%
  html_table()

# First five columns only.
wars_1100s_subset <- wars_1100s[, seq_len(5)]
wars_1100s_subset
## # A tibble: 39 × 5
## Start Finish `Name of Conflict` Belligerents Belligerents
## <chr> <chr> <chr> <chr> <chr>
## 1 Start Finish Name of Conflict "Victorious… "Defeated p…
## 2 Summer of 1101 Summer of 1101 Crusade of 1101Part … "Sultanate … "Crusaders\…
## 3 1101 1101 Battle of Ramla (110… "Kingdom of… "Fatimid Ca…
## 4 1102 1102 Battle of Ramla (110… "Fatimids o… "Kingdom of…
## 5 1107 1110 Norwegian CrusadePar… "Kingdom of… "Muslim Kin…
## 6 1110 1110 Chola invasion of Ka… "Chola Empi… "Kalinga"
## 7 1113 1115 1113–15 Balearic Isl… "Republic o… "Taifa of M…
## 8 1122 1124 Venetian CrusadePart… "Republic o… "Fatimid Ca…
## 9 1107 1119 Muhammad Tapar's ant… "Seljuq Emp… "Nizari Ism…
## 10 1123 1123 Kalmare ledungPart o… "Kingdom of… "Swedish Pa…
## # … with 29 more rows
# Fourth <table> of the already-parsed 1000-1499 page: conflicts whose rows
# start at year 1201.
wars_1200s <- wars_1000s_df %>%
  html_nodes("table") %>%
  .[[4]] %>%
  html_table()

# First five columns only.
# NOTE(review): as printed, row 1 repeats the header labels and two columns
# share the name `Belligerents`.
wars_1200s_subset <- wars_1200s[, seq_len(5)]
wars_1200s_subset
## # A tibble: 83 × 5
## Start Finish `Name of Conflict` Belligerents Belligerents
## <chr> <chr> <chr> <chr> <chr>
## 1 Start Finish Name of Conflict "Victorious par… "Defeated p…
## 2 1201 1219 War of the Antiochene Succession "Forces of Bohe… "Forces of …
## 3 1202 1204 Fourth CrusadePart of the Crusades "Holy Roman Emp… "Byzantine …
## 4 1204 1206 Intervention in Chaldia "Kingdom of Geo… "Byzantine …
## 5 1202 1204 Anglo-Norman War (1202–04) "Kingdom of Fra… "Kingdom of…
## 6 1202 1214 Anglo-French War of 1202–1214 "Kingdom of Fra… "Kingdom of…
## 7 1203 1206 Loon War "Holland Kingdo… "Loon Franc…
## 8 1204 1261 Bulgarian–Latin wars "Bulgarian Empi… "Latin Empi…
## 9 1206 1337 Mongol invasions and conquests "Mongol Empire" "西夏 Weste…
## 10 1208 1209 Lombard Rebellion "Latin Empire\n… "Rebel baro…
## # … with 73 more rows
# Fifth <table> of the already-parsed 1000-1499 page: conflicts whose rows
# start at year 1300.
wars_1300s <- wars_1000s_df %>%
  html_nodes("table") %>%
  .[[5]] %>%
  html_table()

# First five columns only.
# NOTE(review): as printed, row 1 repeats the header labels and two columns
# share the name `Belligerents`.
wars_1300s_subset <- wars_1300s[, seq_len(5)]
wars_1300s_subset
## # A tibble: 79 × 5
## Start Finish `Name of Conflict` Belligerents Belligerents
## <chr> <chr> <chr> <chr> <chr>
## 1 Start Finish Name of Conflict Victorious … Defeated pa…
## 2 1300 1301 Second Mongol inva… Myinsaing K… Yuan dynasty
## 3 1300 1300 Lembu Sora rebelli… Majapahit E… Lembu Sora …
## 4 c. 14th century c. 14th century K'aissape–Hvalsey … Inuit under… Norsemen un…
## 5 1303 1303 Conquest of Sylhet Independent… Gaur Kingdo…
## 6 1308 1308 Teutonic takeover … Teutonic Kn… Margraviate…
## 7 1309 1309 Crusade of the Poor Duchy of Br… Crusaders
## 8 1311 1312 Rebellion of mayor… Władysław I… Kraków
## 9 1311 1318 Delhi–Seuna War Delhi Sulta… Seuna Empire
## 10 1314 1318 Esen Buqa–Ayurbarw… Yuan dynast… Chagatai Kh…
## # … with 69 more rows
# Sixth <table> of the already-parsed 1000-1499 page: conflicts whose rows
# start at year 1400.
wars_1400s <- wars_1000s_df %>%
  html_nodes("table") %>%
  .[[6]] %>%
  html_table()

# First five columns only.
# NOTE(review): as printed, row 1 repeats the header labels and two columns
# share the name `Belligerents`.
wars_1400s_subset <- wars_1400s[, seq_len(5)]
wars_1400s_subset
## # A tibble: 123 × 5
## Start Finish `Name of Conflict` Belligerents Belligerents
## <chr> <chr> <chr> <chr> <chr>
## 1 Start Finish Name of Conflict Victorious party (i… "Defeated p…
## 2 1400 1400 English invasion of Scotland Kingdom of Scotland "Kingdom of…
## 3 1400 1420 Glyndŵr Rising Kingdom of England "Welsh rebe…
## 4 1401 1404 First Samogitian Uprising Teutonic State "Grand Duch…
## 5 1402 1402 Battle of Ankara Timurid Empire "Ottoman Em…
## 6 1402 1413 Ottoman Interregnum Faction of Mehmed I "Faction of…
## 7 1402 1496 Conquest of the Canary Islands Union of Castile an… "Guanches"
## 8 1403 1403 Percy Rebellion Kingdom of England "English re…
## 9 1404 1406 Paregreg war Western court "Eastern co…
## 10 1405 1405 Scrope Rebellion Kingdom of England "English re…
## # … with 113 more rows
# Wikipedia list of wars, 1500-1799. `wars_1500s_df` holds the parsed HTML
# document (not a data frame).
list_of_wars_1500 <- "https://en.wikipedia.org/wiki/List_of_wars:_1500%E2%80%931799"
wars_1500s_df <- read_html(list_of_wars_1500)

# On this page the first <table> is the one whose rows start at year 1500.
wars_1500s <- wars_1500s_df %>%
  html_nodes("table") %>%
  .[[1]] %>%
  html_table()

# First five columns only.
# NOTE(review): as printed, row 1 repeats the header labels and two columns
# share the name `Belligerents`.
wars_1500s_subset <- wars_1500s[, seq_len(5)]
wars_1500s_subset
## # A tibble: 160 × 5
## Start Finish `Name of conflict` Belligerents Belligerents
## <chr> <chr> <chr> <chr> <chr>
## 1 Start Finish "Name of conflict" Victorious … "Defeated p…
## 2 1500 1503 "Second Muscovite–Lithuanian WarMusco… Grand Duchy… "Grand Duch…
## 3 1500 1500 "Battle of Hemmingstedt" Peasantry o… "Kalmar Uni…
## 4 1501 1512 "Dano-Swedish War (1501–1512)\nPart o… Sweden Free… "Kalmar Uni…
## 5 1502 1510 "Persian–Uzbek wars" Persian Emp… "Shaybanid …
## 6 1502 1543 "Guelders Wars" Holy Roman … "Duchy of G…
## 7 1503 1505 "War of the Succession of Landshut" Duchy of Ba… "Duchy of B…
## 8 1505 1517 "Portuguese–Mamluk naval war" Portugal "Mamluk Sul…
## 9 1507 1508 "Third Muscovite–Lithuanian WarMuscov… Grand Duchy… "Grand Duch…
## 10 1508 1516 "War of the League of CambraiPart of … 1508–10: Pa… "1508–10: V…
## # … with 150 more rows