library(rmarkdown)

library(RedditExtractoR)
library(jsonlite)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter()  masks stats::filter()
## x purrr::flatten() masks jsonlite::flatten()
## x dplyr::lag()     masks stats::lag()
library(dplyr)
library(httr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:httr':
## 
##     content
## The following object is masked from 'package:ggplot2':
## 
##     annotate
#install.packages("textclean")
#install.packages("tmap")
#library(tmap)
library(corpus)
library(quanteda)
## Package version: 3.2.0
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:tm':
## 
##     stopwords
## The following objects are masked from 'package:NLP':
## 
##     meta, meta<-
library(textclean)
library(knitr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(cleanNLP)
library(quanteda.textstats)
# colors from RColorBrewer::brewer.pal(6, "Set1")
palette(c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33"))
# Load
library("SnowballC")
library("wordcloud")
## Loading required package: RColorBrewer
library("RColorBrewer")
library("stopwords")
## 
## Attaching package: 'stopwords'
## The following object is masked from 'package:tm':
## 
##     stopwords
library(tidyverse)
library(rvest)
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(tidytext)

Blog Post 2:

Question: For your second blog post, consider the following points. You can write about all of them, or select one question, depending on where you’re now.

Answer and Explanation: My second blog post concerns the first of the two research ideas I had decided on. The characteristics include (1) its ‘content’ and/or (2) how it can be scraped.

- Summon up your knowledge of some useful packages we’ve reviewed and/or NLP tools in relation to your research project.
  - Sorting out adjectives?
  - Extracting major verbs or named entities? …
- Specify a research paper or two regarding your research domain/topic. Focus on their ‘data,’ analytic strategy, and findings. If they use text-as-data methods, use them as a friendly example. If they have nothing to do with text-as-data methods, imagine what you can do differently.
- Grab partial data of your project, if you’re ready to do so. Report your success and failure!

# Load the cached Reddit scrape rather than re-querying the API. The .RData
# file is expected to provide `top_guns_urls`, originally produced by:
#   top_guns_urls <- find_thread_urls(subreddit = "guns", sort_by = "top")
# NOTE(review): absolute, machine-specific path -- adjust before running
# on another machine.
load("/Users/noahmilstein/Desktop/Spring 2022/Textasdata/text_as_data_work/df_guns.RData")

str(top_guns_urls)

# Keep only the title, date_utc and comments columns (this subset was
# previously assigned twice with `=`; one `<-` assignment is enough).
top_guns_urls_df <- top_guns_urls[, c("title", "date_utc", "comments")]

# Thread bodies are not fetched here. If this is re-enabled, index the URLs
# from `top_guns_urls`: the subset above does not keep the `url` column.
#guns_contents <- get_thread_content(top_guns_urls$url[1:1000])
#str(guns_contents$threads)

# quanteda corpus with one document per thread title.
top_guns_corpus <- corpus(top_guns_urls_df$title)

# Initialise the udpipe backend used by cleanNLP.
cnlp_init_udpipe()

# cnlp_annotate() is given a data frame with a `text` column: start from the
# corpus's docvars frame and attach the raw title text to it.
text_for_top_guns <- as.character(top_guns_corpus)
top_guns_corpus_2 <- docvars(top_guns_corpus)
top_guns_corpus_2$text <- text_for_top_guns

annotated.guns_corpus <- cnlp_annotate(top_guns_corpus_2)
# Preview the first rows of the token-level annotation table.
head(annotated.guns_corpus[["token"]])
## # A tibble: 6 × 11
##   doc_id   sid tid   token    token_with_ws lemma   upos  xpos  feats tid_source
##    <int> <int> <chr> <chr>    <chr>         <chr>   <chr> <chr> <chr> <chr>     
## 1      1     1 1     Smith    "Smith "      Smith   PROPN NNP   Numb… 4         
## 2      1     1 2     and      "and "        and     CCONJ CC    <NA>  3         
## 3      1     1 3     Wesson   "Wesson "     Wesson  PROPN NNP   Numb… 1         
## 4      1     1 4     Saturday "Saturday "   Saturd… PROPN NNP   Numb… 0         
## 5      1     1 5     anyone   "anyone"      anyone  PRON  NN    Numb… 4         
## 6      1     1 6     ?        "?"           ?       PUNCT .     <NA>  4         
## # … with 1 more variable: relation <chr>
# Preview the first rows of the document-level table (document ids).
head(annotated.guns_corpus[["document"]])
##   doc_id
## 1      1
## 2      2
## 3      3
## 4      4
## 5      5
## 6      6
# Pair every annotated document id with its posting date. Rows are matched to
# `top_guns_urls_df` by position.
doc_id_guns <- annotated.guns_corpus[["document"]]
doc_id_guns[["date"]] <- top_guns_urls_df[["date_utc"]]

# One row per token, carrying the posting date of the title it came from.
annoData <- doc_id_guns %>%
  left_join(annotated.guns_corpus[["token"]], by = "doc_id") %>%
  mutate(date = as.Date(date))

# Highest sentence id observed on each date, plotted over time.
# NOTE(review): max(sid) is the largest per-document sentence number on that
# date, not the total number of sentences -- confirm this is the intent.
ggplot(
  summarize(group_by(annoData, date), Sentences = max(sid)),
  aes(date, Sentences)
) +
  geom_line() +
  geom_smooth() +
  theme_bw()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Flesch-Kincaid readability score for every post title in the corpus.
readability <- textstat_readability(top_guns_corpus,
                                    measure = "Flesch.Kincaid")

# Sequential post index for the x-axis. The documents are Reddit post titles
# (not chapters), and seq_len() stays valid for a zero-row result where
# 1:nrow() would produce c(1, 0).
readability$post <- seq_len(nrow(readability))

# Plot the score across posts with a smoothed trend line.
ggplot(readability, aes(x = post, y = Flesch.Kincaid)) +
  geom_line() +
  geom_smooth() +
  theme_bw()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Recompute readability with three measures so they can be compared.
readability <- textstat_readability(
  top_guns_corpus,
  measure = c("Flesch.Kincaid", "FOG", "Coleman.Liau.grade")
)

# Sequential post index for the x-axis; seq_len() is safe for a zero-row
# result, unlike 1:nrow().
readability$post <- seq_len(nrow(readability))

# One line per measure: Flesch-Kincaid (black), FOG (red),
# Coleman-Liau grade (blue).
ggplot(readability, aes(x = post)) +
  geom_line(aes(y = Flesch.Kincaid), color = "black") +
  geom_line(aes(y = FOG), color = "red") +
  geom_line(aes(y = Coleman.Liau.grade), color = "blue") +
  theme_bw()

# Coerce the token table's date column to Date (a no-op when the column is
# already a Date).
annoData[["date"]] <- as.Date(annoData[["date"]])

# Posting date for each readability row, matched to `top_guns_urls_df` by
# position.
readability[["added_dates"]] <- as.Date(top_guns_urls_df[["date_utc"]])

# Smoothed trend of each readability measure over posting date:
# Flesch-Kincaid (black), FOG (red), Coleman-Liau grade (blue).
ggplot(data = readability, mapping = aes(x = added_dates)) +
  geom_smooth(mapping = aes(y = Flesch.Kincaid), color = "black") +
  geom_smooth(mapping = aes(y = FOG), color = "red") +
  geom_smooth(mapping = aes(y = Coleman.Liau.grade), color = "blue") +
  theme_bw()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Correlation between the Flesch-Kincaid and FOG scores, ignoring rows where
# either score is missing.
with(readability, cor(Flesch.Kincaid, FOG, use = "complete.obs"))
## [1] 0.8007756
# Wikipedia "List of wars: 1000-1499": download and parse the page once.
# NOTE(review): despite the `_df` suffix, `wars_1000s_df` holds the parsed
# HTML document returned by read_html(), not a data frame.
list_of_wars_1000 <- "https://en.wikipedia.org/wiki/List_of_wars:_1000%E2%80%931499"
wars_1000s_df <- read_html(list_of_wars_1000)

# The second <table> on the page is taken as the 1000s list.
wars_1000s <- html_table(html_nodes(wars_1000s_df, "table")[[2]])

# Keep the first five columns and print them.
# NOTE(review): in the printed result, row 1 repeats the header and two
# columns share the name "Belligerents"; clean up before analysis.
wars_1000s_subset <- wars_1000s[, seq_len(5)]
wars_1000s_subset
## # A tibble: 58 × 5
##    Start  Finish `Name of conflict`                    Belligerents Belligerents
##    <chr>  <chr>  <chr>                                 <chr>        <chr>       
##  1 Start  Finish Name of conflict                      Victorious … "Defeated p…
##  2 1000   1139   Norman conquest of southern Italy     County of A… "Principali…
##  3 1001   1001   Battle of Peshawar (1001)             Ghaznavids   "Kabul Shah…
##  4 1002   1018   German–Polish War                     Kingdom of … "Holy Roman…
##  5 1008   1008   Hungarian–Ahtum War                   Kingdom of … "Voivodeshi…
##  6 1008   1008   Battle of Chach                       Ghaznavids   "Kabul Shah…
##  7 1007–8 1007–8 Battle at Herdaler                    Kingdom of … "Finnish tr…
##  8 1009   1031   Fitna of al-Andalus                   Hammudid dy… "Caliphate …
##  9 1010   1011   Second conflict in the Goryeo–Khitan… Liao dynasty "Goryeo"    
## 10 1014   1014   Battle of Clontarf                    High King o… "LeinsterDu…
## # … with 48 more rows
# The third <table> of the parsed 1000-1499 page is taken as the 1100s list.
wars_1100s <- html_table(html_nodes(wars_1000s_df, "table")[[3]])

# Keep the first five columns and print them.
wars_1100s_subset <- wars_1100s[, seq_len(5)]
wars_1100s_subset
## # A tibble: 39 × 5
##    Start          Finish         `Name of Conflict`    Belligerents Belligerents
##    <chr>          <chr>          <chr>                 <chr>        <chr>       
##  1 Start          Finish         Name of Conflict      "Victorious… "Defeated p…
##  2 Summer of 1101 Summer of 1101 Crusade of 1101Part … "Sultanate … "Crusaders\…
##  3 1101           1101           Battle of Ramla (110… "Kingdom of… "Fatimid Ca…
##  4 1102           1102           Battle of Ramla (110… "Fatimids o… "Kingdom of…
##  5 1107           1110           Norwegian CrusadePar… "Kingdom of… "Muslim Kin…
##  6 1110           1110           Chola invasion of Ka… "Chola Empi… "Kalinga"   
##  7 1113           1115           1113–15 Balearic Isl… "Republic o… "Taifa of M…
##  8 1122           1124           Venetian CrusadePart… "Republic o… "Fatimid Ca…
##  9 1107           1119           Muhammad Tapar's ant… "Seljuq Emp… "Nizari Ism…
## 10 1123           1123           Kalmare ledungPart o… "Kingdom of… "Swedish Pa…
## # … with 29 more rows
# `wars_1100s` and `wars_1100s_subset` were already extracted from table 3
# earlier in the script; repeating the identical extraction was redundant, so
# only the print is kept here.
wars_1100s_subset
## # A tibble: 39 × 5
##    Start          Finish         `Name of Conflict`    Belligerents Belligerents
##    <chr>          <chr>          <chr>                 <chr>        <chr>       
##  1 Start          Finish         Name of Conflict      "Victorious… "Defeated p…
##  2 Summer of 1101 Summer of 1101 Crusade of 1101Part … "Sultanate … "Crusaders\…
##  3 1101           1101           Battle of Ramla (110… "Kingdom of… "Fatimid Ca…
##  4 1102           1102           Battle of Ramla (110… "Fatimids o… "Kingdom of…
##  5 1107           1110           Norwegian CrusadePar… "Kingdom of… "Muslim Kin…
##  6 1110           1110           Chola invasion of Ka… "Chola Empi… "Kalinga"   
##  7 1113           1115           1113–15 Balearic Isl… "Republic o… "Taifa of M…
##  8 1122           1124           Venetian CrusadePart… "Republic o… "Fatimid Ca…
##  9 1107           1119           Muhammad Tapar's ant… "Seljuq Emp… "Nizari Ism…
## 10 1123           1123           Kalmare ledungPart o… "Kingdom of… "Swedish Pa…
## # … with 29 more rows
# The fourth <table> of the parsed 1000-1499 page is taken as the 1200s list.
wars_1200s <- html_table(html_nodes(wars_1000s_df, "table")[[4]])

# Keep the first five columns and print them.
wars_1200s_subset <- wars_1200s[, seq_len(5)]
wars_1200s_subset
## # A tibble: 83 × 5
##    Start Finish `Name of Conflict`                 Belligerents     Belligerents
##    <chr> <chr>  <chr>                              <chr>            <chr>       
##  1 Start Finish Name of Conflict                   "Victorious par… "Defeated p…
##  2 1201  1219   War of the Antiochene Succession   "Forces of Bohe… "Forces of …
##  3 1202  1204   Fourth CrusadePart of the Crusades "Holy Roman Emp… "Byzantine …
##  4 1204  1206   Intervention in Chaldia            "Kingdom of Geo… "Byzantine …
##  5 1202  1204   Anglo-Norman War (1202–04)         "Kingdom of Fra… "Kingdom of…
##  6 1202  1214   Anglo-French War of 1202–1214      "Kingdom of Fra… "Kingdom of…
##  7 1203  1206   Loon War                           "Holland Kingdo… "Loon Franc…
##  8 1204  1261   Bulgarian–Latin wars               "Bulgarian Empi… "Latin Empi…
##  9 1206  1337   Mongol invasions and conquests     "Mongol Empire"  "西夏 Weste…
## 10 1208  1209   Lombard Rebellion                  "Latin Empire\n… "Rebel baro…
## # … with 73 more rows
# The fifth <table> of the parsed 1000-1499 page is taken as the 1300s list.
wars_1300s <- html_table(html_nodes(wars_1000s_df, "table")[[5]])

# Keep the first five columns and print them.
wars_1300s_subset <- wars_1300s[, seq_len(5)]
wars_1300s_subset
## # A tibble: 79 × 5
##    Start           Finish          `Name of Conflict`  Belligerents Belligerents
##    <chr>           <chr>           <chr>               <chr>        <chr>       
##  1 Start           Finish          Name of Conflict    Victorious … Defeated pa…
##  2 1300            1301            Second Mongol inva… Myinsaing K… Yuan dynasty
##  3 1300            1300            Lembu Sora rebelli… Majapahit E… Lembu Sora …
##  4 c. 14th century c. 14th century K'aissape–Hvalsey … Inuit under… Norsemen un…
##  5 1303            1303            Conquest of Sylhet  Independent… Gaur Kingdo…
##  6 1308            1308            Teutonic takeover … Teutonic Kn… Margraviate…
##  7 1309            1309            Crusade of the Poor Duchy of Br… Crusaders   
##  8 1311            1312            Rebellion of mayor… Władysław I… Kraków      
##  9 1311            1318            Delhi–Seuna War     Delhi Sulta… Seuna Empire
## 10 1314            1318            Esen Buqa–Ayurbarw… Yuan dynast… Chagatai Kh…
## # … with 69 more rows
# The sixth <table> of the parsed 1000-1499 page is taken as the 1400s list.
wars_1400s <- html_table(html_nodes(wars_1000s_df, "table")[[6]])

# Keep the first five columns and print them.
wars_1400s_subset <- wars_1400s[, seq_len(5)]
wars_1400s_subset
## # A tibble: 123 × 5
##    Start Finish `Name of Conflict`             Belligerents         Belligerents
##    <chr> <chr>  <chr>                          <chr>                <chr>       
##  1 Start Finish Name of Conflict               Victorious party (i… "Defeated p…
##  2 1400  1400   English invasion of Scotland   Kingdom of Scotland  "Kingdom of…
##  3 1400  1420   Glyndŵr Rising                 Kingdom of England   "Welsh rebe…
##  4 1401  1404   First Samogitian Uprising      Teutonic State       "Grand Duch…
##  5 1402  1402   Battle of Ankara               Timurid Empire       "Ottoman Em…
##  6 1402  1413   Ottoman Interregnum            Faction of Mehmed I  "Faction of…
##  7 1402  1496   Conquest of the Canary Islands Union of Castile an… "Guanches"  
##  8 1403  1403   Percy Rebellion                Kingdom of England   "English re…
##  9 1404  1406   Paregreg war                   Western court        "Eastern co…
## 10 1405  1405   Scrope Rebellion               Kingdom of England   "English re…
## # … with 113 more rows
# Wikipedia "List of wars: 1500-1799": download and parse the page.
# NOTE(review): despite the `_df` suffix, `wars_1500s_df` holds the parsed
# HTML document returned by read_html(), not a data frame.
list_of_wars_1500 <- "https://en.wikipedia.org/wiki/List_of_wars:_1500%E2%80%931799"
wars_1500s_df <- read_html(list_of_wars_1500)

# The first <table> on this page is taken as the 1500s list.
wars_1500s <- html_table(html_nodes(wars_1500s_df, "table")[[1]])

# Keep the first five columns and print them.
wars_1500s_subset <- wars_1500s[, seq_len(5)]
wars_1500s_subset
## # A tibble: 160 × 5
##    Start Finish `Name of conflict`                     Belligerents Belligerents
##    <chr> <chr>  <chr>                                  <chr>        <chr>       
##  1 Start Finish "Name of conflict"                     Victorious … "Defeated p…
##  2 1500  1503   "Second Muscovite–Lithuanian WarMusco… Grand Duchy… "Grand Duch…
##  3 1500  1500   "Battle of Hemmingstedt"               Peasantry o… "Kalmar Uni…
##  4 1501  1512   "Dano-Swedish War (1501–1512)\nPart o… Sweden Free… "Kalmar Uni…
##  5 1502  1510   "Persian–Uzbek wars"                   Persian Emp… "Shaybanid …
##  6 1502  1543   "Guelders Wars"                        Holy Roman … "Duchy of G…
##  7 1503  1505   "War of the Succession of Landshut"    Duchy of Ba… "Duchy of B…
##  8 1505  1517   "Portuguese–Mamluk naval war"          Portugal     "Mamluk Sul…
##  9 1507  1508   "Third Muscovite–Lithuanian WarMuscov… Grand Duchy… "Grand Duch…
## 10 1508  1516   "War of the League of CambraiPart of … 1508–10: Pa… "1508–10: V…
## # … with 150 more rows