An update on my project. Since my last blog post, I explored other dictionaries in the ‘quanteda.dictionaries’ package while also expanding my use of the NRC dictionary. The NRC dictionary is my favorite because I think it's the easiest to understand and apply. The categories it uses, such as fear, disgust, or joy, make sense in the context of my personal research. I used the AFINN dictionary as well but thought it was less useful to me than the NRC one. I honestly do not completely understand what AFINN is doing or what the variables in its output, such as WPS and afinn, represent. The next dictionary I experimented with was the RID. That dictionary was cool but seemed more complex and less clear than the NRC dictionary. This exploration was a good way to expose me to various dictionaries and showed me there are a lot of different ways to view sentiment.

My next step was to create training data, test data, and held-out data. I understand the concepts behind machine learning, but when reviewing the tutorial I got lost as to what an index and a dfm are and why they need to be created before machine learning methods can be applied.

Moving forward, I am going to further explore the sentiment analysis with upgraded graphs and add machine learning methods.

#loading in libraries
library(httr)
library(jsonlite)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter()  masks stats::filter()
## x purrr::flatten() masks jsonlite::flatten()
## x dplyr::lag()     masks stats::lag()
library(tidytext)
library(quanteda)
## Package version: 3.2.1
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 4 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
library(readtext)
library(quanteda.textmodels)
# Download New York Times Article Search results for the query "prison", one
# request per calendar month, and stack the per-month tables into `prison90.t`
# (columns: date, abstract, lead, source, byline, type.m, type.d).
#
# Fixes relative to the earlier version of this loop:
#   * begin_date and end_date now use the same year. They previously mixed the
#     "197" and "199" prefixes, so a window either spanned 20 years or ended
#     before it began.
#   * every month is requested exactly once (October-December used to be
#     requested and appended twice, duplicating those rows).
#   * end_date is the real last day of the month instead of always "31".
#   * the pause between requests applies to every month, not only 10-12.
#   * failed HTTP responses raise a warning instead of being parsed silently.
#
# NOTE(review): the year prefix is "197" to match the "1970-1975" comment that
# accompanied this loop; the "90" in the variable names suggests the 1990s may
# have been intended - change `decade_prefix` to "199" if so.
# NOTE(review): the January-September URL used to carry a stray, valueless
# `&policy` parameter; it is dropped so every month uses the same query. If
# the intended search was "prison policy", use `q=prison+policy`.
# NOTE(review): no `page` parameter is sent, so each request presumably
# returns only the first page of results - confirm against the API docs.
# SECURITY: an API key should not live in a published script. The key is read
# from the NYT_API_KEY environment variable when set; the literal fallback is
# kept only so the script still runs as before and should be rotated/removed.
nyt_api_key <- Sys.getenv(
  "NYT_API_KEY",
  unset = "ywg9nsFaF2uXhMA43C409CUYglWMxpMY"
)
decade_prefix <- "197"
# Seconds to wait between requests.
# NOTE(review): confirm this satisfies the API's per-minute rate limit.
pause_seconds <- 7

# Collect one table per month and bind once at the end rather than growing a
# data frame with rbind() on every iteration.
monthly_tables <- list()

# i is the last digit of the year: 0:4 covers the years ending in 0 through 4
# (use 0:5 to include the year ending in 5 as well)
for (i in 0:4) {
  # j is the month
  for (j in 1:12) {
    month_start <- as.Date(sprintf("%s%d-%02d-01", decade_prefix, i, j))
    # First day of the following month minus one day = last day of this month.
    month_end <- seq(month_start, by = "month", length.out = 2)[2] - 1

    link90 <- paste0(
      "https://api.nytimes.com/svc/search/v2/articlesearch.json",
      "?q=prison&facet_field=day_of_week&facet=true",
      "&begin_date=", format(month_start, "%Y%m%d"),
      "&end_date=", format(month_end, "%Y%m%d"),
      "&api-key=", nyt_api_key
    )

    prison.policy90 <- GET(link90)
    if (http_error(prison.policy90)) {
      # Report the month and status only; the URL contains the API key.
      warning(
        "NYT request for ", format(month_start, "%Y-%m"),
        " failed with HTTP status ", status_code(prison.policy90),
        call. = FALSE
      )
    } else {
      prison.r90 <- fromJSON(rawToChar(prison.policy90$content))
      docs <- prison.r90$response$docs
      table90 <- as_tibble(cbind(
        date = docs$pub_date,
        abstract = docs$abstract,
        lead = docs$lead_paragraph,
        source = docs$source,
        byline = docs$byline$original,
        type.m = docs$type_of_material,
        type.d = docs$document_type
      ))
      monthly_tables[[length(monthly_tables) + 1]] <- table90
    }

    Sys.sleep(pause_seconds)
  }
}

prison90.t <- bind_rows(monthly_tables)
# Strip the trailing 14 characters from every publication timestamp.
prison90.t[["date"]] <- sub(".{14}$", "", prison90.t[["date"]])
# Document ids: the trimmed date followed by the byline.
docid <- paste(prison90.t[["date"]], prison90.t[["byline"]])
# Build a corpus from the lead paragraphs and label each text with its id.
corpus <- corpus(prison90.t[["lead"]])
docnames(corpus) <- docid
# Tokenise the corpus, discarding punctuation, numbers, and symbols.
corpus.tokens <- tokens(
  corpus,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE
)



library(devtools)
## Loading required package: usethis
# quanteda.dictionaries and quanteda.sentiment are installed from GitHub.
# Install each one only when it is missing, so re-running the script does not
# contact GitHub (or need network access) every time.
if (!requireNamespace("quanteda.dictionaries", quietly = TRUE)) {
  devtools::install_github("kbenoit/quanteda.dictionaries")
}
library(quanteda.dictionaries)
if (!requireNamespace("quanteda.sentiment", quietly = TRUE)) {
  devtools::install_github("quanteda/quanteda.sentiment")
}
library(quanteda.sentiment)
## 
## Attaching package: 'quanteda.sentiment'
## The following object is masked from 'package:quanteda':
## 
##     data_dictionary_LSD2015
library(ggplot2)
# Histogram of one score column from a liwcalike() result.
#
# scores: data frame returned by liwcalike().
# column: name (string) of the column to plot.
# zoom:   optional c(min, max) applied to BOTH axes. The zoom uses
#         coord_cartesian(), which only changes the visible window; xlim() /
#         ylim() would instead discard bars that fall outside the limits
#         (the source of "Removed 2 rows containing missing values").
# Returns the ggplot object.
plot_sentiment_hist <- function(scores, column, zoom = NULL) {
  stopifnot(column %in% names(scores))
  # bins = 30 is ggplot2's default, stated explicitly so stat_bin() does not
  # print its "Pick better value with `binwidth`" message for every plot.
  p <- ggplot(scores) +
    geom_histogram(aes(x = .data[[column]]), bins = 30) +
    labs(x = column)
  if (!is.null(zoom)) {
    p <- p + coord_cartesian(xlim = zoom, ylim = zoom)
  }
  p
}

# NRC dictionary: positive/negative are zoomed to 0-30 on both axes, the
# remaining emotion categories are shown at full range.
review.sentiment <- liwcalike(as.character(corpus), data_dictionary_NRC)

for (polarity in c("positive", "negative")) {
  print(plot_sentiment_hist(review.sentiment, polarity, zoom = c(0, 30)))
}

for (emotion in c("fear", "disgust", "anger", "anticipation")) {
  print(plot_sentiment_hist(review.sentiment, emotion))
}

# AFINN dictionary
review.sentiment2 <- liwcalike(as.character(corpus), data_dictionary_AFINN)

for (measure in c("afinn", "WPS")) {
  print(plot_sentiment_hist(review.sentiment2, measure))
}

# RID dictionary
review.sentiment3 <- liwcalike(as.character(corpus), data_dictionary_RID)

print(plot_sentiment_hist(review.sentiment3, "emotions.anxiety"))

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
## The following object is masked from 'package:httr':
## 
##     progress
# Fix the random seed so the split below is reproducible.
set.seed(2)
#creating id variable
docvars(corpus, 'id') <- seq_len(ndoc(corpus))

#training set and test set
# Each vector below holds document positions (1..n) in `corpus`; a position
# vector can be used to pick those documents out, e.g. corpus[train].
n <- ndoc(corpus)
# 70% of the documents, chosen at random, form the training set.
train <- sample(seq_len(n), floor(.7 * n))
# Everything not in the training set. setdiff() is used instead of x[-train]
# because negative indexing with an empty vector returns nothing at all.
test <- setdiff(seq_len(n), train)

#creating held out data
L <- length(test)
# Draw half of the test *positions*, then translate them back to document
# ids. Previously `heldout` kept the positions within `test` (values 1..L),
# which are not the ids of the held-out documents and overlap ids in `train`.
heldout.pos <- sample(seq_len(L), floor(L * .5))
heldout <- test[heldout.pos]
# The test set keeps the documents that were not moved to the held-out set.
test <- setdiff(test, heldout)

#confused on indices and dfms