Visualizations in this file were created to support the Final Project where we analyze news articles classified as “Real” news vs. “Fake” news.
Create dataframe of true and fake words to use with r2d3
# Load the "true" and "fake" article data sets from data/csv/ (path resolved
# through here()) as plain data frames with character columns.
# NOTE(review): read.csv()'s `encoding` argument only marks strings as
# "latin1" or "UTF-8", so "ascii" is presumably a no-op; the stray "â" tokens
# filtered out later hint that `fileEncoding = "UTF-8"` may be what is
# needed -- confirm against the CSV files before changing it.
read_articles <- function(csv_name) {
  read.csv(
    here("data", "csv", csv_name),
    encoding = "ascii",
    stringsAsFactors = FALSE
  )
}
true_df <- read_articles("true_df.csv")
fake_df <- read_articles("fake_df.csv")
# Word frequencies for the "true" articles: split `text` into one token per
# row, drop stop words, then count every (type, word) pair, most frequent
# first. The result has the columns `type`, `word` and `n`.
true_df_words <- true_df %>%
  unnest_tokens(word, text) %>%
  # Name the join key explicitly so the result does not depend on which
  # column names the two tables happen to share, and so no "Joining, by"
  # message is emitted.
  anti_join(get_stopwords(), by = "word") %>%
  count(type, word, sort = TRUE)
# Top 20 most frequent words in the "true" articles. top_n() keeps ties, so
# slightly more than 20 rows are possible.
top_true_df_words <- true_df_words %>%
  # Discard stray tokens. "t" is excluded here as well so that this list uses
  # the same exclusions as the fake-news words and the shiny data frame.
  filter(!word %in% c("s", "â", "t")) %>%
  top_n(20, n)
# Write the top "true" words to a tab-separated file used for r2d3 testing.
# Passing the path together with `fileEncoding` lets write.table() open and
# close the UTF-8 connection itself, instead of leaving a spent connection
# object behind in the workspace.
write.table(
  top_true_df_words,
  file = here("model_scripts", "data.tsv"),
  row.names = FALSE,
  sep = "\t",
  fileEncoding = "UTF-8"
)
# Word frequencies for the "fake" articles: split `text` into one token per
# row, drop stop words, then count every (type, word) pair, most frequent
# first. The result has the columns `type`, `word` and `n`.
fake_df_words <- fake_df %>%
  unnest_tokens(word, text) %>%
  # Name the join key explicitly so the result does not depend on which
  # column names the two tables happen to share, and so no "Joining, by"
  # message is emitted.
  anti_join(get_stopwords(), by = "word") %>%
  count(type, word, sort = TRUE)
# Top 20 most frequent words in the "fake" articles, after discarding the
# stray tokens "s", "â" and "t".
top_fake_df_words <- top_n(
  filter(fake_df_words, !(word %in% c("s", "â", "t"))),
  20,
  n
)
# Render the top-20 word charts with the D3 (version 3) script, each in a
# <div> container: fake-news words first, then true-news words.
r2d3::r2d3(
  data = top_fake_df_words,
  script = "r2d3/d3_scripts.js",
  d3_version = "3",
  container = "div"
)
r2d3::r2d3(
  data = top_true_df_words,
  script = "r2d3/d3_scripts.js",
  d3_version = "3",
  container = "div"
)
Create dataframe to use with shiny app
# Top 20 real- and fake-news words stacked into one data frame for the shiny
# app: the fake-news rows come first, followed by the true-news rows held
# in `x`.
x <- top_n(
  filter(true_df_words, !(word %in% c("s", "â", "t"))),
  20,
  n
)
top_words_all <- fake_df_words %>%
  filter(!(word %in% c("s", "â", "t"))) %>%
  top_n(20, n) %>%
  rbind(x)
Tables for sentiment analysis: True vs. Fake
library(reshape2)
library(dplyr)
library(htmlwidgets)
# Rebuild the token tables for the sentiment analysis. Unlike the counted
# versions created earlier in this file, `true_df_words` and `fake_df_words`
# are overwritten here with one row per token occurrence (stop words removed).

# Refer to the column through mutate()'s data mask rather than `fake_df$text`.
fake_df <- mutate(fake_df, text = as.character(text))

# The join key is named explicitly so the anti-joins do not depend on which
# column names the tables happen to share.
true_df_words <- true_df %>%
  unnest_tokens(word, text) %>%
  anti_join(get_stopwords(), by = "word")
fake_df_words <- fake_df %>%
  unnest_tokens(word, text) %>%
  anti_join(get_stopwords(), by = "word")
# The ten most frequent sentiment-bearing words in the "fake" articles: keep
# only tokens found in the Bing lexicon, count each (word, sentiment) pair
# and list them from most to least frequent.
fake_df_words_top <- fake_df_words %>%
  # Explicit key: match on the token only, without a "Joining, by" message.
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment) %>%
  top_n(10, n) %>%
  arrange(desc(n))
# The ten most frequent sentiment-bearing words in the "true" articles: keep
# only tokens found in the Bing lexicon, count each (word, sentiment) pair
# and list them from most to least frequent.
true_df_words_top <- true_df_words %>%
  # Explicit key: match on the token only, without a "Joining, by" message.
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment) %>%
  top_n(10, n) %>%
  arrange(desc(n))
# table of the top 10 sentiment words (word, sentiment, count) in the "true" articles
knitr::kable(true_df_words_top)
| word | sentiment | n |
|---|---|---|
| trump | positive | 42795 |
| support | positive | 7124 |
| like | positive | 4664 |
| work | positive | 4653 |
| top | positive | 4410 |
| opposition | negative | 4008 |
| right | positive | 3917 |
| well | positive | 3848 |
| intelligence | positive | 3829 |
| led | positive | 3641 |
# table of the top 10 sentiment words (word, sentiment, count) in the "fake" articles
knitr::kable(fake_df_words_top)
| word | sentiment | n |
|---|---|---|
| trump | positive | 75423 |
| like | positive | 17892 |
| right | positive | 10636 |
| well | positive | 7773 |
| support | positive | 5859 |
| good | positive | 5370 |
| work | positive | 5118 |
| great | positive | 4235 |
| attack | negative | 4124 |
| won | positive | 3726 |
Wordclouds
# Comparison word cloud contrasting the sentiment classes of the words in the
# "fake" articles (Bing lexicon), limited to 175 words.
fake_df_words %>%
  # Explicit key: match on the token only, without a "Joining, by" message.
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  # Reshape to a word-by-sentiment matrix of counts, with 0 where a word does
  # not occur under a sentiment.
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray10", "gray50"),
    max.words = 175
  )
# Comparison word cloud contrasting the sentiment classes of the words in the
# "true" articles (Bing lexicon), limited to 175 words.
true_df_words %>%
  # Explicit key: match on the token only, without a "Joining, by" message.
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  # Reshape to a word-by-sentiment matrix of counts, with 0 where a word does
  # not occur under a sentiment.
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray10", "gray50"),
    max.words = 175
  )