#install.packages("tidyverse")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#install.packages("tidytext")
library(tidytext)
#install.packages("ggwordcloud")
library(ggwordcloud)
#install.packages('textdata')
library(textdata)
#install.packages("striptf")
#Reads in the articles from the Economic Times
econ_times <- read_lines("Newspaperarticles.txt")
#Turns the list of articles into a tibble
econ_times <- tibble(econ_times)
#View(econ_times)
#Turns the text into a character
econ_times$econ_times <- as.character(econ_times$econ_times)
#View(econ_times)
#Removes extra text that's not needed
econ_times <- econ_times[1108:4914, ]
#View(econ_times)
#Removes the stop words and then groups and counts on word count
econ_times <- econ_times %>%
unnest_tokens(word, econ_times)%>%
anti_join(stop_words)%>%
count(word, sort=TRUE)
## Joining, by = "word"
#View(econ_times)
Ok, now that we have our word frequencies, let's do some analysis. We will compare the newspapers using sentiment analysis to see whether they generally align or not.
#helps with the sentiment analysis, using package "textdata"
get_sentiments('afinn')# we see a list of words and their classifications, 2,477 in all - not really that many overall.
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
get_sentiments('nrc')# quite a few more at 13,901, but as we can see words are classified into several different categories.
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,891 more rows
get_sentiments('bing')# also a good amount more at 6,786, but as we can see just negative and positive.
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
econ_sentiment_affin <- econ_times %>%
inner_join(get_sentiments("afinn"))#using a inner join to match words and add the sentiment variable
## Joining, by = "word"
econ_sentiment_nrc <- econ_times %>%
inner_join(get_sentiments("nrc"))
## Joining, by = "word"
econ_sentiment_bing <- econ_times %>%
inner_join(get_sentiments("bing"))
## Joining, by = "word"
#View(econ_sentiment_bing)
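The Economic Times sentiment frames are created above but not summarized; as a hedged sketch (not part of the original run), the net AFINN score can be computed by weighting each matched word's value by its count.
#net AFINN score for the Economic Times: each word's value weighted by how often it appears
econ_sentiment_affin %>%
summarize(net_afinn = sum(value * n))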
#read in txt file
USA_Today <- read_lines("USATodayDS.txt")
#converting data to a tibble
USA_Today <- tibble(USA_Today)
#view(USA_Today)
#remove extra rows at the top
USA_Today <- USA_Today[60:4274, ]
#view(USA_Today)
#converting data to character
USA_Today$USA_Today <- as.character(USA_Today$USA_Today)
str(USA_Today)
## tibble [4,215 × 1] (S3: tbl_df/tbl/data.frame)
## $ USA_Today: chr [1:4215] "" "\\f3\\fs20 \\cf0 It may be a new year, but we're all still dealing with fallout from 2020. One area where this "| __truncated__ "\\f0\\fs24 \\" "" ...
#Removes the stop words and then groups and counts on word count
USA_Today <- USA_Today %>%
unnest_tokens(word, USA_Today)%>%
anti_join(stop_words)%>%
count(word, sort=TRUE)
## Joining, by = "word"
#removing tokens that are not actual words (numbers, RTF formatting artifacts, etc.)
USA_Today <- USA_Today[13:1757, ]
USA_Today <- USA_Today[-c(2,3,4,5,8,11,12,13,14,15,16,17,21,22,32,46,50,51,52,55), ]
view(USA_Today)
USA_Today <- USA_Today[-c(1,4,5,8,19), ]
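As an alternative to removing rows by position, here is a hedged sketch (the object name USA_Today_clean is hypothetical and not used below) that drops any token containing characters other than lowercase letters and apostrophes, which catches most of the RTF artifacts and numbers:
#index-free cleanup: keep only tokens made of lowercase letters and apostrophes
USA_Today_clean <- USA_Today %>%
filter(str_detect(word, "^[a-z']+$"))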
#helps with the sentiment analysis, using package "textdata"
get_sentiments('afinn')# we see a list of words and their classifications, 2,477 in all - not really that many overall.
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
get_sentiments('nrc')# quite a few more at 13,901, but as we can see words are classified into several different categories.
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,891 more rows
get_sentiments('bing')# also a good amount more at 6,786, but as we can see just negative and positive.
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
#create dataframes for the different sentiment dictionaries and the words and word counts in the articles
USA_Today_sentiment_affin <- USA_Today %>%
inner_join(get_sentiments("afinn"))#using a inner join to match words and add the sentiment variable
## Joining, by = "word"
USA_Today_sentiment_nrc <- USA_Today %>%
inner_join(get_sentiments("nrc"))
## Joining, by = "word"
USA_Today_sentiment_bing <- USA_Today %>%
inner_join(get_sentiments("bing"))
## Joining, by = "word"
#View(USA_Today_sentiment_bing)
#View(USA_Today_sentiment_nrc)
#View(USA_Today_sentiment_affin)
#tabulate the bing and nrc results to see how they differ; bing tags slightly more negative than positive words here, while nrc skews clearly positive
table(USA_Today_sentiment_bing$sentiment)
##
## negative positive
## 86 73
table(USA_Today_sentiment_nrc$sentiment)
##
## anger anticipation disgust fear joy negative
## 38 86 19 53 53 84
## positive sadness surprise trust
## 184 50 31 126
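One caveat: table() counts distinct matched words, not how often they occur. As a hedged sketch (not part of the original run), weighting each word by its count (the n column) gives a tally closer to the overall tone of the articles.
#tally Bing sentiment weighted by word frequency rather than by distinct words
USA_Today_sentiment_bing %>%
group_by(sentiment) %>%
summarize(occurrences = sum(n))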
#View(USA_Today_sentiment_affin)
#plot the AFINN sentiment range to get a better understanding of the articles' sentiments
ggplot(data = USA_Today_sentiment_affin,
aes(x=value)
)+
geom_histogram()+
ggtitle("USA Today Sentiment Range")+
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
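To put a single number on the histogram, here is a hedged sketch of the count-weighted average AFINN score (values run from -5 to +5, so anything above zero leans positive):
#count-weighted average AFINN score for USA Today
USA_Today_sentiment_affin %>%
summarize(mean_score = weighted.mean(value, n))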
#word cloud of the 100 top words
set.seed(42)
ggplot(USA_Today[1:100,], aes(label = word, size = n)
) +
geom_text_wordcloud() +
theme_minimal()
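If the most frequent words crowd out the rest, one option (a hedged sketch, not part of the original run) is to cap the size scale with ggplot2's scale_size_area():
#same word cloud with an explicit cap on text size
ggplot(USA_Today[1:100,], aes(label = word, size = n)) +
geom_text_wordcloud() +
scale_size_area(max_size = 12) +
theme_minimal()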
#Reads in the articles from the Los Angeles Times
la <- read_lines("latimes.txt")
#Turns the list of articles into a tibble
la <- tibble(la)
#View(la)
#Turns the text into a character
la$la<- as.character(la$la)
#View(la)
#Removes extra text that's not needed
la <- la[566:1506, ]
#View(la)
#Removes the stop words and then groups and counts on word count
la <- la %>%
unnest_tokens(word, la)%>%
anti_join(stop_words)%>%
count(word, sort=TRUE)
## Joining, by = "word"
#View(la)
#remove rows that don't have words
la <- la[14:1880, ]
la <- la[-c(5, 6, 8, 14, 15, 23, 26, 28, 31, 34, 35, 42, 53), ]
Now that we have the word frequencies for the Los Angeles Times, we run the same sentiment analysis to see whether it aligns with the other papers.
#helps with the sentiment analysis, using package "textdata"
get_sentiments('afinn')# we see a list of words and their classifications, 2,477 in all - not really that many overall.
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
get_sentiments('nrc')# quite a few more at 13,901, but as we can see words are classified into several different categories.
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,891 more rows
get_sentiments('bing')# also a good amount more at 6,786, but as we can see just negative and positive.
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
la_sentiment_affin <- la %>%
inner_join(get_sentiments("afinn"))#using a inner join to match words and add the sentiment variable
## Joining, by = "word"
la_sentiment_nrc <- la %>%
inner_join(get_sentiments("nrc"))
## Joining, by = "word"
la_sentiment_bing <- la %>%
inner_join(get_sentiments("bing"))
## Joining, by = "word"
#View(la_sentiment_bing)
#plot the AFINN sentiment range to get a better understanding of the articles' sentiments
ggplot(data = la_sentiment_affin,
aes(x=value)
)+
geom_histogram()+
ggtitle("LA Times Sentiment Range")+
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#wordcloud of the 100 top words
set.seed(42)
ggplot(la[1:100,], aes(label = word, size = n)
) +
geom_text_wordcloud() +
theme_minimal()
#reads in data
twv <- read_lines("Files (6).txt")
#creates a tibble
twv <- tibble(twv)
#casts to character
twv$twv <- as.character(twv$twv)
#Removes the stop words and then groups and counts on word count
twv <- twv %>%
unnest_tokens(word, twv)%>%
anti_join(stop_words)%>%
count(word, sort=TRUE)
## Joining, by = "word"
#helps with the sentiment analysis, using package "textdata"
get_sentiments('afinn')# we see a list of words and their classifications, 2,477 in all - not really that many overall.
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
get_sentiments('nrc')# quite a few more at 13,901, but as we can see words are classified into several different categories.
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,891 more rows
get_sentiments('bing')# also a good amount more at 6,786, but as we can see just negative and positive.
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
twv_a <- twv %>%
inner_join(get_sentiments("afinn"))
## Joining, by = "word"
twv_n <- twv %>%
inner_join(get_sentiments("nrc"))
## Joining, by = "word"
twv_b <- twv %>%
inner_join(get_sentiments("bing"))
## Joining, by = "word"
# View(twv_b)
#plot the AFINN sentiment range to get a better understanding of the articles' sentiments
ggplot(data = twv_a,
aes(x=value)
)+
geom_histogram()+
ggtitle("The Times WV Sentiment Range")+
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#word cloud of the top 50 words
set.seed(42)
ggplot(twv[1:50,], aes(label = word, size = n)
) +
geom_text_wordcloud() +
theme_minimal()
#read in data
act <- read_lines("Files (8).txt")
#create a tibble
act <- tibble(act)
#cast to character
act$act <- as.character(act$act)
#Removes the stop words and then groups and counts on word count
act <- act %>%
unnest_tokens(word, act)%>%
anti_join(stop_words)%>%
count(word, sort=TRUE)
## Joining, by = "word"
#helps with the sentiment analysis, using package "textdata"
get_sentiments('afinn')# we see a list of words and their classifications, 2,477 in all - not really that many overall.
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
get_sentiments('nrc')# quite a few more at 13,901, but as we can see words are classified into several different categories.
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,891 more rows
get_sentiments('bing')# also a good amount more at 6,786, but as we can see just negative and positive.
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
act_a <- act %>%
inner_join(get_sentiments("afinn"))
## Joining, by = "word"
act_n <- act %>%
inner_join(get_sentiments("nrc"))
## Joining, by = "word"
act_b <- act %>%
inner_join(get_sentiments("bing"))
## Joining, by = "word"
# View(act_b)
#plot the AFINN sentiment range to get a better understanding of the articles' sentiments
ggplot(data = act_a,
aes(x=value)
)+
geom_histogram()+
ggtitle("Arizona Capitol Times Sentiment Range")+
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
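At this point all five AFINN data frames exist, so as a hedged sketch (not part of the original run; the all_afinn object is new here) they can be stacked and faceted to compare the distributions side by side.
#stack the five AFINN frames and facet the histograms for a side-by-side comparison
all_afinn <- bind_rows("USA Today" = USA_Today_sentiment_affin,
"The Economic Times" = econ_sentiment_affin,
"Los Angeles Times" = la_sentiment_affin,
"The Times West Virginian" = twv_a,
"Arizona Capitol Times" = act_a,
.id = "newspaper")
ggplot(all_afinn, aes(x = value)) +
geom_histogram(binwidth = 1) +
facet_wrap(~newspaper) +
theme_minimal()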
#word cloud of the top 50 words
set.seed(42)
ggplot(act[1:50,], aes(label = word, size = n)
) +
geom_text_wordcloud() +
theme_minimal()
#getting raw data
USA_Today_raw <- as.tibble(read_lines("USATodayDS.txt"))
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
twv_raw <- as.tibble(read_lines("Files (6).txt"))
act_raw <- as.tibble(read_lines("Files (8).txt"))
la_raw <- as.tibble(read_lines("latimes.txt"))
econ_raw <- as.tibble(read_lines("Newspaperarticles.txt"))
#creating a data prep function
data_prep <- function(x,y,z){
i <- as_tibble(t(x))
ii <- unite(i,"text",y:z,remove = TRUE,sep = "")
}
#running data prep function
USA_Today_bag <- data_prep(USA_Today_raw[60:4274,1],'V1','V4214')
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(y)` instead of `y` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(z)` instead of `z` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
twv_bag <- data_prep(twv_raw, "V1", "V335")
act_bag <- data_prep(act_raw, "V1", "V391")
econ_bag <- data_prep(econ_raw[1108:4914,1 ], "V1", "V3806")
la_bag <- data_prep(la_raw[566:1506,1 ], "V1", "V940")
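The tidyselect notes above come from passing bare column-name strings into y:z. Since every call spans the first through the last column anyway, a hedged alternative sketch (data_prep_all is a new name, not used below) can unite everything() and repair the matrix column names explicitly:
#alternative bag-of-words prep: unite all columns, so no column-name endpoints are needed
data_prep_all <- function(x){
i <- as_tibble(t(x), .name_repair = "unique")
unite(i, "text", everything(), remove = TRUE, sep = "")
}
#e.g. twv_bag could equivalently be built with data_prep_all(twv_raw)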
#creating a newspaper column
newspapers <- c("USA Today","The Times West Virginian ","Arizona Capitol Times", "The Economic Times", "Los Angeles Times")
#adding the texts and news papers to a tibble
tf_idf_text <- tibble(newspapers,text=t(tibble(USA_Today_bag$text,twv_bag$text, act_bag$text, econ_bag$text, la_bag$text,.name_repair = "universal"))) #selecting only the text columns
## New names:
## * `USA_Today_bag$text` -> USA_Today_bag.text
## * `twv_bag$text` -> twv_bag.text
## * `act_bag$text` -> act_bag.text
## * `econ_bag$text` -> econ_bag.text
## * `la_bag$text` -> la_bag.text
class(tf_idf_text)
## [1] "tbl_df" "tbl" "data.frame"
#view(tf_idf_text)
#counting words
word_count <- tf_idf_text %>%
unnest_tokens(word, text) %>%
count(newspapers, word, sort = TRUE)
#view(word_count)
#getting total number of words
total_words <- word_count %>%
group_by(newspapers) %>%
summarize(total = sum(n))
#view(total_words)
#joining dataframes into one
newspapers_words <- left_join(word_count, total_words)
## Joining, by = "newspapers"
#View(newspapers_words)
#getting tf and idk and tf_idf
newspapers_words <- newspapers_words %>% #binding on the tf_idf column
bind_tf_idf(word, newspapers, n)
#View(newspapers_words)
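As a quick check on what bind_tf_idf() added: term frequency is just n / total for each word within its newspaper, so it can be recomputed by hand and compared against the tf column (a hedged sketch, not part of the original run).
#recompute term frequency by hand and compare with the tf column
newspapers_words %>%
mutate(tf_manual = n / total) %>%
select(newspapers, word, n, total, tf, tf_manual) %>%
head()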
###COMPARING TF-IDF VALUES [involved sorting tf_idf column, then looking at top ten words for each newspaper corpus (excluding non-relevant filler words, etc.)]
#First sort by tf_idf from greatest to least (arrange keeps each word and newspaper paired with its score)
newspapers_words <- newspapers_words %>% arrange(desc(tf_idf))
#View(newspapers_words)
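For reference, here is a hedged sketch of pulling the ten highest tf-idf terms per newspaper programmatically; the hand-picked lists below additionally drop filler words and RTF artifacts.
#ten highest tf-idf terms per newspaper, before any hand filtering
newspapers_words %>%
group_by(newspapers) %>%
slice_max(tf_idf, n = 10, with_ties = FALSE) %>%
ungroup() %>%
arrange(newspapers, desc(tf_idf))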
#Selecting the top 10 relevant terms (non-filler words) from each Corpus Newspaper and combining into a data frame for visualization
USATODAY<-c('their','data','who','company','tech','year','people','new','usa','companies')
ECONOMICTIMES<-c('data','science','india','rights','technology','more','learning','digital','analytics','bennett')
WESTVA<-c('west','fairmont','students','virginia','virginian','data','standards','state','content','board')
ARIZONA<-c('arizona','home','bill','state','health','water','data','capitol','adoption','forest')
LOSANGELES<-c('people','asian','corgi','uc','americans','electoral','sandy','smoke','data','covid')
final_df<-data.frame(cbind(USATODAY,ECONOMICTIMES,WESTVA,ARIZONA,LOSANGELES))
#View(final_df)
We would want to do some surveys or something interactive to try to find out why we were seeing negative sentiments in some states. We would then like to run some state-level campaigns to increase enthusiasm for data science at the local level. We would also want to expand the data we have access to, as some of the newspapers we tried to acquire articles from were not in the database.
- USA Today: Overall had more positive sentiments and word counts. We also saw more national and specifically American terms in its word cloud.
- The Economic Times: Like USA Today, a national-level newspaper, and so it seemed to have a more positive sentiment. There were a lot of business-related words here, and we discussed how data science seemed intertwined, so to speak, with economics.
- Los Angeles Times: A closer gap between positive and negative words, but still more positive overall. It also had more everyday-life terms, which we attributed to it being a more regional newspaper, even though it is still very large.
- The Times West Virginian: Also showed a fairly positive word count. An interesting word in its word cloud was "colleges," and it turns out some of the articles discussed data science in education.
- Arizona Capitol Times: Fairly evenly split between positive and negative sentiment. It also used a lot of everyday words, like the LA Times. We speculated that some of the articles dealt with legislation and policy, given the words in the word cloud.