Is there a difference in Donald Trump's word usage on Twitter during his run for president versus while he is president?
# Requires the tokenizers package; install it first if you do not have it.

# Read the tweets with text kept as plain character columns, then drop every
# row that has a missing value in any column.
master = read.csv("trump_tweets.csv", stringsAsFactors = FALSE)
master = na.omit(master)

# Give the first column a known name; it holds the "prez" / "not prez" label
# that the split below filters on.
# NOTE(review): presumably the header of that column does not come through
# cleanly from the CSV -- TODO confirm.
colnames(master)[1] = "time"
prez = subset(master, time == "prez")
notprez = subset(master, time == "not prez")

# Tokenize each period's tweets into lowercase bigrams (n = 2) and tabulate
# how often each bigram occurs across all tweets of that period.
ptemp = tokenizers::tokenize_ngrams(as.character(prez$text),
                                    lowercase = TRUE,
                                    n = 2)
pdata = as.data.frame(table(unlist(ptemp)))
nptemp = tokenizers::tokenize_ngrams(as.character(notprez$text),
                                     lowercase = TRUE,
                                     n = 2)
npdata = as.data.frame(table(unlist(nptemp)))

# Full outer join on the bigram (all = TRUE): a bigram seen in only one
# period gets NA in the other period's count column.
final_data = merge(pdata, npdata, by = "Var1", all = TRUE)
colnames(final_data) = c("Bigram", "Prez", "NotPrez")
Load all the libraries or functions that you will use for the rest of the assignment. It is helpful to define your libraries and functions at the top of a report, so that others can know what they need for the report to compile correctly.
# Rling is installed from a local source tarball (repos = NULL) rather than
# from a repository. Install only when the package is missing, so the report
# does not reinstall it on every run and still compiles on a machine where
# Rling is already available.
# NOTE: the tarball path is specific to one machine; adjust it before a
# first-time install elsewhere.
if (!requireNamespace("Rling", quietly = TRUE)) {
  install.packages("C:/Users/arkumar/Downloads/Rling_1.0.tar.gz",
                   repos = NULL, type = "source")
}
library(Rling)
Remember that the NA values just represent options that didn’t occur in one time point versus the other. Fill those NA values in with zero.
# A bigram that never occurred in one of the two periods has NA in that
# period's count column; record it as zero occurrences instead.
final_data = replace(final_data, is.na(final_data), 0)
Use the grep function to keep only the bigrams that contain the word you are interested in.
# Regular-expression anchors used to pick the target word:
#   ^ indicates that it should be the start of the bigram (the first word).
#   $ indicates that it should be the last word of the bigram.
# For example, for the word make the firstword would be "^make" or the
# lastword would be " make$".
# NOTE(review): "^great" has no trailing space, so it also matches bigrams
# whose first word merely begins with "great" (e.g. "greatest ...").
firstword = "^great"
#lastword = " fake$"
# Change to either the first or the last word here.
reduced_data = final_data[grep(firstword, final_data$Bigram), ]
Using Prez as a and Not Prez as b, create the vectors for a, b, c, and d.
# Cells of the 2x2 table for every bigram in reduced_data:
#   a: occurrences of the bigram in the Prez counts
#   b: occurrences of the bigram in the NotPrez counts
#   c: all other Prez occurrences within reduced_data (Prez total minus a)
#   d: all other NotPrez occurrences within reduced_data (NotPrez total minus b)
# Because a is taken from Prez, any score later signed by comparing a with its
# expected value is positive for bigrams over-represented in Prez.
# NOTE: the variable c shares its name with base R's c() function.
prez_total = sum(reduced_data$Prez)
notprez_total = sum(reduced_data$NotPrez)
a = reduced_data$Prez
b = reduced_data$NotPrez
c = prez_total - a
d = notprez_total - b
# Preview the first rows; cbind with a character column yields a character
# matrix, so the counts print as quoted strings.
head(cbind(as.character(reduced_data$Bigram), a, b, c, d), n = 6L)
## a b c d
## [1,] "great 8" "1" "0" "1482" "1469"
## [2,] "great accomplishment" "1" "0" "1482" "1469"
## [3,] "great addition" "1" "0" "1482" "1469"
## [4,] "great again" "142" "399" "1341" "1070"
## [5,] "great again.â" "1" "1" "1482" "1468"
## [6,] "great again:prior" "1" "0" "1482" "1469"
Calculate the expected value of a for each bigram.
# Expected frequency of cell a under independence:
# (row total * column total) / grand total, computed per bigram.
row_total = a + b
col_total = a + c
grand_total = a + b + c + d
aExp = row_total * col_total / grand_total
head(cbind(as.character(reduced_data$Bigram), a, aExp, b, c, d), n = 6L)
## a aExp b c d
## [1,] "great 8" "1" "0.502371273712737" "0" "1482" "1469"
## [2,] "great accomplishment" "1" "0.502371273712737" "0" "1482" "1469"
## [3,] "great addition" "1" "0.502371273712737" "0" "1482" "1469"
## [4,] "great again" "142" "271.782859078591" "399" "1341" "1070"
## [5,] "great again.â" "1" "1.00474254742547" "1" "1482" "1468"
## [6,] "great again:prior" "1" "0.502371273712737" "0" "1482" "1469"
Calculate the log p-values from the Fisher test.
# p-value per bigram from Rling's pv.Fisher.collostr on the a/b/c/d cells.
pvF = pv.Fisher.collostr(a, b, c, d)
# Turn the p-values into a signed score: the magnitude is |log10(p)|, and the
# sign is negative when a is below its expected value, positive otherwise.
log10_p = log10(pvF)
logpvF = ifelse(a < aExp, log10_p, -log10_p)
head(cbind(as.character(reduced_data$Bigram), a, aExp, b, c, d, logpvF), n = 6L)
## a aExp b c d
## [1,] "great 8" "1" "0.502371273712737" "0" "1482" "1469"
## [2,] "great accomplishment" "1" "0.502371273712737" "0" "1482" "1469"
## [3,] "great addition" "1" "0.502371273712737" "0" "1482" "1469"
## [4,] "great again" "142" "271.782859078591" "399" "1341" "1070"
## [5,] "great again.â" "1" "1.00474254742547" "1" "1482" "1468"
## [6,] "great again:prior" "1" "0.502371273712737" "0" "1482" "1469"
## logpvF
## [1,] "4.82163733276644e-17"
## [2,] "4.82163733276644e-17"
## [3,] "4.82163733276644e-17"
## [4,] "-35.2768536128774"
## [5,] "0"
## [6,] "4.82163733276644e-17"
Create the top 10 bigrams for Prez (positive scores) and for Not Prez (negative scores).
# Attach the signed log p-values and sort so the largest (most positive)
# scores come first.
reduced_data$logp = logpvF
reduced_data = reduced_data[order(-reduced_data$logp),]

# Positive end of the ranking. The scores were signed by comparing a (taken
# from the Prez counts) with its expected value, so these are the bigrams
# over-represented in Prez. head() returns at most 10 entries and, unlike
# [1:10], does not pad with NA when fewer than 10 rows are available.
top_prez = head(reduced_data$Bigram, 10)
# Negative end of the ranking (over-represented in NotPrez); rev() puts the
# most negative score first.
top_notprez = rev(tail(reduced_data$Bigram, 10))
head(reduced_data,10)
## Bigram Prez NotPrez logp
## 28179 great state 83 26 7.627114
## 28034 great honor 99 36 7.544729
## 28086 great military 25 0 7.266254
## 28019 great governor 26 2 5.556696
## 28028 great healthcare 17 0 4.835259
## 28166 great senator 14 0 3.925943
## 27918 great american 19 2 3.674133
## 27989 great economic 10 0 2.715470
## 28003 great first 10 0 2.715470
## 28138 great progress 10 0 2.715470
What can you gather from the different top scores for Not Prez and Prez? Did he appear to change his style once inaugurated?
After becoming President, he used "great" as part of a phrase much more often than before. This clearly indicates an effort to showcase progress and to give more credit to various organizations, the economy, and other instruments of democracy. Judging by the phrases analyzed here, his style definitely became more about positive reinforcement.