Is there a difference in Donald Trump's Twitter word usage during his run for president versus while he is president?
Install the tokenizers package, if you do not have it.
master = read.csv("trump_tweets.csv", stringsAsFactors = F)
# Build a bigram frequency table with one count column per period.

# Drop every row that has a missing value in any column.
master = na.omit(master)
# The first column carries the period label that the filters below compare
# against ("prez" / "not prez").
colnames(master)[1] = "time"

# Split each tweet into lower-cased bigrams (two-word sequences).
# TRUE is spelled out because the shorthand T can be reassigned.
tokenize_bigrams = function(text) {
  tokenizers::tokenize_ngrams(as.character(text), lowercase = TRUE, n = 2)
}

# Pool the bigrams of all tweets and tabulate how often each one occurs.
# The bigram ends up in column Var1, which is the merge key further down.
count_bigrams = function(tokens) {
  as.data.frame(table(unlist(tokens)))
}

prez = subset(master, time == "prez")
notprez = subset(master, time == "not prez")

ptemp = tokenize_bigrams(prez$text)
pdata = count_bigrams(ptemp)
nptemp = tokenize_bigrams(notprez$text)
npdata = count_bigrams(nptemp)

# all = TRUE keeps bigrams that appear in only one period; their count for the
# other period is NA after the merge.
final_data = merge(pdata, npdata, by = "Var1", all = TRUE)
colnames(final_data) = c("Bigram", "Prez", "NotPrez")
Load all the libraries or functions that you will use for the rest of the assignment. It is helpful to define your libraries and functions at the top of a report, so that others can know what they need for the report to compile correctly.
library(Rling)
Remember that the NA values just represent options that didn’t occur in one time point versus the other. Fill those NA values in with zero.
# A bigram seen in only one period has NA as its count for the other period;
# a missing count means zero occurrences, so fill those cells with 0.
final_data = replace(final_data, is.na(final_data), 0)
Use the grep function. ^ indicates that it should be the start of the bigram. $ indicates that it should be the last word of the bigram. For example, with the word make, the firstword would be "^make " or the lastword would be " make$".
firstword = "^change "
lastword = " change$"
# Keep only the bigrams that match the chosen pattern. Swap firstword for
# lastword in the call below to anchor on the second word instead of the first.
reduced_data = final_data[grepl(firstword, final_data$Bigram), ]
Using Not Prez as a and Prez as b, create the vectors for a, b, c, and d.
# Per-bigram observed counts: a comes from the Prez column, b from NotPrez.
# NOTE(review): the task text asks for Not Prez as a and Prez as b; here a is
# Prez, so positive scores further down point at Prez. Confirm which
# orientation is intended before swapping anything.
a = reduced_data[["Prez"]]
b = reduced_data[["NotPrez"]]
# c and d are the counts of all *other* bigrams in reduced_data for the same
# column (column total minus this bigram's own count).
c = sum(a) - a
d = sum(b) - b
# Preview the first rows of the four count vectors next to their bigram.
head(cbind(as.character(reduced_data[["Bigram"]]), a, b, c, d))
## a b c d
## [1,] "change always" "1" "0" "41" "62"
## [2,] "change and" "1" "2" "41" "60"
## [3,] "change big" "1" "0" "41" "62"
## [4,] "change election" "1" "0" "41" "62"
## [5,] "change fast" "1" "0" "41" "62"
## [6,] "change history" "1" "0" "41" "62"
Calculate the expected value of a for each bigram.
# Expected count for a: (row total) * (column total) / (grand total), where
# a + b is the bigram's total over both columns, a + c is the overall Prez
# total within reduced_data, and a + b + c + d is the total of all counts.
aExp = (a + b) * (a + c) / (a + b + c + d)
# Preview the first rows with the expected value next to the observed counts.
head(cbind(as.character(reduced_data$Bigram), a, aExp, b, c, d))
## a aExp b c d
## [1,] "change always" "1" "0.403846153846154" "0" "41" "62"
## [2,] "change and" "1" "1.21153846153846" "2" "41" "60"
## [3,] "change big" "1" "0.403846153846154" "0" "41" "62"
## [4,] "change election" "1" "0.403846153846154" "0" "41" "62"
## [5,] "change fast" "1" "0.403846153846154" "0" "41" "62"
## [6,] "change history" "1" "0.403846153846154" "0" "41" "62"
Calculate the log p-values from the Fisher test.
# Per-bigram p-values from Rling's pv.Fisher.collostr, fed the four count
# vectors.
pvF = pv.Fisher.collostr(a, b, c, d)
# Signed log10 p-value: positive when the observed a is at least its expected
# value, negative when a falls below it.
logpvF = ifelse(
  a >= aExp,
  -log10(pvF),
  log10(pvF)
)
# Preview the first rows with the signed score appended.
head(cbind(as.character(reduced_data$Bigram), a, aExp, b, c, d, logpvF))
## a aExp b c d
## [1,] "change always" "1" "0.403846153846154" "0" "41" "62"
## [2,] "change and" "1" "1.21153846153846" "2" "41" "60"
## [3,] "change big" "1" "0.403846153846154" "0" "41" "62"
## [4,] "change election" "1" "0.403846153846154" "0" "41" "62"
## [5,] "change fast" "1" "0.403846153846154" "0" "41" "62"
## [6,] "change history" "1" "0.403846153846154" "0" "41" "62"
## logpvF
## [1,] "0.39378404890088"
## [2,] "-4.82163733276644e-17"
## [3,] "0.39378404890088"
## [4,] "0.39378404890088"
## [5,] "0.39378404890088"
## [6,] "0.39378404890088"
Create the top 10 bigrams for Not Prez (positive scores) and for Prez (negative scores).
# Attach the signed log p-value to each bigram.
reduced_data$logp = logpvF
# Sort so the largest (most positive) scores come first; positive scores mean
# the Prez count (a) is at or above its expected value.
reduced_data = reduced_data[order(-reduced_data$logp), ]
# head() rather than [1:10]: when fewer than ten bigrams match the pattern,
# [1:10] pads the result with NA entries, while head() returns what exists.
top_prez = head(reduced_data$Bigram, 10)
head(reduced_data, 10)
## Bigram Prez NotPrez logp
## 13872 change the 9 3 1.8936410
## 13868 change our 3 0 1.2003776
## 13877 change tones 3 0 1.2003776
## 13863 change libel 2 0 0.7938374
## 13866 change of 2 0 0.7938374
## 13854 change always 1 0 0.3937840
## 13856 change big 1 0 0.3937840
## 13857 change election 1 0 0.3937840
## 13858 change fast 1 0 0.3937840
## 13859 change history 1 0 0.3937840
# Show the bigrams selected from the top of the descending sort as plain
# character strings.
as.character(top_prez)
## [1] "change the" "change our" "change tones"
## [4] "change libel" "change of" "change always"
## [7] "change big" "change election" "change fast"
## [10] "change history"
# Re-sort ascending so the most negative scores come first; negative scores
# mean the Prez count (a) is below its expected value.
reduced_data = reduced_data[order(reduced_data$logp), ]
# head() rather than [1:10]: when fewer than ten bigrams match the pattern,
# [1:10] pads the result with NA entries, while head() returns what exists.
top_notprez = head(reduced_data$Bigram, 10)
head(reduced_data, 10)
## Bigram Prez NotPrez logp
## 88700 change your 0 4 -8.366519e-01
## 88677 change i 0 2 -2.891921e-01
## 88680 change it's 0 2 -2.891921e-01
## 88684 change nothing 0 2 -2.891921e-01
## 88689 change that 0 2 -2.891921e-01
## 88693 change trump2016 0 2 -2.891921e-01
## 13855 change and 1 2 -4.821637e-17
## 13860 change in 1 2 -4.821637e-17
## 13865 change my 1 2 -4.821637e-17
## 13876 change to 1 2 -4.821637e-17
# Show the bigrams selected from the top of the ascending sort as plain
# character strings.
as.character(top_notprez)
## [1] "change your" "change i" "change it's"
## [4] "change nothing" "change that" "change trump2016"
## [7] "change and" "change in" "change my"
## [10] "change to"
What can you gather from the different top scores for Not Prez and Prez? Did he appear to change his style once inaugurated?
From the results above, Donald Trump appears to like using “change” to express his ambitious goals for the country. After he was elected President, however, he used “change” much less on Twitter.