Is there a difference in Donald Trump's Twitter word usage during his run for president versus while he is president?
Install the tokenizers package, if you do not have it.
trump_tweets = read_excel("/Users/pallavisaitu/Downloads/trump_tweets.xls", col_types = c("text", "text", "text", "date","numeric"))
# Drop incomplete rows and name the first column "time"; it holds the period
# label ("prez" / "not prez") used to split the tweets below.
trump_tweets = na.omit(trump_tweets)
colnames(trump_tweets)[1] = "time"

# Split the tweets by period.
prez = subset(trump_tweets, time == "prez")
notprez = subset(trump_tweets, time == "not prez")

# Tokenize tweet text into lower-cased bigrams. Shared by both periods so the
# two frequency tables are guaranteed to be built with identical settings.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
tokenize_bigrams = function(text) {
  tokenizers::tokenize_ngrams(as.character(text), lowercase = TRUE, n = 2)
}

# Bigram frequency table for each period.
ptemp = tokenize_bigrams(prez$text)
pdata = as.data.frame(table(unlist(ptemp)))
nptemp = tokenize_bigrams(notprez$text)
npdata = as.data.frame(table(unlist(nptemp)))

# Full outer join on the bigram (all = TRUE) so bigrams seen in only one
# period are kept; their count for the other period is NA at this point.
final_data = merge(pdata, npdata, by = "Var1", all = TRUE)
colnames(final_data) = c("Bigram", "Prez", "NotPrez")
Load all the libraries or functions that you will use for the rest of the assignment. It is helpful to define your libraries and functions at the top of a report, so that others can know what they need for the report to compile correctly.
library(Rling)
Remember that the NA values just represent options that didn’t occur in one time point versus the other. Fill those NA values in with zero.
# A bigram that never occurred in one period has NA for that period's count;
# treat those as zero occurrences.
final_data = replace(final_data, is.na(final_data), 0)
Use the grep function to select bigrams by their first or last word. ^ indicates that it should be the start of the bigram. $ indicates that it should be the last word of the bigram. For example, for the word make, the firstword would be "^make " or the lastword would be " make$".
firstword = "^make "
# Anchored pattern: matches bigrams whose final word is "america".
lastword = " america$"
# Change the pattern below to either the first-word or last-word variable.
reduced_data = final_data[grepl(lastword, final_data$Bigram), ]
Using Prez as a and Not Prez as b, create the vectors for a, b, c, and d.
# Per-bigram cells of the 2x2 table, computed as whole vectors:
#   a = Prez count of the bigram      b = NotPrez count of the bigram
#   c = Prez total minus a            d = NotPrez total minus b
# (totals are taken over the filtered rows in reduced_data)
a = reduced_data$Prez
b = reduced_data$NotPrez
c = sum(a) - a
d = sum(b) - b
# Preview the first rows; binding with the bigram text gives a character matrix.
head(cbind(as.character(reduced_data$Bigram), a, b, c, d))
## a b c d
## [1,] "about america" "2" "4" "357" "708"
## [2,] "across america" "3" "2" "356" "710"
## [3,] "again america" "1" "0" "358" "712"
## [4,] "and america" "9" "1" "350" "711"
## [5,] "around america" "1" "0" "358" "712"
## [6,] "back america" "1" "0" "358" "712"
Calculate the expected value of a for each bigram.
# Expected frequency of cell a under independence:
# (row total * column total) / grand total, evaluated per bigram.
Exp = local({
  row_total = a + b
  col_total = a + c
  grand_total = a + b + c + d
  row_total * col_total / grand_total
})
# Preview observed (a) next to expected (Exp) for the first rows.
head(cbind(as.character(reduced_data$Bigram), a, Exp, b, c, d))
## a Exp b c d
## [1,] "about america" "2" "2.01120448179272" "4" "357" "708"
## [2,] "across america" "3" "1.67600373482726" "2" "356" "710"
## [3,] "again america" "1" "0.335200746965453" "0" "358" "712"
## [4,] "and america" "9" "3.35200746965453" "1" "350" "711"
## [5,] "around america" "1" "0.335200746965453" "0" "358" "712"
## [6,] "back america" "1" "0.335200746965453" "0" "358" "712"
Calculate the log p-values from the Fisher test.
# Fisher exact test p-value for each bigram's 2x2 table.
pvF = pv.Fisher.collostr(a, b, c, d)
# Turn the p-value into a signed score: positive when the observed count a is
# at least the expected count, negative when a falls below it.
logpvF = ifelse(a >= Exp, -log10(pvF), log10(pvF))
# Preview the scores alongside the counts for the first rows.
head(cbind(as.character(reduced_data$Bigram), a, Exp, b, c, d, logpvF))
## a Exp b c d
## [1,] "about america" "2" "2.01120448179272" "4" "357" "708"
## [2,] "across america" "3" "1.67600373482726" "2" "356" "710"
## [3,] "again america" "1" "0.335200746965453" "0" "358" "712"
## [4,] "and america" "9" "3.35200746965453" "1" "350" "711"
## [5,] "around america" "1" "0.335200746965453" "0" "358" "712"
## [6,] "back america" "1" "0.335200746965453" "0" "358" "712"
## logpvF
## [1,] "0"
## [2,] "0.466695797455702"
## [3,] "0.474695022253536"
## [4,] "3.45432045600062"
## [5,] "0.474695022253536"
## [6,] "0.474695022253536"
Create the top 10 bigrams for Prez (positive scores) and for Not Prez (negative scores).
# Attach the signed log p-values and sort from the largest score downwards.
reduced_data$logp = logpvF
reduced_data = reduced_data[order(-reduced_data$logp), ]
# head() returns at most 10 entries, whereas [1:10] would pad the result with
# NA whenever the filtered table holds fewer than 10 bigrams.
top_prez = head(reduced_data$Bigram, 10)
head(reduced_data, 10)
## Bigram Prez NotPrez logp
## 39038 making america 37 12 9.0503579
## 32287 in america 31 10 7.6420733
## 67154 to america 16 6 3.5494599
## 5750 and america 9 1 3.4543205
## 36007 keep america 7 0 3.3398914
## 51644 putting america 6 0 2.8603165
## 44917 of america 22 19 2.1637064
## 11448 bless america 7 2 2.0794249
## 37064 leadership america 3 0 1.4265053
## 11655 booming america 2 0 0.9501958
# Show the top bigrams as plain character strings.
print(as.character(top_prez))
## [1] "making america" "in america" "to america"
## [4] "and america" "keep america" "putting america"
## [7] "of america" "bless america" "leadership america"
## [10] "booming america"
# Re-sort ascending so the most negative scores come first.
reduced_data = reduced_data[order(reduced_data$logp), ]
# head() returns at most 10 entries, whereas [1:10] would pad the result with
# NA whenever the filtered table holds fewer than 10 bigrams.
top_notprez = head(reduced_data$Bigram, 10)
head(reduced_data, 10)
## Bigram Prez NotPrez logp
## 38950 make america 103 380 -13.9510960
## 92442 crippled america 0 30 -5.2967691
## 77978 you america 2 29 -3.1504879
## 125723 save america 0 9 -1.4780333
## 98283 fix america 0 5 -0.7568401
## 138097 trump america 0 5 -0.7568401
## 52250 realdonaldtrump america 1 7 -0.5526976
## 87977 bring america 0 4 -0.5126699
## 110868 loves america 0 4 -0.5126699
## 113629 morning america 0 4 -0.5126699
# Show the top bigrams as plain character strings.
print(as.character(top_notprez))
## [1] "make america" "crippled america"
## [3] "you america" "save america"
## [5] "fix america" "trump america"
## [7] "realdonaldtrump america" "bring america"
## [9] "loves america" "morning america"
What can you gather from the different top scores for Not Prez and Prez? Did he appear to change his style once inaugurated?
Before he was elected president, he frequently used bigrams such as ‘make america’ and ‘save america’. After becoming president he still used the word ‘america’ a lot, but the frequency of these campaign phrases decreased (for example, ‘make america’ dropped from 380 to 103 occurrences). This makes sense, because once in office it is time to act rather than keep repeating the same promises.