Is there a difference in twitter word usage of Donald Trump during his run for president versus while he is president?
Install the tokenizers package, if you do not have it.
master = read.csv("trump_tweets.csv", stringsAsFactors = F)
master = na.omit(master)
colnames(master)[1] = "time"
prez = subset(master, time == "prez")
notprez = subset(master, time == "not prez")
ptemp = tokenizers::tokenize_ngrams(as.character(prez$text),
lowercase = T,
n = 2)
pdata = as.data.frame(table(unlist(ptemp)))
nptemp = tokenizers::tokenize_ngrams(as.character(notprez$text),
lowercase = T,
n = 2)
npdata = as.data.frame(table(unlist(nptemp)))
final_data = merge(pdata, npdata, by = "Var1", all = TRUE)
colnames(final_data) = c("Bigram", "Prez", "NotPrez")
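As an optional sanity check, one can count how many bigrams appear in only one of the two time periods; because merge() was called with all = TRUE, those rows show up as NA before they are zero-filled in the next step.
# Optional check: bigrams unique to one time period appear as NA after the full merge
sum(is.na(final_data$Prez))      # bigrams that only occurred before the presidency
sum(is.na(final_data$NotPrez))   # bigrams that only occurred during the presidency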
Load all the libraries or functions that you will use for the rest of the assignment. It is helpful to define your libraries and functions at the top of a report, so that others know what they need for the report to compile correctly.
library(Rling)
Remember that the NA values just represent options that didn’t occur in one time point versus the other. Fill those NA values in with zero.
final_data[is.na(final_data)] = 0
Reduce the data to only the bigrams that start (or end) with your chosen word, using the grep function. ^ indicates that it should be the first word of the bigram, and $ indicates that it should be the last word of the bigram. For the word make, the firstword pattern would be "^make " and the lastword pattern would be " make$".
firstword = "^make "
#lastword = " make$"
#change to either first or last word here
reduced_data = final_data[grep(firstword, final_data$Bigram), ]
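A small toy example (made-up strings, not from the tweet data) shows how the two anchored patterns behave:
# Toy illustration of the anchors on made-up bigrams
grep("^make ", c("make america", "will make", "make it"))   # matches 1 and 3
grep(" make$", c("make america", "will make", "make it"))   # matches 2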
Using Prez as a and Not Prez as b, create the vectors for a, b, c, and d.
a = reduced_data$Prez
b = reduced_data$NotPrez
c = sum(reduced_data$Prez) - reduced_data$Prez
d = sum(reduced_data$NotPrez) - reduced_data$NotPrez
head(cbind(as.character(reduced_data$Bigram), a, b, c, d))
## a b c d
## [1,] "make 3" "1" "0" "259" "604"
## [2,] "make a" "44" "32" "216" "572"
## [3,] "make almost" "1" "0" "259" "604"
## [4,] "make america" "103" "378" "157" "226"
## [5,] "make an" "2" "2" "258" "602"
## [6,] "make and" "1" "1" "259" "603"
Calculate the expected value of a for each bigram.
Exp = (a + b) * (a + c) / (a + b + c + d)
head(cbind(as.character(reduced_data$Bigram), a, Exp, b, c, d))
## a Exp b c d
## [1,] "make 3" "1" "0.300925925925926" "0" "259" "604"
## [2,] "make a" "44" "22.8703703703704" "32" "216" "572"
## [3,] "make almost" "1" "0.300925925925926" "0" "259" "604"
## [4,] "make america" "103" "144.74537037037" "378" "157" "226"
## [5,] "make an" "2" "1.2037037037037" "2" "258" "602"
## [6,] "make and" "1" "0.601851851851852" "1" "259" "603"
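As a hand check with the "make america" row above: a + b = 481, a + c = 260, and a + b + c + d = 864, so the expected value is 481 * 260 / 864, which matches the Exp column.
# Hand check of Exp for "make america"
(103 + 378) * (103 + 157) / (103 + 378 + 157 + 226)  # about 144.745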
Calculate the log p-values from the Fisher test.
pvF = pv.Fisher.collostr(a, b, c, d)
# Convert to effect size measure
logpvF = ifelse(a < Exp, log10(pvF), -log10(pvF))
head(cbind(as.character(reduced_data$Bigram), a, Exp, b, c, d, logpvF))
## a Exp b c d
## [1,] "make 3" "1" "0.300925925925926" "0" "259" "604"
## [2,] "make a" "44" "22.8703703703704" "32" "216" "572"
## [3,] "make almost" "1" "0.300925925925926" "0" "259" "604"
## [4,] "make america" "103" "144.74537037037" "378" "157" "226"
## [5,] "make an" "2" "1.2037037037037" "2" "258" "602"
## [6,] "make and" "1" "0.601851851851852" "1" "259" "603"
## logpvF
## [1,] "0.521540394508075"
## [2,] "6.83505620083176"
## [3,] "0.521540394508075"
## [4,] "-9.14445123335334"
## [5,] "0.230659049802421"
## [6,] "0.291121076380002"
Create the top 10 bigrams for Prez (positive scores) and for Not Prez (negative scores).
reduced_data$logp = logpvF
reduced_data = reduced_data[order(-reduced_data$logp),]
top_prez = reduced_data$Bigram[1:10]
head(reduced_data,10)
## Bigram Prez NotPrez logp
## 39530 make a 44 32 6.8350562
## 39585 make up 8 4 2.0345332
## 39538 make california 3 0 1.5681390
## 39539 make case 2 0 1.0442514
## 39550 make freedom 2 0 1.0442514
## 39563 make much 2 0 1.0442514
## 39568 make or 2 0 1.0442514
## 39575 make safety 2 0 1.0442514
## 39586 make us 5 4 0.8602290
## 39555 make his 2 1 0.6637883
as.character(top_prez)
## [1] "make a" "make up" "make california"
## [4] "make case" "make freedom" "make much"
## [7] "make or" "make safety" "make us"
## [10] "make his"
reduced_data = reduced_data[order(reduced_data$logp),]
top_notprez = reduced_data$Bigram[1:10]
head(reduced_data,10)
## Bigram Prez NotPrez logp
## 39532 make america 103 378 -9.14445123
## 39584 make this 3 16 -0.67454341
## 111885 make donald 0 4 -0.49209659
## 111915 make them 0 5 -0.48172161
## 39558 make it 7 24 -0.36812899
## 111874 make americaâ 0 3 -0.25336167
## 111875 make anything 0 3 -0.25336167
## 39593 make your 2 8 -0.13571217
## 39569 make our 5 15 -0.09360166
## 39535 make any 1 2 0.00000000
as.character(top_notprez)
## [1] "make america" "make this" "make donald" "make them"
## [5] "make it" "make americaâ" "make anything" "make your"
## [9] "make our" "make any"
What can you gather from the different top scores for Not Prez and Prez? Did he appear to change his style once inaugurated?
Before he was elected president, he leaned heavily on 'make' bigrams such as 'make america' and 'make it'. During the campaign he was trying to win the election, and 'Make America Great Again' was his slogan, so those phrases dominate the Not Prez tweets. After he became president he still used 'make america', since he was trying to keep that promise, but much less often: his focus shifted from making new promises to delivering on the one he had already made, which is why the frequency of these 'make' bigrams dropped.