Create the Data

Is there a difference in Donald Trump's word usage on Twitter during his run for president versus while he is president?

# Load the tweet table with text kept as character vectors, then drop every
# row that has a missing value in any column.
master <- na.omit(read.csv("trump_tweets.csv", stringsAsFactors = FALSE))

# Rename the first column to "time"; it is the column filtered on below.
names(master)[1] <- "time"

# Split the tweets into the two periods using the labels stored in `time`.
prez <- subset(master, subset = time == "prez")
notprez <- subset(master, subset = time == "not prez")

# Break each Prez tweet into lowercased two-word sequences (bigrams).
ptemp <- tokenizers::tokenize_ngrams(
  as.character(prez$text),
  lowercase = TRUE,
  n = 2
)
# Pool the bigrams from all tweets and tabulate them. The table is passed to
# as.data.frame() unnamed so the bigram column is called Var1, which is the
# key the later merge relies on.
pdata <- as.data.frame(table(unlist(ptemp)))

# Same procedure for the Not Prez tweets.
nptemp <- tokenizers::tokenize_ngrams(
  as.character(notprez$text),
  lowercase = TRUE,
  n = 2
)
npdata <- as.data.frame(table(unlist(nptemp)))

# Full outer join on the bigram: bigrams seen in only one period are kept and
# get NA as their count for the other period.
final_data <- merge(x = pdata, y = npdata, by = "Var1", all = TRUE)

# Label the columns: the bigram, its Prez count, and its Not Prez count.
names(final_data) <- c("Bigram", "Prez", "NotPrez")

Load the Libraries + Functions

Load all the libraries or functions that you will use for the rest of the assignment. It is helpful to define your libraries and functions at the top of a report, so that others know what they need for the report to compile correctly.

library(Rling)

Deal with NA values

Remember that the NA values just represent options that didn’t occur in one time point versus the other. Fill those NA values in with zero.

# A missing count means the bigram never occurred in that period, so record
# it as a frequency of zero.
missing_cells <- is.na(final_data)
final_data[missing_cells] <- 0

Pick your construction

# Regular expressions selecting bigrams in which "change" is the first word
# or the last word, respectively.
firstword <- "^change "
lastword <- " change$"

# Swap `firstword` for `lastword` on the next line to study the other
# construction.
matching_rows <- grep(firstword, final_data$Bigram)
reduced_data <- final_data[matching_rows, ]

Summarize the data

Using Prez as a and Not Prez as b, create the vectors for a, b, c, and d.

# Per-bigram 2 x 2 counts. a and b are the bigram's own frequency in the Prez
# and Not Prez tweets; c and d are the combined frequency of all the other
# selected bigrams in Prez and Not Prez.
# NOTE: the variable `c` shadows the name of base::c(); the name is kept
# because the later steps read a, b, c and d.
a <- reduced_data$Prez
b <- reduced_data$NotPrez
c <- sum(a) - a
d <- sum(b) - b
head(cbind(as.character(reduced_data$Bigram), a, b, c, d))
##                        a   b   c    d   
## [1,] "change always"   "1" "0" "41" "62"
## [2,] "change and"      "1" "2" "41" "60"
## [3,] "change big"      "1" "0" "41" "62"
## [4,] "change election" "1" "0" "41" "62"
## [5,] "change fast"     "1" "0" "41" "62"
## [6,] "change history"  "1" "0" "41" "62"

Calculate aExp

Calculate the expected value of a for each bigram.

# Expected value of a under independence: (bigram total * Prez total) divided
# by the grand total of all selected bigrams.
bigram_total <- a + b
prez_total <- a + c
grand_total <- a + b + c + d
aExp <- bigram_total * prez_total / grand_total
head(cbind(as.character(reduced_data$Bigram), a, aExp, b, c, d))
##                        a   aExp                b   c    d   
## [1,] "change always"   "1" "0.403846153846154" "0" "41" "62"
## [2,] "change and"      "1" "1.21153846153846"  "2" "41" "60"
## [3,] "change big"      "1" "0.403846153846154" "0" "41" "62"
## [4,] "change election" "1" "0.403846153846154" "0" "41" "62"
## [5,] "change fast"     "1" "0.403846153846154" "0" "41" "62"
## [6,] "change history"  "1" "0.403846153846154" "0" "41" "62"

logPF

Calculate the log p-values from the Fisher test.

# Fisher p-value per bigram; pv.Fisher.collostr() is expected to come from
# the Rling package loaded earlier in the report.
pvF <- pv.Fisher.collostr(a, b, c, d)

# Signed log10 p-value: positive when the observed count a is at least its
# expected value, negative when a falls below it.
log_p <- log10(pvF)
logpvF <- ifelse(a >= aExp, -log_p, log_p)
head(cbind(as.character(reduced_data$Bigram), a, aExp, b, c, d, logpvF))
##                        a   aExp                b   c    d   
## [1,] "change always"   "1" "0.403846153846154" "0" "41" "62"
## [2,] "change and"      "1" "1.21153846153846"  "2" "41" "60"
## [3,] "change big"      "1" "0.403846153846154" "0" "41" "62"
## [4,] "change election" "1" "0.403846153846154" "0" "41" "62"
## [5,] "change fast"     "1" "0.403846153846154" "0" "41" "62"
## [6,] "change history"  "1" "0.403846153846154" "0" "41" "62"
##      logpvF                 
## [1,] "0.39378404890088"     
## [2,] "-4.82163733276644e-17"
## [3,] "0.39378404890088"     
## [4,] "0.39378404890088"     
## [5,] "0.39378404890088"     
## [6,] "0.39378404890088"

Calculate the top scores

List the top 10 bigrams for Prez (positive scores) and for Not Prez (negative scores).

# Attach the signed score and sort so the largest values come first. Because
# a holds the Prez counts, high positive scores mark bigrams that are
# over-represented in the Prez tweets.
reduced_data$logp <- logpvF
reduced_data <- reduced_data[order(-reduced_data$logp), ]
# head() returns at most 10 entries, so a construction with fewer than 10
# bigrams is not padded with NA the way `[1:10]` would pad it.
top_prez <- head(reduced_data$Bigram, 10)
head(reduced_data, 10)
##                Bigram Prez NotPrez      logp
## 13872      change the    9       3 1.8936410
## 13868      change our    3       0 1.2003776
## 13877    change tones    3       0 1.2003776
## 13863    change libel    2       0 0.7938374
## 13866       change of    2       0 0.7938374
## 13854   change always    1       0 0.3937840
## 13856      change big    1       0 0.3937840
## 13857 change election    1       0 0.3937840
## 13858     change fast    1       0 0.3937840
## 13859  change history    1       0 0.3937840
as.character(top_prez)
##  [1] "change the"      "change our"      "change tones"   
##  [4] "change libel"    "change of"       "change always"  
##  [7] "change big"      "change election" "change fast"    
## [10] "change history"
# Re-sort ascending so the most negative scores come first. Because a holds
# the Prez counts, negative scores mark bigrams that are over-represented in
# the Not Prez tweets.
reduced_data <- reduced_data[order(reduced_data$logp), ]
# head() returns at most 10 entries, so a construction with fewer than 10
# bigrams is not padded with NA the way `[1:10]` would pad it.
top_notprez <- head(reduced_data$Bigram, 10)
head(reduced_data, 10)
##                 Bigram Prez NotPrez          logp
## 88700      change your    0       4 -8.366519e-01
## 88677         change i    0       2 -2.891921e-01
## 88680      change it's    0       2 -2.891921e-01
## 88684   change nothing    0       2 -2.891921e-01
## 88689      change that    0       2 -2.891921e-01
## 88693 change trump2016    0       2 -2.891921e-01
## 13855       change and    1       2 -4.821637e-17
## 13860        change in    1       2 -4.821637e-17
## 13865        change my    1       2 -4.821637e-17
## 13876        change to    1       2 -4.821637e-17
as.character(top_notprez)
##  [1] "change your"      "change i"         "change it's"     
##  [4] "change nothing"   "change that"      "change trump2016"
##  [7] "change and"       "change in"        "change my"       
## [10] "change to"

Interpreting the numbers

What can you gather from the different top scores for Not Prez and Prez? Did he appear to change his style once inaugurated? From the results above, I found that Donald Trump liked to use “change” to express his ambitious goals for the country. After he was elected President, however, he used “change” much less often on Twitter.