Create the Data

Is there a difference in Donald Trump's Twitter word usage during his run for president versus while in office?

# Read in the tweets and drop incomplete rows
master = read.csv("trump_tweets.csv", stringsAsFactors = FALSE)
master = na.omit(master)

# The first column codes the time period ("prez" vs. "not prez")
colnames(master)[1] = "time"

# Split the tweets into presidency and campaign subsets
prez = subset(master, time == "prez")
notprez = subset(master, time == "not prez")

# Tokenize each subset into lowercased bigrams and tabulate their frequencies
ptemp = tokenizers::tokenize_ngrams(as.character(prez$text),
                            lowercase = TRUE,
                            n = 2)
pdata = as.data.frame(table(unlist(ptemp)))

nptemp = tokenizers::tokenize_ngrams(as.character(notprez$text),
                            lowercase = TRUE,
                            n = 2)
npdata = as.data.frame(table(unlist(nptemp)))

# Merge the two frequency tables, keeping bigrams that occur in only one period
final_data = merge(pdata, npdata, by = "Var1", all = TRUE)

colnames(final_data) = c("Bigram", "Prez", "NotPrez")
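
A quick look at the merged table is a useful check at this point; NA counts mark bigrams that occur in only one time period and are filled in below:

# Inspect the merged frequency table (NAs = bigrams unique to one period)
str(final_data)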

Load the Libraries + Functions

Load all the libraries and functions that you will use for the rest of the assignment. It is helpful to define your libraries and functions at the top of a report, so that others know what they need for the report to compile correctly.

library(Rling)  # provides the pv.Fisher.collostr function used below

Deal with NA values

Remember that the NA values just represent bigrams that didn’t occur in one time period versus the other. Fill those NA values in with zero.

final_data[is.na(final_data)] = 0
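
A quick check that the fill worked; this should print 0:

# Confirm no NA counts remain
sum(is.na(final_data))
## [1] 0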

Pick your construction

# Pattern for bigrams STARTING with "make"
firstword = "^make "
# Pattern for bigrams ENDING in "fake"
#lastword = " fake$"

# Change to either the first- or last-word pattern here
reduced_data = final_data[grep(firstword, final_data$Bigram), ]
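
For the last-word variant, the same subsetting call works with the other pattern; fake_data below is just an illustrative name and is not used in the rest of the report:

# Alternative construction (not used below): bigrams ENDING in "fake"
lastword = " fake$"
fake_data = final_data[grep(lastword, final_data$Bigram), ]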

Summarize the data

Using Prez as a and Not Prez as b, create the vectors for a, b, c, and d: a is each bigram’s frequency in the Prez tweets, b is its frequency in the Not Prez tweets, and c and d count all the other ‘make’ bigrams in each period.

a = reduced_data$Prez     # bigram frequency in Prez tweets
b = reduced_data$NotPrez  # bigram frequency in Not Prez tweets
c = sum(reduced_data$Prez) - reduced_data$Prez        # all other "make" bigrams in Prez
d = sum(reduced_data$NotPrez) - reduced_data$NotPrez  # all other "make" bigrams in Not Prez
head(cbind(as.character(reduced_data$Bigram), a, b, c, d))
##                     a     b     c     d    
## [1,] "make 3"       "1"   "0"   "259" "604"
## [2,] "make a"       "44"  "32"  "216" "572"
## [3,] "make almost"  "1"   "0"   "259" "604"
## [4,] "make america" "103" "378" "157" "226"
## [5,] "make an"      "2"   "2"   "258" "602"
## [6,] "make and"     "1"   "1"   "259" "603"

Calculate aExp

Calculate the expected value of a for each bigram.

# Expected frequency of each bigram in the Prez period if the two periods did not differ
Exp = (a + b) * (a + c) / (a + b + c + d)
head(cbind(as.character(reduced_data$Bigram), a, Exp, b, c, d))
##                     a     Exp                 b     c     d    
## [1,] "make 3"       "1"   "0.300925925925926" "0"   "259" "604"
## [2,] "make a"       "44"  "22.8703703703704"  "32"  "216" "572"
## [3,] "make almost"  "1"   "0.300925925925926" "0"   "259" "604"
## [4,] "make america" "103" "144.74537037037"   "378" "157" "226"
## [5,] "make an"      "2"   "1.2037037037037"   "2"   "258" "602"
## [6,] "make and"     "1"   "0.601851851851852" "1"   "259" "603"
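
As a quick sanity check on the formula, the expected value for ‘make america’ can be reproduced by hand from its row in the table above:

# Check the expected value for "make america": a = 103, b = 378, c = 157, d = 226
(103 + 378) * (103 + 157) / (103 + 378 + 157 + 226)
## [1] 144.7454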

logPF

Calculate the log10 p-values from the Fisher exact test, signed so that positive values mark bigrams attracted to the Prez period (a > Exp) and negative values mark bigrams attracted to Not Prez (a < Exp).

pvF = pv.Fisher.collostr(a, b, c, d)
# Convert to a signed effect size: positive = attracted to Prez, negative = attracted to Not Prez
logpvF = ifelse(a < Exp, log10(pvF), -log10(pvF))
head(cbind(as.character(reduced_data$Bigram), a, Exp, b, c, d, logpvF))
##                     a     Exp                 b     c     d    
## [1,] "make 3"       "1"   "0.300925925925926" "0"   "259" "604"
## [2,] "make a"       "44"  "22.8703703703704"  "32"  "216" "572"
## [3,] "make almost"  "1"   "0.300925925925926" "0"   "259" "604"
## [4,] "make america" "103" "144.74537037037"   "378" "157" "226"
## [5,] "make an"      "2"   "1.2037037037037"   "2"   "258" "602"
## [6,] "make and"     "1"   "0.601851851851852" "1"   "259" "603"
##      logpvF             
## [1,] "0.521540394508075"
## [2,] "6.83505620083176" 
## [3,] "0.521540394508075"
## [4,] "-9.14445123335334"
## [5,] "0.230659049802421"
## [6,] "0.291121076380002"
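
If the Rling package is unavailable, a comparable vector of p-values can be built from base R’s fisher.test. This is only a sketch: it assumes pv.Fisher.collostr returns the two-sided Fisher exact p-value for each 2 x 2 table, so verify it against the installed version before relying on it.

# Sketch of a base-R alternative to Rling::pv.Fisher.collostr
# (assumes the two-sided p-value; each table is [[a, b], [c, d]])
pvF_base = mapply(function(a, b, c, d)
  fisher.test(matrix(c(a, c, b, d), nrow = 2))$p.value,
  a, b, c, d)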

Calculate the top scores

Create the top 10 bigrams for Prez (positive scores) and for Not Prez (negative scores).

reduced_data$logp = logpvF
# Sort descending: most Prez-attracted bigrams first
reduced_data = reduced_data[order(-reduced_data$logp),]
top_prez = reduced_data$Bigram[1:10]
head(reduced_data,10)
##                Bigram Prez NotPrez      logp
## 39530          make a   44      32 6.8350562
## 39585         make up    8       4 2.0345332
## 39538 make california    3       0 1.5681390
## 39539       make case    2       0 1.0442514
## 39550    make freedom    2       0 1.0442514
## 39563       make much    2       0 1.0442514
## 39568         make or    2       0 1.0442514
## 39575     make safety    2       0 1.0442514
## 39586         make us    5       4 0.8602290
## 39555        make his    2       1 0.6637883
as.character(top_prez)
##  [1] "make a"          "make up"         "make california"
##  [4] "make case"       "make freedom"    "make much"      
##  [7] "make or"         "make safety"     "make us"        
## [10] "make his"
# Sort ascending: most Not Prez-attracted bigrams first
reduced_data = reduced_data[order(reduced_data$logp),]
top_notprez = reduced_data$Bigram[1:10]
head(reduced_data,10)
##               Bigram Prez NotPrez        logp
## 39532   make america  103     378 -9.14445123
## 39584      make this    3      16 -0.67454341
## 111885   make donald    0       4 -0.49209659
## 111915     make them    0       5 -0.48172161
## 39558        make it    7      24 -0.36812899
## 111874 make americaâ    0       3 -0.25336167
## 111875 make anything    0       3 -0.25336167
## 39593      make your    2       8 -0.13571217
## 39569       make our    5      15 -0.09360166
## 39535       make any    1       2  0.00000000
as.character(top_notprez)
##  [1] "make america"  "make this"     "make donald"   "make them"    
##  [5] "make it"       "make americaâ" "make anything" "make your"    
##  [9] "make our"      "make any"

Interpreting the numbers

What can you gather from the different top scores for Not Prez and Prez? Did he appear to change his style once inaugurated? Before he became president, he used ‘make’ bigrams far more often (604 tokens versus 260 while in office), and the pre-presidency list is dominated by campaign language such as ‘make america’ and ‘make it’. During the campaign he was trying to win the election, and ‘make America great again’ was his slogan. After taking office he still used ‘make america’, but much less: as president his job was to fulfill that promise rather than keep repeating it, which is why the overall frequency of ‘make’ dropped and the remaining Prez-attracted bigrams (‘make a’, ‘make up’, and so on) are more varied.