Create the Data

Is there a difference in Donald Trump's Twitter word usage during his run for president versus while he is president?

# Read the raw tweets and drop rows with missing values
master = read.csv("trump_tweets.csv", stringsAsFactors = FALSE)
master = na.omit(master)

colnames(master)[1] = "time"

# Split the tweets by time period
prez = subset(master, time == "prez")
notprez = subset(master, time == "not prez")

# Tokenize each set of tweets into bigrams and tabulate their frequencies
ptemp = tokenizers::tokenize_ngrams(as.character(prez$text),
                            lowercase = TRUE,
                            n = 2)
pdata = as.data.frame(table(unlist(ptemp)))

nptemp = tokenizers::tokenize_ngrams(as.character(notprez$text),
                            lowercase = TRUE,
                            n = 2)
npdata = as.data.frame(table(unlist(nptemp)))

# Merge the two frequency tables, keeping bigrams that appear in either period
final_data = merge(pdata, npdata, by = "Var1", all = TRUE)

colnames(final_data) = c("Bigram", "Prez", "NotPrez")
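
An optional sanity check at this point: inspect the merged table to confirm that each bigram row carries a frequency for both time periods (NA where a bigram never occurred in one of them).

# Quick look at the merged table before filling in NAs
str(final_data)
head(final_data)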

Load the Libraries + Functions

Load all the libraries and functions that you will use for the rest of the assignment. It is helpful to define your libraries and functions at the top of a report, so that others know what they need for the report to compile correctly.

# Rling is not on CRAN; install it from the local source tarball
install.packages("C:/Users/arkumar/Downloads/Rling_1.0.tar.gz", repos = NULL, type = "source")
library(Rling)

Deal with NA values

Remember that the NA values just represent bigrams that occurred in one time period but not the other. Fill those NA values in with zero.

final_data[is.na(final_data)] = 0

Pick your construction

firstword = "^great"
#lastword = " fake$"

# Change to either the first- or last-word pattern here.
# Note: "^great" also matches bigrams beginning with "greatest" or
# "greatly"; use "^great " (with a trailing space) to match the exact word.
reduced_data = final_data[grep(firstword, final_data$Bigram), ]

Summarize the data

Using Prez as a and Not Prez as b, create the vectors for a, b, c, and d, where c and d are the counts of all other bigrams in each period.

a = reduced_data$Prez                                 # bigram frequency while president
b = reduced_data$NotPrez                              # bigram frequency before presidency
c = sum(reduced_data$Prez) - reduced_data$Prez        # all other bigrams while president
d = sum(reduced_data$NotPrez) - reduced_data$NotPrez  # all other bigrams before presidency
head(cbind(as.character(reduced_data$Bigram), a, b, c, d))
##                             a     b     c      d     
## [1,] "great 8"              "1"   "0"   "1482" "1469"
## [2,] "great accomplishment" "1"   "0"   "1482" "1469"
## [3,] "great addition"       "1"   "0"   "1482" "1469"
## [4,] "great again"          "142" "399" "1341" "1070"
## [5,] "great again.â"        "1"   "1"   "1482" "1468"
## [6,] "great again:prior"    "1"   "0"   "1482" "1469"

Calculate aExp

Calculate the expected value of a for each bigram. This is the standard expected cell frequency for a 2 x 2 contingency table: the row total times the column total, divided by the grand total.

aExp = (a + b) * (a + c) / (a + b + c + d)
head(cbind(as.character(reduced_data$Bigram), a, aExp, b, c, d))
##                             a     aExp                b     c      d     
## [1,] "great 8"              "1"   "0.502371273712737" "0"   "1482" "1469"
## [2,] "great accomplishment" "1"   "0.502371273712737" "0"   "1482" "1469"
## [3,] "great addition"       "1"   "0.502371273712737" "0"   "1482" "1469"
## [4,] "great again"          "142" "271.782859078591"  "399" "1341" "1070"
## [5,] "great again.â"        "1"   "1.00474254742547"  "1"   "1482" "1468"
## [6,] "great again:prior"    "1"   "0.502371273712737" "0"   "1482" "1469"
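
To see the arithmetic for a single row, take "great again" from the output above: the row total (a + b) times the column total (a + c), divided by the grand total.

# Worked example for "great again", using the counts shown above
(142 + 399) * (142 + 1341) / (142 + 399 + 1341 + 1070)
## [1] 271.7829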

logPF

Calculate the log p-values from the Fisher test.

pvF = pv.Fisher.collostr(a, b, c, d)
# Convert to a signed effect size: negative when the bigram is less
# frequent than expected while president (Not Prez-typical), positive
# when it is more frequent than expected (Prez-typical)
logpvF = ifelse(a < aExp, log10(pvF), -log10(pvF))
head(cbind(as.character(reduced_data$Bigram), a, aExp, b, c, d, logpvF))
##                             a     aExp                b     c      d     
## [1,] "great 8"              "1"   "0.502371273712737" "0"   "1482" "1469"
## [2,] "great accomplishment" "1"   "0.502371273712737" "0"   "1482" "1469"
## [3,] "great addition"       "1"   "0.502371273712737" "0"   "1482" "1469"
## [4,] "great again"          "142" "271.782859078591"  "399" "1341" "1070"
## [5,] "great again.â"        "1"   "1.00474254742547"  "1"   "1482" "1468"
## [6,] "great again:prior"    "1"   "0.502371273712737" "0"   "1482" "1469"
##      logpvF                
## [1,] "4.82163733276644e-17"
## [2,] "4.82163733276644e-17"
## [3,] "4.82163733276644e-17"
## [4,] "-35.2768536128774"   
## [5,] "0"                   
## [6,] "4.82163733276644e-17"
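
As a cross-check on pv.Fisher.collostr, the same 2 x 2 table can be passed to base R's fisher.test for any single bigram. This is a sketch only: fisher.test is two-sided by default, so its p-value may not match a one-tailed collostructional value exactly.

# Cross-check one bigram against base R's Fisher exact test
i = which(reduced_data$Bigram == "great again")[1]
tab = matrix(c(a[i], b[i], c[i], d[i]), nrow = 2, byrow = TRUE)
fisher.test(tab)$p.value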

Calculate the top scores

Create the top 10 bigrams for Prez (positive scores) and for Not Prez (negative scores).

reduced_data$logp = logpvF
# Sort from most Prez-typical (largest positive scores)
# to most Not Prez-typical (largest negative scores)
reduced_data = reduced_data[order(-reduced_data$logp), ]
top_prez = reduced_data$Bigram[1:10]
head(reduced_data, 10)
##                 Bigram Prez NotPrez     logp
## 28179      great state   83      26 7.627114
## 28034      great honor   99      36 7.544729
## 28086   great military   25       0 7.266254
## 28019   great governor   26       2 5.556696
## 28028 great healthcare   17       0 4.835259
## 28166    great senator   14       0 3.925943
## 27918   great american   19       2 3.674133
## 27989   great economic   10       0 2.715470
## 28003      great first   10       0 2.715470
## 28138   great progress   10       0 2.715470
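
The complementary top 10 for Not Prez sits at the other end of the same ordering; a minimal sketch (top_notprez is an illustrative name):

# With rows sorted in descending order of logp, the most
# Not Prez-typical bigrams (largest negative scores) are at the bottom
top_notprez = tail(reduced_data$Bigram, 10)
tail(reduced_data, 10)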

Interpreting the numbers

What can you gather from the different top scores for Not Prez and Prez? Did he appear to change his style once inaugurated?

After becoming president, Trump used "great" as part of a phrase far more often than before. The top Prez bigrams ("great state", "great honor", "great military", "great governor") suggest an effort to showcase progress and give credit to institutions such as the states, the military, and the economy. Judging by the phrases analyzed here, his style shifted toward positive reinforcement once inaugurated.