Create the Data

Is there a difference in Donald Trump's Twitter word usage during his run for president versus during his time as president?

# Load the tweet spreadsheet and split it into the two periods.
# The five columns are read as text, text, text, date, numeric. The first
# column is renamed to `time` and holds the period label ("prez" / "not prez").
# read_excel() is namespace-qualified because the only library() call visible
# in this report attaches Rling, not readxl.
# NOTE(review): the absolute path is machine-specific; adjust it before
# running the report on another computer.
trump_tweets <- readxl::read_excel(
  "/Users/pallavisaitu/Downloads/trump_tweets.xls",
  col_types = c("text", "text", "text", "date", "numeric")
)
# Drop every row that has a missing value in any column.
trump_tweets <- na.omit(trump_tweets)

colnames(trump_tweets)[1] <- "time"

# One data frame per period.
prez <- subset(trump_tweets, time == "prez")
notprez <- subset(trump_tweets, time == "not prez")

# Split every tweet into lower-cased two-word sequences (bigrams) and count
# how often each bigram occurs, separately for the two periods.
# as.data.frame(table(...)) yields one row per bigram with columns
# `Var1` (the bigram) and `Freq` (its count).
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change the tokenizer's behaviour.
ptemp <- tokenizers::tokenize_ngrams(
  as.character(prez$text),
  lowercase = TRUE,
  n = 2
)
pdata <- as.data.frame(table(unlist(ptemp)))

nptemp <- tokenizers::tokenize_ngrams(
  as.character(notprez$text),
  lowercase = TRUE,
  n = 2
)
npdata <- as.data.frame(table(unlist(nptemp)))

# Full outer join of the two count tables on the bigram column, so a bigram
# seen in only one period is kept (its count for the other period is NA).
# The three resulting columns are then given descriptive names.
final_data <- setNames(
  merge(pdata, npdata, by = "Var1", all = TRUE),
  c("Bigram", "Prez", "NotPrez")
)

Load the Libraries + Functions

Load all the libraries or functions that you will use for the rest of the assignment. It is helpful to define your libraries and functions at the top of a report, so that others know what they need for the report to compile correctly.

library(Rling)

Deal with NA values

Remember that the NA values just represent options that didn’t occur in one time point versus the other. Fill those NA values in with zero.

# An NA count means the bigram never occurred in that period, so it is
# recorded as a frequency of zero.
final_data <- replace(final_data, is.na(final_data), 0)

Pick your construction

# Regular expressions for the two slots of the construction:
# bigrams that start with "make " or bigrams that end with " america".
firstword <- "^make "
lastword <- " america$"

# Swap `lastword` for `firstword` on the next line to study the other slot.
construction <- lastword
# Keep only the bigrams that match the chosen pattern.
keep_rows <- grepl(construction, final_data$Bigram)
reduced_data <- final_data[keep_rows, ]

Summarize the data

Using Prez as a and Not Prez as b, create the vectors for a, b, c, and d.

# Build the four cells of the 2 x 2 contingency table for every bigram:
#   a = count of this bigram in the Prez period
#   b = count of this bigram in the NotPrez period
#   c = count of all other selected bigrams in the Prez period
#   d = count of all other selected bigrams in the NotPrez period
# Because a is the Prez count, a > expected means "more typical of Prez".
prez_total <- sum(reduced_data$Prez)
notprez_total <- sum(reduced_data$NotPrez)

a <- reduced_data$Prez
b <- reduced_data$NotPrez
c <- prez_total - reduced_data$Prez
d <- notprez_total - reduced_data$NotPrez

# Show the first rows next to their bigram labels.
head(cbind(as.character(reduced_data$Bigram), a, b, c, d))
##                       a   b   c     d    
## [1,] "about america"  "2" "4" "357" "708"
## [2,] "across america" "3" "2" "356" "710"
## [3,] "again america"  "1" "0" "358" "712"
## [4,] "and america"    "9" "1" "350" "711"
## [5,] "around america" "1" "0" "358" "712"
## [6,] "back america"   "1" "0" "358" "712"

Calculate aExp

Calculate the expected value of a for each bigram.

# Expected value of cell a under independence:
# (row total * column total) / grand total.
bigram_total <- a + b
prez_column_total <- a + c
grand_total <- a + b + c + d
Exp <- bigram_total * prez_column_total / grand_total

# Show observed a next to its expected value.
head(cbind(as.character(reduced_data$Bigram), a, Exp, b, c, d))
##                       a   Exp                 b   c     d    
## [1,] "about america"  "2" "2.01120448179272"  "4" "357" "708"
## [2,] "across america" "3" "1.67600373482726"  "2" "356" "710"
## [3,] "again america"  "1" "0.335200746965453" "0" "358" "712"
## [4,] "and america"    "9" "3.35200746965453"  "1" "350" "711"
## [5,] "around america" "1" "0.335200746965453" "0" "358" "712"
## [6,] "back america"   "1" "0.335200746965453" "0" "358" "712"

logPF

Calculate the log p-values from the Fisher test.

# Fisher p-value for each bigram's 2 x 2 table (function from Rling).
pvF <- pv.Fisher.collostr(a, b, c, d)

# Turn the p-value into a signed effect size: the magnitude is |log10(p)|,
# negative when a is below its expected value and positive otherwise.
direction <- ifelse(a < Exp, 1, -1)
logpvF <- direction * log10(pvF)

head(cbind(as.character(reduced_data$Bigram), a, Exp, b, c, d, logpvF))
##                       a   Exp                 b   c     d    
## [1,] "about america"  "2" "2.01120448179272"  "4" "357" "708"
## [2,] "across america" "3" "1.67600373482726"  "2" "356" "710"
## [3,] "again america"  "1" "0.335200746965453" "0" "358" "712"
## [4,] "and america"    "9" "3.35200746965453"  "1" "350" "711"
## [5,] "around america" "1" "0.335200746965453" "0" "358" "712"
## [6,] "back america"   "1" "0.335200746965453" "0" "358" "712"
##      logpvF             
## [1,] "0"                
## [2,] "0.466695797455702"
## [3,] "0.474695022253536"
## [4,] "3.45432045600062" 
## [5,] "0.474695022253536"
## [6,] "0.474695022253536"

Calculate the top scores

Create the top 10 bigrams for Prez (positive scores) and for Not Prez (negative scores).

# Attach the signed log p-values and sort from the largest score down, so the
# bigrams most strongly associated with the Prez period come first.
reduced_data$logp <- logpvF
reduced_data <- reduced_data[order(-reduced_data$logp), ]

# head() returns at most 10 entries; indexing with [1:10] would pad the result
# with NA whenever fewer than 10 bigrams match the construction.
top_prez <- head(reduced_data$Bigram, 10)
head(reduced_data, 10)
##                   Bigram Prez NotPrez      logp
## 39038     making america   37      12 9.0503579
## 32287         in america   31      10 7.6420733
## 67154         to america   16       6 3.5494599
## 5750         and america    9       1 3.4543205
## 36007       keep america    7       0 3.3398914
## 51644    putting america    6       0 2.8603165
## 44917         of america   22      19 2.1637064
## 11448      bless america    7       2 2.0794249
## 37064 leadership america    3       0 1.4265053
## 11655    booming america    2       0 0.9501958
# Print the top Prez bigrams as plain strings.
as.character(top_prez)
##  [1] "making america"     "in america"         "to america"        
##  [4] "and america"        "keep america"       "putting america"   
##  [7] "of america"         "bless america"      "leadership america"
## [10] "booming america"
# Re-sort from the smallest score up, so the bigrams most strongly associated
# with the NotPrez period (most negative scores) come first.
reduced_data <- reduced_data[order(reduced_data$logp), ]

# head() returns at most 10 entries; indexing with [1:10] would pad the result
# with NA whenever fewer than 10 bigrams match the construction.
top_notprez <- head(reduced_data$Bigram, 10)
head(reduced_data, 10)
##                         Bigram Prez NotPrez        logp
## 38950             make america  103     380 -13.9510960
## 92442         crippled america    0      30  -5.2967691
## 77978              you america    2      29  -3.1504879
## 125723            save america    0       9  -1.4780333
## 98283              fix america    0       5  -0.7568401
## 138097           trump america    0       5  -0.7568401
## 52250  realdonaldtrump america    1       7  -0.5526976
## 87977            bring america    0       4  -0.5126699
## 110868           loves america    0       4  -0.5126699
## 113629         morning america    0       4  -0.5126699
# Print the top NotPrez bigrams as plain strings.
as.character(top_notprez)
##  [1] "make america"            "crippled america"       
##  [3] "you america"             "save america"           
##  [5] "fix america"             "trump america"          
##  [7] "realdonaldtrump america" "bring america"          
##  [9] "loves america"           "morning america"

Interpreting the numbers

What can you gather from the different top scores for Not Prez and Prez? Did he appear to change his style once inaugurated?

Before he was elected president, he used ‘make america’ and ‘save america’ a lot. After becoming president he still used ‘make america’ often, but its frequency decreased, which makes sense because this is the time to act rather than to keep repeating the same promises.