Text analysis of Hinze and Mankowski’s ss

library(quanteda)
library(stringi)
library(dplyr)

# features are available here: http://liwc.wpengine.com/compare-dictionaries/
# language manual: https://repositories.lib.utexas.edu/bitstream/handle/2152/31333/LIWC2015_LanguageManual.pdf

# Load the LIWC 2015 dictionary; the relative file name below is resolved
# against the working directory set here.
setwd("~/Dropbox/research/LIWC")
mydict <- dictionary(file = "LIWC2015_English.dic", format = "LIWC")

# Input data: proc_texts() reads the `treatment` column as the grouping
# variable and is called on the `text2` column.
data <- read.csv("~/Dropbox/research/IQWST_Motivation/Data/UnitReviews/processed_2.csv", stringsAsFactors=FALSE)

# LIWC categories to compare between groups. Every entry must match a
# dictionary key exactly: a key that is not in the dictionary yields an
# all-zero column and a NaN t-test. The LIWC2015 keys for negative emotion and
# relativity are "negemo" and "relativ" (previously misspelled as "negmo" and
# "relativity").
query_list <- c(
    "cogproc", "insight", "cause", "discrep", "tentat", "certain", "differ",
    "focuspast", "focuspresent", "focusfuture",
    "social", "affect", "posemo", "negemo", "drives", "relativ"
)

# Main function

#' Compare dictionary-category usage between groups
#'
#' Applies the dictionary to `text`, then for each category in `query`
#' computes the per-document share of tokens counted under that category
#' (rounded to 3 decimals), prints a t-test of those shares by `group`, and
#' stores the per-group mean share.
#'
#' @param query Character vector of dictionary category names.
#' @param text Texts to analyse, one element per document.
#' @param group Grouping variable with one value per document. Defaults to
#'   the `treatment` column of the global `data`.
#' @param dict Dictionary passed to `dfm()`. Defaults to the global `mydict`.
#' @return Invisibly, a list named by `query`; each element is the per-group
#'   summary (`var`, `prop`) for that category.
proc_texts <- function(query, text, group = data$treatment, dict = mydict) {
    out <- dfm(text, dictionary = dict)
    my_n <- ntoken(text)
    stopifnot(length(group) == length(my_n))

    # A category that is not a dictionary key selects zero columns below, so
    # every share is 0 and the t-test is NaN; surface that instead of letting
    # it pass silently.
    unknown <- setdiff(query, features(out))
    if (length(unknown) > 0) {
        warning(
            "Categories not found in the dictionary: ",
            paste(unknown, collapse = ", "),
            call. = FALSE
        )
    }

    to_output <- vector("list", length(query))
    for (i in seq_along(query)) {
        subset_by_feat <- out[, features(out) %in% query[i]]
        tmp <- data.frame(
            var = group,
            freq = round(rowSums(subset_by_feat) / my_n, 3)
        )
        # NOTE(review): a zero-token document gives 0 / 0 = NaN, which
        # is.infinite() does not match, so it is not zeroed here and is
        # dropped from the group mean by na.rm instead. Confirm that this is
        # the intended treatment.
        tmp$freq[is.infinite(tmp$freq)] <- 0
        to_output[[i]] <- tmp %>%
            group_by(var) %>%
            summarize(prop = mean(freq, na.rm = TRUE))
        print(paste0("THIS IS THE OUTPUT FOR ", query[i]))
        print(t.test(tmp$freq ~ tmp$var)) # prints t-test result
    }
    names(to_output) <- query
    # Return the collected summaries; ending on the `names<-` assignment would
    # return `query` and discard them.
    invisible(to_output)
}
# Run the comparison for every category in query_list on the text2 column;
# the t-test for each category is printed as a side effect.
out <- proc_texts(query_list, data$text2)

## 
##    ... lowercasing
##    ... tokenizing
##    ... indexing documents: 91 documents
##    ... indexing features: 721 feature types
##    ... applying a dictionary consisting of 73 keys
##    ... created a 91 x 73 sparse dfm
##    ... complete. 
## Elapsed time: 1.492 seconds.
## [1] "THIS IS THE OUTPUT FOR cogproc"
## 
##  Welch Two Sample t-test
## 
## data:  tmp$freq by tmp$var
## t = -4.4847, df = 69.319, p-value = 2.821e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.10249145 -0.03938474
## sample estimates:
## mean in group 0 mean in group 1 
##       0.0662619       0.1372000 
## 
## [1] "THIS IS THE OUTPUT FOR insight"
## 
##  Welch Two Sample t-test
## 
## data:  tmp$freq by tmp$var
## t = -3.4325, df = 54.642, p-value = 0.001147
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.03997014 -0.01049970
## sample estimates:
## mean in group 0 mean in group 1 
##      0.01080952      0.03604444 
## 
## [1] "THIS IS THE OUTPUT FOR cause"
## 
##  Welch Two Sample t-test
## 
## data:  tmp$freq by tmp$var
## t = -2.2385, df = 69.527, p-value = 0.02839
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.029374674 -0.001691993
## sample estimates:
## mean in group 0 mean in group 1 
##      0.01866667      0.03420000 
## 
## [1] "THIS IS THE OUTPUT FOR discrep"
## 
##  Welch Two Sample t-test
## 
## data:  tmp$freq by tmp$var
## t = -2.9836, df = 67.206, p-value = 0.003968
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.041744845 -0.008280552
## sample estimates:
## mean in group 0 mean in group 1 
##     0.009809524     0.034822222 
## 
## [1] "THIS IS THE OUTPUT FOR tentat"
## 
##  Welch Two Sample t-test
## 
## data:  tmp$freq by tmp$var
## t = -1.9807, df = 74.345, p-value = 0.05133
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.478730e-02  7.301017e-05
## sample estimates:
## mean in group 0 mean in group 1 
##      0.01430952      0.02666667 
## 
## [1] "THIS IS THE OUTPUT FOR certain"
## 
##  Welch Two Sample t-test
## 
## data:  tmp$freq by tmp$var
## t = 0.29404, df = 84.967, p-value = 0.7694
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.004353437  0.005864548
## sample estimates:
## mean in group 0 mean in group 1 
##     0.007000000     0.006244444 
## 
## [1] "THIS IS THE OUTPUT FOR differ"
## 
##  Welch Two Sample t-test
## 
## data:  tmp$freq by tmp$var
## t = -1.6815, df = 83.433, p-value = 0.09641
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.02662308  0.00222943
## sample estimates:
## mean in group 0 mean in group 1 
##      0.02004762      0.03224444 
## 
## [1] "THIS IS THE OUTPUT FOR focuspast"
## 
##  Welch Two Sample t-test
## 
## data:  tmp$freq by tmp$var
## t = -0.79782, df = 81.682, p-value = 0.4273
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.02385063  0.01019666
## sample estimates:
## mean in group 0 mean in group 1 
##      0.02459524      0.03142222 
## 
## [1] "THIS IS THE OUTPUT FOR focuspresent"
## 
##  Welch Two Sample t-test
## 
## data:  tmp$freq by tmp$var
## t = -0.51707, df = 77.28, p-value = 0.6066
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.03253148  0.01911878
## sample estimates:
## mean in group 0 mean in group 1 
##       0.1074048       0.1141111 
## 
## [1] "THIS IS THE OUTPUT FOR focusfuture"
## 
##  Welch Two Sample t-test
## 
## data:  tmp$freq by tmp$var
## t = -0.66759, df = 81.435, p-value = 0.5063
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.009495509  0.004724080
## sample estimates:
## mean in group 0 mean in group 1 
##     0.008880952     0.011266667 
## 
## [1] "THIS IS THE OUTPUT FOR social"
## 
##  Welch Two Sample t-test
## 
## data:  tmp$freq by tmp$var
## t = -0.042423, df = 84.306, p-value = 0.9663
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.01778152  0.01703866
## sample estimates:
## mean in group 0 mean in group 1 
##      0.03709524      0.03746667 
## 
## [1] "THIS IS THE OUTPUT FOR affect"
## 
##  Welch Two Sample t-test
## 
## data:  tmp$freq by tmp$var
## t = -2.1372, df = 69.976, p-value = 0.03608
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.0171964347 -0.0005940415
## sample estimates:
## mean in group 0 mean in group 1 
##     0.009571429     0.018466667 
## 
## [1] "THIS IS THE OUTPUT FOR posemo"
## 
##  Welch Two Sample t-test
## 
## data:  tmp$freq by tmp$var
## t = -1.2958, df = 72.54, p-value = 0.1992
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.012401110  0.002629681
## sample estimates:
## mean in group 0 mean in group 1 
##     0.007714286     0.012600000 
## 
## [1] "THIS IS THE OUTPUT FOR negmo"
## 
##  Welch Two Sample t-test
## 
## data:  tmp$freq by tmp$var
## t = NaN, df = NaN, p-value = NA
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  NaN NaN
## sample estimates:
## mean in group 0 mean in group 1 
##               0               0 
## 
## [1] "THIS IS THE OUTPUT FOR drives"
## 
##  Welch Two Sample t-test
## 
## data:  tmp$freq by tmp$var
## t = -0.78397, df = 78.56, p-value = 0.4354
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.02498761  0.01086697
## sample estimates:
## mean in group 0 mean in group 1 
##      0.03809524      0.04515556 
## 
## [1] "THIS IS THE OUTPUT FOR relativity"
## 
##  Welch Two Sample t-test
## 
## data:  tmp$freq by tmp$var
## t = NaN, df = NaN, p-value = NA
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  NaN NaN
## sample estimates:
## mean in group 0 mean in group 1 
##               0               0

Text analysis of Hinze and Mankowski’s ss

Joshua Rosenberg

June 3, 2016