library(quanteda)
library(stringi)
library(dplyr)
# features are available here: http://liwc.wpengine.com/compare-dictionaries/
# language manual: https://repositories.lib.utexas.edu/bitstream/handle/2152/31333/LIWC2015_LanguageManual.pdf
# Load the LIWC 2015 dictionary and the processed unit-review responses.
# NOTE: both paths point into the original author's Dropbox folder; adjust
# them before running on another machine.
setwd("~/Dropbox/research/LIWC")
mydict <- dictionary(file = "LIWC2015_English.dic", format = "LIWC")
data <- read.csv("~/Dropbox/research/IQWST_Motivation/Data/UnitReviews/processed_2.csv", stringsAsFactors = FALSE)

# LIWC categories to compare between groups. Every entry must match a
# dictionary key exactly, otherwise the category silently scores 0 for every
# document. The earlier spellings "negmo" and "relativity" matched nothing
# (the recorded run below shows all-zero means and NaN t-tests for them); the
# LIWC2015 keys are "negemo" and "relativ".
query_list <- c(
  "cogproc", "insight", "cause", "discrep", "tentat", "certain", "differ",
  "focuspast", "focuspresent", "focusfuture",
  "social", "affect", "posemo", "negemo", "drives", "relativ"
)
# Main function
#' Compare dictionary-category usage between groups
#'
#' Applies a dictionary to `text`, converts each requested category count into
#' a per-document proportion of tokens (rounded to 3 decimals), prints a
#' two-sample t-test of that proportion by `group`, and collects the per-group
#' mean proportions.
#'
#' @param query Character vector of dictionary category names to analyse.
#' @param text Character vector of documents, one element per document.
#' @param group Grouping variable with one value per document. Defaults to the
#'   script-level `data$treatment`, which the function previously read
#'   directly from the global environment.
#' @param dict Dictionary handed to `dfm()`. Defaults to the script-level
#'   `mydict`, which the function previously read as a global.
#' @return Invisibly, a list named by `query`. Each element is the table of
#'   per-group mean proportions (columns `var` and `prop`) for that category.
#'   (The previous version accidentally returned `query` itself, because its
#'   last expression was the `names<-` assignment.)
proc_texts <- function(query, text, group = data$treatment, dict = mydict) {
  category_counts <- dfm(text, dictionary = dict)
  tokens_per_doc <- ntoken(text)
  known_categories <- features(category_counts)

  # A misspelled category selects zero columns, which yields all-zero
  # proportions and a NaN t-test rather than an error -- make that visible.
  unknown <- setdiff(query, known_categories)
  if (length(unknown) > 0) {
    warning(
      "Categories not found in the dictionary output: ",
      paste(unknown, collapse = ", "),
      call. = FALSE
    )
  }

  to_output <- vector("list", length(query))
  names(to_output) <- query

  # seq_along() so an empty `query` runs zero iterations (1:0 would run two).
  for (i in seq_along(query)) {
    subset_by_feat <- category_counts[, known_categories %in% query[i]]
    tmp <- data.frame(
      var = group,
      freq = round(rowSums(subset_by_feat) / tokens_per_doc, 3)
    )
    # Only Inf is zeroed here; a 0/0 document yields NaN, which is left in
    # place and dropped by `na.rm = TRUE` in the mean below.
    tmp$freq[is.infinite(tmp$freq)] <- 0

    to_output[[i]] <- tmp %>%
      group_by(var) %>%
      summarize(prop = mean(freq, na.rm = TRUE))

    print(paste0("THIS IS THE OUTPUT FOR ", query[i]))
    # Report a failing test (e.g. constant data, too few observations) and
    # carry on with the remaining categories instead of aborting the loop.
    test_result <- tryCatch(
      t.test(tmp$freq ~ tmp$var),
      error = function(e) {
        warning(
          "t-test failed for '", query[i], "': ", conditionMessage(e),
          call. = FALSE
        )
        NULL
      }
    )
    if (!is.null(test_result)) {
      print(test_result) # prints t-test result
    }
  }

  invisible(to_output)
}
# Run the comparison for every category in `query_list` on the `text2` column;
# the t-test for each category is printed as a side effect of the call.
out <- proc_texts(query = query_list, text = data$text2)
##
## ... lowercasing
## ... tokenizing
## ... indexing documents: 91 documents
## ... indexing features: 721 feature types
## ... applying a dictionary consisting of 73 keys
## ... created a 91 x 73 sparse dfm
## ... complete.
## Elapsed time: 1.492 seconds.
## [1] "THIS IS THE OUTPUT FOR cogproc"
##
## Welch Two Sample t-test
##
## data: tmp$freq by tmp$var
## t = -4.4847, df = 69.319, p-value = 2.821e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.10249145 -0.03938474
## sample estimates:
## mean in group 0 mean in group 1
## 0.0662619 0.1372000
##
## [1] "THIS IS THE OUTPUT FOR insight"
##
## Welch Two Sample t-test
##
## data: tmp$freq by tmp$var
## t = -3.4325, df = 54.642, p-value = 0.001147
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.03997014 -0.01049970
## sample estimates:
## mean in group 0 mean in group 1
## 0.01080952 0.03604444
##
## [1] "THIS IS THE OUTPUT FOR cause"
##
## Welch Two Sample t-test
##
## data: tmp$freq by tmp$var
## t = -2.2385, df = 69.527, p-value = 0.02839
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.029374674 -0.001691993
## sample estimates:
## mean in group 0 mean in group 1
## 0.01866667 0.03420000
##
## [1] "THIS IS THE OUTPUT FOR discrep"
##
## Welch Two Sample t-test
##
## data: tmp$freq by tmp$var
## t = -2.9836, df = 67.206, p-value = 0.003968
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.041744845 -0.008280552
## sample estimates:
## mean in group 0 mean in group 1
## 0.009809524 0.034822222
##
## [1] "THIS IS THE OUTPUT FOR tentat"
##
## Welch Two Sample t-test
##
## data: tmp$freq by tmp$var
## t = -1.9807, df = 74.345, p-value = 0.05133
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.478730e-02 7.301017e-05
## sample estimates:
## mean in group 0 mean in group 1
## 0.01430952 0.02666667
##
## [1] "THIS IS THE OUTPUT FOR certain"
##
## Welch Two Sample t-test
##
## data: tmp$freq by tmp$var
## t = 0.29404, df = 84.967, p-value = 0.7694
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.004353437 0.005864548
## sample estimates:
## mean in group 0 mean in group 1
## 0.007000000 0.006244444
##
## [1] "THIS IS THE OUTPUT FOR differ"
##
## Welch Two Sample t-test
##
## data: tmp$freq by tmp$var
## t = -1.6815, df = 83.433, p-value = 0.09641
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.02662308 0.00222943
## sample estimates:
## mean in group 0 mean in group 1
## 0.02004762 0.03224444
##
## [1] "THIS IS THE OUTPUT FOR focuspast"
##
## Welch Two Sample t-test
##
## data: tmp$freq by tmp$var
## t = -0.79782, df = 81.682, p-value = 0.4273
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.02385063 0.01019666
## sample estimates:
## mean in group 0 mean in group 1
## 0.02459524 0.03142222
##
## [1] "THIS IS THE OUTPUT FOR focuspresent"
##
## Welch Two Sample t-test
##
## data: tmp$freq by tmp$var
## t = -0.51707, df = 77.28, p-value = 0.6066
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.03253148 0.01911878
## sample estimates:
## mean in group 0 mean in group 1
## 0.1074048 0.1141111
##
## [1] "THIS IS THE OUTPUT FOR focusfuture"
##
## Welch Two Sample t-test
##
## data: tmp$freq by tmp$var
## t = -0.66759, df = 81.435, p-value = 0.5063
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.009495509 0.004724080
## sample estimates:
## mean in group 0 mean in group 1
## 0.008880952 0.011266667
##
## [1] "THIS IS THE OUTPUT FOR social"
##
## Welch Two Sample t-test
##
## data: tmp$freq by tmp$var
## t = -0.042423, df = 84.306, p-value = 0.9663
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.01778152 0.01703866
## sample estimates:
## mean in group 0 mean in group 1
## 0.03709524 0.03746667
##
## [1] "THIS IS THE OUTPUT FOR affect"
##
## Welch Two Sample t-test
##
## data: tmp$freq by tmp$var
## t = -2.1372, df = 69.976, p-value = 0.03608
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.0171964347 -0.0005940415
## sample estimates:
## mean in group 0 mean in group 1
## 0.009571429 0.018466667
##
## [1] "THIS IS THE OUTPUT FOR posemo"
##
## Welch Two Sample t-test
##
## data: tmp$freq by tmp$var
## t = -1.2958, df = 72.54, p-value = 0.1992
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.012401110 0.002629681
## sample estimates:
## mean in group 0 mean in group 1
## 0.007714286 0.012600000
##
## [1] "THIS IS THE OUTPUT FOR negmo"
##
## Welch Two Sample t-test
##
## data: tmp$freq by tmp$var
## t = NaN, df = NaN, p-value = NA
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## NaN NaN
## sample estimates:
## mean in group 0 mean in group 1
## 0 0
##
## [1] "THIS IS THE OUTPUT FOR drives"
##
## Welch Two Sample t-test
##
## data: tmp$freq by tmp$var
## t = -0.78397, df = 78.56, p-value = 0.4354
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.02498761 0.01086697
## sample estimates:
## mean in group 0 mean in group 1
## 0.03809524 0.04515556
##
## [1] "THIS IS THE OUTPUT FOR relativity"
##
## Welch Two Sample t-test
##
## data: tmp$freq by tmp$var
## t = NaN, df = NaN, p-value = NA
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## NaN NaN
## sample estimates:
## mean in group 0 mean in group 1
## 0 0