library(ggplot2)
library(ggmap)
library(plyr)
library(readr)
library(tidyr)
library(dplyr)
library(stringr)
library(tm)
library(googleway)
library(RCurl)
library(quanteda)
library(tidytext)
library(randomForest)
The first data set was pulled from Kaggle and committed to my GitHub workspace for use within this project (https://www.kaggle.com/datasnaek/mbti-type). It is a listing of journal entries for each of the 16 Myers-Briggs personality types. Here I pull the data into the workspace, split each row into individual entries, and remove blank entries and any entries containing URLs, since we only want to analyze text. Then, to simplify the analysis, I narrow the types down to introvert vs. extrovert.
link <- getURL('https://raw.githubusercontent.com/SamCD/Data607Final/master/mbti_1.csv')
mbti <- read.csv(text = link)
mbti <- mbti %>%
  mutate(posts = strsplit(as.character(posts), "\\|\\|\\|")) %>%  # posts are delimited by "|||"
  unnest(posts)
mbti <- mbti[!(is.na(mbti$posts) | mbti$posts==""), ]
mbti$isURL <- grepl('http', mbti$posts, ignore.case = TRUE)  # flag entries containing a URL
mbti <- subset(mbti, !mbti$isURL)
mbti$type <- strtrim(mbti$type,1)
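As a quick sanity check, we can count how many entries fall into each of the two remaining groups:
# Optional check: number of entries per group after cleaning (E = extrovert, I = introvert)
table(mbti$type)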
Next is a manually created data set using the Google Places API. Using this tool, I build a table of review scores and review text. As I am only interested in random text, I seeded the searches with a CSV listing Starbucks locations around the US (found via Google search; source http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=67416), randomized the order of the locations, and searched for the word “Restaurant” near each one.
link2 <- getURL("https://raw.githubusercontent.com/SamCD/Data607Final/master/Starbucks.csv")
samples <- read.csv(text = link2,header = FALSE)
Google API Key hidden
resDF <- data.frame(rating = numeric(), text = character())  # empty frame to collect review ratings and text
samples <- samples[sample(nrow(samples)),]
# For each of 20 randomly ordered seed locations, pull nearby restaurant reviews (rating + text).
# Note: this assumes each returned place has reviews; a place with none would need a NULL check.
for (row in 1:20) {
  lat <- samples[row, 2]
  lon <- samples[row, 1]
  res <- google_places(location = c(lat, lon),
                       keyword = "Restaurant",
                       radius = 5000,
                       key = key)
  for (i in res$results$place_id) {
    revs <- google_place_details(i, key = key)$result$reviews[, c("rating", "text")]
    resDF <- rbind(resDF, revs)
  }
}
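Before plotting, it is worth a quick look at the collected reviews; the exact contents depend on the live Places API responses at run time:
# Peek at the collected reviews (contents vary with the live API responses)
str(resDF)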
Here is a plot of our sample locations:
samples <- head(samples,10)
usa_center = as.numeric(geocode("United States"))
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=United%20States&sensor=false
USAMap = ggmap(get_googlemap(center=usa_center, scale=2, zoom=4), extent="normal")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=37.09024,-95.712891&zoom=4&size=640x640&scale=2&maptype=terrain&sensor=false
USAMap +
  geom_point(aes(x = samples[,1], y = samples[,2]), data = samples, col = "orange", alpha = 0.4)
Using the quanteda package in R, I create corpora and word-usage matrices. I also compute some statistics for each entry, including “uniqueness” (comparing the number of distinct tokens to the total number of tokens) and average sentence length.
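To make the “uniqueness” statistic concrete, here is a minimal sketch on a made-up sentence (not part of the data), mirroring the formula used below:
# Hypothetical example: ntoken() counts all tokens, ntype() counts distinct tokens,
# so 100 * ntype / ntoken is the percentage of tokens that are distinct
toy <- "the cat sat on the mat and the cat slept"
ntoken(toy)                    # 10 tokens
ntype(toy)                     # 7 distinct tokens
100 * ntype(toy) / ntoken(toy) # 70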
entQ <- corpus(subset(mbti,mbti$type=="E"),text_field = "posts")
intQ <- corpus(subset(mbti,mbti$type=="I"),text_field = "posts")
entDF <- tidy(entQ)
intDF <- tidy(intQ)
entDF$ntok <- ntoken(entDF$text)
entDF$ntyp <- ntype(entDF$text)
entDF$uniq <- 100 * entDF$ntyp / entDF$ntok
entDF$avgWPS <- entDF$ntok / nsentence(entDF$text)
entDF <- tibble::rowid_to_column(entDF, "ID")
intDF$ntok <- ntoken(intDF$text)
intDF$ntyp <- ntype(intDF$text)
intDF$uniq <- 100 * intDF$ntyp / intDF$ntok
intDF$avgWPS <- intDF$ntok / nsentence(intDF$text)
intDF <- tibble::rowid_to_column(intDF, "ID")
resQ <- corpus(resDF,text_field = "text")
resDF <- tidy(resQ)
resDF$ntok <- ntoken(resDF$text)
resDF$ntyp <- ntype(resDF$text)
resDF$uniq <- 100 * resDF$ntyp / resDF$ntok
resDF$avgWPS <- resDF$ntok / nsentence(resDF$text)
resDF <- tibble::rowid_to_column(resDF, "ID")
Quanteda has built-in dictionaries that can be used for sentiment analysis. I used the simplest one, splitting words into either positive or negative.
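As a minimal illustration (on a made-up sentence, not part of the data, and assuming the same quanteda version used above, where dfm() accepts a dictionary argument), the lookup replaces words with their dictionary categories and counts them:
# Hypothetical example of the LSD2015 dictionary lookup on a toy text;
# expect nonzero counts in both the "positive" and "negative" categories
dfm(c(d1 = "I love this wonderful place but I hate the terrible noise"),
    dictionary = data_dictionary_LSD2015)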
entS <- dfm(entQ, dictionary = data_dictionary_LSD2015)
entS <- tidy(entS)
entPlot <- cbind(entS) # make a copy to use later
entS <- mutate(entS, ID = as.numeric(rownames(entS)))
entN <- subset(entS,entS$term=="negative")
entN$document <- as.integer(entN$document)
intS <- dfm(intQ, dictionary = data_dictionary_LSD2015)
intS <- tidy(intS)
intPlot <- cbind(intS) # make a copy to use later
intS <- mutate(intS, ID = as.numeric(rownames(intS)))
intN <- subset(intS,intS$term=="negative")
intN$document <- as.integer(intN$document)
resS <- dfm(resQ, dictionary = data_dictionary_LSD2015)
resPlot <- cbind(resS)
resDF <- data.frame(resDF)
Here I try to set up a predictive model, with the goal of assigning an introvert/extrovert type to the reviews (a sketch of applying the fitted model to the reviews follows the model summary below).
mbtiDF <- data.frame(rbind(entDF,intDF))
model <- randomForest(y=factor(mbtiDF$type),x=cbind(mbtiDF$uniq,mbtiDF$avgWPS),ntree=10)
summary(model)
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 100325 factor numeric
## err.rate 30 -none- numeric
## confusion 6 -none- numeric
## votes 200650 matrix numeric
## oob.times 100325 -none- numeric
## classes 2 -none- character
## importance 2 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 100325 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
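As a sketch of the stated goal, the fitted forest could be applied to the same two statistics computed for the review text; this is illustrative only, since (as the logistic regression below suggests) the predictive signal is weak:
# Hypothetical application of the random forest to the review statistics;
# the columns must be supplied in the same order used when fitting (uniq, avgWPS)
resPred <- predict(model, newdata = cbind(resDF$uniq, resDF$avgWPS))
table(resPred)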
summary(glm(ifelse(type=="I",0,1) ~ uniq + avgWPS, data = mbtiDF,family=binomial()))
##
## Call:
## glm(formula = ifelse(type == "I", 0, 1) ~ uniq + avgWPS, family = binomial(),
## data = mbtiDF)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7386 -0.7255 -0.7203 -0.6922 1.7885
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1534596 0.0635610 -18.147 < 2e-16 ***
## uniq -0.0001398 0.0007157 -0.195 0.845
## avgWPS -0.0032629 0.0008306 -3.928 8.56e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 107886 on 100324 degrees of freedom
## Residual deviance: 107870 on 100322 degrees of freedom
## AIC: 107876
##
## Number of Fisher Scoring iterations: 4
There does not appear to be a strong relationship: average words per sentence is statistically significant but its effect is tiny, and uniqueness is not significant at all. My feeling is that with more statistics and a more complex analysis, we might be able to build a stronger predictive connection.
Since we were unable to establish a strong statistical connection between the two, we can instead visualize some of the differences by looking at the sentiment breakdown (positive vs. negative) for each type, as well as word clouds, to see if any differences stand out.
pie(table(entPlot$term))
pie(table(intPlot$term))
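The pies above count the number of posts containing each category; an alternative view (a sketch, not part of the original analysis) weights by the total number of dictionary hits instead:
# Alternative sketch: total dictionary hits per category rather than post counts
pie(tapply(entPlot$count, entPlot$term, sum))
pie(tapply(intPlot$count, intPlot$term, sum))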
entWC <- dfm(entQ, remove = stopwords("english"), remove_punct = TRUE)
intWC <- dfm(intQ, remove = stopwords("english"), remove_punct = TRUE)
set.seed(100)
textplot_wordcloud(entWC, min.freq = 6, random.order = FALSE,
                   rot.per = .25,
                   colors = RColorBrewer::brewer.pal(8, "Dark2"),
                   scale = c(4, .5),
                   max.words = 40)
textplot_wordcloud(intWC, min.freq = 6, random.order = FALSE,
                   rot.per = .25,
                   colors = RColorBrewer::brewer.pal(8, "Dark2"),
                   scale = c(4, .5),
                   max.words = 40)