Loading required packages

library(ggplot2)
library(ggmap)
library(plyr)
library(readr)
library(tidyr)
library(dplyr)
library(stringr)
library(tm)
library(googleway)
library(RCurl)
library(quanteda)
library(tidytext)
library(randomForest)

Acquiring and Preparing Data

The first data set was pulled from Kaggle and committed to my GitHub repository for use within this project (https://www.kaggle.com/datasnaek/mbti-type). It is a listing of journal entries from each of the 16 Myers-Briggs personality types. Here I pull the data into the workspace, split each row into individual entries, and remove blank entries and any entries containing URLs, since we only want to analyze text. Then, to simplify the analysis, I narrow the types down to introvert vs. extrovert.

link <- getURL('https://raw.githubusercontent.com/SamCD/Data607Final/master/mbti_1.csv')
mbti <- read.csv(text = link)

# Each row holds many posts separated by "|||"; split them into one row per post
mbti <- mbti %>% 
  mutate(posts = strsplit(as.character(posts), "\\|\\|\\|")) %>% 
  unnest(posts)

# Drop blank posts and any post containing a URL
mbti <- mbti[!(is.na(mbti$posts) | mbti$posts == ""), ]
mbti$isURL <- grepl('http', mbti$posts, ignore.case = TRUE)
mbti <- subset(mbti, !isURL)

# Keep only the first letter of the type (I or E) for the introvert/extrovert split
mbti$type <- strtrim(mbti$type, 1)
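
As a quick sanity check (this step is not in the original script), the resulting introvert/extrovert split can be inspected with a simple table:

# Sanity check (added): how many entries remain for each of the two types
table(mbti$type)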

Next is a data set built manually with the Google Places API. Using this tool, I create a table of review scores and review text. Since I am only interested in random text, I seeded the searches with a CSV listing Starbucks locations around the US (found via Google search; source http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=67416), randomized the row order, and searched for the keyword “Restaurant” near each location.

link2 <- getURL("https://raw.githubusercontent.com/SamCD/Data607Final/master/Starbucks.csv")
samples <- read.csv(text = link2,header = FALSE)

Google API Key hidden

resDF <- data.frame(rating = as.numeric(character()), text = character())

# Shuffle the Starbucks locations and use the first 20 as search seeds
samples <- samples[sample(nrow(samples)), ]
for (row in 1:20) {
  lat <- samples[row, 2]
  lon <- samples[row, 1]
  res <- google_places(location = c(lat, lon),
                       keyword = "Restaurant",
                       radius = 5000,
                       key = key)
  # Collect the rating and text of every review returned for each nearby place
  for (i in res$results$place_id) {
    revs <- google_place_details(i, key = key)$result$reviews
    if (!is.null(revs)) {
      resDF <- rbind(resDF, revs[, c("rating", "text")])
    }
  }
}

Here is a plot of our sample locations:

samples <- head(samples, 10)
usa_center <- as.numeric(geocode("United States"))
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=United%20States&sensor=false
USAMap <- ggmap(get_googlemap(center = usa_center, scale = 2, zoom = 4), extent = "normal")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=37.09024,-95.712891&zoom=4&size=640x640&scale=2&maptype=terrain&sensor=false
USAMap +
  geom_point(aes(x = V1, y = V2), data = samples, col = "orange", alpha = 0.4)

Creating corpora

I use the quanteda package in R to create the corpora and word-usage matrices. I also computed some statistics for each entry, including “uniqueness” (the number of distinct tokens as a percentage of the total number of tokens) and average sentence length (words per sentence).

# Build separate corpora for extroverts and introverts
entQ <- corpus(subset(mbti, mbti$type == "E"), text_field = "posts")
intQ <- corpus(subset(mbti, mbti$type == "I"), text_field = "posts")
entDF <- tidy(entQ)
intDF <- tidy(intQ)

# Per-entry statistics: token count, distinct-token count,
# uniqueness (% of tokens that are distinct), and average words per sentence
entDF$ntok <- ntoken(entDF$text)
entDF$ntyp <- ntype(entDF$text)
entDF$uniq <- 100 * entDF$ntyp / entDF$ntok
entDF$avgWPS <- entDF$ntok / nsentence(entDF$text)
entDF <- tibble::rowid_to_column(entDF, "ID")

intDF$ntok <- ntoken(intDF$text)
intDF$ntyp <- ntype(intDF$text)
intDF$uniq <- 100 * intDF$ntyp / intDF$ntok
intDF$avgWPS <- intDF$ntok / nsentence(intDF$text)
intDF <- tibble::rowid_to_column(intDF, "ID")

# Same treatment for the restaurant reviews
resQ <- corpus(resDF, text_field = "text")
resDF <- tidy(resQ)
resDF$ntok <- ntoken(resDF$text)
resDF$ntyp <- ntype(resDF$text)
resDF$uniq <- 100 * resDF$ntyp / resDF$ntok
resDF$avgWPS <- resDF$ntok / nsentence(resDF$text)
resDF <- tibble::rowid_to_column(resDF, "ID")
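
The same statistics are computed three times above; as a sketch (the helper name addTextStats is my own and not part of the original script), the repetition could be factored into a small function:

# Hypothetical helper (not in the original script): add the per-entry
# statistics to any tidied corpus data frame that has a `text` column.
addTextStats <- function(df) {
  df$ntok   <- ntoken(df$text)               # total tokens
  df$ntyp   <- ntype(df$text)                # distinct tokens
  df$uniq   <- 100 * df$ntyp / df$ntok       # uniqueness (%)
  df$avgWPS <- df$ntok / nsentence(df$text)  # average words per sentence
  tibble::rowid_to_column(df, "ID")
}

# e.g. entDF <- addTextStats(tidy(entQ))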

Performing sentiment analysis

Quanteda has built-in dictionaries that can be used to perform sentiment analysis. I used the simplest one, which splits words into either positive or negative.

entS <- dfm(entQ, dictionary = data_dictionary_LSD2015)
entS <- tidy(entS)
entPlot <- cbind(entS) # make a copy to use later
entS <- mutate(entS, ID = as.numeric(rownames(entS)))
entN <- subset(entS,entS$term=="negative")
entN$document <- as.integer(entN$document)

intS <- dfm(intQ, dictionary = data_dictionary_LSD2015)
intS <- tidy(intS)
intPlot <- cbind(intS) # make a copy to use later
intS <- mutate(intS, ID = as.numeric(rownames(intS)))
intN <- subset(intS,intS$term=="negative")
intN$document <- as.integer(intN$document)

resS <- dfm(resQ, dictionary = data_dictionary_LSD2015)
resPlot <- cbind(resS)
resDF <- data.frame(resDF)
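
As a quick follow-up sketch (not part of the original analysis; resSent is a name introduced here), the positive and negative counts in the review dfm can be collapsed into a single net-sentiment score per review:

# Added sketch: net sentiment per review = positive hits minus negative hits
resSent <- convert(resS, to = "data.frame")
resSent$net <- resSent$positive - resSent$negative
summary(resSent$net)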

Predictive modeling

Here I set up a predictive model, with the goal of eventually assigning an introvert/extrovert label to the restaurant reviews (a sketch of that final step follows the model output below).

# Combine the extrovert and introvert entries and fit a small random forest
# predicting type from uniqueness and average words per sentence
mbtiDF <- data.frame(rbind(entDF, intDF))

model <- randomForest(y = factor(mbtiDF$type), x = cbind(mbtiDF$uniq, mbtiDF$avgWPS), ntree = 10)
summary(model)
##                 Length Class  Mode     
## call                 4 -none- call     
## type                 1 -none- character
## predicted       100325 factor numeric  
## err.rate            30 -none- numeric  
## confusion            6 -none- numeric  
## votes           200650 matrix numeric  
## oob.times       100325 -none- numeric  
## classes              2 -none- character
## importance           2 -none- numeric  
## importanceSD         0 -none- NULL     
## localImportance      0 -none- NULL     
## proximity            0 -none- NULL     
## ntree                1 -none- numeric  
## mtry                 1 -none- numeric  
## forest              14 -none- list     
## y               100325 factor numeric  
## test                 0 -none- NULL     
## inbag                0 -none- NULL
# Logistic regression: relating the probability of being an extrovert to uniqueness and words per sentence
summary(glm(ifelse(type == "I", 0, 1) ~ uniq + avgWPS, data = mbtiDF, family = binomial()))
## 
## Call:
## glm(formula = ifelse(type == "I", 0, 1) ~ uniq + avgWPS, family = binomial(), 
##     data = mbtiDF)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7386  -0.7255  -0.7203  -0.6922   1.7885  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.1534596  0.0635610 -18.147  < 2e-16 ***
## uniq        -0.0001398  0.0007157  -0.195    0.845    
## avgWPS      -0.0032629  0.0008306  -3.928 8.56e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 107886  on 100324  degrees of freedom
## Residual deviance: 107870  on 100322  degrees of freedom
## AIC: 107876
## 
## Number of Fisher Scoring iterations: 4

There does not appear to be a strong relationship: uniqueness is not a significant predictor, and while average words per sentence is, its estimated effect is very small. My feeling is that with additional statistics and a more complex analysis, we might be able to make a stronger connection for prediction.
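
Even so, the review data could still be scored with the fitted random forest, which was the original goal. The following is only a sketch using the objects built above (resPred is a name introduced here), not a step from the original analysis:

# Sketch: assign a predicted I/E label to each restaurant review,
# using the same two features the model was trained on (uniq and avgWPS)
resPred <- predict(model, newdata = cbind(resDF$uniq, resDF$avgWPS))
table(resPred)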

Visualization

Given that we were unable to establish a statistical connection between the two, we can still visualize some of the differences by looking at the sentiment breakdown (positive vs. negative) for each type, as well as word clouds, to see whether anything noticeable stands out.

pie(table(entPlot$term))

pie(table(intPlot$term))
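
The same counts behind the pie charts can also be shown as proportions, which makes the two groups a little easier to compare:

# Same breakdown as the pie charts, expressed as proportions (added for reference)
prop.table(table(entPlot$term))
prop.table(table(intPlot$term))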

entWC <- dfm(entQ, remove = stopwords("english"), remove_punct = TRUE)
intWC <- dfm(intQ, remove = stopwords("english"), remove_punct = TRUE)

set.seed(100)
textplot_wordcloud(entWC, min.freq = 6, random.order = FALSE,
                   rot.per = .25, 
                   colors = RColorBrewer::brewer.pal(8,"Dark2"),
                   scale=c(4,.5),
                   max.words =40)

textplot_wordcloud(intWC, min.freq = 6, random.order = FALSE,
                   rot.per = .25, 
                   colors = RColorBrewer::brewer.pal(8,"Dark2"),
                   scale=c(4,.5),
                   max.words =40)