library(gdata)

## gdata: Unable to locate valid perl interpreter
## gdata: 
## gdata: read.xls() will be unable to read Excel XLS and XLSX files
## gdata: unless the 'perl=' argument is used to specify the location
## gdata: of a valid perl intrpreter.
## gdata: 
## gdata: (To avoid display of this message in the future, please
## gdata: ensure perl is installed and available on the executable
## gdata: search path.)

## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLX' (Excel 97-2004) files.

##

## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLSX' (Excel 2007+) files.

##

## gdata: Run the function 'installXLSXsupport()'
## gdata: to automatically download and install the perl
## gdata: libaries needed to support Excel XLS and XLSX formats.

## 
## Attaching package: 'gdata'

## The following object is masked from 'package:stats':
## 
##     nobs

## The following object is masked from 'package:utils':
## 
##     object.size

## The following object is masked from 'package:base':
## 
##     startsWith

library(wordcloud)

## Loading required package: RColorBrewer

library(tm)

## Loading required package: NLP

library(quanteda)

## Package version: 1.2.0

## Parallel computing: 2 of 8 threads used.

## See https://quanteda.io for tutorials and examples.

## 
## Attaching package: 'quanteda'

## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords

## The following object is masked from 'package:utils':
## 
##     View

library(syuzhet)
library(SnowballC)
library(tidytext)
library(devtools)
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

library(text2vec)
library(glmnet)

## Loading required package: Matrix

## Loading required package: foreach

## Loaded glmnet 2.0-16

library(e1071)
library(klaR)

## Loading required package: MASS

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following object is masked from 'package:glmnet':
## 
##     auc

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:gdata':
## 
##     first, last

library(magrittr)
library(qdap)

## Loading required package: qdapDictionaries

## Loading required package: qdapRegex

## 
## Attaching package: 'qdapRegex'

## The following object is masked from 'package:ggplot2':
## 
##     %+%

## Loading required package: qdapTools

## 
## Attaching package: 'qdapTools'

## The following object is masked from 'package:data.table':
## 
##     shift

## 
## Attaching package: 'qdap'

## The following object is masked from 'package:magrittr':
## 
##     %>%

## The following object is masked from 'package:Matrix':
## 
##     %&%

## The following objects are masked from 'package:quanteda':
## 
##     %>%, as.DocumentTermMatrix, as.wfm

## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix

## The following object is masked from 'package:NLP':
## 
##     ngrams

## The following object is masked from 'package:base':
## 
##     Filter

library(readr)
library(qdapTools)
library(qdapRegex)
library(randomcoloR)
library(class)

## 
## Attaching package: 'class'

## The following object is masked from 'package:qdap':
## 
##     condense

docs <- read.csv("C:/Users/Jonathan/Desktop/Text Mining Project Data/mbti_1.csv", stringsAsFactors = FALSE)
docs <- docs[1:4000,]

docs$type <- as.factor(docs$type)

str(docs)

## 'data.frame':    4000 obs. of  2 variables:
##  $ type : Factor w/ 16 levels "ENFJ","ENFP",..: 9 4 12 11 3 11 9 11 9 12 ...
##  $ posts: chr  "'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||e"| __truncated__ "'I'm finding the lack of me in these posts very alarming.|||Sex can be boring if it's in the same position ofte"| __truncated__ "'Good one  _____   https://www.youtube.com/watch?v=fHiGbolFFGw|||Of course, to which I say I know; that's my bl"| __truncated__ "'Dear INTP,   I enjoyed our conversation the other day.  Esoteric gabbing about the nature of the universe and "| __truncated__ ...

When we look at the column for Personality “type” we see that there is a factor level names “type” we want to remove that from the factor levels so that there is only the 16 MBTI factors in the data.frame

Get ride of the “type” factor level that offers no information

is.na(docs$type) <- docs$type == "type"
docs$type <- factor(docs$type)
levels(docs$type)

##  [1] "ENFJ" "ENFP" "ENTJ" "ENTP" "ESFJ" "ESFP" "ESTJ" "ESTP" "INFJ" "INFP"
## [11] "INTJ" "INTP" "ISFJ" "ISFP" "ISTJ" "ISTP"

find the frequency table for each personality type.

table(docs$type)

## 
## ENFJ ENFP ENTJ ENTP ESFJ ESFP ESTJ ESTP INFJ INFP INTJ INTP ISFJ ISFP ISTJ 
##   95  291   95  331   15   27   18   49  693  841  508  583   81  115   88 
## ISTP 
##  170

plot the different factors for all 16 levels and color code them for the different groups they fall into.

I know there is probably a better way to color them than doing it copy/paste…..

plot(docs$type, col = c("blue", "blue", "blue", "blue", "red", "red", "red", "red", "green", "green", "green", "green", "yellow", "yellow", "yellow", "yellow"))

using qdap package to get word count frequency for top 20 words in the posts

frequent.terms.unclean <- freq_terms(docs$posts, 20)
frequent.terms.unclean

##    WORD   FREQ
## 1  i    194030
## 2  to   136439
## 3  the  135594
## 4  a    109326
## 5  and  108247
## 6  of    82851
## 7  you   71699
## 8  that  64937
## 9  is    62410
## 10 it    60300
## 11 in    57804
## 12 my    53627
## 13 but   41929
## 14 for   41131
## 15 have  37873
## 16 with  36933
## 17 me    36047
## 18 be    33131
## 19 are   31979
## 20 this  31306

plot(frequent.terms.unclean)

#Set up the doc_id to be able to create our corpus

posts.df <- data.frame(doc_id = seq(1:nrow(docs)), text = docs$posts)

Make the volatile corpus

corpus <- VCorpus(DataframeSource(posts.df))

Set seed for the data set

set.seed(333)

I am only going to use 1000 rows of the amost 7000 row data set because of computational power

df <- read.csv("C:/Users/Jonathan/Desktop/Text Mining Project Data/mbti_1.csv", stringsAsFactors = FALSE, header = TRUE)
df <- df[1:1000,]

Create my corpus

docs1 <- Corpus(VectorSource(df$posts))

I created a custom stopword collection that I am going to use in addition to the regular english stopwords

custom.stopwords = c(stopwords("english"), "i","like","just","can","im","can","the","one","get","ive","just","know")

Clean my corpus

docs1 <- tm_map(docs1, content_transformer(tolower))
docs1 <- tm_map(docs1, removeNumbers)
docs1 <- tm_map(docs1, removeWords, stopwords("english"))
docs1 <- tm_map(docs1, removePunctuation)
docs1 <- tm_map(docs1, stripWhitespace)
docs1 <- tm_map(docs1, stemDocument, language = "english")

Create the dtm of the corpus

dtm <- DocumentTermMatrix(docs1)

Transform dtm to matrix and then back into a data frame for modeling

mat.df <- as.data.frame(data.matrix(dtm), stringsAsfactors = FALSE)

Column bind category (known classification)

mat.df <- cbind(mat.df, df$type, row.names = NULL)

Change name of new column to “type”

colnames(mat.df)[ncol(mat.df)] <- "type"

Creat our testing and training set of data with a 70/30 split

train <- sample(nrow(mat.df), ceiling(nrow(mat.df) * .70))
test <- (1:nrow(mat.df))[- train]

Classifier variable based on the personality type

cl <- mat.df[,"type"]

Create model data and remove “type”

modeldata <- mat.df[,!colnames(mat.df) %in% "type"]

Create classifyer model for the Knn algorithm

knn.pred <- knn(modeldata[train, ], modeldata[test, ], cl[train])

Creat the confusion matrix for our predictions

conf.mat <- table("Predictions" = knn.pred, Actual = cl[test])
conf.mat

##            Actual
## Predictions  0  1  2  3  4  5  6  7  8  9 10 11 12 14 16 18 24
##          0  18 24 19 11 11  4  7  2  2  0  0  2  0  1  0  0  1
##          1  14 28 11 15  8  6  8  4  1  1  1  3  1  0  0  0  0
##          2   8 12  7 12  0  4  1  1  1  1  0  0  0  0  0  0  0
##          3   1  1  2  1  0  1  1  1  0  0  0  0  0  0  0  0  0
##          4   0  3  3  3  1  3  0  0  1  0  0  0  0  0  0  0  0
##          5   0  0  0  0  1  0  0  1  0  0  0  0  0  0  0  0  0
##          6   0  1  0  0  4  1  1  0  0  0  0  1  1  0  0  0  0
##          7   0  0  2  0  1  0  0  0  0  0  0  0  0  0  0  0  0
##          8   0  1  0  0  0  0  1  1  0  1  1  0  0  0  0  0  0
##          9   1  1  1  0  0  1  0  0  0  0  0  0  1  0  1  0  0
##          10  0  0  0  0  0  0  0  0  0  0  2  0  0  0  0  1  0
##          11  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##          12  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##          13  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##          15  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##          16  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##          17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##          18  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##          19  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0

The accuracy of out Knn model in percetage output

(accuracy <- sum(diag(conf.mat))/length(test) * 100)

## [1] 19.33333

Running into size issues again here

frequent.terms.clean <- freq_terms(mat.df, 20)
frequent.terms.clean

##   WORD  FREQ
## 1 c    39262

plot(frequent.terms.clean)

For further exploration

Subset each of the 16 MBTI personality typed into their own catagory

#ENFJ <- subset(df, type=="ENFJ")
#ENFP <- subset(df, type=="ENFP")
#ENTJ <- subset(df, type=="ENTJ")
#ENTP <- subset(df, type=="ENTP")
#ESFJ <- subset(df, type=="ESFJ")
#ESFP <- subset(df, type=="ESFP")
#ESTJ <- subset(df, type=="ESTJ")
#ESTP <- subset(df, type=="ESTP")
#INFJ <- subset(df, type=="INFJ")
#INFP <- subset(df, type=="INFP")
#INTJ <- subset(df, type=="INTJ")
#INTP <- subset(df, type=="INTP")
#ISFJ <- subset(df, type=="ISFJ")
#ISFP <- subset(df, type=="ISFP")
#ISTJ <- subset(df, type=="ISTJ")
#ISTP <- subset(df, type=="ISTP")

once the documents are clean we can see the top 10 words for each personality type

#ENFJ.freq <- freq_terms(ENFJ$posts, 10)
#ENFP.freq <- freq_terms(ENFP$posts, 10)
#ENTJ.freq <- freq_terms(ENTJ$posts, 10)
#ENTP.freq <- freq_terms(ENTP$posts, 10)
#ESFJ.freq <- freq_terms(ESFJ$posts, 10)
#ESFP.freq <- freq_terms(ESFP$posts, 10)
#ESTJ.freq <- freq_terms(ESTJ$posts, 10)
#ESTP.freq <- freq_terms(ESTP$posts, 10)
#INFJ.freq <- freq_terms(INFJ$posts, 10)
#INFP.freq <- freq_terms(INFP$posts, 10)
#INTJ.freq <- freq_terms(INTJ$posts, 10)
#INTP.freq <- freq_terms(INTP$posts, 10)
#ISFJ.freq <- freq_terms(ISFJ$posts, 10)
#ISFP.freq <- freq_terms(ISFP$posts, 10)
#ISTJ.freq <- freq_terms(ISTJ$posts, 10)
#ISTP.freq <- freq_terms(ISTP$posts, 10)

It will help to plot them and see them side by side

#plot(ENFJ.freq)
#plot(ENFP.freq)
##plot(ENTJ.freq)
#plot(ENTP.freq)
#plot(ESFJ.freq)
##plot(ESFP.freq)
#plot(ESTJ.freq)
#plot(ESTP.freq)
#plot(INFJ.freq)
#plot(INFP.freq)
#plot(INTJ.freq)
#plot(INTP.freq)
#plot(ISFJ.freq)
#plot(ISFP.freq)
#plot(ISTJ.freq)
#plot(ISTP.freq)