library(odbc)
library(DBI)
library(RODBC)
library(dbConnect)
## Loading required package: RMySQL
## Loading required package: gWidgets
library(RMySQL)
library(tm)
## Loading required package: NLP
library(quanteda)
## Package version: 1.2.0
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
## 
##     View
library(syuzhet)
library(SnowballC)
library(tidytext)
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:gWidgets':
## 
##     add
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Loading required package: qdapTools
## 
## Attaching package: 'qdapTools'
## The following object is masked from 'package:gWidgets':
## 
##     id
## Loading required package: RColorBrewer
## 
## Attaching package: 'qdap'
## The following object is masked from 'package:magrittr':
## 
##     %>%
## The following object is masked from 'package:Matrix':
## 
##     %&%
## The following objects are masked from 'package:quanteda':
## 
##     %>%, as.DocumentTermMatrix, as.wfm
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
## 
##     ngrams
## The following object is masked from 'package:base':
## 
##     Filter
library(class)
## 
## Attaching package: 'class'
## The following object is masked from 'package:qdap':
## 
##     condense
#con <- dbConnect(odbc(),
#                Driver = "SQL Server Native Client 11.0",
#                Server = "DESKTOP-K5T17G7\JONATHANSQL.master(sa(54))",
#                Database = "Text Mining Project",
#                UID = "sa",
#                PWD = rstudioapi::askForPassword("Database password"),
#                Port = 1433)

Simple function to open connection and fetches data and then closes the connection again

#getQuery <- function(sql)
#{
#  con <- odbc::dbConnect(odbc::odbc(),
#                Driver = "SQL Server Native Client 11.0",
#                Server = "DESKTOP-K5T17G7\JONATHANSQL.master(sa(54))",
#                Database = "Text Mining Project")
#  result <- odbc::dbGetQuery(con, sql)
#  odbc::dbDisconnect(con)
#  result
#}
#data <- getQuery("select 1")
docs <- read.csv("C:/Users/Jonathan/Desktop/Text Mining Project Data/mbti_1.csv", stringsAsFactors = FALSE)
docs <- docs[1:4000,]
docs$type <- as.factor(docs$type)
str(docs)
## 'data.frame':    4000 obs. of  2 variables:
##  $ type : Factor w/ 16 levels "ENFJ","ENFP",..: 9 4 12 11 3 11 9 11 9 12 ...
##  $ posts: chr  "'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||e"| __truncated__ "'I'm finding the lack of me in these posts very alarming.|||Sex can be boring if it's in the same position ofte"| __truncated__ "'Good one  _____   https://www.youtube.com/watch?v=fHiGbolFFGw|||Of course, to which I say I know; that's my bl"| __truncated__ "'Dear INTP,   I enjoyed our conversation the other day.  Esoteric gabbing about the nature of the universe and "| __truncated__ ...

When we look at the column for Personality “type” we see that there is a factor level names “type” we want to remove that from the factor levels so that there is only the 16 MBTI factors in the data.frame

Get ride of the “type” factor level that offers no information

is.na(docs$type) <- docs$type == "type"
docs$type <- factor(docs$type)
levels(docs$type)
##  [1] "ENFJ" "ENFP" "ENTJ" "ENTP" "ESFJ" "ESFP" "ESTJ" "ESTP" "INFJ" "INFP"
## [11] "INTJ" "INTP" "ISFJ" "ISFP" "ISTJ" "ISTP"

find the frequency table for each personality type.

table(docs$type)
## 
## ENFJ ENFP ENTJ ENTP ESFJ ESFP ESTJ ESTP INFJ INFP INTJ INTP ISFJ ISFP ISTJ 
##   95  291   95  331   15   27   18   49  693  841  508  583   81  115   88 
## ISTP 
##  170

plot the different factors for all 16 levels and color code them for the different groups they fall into.

I know there is probably a better way to color them than doing it copy/paste…..

plot(docs$type, col = c("blue", "blue", "blue", "blue", "red", "red", "red", "red", "green", "green", "green", "green", "yellow", "yellow", "yellow", "yellow"))

#using qdap package to get word count frequency for top 20 words in the posts

frequent.terms.unclean <- freq_terms(docs$posts, 20)
frequent.terms.unclean
##    WORD   FREQ
## 1  i    194030
## 2  to   136439
## 3  the  135594
## 4  a    109326
## 5  and  108247
## 6  of    82851
## 7  you   71699
## 8  that  64937
## 9  is    62410
## 10 it    60300
## 11 in    57804
## 12 my    53627
## 13 but   41929
## 14 for   41131
## 15 have  37873
## 16 with  36933
## 17 me    36047
## 18 be    33131
## 19 are   31979
## 20 this  31306
plot(frequent.terms.unclean)

#Set up the doc_id to be able to create our corpus

posts.df <- data.frame(doc_id = seq(1:nrow(docs)), text = docs$posts)

Make the volatile corpus

corpus <- VCorpus(DataframeSource(posts.df))

Set seed for the data set

set.seed(333)

I am only going to use 1000 rows of the amost 7000 row data set because of computational power

df <- read.csv("C:/Users/Jonathan/Desktop/Text Mining Project Data/mbti_1.csv", stringsAsFactors = FALSE, header = TRUE)
df <- df[1:1000,]

Create my corpus

docs1 <- Corpus(VectorSource(df$posts))

I created a custom stopword collection that I am going to use in addition to the regular english stopwords

#custom.stopwords = c(stopwords("english"),"i","like","just","can","im","can","the","one","get","ive","just","know")

Clean my corpus

docs1 <- tm_map(docs1, content_transformer(tolower))
docs1 <- tm_map(docs1, removeNumbers)
docs1 <- tm_map(docs1, removeWords, stopwords("english"))
docs1 <- tm_map(docs1, removePunctuation)
docs1 <- tm_map(docs1, stripWhitespace)
docs1 <- tm_map(docs1, stemDocument, language = "english")

Create the dtm of the corpus

dtm <- DocumentTermMatrix(docs1)

Transform dtm to matrix and then back into a data frame for modeling

mat.df <- as.data.frame(data.matrix(dtm), stringsAsfactors = FALSE)

Column bind category (known classification)

mat.df <- cbind(mat.df, df$type, row.names = NULL)

Change name of new column to “type”

colnames(mat.df)[ncol(mat.df)] <- "type"

Creat our testing and training set of data with a 70/30 split

train <- sample(nrow(mat.df), ceiling(nrow(mat.df) * .70))
test <- (1:nrow(mat.df))[- train]

Classifier variable based on the personality type

cl <- mat.df[,"type"]

Create model data and remove “type”

modeldata <- mat.df[,!colnames(mat.df) %in% "type"]

Create classifyer model for the Knn algorithm

knn.pred <- knn(modeldata[train, ], modeldata[test, ], cl[train])

Creat the confusion matrix for our predictions

conf.mat <- table("Predictions" = knn.pred, Actual = cl[test])
conf.mat
##            Actual
## Predictions  0  1  2  3  4  5  6  7  8  9 10 11 12 14 16 18 24
##          0  18 24 19 11 11  4  7  2  2  0  0  2  0  1  0  0  1
##          1  14 28 11 15  8  6  8  4  1  1  1  3  1  0  0  0  0
##          2   8 12  7 12  0  4  1  1  1  1  0  0  0  0  0  0  0
##          3   1  1  2  1  0  1  1  1  0  0  0  0  0  0  0  0  0
##          4   0  3  3  3  1  3  0  0  1  0  0  0  0  0  0  0  0
##          5   0  0  0  0  1  0  0  1  0  0  0  0  0  0  0  0  0
##          6   0  1  0  0  4  1  1  0  0  0  0  1  1  0  0  0  0
##          7   0  0  2  0  1  0  0  0  0  0  0  0  0  0  0  0  0
##          8   0  1  0  0  0  0  1  1  0  1  1  0  0  0  0  0  0
##          9   1  1  1  0  0  1  0  0  0  0  0  0  1  0  1  0  0
##          10  0  0  0  0  0  0  0  0  0  0  2  0  0  0  0  1  0
##          11  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##          12  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##          13  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##          15  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##          16  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##          17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##          18  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##          19  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0

The accuracy of out Knn model in percetage output. I have run the model several times and gotten a range of values from 14.667 up to 26.667 for the model accuracy this model has a lot of noise in it and would need to be laborously cleaned to start getting better clasiffications. I could possibly use another model. Its not bad considering it is predicting against 16 factors

(accuracy <- sum(diag(conf.mat))/length(test) * 100)
## [1] 19.33333

For further exploration

Subset each of the 16 MBTI personality typed into their own catagory

#ENFJ <- subset(df, type=="ENFJ")
#ENFP <- subset(df, type=="ENFP")
#ENTJ <- subset(df, type=="ENTJ")
#ENTP <- subset(df, type=="ENTP")
#ESFJ <- subset(df, type=="ESFJ")
#ESFP <- subset(df, type=="ESFP")
#ESTJ <- subset(df, type=="ESTJ")
#ESTP <- subset(df, type=="ESTP")
#INFJ <- subset(df, type=="INFJ")
#INFP <- subset(df, type=="INFP")
#INTJ <- subset(df, type=="INTJ")
#INTP <- subset(df, type=="INTP")
#ISFJ <- subset(df, type=="ISFJ")
#ISFP <- subset(df, type=="ISFP")
#ISTJ <- subset(df, type=="ISTJ")
#ISTP <- subset(df, type=="ISTP")

once the documents are clean we can see the top 10 words for each personality type

#ENFJ.freq <- freq_terms(ENFJ$posts, 10)
#ENFP.freq <- freq_terms(ENFP$posts, 10)
#ENTJ.freq <- freq_terms(ENTJ$posts, 10)
#ENTP.freq <- freq_terms(ENTP$posts, 10)
#ESFJ.freq <- freq_terms(ESFJ$posts, 10)
#ESFP.freq <- freq_terms(ESFP$posts, 10)
#ESTJ.freq <- freq_terms(ESTJ$posts, 10)
#ESTP.freq <- freq_terms(ESTP$posts, 10)
#INFJ.freq <- freq_terms(INFJ$posts, 10)
#INFP.freq <- freq_terms(INFP$posts, 10)
#INTJ.freq <- freq_terms(INTJ$posts, 10)
#INTP.freq <- freq_terms(INTP$posts, 10)
#ISFJ.freq <- freq_terms(ISFJ$posts, 10)
#ISFP.freq <- freq_terms(ISFP$posts, 10)
#ISTJ.freq <- freq_terms(ISTJ$posts, 10)
#ISTP.freq <- freq_terms(ISTP$posts, 10)

It will help to plot them and see them side by side

#plot(ENFJ.freq)
#plot(ENFP.freq)
#plot(ENTJ.freq)
#plot(ENTP.freq)
#plot(ESFJ.freq)
#plot(ESFP.freq)
#plot(ESTJ.freq)
#plot(ESTP.freq)
#plot(INFJ.freq)
#plot(INFP.freq)
#plot(INTJ.freq)
#plot(INTP.freq)
#plot(ISFJ.freq)
#plot(ISFP.freq)
#plot(ISTJ.freq)
#plot(ISTP.freq)

```