library(gdata)
## gdata: Unable to locate valid perl interpreter
## gdata:
## gdata: read.xls() will be unable to read Excel XLS and XLSX files
## gdata: unless the 'perl=' argument is used to specify the location
## gdata: of a valid perl intrpreter.
## gdata:
## gdata: (To avoid display of this message in the future, please
## gdata: ensure perl is installed and available on the executable
## gdata: search path.)
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLX' (Excel 97-2004) files.
##
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLSX' (Excel 2007+) files.
##
## gdata: Run the function 'installXLSXsupport()'
## gdata: to automatically download and install the perl
## gdata: libaries needed to support Excel XLS and XLSX formats.
##
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
library(wordcloud)
## Loading required package: RColorBrewer
library(tm)
## Loading required package: NLP
library(quanteda)
## Package version: 1.2.0
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
##
## View
library(syuzhet)
library(SnowballC)
library(tidytext)
library(devtools)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(text2vec)
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
library(e1071)
library(klaR)
## Loading required package: MASS
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:glmnet':
##
## auc
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:gdata':
##
## first, last
library(magrittr)
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
##
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:ggplot2':
##
## %+%
## Loading required package: qdapTools
##
## Attaching package: 'qdapTools'
## The following object is masked from 'package:data.table':
##
## shift
##
## Attaching package: 'qdap'
## The following object is masked from 'package:magrittr':
##
## %>%
## The following object is masked from 'package:Matrix':
##
## %&%
## The following objects are masked from 'package:quanteda':
##
## %>%, as.DocumentTermMatrix, as.wfm
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
##
## ngrams
## The following object is masked from 'package:base':
##
## Filter
library(readr)
library(qdapTools)
library(qdapRegex)
library(randomcoloR)
library(class)
##
## Attaching package: 'class'
## The following object is masked from 'package:qdap':
##
## condense
docs <- read.csv("C:/Users/Jonathan/Desktop/Text Mining Project Data/mbti_1.csv", stringsAsFactors = FALSE)
docs <- docs[1:4000,]
docs$type <- as.factor(docs$type)
str(docs)
## 'data.frame': 4000 obs. of 2 variables:
## $ type : Factor w/ 16 levels "ENFJ","ENFP",..: 9 4 12 11 3 11 9 11 9 12 ...
## $ posts: chr "'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||e"| __truncated__ "'I'm finding the lack of me in these posts very alarming.|||Sex can be boring if it's in the same position ofte"| __truncated__ "'Good one _____ https://www.youtube.com/watch?v=fHiGbolFFGw|||Of course, to which I say I know; that's my bl"| __truncated__ "'Dear INTP, I enjoyed our conversation the other day. Esoteric gabbing about the nature of the universe and "| __truncated__ ...
is.na(docs$type) <- docs$type == "type"
docs$type <- factor(docs$type)
levels(docs$type)
## [1] "ENFJ" "ENFP" "ENTJ" "ENTP" "ESFJ" "ESFP" "ESTJ" "ESTP" "INFJ" "INFP"
## [11] "INTJ" "INTP" "ISFJ" "ISFP" "ISTJ" "ISTP"
table(docs$type)
##
## ENFJ ENFP ENTJ ENTP ESFJ ESFP ESTJ ESTP INFJ INFP INTJ INTP ISFJ ISFP ISTJ
## 95 291 95 331 15 27 18 49 693 841 508 583 81 115 88
## ISTP
## 170
plot(docs$type, col = c("blue", "blue", "blue", "blue", "red", "red", "red", "red", "green", "green", "green", "green", "yellow", "yellow", "yellow", "yellow"))
frequent.terms.unclean <- freq_terms(docs$posts, 20)
frequent.terms.unclean
## WORD FREQ
## 1 i 194030
## 2 to 136439
## 3 the 135594
## 4 a 109326
## 5 and 108247
## 6 of 82851
## 7 you 71699
## 8 that 64937
## 9 is 62410
## 10 it 60300
## 11 in 57804
## 12 my 53627
## 13 but 41929
## 14 for 41131
## 15 have 37873
## 16 with 36933
## 17 me 36047
## 18 be 33131
## 19 are 31979
## 20 this 31306
plot(frequent.terms.unclean)
#Set up the doc_id to be able to create our corpus
posts.df <- data.frame(doc_id = seq(1:nrow(docs)), text = docs$posts)
corpus <- VCorpus(DataframeSource(posts.df))
set.seed(333)
df <- read.csv("C:/Users/Jonathan/Desktop/Text Mining Project Data/mbti_1.csv", stringsAsFactors = FALSE, header = TRUE)
df <- df[1:1000,]
docs1 <- Corpus(VectorSource(df$posts))
custom.stopwords = c(stopwords("english"), "i","like","just","can","im","can","the","one","get","ive","just","know")
docs1 <- tm_map(docs1, content_transformer(tolower))
docs1 <- tm_map(docs1, removeNumbers)
docs1 <- tm_map(docs1, removeWords, stopwords("english"))
docs1 <- tm_map(docs1, removePunctuation)
docs1 <- tm_map(docs1, stripWhitespace)
docs1 <- tm_map(docs1, stemDocument, language = "english")
dtm <- DocumentTermMatrix(docs1)
mat.df <- as.data.frame(data.matrix(dtm), stringsAsfactors = FALSE)
mat.df <- cbind(mat.df, df$type, row.names = NULL)
colnames(mat.df)[ncol(mat.df)] <- "type"
train <- sample(nrow(mat.df), ceiling(nrow(mat.df) * .70))
test <- (1:nrow(mat.df))[- train]
cl <- mat.df[,"type"]
modeldata <- mat.df[,!colnames(mat.df) %in% "type"]
knn.pred <- knn(modeldata[train, ], modeldata[test, ], cl[train])
conf.mat <- table("Predictions" = knn.pred, Actual = cl[test])
conf.mat
## Actual
## Predictions 0 1 2 3 4 5 6 7 8 9 10 11 12 14 16 18 24
## 0 18 24 19 11 11 4 7 2 2 0 0 2 0 1 0 0 1
## 1 14 28 11 15 8 6 8 4 1 1 1 3 1 0 0 0 0
## 2 8 12 7 12 0 4 1 1 1 1 0 0 0 0 0 0 0
## 3 1 1 2 1 0 1 1 1 0 0 0 0 0 0 0 0 0
## 4 0 3 3 3 1 3 0 0 1 0 0 0 0 0 0 0 0
## 5 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0
## 6 0 1 0 0 4 1 1 0 0 0 0 1 1 0 0 0 0
## 7 0 0 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 8 0 1 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0
## 9 1 1 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0
## 10 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 1 0
## 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 12 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 13 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 15 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 17 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 18 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 19 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
(accuracy <- sum(diag(conf.mat))/length(test) * 100)
## [1] 19.33333
frequent.terms.clean <- freq_terms(mat.df, 20)
frequent.terms.clean
## WORD FREQ
## 1 c 39262
plot(frequent.terms.clean)
#ENFJ <- subset(df, type=="ENFJ")
#ENFP <- subset(df, type=="ENFP")
#ENTJ <- subset(df, type=="ENTJ")
#ENTP <- subset(df, type=="ENTP")
#ESFJ <- subset(df, type=="ESFJ")
#ESFP <- subset(df, type=="ESFP")
#ESTJ <- subset(df, type=="ESTJ")
#ESTP <- subset(df, type=="ESTP")
#INFJ <- subset(df, type=="INFJ")
#INFP <- subset(df, type=="INFP")
#INTJ <- subset(df, type=="INTJ")
#INTP <- subset(df, type=="INTP")
#ISFJ <- subset(df, type=="ISFJ")
#ISFP <- subset(df, type=="ISFP")
#ISTJ <- subset(df, type=="ISTJ")
#ISTP <- subset(df, type=="ISTP")
#ENFJ.freq <- freq_terms(ENFJ$posts, 10)
#ENFP.freq <- freq_terms(ENFP$posts, 10)
#ENTJ.freq <- freq_terms(ENTJ$posts, 10)
#ENTP.freq <- freq_terms(ENTP$posts, 10)
#ESFJ.freq <- freq_terms(ESFJ$posts, 10)
#ESFP.freq <- freq_terms(ESFP$posts, 10)
#ESTJ.freq <- freq_terms(ESTJ$posts, 10)
#ESTP.freq <- freq_terms(ESTP$posts, 10)
#INFJ.freq <- freq_terms(INFJ$posts, 10)
#INFP.freq <- freq_terms(INFP$posts, 10)
#INTJ.freq <- freq_terms(INTJ$posts, 10)
#INTP.freq <- freq_terms(INTP$posts, 10)
#ISFJ.freq <- freq_terms(ISFJ$posts, 10)
#ISFP.freq <- freq_terms(ISFP$posts, 10)
#ISTJ.freq <- freq_terms(ISTJ$posts, 10)
#ISTP.freq <- freq_terms(ISTP$posts, 10)
#plot(ENFJ.freq)
#plot(ENFP.freq)
##plot(ENTJ.freq)
#plot(ENTP.freq)
#plot(ESFJ.freq)
##plot(ESFP.freq)
#plot(ESTJ.freq)
#plot(ESTP.freq)
#plot(INFJ.freq)
#plot(INFP.freq)
#plot(INTJ.freq)
#plot(INTP.freq)
#plot(ISFJ.freq)
#plot(ISFP.freq)
#plot(ISTJ.freq)
#plot(ISTP.freq)