Load data
wiki <- read.csv("wiki.csv", stringsAsFactors=FALSE)
wiki$Vandal <- as.factor(wiki$Vandal)
str(wiki)
## 'data.frame': 3876 obs. of 7 variables:
## $ X.1 : int 1 2 3 4 5 6 7 8 9 10 ...
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Vandal : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Minor : int 1 1 0 1 1 0 0 0 1 0 ...
## $ Loggedin: int 1 1 1 0 1 1 1 1 1 0 ...
## $ Added : chr " represent psycholinguisticspsycholinguistics orthographyorthography help text all actions through human ethnologue relationsh"| __truncated__ " website external links" " " " afghanistan used iran mostly that farsiis is countries some xmlspacepreservepersian parts tajikestan region" ...
## $ Removed : chr " " " talklanguagetalk" " regarded as technologytechnologies human first" " represent psycholinguisticspsycholinguistics orthographyorthography help all actions through ethnologue relationships linguis"| __truncated__ ...
How many cases of vandalism are there?
table(wiki$Vandal)
##
## 0 1
## 2061 1815
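Slightly more than half of the revisions (2061 of 3876, about 53%) are not vandalism, so a useful model must beat roughly 53% accuracy. The class balance as proportions (an optional check, not part of the original workflow):
prop.table(table(wiki$Vandal))  # proportions of non-vandal (0) and vandal (1) revisions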
Prepare a corpus from the “Added” words column and build a document-term matrix
library(tm)
## Warning: package 'tm' was built under R version 3.1.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.1.3
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 3.1.3
corpusAdded <- Corpus(VectorSource(wiki$Added))
corpusAdded <- tm_map(corpusAdded, removeWords, stopwords("english"))
corpusAdded <- tm_map(corpusAdded, stemDocument)
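As an aside, stemDocument relies on SnowballC’s Snowball stemmers, which collapse inflected forms of a word to a common stem. A small illustration using SnowballC directly (not part of the original workflow):
wordStem(c("running", "runs"))  # both forms reduce to the stem "run"
## [1] "run" "run"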
(dtmAdded <- DocumentTermMatrix(corpusAdded))
## <<DocumentTermMatrix (documents: 3876, terms: 6675)>>
## Non-/sparse entries: 15368/25856932
## Sparsity : 100%
## Maximal term length: 784
## Weighting : term frequency (tf)
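Before filtering, it can be informative to peek at the most common terms. tm’s findFreqTerms lists terms whose total count meets a cutoff (the value 20 here is an arbitrary choice for illustration):
findFreqTerms(dtmAdded, lowfreq=20)  # terms appearing at least 20 times overall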
Filter out sparse terms, keeping only terms that appear in 0.3% or more of the revisions:
(sparseAdded <- removeSparseTerms(dtmAdded, 1 - 0.003))
## <<DocumentTermMatrix (documents: 3876, terms: 166)>>
## Non-/sparse entries: 2681/640735
## Sparsity : 100%
## Maximal term length: 28
## Weighting : term frequency (tf)
Convert sparseAdded to a data frame called wordsAdded and prefix the column names with the letter A, so the added-word columns can be told apart from the removed-word columns later:
wordsAdded <- as.data.frame(as.matrix(sparseAdded))
colnames(wordsAdded) <- paste("A", colnames(wordsAdded))
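Note that paste joins with a space by default, so the new names have the form “A term”. A quick optional check:
head(colnames(wordsAdded))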
Now repeat the same steps for the “Removed” words column:
corpusRemoved <- Corpus(VectorSource(wiki$Removed))
corpusRemoved <- tm_map(corpusRemoved, removeWords, stopwords("english"))
corpusRemoved <- tm_map(corpusRemoved, stemDocument)
dtmRemoved <- DocumentTermMatrix(corpusRemoved)
sparseRemoved <- removeSparseTerms(dtmRemoved, 1 - 0.003)
wordsRemoved <- as.data.frame(as.matrix(sparseRemoved))
colnames(wordsRemoved) <- paste("R", colnames(wordsRemoved))
Combine the two data frames and add back the Vandal column:
wikiWords <- cbind(wordsAdded, wordsRemoved)
wikiWords$Vandal <- wiki$Vandal
Split into training and test sets
library(caTools)
## Warning: package 'caTools' was built under R version 3.1.3
set.seed(123)
split <- sample.split(wikiWords$Vandal, SplitRatio=0.7)
train <- subset(wikiWords, split==TRUE)
test <- subset(wikiWords, split==FALSE)
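sample.split preserves the ratio of the two classes in both subsets; a quick optional check:
prop.table(table(train$Vandal))  # class proportions in the training set
prop.table(table(test$Vandal))   # should closely match the training proportions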
Baseline accuracy on the test set (always predicting the most frequent class, “not vandalism”)
(table(train$Vandal))
##
## 0 1
## 1443 1270
(table(test$Vandal))
##
## 0 1
## 618 545
618 / nrow(test)
## [1] 0.5313844
Build a CART model. What’s the accuracy using a threshold of 0.5?
library(rpart)
## Warning: package 'rpart' was built under R version 3.1.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.1.3
wikiCART <- rpart(Vandal ~ ., train, method="class")
prp(wikiCART)
predictCART <- predict(wikiCART, newdata=test, type="class")
(confmat <- table(test$Vandal, predictCART))
## predictCART
## 0 1
## 0 618 0
## 1 533 12
sum(diag(confmat)) / nrow(test)
## [1] 0.5417025
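For a binary classification tree, type="class" essentially labels a revision as vandalism when its predicted probability is at least 0.5. A sketch of the same predictions made explicitly from probabilities (predictProb is a name introduced here for illustration):
predictProb <- predict(wikiCART, newdata=test)[, 2]  # P(Vandal = 1) for each revision
table(test$Vandal, predictProb >= 0.5)  # should reproduce the confusion matrix above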
We weren’t able to improve meaningfully on the baseline using the raw textual information; the words themselves were not useful predictors. There are other options, though, and in this section we will try two techniques: identifying a key class of words, and counting words.
The key class of words we will use is website addresses (URLs). Since a lot of vandalism seems to involve adding links to promotional or irrelevant websites, we hypothesize that the presence of a web address is a sign of vandalism.
Create a new data frame with a column HTTP that is 1 if a URL was added and 0 otherwise:
wikiWords2 <- wikiWords
wikiWords2$HTTP <- ifelse(grepl("http", wiki$Added, fixed=TRUE), 1, 0)
table(wikiWords2$HTTP)
##
## 0 1
## 3659 217
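grepl returns TRUE wherever the pattern occurs, and fixed=TRUE treats "http" as a literal string rather than a regular expression. A small standalone illustration:
grepl("http", c("see http://example.com", "no link here"), fixed=TRUE)
## [1]  TRUE FALSE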
Split into training and test sets, reusing the same split as before so the results stay comparable:
train2 <- subset(wikiWords2, split == TRUE)
test2 <- subset(wikiWords2, split == FALSE)
Build a CART model and get its accuracy:
wikiCART2 <- rpart(Vandal ~ ., train2, method="class")
prp(wikiCART2)
predictCART2 <- predict(wikiCART2, newdata=test2, type="class")
(confmat <- table(test2$Vandal, predictCART2))
## predictCART2
## 0 1
## 0 609 9
## 1 488 57
sum(diag(confmat)) / nrow(test2)
## [1] 0.5726569
Another possibility is that the number of words added and removed is predictive, perhaps more so than the actual words themselves. We already have this information in the document-term matrices (DTMs): the row sums give the number of (stemmed, non-stopword) words in each revision.
Get the number of words added and removed:
wikiWords2$NumWordsAdded <- rowSums(as.matrix(dtmAdded))
wikiWords2$NumWordsRemoved <- rowSums(as.matrix(dtmRemoved))
mean(wikiWords2$NumWordsAdded)
## [1] 4.050052
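A mean alone can hide a skewed distribution; summary shows the quartiles as well (an optional check):
summary(wikiWords2$NumWordsAdded)  # five-number summary plus the mean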
Create training and test sets, again reusing the same split:
train2 <- subset(wikiWords2, split == TRUE)
test2 <- subset(wikiWords2, split == FALSE)
Build a CART model and get its accuracy:
wikiCART2 <- rpart(Vandal ~ ., train2, method="class")
prp(wikiCART2)
predictCART2 <- predict(wikiCART2, newdata=test2, type="class")
(confmat <- table(test2$Vandal, predictCART2))
## predictCART2
## 0 1
## 0 514 104
## 1 297 248
sum(diag(confmat)) / nrow(test2)
## [1] 0.6552021
Finally, use metadata from the original data frame, such as the Minor and Loggedin columns:
wikiWords3 <- wikiWords2
wikiWords3$Minor <- wiki$Minor
wikiWords3$Loggedin <- wiki$Loggedin
train3 <- subset(wikiWords3, split == TRUE)
test3 <- subset(wikiWords3, split == FALSE)
wikiCART3 <- rpart(Vandal ~ ., train3, method="class")
prp(wikiCART3)
predictCART3 <- predict(wikiCART3, newdata=test3, type="class")
(confmat <- table(test3$Vandal, predictCART3))
## predictCART3
## 0 1
## 0 595 23
## 1 304 241
sum(diag(confmat)) / nrow(test3)
## [1] 0.7188306
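Adding the metadata lifts accuracy to about 0.72, well above the 0.53 baseline. To see which variables the tree actually relies on, rpart stores a named importance vector on the fitted object (populated whenever the tree has at least one split):
wikiCART3$variable.importance  # larger values indicate more influential variables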