Load data
wiki <- read.csv("wiki.csv", stringsAsFactors=FALSE)
wiki$Vandal <- as.factor(wiki$Vandal)
str(wiki)
## 'data.frame': 3876 obs. of 7 variables:
## $ X.1 : int 1 2 3 4 5 6 7 8 9 10 ...
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Vandal : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Minor : int 1 1 0 1 1 0 0 0 1 0 ...
## $ Loggedin: int 1 1 1 0 1 1 1 1 1 0 ...
## $ Added : chr " represent psycholinguisticspsycholinguistics orthographyorthography help text all actions through human ethnologue relationsh"| __truncated__ " website external links" " " " afghanistan used iran mostly that farsiis is countries some xmlspacepreservepersian parts tajikestan region" ...
## $ Removed : chr " " " talklanguagetalk" " regarded as technologytechnologies human first" " represent psycholinguisticspsycholinguistics orthographyorthography help all actions through ethnologue relationships linguis"| __truncated__ ...
How many cases of vandalism are there?
table(wiki$Vandal)
##
## 0 1
## 2061 1815
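Slightly more than half of the revisions (2061 of 3876, about 53%) are not vandalism, so a useful model must beat roughly 53% accuracy. The class balance as proportions (an optional check, not part of the original workflow):
prop.table(table(wiki$Vandal))  # proportions of non-vandal (0) and vandal (1) revisions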
Prepare a corpus from the “Added” words column and build a document-term matrix
library(tm)
## Warning: package 'tm' was built under R version 3.1.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.1.3
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 3.1.3
corpusAdded <- Corpus(VectorSource(wiki$Added))
corpusAdded <- tm_map(corpusAdded, removeWords, stopwords("english"))
corpusAdded <- tm_map(corpusAdded, stemDocument)
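As an aside, stemDocument relies on SnowballC’s Snowball stemmers, which collapse inflected forms of a word to a common stem. A small illustration using SnowballC directly (not part of the original workflow):
wordStem(c("running", "runs"))  # both forms reduce to the stem "run"
## [1] "run" "run"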
(dtmAdded <- DocumentTermMatrix(corpusAdded))
## <<DocumentTermMatrix (documents: 3876, terms: 6675)>>
## Non-/sparse entries: 15368/25856932
## Sparsity : 100%
## Maximal term length: 784
## Weighting : term frequency (tf)
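Before filtering, it can be informative to peek at the most common terms. tm’s findFreqTerms lists terms whose total count meets a cutoff (the value 20 here is an arbitrary choice for illustration):
findFreqTerms(dtmAdded, lowfreq=20)  # terms appearing at least 20 times overall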
Filter out sparse terms, keeping only terms that appear in 0.3% or more of the revisions:
(sparseAdded <- removeSparseTerms(dtmAdded, 1 - 0.003))
## <<DocumentTermMatrix (documents: 3876, terms: 166)>>
## Non-/sparse entries: 2681/640735
## Sparsity : 100%
## Maximal term length: 28
## Weighting : term frequency (tf)
Convert sparseAdded to a data frame called wordsAdded and prefix the column names with the letter A, so the added-word columns can be told apart from the removed-word columns later:
wordsAdded <- as.data.frame(as.matrix(sparseAdded))
colnames(wordsAdded) <- paste("A", colnames(wordsAdded))
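Note that paste joins with a space by default, so the new names have the form “A term”. A quick optional check:
head(colnames(wordsAdded))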
Now repeat the same steps for the “Removed” words column:
corpusRemoved <- Corpus(VectorSource(wiki$Removed))
corpusRemoved <- tm_map(corpusRemoved, removeWords, stopwords("english"))
corpusRemoved <- tm_map(corpusRemoved, stemDocument)
dtmRemoved <- DocumentTermMatrix(corpusRemoved)
sparseRemoved <- removeSparseTerms(dtmRemoved, 1 - 0.003)
wordsRemoved <- as.data.frame(as.matrix(sparseRemoved))
colnames(wordsRemoved) <- paste("R", colnames(wordsRemoved))
Combine the two data frames and add back the Vandal column:
wikiWords <- cbind(wordsAdded, wordsRemoved)
wikiWords$Vandal <- wiki$Vandal
Split into training and test sets
library(caTools)
## Warning: package 'caTools' was built under R version 3.1.3
set.seed(123)
split <- sample.split(wikiWords$Vandal, SplitRatio=0.7)
train <- subset(wikiWords, split==TRUE)
test <- subset(wikiWords, split==FALSE)
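sample.split preserves the ratio of the two classes in both subsets; a quick optional check:
prop.table(table(train$Vandal))  # class proportions in the training set
prop.table(table(test$Vandal))   # should closely match the training proportions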
Baseline accuracy on the test set (always predicting the most frequent class, “not vandalism”)
(table(train$Vandal))
##
## 0 1
## 1443 1270
(table(test$Vandal))
##
## 0 1
## 618 545
618 / nrow(test)
## [1] 0.5313844
Build a CART model. What’s the accuracy using a threshold of 0.5?
library(rpart)
## Warning: package 'rpart' was built under R version 3.1.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.1.3
wikiCART <- rpart(Vandal ~ ., train, method="class")
prp(wikiCART)
predictCART <- predict(wikiCART, newdata=test, type="class")
(confmat <- table(test$Vandal, predictCART))
## predictCART
## 0 1
## 0 618 0
## 1 533 12
sum(diag(confmat)) / nrow(test)
## [1] 0.5417025
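For a binary classification tree, type="class" essentially labels a revision as vandalism when its predicted probability is at least 0.5. A sketch of the same predictions made explicitly from probabilities (predictProb is a name introduced here for illustration):
predictProb <- predict(wikiCART, newdata=test)[, 2]  # P(Vandal = 1) for each revision
table(test$Vandal, predictProb >= 0.5)  # should reproduce the confusion matrix above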
We weren’t able to improve meaningfully on the baseline using the raw textual information; the words themselves were not useful predictors. There are other options, though, and in this section we will try two techniques: identifying a key class of words, and counting words.
The key class of words we will use is website addresses (URLs). Since a lot of vandalism seems to involve adding links to promotional or irrelevant websites, we hypothesize that the presence of a web address is a sign of vandalism.
Create a new data frame with a column HTTP that is 1 if a URL was added and 0 otherwise:
wikiWords2 <- wikiWords
wikiWords2$HTTP <- ifelse(grepl("http", wiki$Added, fixed=TRUE), 1, 0)
table(wikiWords2$HTTP)
##
## 0 1
## 3659 217
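grepl returns TRUE wherever the pattern occurs, and fixed=TRUE treats "http" as a literal string rather than a regular expression. A small standalone illustration:
grepl("http", c("see http://example.com", "no link here"), fixed=TRUE)
## [1]  TRUE FALSE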
Split into training and test sets, reusing the same split as before so the results stay comparable:
train2 <- subset(wikiWords2, split == TRUE)
test2 <- subset(wikiWords2, split == FALSE)
Build a CART model and get its accuracy:
wikiCART2 <- rpart(Vandal ~ ., train2, method="class")
prp(wikiCART2)
predictCART2 <- predict(wikiCART2, newdata=test2, type="class")
(confmat <- table(test2$Vandal, predictCART2))
## predictCART2
## 0 1
## 0 609 9
## 1 488 57
sum(diag(confmat)) / nrow(test2)
## [1] 0.5726569
Another possibility is that the number of words added and removed is predictive, perhaps more so than the actual words themselves. We already have this information in the document-term matrices (DTMs): the row sums give the number of (stemmed, non-stopword) words in each revision.
Get the number of words added and removed:
wikiWords2$NumWordsAdded <- rowSums(as.matrix(dtmAdded))
wikiWords2$NumWordsRemoved <- rowSums(as.matrix(dtmRemoved))
mean(wikiWords2$NumWordsAdded)
## [1] 4.050052
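A mean alone can hide a skewed distribution; summary shows the quartiles as well (an optional check):
summary(wikiWords2$NumWordsAdded)  # five-number summary plus the mean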
Create training and test sets, again reusing the same split:
train2 <- subset(wikiWords2, split == TRUE)
test2 <- subset(wikiWords2, split == FALSE)
Build a CART model and get its accuracy:
wikiCART2 <- rpart(Vandal ~ ., train2, method="class")
prp(wikiCART2)
predictCART2 <- predict(wikiCART2, newdata=test2, type="class")
(confmat <- table(test2$Vandal, predictCART2))
## predictCART2
## 0 1
## 0 514 104
## 1 297 248
sum(diag(confmat)) / nrow(test2)
## [1] 0.6552021
Finally, use metadata from the original data frame, such as the Minor and Loggedin columns:
wikiWords3 <- wikiWords2
wikiWords3$Minor <- wiki$Minor
wikiWords3$Loggedin <- wiki$Loggedin
train3 <- subset(wikiWords3, split == TRUE)
test3 <- subset(wikiWords3, split == FALSE)
wikiCART3 <- rpart(Vandal ~ ., train3, method="class")
prp(wikiCART3)
predictCART3 <- predict(wikiCART3, newdata=test3, type="class")
(confmat <- table(test3$Vandal, predictCART3))
## predictCART3
## 0 1
## 0 595 23
## 1 304 241
sum(diag(confmat)) / nrow(test3)
## [1] 0.7188306
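Adding the metadata lifts accuracy to about 0.72, well above the 0.53 baseline. To see which variables the tree actually relies on, rpart stores a named importance vector on the fitted object (populated whenever the tree has at least one split):
wikiCART3$variable.importance  # larger values indicate more influential variables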