The data for this problem is based on the revision history of the Wikipedia page Language. Wikipedia provides a history for each page consisting of the state of the page at each revision. Rather than inspecting every revision manually, a script was run that checked whether each edit stayed or was reverted: if a change was eventually reverted, that revision is marked as vandalism. This may produce some misclassifications, but the script performs well enough for our needs.
As a result of this preprocessing, some common text-processing tasks have already been done, including lower-casing and punctuation removal. The columns in the dataset are:
Vandal: 1 if the edit was vandalism, 0 otherwise
Minor: 1 if the user marked the edit as a minor edit
Loggedin: 1 if the edit was made by a logged-in user
Added: the unique words added by the edit
Removed: the unique words removed by the edit
packages = c("dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart","rpart.plot","randomForest")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)  # install only the missing packages
rm(list=ls(all=TRUE))  # start from a clean workspace
Sys.setlocale("LC_ALL","C")  # use the C locale so tm's string handling is consistent
## [1] "C/C/C/C/C/zh_TW.UTF-8"
options(digits=5, scipen=10)
library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)
wiki = read.csv("data/wiki.csv", stringsAsFactors = F)
wiki$Vandal = factor(wiki$Vandal)
table(wiki$Vandal)
##
## 0 1
## 2061 1815
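For reference, 2061 of the 3876 revisions are clean and 1815 are vandalism, so a model that always predicts "not vandalism" is right just over half the time. A minimal baseline check, using the counts above:
2061 / (2061 + 1815)  # baseline accuracy, roughly 0.532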
library(tm)
library(SnowballC)
# Create corpus for Added Words
txt = iconv(wiki$Added, to = "utf-8", sub="")
corpus = Corpus(VectorSource(txt))
corpus = tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
corpus = tm_map(corpus, stemDocument)
## Warning in tm_map.SimpleCorpus(corpus, stemDocument): transformation drops
## documents
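To spot-check the preprocessing, the content of a single document can be inspected (a quick sketch; the exact text depends on the revision):
as.character(corpus[[1]])  # first document after stopword removal and stemming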
dtm = DocumentTermMatrix(corpus)
dtm
## <<DocumentTermMatrix (documents: 3876, terms: 6674)>>
## Non-/sparse entries: 15367/25853057
## Sparsity : 100%
## Maximal term length: 784
## Weighting : term frequency (tf)
How many terms appear in dtmAdded (called dtm in the code above)? From the output: 6674 terms.
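With 6674 terms across 3876 documents the matrix is almost entirely zeros; before filtering, the few genuinely common terms can be listed (a sketch; the threshold of 50 is arbitrary):
findFreqTerms(dtm, lowfreq = 50)  # terms with total frequency of at least 50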
Filter out sparse terms by keeping only terms that appear in 0.3% or more of the revisions, and call the new matrix sparseAdded (the code below reuses the name dtm).
nwAdded = rowSums(as.matrix(dtm)) # number of words added in each revision
dtm = removeSparseTerms(dtm, 0.997)
dtm
## <<DocumentTermMatrix (documents: 3876, terms: 166)>>
## Non-/sparse entries: 2681/640735
## Sparsity : 100%
## Maximal term length: 28
## Weighting : term frequency (tf)
How many terms appear in sparseAdded? From the output: 166 terms.
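The 0.997 argument matches the 0.3% threshold: removeSparseTerms keeps only terms whose document frequency is at least 1 - 0.997 = 0.3% of the corpus.
3876 * (1 - 0.997)  # ~11.6, so a surviving term appears in at least 12 revisions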
wordsAdded & wordsRemoved
Convert sparseAdded to a data frame called wordsAdded, and then prepend all of the words with the letter A, by using the command:
wordsAdded = as.data.frame(as.matrix(dtm))
colnames(wordsAdded) = paste("A", colnames(wordsAdded))  # prefix column names with "A" to mark added words
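Note that paste with its default separator inserts a space, so the new column names contain a blank ("A" followed by a space and the stemmed term); a quick check (a sketch):
head(colnames(wordsAdded), 3)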
Now repeat all of the steps we've done so far to create a Removed bag-of-words data frame, called wordsRemoved, except this time, prepend all of the words with the letter R:
# Create corpus
txt = iconv(wiki$Removed, to = "utf-8", sub="")
corpus = Corpus(VectorSource(txt))
corpus = tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
corpus = tm_map(corpus, stemDocument)
## Warning in tm_map.SimpleCorpus(corpus, stemDocument): transformation drops
## documents
dtm = DocumentTermMatrix(corpus)
dtm
## <<DocumentTermMatrix (documents: 3876, terms: 5403)>>
## Non-/sparse entries: 13293/20928735
## Sparsity : 100%
## Maximal term length: 784
## Weighting : term frequency (tf)
nwRemoved = rowSums(as.matrix(dtm)) # number of words removed in each revision
dtm = removeSparseTerms(dtm, 0.997)
dtm
## <<DocumentTermMatrix (documents: 3876, terms: 162)>>
## Non-/sparse entries: 2552/625360
## Sparsity : 100%
## Maximal term length: 28
## Weighting : term frequency (tf)
wordsRemoved = as.data.frame(as.matrix(dtm))
colnames(wordsRemoved) = paste("R", colnames(wordsRemoved))  # prefix column names with "R" to mark removed words
How many words are in the wordsRemoved data frame? From the output above: 162.
Combine the Data Frames wordsAdded & wordsRemoved with the Target Variable wiki$Vandal
wikiWords = cbind(wordsAdded, wordsRemoved)
wikiWords$Vandal = wiki$Vandal
Split the data frame into training and test sets
library(caTools)
set.seed(123)
spl = sample.split(wikiWords$Vandal, 0.7) # 70% for training, preserving the class ratio
train = subset(wikiWords, spl == TRUE)
test = subset(wikiWords, spl == FALSE)
table(test$Vandal) %>% prop.table
##
## 0 1
## 0.53138 0.46862
The test set keeps the original class balance, so always predicting "not vandalism" yields a baseline accuracy of about 53.1%; any model needs to beat this.
library(rpart)
library(rpart.plot)
cart = rpart(Vandal~., train, method="class")
pred = predict(cart,test,type='class')
table(test$Vandal, pred) %>% {sum(diag(.)) / sum(.)} # 0.54428
## [1] 0.54428
prp(cart)
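ROCR is loaded in the preamble but not otherwise used; as a sketch (reusing the cart model and test set above), the AUC could be computed like this:
predProb = predict(cart, test)[, 2]  # predicted probability of vandalism
rocrPred = prediction(predProb, test$Vandal)
performance(rocrPred, "auc")@y.values[[1]]  # area under the ROC curve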
HTTP column
Add a new column based on whether "http" is added
wiki2 = wikiWords
wiki2$HTTP = ifelse( grepl("http",wiki$Added,fixed=TRUE) , 1, 0)
table(wiki2$HTTP) # 217
##
## 0 1
## 3659 217
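Only a small fraction of revisions add a link:
217 / 3876  # about 5.6% of revisions contain "http" in the added text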
train2 = subset(wiki2, spl==T)
test2 = subset(wiki2, spl==F)
cart2 = rpart(Vandal~., train2, method="class")
pred2 = predict(cart2,test2,type='class')
table(test2$Vandal, pred2) %>% {sum(diag(.)) / sum(.)} # 0.57524
## [1] 0.57524
wiki2$nwAdded = nwAdded
wiki2$nwRemoved = nwRemoved
mean(nwAdded) # average number of words added per revision
## [1] 4.0498
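Whether word counts differ between the two classes can be checked directly (an exploratory sketch, not part of the original steps):
tapply(nwAdded, wiki$Vandal, mean)  # mean number of words added, by class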
train = subset(wiki2, spl)
test = subset(wiki2, !spl)
cart = rpart(Vandal~., train, method="class")
pred = predict(cart,test,type='class')
table(test$Vandal, pred) %>% {sum(diag(.)) / sum(.)} # 0.6552
## [1] 0.6552
wiki3 = wiki2
wiki3$Minor = wiki$Minor
wiki3$Loggedin = wiki$Loggedin
train = subset(wiki3, spl == TRUE)
test = subset(wiki3, spl == FALSE)
cart = rpart(Vandal~., train, method="class")
pred = predict(cart,test,type='class')
table(test$Vandal, pred) %>% {sum(diag(.)) / sum(.)} # 0.72472
## [1] 0.72472
prp(cart)
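randomForest is loaded in the preamble but never used; as an optional sketch (not part of the original exercise), a random forest could be fit on the same split. The x/y interface is used here to sidestep formula handling of the space-containing column names:
set.seed(123)
rf = randomForest(x = train[, names(train) != "Vandal"],
                  y = train$Vandal, ntree = 200)  # 200 trees is an arbitrary choice
predRF = predict(rf, test[, names(test) != "Vandal"])
table(test$Vandal, predRF) %>% {sum(diag(.)) / sum(.)}  # test-set accuracy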