The data for this problem are based on the revision history of the Wikipedia page "Language". Wikipedia keeps a history of every page, recording its state at each revision. Rather than reviewing every revision manually, a script checked whether each edit stayed or was later reverted; if a change was eventually reverted, that revision is marked as vandalism. This may produce some misclassifications, but the script performs well enough for our purposes.
As a result of this preprocessing, some common text-processing steps have already been applied, including lower-casing and punctuation removal. The columns used below are Vandal (1 if the revision was reverted as vandalism, 0 otherwise), Minor (whether the edit was marked as minor), Loggedin (whether the editor was logged in), Added (the unique words added by the revision), and Removed (the unique words removed).
packages = c(
"dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart","rpart.plot","randomForest")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
rm(list=ls(all=TRUE))
Sys.setlocale("LC_ALL","C")
## [1] "C"
options(digits=5, scipen=10)
library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)
wiki = read.csv("data/wiki.csv", stringsAsFactors = F)
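Once the file is loaded, a quick structure check should list the columns described above (output omitted here); a minimal sketch:
str(wiki)   # should show Vandal, Minor, Loggedin, Added, Removed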
wiki$Vandal = factor(wiki$Vandal)
table(wiki$Vandal)
##
## 0 1
## 2061 1815
【P1.1】How many cases of vandalism were detected in the history of this page?
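The count can be read off the table above or computed directly; a minimal sketch:
sum(wiki$Vandal == "1")   # revisions flagged as vandalism (1815 in the table above)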
library(tm)
library(SnowballC)
# Create corpus for Added Words
txt = iconv(wiki$Added, to = "utf-8", sub="")
corpus = Corpus(VectorSource(txt))
corpus = tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
corpus = tm_map(corpus, stemDocument)
## Warning in tm_map.SimpleCorpus(corpus, stemDocument): transformation drops
## documents
dtm = DocumentTermMatrix(corpus)
dtm
## <<DocumentTermMatrix (documents: 3876, terms: 6675)>>
## Non-/sparse entries: 15368/25856932
## Sparsity : 100%
## Maximal term length: 784
## Weighting : term frequency (tf)
【P1.2】How many terms appear in dtmAdded?
Filter out sparse terms by keeping only terms that appear in 0.3% or more of the revisions, and call the new matrix sparseAdded.
nwAdded = rowSums(as.matrix(dtm)) # number of words added in each revision
dtm = removeSparseTerms(dtm, 0.997)
dtm
## <<DocumentTermMatrix (documents: 3876, terms: 166)>>
## Non-/sparse entries: 2681/640735
## Sparsity : 100%
## Maximal term length: 28
## Weighting : term frequency (tf)
【P1.3】How many terms appear in sparseAdded?
wordsAdded & wordsRemoved
Convert sparseAdded to a data frame called wordsAdded, and then prepend all the words with the letter A, using the command:
wordsAdded = as.data.frame(as.matrix(dtm))
colnames(wordsAdded) = paste("A", colnames(wordsAdded)) # for proper column names
Now repeat all of the steps we’ve done so far to create a Removed bag-of-words data frame, called wordsRemoved, except this time prepend all of the words with the letter R:
# Create corpus
txt = iconv(wiki$Removed, to = "utf-8", sub="")
corpus = Corpus(VectorSource(txt))
corpus = tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
corpus = tm_map(corpus, stemDocument)
## Warning in tm_map.SimpleCorpus(corpus, stemDocument): transformation drops
## documents
dtm = DocumentTermMatrix(corpus)
dtm
## <<DocumentTermMatrix (documents: 3876, terms: 5404)>>
## Non-/sparse entries: 13294/20932610
## Sparsity : 100%
## Maximal term length: 784
## Weighting : term frequency (tf)
nwRemoved = rowSums(as.matrix(dtm)) # number of words removed in each revision
dtm = removeSparseTerms(dtm, 0.997)
dtm
## <<DocumentTermMatrix (documents: 3876, terms: 162)>>
## Non-/sparse entries: 2552/625360
## Sparsity : 100%
## Maximal term length: 28
## Weighting : term frequency (tf)
wordsRemoved = as.data.frame(as.matrix(dtm))
colnames(wordsRemoved) = paste("R", colnames(wordsRemoved))
【P1.4】How many words are in the wordsRemoved data frame?
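The widths of the two bag-of-words data frames answer this directly; a quick check:
ncol(wordsAdded)     # 166 terms kept from the Added text
ncol(wordsRemoved)   # 162 terms kept from the Removed text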
Combine the Data Frames wordsAdded & wordsRemoved with the Target Variable wiki$Vandal
wikiWords = cbind(wordsAdded, wordsRemoved)
wikiWords$Vandal = wiki$Vandal
Split the data frame into training and test sets
library(caTools)
set.seed(123)
spl = sample.split(wikiWords$Vandal, 0.7)
train = subset(wikiWords, spl == TRUE)
test = subset(wikiWords, spl == FALSE)
table(test$Vandal) %>% prop.table
##
## 0 1
## 0.53138 0.46862
【P1.5】What is the accuracy on the test set of a baseline method that always predicts “not vandalism”?
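The baseline always predicts the majority class in the test set ("not vandalism"), so its accuracy is simply that class's proportion, already visible in the table above; a minimal sketch:
mean(test$Vandal == "0")   # baseline accuracy: share of non-vandalism revisions in the test set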
library(rpart)
library(rpart.plot)
cart = rpart(Vandal~., train, method="class")
pred = predict(cart,test,type='class')
table(test$Vandal, pred) %>% {sum(diag(.)) / sum(.)} # 0.54428
## [1] 0.54428
【P1.6】What is the accuracy of the model on the test set, using a threshold of 0.5?
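predict(..., type='class') applies the default 0.5 cutoff for a two-class tree; the same accuracy can be reproduced from the predicted probabilities with an explicit threshold, a minimal sketch:
probVandal = predict(cart, test, type = "prob")[, "1"]      # P(Vandal = 1) for each test revision
mean(ifelse(probVandal > 0.5, "1", "0") == test$Vandal)     # accuracy with an explicit 0.5 threshold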
prp(cart)
【P1.7】How many word stems does the CART model use?
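Rather than counting stems off the plot, the split variables can be pulled from the fitted object; a minimal sketch:
usedStems = setdiff(unique(as.character(cart$frame$var)), "<leaf>")
length(usedStems)   # number of distinct word stems the CART model splits on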
【P1.8】Given the performance of the CART model relative to the baseline, what is the best explanation of these results?
HTTP column
Add a new column based on whether "http" appears in the added words
wiki2 = wikiWords
wiki2$HTTP = ifelse( grepl("http",wiki$Added,fixed=TRUE) , 1, 0)
table(wiki2$HTTP) # 217
##
## 0 1
## 3659 217
【P2.1】Based on this new column, how many revisions added a link?
train2 = subset(wiki2, spl==T)
test2 = subset(wiki2, spl==F)
cart2 = rpart(Vandal~., train2, method="class")
pred2 = predict(cart2,test2,type='class')
table(test2$Vandal, pred2) %>% {sum(diag(.)) / sum(.)} # 0.57524
## [1] 0.57524
【P2.2】What is the new accuracy of the CART model on the test set, using a threshold of 0.5?
wiki2$nwAdded = nwAdded
wiki2$nwRemoved = nwRemoved
mean(nwAdded) # 4.0501
## [1] 4.0501
【P2.3】What is the average number of words added?
train = subset(wiki2, spl)
test = subset(wiki2, !spl)
cart = rpart(Vandal~., train, method="class")
pred = predict(cart,test,type='class')
table(test$Vandal, pred) %>% {sum(diag(.)) / sum(.)} # 0.6552
## [1] 0.6552
【P2.4】What is the new accuracy of the CART model on the test set?
The original data also contain a few columns we have not used yet, so let's add them in as well.
wiki3 = wiki2
wiki3$Minor = wiki$Minor
wiki3$Loggedin = wiki$Loggedin
train = subset(wiki3, spl == TRUE)
test = subset(wiki3, spl == FALSE)
cart = rpart(Vandal~., train, method="class")
pred = predict(cart,test,type='class')
table(test$Vandal, pred) %>% {sum(diag(.)) / sum(.)} # 0.72472
## [1] 0.72472
【P3.1】What is the accuracy of the model on the test set?
prp(cart)
【P3.2】How many splits are there in the tree?
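The split count can also be read from the fitted tree object rather than the plot; a minimal sketch:
sum(cart$frame$var != "<leaf>")   # each internal node of the rpart frame is one split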
Discussion topics:
■ List as many ways as you can think of to further improve the model's accuracy:
● Get more data: the quality of the training data limits the quality of the model.
● Create more data: in general, the more data you have, the better the model performs.
● Rescale the data, then evaluate the model's performance on each rescaled dataset.
● Transform the data: really get to know the data, visualize it, and look for outliers.
● Feature selection: you may be able to do just as well, or even better, with fewer features.
● Reframe the problem: are the observations you collected the only way to frame your problem?
● Improve performance through the algorithm itself, e.g. resampling methods… (see the sketch after this list)
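As one concrete example of the last point, a bagged-tree ensemble (bootstrap resampling of the training data) could be tried: the randomForest package is loaded above but never used. This is only a sketch under assumptions; ntree = 200 is an arbitrary choice, and the make.names step is needed because randomForest can reject the non-syntactic "A word"/"R word" column names created earlier.
trainRF = train; colnames(trainRF) = make.names(colnames(trainRF))  # make "A word"-style names syntactic
testRF  = test;  colnames(testRF)  = make.names(colnames(testRF))
set.seed(123)
rf = randomForest(Vandal ~ ., data = trainRF, ntree = 200)          # 200 trees is an assumed, not tuned, value
predRF = predict(rf, newdata = testRF)
mean(predRF == testRF$Vandal)                                       # test-set accuracy of the random forest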