Перед тем, как приступить к анализу данных, следует, используя функции пакета tm выполнить следующие действия:
load("e:/RDir/messages.rda")
str(df1)
## 'data.frame': 1324 obs. of 2 variables:
## $ text : chr "Urgent! call 09061749602 from Landline. Your complimentary 4* Tenerife Holiday or Ј10,000 cash await collection SAE T&Cs BOX 52"| __truncated__ "+449071512431 URGENT! This is the 2nd attempt to contact U!U have WON Ј1250 CALL 09071512433 b4 050703 T&CsBCM4235WC1N3XX. call"| __truncated__ "FREE for 1st week! No1 Nokia tone 4 ur mob every week just txt NOKIA to 8007 Get txting and tell ur mates www.getzed.co.uk POBo"| __truncated__ "Urgent! call 09066612661 from landline. Your complementary 4* Tenerife Holiday or Ј10,000 cash await collection SAE T&Cs PO Box"| __truncated__ ...
## $ label: chr "spam" "spam" "spam" "spam" ...
N = length(df1[,1])
words = scan(text=df1[,1], what=character())
head(words)
## [1] "Urgent!" "call" "09061749602" "from" "Landline."
## [6] "Your"
Необходимо избавиться от лишних символов: цифр, знаков препинания, лишних пробелов - мусора. При помощи функции scan реализовать это не получится.
table(df1[,2])/length(df1[,2])
##
## not spam spam
## 0.7567976 0.2432024
require(tm)
messages <- VCorpus(VectorSource(df1[,1]))
msgs1 <- tm_map(messages, tolower)
msgs2 <- tm_map(msgs1, removeWords, stopwords("english"))
msgs3 <- tm_map(msgs2, removeNumbers)
msgs4 <- tm_map(msgs3, removePunctuation)
msgs5 <- tm_map(msgs4, stripWhitespace)
msgs <- c()
for (i in 1:N)
{
msgs <- c(msgs, msgs5[[i]])
}
head(msgs)
## [1] "urgent call landline complimentary tenerife holiday cash await collection sae tcs box hp yf ppm "
## [2] " urgent nd attempt contact uu won call b tcsbcmwcnxx callcost ppm mobilesvary max "
## [3] "free st week no nokia tone ur mob every week just txt nokia get txting tell ur mates wwwgetzedcouk pobox wwq normptone "
## [4] "urgent call landline complementary tenerife holiday cash await collection sae tcs po box wa px ppm sender hol offer"
## [5] "winner valued network customer selected receivea prize reward claim call claim code kl valid hours "
## [6] "okmail dear dave final notice collect tenerife holiday cash award call landline tcs sae box cwwx ppm"
msgs_labeled = data.frame(message=msgs, label=df1[,2])
head(msgs_labeled)
## message
## 1 urgent call landline complimentary tenerife holiday cash await collection sae tcs box hp yf ppm
## 2 urgent nd attempt contact uu won call b tcsbcmwcnxx callcost ppm mobilesvary max
## 3 free st week no nokia tone ur mob every week just txt nokia get txting tell ur mates wwwgetzedcouk pobox wwq normptone
## 4 urgent call landline complementary tenerife holiday cash await collection sae tcs po box wa px ppm sender hol offer
## 5 winner valued network customer selected receivea prize reward claim call claim code kl valid hours
## 6 okmail dear dave final notice collect tenerife holiday cash award call landline tcs sae box cwwx ppm
## label
## 1 spam
## 2 spam
## 3 spam
## 4 spam
## 5 spam
## 6 spam
s=13266
set.seed(s)
n <- 1000+sample(nrow(msgs_labeled)-1000,1)
set.seed(s)
my_data <- msgs_labeled[sample(nrow(df1), n), ]
require(caret)
indexes<-createDataPartition(my_data[,2],p=0.7,list=F)
tr_msgs=my_data[indexes,]
val_msgs =my_data[-indexes,]
N1=nrow(tr_msgs); N1
## [1] 723
spam_not_spam=table(tr_msgs[,2])
n_spam = spam_not_spam[2]
n_not_spam = spam_not_spam[1]
p_not_spam=n_not_spam/N1
p_spam=n_spam/N1
p_not_spam; p_spam
## not spam
## 0.769018
## spam
## 0.230982
my_corpus <- VCorpus(VectorSource(tr_msgs[,1]))
dtm <- DocumentTermMatrix(my_corpus)
freq_words=findFreqTerms(dtm,50)
word_cnt = length(freq_words)
freq_words; word_cnt
## [1] "call" "can" "free" "lor" "now"
## [1] 5
dtm2 <- DocumentTermMatrix(my_corpus, control = list(dictionary = freq_words))
inspectdtm2=inspect(dtm2)
colSums(inspectdtm2)
## call can free lor now
## 110 54 58 88 97
spam_dtm <- dtm2[which(tr_msgs[,2] == "spam"),]
not_spam_dtm <- dtm2[which(tr_msgs[,2] == "not spam"),]
i_s_dtm = inspect(spam_dtm)
i_n_s_dtm = inspect(not_spam_dtm)
word_freq_spam = colSums(i_s_dtm)
word_freq_not_spam = colSums(i_n_s_dtm)
word_freq_spam; word_freq_not_spam
## call can free lor now
## 87 6 44 0 42
## call can free lor now
## 23 48 14 88 55
word_prob_spam = word_freq_spam/n_spam
word_prob_not_spam = word_freq_not_spam/n_not_spam
word_prob_spam; word_prob_not_spam
## call can free lor now
## 0.52095808 0.03592814 0.26347305 0.00000000 0.25149701
## call can free lor now
## 0.04136691 0.08633094 0.02517986 0.15827338 0.09892086
spam_dtm2 <- dtm[which(tr_msgs[,2] == "spam"),]
not_spam_dtm2 <- dtm[which(tr_msgs[,2] == "not spam"),]
freq_spam_words = findFreqTerms(spam_dtm2, 20)
freq_not_spam_words = findFreqTerms(not_spam_dtm2, 20)
spamdtm2 <- DocumentTermMatrix(my_corpus, control = list(dictionary = freq_spam_words))
notspamdtm2 <- DocumentTermMatrix(my_corpus, control = list(dictionary = freq_not_spam_words))
isd=inspect(spamdtm2)
isnd=inspect(notspamdtm2)
sort(colSums(isd),decreasing = T)
freq_spam_words = names(as.factor(sort(colSums(isd),decreasing = T)))[1:5]
freq_spam_words
sort(colSums(isnd),decreasing = T)
freq_not_spam_words = names(as.factor(sort(colSums(isnd),decreasing = T)))[1:5]
freq_not_spam_words
spamdtm2 <- DocumentTermMatrix(my_corpus, control = list(dictionary = freq_spam_words))
notspamdtm2 <- DocumentTermMatrix(my_corpus, control = list(dictionary = freq_not_spam_words))
isd=inspect(spamdtm2)
isnd=inspect(notspamdtm2)
word_prob_spam=colSums(isd)/n_spam; word_prob_not_spam=colSums(isnd)/n_not_spam
word_prob_spam; word_prob_not_spam
## call free mobile now txt
## 0.6586826 0.3473054 0.2155689 0.5808383 0.2754491
## call can lor now wat
## 0.19784173 0.09712230 0.15827338 0.17446043 0.07553957
idtm = inspect(dtm)
aprobs <- function(dict)
{
len = length(dict);
p = data.frame(words=dict,probs_spam = 1:len,probs_not_spam=1:len);
for (j in 1:len)
{
n = sum(idtm[,dict[j]]);
ni = sum(spam_dtm2[,dict[j]]);
p[j,2]=(ni+1)/(n+len);
}
for (j in 1:len)
{
n = sum(idtm[,dict[j]]);
ni = sum(not_spam_dtm2[,dict[j]]);
p[j,3]=(ni+1)/(n+len);
}
return(p);
}
aprobs(freq_spam_words)
## words probs_spam probs_not_spam
## 1 call 0.7652174 0.20869565
## 2 now 0.4215686 0.54901961
## 3 free 0.7142857 0.23809524
## 4 txt 0.7450980 0.19607843
## 5 mobile 0.9024390 0.02439024
aprobs(freq_not_spam_words)
## words probs_spam probs_not_spam
## 1 call 0.76521739 0.2086957
## 2 now 0.42156863 0.5490196
## 3 lor 0.01075269 0.9569892
## 4 can 0.11864407 0.8305085
## 5 wat 0.02127660 0.9148936
strcount <- function(x, pattern, split = " ")
{
unlist(lapply(strsplit(x, split),function(z) na.omit(length(grep(pattern, z)))))
}
estimate_class_probs <- function(dict, ps_sp, ps_n_sp, p_sp, p_n_sp, query)
{
words_prob_spam = 1; words_prob_not_spam = 1;
len = length(dict);
for (i in 1:len)
{
occur_i <- str_count(query,dict[i]);
word_prob_spam <- ps_sp[i]^occur_i;
word_prob_not_spam <- ps_n_sp[i]^occur_i;
words_prob_spam <- words_prob_spam*word_prob_spam;
words_prob_not_spam <- words_prob_not_spam*word_prob_not_spam;
}
words_prob_spam_final = words_prob_spam * p_sp;
words_prob_not_spam_final = words_prob_not_spam * p_n_sp;
class_probs <- c(words_prob_not_spam_final, words_prob_spam_final);
names(class_probs) <- c("not spam", "spam");
return(class_probs);
}
get_winner <- function(class_probs)
{
ifelse((class_probs[1]>class_probs[2]), "not spam", "spam")
}
ps_sp=aprobs(freq_words)[,"probs_spam"]
ps_n_sp=aprobs(freq_words)[,"probs_not_spam"]
len = length(tr_msgs[,1])
probs_tr = data.frame("not spam"=vector(length=len), "spam"=vector(length=len))
names(probs_tr)=c("not spam","spam")
for (i in 1:len) {
probs_tr[i,]=estimate_class_probs(freq_words,ps_sp,ps_n_sp,p_spam,p_not_spam,tr_msgs[i,1])
}
head(probs_tr)
## not spam spam
## 1 0.638676 0.02740465
## 2 0.769018 0.23098202
## 3 0.638676 0.02740465
## 4 0.769018 0.23098202
## 5 0.769018 0.23098202
## 6 0.769018 0.23098202
classified_tr=vector(length=len)
for (i in 1:len) {
classified_tr[i]=get_winner(probs_tr[i,])
}
head(classified_tr)
## [1] "not spam" "not spam" "not spam" "not spam" "not spam" "not spam"
table(classified_tr==tr_msgs[,2])["FALSE"]/sum(table(classified_tr==tr_msgs[,2]))
## FALSE
## 0.1562932
ps_sp=aprobs(freq_words)[,"probs_spam"]
ps_n_sp=aprobs(freq_words)[,"probs_not_spam"]
len = length(val_msgs[,1])
probs_val = data.frame("not spam"=vector(length=len), "spam"=vector(length=len))
names(probs_val)=c("not spam","spam")
for (i in 1:len) {
probs_val[i,]=estimate_class_probs(freq_words,ps_sp,ps_n_sp,p_spam,p_not_spam,val_msgs[i,1])
}
head(probs_val)
## not spam spam
## 1 0.7690180 0.230982019
## 2 0.7690180 0.230982019
## 3 0.1604907 0.176751458
## 4 0.7690180 0.230982019
## 5 0.7690180 0.230982019
## 6 0.7359419 0.002483678
classified_val=vector(length=len)
for (i in 1:len) {
classified_val[i]=get_winner(probs_val[i,])
}
table(classified_val==val_msgs[,2])["FALSE"]/len
## FALSE
## 0.1428571
freq_w=findFreqTerms(dtm,30)
error_tr = vector(length=length(freq_w))
for (j in 1:length(freq_w))
{
ps_sp=aprobs(freq_w[1:j])[,"probs_spam"];
ps_n_sp=aprobs(freq_w[1:j])[,"probs_not_spam"];
len = length(tr_msgs[,1]);
probs_tr = data.frame("not spam"=vector(length=len), "spam"=vector(length=len));
names(probs_tr)=c("not spam","spam");
for (i in 1:len) {
probs_tr[i,]=estimate_class_probs(freq_w[1:j],ps_sp,ps_n_sp,p_spam,p_not_spam,tr_msgs[i,1])
}
classified_tr=vector(length=len);
for (i in 1:len) {
classified_tr[i]=get_winner(probs_tr[i,])
}
error_tr[j]=table(classified_tr==tr_msgs[,2])["FALSE"]/len;
}
error_val = vector(length=length(freq_w))
for (j in 1:length(freq_w))
{
ps_sp=aprobs(freq_w[1:j])[,"probs_spam"];
ps_n_sp=aprobs(freq_w[1:j])[,"probs_not_spam"];
len = length(val_msgs[,1]);
probs_val = data.frame("not spam"=vector(length=len), "spam"=vector(length=len));
names(probs_val)=c("not spam","spam");
for (i in 1:len) {
probs_val[i,]=estimate_class_probs(freq_w[1:j],ps_sp,ps_n_sp,p_spam,p_not_spam,val_msgs[i,1])
}
classified_val=vector(length=len);
for (i in 1:len) {
classified_val[i]=get_winner(probs_val[i,])
}
error_val[j]=table(classified_val==val_msgs[,2])["FALSE"]/len;
}
plot(error_tr, type="l",xlab = "Количество слов в словаре", ylab="Ошибка", col="red")
points(error_val,type="l",col="blue")
legend('top',c("Обучающая выборка","Контрольная выборка"),lty=c(1,1),col=c('red','blue'),ncol=1,bty ="n")