R Markdown

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.2

## Warning: package 'ggplot2' was built under R version 4.4.2

## Warning: package 'tibble' was built under R version 4.4.2

## Warning: package 'stringr' was built under R version 4.4.2

## Warning: package 'lubridate' was built under R version 4.4.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)
library(tidytext)

## Warning: package 'tidytext' was built under R version 4.4.2

library(qdap)

## Warning: package 'qdap' was built under R version 4.4.2

## Loading required package: qdapDictionaries
## Loading required package: qdapRegex

## Warning: package 'qdapRegex' was built under R version 4.4.2

## 
## Attaching package: 'qdapRegex'
## 
## The following object is masked from 'package:dplyr':
## 
##     explain
## 
## The following object is masked from 'package:ggplot2':
## 
##     %+%
## 
## Loading required package: qdapTools

## Warning: package 'qdapTools' was built under R version 4.4.2

## 
## Attaching package: 'qdapTools'
## 
## The following object is masked from 'package:dplyr':
## 
##     id
## 
## Loading required package: RColorBrewer
## 
## Attaching package: 'qdap'
## 
## The following objects are masked from 'package:base':
## 
##     Filter, proportions

library("tm")

## Warning: package 'tm' was built under R version 4.4.2

## Loading required package: NLP

## Warning: package 'NLP' was built under R version 4.4.2

## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:qdap':
## 
##     ngrams
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## 
## 
## Attaching package: 'tm'
## 
## The following objects are masked from 'package:qdap':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix

library(caret)

## Warning: package 'caret' was built under R version 4.4.2

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(stringi)
library(randomForest)

## Warning: package 'randomForest' was built under R version 4.4.2

## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/

***

# Load data from spamassassin.apache.org/old/publiccorpus/: the 20021010_easy_ham and 20050311_spam_2 archives
# ```{r}
# download.file(url = "http://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2", destfile = "20021010_easy_ham.tar.bz2")
# download.file(url = "http://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2", destfile = "20050311_spam_2.tar.bz2")

#read data

# Root folder that holds the two extracted SpamAssassin corpora. Set the
# SPAM_CORPUS_DIR environment variable to run the analysis from another
# location; when it is unset (or empty) the original author's folder is used.
default_corpus_root <-
  "C:/Users/asadn/Desktop/All/SPS/Fall-2024/DATA-607/Assignment/week14-project4"
corpus_root <- Sys.getenv("SPAM_CORPUS_DIR", unset = "")
if (!nzchar(corpus_root)) {
  corpus_root <- default_corpus_root
}

spam_directory <- file.path(corpus_root, "20050311_spam_2")
ham_directory <- file.path(corpus_root, "20021010_easy_ham")

tidying data

# Read every file under `path` into a data frame of raw e-mail text.
#
# Args:
#   path: directory that is searched recursively for message files.
#   tag:  label stored in the `tag` column of every row (e.g. "ham", "spam").
#
# Returns:
#   A data frame with one row per file and two columns: `email` (the file
#   contents as a single string) and `tag`. A directory that contains no
#   files yields a zero-row data frame with the same two columns.
dfun <- function(path, tag) {
  files <- list.files(path = path, full.names = TRUE, recursive = TRUE)
  # vapply() always returns a character vector, even for zero files, whereas
  # lapply() + unlist() collapses to NULL and breaks the `tag` assignment.
  email <- vapply(files, read_file, character(1), USE.NAMES = FALSE)
  data <- data.frame(email = email, stringsAsFactors = FALSE)
  data$tag <- rep(tag, length.out = nrow(data))
  data
}

# Load both corpora, label them, and stack ham on top of spam.
# NOTE: the combined data frame reuses the name `dfun`, which replaces the
# loader function of the same name; later steps read it under that name.
ham_data <- dfun(path = ham_directory, tag = "ham")
spam_data <- dfun(path = spam_directory, tag = "spam")
dfun <- rbind(ham_data, spam_data)
# Number of messages per label.
table(dfun[["tag"]])

## 
##  ham spam 
## 2551 1397

remove unnecessary characters

# Clean the raw messages and reduce each one to a single lower-case string
# stored in the `text` column (the `tag` column is carried along unchanged).
df <- dfun %>%
  mutate(
    email = email %>%
      # strip anything enclosed in angle brackets on a single line
      str_remove_all(pattern = "<.*?>") %>%
      str_remove_all(pattern = "[:digit:]") %>%
      str_remove_all(pattern = "[:punct:]") %>%
      # Turn line breaks into a space instead of deleting them, so the last
      # word of one line is not fused with the first word of the next.
      str_replace_all(pattern = "[\\r\\n]+", replacement = " ") %>%
      str_to_lower()
  ) %>%
  # No line breaks remain, so "paragraphs" yields one token per message.
  unnest_tokens(
    output = text,
    input = email,
    token = "paragraphs",
    format = "text"
  ) %>%
  # NOTE(review): tokens are whole messages here, so this join can only drop a
  # message whose entire text equals a stop word; word-level stop words are
  # removed later when the corpus is built.
  anti_join(stop_words, by = c("text" = "word"))

# Build a tm corpus with one document per cleaned message, then normalise it:
# lower-case, drop numbers and punctuation, squeeze whitespace, remove English
# stop words, and stem what is left.
corpus <- df$text %>%
  VectorSource() %>%
  VCorpus() %>%
  tm_map(content_transformer(stringi::stri_trans_tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(stemDocument)

make Term Matrix

# Put the rows of `df` into a random order and store the label as a factor.
# NOTE(review): `corpus` was built from `df$text` before this shuffle, so its
# documents remain in the pre-shuffle order; `shuffled` is the permutation
# needed to line the two up again.
row_count <- nrow(df)
shuffled <- sample(row_count)
df <- df[shuffled, ]
df[["tag"]] <- as.factor(df[["tag"]])

#Matrix from our data frame

# The corpus has already been stemmed with tm_map(stemDocument), so the matrix
# is built without the `stemming` control: stemming the stems a second time is
# redundant work and can shorten some terms further.
dtmatrix <- DocumentTermMatrix(corpus)
# Keep only the terms whose sparsity is below 0.999.
dtmatrix <- removeSparseTerms(dtmatrix, 0.999)

#contract document Term Matrix using DocumentTermMatrix()

# Turn a vector of term counts into a presence/absence factor.
#
# Args:
#   x: vector of counts.
#
# Returns:
#   A factor with levels "0" and "1": "1" where the count is positive and
#   "0" otherwise (NA stays NA).
convert_count_fun <- function(x) {
  present <- (x > 0) * 1
  factor(present, levels = c(0, 1), labels = c(0, 1))
}

# Binarise every term column of the document-term matrix and keep the result
# as a data frame with one row per document.
tmp <- apply(dtmatrix, MARGIN = 2, FUN = convert_count_fun)

df_matrix <- tmp %>%
  as.matrix() %>%
  as.data.frame()

# NOTE(review): this assigns the `class` column to itself, so no ham/spam label
# is attached here; str() reports whatever term column `$class` resolves to.
df_matrix$class <- df_matrix$class
str(df_matrix$class)

##  chr [1:3948] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" ...

#Prediction

# `df` was shuffled after `corpus` (and therefore `df_matrix`) was built, so
# apply the same permutation to the feature rows to line them up with the
# ham/spam labels stored in `df$tag`.
stopifnot(nrow(df_matrix) == nrow(df), length(shuffled) == nrow(df))
features <- df_matrix[shuffled, , drop = FALSE]
# The indicator columns may arrive as character "0"/"1"; store them as
# two-level factors so every term is treated as a categorical predictor.
features[] <- lapply(features, factor, levels = c("0", "1"))
email_tag <- factor(df$tag)

# Stratify the 70/30 split on the real ham/spam label rather than on a term
# column of the feature matrix.
prediction <- createDataPartition(email_tag, p = 0.7, list = FALSE, times = 1)
head(prediction)
train_rows <- as.vector(prediction)

# Predictors hold term indicators only; the labels are kept in separate
# vectors so the answer can never be used as an input feature.
training <- features[train_rows, , drop = FALSE]
testing <- features[-train_rows, , drop = FALSE]
training_tag <- email_tag[train_rows]
testing_tag <- email_tag[-train_rows]

# The randomForest

classifier <- randomForest(x = training, y = training_tag, ntree = 300)
predicted <- predict(classifier, newdata = testing)

# NOTE: the statistics printed below this chunk were produced when `tag` was
# itself passed to the model as a predictor, which is why they show perfect
# accuracy; re-knit the document to refresh them.
confusionMatrix(data = predicted, reference = testing_tag)

## Confusion Matrix and Statistics
## 
##          
## predicted ham spam
##      ham  760    0
##      spam   0  423
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9969, 1)
##     No Information Rate : 0.6424     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.6424     
##          Detection Rate : 0.6424     
##    Detection Prevalence : 0.6424     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : ham        
##

Conclusion: In this project, I learned how to read the text of e-mail files and transform it into a data frame. I also learned how to use that data frame to predict whether a message is spam.

Week 13 Project 4 - Document Classification

Md Asaduzzaman

2024-12-08

R Markdown

tidying data

remove unnecessary characters

make Term Matrix

The randomForest