In this assignment, spam emails and non-spam (ham) emails are used to create a model that will determine if other emails are spam or not. To do this, hundreds of emails already characterized as spam or ham were labelled. A portion of the data frame was used as a training set, and the other portion was used as a testing set. The training data was used to make the model. The testing data was used to test the accuracy of the model.
library(tidyr)
library(tibble)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(quanteda)
## Warning: package 'quanteda' was built under R version 4.3.2
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "ndiMatrix" of class "replValueSp"; definition not updated
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "pcorMatrix" of class "replValueSp"; definition not updated
## Package version: 3.3.1
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 12 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
library(quanteda.textmodels)
## Warning: package 'quanteda.textmodels' was built under R version 4.3.2
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(readr)
library(caTools)
## Warning: package 'caTools' was built under R version 4.3.2
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(imputeTS)
## Warning: package 'imputeTS' was built under R version 4.3.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(rpart)
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:caret':
##
## lift
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.2
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.3.2
When replicating this data, download the files found here:
https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2 https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2
The files were too large to upload to Git Hub and use directly from this Rmd file.
Each step is explained in the comments of the R chunk. In the data frames, each column has each line of the emails.
# List all the files needed
not_spam_files <- list.files("spamham/easy_ham")
#not_spam_files <- not_spam_files[95]
spam_files <- list.files("spamham/spam")
# This is a for loop to get the words of all the ham files
# First, it gets the words from each file
# Each column is one file with all its words
# Each row shows a different line of each email
num <- 1
nonSpamEmails <- as.data.frame(col1 <- c(1:3100))
for (file in not_spam_files)
{
del1 <- read.delim(paste0("spamham/easy_ham/", file), row.names = NULL)
names(del1)[1] <- "one"
if (ncol(del1) > 1)
{
names(del1)[2] <- "two"
filename <- paste0("file", num)
del1 <- unite(del1, filename, c(one, two))
}
else
{
names(del1)[1] <- "filename"
}
diffNum <- 3100 - nrow(del1)
del2 <- data.frame(filename = c(rep(NA, diffNum)))
del3 <- rbind(del1, del2)
nonSpamEmails <- cbind(nonSpamEmails, del3)
colnames(nonSpamEmails)[ncol(nonSpamEmails)] <- paste0("file", num)
num <- num + 1
}
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
nonSpamEmails <- nonSpamEmails[,-1]
# This is a for loop to get the words of all the spam files
# First, it gets the words from each file
# Each column is one file with all its words
# Each row shows a different line of each email
num <- 1
spamEmails <- as.data.frame(col1 <- c(1:3100))
for (file in spam_files)
{
del1 <- read.delim(paste0("spamham/spam/", file), row.names = NULL)
names(del1)[1] <- "one"
if (ncol(del1) > 1)
{
names(del1)[2] <- "two"
filename <- paste0("file", num)
del1 <- unite(del1, filename, c(one, two))
}
else
{
names(del1)[1] <- "filename"
}
diffNum <- 3100 - nrow(del1)
del2 <- data.frame(filename = c(rep(NA, diffNum)))
del3 <- rbind(del1, del2)
spamEmails <- cbind(spamEmails, del3)
colnames(spamEmails)[ncol(spamEmails)] <- paste0("file", num)
num <- num + 1
}
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
spamEmails <- spamEmails[,-1]
The training data uses 75% of the spam and 75% of the ham emails. The testing data uses 25% of the spam and 25% of the ham emails. The column named spam is changed to 1 for spam and 0 for ham.
train <- c(spamEmails[,1:375], nonSpamEmails[,1:1913]) # This is the first 75% of the data
test <- c(spamEmails[,376:500], nonSpamEmails[,1913:2551]) # This is the last 25% of the data
train <- dfm(as.character(train))
## Warning: 'dfm.character()' is deprecated. Use 'tokens()' first.
test <- dfm(as.character(test))
## Warning: 'dfm.character()' is deprecated. Use 'tokens()' first.
trainClass <- c(rep(1, 375), rep(0, 1913))
testClass <- c(rep(1, 125), rep(0, 639))
This model did not work, so this model is fully commented. I included this model and others to show what failed.
The model did not work because the predictions did not make sense. They were between 1 and 2 instead of 0 and 1, so it is unclear what the predictions were showing.
model3 <- textmodel_wordscores(train, as.numeric(factor(trainClass)))
# Yes = 2
# No = 1
predictions <- predict(model3, newdata = test)
## Warning: 14947 features in newdata not used in prediction.
summary(predictions)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.155 1.156 1.156 1.163 1.157 1.332
This section formats the data differently because the data needed to be in a different format for the models I wanted to try. In this new format, each column is a word that appears in at least one of the emails, and each row is a file. The cells that line up with a word and file show the count of that word in that file.
# Get lists of all files needed
not_spam_files <- list.files("spamham/easy_ham")
spam_files <- list.files("spamham/spam")
notSpamFrequencies <- as.data.frame(c())
spamFrequencies <- as.data.frame(c())
# For loop to count each word in each email (non-spam)
for (file in not_spam_files)
{
lines <- readLines(paste0("spamham/easy_ham/", file))
lines2 <- paste(lines , collapse =" ")
lines2 <- iconv(lines2, from = "ISO-8859-1", to = "UTF-8")
words <- as.data.frame(strsplit(lines2, " "))
words[words == ''] <- NA
words[words == '...'] <- NA
words <-
words |>
na.omit()
colnames(words)[1] = "one"
# Count each word
num <- 1
for (myWord in words$one)
{
total <-
words |>
filter(one == myWord) |>
count() |>
as.integer()
words$count[num] <- total
num <- num + 1
}
words <- distinct(words)
testing <- words |> pivot_wider(names_from = "one", values_from = "count")
notSpamFrequencies <- bind_rows(notSpamFrequencies, testing)
}
# For loop to count each word in each email (spam)
for (file in spam_files)
{
lines <- readLines(paste0("spamham/spam/", file))
lines2 <- paste(lines , collapse =" ")
lines2 <- iconv(lines2, from = "ISO-8859-1", to = "UTF-8")
words <- as.data.frame(strsplit(lines2, " "))
words[words == ''] <- NA
words[words == '...'] <- NA
words <-
words |>
na.omit()
colnames(words)[1] = "one"
# Count each word
num <- 1
for (myWord in words$one)
{
total <-
words |>
filter(one == myWord) |>
count() |>
as.integer()
words$count[num] <- total
num <- num + 1
}
words <- distinct(words)
testing <- words |> pivot_wider(names_from = "one", values_from = "count")
spamFrequencies <- bind_rows(spamFrequencies, testing)
}
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## • `...12304` -> `...75`
## Warning in readLines(paste0("spamham/spam/", file)): incomplete final line
## found on 'spamham/spam/0143.260a940290dcb61f9327b224a368d4af'
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## New names:
## • `else......11` -> `else...`
In this section, I showed the model that worked the best. None of the others worked. The accuracy was still very low at around 58%. However, the other models either did not work because of space limits or they resulted in horrible accuracy (less than 1% accuracy). Those are in the last section.
Disclaimer: This sections takes a long time to run, but it does finish eventually. Be patient, and it will work at some point.
# In this section, we want to create a model for new emails
# Add spam to the end
notSpamFrequencies$spam <- c(rep(0, nrow(notSpamFrequencies)))
spamFrequencies$spam <- c(rep(1, nrow(spamFrequencies)))
allFrequencies <- bind_rows(notSpamFrequencies, spamFrequencies)
allFrequencies$spam <- as.factor(allFrequencies$spam)
set.seed(1234)
spl = sample.split(allFrequencies$spam, 0.7)
train <- subset(allFrequencies, spl == TRUE)
test <- subset(allFrequencies, spl == FALSE)
eqnames <- names(train)[names(train) %in% names(test)]
train2 <- train[eqnames]
test2 <- test[eqnames]
train <- na_replace(train, 0)
train2 <- subset(train, select = -c(spam))
rf_classifier <- randomForest(x = train2[1500:2000,],
y = train$spam[1500:2000],
ntree = 100)
rf_classifier
##
## Call:
## randomForest(x = train2[1500:2000, ], y = train$spam[1500:2000], ntree = 100)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 374
##
## OOB estimate of error rate: 0%
## Confusion matrix:
## 0 1 class.error
## 0 287 0 0
## 1 0 214 0
test <- na_replace(test, 0)
test2 <- subset(test, select = -c(spam))
rf_pred <- predict(rf_classifier, newdata = test2[380:880,])
confusionMatrix(table(rf_pred,test$spam[380:880]))
## Confusion Matrix and Statistics
##
##
## rf_pred 0 1
## 0 197 0
## 1 189 115
##
## Accuracy : 0.6228
## 95% CI : (0.5787, 0.6654)
## No Information Rate : 0.7705
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3236
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5104
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.3783
## Prevalence : 0.7705
## Detection Rate : 0.3932
## Detection Prevalence : 0.3932
## Balanced Accuracy : 0.7552
##
## 'Positive' Class : 0
##
In this section, I show the code for other methods I tried. These methods did not work for a few reasons. Mainly, the data frames were too large with too many rows and columns for proper analysis. R Studio could not handle the amount of data involved. I commented out all the sections, but you can uncomment them to test if you would like to see.
#spamCART <- rpart(spam~., data=train, method="class")
#train |>
# keep(~ isTRUE(any((. >= 50)((spam == 1) | (spam == 0)))))
#train |> keep(~ (isTRUE(any(. >= 50) & train$spam < 2)))
#train |> keep(c(~ isTRUE(any(c((. <= 2), (. >= 50))))), spam)
#smallTrain <- train |>
# keep(~ isTRUE(any(. >= 50)))
#spamTrain <- train$spam
#over50 <- merge(spamTrain, smallTrain, 'row.names')
#over50 <- na.omit(over50)
#over50spamCART <- rpart(x~., data=over50, method="class")
#prp(over50spamCART)
#predTrainCART = predict(over50spamCART)[,2]
#predictionTrainCART = prediction(as.numeric(predTrainCART), over50$spam)
#as.numeric(performance(predictionTrainCART, "auc")@y.values)
#over50spamRF <- randomForest(x~., data=over50, na.action=na.omit)
#over50 <- over50[,-1]
#over50$x <- as.numeric(over50$x)
#spamLog = glm(x~., data=over50, family="binomial", na.action = na.omit)
#predTrainLog = predict(spamLog, type="response")
I used all these sources. Some of them I just glanced at. Some of them I used to create the models. The first two websites helped a lot.
https://rpubs.com/anilcs13m/126170 https://rpubs.com/Seun/455974 https://kharshit.github.io/blog/2017/08/25/email-spam-filtering-text-analysis-in-r http://www.sthda.com/english/wiki/reading-data-from-txt-csv-files-r-base-functions http://thinkagile.net/easily-import-multiple-files-into-r/ https://stackoverflow.com/questions/8854046/duplicate-row-names-are-not-allowed-error https://www.statology.org/r-combine-two-columns-into-one/ https://www.r-bloggers.com/2023/08/a-handy-guide-to-read-delim-in-r-unraveling-the-magic-of-reading-tabular-data/ https://www.datanovia.com/en/lessons/rename-data-frame-columns-in-r/ https://sparkbyexamples.com/r-programming/add-empty-column-to-dataframe-in-r/ https://cmdlinetips.com/2021/03/tips-to-add-columns-to-a-dataframe-with-add_column/ https://stackoverflow.com/questions/21781596/refer-to-the-last-column-in-r https://github.com/r-lib/rlang/issues/1300 https://stackoverflow.com/questions/40399229/cbind-2-dataframes-with-different-number-of-rows https://www.r-bloggers.com/2022/07/how-to-use-mutate-function-in-r/ https://www.geeksforgeeks.org/insert-multiple-rows-in-r-dataframe/ https://quanteda.io/reference/textmodel_nb.html https://www.rdocumentation.org/packages/quanteda/versions/1.5.0/topics/dfm https://stackoverflow.com/questions/15375483/r-put-all-elements-of-a-vector-into-one-element-without-paste https://stackoverflow.com/questions/6437164/removing-empty-rows-of-a-data-file-in-r https://kharshit.github.io/blog/2017/08/25/email-spam-filtering-text-analysis-in-r https://tidyr.tidyverse.org/reference/pivot_wider.html https://www.storybench.org/pivoting-data-from-columns-to-rows-and-back-in-the-tidyverse/ https://www.statology.org/pivot_wider-r/ https://dplyr.tidyverse.org/reference/distinct.html https://stackoverflow.com/questions/3402371/combine-two-data-frames-by-rows-rbind-when-they-have-different-sets-of-columns https://www.r-bloggers.com/2022/09/error-in-rbinddeparse-level-numbers-of-columns-of-arguments-do-not-match-2/#google_vignette https://stackoverflow.com/questions/76680882/unable-to-translate-to-a-wide-string https://www.rdocumentation.org/packages/randomForest/versions/4.7-1.1/topics/randomForest https://sparkbyexamples.com/r-programming/replace-na-values-with-zero-in-r-dataframe/ https://stackoverflow.com/questions/31385886/how-to-only-keep-the-columns-with-same-names-between-two-data-frames https://www.listendata.com/2015/06/r-keep-drop-columns-from-data-frame.html