library(tm)
library(stringi)
library(ggplot2)
library(dplyr)
library(tidytext)
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
download.file(url, destfile = "Coursera-SwiftKey.zip")
unzip("Coursera-SwiftKey.zip")
}
blogs <- readLines("final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines("final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul
## = TRUE): incomplete final line found on 'final/en_US/en_US.news.txt'
twitter <- readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
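# The "incomplete final line" warning for en_US.news.txt is a known quirk of
# this dataset: an embedded control character stops text-mode reading early,
# which is why the news line count below looks low. If the full file were
# needed, reading the connection in binary mode should recover all lines
# (a sketch, not used downstream):
con <- file("final/en_US/en_US.news.txt", open = "rb")
news_full <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)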
stats <- data.frame(
  File  = c("Blogs", "News", "Twitter"),
  Lines = c(length(blogs), length(news), length(twitter)),
  Words = c(sum(stri_count_words(blogs)),
            sum(stri_count_words(news)),
            sum(stri_count_words(twitter)))
)
print(stats)
##      File   Lines    Words
## 1   Blogs  899288 37546806
## 2    News   77259  2674561
## 3 Twitter 2360148 30096690
set.seed(123)
sample_data <- c(sample(blogs, round(length(blogs) * 0.01)),    # round() keeps sample sizes integral
                 sample(news, round(length(news) * 0.01)),
                 sample(twitter, round(length(twitter) * 0.01)))
# Create a Corpus and clean it
corpus <- VCorpus(VectorSource(sample_data))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
# Note: For prediction, we usually keep stop words to maintain sentence flow.
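# If we only wanted frequency exploration rather than prediction, stop words
# could be dropped with tm's built-in English list, e.g.:
# corpus <- tm_map(corpus, removeWords, stopwords("en"))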
# Create a term-document matrix to get word frequencies
tdm <- TermDocumentMatrix(corpus)
freq_matrix <- as.matrix(tdm)   # dense conversion is fine for a 1% sample
words <- sort(rowSums(freq_matrix), decreasing = TRUE)
df_words <- data.frame(word = names(words), freq = words)
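# With ggplot2 already loaded, a quick look at the most frequent terms in the
# sample (top-20 is an arbitrary cutoff chosen for readability):
df_words %>%
  slice_max(freq, n = 20) %>%
  ggplot(aes(x = reorder(word, freq), y = freq)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Frequency", title = "Top 20 terms in the 1% sample")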
# Draw a fresh sample (different seed) for the n-gram analysis
set.seed(42)
sample_text <- c(sample(blogs, round(length(blogs) * 0.01)),
                 sample(news, round(length(news) * 0.01)),
                 sample(twitter, round(length(twitter) * 0.01)))
# Create Tidy Data Frame for N-grams
sample_df <- data.frame(text = sample_text, stringsAsFactors = FALSE)
# Extracting Bigrams
bigrams <- sample_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%   # lines shorter than two tokens produce NA
  count(bigram, sort = TRUE)
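# Trigrams follow the same pattern and give the prediction model more context
# to back off from when a bigram is unseen (sketch):
trigrams <- sample_df %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>%
  count(trigram, sort = TRUE)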
# 1. Download a common profanity list (using a public repository)
profanity_url <- "https://raw.githubusercontent.com/shutterstock/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en"
profanity_file <- "profanity_list.txt"
if (!file.exists(profanity_file)) {
  download.file(profanity_url, destfile = profanity_file)
}
# 2. Read the list into R
profanity_words <- readLines(profanity_file, warn = FALSE, encoding = "UTF-8")
# 3. Apply the filter to the corpus
clean_corpus <- tm_map(corpus, removeWords, profanity_words)
print("Profanity filtering complete.")
## [1] "Profanity filtering complete."
# 4. Load libraries for styling and define paths
library(knitr)
library(kableExtra)
file_paths <- c(blogs   = "final/en_US/en_US.blogs.txt",
                news    = "final/en_US/en_US.news.txt",
                twitter = "final/en_US/en_US.twitter.txt")
# 5. Create "file_info"
file_info <- data.frame(
  Line_Count = c(length(blogs), length(news), length(twitter)),
  Word_Count = c(sum(stri_count_words(blogs)),
                 sum(stri_count_words(news)),
                 sum(stri_count_words(twitter)))
)
# 6. Prepare the data
summary_table <- data.frame(
  Source = c("Blogs", "News", "Twitter"),
  `File Size (MB)` = round(sapply(file_paths, function(x) file.info(x)$size / 1024^2), 2),
  `Line Count` = format(file_info$Line_Count, big.mark = ","),
  `Word Count` = format(file_info$Word_Count, big.mark = ","),
  `Avg Words/Line` = round(file_info$Word_Count / file_info$Line_Count, 2),
  check.names = FALSE   # keep the display-friendly column names intact
)
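# Render the styled summary table; a plausible call given that knitr and
# kableExtra were loaded above (output shown below):
kable(summary_table, row.names = FALSE) %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)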
| Source  | File Size (MB) | Line Count | Word Count | Avg Words/Line |
|---------|----------------|------------|------------|----------------|
| Blogs   | 200.42         | 899,288    | 37,546,806 | 41.75          |
| News    | 196.28         | 77,259     | 2,674,561  | 34.62          |
| Twitter | 159.36         | 2,360,148  | 30,096,690 | 12.75          |
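# As a preview of the prediction step mentioned above, a minimal next-word
# lookup over the bigram counts. predict_next() is a hypothetical helper; a
# real model would add smoothing and back-off to unigrams:
predict_next <- function(prev_word, bigram_counts, top_n = 3) {
  bigram_counts %>%
    separate(bigram, into = c("w1", "w2"), sep = " ") %>%
    filter(w1 == tolower(prev_word)) %>%   # unnest_tokens lowercased the n-grams
    slice_max(n, n = top_n, with_ties = FALSE) %>%
    pull(w2)
}
predict_next("thank", bigrams)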