This report summarizes the progress in developing a predictive text mining application using English language datasets. The goal is to predict the next word in a sentence based on previous words, using tokenization, profanity filtering, and n-gram modeling techniques.
# Function for Tokenization and Profanity Filtering
tokenize_and_filter <- function(file_path, profanity_list) {
  # Read the data
  lines <- readLines(file_path, warn = FALSE)
  # Tokenization: split each line on whitespace
  tokens <- unlist(strsplit(lines, "\\s+"))
  tokens <- tokens[tokens != ""] # Remove empty tokens
  # Profanity Filtering: drop tokens that appear in the profanity list
  filtered_tokens <- tokens[!tolower(tokens) %in% profanity_list]
  return(filtered_tokens)
}
# Define a list of profane words (example: c("badword1", "badword2"))
profanity_list <- c("shit", "idiot", "fool", "dumb", "moron", "cunt", "crap", "fuck")
# Usage example
file_path <- "E:/Sync/Coursera/Johns Hopkins Data Science Specialization/Data Science Capstone/final/en_US/en_US.twitter.txt" # Replace with actual file path
clean_tokens <- tokenize_and_filter(file_path, profanity_list)
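As a quick sanity check, the cleaned token vector can be inspected directly (a minimal sketch; the actual counts and tokens depend on the corpus loaded above):
# Quick sanity check on the cleaned tokens (illustrative; output depends on the corpus)
length(clean_tokens)   # total number of tokens after filtering
head(clean_tokens, 10) # first few tokens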
# Function to Perform Exploratory Analysis
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
explore_data <- function(tokens) {
  # Frequency of words
  word_freq <- table(tokens)
  word_freq_df <- as.data.frame(word_freq, stringsAsFactors = FALSE)
  colnames(word_freq_df) <- c("Word", "Frequency")
  word_freq_df <- word_freq_df %>% arrange(desc(Frequency))
  # Plotting Word Frequency (print() is needed for the plot to render inside a function)
  print(
    ggplot(head(word_freq_df, 20), aes(x = reorder(Word, Frequency), y = Frequency)) +
      geom_bar(stat = "identity") +
      coord_flip() +
      labs(title = "Top 20 Word Frequencies", x = "Words", y = "Frequency")
  )
  # 2-grams: pair each token with the token that follows it
  bigrams <- paste(tokens[-length(tokens)], tokens[-1], sep = " ")
  bigram_freq <- sort(table(bigrams), decreasing = TRUE)
  # 3-grams: slide a window of three consecutive tokens
  trigram_indices <- 1:(length(tokens) - 2)
  trigrams <- paste(tokens[trigram_indices], tokens[trigram_indices + 1], tokens[trigram_indices + 2], sep = " ")
  trigram_freq <- sort(table(trigrams), decreasing = TRUE)
  list(word_freq = word_freq_df, bigram_freq = bigram_freq, trigram_freq = trigram_freq)
}
# Usage
# Ensure clean_tokens is defined and contains the tokenized data
analysis_results <- explore_data(clean_tokens)
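The returned list can then be examined to get a first feel for common word sequences (a minimal sketch, assuming analysis_results from the call above; the actual entries depend on the corpus):
# Inspect the most frequent n-grams (illustrative; output depends on the corpus)
head(analysis_results$word_freq, 10)    # top single words
head(analysis_results$bigram_freq, 10)  # top 2-word sequences
head(analysis_results$trigram_freq, 10) # top 3-word sequences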
### N-gram Modeling
library(tidytext)
library(stringr)
create_ngrams <- function(tokens, n) {
  # Collapse the cleaned tokens back into a single text string
  tokens_df <- data.frame(text = paste(tokens, collapse = " "), stringsAsFactors = FALSE)
  # Use tidytext's n-gram tokenizer to extract overlapping n-grams of length n
  ngrams <- tokens_df %>%
    unnest_tokens(output = ngram, input = text, token = "ngrams", n = n)
  # Count and sort the n-gram frequencies
  ngram_freq <- ngrams %>%
    count(ngram, sort = TRUE)
  return(ngram_freq)
}
bigram_model <- create_ngrams(clean_tokens, 2)
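To connect the n-gram counts to the stated goal of next-word prediction, the sketch below shows one simple way the bigram table could be queried. Here predict_next_word is a hypothetical helper, not part of the current application, and assumes bigram_model as produced by create_ngrams above:
# Hypothetical helper: return the most frequent bigram completion of a given word
# (a minimal sketch under the assumptions above, not the final prediction algorithm)
predict_next_word <- function(previous_word, bigram_model) {
  # Keep only bigrams that start with the given word (n-grams are lowercased)
  candidates <- bigram_model[startsWith(bigram_model$ngram, paste0(tolower(previous_word), " ")), ]
  if (nrow(candidates) == 0) return(NA_character_) # no match found
  # bigram_model is already sorted by count, so the first row is the best guess
  strsplit(candidates$ngram[1], " ")[[1]][2]
}
# Example usage (result depends on the corpus)
predict_next_word("thanks", bigram_model)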
The exploratory analysis provided valuable insights into the structure and frequency of English text. Future work includes: