Notice

Because my computer cannot open the complete txt file, I used only the first 755 lines of en_US.twitter (52,159 characters in total) to build the model:
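
To produce the excerpt, the first 755 lines can be read once and saved to a small file. A minimal sketch of that step, assuming the original en_US.twitter.txt sits in the working directory and the excerpt is saved as the data.txt read below:

con <- file("en_US.twitter.txt", "r")
sample_lines <- readLines(con, n = 755)   # read only the first 755 lines
close(con)
sum(nchar(sample_lines))                  # character count of the excerpt
writeLines(sample_lines, "data.txt")      # subsample used by the code below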

# Load necessary libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer
# Read the data file
data <- readLines("data.txt")
## Warning in readLines("data.txt"): incomplete final line found on 'data.txt'
# Tokenization function
tokenize <- function(text) {
  tokens <- unlist(strsplit(text, "\\W+"))
  tokens <- tolower(tokens)
  return(tokens)
}

# Function to remove profanity and unwanted words
remove_unwanted <- function(tokens) {
  clean_tokens <- tokens[!tokens %in% c("damn", "sh*t", "f*ck", "b*tch")]
  return(clean_tokens)
}
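
As a quick check, the two helpers can be run on a made-up sentence (a hypothetical example, not a line from the corpus). Note that because the tokenizer splits on \W+, the starred spellings in the filter list can never occur as tokens; only exact lowercase words such as "damn" are removed:

# Hypothetical sentence, used only to illustrate tokenization and filtering
remove_unwanted(tokenize("Damn, the WiFi is down again!"))
## [1] "the"   "wifi"  "is"    "down"  "again"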

# Apply tokenization and profanity removal
tokenized_data <- lapply(data, tokenize)
clean_data <- lapply(tokenized_data, remove_unwanted)

# Basic summary
num_words <- sum(lengths(clean_data))
num_lines <- length(data)
basic_summary <- data.frame(Num_Words = num_words, Num_Lines = num_lines)

# Plot a bar chart of the top 10 word frequencies
word_freq <- table(unlist(clean_data))
word_freq_df <- data.frame(Word = names(word_freq), Frequency = as.numeric(word_freq))
word_freq_df <- word_freq_df[order(word_freq_df$Frequency, decreasing = TRUE), ]
top_words <- head(word_freq_df, 10)

ggplot(top_words, aes(x = reorder(Word, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  theme_minimal() +
  labs(x = "Word", y = "Frequency", title = "Top 10 Most Frequent Words")

# Print basic summary
basic_summary
##   Num_Words Num_Lines
## 1      9787       755
# Flatten the list of tokens into a single vector
all_tokens <- unlist(clean_data)

# Create a corpus and term-document matrix
corpus <- Corpus(VectorSource(all_tokens))
tdm <- TermDocumentMatrix(corpus)
m <- as.matrix(tdm)
word_freqs <- sort(rowSums(m), decreasing = TRUE)
word_freqs_df <- data.frame(word = names(word_freqs), freq = word_freqs)

# Plot the word cloud
set.seed(1234)
wordcloud(words = word_freqs_df$word, freq = word_freqs_df$freq, min.freq = 1,
          max.words = 100, random.order = FALSE, colors = brewer.pal(8, "Dark2"))
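
Building a Corpus in which every single token is its own document works, but it becomes slow as the sample grows. Because word_freq computed above already holds essentially the same counts (TermDocumentMatrix additionally drops terms shorter than three characters by default), a lighter sketch for preparing the word-cloud input is:

# Reuse the earlier frequency table instead of a one-token-per-document TDM
word_freqs_df <- data.frame(word = names(word_freq), freq = as.numeric(word_freq))
word_freqs_df <- word_freqs_df[order(word_freqs_df$freq, decreasing = TRUE), ]
wordcloud(words = word_freqs_df$word, freq = word_freqs_df$freq, min.freq = 1,
          max.words = 100, random.order = FALSE, colors = brewer.pal(8, "Dark2"))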

Because my computer cannot open the complete txt file, I used only the first 1,073 lines of en_US.news (212,692 characters in total) to build the model:

# Load necessary libraries
library(tidyverse)
library(tm)
library(wordcloud)
# Read the data file
data <- readLines("data2.txt")
## Warning in readLines("data2.txt"): incomplete final line found on 'data2.txt'
# Tokenization function
tokenize <- function(text) {
  tokens <- unlist(strsplit(text, "\\W+"))
  tokens <- tolower(tokens)
  return(tokens)
}

# Function to remove profanity and unwanted words
remove_unwanted <- function(tokens) {
  clean_tokens <- tokens[!tokens %in% c("damn", "sh*t", "f*ck", "b*tch")]
  return(clean_tokens)
}

# Apply tokenization and profanity removal
tokenized_data <- lapply(data, tokenize)
clean_data <- lapply(tokenized_data, remove_unwanted)

# Basic summary
num_words <- sum(lengths(clean_data))
num_lines <- length(data)
basic_summary <- data.frame(Num_Words = num_words, Num_Lines = num_lines)

# Plot a bar chart of the top 10 word frequencies
word_freq <- table(unlist(clean_data))
word_freq_df <- data.frame(Word = names(word_freq), Frequency = as.numeric(word_freq))
word_freq_df <- word_freq_df[order(word_freq_df$Frequency, decreasing = TRUE), ]
top_words <- head(word_freq_df, 10)

ggplot(top_words, aes(x = reorder(Word, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  theme_minimal() +
  labs(x = "Word", y = "Frequency", title = "Top 10 Most Frequent Words")

# Print basic summary
basic_summary
##   Num_Words Num_Lines
## 1     37389      1073
# Flatten the list of tokens into a single vector
all_tokens <- unlist(clean_data)

# Create a corpus and term-document matrix
corpus <- Corpus(VectorSource(all_tokens))
tdm <- TermDocumentMatrix(corpus)
m <- as.matrix(tdm)
word_freqs <- sort(rowSums(m), decreasing = TRUE)
word_freqs_df <- data.frame(word = names(word_freqs), freq = word_freqs)

# Plot the word cloud
set.seed(1234)
wordcloud(words = word_freqs_df$word, freq = word_freqs_df$freq, min.freq = 1,
          max.words = 100, random.order = FALSE, colors = brewer.pal(8, "Dark2"))

Because my computer cannot open the complete txt file, I used only the first 1,216 lines of en_US.blog (282,416 characters in total) to build the model:

# Load necessary libraries
library(tidyverse)
library(tm)
library(wordcloud)
# Read the data file
data <- readLines("data3.txt")
## Warning in readLines("data3.txt"): incomplete final line found on 'data3.txt'
# Tokenization function
tokenize <- function(text) {
  tokens <- unlist(strsplit(text, "\\W+"))
  tokens <- tolower(tokens)
  return(tokens)
}

# Function to remove profanity and unwanted words
remove_unwanted <- function(tokens) {
  clean_tokens <- tokens[!tokens %in% c("damn", "sh*t", "f*ck", "b*tch")]
  return(clean_tokens)
}

# Apply tokenization and profanity removal
tokenized_data <- lapply(data, tokenize)
clean_data <- lapply(tokenized_data, remove_unwanted)

# Basic summary
num_words <- sum(lengths(clean_data))
num_lines <- length(data)
basic_summary <- data.frame(Num_Words = num_words, Num_Lines = num_lines)

# Plot a bar chart of the top 10 word frequencies
word_freq <- table(unlist(clean_data))
word_freq_df <- data.frame(Word = names(word_freq), Frequency = as.numeric(word_freq))
word_freq_df <- word_freq_df[order(word_freq_df$Frequency, decreasing = TRUE), ]
top_words <- head(word_freq_df, 10)

ggplot(top_words, aes(x = reorder(Word, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  theme_minimal() +
  labs(x = "Word", y = "Frequency", title = "Top 10 Most Frequent Words")

# Print basic summary
basic_summary
##   Num_Words Num_Lines
## 1     52519      1216
# Flatten the list of tokens into a single vector
all_tokens <- unlist(clean_data)