Text Data Cleaning and Analysis

Load Required Libraries

# Attach a package, installing it first when it is not yet available.
#
# pkg: name of the package, given as a single character string.
#
# Returns NULL invisibly when the package could be attached straight away;
# otherwise returns the value of library() after the fresh install.
install_if_missing <- function(pkg) {
  # require() attaches the package and reports success as TRUE / FALSE
  is_attached <- require(pkg, character.only = TRUE)
  if (is_attached) {
    return(invisible(NULL))
  }

  install.packages(pkg, dependencies = TRUE)
  library(pkg, character.only = TRUE)
}

# Packages required by the analysis
packages <- c("tm", "SnowballC", "wordcloud", "RColorBrewer", "RWeka")

# Install (when missing) and attach every package. The calls are made only
# for their side effects, so the list of return values is wrapped in
# invisible() to keep it from being auto-printed as a run of NULL entries.
invisible(lapply(packages, install_if_missing))

## Loading required package: tm

## Loading required package: NLP

## Loading required package: SnowballC

## Loading required package: wordcloud

## Loading required package: RColorBrewer

## Warning: package 'RColorBrewer' was built under R version 4.4.1

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL

# Load necessary packages
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(RWeka)

1. Download and Read Data

1.1 Download Data

# Project directory that holds the downloaded archive and the extracted files
capstone_dir <- "~/capstone_1/"

if (!dir.exists(capstone_dir)) {
  dir.create(capstone_dir)
}

# NOTE(review): setwd() changes global session state; it is kept because the
# steps below address the archive and the data files by relative path.
setwd(capstone_dir)

# Define file URL
file_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

# Download the archive only when it is not already on disk. The default
# download method is used (no external curl binary required) and the file is
# written in binary mode so the zip is not corrupted on any platform.
if (!file.exists("Coursera.zip")) {
  download.file(file_url, destfile = "Coursera.zip", mode = "wb")
}

# Extract independently of the download: if the archive is present but the
# data files are not (e.g. an earlier run was interrupted, or the extracted
# folder was deleted), the files are still restored.
data_files <- file.path(
  "final", "en_US",
  c("en_US.twitter.txt", "en_US.blogs.txt", "en_US.news.txt")
)
if (!all(file.exists(data_files))) {
  unzip("Coursera.zip")
}

1.2 Read Data

# Read data files
# Twitter and blogs: read as UTF-8 text; skipNul = TRUE skips embedded NUL
# bytes instead of warning about them.
twitter <- readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
blogs <- readLines("final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)

# News: read through a binary-mode connection to handle special characters.
# The connection is closed in `finally`, so it is released even when
# readLines() fails part-way through. skipNul = TRUE matches the other reads.
news_con <- file("final/en_US/en_US.news.txt", open = "rb")
news <- tryCatch(
  readLines(news_con, encoding = "UTF-8", skipNul = TRUE),
  finally = close(news_con)
)

2. Summary Statistics and Data Sampling

2.1 Sample Data

# Set seed for reproducibility
set.seed(5454568)

# Randomly sample up to 1,000 lines from each dataset. Sampling the vector
# directly avoids the 1:length(x) index idiom, and capping the size at the
# number of available lines keeps sample() from failing on a short input.
sample_size <- 1000
sampleTwitter <- sample(twitter, min(sample_size, length(twitter)))
sampleBlogs <- sample(blogs, min(sample_size, length(blogs)))
sampleNews <- sample(news, min(sample_size, length(news)))

# Combine the sampled data
sampleData <- c(sampleTwitter, sampleBlogs, sampleNews)

# Save sampled data and free up memory.
# useBytes = TRUE writes the strings byte-for-byte instead of re-encoding
# them to the session's native encoding, so the file stays UTF-8 and can be
# read back with encoding = "UTF-8".
writeLines(sampleData, "sampleData.txt", useBytes = TRUE)
rm(twitter, news, blogs, sampleTwitter, sampleNews, sampleBlogs)

2.2 Read and Clean Sample Data

# Read the sampled data
sampleData <- readLines("sampleData.txt", encoding = "UTF-8")

# Create a text corpus with one document per sampled line
corpus <- VCorpus(VectorSource(sampleData))

# Content transformer that replaces every match of `pattern` with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Apply cleaning transformations to the corpus.
# Order matters: stop words are removed right after lower-casing, while the
# text still contains apostrophes and unstemmed word forms. Stripping
# punctuation or stemming first turns "don't" into "dont" and "this" into
# "thi", which no longer match the stop-word list and survive the filter.
corpus <- tm_map(corpus, toSpace, "\"|/|@|\\|")
corpus <- tm_map(corpus, toSpace, "[^[:graph:]]")
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stemDocument)
# NOTE(review): this re-wrap looks like a no-op for a VCorpus built from a
# VectorSource; retained from the original pipeline - confirm before removing.
corpus <- tm_map(corpus, PlainTextDocument)
# Collapse whitespace last so the gaps left by the removals above are
# squeezed too.
corpus <- tm_map(corpus, stripWhitespace)

3. Interesting Findings

# Convert corpus to a data frame with one row of cleaned text per document.
# lapply() + unlist() is used instead of sapply(), whose return type depends
# on the shape of its input.
corpus_text <- unlist(lapply(corpus, "[", "content"), use.names = FALSE)
corpus.dataframe <- data.frame(text = corpus_text, stringsAsFactors = FALSE)

# Count the n-grams of size `n` found in a character vector of documents.
# Returns a data frame with one row per distinct n-gram and its frequency in
# the `Freq` column.
count_ngrams <- function(text, n) {
  data.frame(table(NGramTokenizer(text, Weka_control(min = n, max = n))))
}

# Generate unigrams, bigrams, and trigrams.
# The tokenizer is given the character column rather than the data frame:
# a data frame would be coerced into a single deparsed string, producing
# n-grams that run across document boundaries plus artefact tokens from the
# deparsed `c(...)` wrapper.
uniGramToken <- count_ngrams(corpus.dataframe$text, 1)
biGramToken <- count_ngrams(corpus.dataframe$text, 2)
triGramToken <- count_ngrams(corpus.dataframe$text, 3)

# Order tokens by frequency, most frequent first
unigram <- uniGramToken[order(uniGramToken$Freq, decreasing = TRUE), ]
bigram  <- biGramToken[order(biGramToken$Freq, decreasing = TRUE), ]
trigram <- triGramToken[order(triGramToken$Freq, decreasing = TRUE), ]

# Plot most commonly used words (unigrams)
# head() keeps at most 30 rows; unlike [1:30, ] it does not pad the selection
# with NA rows when fewer than 30 distinct unigrams exist.
top_unigrams <- head(unigram, 30)
par(mfrow = c(1, 1), mar = c(5, 4, 2, 0))
barplot(top_unigrams[, 2],
        names.arg = top_unigrams[, 1],
        col = "red",
        main = "Most commonly used Words (Top 30)",
        las = 2,  # axis labels perpendicular to the axis
        ylab = "Frequency")

plot of chunk unnamed-chunk-6

# Plot most commonly used two-word combinations (bigrams)
# head() keeps at most 30 rows; unlike [1:30, ] it does not pad the selection
# with NA rows when fewer than 30 distinct bigrams exist.
top_bigrams <- head(bigram, 30)
# Larger bottom margin leaves room for the rotated two-word labels
par(mar = c(8.5, 4, 2, 1))
barplot(top_bigrams[, 2],
        names.arg = top_bigrams[, 1],
        col = "blue",
        main = "Most commonly used two word combinations (Top 30)",
        las = 2,  # axis labels perpendicular to the axis
        ylab = "Frequency")

plot of chunk unnamed-chunk-6

# Plot most commonly used three-word combinations (trigrams)
# head() keeps at most 30 rows; unlike [1:30, ] it does not pad the selection
# with NA rows when fewer than 30 distinct trigrams exist.
top_trigrams <- head(trigram, 30)
# Larger bottom margin leaves room for the rotated three-word labels
par(mar = c(8.5, 4, 2, 1))
barplot(top_trigrams[, 2],
        names.arg = top_trigrams[, 1],
        col = "green",
        main = "Most commonly used three word combinations (Top 30)",
        las = 2,  # axis labels perpendicular to the axis
        ylab = "Frequency")

4. Next Steps

Prediction Algorithm: Develop a predictive text model based on n-gram frequency analysis.
Shiny App: Build an app that provides an interactive interface for text prediction and visualization.
Feedback: Seek input on optimizing the cleaning process and improving predictive accuracy.

Text Data Cleaning and Analysis

Belmouidi Mohamed

2025-01-02

Load Required Libraries

1. Download and Read Data

1.1 Download Data

1.2 Read Data

2. Summary Statistics and Data Sampling

2.1 Sample Data

2.2 Read and Clean Sample Data

3. Interesting Findings

4. Next Steps