R Markdown

Introduction

The purpose of this milestone report is to demonstrate exploratory analysis of the text data sets (blogs, news, and Twitter). The ultimate goal is to build a word prediction algorithm and deploy it in a Shiny app.

Example summary (adjust the file paths to match where the data sets are stored)

# Open the blogs file for reading in text mode
con <- file(description = "en_US.blogs.txt", open = "r")

# Pull in just the first 20 lines for a quick look, then release the file
example_lines <- readLines(con = con, n = 20L)
close(con)

# Show the first six of the sampled lines
head(example_lines, n = 6L)
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan “gods”."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        
## [2] "We love you Mr. Brown."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
## [3] "Chad has been awesome with the kids and holding down the fort while I work later than usual! The kids have been busy together playing Skylander on the XBox together, after Kyan cashed in his $$$ from his piggy bank. He wanted that game so bad and used his gift card from his birthday he has been saving and the money to get it (he never taps into that thing either, that is how we know he wanted it so bad). We made him count all of his money to make sure that he had enough! It was very cute to watch his reaction when he realized he did! He also does a very good job of letting Lola feel like she is playing too, by letting her switch out the characters! She loves it almost as much as him."
## [4] "so anyways, i am going to share some home decor inspiration that i have been storing in my folder on the puter. i have all these amazing images stored away ready to come to life when we get our home."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             
## [5] "With graduation season right around the corner, Nancy has whipped up a fun set to help you out with not only your graduation cards and gifts, but any occasion that brings on a change in one's life. I stamped the images in Memento Tuxedo Black and cut them out with circle Nestabilities. I embossed the kraft and red cardstock with TE's new Stars Impressions Plate, which is double sided and gives you 2 fantastic patterns. You can see how to use the Impressions Plates in this tutorial Taylor created. Just one pass through your die cut machine using the Embossing Pad Kit is all you need to do - super easy!"                                                                                    
## [6] "If you have an alternative argument, let's hear it! :)"

Basic summaries

# Stream the blogs file in 10,000-line chunks, keeping only running totals
con <- file(description = "en_US.blogs.txt", open = "r")

total_lines <- 0
total_chars <- 0

lines <- readLines(con, n = 10000)
while (length(lines) > 0) {
  total_lines <- total_lines + length(lines)
  total_chars <- total_chars + sum(nchar(lines))

  # Drop the processed chunk and collect garbage before reading the next one
  rm(lines)
  gc()

  lines <- readLines(con, n = 10000)
}

close(con)

cat("Total lines:", total_lines, "\n")
## Total lines: 899288
cat("Average line length:", total_chars / total_lines, "characters\n")
## Average line length: 229.987 characters

Exploratory Analysis: Line Length Distribution

# Collect the character count of every line in the blogs file, reading the
# file in 10,000-line chunks so only one chunk of text is held at a time.
con <- file("en_US.blogs.txt", "r")

# One integer vector of lengths per chunk. They are combined once after the
# loop, so the result is not re-copied on every chunk the way repeated
# `c(line_lengths, ...)` appends would be.
chunk_lengths <- list()

repeat {
  lines <- readLines(con, n = 10000)
  if (length(lines) == 0) break
  chunk_lengths[[length(chunk_lengths) + 1L]] <- nchar(lines)
}

close(con)

# Flatten the per-chunk vectors into a single vector of line lengths
line_lengths <- unlist(chunk_lengths, use.names = FALSE)

# Plot histogram
hist(
  line_lengths,
  breaks = 50,
  main = "Line Length Distribution",
  xlab = "Characters per Line"
)

# Install data.table only when it is missing, so re-running the report does
# not reinstall the package (and does not need network access) every time.
if (!requireNamespace("data.table", quietly = TRUE)) {
  install.packages("data.table")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)

Word Frequency

# Each library() call must be its own statement: two calls on one line
# without a separator do not parse.
library(tm)
library(wordcloud)

# data.table provides the grouped counting and stringr the regex helpers
# used by word_freq_fast() below.
library(data.table)
library(stringr)

#' Count word frequencies in a text file, reading it in chunks
#'
#' Each chunk of lines is lower-cased, stripped of every character other than
#' `a`-`z` and whitespace, and split on runs of whitespace. Per-chunk counts
#' are collected and summed once after the whole file has been read.
#'
#' @param file_path Path to the text file to read.
#' @param chunk_size Number of lines to read per chunk (default 10000).
#' @return A data.table with columns `word` (character) and `freq` (integer),
#'   sorted by decreasing `freq`; ties are ordered alphabetically by `word`
#'   regardless of how many chunks the file spans.
word_freq_fast <- function(file_path, chunk_size = 10000) {
  con <- file(file_path, "r")
  # Close the connection even if reading or processing a chunk fails.
  on.exit(close(con), add = TRUE)

  # One small count table per chunk; combined after the loop instead of
  # merging into a growing table on every iteration.
  chunk_counts <- list()

  repeat {
    lines <- readLines(con, n = chunk_size)
    if (length(lines) == 0) break

    # Preprocess lines: lower-case, then drop everything except a-z and
    # whitespace (digits, punctuation and non-ASCII letters are removed).
    lines <- tolower(lines)
    lines <- str_replace_all(lines, "[^a-z\\s]", "")

    # Split into words; empty strings come from leading/trailing whitespace
    # and from lines that are empty after cleaning.
    words <- unlist(str_split(lines, "\\s+"))
    words <- words[words != ""]

    # Count frequencies for this chunk
    chunk_dt <- data.table(word = words)
    chunk_counts[[length(chunk_counts) + 1L]] <-
      chunk_dt[, .(freq = .N), by = word]
  }

  # Empty file: return an empty table with the expected columns.
  if (length(chunk_counts) == 0) {
    return(data.table(word = character(), freq = integer()))
  }

  # Sum the per-chunk counts for each word in a single grouped pass.
  freq_dt <- rbindlist(chunk_counts)[, .(freq = sum(freq)), by = word]

  # Sort by frequency; break ties by word so the order is deterministic.
  setorder(freq_dt, -freq, word)
  freq_dt
}

# Example run: build the frequency table for the blogs file and show its
# first 20 rows (the table is sorted by decreasing frequency)
blogs_freq <- word_freq_fast(file_path = "en_US.blogs.txt")
head(blogs_freq, n = 20L)
##       word    freq
##     <char>   <int>
##  1:    the 1855771
##  2:    and 1086110
##  3:     to 1065698
##  4:      a  896947
##  5:     of  875028
##  6:      i  769495
##  7:     in  593633
##  8:   that  459500
##  9:     is  431834
## 10:     it  400905
## 11:    for  362867
## 12:    you  296855
## 13:   with  286177
## 14:    was  278002
## 15:     on  274047
## 16:     my  270181
## 17:   this  257977
## 18:     as  223359
## 19:   have  218541
## 20:     be  208303
##       word    freq

========================

N-GRAM ANALYSIS

========================

Load required packages

library(stringr)
library(data.table)