Abstract

This is the Milestone Report for the Coursera Data Science Capstone project. In this report, I conduct an exploratory analysis of a cleaned sample of the data.

Loading and Unzipping Data

First, we will unzip the folder and load the data.

# Extract the downloaded archive, skipping the step when the "final" folder
# is already on disk
final_dir <- "~/Coding Projects/John hopkins Class Project/final"
if (!file.exists(final_dir)) {
  unzip(
    zipfile = "~/Coding Projects/John hopkins Class Project/Coursera-SwiftKey.zip",
    exdir = "~/Coding Projects/John hopkins Class Project"
  )
}

Next, because this is a large dataset, we will read, line by line, only the data we need. We will focus on the English (en_US) data, which consists of news, blog, and Twitter text.

# Folder holding the English (en_US) corpus files, and a listing of its contents
path <- file.path("~/Coding Projects/John hopkins Class Project/final", "en_US")
files <- list.files(path, recursive = TRUE)

# Read every line of one corpus file inside `path` as UTF-8 text.
#
# file_name: name of a file inside `path`.
# Returns a character vector with one element per line.
#
# The connection is opened in binary mode ("rb") rather than text mode ("r"):
# on Windows, text-mode reads are reported to stop early at an embedded
# SUB (Ctrl-Z, 0x1A) character, which would silently truncate the news file.
# readLines() accepts LF, CRLF and CR line endings in either mode.
# encoding = "UTF-8" marks the strings as UTF-8 so non-ASCII characters
# (curly quotes, accents, emoji) survive later string processing.
read_corpus <- function(file_name) {
  con <- file(file.path(path, file_name), open = "rb")
  # Close the connection even when readLines() fails
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

# Twitter, blog and news data sets
Twitter <- read_corpus("en_US.twitter.txt")
Blogs <- read_corpus("en_US.blogs.txt")
News <- read_corpus("en_US.news.txt")

Cleaning the Data

Next, we will clean a sample of the text datasets as a demonstration, since the full datasets are very large.

library(tm)
## Warning: package 'tm' was built under R version 4.3.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 4.3.3
library(stringr)
## Warning: package 'stringr' was built under R version 4.3.3
library(textclean)
## Warning: package 'textclean' was built under R version 4.3.3
# Fix the RNG seed so the same lines are drawn on every run
set.seed(123)

# Draw 1000 lines (without replacement) from each source, in the order
# Twitter -> Blogs -> News
twitter_sample <- Twitter[sample.int(length(Twitter), size = 1000)]
blogs_sample <- Blogs[sample.int(length(Blogs), size = 1000)]
news_sample <- News[sample.int(length(News), size = 1000)]

# Stack the three samples into a single character vector
text_sample <- c(twitter_sample, blogs_sample, news_sample)

#' Normalise raw text lines for n-gram analysis
#'
#' Strips URLs, expands contractions, spells out symbols, lowercases, keeps
#' only the letters a-z and whitespace, removes English stopwords, and
#' finally collapses and trims the whitespace that the earlier steps leave
#' behind.
#'
#' NOTE(review): any sample output or frequency table rendered before this
#' step order was introduced will differ slightly; re-knit the report to
#' refresh it.
#'
#' @param text A character vector of raw text lines.
#' @return A character vector of cleaned lines. Lines that end up empty are
#'   dropped, so the result can be shorter than `text`.
clean_text <- function(text) {
  # Curly apostrophes -> straight ones, so contractions typed with them
  # (common in blog/news text) can be expanded below
  text <- str_replace_all(text, "[\u2018\u2019]", "'")
  # Remove URLs BEFORE symbols are spelled out; otherwise "&", "#", "%" inside
  # a URL get replaced by words, splitting the URL and leaving fragments behind
  text <- str_replace_all(text, "(?i)http\\S+\\s*", "")
  text <- replace_contraction(text)            # e.g., don't -> do not
  text <- replace_symbol(text)                 # e.g., & -> and
  # Lowercase after the replacements, which may reintroduce capital letters
  text <- tolower(text)
  text <- str_replace_all(text, "[^a-z\\s]", "")   # Drop punctuation and numbers
  text <- removeWords(text, stopwords("en"))   # Remove common stopwords
  # Collapse whitespace last: removing stopwords leaves runs of spaces and
  # leading/trailing blanks, and whitespace-only lines must become "" so that
  # the empty-line filter below catches them
  text <- trimws(stripWhitespace(text))
  text[nzchar(text)]
}

# Run the cleaning pipeline over the combined sample
cleaned_text <- clean_text(text = text_sample)
# Preview the first ten cleaned lines
head(cleaned_text, n = 10)
##  [1] "just wanted  thank   ask  got  started   mission"                                                             
##  [2] "right   thought   done  ran  sugar   last dessert"                                                            
##  [3] " tell ion gaf   test  tolerance"                                                                              
##  [4] "mayfly wish    "                                                                                              
##  [5] "follow  tho   can dm"                                                                                         
##  [6] "sorry  earlier   asleep   going  set  screen"                                                                 
##  [7] " hope  let    day   work tomorrow"                                                                            
##  [8] "good night ap test tomorrow "                                                                                 
##  [9] "thanks   follow"                                                                                              
## [10] "md house ways  means committee approves bill   authorize statewide referendum  adding table games   pg casino"

Exploratory Analysis

Next, we will conduct some exploratory analysis to get an idea of the data we are working with.

library(quanteda) 
library(quanteda.textstats)
library(tidytext)     
library(dplyr)        
library(ggplot2)    

# Split the cleaned text into word tokens (dropping punctuation and numbers),
# lowercase them, and remove English stopwords
tokens <- cleaned_text |>
  tokens(remove_punct = TRUE, remove_numbers = TRUE) |>
  tokens_tolower() |>
  tokens_remove(stopwords("en"))
# Unigram (single word) frequencies

# Document-feature matrix of the word tokens, then per-word frequency counts
dfm_uni <- tokens |> dfm()
freq_uni <- dfm_uni |> textstat_frequency()

# Show the first 20 rows of the frequency table (the most common words)
freq_uni |> head(20)
##    feature frequency rank docfreq group
## 1     said       288    1     265   all
## 2      can       265    2     231   all
## 3      one       239    3     204   all
## 4     like       213    4     190   all
## 5     just       191    5     174   all
## 6     time       175    6     160   all
## 7      get       154    7     141   all
## 8   number       152    8     129   all
## 9       us       145    9     122   all
## 10     new       137   10     128   all
## 11     now       137   10     122   all
## 12    also       134   12     117   all
## 13   first       133   13     120   all
## 14    good       131   14     122   all
## 15  people       128   15     107   all
## 16    know       128   15     109   all
## 17   going       124   17     112   all
## 18  dollar       123   18      76   all
## 19     two       117   19     105   all
## 20     see       116   20      98   all
# Horizontal bar chart of the most frequent words, largest bar on top.
# head() is used instead of freq_uni[1:20, ]: indexing past the last row of a
# data frame yields all-NA rows, so a table with fewer than 20 features would
# otherwise add NA bars to the chart.
ggplot(head(freq_uni, 20), aes(x = reorder(feature, frequency), y = frequency)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Words",
       x = "Words", y = "Frequency")

# Bigram (2-gram) frequencies

# Join adjacent tokens into pairs, build the document-feature matrix, and
# count how often each pair occurs
tokens_2gram <- tokens |> tokens_ngrams(n = 2)
dfm_2gram <- tokens_2gram |> dfm()
freq_2gram <- dfm_2gram |> textstat_frequency()

# Show the first 20 rows of the frequency table (the most common bigrams)
freq_2gram |> head(20)
##           feature frequency rank docfreq group
## 1  dollar_million        22    1      17   all
## 2       last_year        16    2      16   all
## 3      first_time        14    3      14   all
## 4     high_school        13    4      13   all
## 5       right_now        13    4      13   all
## 6   united_states        13    4      12   all
## 7       feel_like        11    7      11   all
## 8     even_though        11    7      10   all
## 9        new_york        10    9      10   all
## 10         let_us        10    9      10   all
## 11      years_ago        10    9      10   all
## 12        can_see        10    9       9   all
## 13     basal_diet        10    9       1   all
## 14      last_week         9   14       9   all
## 15        g_basal         9   14       1   all
## 16  san_francisco         9   14       9   all
## 17    many_people         8   17       8   all
## 18      one_thing         8   17       8   all
## 19       can_find         8   17       8   all
## 20     last_night         8   17       8   all
# Horizontal bar chart of the most frequent bigrams, largest bar on top.
# head() is used instead of freq_2gram[1:20, ]: indexing past the last row of
# a data frame yields all-NA rows, so a table with fewer than 20 bigrams would
# otherwise add NA bars to the chart.
ggplot(head(freq_2gram, 20), aes(x = reorder(feature, frequency), y = frequency)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Bigrams",
       x = "2-grams", y = "Frequency")

# Trigram (3-gram) frequencies

# Join adjacent tokens into triples, build the document-feature matrix, and
# count how often each triple occurs
tokens_3gram <- tokens |> tokens_ngrams(n = 3)
dfm_3gram <- tokens_3gram |> dfm()
freq_3gram <- dfm_3gram |> textstat_frequency()

# Show the first 20 rows of the frequency table (the most common trigrams)
freq_3gram |> head(20)
##                    feature frequency rank docfreq group
## 1             g_basal_diet         9    1       1   all
## 2             diet_g_basal         6    2       1   all
## 3           weed_weed_weed         4    3       1   all
## 4         metal_gear_solid         4    3       1   all
## 5                let_us_go         3    5       3   all
## 6             can_wait_see         3    5       3   all
## 7              mgkg_diet_g         3    5       1   all
## 8               gkg_diet_g         3    5       1   all
## 9           time_next_year         3    5       1   all
## 10           next_year_ill         3    5       1   all
## 11 virginia_north_carolina         3    5       1   all
## 12           george_w_bush         3    5       3   all
## 13       dylan_carter_kick         3    5       1   all
## 14  president_barack_obama         3    5       3   all
## 15         best_friend_one         2   15       2   all
## 16          new_york_times         2   15       2   all
## 17             u_r_amazing         2   15       2   all
## 18           can_find_path         2   15       2   all
## 19     find_path_obstacles         2   15       2   all
## 20 path_obstacles_probably         2   15       2   all
# Horizontal bar chart of the most frequent trigrams, largest bar on top.
# head() is used instead of freq_3gram[1:20, ]: indexing past the last row of
# a data frame yields all-NA rows, so a table with fewer than 20 trigrams
# would otherwise add NA bars to the chart.
ggplot(head(freq_3gram, 20), aes(x = reorder(feature, frequency), y = frequency)) +
  geom_col(fill = "purple") +
  coord_flip() +
  labs(title = "Top 20 Most Frequent Trigrams",
       x = "3-grams", y = "Frequency")