This is the Milestone Report for the Coursera Data Science Capstone project. In this report, I conduct an exploratory analysis on a cleaned sample of the data.
First, we will unzip the downloaded archive so that the data can be loaded.
# Extract the SwiftKey archive, but only when the "final" folder is not there yet
extracted_dir <- "~/Coding Projects/John hopkins Class Project/final"
zip_archive <- "~/Coding Projects/John hopkins Class Project/Coursera-SwiftKey.zip"
extract_root <- "~/Coding Projects/John hopkins Class Project"
if (!file.exists(extracted_dir)) {
  unzip(zipfile = zip_archive, exdir = extract_root)
}
Next, because this is a large dataset, we will read in only the data we need, line by line. We will focus on the English (en_US) data, which consists of news, blog, and Twitter text.
# Location of the English (en_US) corpus files and a listing of what is there
path <- file.path("~/Coding Projects/John hopkins Class Project/final", "en_US")
files <- list.files(path, recursive = TRUE)

# Read every line of one corpus file and return them as a character vector.
#
# The connection is opened in binary mode ("rb") so that a control character
# embedded in the text (for example SUB / Ctrl-Z, which a text-mode read on
# Windows can treat as end-of-file) does not silently truncate the read.
# NOTE(review): encoding = "UTF-8" assumes the corpus files are UTF-8 - confirm.
read_corpus_lines <- function(file_path) {
  con <- file(file_path, "rb")
  # Close the connection even if readLines() fails part-way through
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

# Twitter data set
Twitter <- read_corpus_lines(file.path(path, "en_US.twitter.txt"))
# Blog data set
Blogs <- read_corpus_lines(file.path(path, "en_US.blogs.txt"))
# News data set
News <- read_corpus_lines(file.path(path, "en_US.news.txt"))
Next, we will clean a sample of the text datasets as a demonstration, since the full datasets are very large.
library(tm)
## Warning: package 'tm' was built under R version 4.3.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 4.3.3
library(stringr)
## Warning: package 'stringr' was built under R version 4.3.3
library(textclean)
## Warning: package 'textclean' was built under R version 4.3.3
# Fix the RNG state so the same lines are drawn on every run
set.seed(123)

# Draw 1000 lines (without replacement) from each source, in the order
# Twitter -> Blogs -> News
lines_per_source <- 1000
twitter_sample <- Twitter[sample.int(length(Twitter), lines_per_source)]
blogs_sample <- Blogs[sample.int(length(Blogs), lines_per_source)]
news_sample <- News[sample.int(length(News), lines_per_source)]

# Stack the three samples into a single character vector
text_sample <- c(twitter_sample, blogs_sample, news_sample)
#' Normalise raw text lines for word and n-gram counting
#'
#' Lowercases the text, removes URLs, expands contractions and symbols,
#' drops every character that is not a lowercase letter or whitespace,
#' removes English stopwords, and finally collapses and trims whitespace.
#'
#' @param text Character vector of raw text lines.
#' @return Character vector of cleaned lines. Lines that are empty after
#'   cleaning are dropped, so the result may be shorter than `text`.
clean_text <- function(text) {
  text <- tolower(text) # Lowercase first; the URL pattern below is lowercase
  # URLs are removed before symbol replacement: replace_symbol() would
  # otherwise turn "#", "&" or "%" inside a URL into separate words that the
  # URL pattern no longer matches.
  text <- str_replace_all(text, "http\\S+\\s*", "") # Remove URLs
  text <- replace_contraction(text) # Expand contractions (e.g., don't -> do not)
  text <- replace_symbol(text) # Replace symbols (e.g., & -> and)
  text <- str_replace_all(text, "[^a-z\\s]", "") # Remove punctuation and numbers
  text <- removeWords(text, stopwords("en")) # Remove common stopwords
  # Whitespace is normalised last: removeWords() leaves the surrounding
  # spaces behind, so collapsing before it would leave leading, trailing and
  # doubled spaces in the output.
  text <- trimws(stripWhitespace(text))
  text[nzchar(text)] # Drop lines that are empty after cleaning
}
# Run the cleaning pipeline over the combined sample and preview the first 10 lines
cleaned_text <- clean_text(text = text_sample)
head(x = cleaned_text, n = 10)
## [1] "just wanted thank ask got started mission"
## [2] "right thought done ran sugar last dessert"
## [3] " tell ion gaf test tolerance"
## [4] "mayfly wish "
## [5] "follow tho can dm"
## [6] "sorry earlier asleep going set screen"
## [7] " hope let day work tomorrow"
## [8] "good night ap test tomorrow "
## [9] "thanks follow"
## [10] "md house ways means committee approves bill authorize statewide referendum adding table games pg casino"
Next, we will conduct some exploratory analysis to get an idea of the data we are working with.
library(quanteda)
library(quanteda.textstats)
library(tidytext)
library(dplyr)
library(ggplot2)
# Tokenize the cleaned lines into words: drop punctuation and numbers,
# lowercase, and remove English stopwords
tokens <- cleaned_text |>
  tokens(remove_punct = TRUE, remove_numbers = TRUE) |>
  tokens_tolower() |>
  tokens_remove(stopwords("en"))

# Unigram document-feature matrix and its per-word frequency table
dfm_uni <- tokens |> dfm()
freq_uni <- dfm_uni |> textstat_frequency()

# Show the first 20 rows of the frequency table
freq_uni |> head(20)
## feature frequency rank docfreq group
## 1 said 288 1 265 all
## 2 can 265 2 231 all
## 3 one 239 3 204 all
## 4 like 213 4 190 all
## 5 just 191 5 174 all
## 6 time 175 6 160 all
## 7 get 154 7 141 all
## 8 number 152 8 129 all
## 9 us 145 9 122 all
## 10 new 137 10 128 all
## 11 now 137 10 122 all
## 12 also 134 12 117 all
## 13 first 133 13 120 all
## 14 good 131 14 122 all
## 15 people 128 15 107 all
## 16 know 128 15 109 all
## 17 going 124 17 112 all
## 18 dollar 123 18 76 all
## 19 two 117 19 105 all
## 20 see 116 20 98 all
# Horizontal bar chart of the first 20 rows of the unigram frequency table
top_unigrams <- freq_uni[1:20, ]
ggplot(
  data = top_unigrams,
  mapping = aes(x = reorder(feature, frequency), y = frequency)
) +
  geom_col(fill = "steelblue") +
  coord_flip() + # put the words on the vertical axis so labels stay readable
  labs(
    title = "Top 20 Most Frequent Words",
    x = "Words",
    y = "Frequency"
  )
# Bigram frequencies
# Note: `tokens` already had stopwords removed earlier in the script, so the
# bigrams are formed from the tokens that remain.
tokens_2gram <- tokens |> tokens_ngrams(n = 2)
dfm_2gram <- tokens_2gram |> dfm()
freq_2gram <- dfm_2gram |> textstat_frequency()

# Show the first 20 rows of the bigram frequency table
freq_2gram |> head(20)
## feature frequency rank docfreq group
## 1 dollar_million 22 1 17 all
## 2 last_year 16 2 16 all
## 3 first_time 14 3 14 all
## 4 high_school 13 4 13 all
## 5 right_now 13 4 13 all
## 6 united_states 13 4 12 all
## 7 feel_like 11 7 11 all
## 8 even_though 11 7 10 all
## 9 new_york 10 9 10 all
## 10 let_us 10 9 10 all
## 11 years_ago 10 9 10 all
## 12 can_see 10 9 9 all
## 13 basal_diet 10 9 1 all
## 14 last_week 9 14 9 all
## 15 g_basal 9 14 1 all
## 16 san_francisco 9 14 9 all
## 17 many_people 8 17 8 all
## 18 one_thing 8 17 8 all
## 19 can_find 8 17 8 all
## 20 last_night 8 17 8 all
# Horizontal bar chart of the first 20 rows of the bigram frequency table
top_bigrams <- freq_2gram[1:20, ]
ggplot(
  data = top_bigrams,
  mapping = aes(x = reorder(feature, frequency), y = frequency)
) +
  geom_col(fill = "darkgreen") +
  coord_flip() + # put the bigrams on the vertical axis so labels stay readable
  labs(
    title = "Top 20 Most Frequent Bigrams",
    x = "2-grams",
    y = "Frequency"
  )
# Trigram frequencies
# Note: `tokens` already had stopwords removed earlier in the script, so the
# trigrams are formed from the tokens that remain.
tokens_3gram <- tokens |> tokens_ngrams(n = 3)
dfm_3gram <- tokens_3gram |> dfm()
freq_3gram <- dfm_3gram |> textstat_frequency()

# Show the first 20 rows of the trigram frequency table
freq_3gram |> head(20)
## feature frequency rank docfreq group
## 1 g_basal_diet 9 1 1 all
## 2 diet_g_basal 6 2 1 all
## 3 weed_weed_weed 4 3 1 all
## 4 metal_gear_solid 4 3 1 all
## 5 let_us_go 3 5 3 all
## 6 can_wait_see 3 5 3 all
## 7 mgkg_diet_g 3 5 1 all
## 8 gkg_diet_g 3 5 1 all
## 9 time_next_year 3 5 1 all
## 10 next_year_ill 3 5 1 all
## 11 virginia_north_carolina 3 5 1 all
## 12 george_w_bush 3 5 3 all
## 13 dylan_carter_kick 3 5 1 all
## 14 president_barack_obama 3 5 3 all
## 15 best_friend_one 2 15 2 all
## 16 new_york_times 2 15 2 all
## 17 u_r_amazing 2 15 2 all
## 18 can_find_path 2 15 2 all
## 19 find_path_obstacles 2 15 2 all
## 20 path_obstacles_probably 2 15 2 all
# Horizontal bar chart of the first 20 rows of the trigram frequency table
top_trigrams <- freq_3gram[1:20, ]
ggplot(
  data = top_trigrams,
  mapping = aes(x = reorder(feature, frequency), y = frequency)
) +
  geom_col(fill = "purple") +
  coord_flip() + # put the trigrams on the vertical axis so labels stay readable
  labs(
    title = "Top 20 Most Frequent Trigrams",
    x = "3-grams",
    y = "Frequency"
  )