The dataset is fairly large for a course project. For computational efficiency, we randomly sample 10% of the lines from each of the three source files (twitter, blogs, and news).
# Set a seed for reproducibility
set.seed(666)

# Calculate 10% of each dataset's length
sample_size_tweets <- round(length(twitter) * 0.10)
sample_size_blogs  <- round(length(blogs) * 0.10)
sample_size_news   <- round(length(news) * 0.10)

# Randomly sample 10% of each dataset
sampled_tweets <- sample(twitter, size = sample_size_tweets, replace = FALSE)
sampled_blogs  <- sample(blogs, size = sample_size_blogs, replace = FALSE)
sampled_news   <- sample(news, size = sample_size_news, replace = FALSE)

# Combine the sampled datasets into one object
sampled_data <- c(sampled_tweets, sampled_blogs, sampled_news)

# Check the result
length(sampled_data)
[1] 426968
# Show a few lines
head(sampled_data)
[1] "\"Happy Mother's Day\""
[2] "Love can't always go the way you want it to."
[3] "merry xmas everyone"
[4] "I just have the best friends EVER"
[5] "When your work speaks for itself, don't interrupt. ~ Henry J. Kaiser"
[6] "Looking for Great Holiday Program Staff in NY, San Fran, Orlando, Miami, Atlanta, Seattle, Boston and Chicago! email ASAP"
We further split the sampled data into an 80% training set, a 10% validation set, and a 10% held-out test set.
# Set a seed for reproducibility
set.seed(666)

# Shuffle the data
sampled_data <- sample(sampled_data)

# Calculate indices for splitting
total_length <- length(sampled_data)
train_end <- round(total_length * 0.80)
valid_end <- train_end + round(total_length * 0.10)

# Split into training, validation, and testing
train_data <- sampled_data[1:train_end]
valid_data <- sampled_data[(train_end + 1):valid_end]
test_data  <- sampled_data[(valid_end + 1):total_length]

# Check the lengths of each split
length(train_data) # Should be around 80% of the original length
[1] 341574
length(valid_data) # Should be around 10% of the original length
[1] 42697
length(test_data) # Should be around 10% of the original length
[1] 42697
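As an optional sanity check (a minimal sketch, not strictly needed), we can confirm that the three splits add back up to the full sample with nothing dropped or duplicated:

# The three splits should partition the sample exactly
stopifnot(length(train_data) + length(valid_data) + length(test_data) == total_length)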
We clean all three sets (training, validation, and test) in the same way.
p_load(tm)
p_load(stringr)

clean_text_data <- function(input_data, banned_words) {
  # Ensure inputs are character vectors
  if (!is.character(input_data)) {
    stop("input_data must be a character vector containing the text data.")
  }
  if (!is.character(banned_words)) {
    stop("banned_words must be a character vector containing the words to remove.")
  }

  # Create a text corpus from input data
  corpus <- Corpus(VectorSource(input_data))

  # Custom content transformer that combines multiple steps
  clean_corpus <- content_transformer(function(text) {
    text <- iconv(text, to = "UTF-8", sub = "byte")
    text <- tolower(text)
    text <- gsub("https?://\\S+", " ", text)
    text <- removePunctuation(text,
                              preserve_intra_word_dashes = TRUE,
                              preserve_intra_word_contractions = TRUE)
    text <- removeNumbers(text)
    text <- stripWhitespace(text)
    text <- removeWords(text, stopwords("english"))
    text <- removeWords(text, banned_words)
    text <- gsub("\\b[a-z]\\b", " ", text)
    text <- gsub("[[:space:]]+", " ", text)
    text <- gsub("\\W+", " ", text)          # Replace non-words with space
    text <- gsub("\\b\\w{1,2}\\b", "", text) # Remove short words
    text <- gsub('"', '', text)
    text <- gsub("^\\s+|\\s+$", "", text)    # Trim whitespace
    text <- stemDocument(text)
    return(text)
  })

  # Apply the custom transformation to the corpus
  corpus <- tm_map(corpus, clean_corpus)

  # Convert corpus to character vector
  cleaned_text <- sapply(corpus, as.character)
  return(cleaned_text)
}

# Clean the training data
train_data_clean <- clean_text_data(train_data, badwords)
Warning in tm_map.SimpleCorpus(corpus, clean_corpus): transformation drops
documents
Warning in tm_map.SimpleCorpus(corpus, clean_corpus): transformation drops
documents
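The validation and test sets are cleaned with the same function; the calls mirror the training step above (assuming the same badwords list):

valid_data_clean <- clean_text_data(valid_data, badwords)
test_data_clean  <- clean_text_data(test_data, badwords)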
# Check cleaned data
head(train_data_clean)
[1] "suntrup sack four time threw one intercept"
[2] "love short link awesm"
[3] "continu later second quarter fumbl thoma morstead yard punt giant yard line crisi avert amukamara pounc ball giant yard line"
[4] "can follow plz"
[5] "nope easi prove know fact small talk prove critic thinker"
[6] "record session may stop lovin music lilli tonight come get tast"
head(valid_data_clean)
[1] "even know ugli life easier"
[2] "back englemann talk masteri spiral frame classroom environ say danger teacher focus student master part know subject lipe book spoke statist say heard mom long ago usa parent concern teacher care student china concern teacher know master teach return englemann paper read critic school teacher give student test result poor alway put blame student tri test learn disord claim student learn taught sharpli point fact find provoc true student fail teacher fail talk teacher present lesson sweden state easi teach masteri teacher just wast opportun deliv great lesson ask question"
[3] "hanley"
[4] "good morn shopper merchant breakfast"
[5] "klinenberg touch deep intang emot issu share regardless live find partner write enough solv social pain loneli fundament part human experi also note live alon choic circumst poor ill mani case elder solitud can crush pain burden bring none posit aspect singl life"
[6] "find beepur check walmart local honey"
head(test_data_clean)
[1] "moon"
[2] "serious high yearbook"
[3] "interest thing time complet contradict fact live pari made individu differ anyon els continu argentina buy shirt shoe trouser everybodi els pari flea market buy dress one els"
[4] "man never get old"
[5] "will now ship finland will open"
[6] "hold back think"
We tokenize all three sets in preparation for building n-grams.
# Load required packages
p_load(quanteda)
p_load(quanteda.textstats)

generate_ngrams <- function(Corpus, N) {
  tokens <- tokens(Corpus)
  ngrams <- tokens_ngrams(tokens, n = N, concatenator = " ", skip = 0)
  # Filter out n-grams with repeated words
  ngrams <- tokens_select(ngrams,
                          pattern = "^(?!.*(\\b\\w+\\b).*\\1).*$",
                          valuetype = "regex",
                          case_insensitive = TRUE)
  dfm <- dfm(ngrams)
  return(dfm)
}

# Generate unigrams, bigrams, trigrams, and 4-grams for each dataset
train_unigrams  <- generate_ngrams(train_data_clean, 1)
train_bigrams   <- generate_ngrams(train_data_clean, 2)
train_trigrams  <- generate_ngrams(train_data_clean, 3)
train_fourgrams <- generate_ngrams(train_data_clean, 4)

valid_unigrams  <- generate_ngrams(valid_data_clean, 1)
valid_bigrams   <- generate_ngrams(valid_data_clean, 2)
valid_trigrams  <- generate_ngrams(valid_data_clean, 3)
valid_fourgrams <- generate_ngrams(valid_data_clean, 4)

test_unigrams  <- generate_ngrams(test_data_clean, 1)
test_bigrams   <- generate_ngrams(test_data_clean, 2)
test_trigrams  <- generate_ngrams(test_data_clean, 3)
test_fourgrams <- generate_ngrams(test_data_clean, 4)
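As an optional sketch, quanteda's nfeat() can be used to see how many distinct n-grams each training dfm contains, which gives a feel for how quickly the vocabulary grows with n:

# Number of distinct n-grams in each training dfm
sapply(list(unigrams  = train_unigrams,
            bigrams   = train_bigrams,
            trigrams  = train_trigrams,
            fourgrams = train_fourgrams), nfeat)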
Exploratory Data Analysis
Let's take a look at the top 10 features for each n-gram order in the training set.
# Function to count the top n most common n-grams
p_load(quanteda.textstats)
p_load(dplyr)

count_top_ngrams <- function(tokens_df, top_n = 10) {
  tokens_df %>%
    textstat_frequency() %>%
    head(top_n)
}

# Display the top 10 most common unigrams, bigrams, trigrams, and 4-grams for training data
top_train_unigrams  <- count_top_ngrams(train_unigrams, top_n = 10)
top_train_bigrams   <- count_top_ngrams(train_bigrams, top_n = 10)
top_train_trigrams  <- count_top_ngrams(train_trigrams, top_n = 10)
top_train_fourgrams <- count_top_ngrams(train_fourgrams, top_n = 10)

# Print the top results
list(Top_Train_Unigrams  = top_train_unigrams,
     Top_Train_Bigrams   = top_train_bigrams,
     Top_Train_Trigrams  = top_train_trigrams,
     Top_Train_Fourgrams = top_train_fourgrams)
$Top_Train_Unigrams
feature frequency rank docfreq group
1 will 25766 1 21281 all
2 one 25370 2 22169 all
3 get 24359 3 22020 all
4 like 24234 4 21590 all
5 said 24183 5 22007 all
6 just 24123 6 22337 all
7 time 21453 7 19069 all
8 can 20826 8 18325 all
9 year 19354 9 16691 all
10 day 18357 10 16326 all
$Top_Train_Bigrams
feature frequency rank docfreq group
1 right now 1976 1 1935 all
2 year old 1975 2 1851 all
3 last year 1874 3 1814 all
4 look like 1673 4 1637 all
5 new york 1662 5 1547 all
6 feel like 1392 6 1341 all
7 year ago 1370 7 1345 all
8 last night 1336 8 1311 all
9 look forward 1235 9 1224 all
10 high school 1230 10 1118 all
$Top_Train_Trigrams
feature frequency rank docfreq group
1 happi mother day 271 1 268 all
2 new york citi 202 2 193 all
3 happi new year 176 3 176 all
4 presid barack obama 165 4 165 all
5 look forward see 133 5 132 all
6 two year ago 117 6 117 all
7 new york time 114 7 114 all
8 new year eve 75 8 70 all
9 dream come true 72 9 72 all
10 five year ago 69 10 68 all
$Top_Train_Fourgrams
feature frequency rank docfreq group
1 happi mother day mom 32 1 32 all
2 thank follow look forward 31 2 31 all
3 dow jone industri averag 27 3 27 all
4 new york stock exchang 27 3 26 all
5 rock roll hall fame 24 5 23 all
6 happi new year everyon 24 5 24 all
7 calori protein carbohydr satur 22 7 22 all
8 protein carbohydr satur cholesterol 22 7 22 all
9 carbohydr satur cholesterol sodium 22 7 22 all
10 satur cholesterol sodium fiber 22 7 22 all
# Visualize
p_load(ggplot2)
p_load(gridExtra)

# Function to plot n-grams
plot_ngrams <- function(data, title) {
  data <- data %>%
    arrange(desc(frequency)) # Order data from most frequent to least frequent
  ggplot(data, aes(x = reorder(feature, frequency), y = frequency)) +
    geom_bar(stat = "identity", fill = "steelblue") +
    theme_minimal() +
    labs(title = title, x = "N-Gram", y = "Frequency") +
    coord_flip() # Flipped coordinates make the n-grams easier to read
}

# Plot each set of n-grams
p1 <- plot_ngrams(top_train_unigrams, "Top 10 Training Unigrams")
p2 <- plot_ngrams(top_train_bigrams, "Top 10 Training Bigrams")
p3 <- plot_ngrams(top_train_trigrams, "Top 10 Training Trigrams")
p4 <- plot_ngrams(top_train_fourgrams, "Top 10 Training Fourgrams")

# Print the plots in a 2x2 grid
gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 2)