library(ngram)
library(readr)
library(quanteda)
## Package version: 3.1.0
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 4 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
twitter <- "~/Downloads/final/en_US/en_US.twitter.txt"
blogs <- "~/Downloads/final/en_US/en_US.blogs.txt"
news <- "~/Downloads/final/en_US/en_US.news.txt"
# Read only the first 1,000 lines of each file as an exploratory sample
twitter <- readLines(twitter, 1000, warn = FALSE, encoding = "UTF-8")
blogs <- readLines(blogs, 1000, warn = FALSE, encoding = "UTF-8")
news <- readLines(news, 1000, warn = FALSE, encoding = "UTF-8")
blogs_corp <- corpus(blogs)
news_corp <- corpus(news)
twitter_corp <- corpus(twitter)
summary <- data.frame("File" = c("Blogs", "News", "Twitter"),
                      "File Size" = sapply(list(blogs, news, twitter), function(x){format(object.size(x), "MB")}),
                      "Documents" = sapply(list(blogs_corp, news_corp, twitter_corp), function(x){ndoc(x)}),
                      "Words" = sapply(list(blogs, news, twitter), function(x){wordcount(x)}))
summary
## File File.Size Documents Words
## 1 Blogs 0.3 Mb 1000 41890
## 2 News 0.3 Mb 1000 33489
## 3 Twitter 0.1 Mb 1000 12782
rm(blogs, news, twitter)
# Tokenise the news corpus, dropping punctuation, numbers, symbols and English stopwords,
# then count the most frequent single words (the same steps are repeated for the Twitter
# and blog corpora below)
token_news <- tokens(news_corp, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE)
token_news <- tokens_remove(token_news, pattern = stopwords('en'))
one_gram_news <- tokens_ngrams(token_news, n = 1)
top_one_gram_news <- topfeatures(dfm(one_gram_news), 50)
rm(one_gram_news)
top_one_gram_news
## said new one year two just first like time also
## 259 80 65 63 62 59 58 54 49 48
## people last police get now can state home years game
## 46 46 45 44 43 43 43 41 38 37
## city three make says percent even many going four got
## 35 34 33 33 33 32 31 31 31 30
## much made good right team think day million school county
## 30 30 29 29 29 28 28 28 28 28
## another next play may left come know see well since
## 27 27 27 27 26 26 26 25 25 24
t_gram_news <- tokens_ngrams(token_news, n = 3)
top_t_gram_news <- topfeatures(dfm(t_gram_news), 50)
rm(t_gram_news,token_news)
top_t_gram_news
## osama_bin_laden make_situation_worse
## 3 2
## cinco_de_mayo faces_years_prison
## 2 2
## chief_executive_officer buy_health_insurance
## 2 2
## world_war_ii people_new_jersey
## 2 2
## see_light_day new_york_jets
## 2 2
## first_four_innings allowed_one_run
## 2 2
## offensive_line_coach new_york_post
## 2 2
## lakers_first_points oh_god_amazing
## 2 2
## oh_god_oh god_oh_god
## 2 2
## county_prosecutor's_office little_bit_now
## 2 2
## unrestricted_free_agents averaging_points_rebounds
## 2 2
## cleveland_clinic_innovations new_york_times
## 2 2
## sen_john_mccain chandler_centennial_chihuahua
## 2 2
## just_feel_right rose_almost_percent
## 2 2
## home_alone_apparently great_lakes_fisheries
## 1 1
## st_louis_plant island_president_viewers
## 1 1
## soul_train_soulful louis_plant_close
## 1 1
## plant_close_die close_die_old
## 1 1
## die_old_age baker's_38-page_private
## 1 1
## old_age_workers 38-page_private_placement
## 1 1
## clothes_puddled_floor age_workers_making
## 1 1
## president_viewers_likely private_placement_memorandum
## 1 1
## workers_making_cars puddled_floor_stepped
## 1 1
## viewers_likely_find placement_memorandum_says
## 1 1
## floor_stepped_night making_cars_since
## 1 1
barplot(height = top_one_gram_news, names.arg = names(top_one_gram_news),
las = 3, col = "blue", main = "Most common single words of news data")
par(mar=c(8,4,4,4))
barplot(height = top_t_gram_news, names.arg = names(top_t_gram_news),
las = 3, col = "red", main = "Most common 3-gram word series of news data")
token_twitter <- tokens(twitter_corp, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE)
token_twitter <- tokens_remove(token_twitter, pattern = stopwords('en'))
one_gram_twitter <- tokens_ngrams(token_twitter, n = 1)
top_one_gram_twitter <- topfeatures(dfm(one_gram_twitter), 50)
rm(one_gram_twitter)
top_one_gram_twitter
## just like one can love know u day get rt
## 62 58 52 50 48 46 43 42 39 37
## new good go time thanks great lol now much see
## 36 36 32 32 30 29 28 27 26 25
## today going night need make follow want tonight got last
## 25 24 24 23 23 22 22 21 21 20
## back really hey right us life next people well still
## 20 20 20 19 19 18 17 17 17 17
## way first haha happy bad work think every never im
## 16 16 15 15 15 14 14 14 14 14
t_gram_twitter <- tokens_ngrams(token_twitter, n = 3)
top_t_gram_twitter <- topfeatures(dfm(t_gram_twitter), 50)
rm(t_gram_twitter,token_twitter)
top_t_gram_twitter
## ur_eyes_ur eyes_ur_eyes
## 2 2
## cinco_de_mayo make_money_money
## 2 2
## little_stage_puppet stage_puppet_theater
## 2 2
## gets_standing_ovation graduate_yeah_time
## 2 1
## yeah_time_well friends_need_new
## 1 1
## time_well_needed well_needed_r
## 1 1
## need_new_music btw_thanks_rt
## 1 1
## smoke_one_put thanks_rt_gonna
## 1 1
## rt_gonna_dc needed_r_r
## 1 1
## gonna_dc_anytime dc_anytime_soon
## 1 1
## anytime_soon_love mc_rib_time
## 1 1
## one_put_smoke soon_love_see
## 1 1
## rib_time_favorite put_smoke_two
## 1 1
## love_see_way time_favorite_whataburger
## 1 1
## see_way_way smoke_two_put
## 1 1
## favorite_whataburger_sandwiches new_music_tell
## 1 1
## way_way_long two_put_smoke
## 1 1
## whataburger_sandwiches_real music_tell_check
## 1 1
## put_smoke_three tell_check_us
## 1 1
## sandwiches_real_life meet_someone_special
## 1 1
## smoke_three_finally check_us_spotify
## 1 1
## someone_special_know three_finally_ask
## 1 1
## us_spotify_free special_know_heart
## 1 1
## spotify_free_awesome yes_girl_-_____-
## 1 1
## finally_ask_u know_heart_beat
## 1 1
barplot(height = top_one_gram_twitter, names.arg = names(top_one_gram_twitter),
las = 3, col = "blue", main = "Most common single words of twitter data")
par(mar=c(8,4,4,4))
barplot(height = top_t_gram_twitter, names.arg = names(top_t_gram_twitter),
las = 3, col = "red", main = "Most common 3-gram word series of twitter data")
token_blogs <- tokens(blogs_corp, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE)
token_blogs <- tokens_remove(token_blogs, pattern = stopwords('en'))
one_gram_blogs <- tokens_ngrams(token_blogs, n = 1)
top_one_gram_blogs <- topfeatures(dfm(one_gram_blogs), 50)
rm(one_gram_blogs)
top_one_gram_blogs
## one like just time can get know day new us good
## 142 137 131 112 99 88 74 73 71 71 68
## now much people really also make think first little back see
## 68 66 62 60 57 56 56 54 53 52 51
## way love right go well even made want work got say
## 51 50 49 49 49 48 46 44 43 42 42
## many world life around need two god year going things today
## 41 41 40 40 40 39 39 39 38 38 37
## still days away lot always said
## 36 36 35 35 34 34
t_gram_blogs <- tokens_ngrams(token_blogs, n = 3)
top_t_gram_blogs <- topfeatures(dfm(t_gram_blogs), 50)
rm(t_gram_blogs,token_blogs)
top_t_gram_blogs
## cricket_world_cup hyun_suk_asks
## 4 3
## team_leader_han rock_paper_scissors
## 3 3
## world_cup_dvd vacant_housing_real
## 3 2
## housing_real_estate exist_days_ago
## 2 2
## chang_min_scoffs weight_loss_challenge
## 2 2
## believers_insist_can love_hate_relationship
## 2 2
## can_borrow_library roman_catholic_code
## 2 2
## catholic_code_canon code_canon_law
## 2 2
## every_cab_run find_iron_tooth
## 2 2
## want_make_sure done_unto_us
## 2 2
## unto_us_believe guns_germs_steel
## 2 2
## star_trek_mug new_york_city
## 2 2
## hound_makes_think late_last_night
## 2 2
## right_around_corner chocolate_ice_cream
## 2 2
## high_school_championships tenant_personal_financial
## 2 2
## personal_financial_information god_love_god
## 2 2
## makes_one_wonder hide_books_get
## 2 2
## books_get_chores get_chores_done
## 2 2
## every_single_day love_spending_time
## 2 2
## pieces_may_end hibiscus_aguas_fresca
## 1 1
## tend_either_incredibly may_end_stitching
## 1 1
## end_stitching_onto either_incredibly_cautious
## 1 1
## stitching_onto_canvas incredibly_cautious_stupidly
## 1 1
## cautious_stupidly_rash even_though_new
## 1 1
## stupidly_rash_something though_new_computer
## 1 1
barplot(height = top_one_gram_blogs, names.arg = names(top_one_gram_blogs),
las = 3, col = "blue", main = "Most common single words of blogs data")
par(mar=c(8,4,4,4))
barplot(height = top_t_gram_blogs, names.arg = names(top_t_gram_blogs),
las = 3, col = "red", main = "Most common 3-gram word series of blogs data")
Summary

When building the Shiny app I will compare the frequency of words and phrases using more of the data. Initially I selected only the first 1,000 lines from each dataset, but I am considering rerunning the analysis with 2,000 or 3,000 lines and comparing the frequency of those words to gauge the accuracy of the prediction.
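As a rough illustration of that planned comparison, the sketch below reads a larger sample, rebuilds the top unigrams with the same quanteda steps used above, and measures how much of the 1,000-line top-50 list survives at 3,000 lines. The helper `top_words` and the overlap measure are my own assumptions, not part of the analysis above.

top_words <- function(path, n_lines, n_top = 50) {
  # Read a sample of n_lines, tokenise, drop stopwords, and return the top unigrams
  lines <- readLines(path, n_lines, warn = FALSE, encoding = "UTF-8")
  toks <- tokens(corpus(lines), remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE)
  toks <- tokens_remove(toks, pattern = stopwords("en"))
  topfeatures(dfm(toks), n_top)
}

news_path <- "~/Downloads/final/en_US/en_US.news.txt"
top_1000 <- top_words(news_path, 1000)
top_3000 <- top_words(news_path, 3000)

# Share of the 1,000-line top words that remain in the top 50 at 3,000 lines
mean(names(top_1000) %in% names(top_3000))

A high overlap would suggest the smaller sample already captures the dominant vocabulary; a low overlap would argue for training the prediction model on a larger sample.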