library(ngram)
library(readr)
library(quanteda)
## Package version: 3.1.0
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 4 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
twitter <- "~/Downloads/final/en_US/en_US.twitter.txt"
blogs <- "~/Downloads/final/en_US/en_US.blogs.txt"
news <- "~/Downloads/final/en_US/en_US.news.txt"
# Read only the first 1,000 lines of each file as an exploratory sample
twitter <- readLines(twitter, 1000, warn = FALSE, encoding = "UTF-8")
blogs <- readLines(blogs, 1000, warn = FALSE, encoding = "UTF-8")
news <- readLines(news, 1000, warn = FALSE, encoding = "UTF-8")
blogs_corp <- corpus(blogs)
news_corp <- corpus(news)
twitter_corp <- corpus(twitter)
summary <- data.frame("File" = c("Blogs", "News", "Twitter"),
                      "File Size" = sapply(list(blogs, news, twitter), function(x){format(object.size(x), "MB")}),
                      "Documents" = sapply(list(blogs_corp, news_corp, twitter_corp), function(x){ndoc(x)}),
                      "Words" = sapply(list(blogs, news, twitter), function(x){wordcount(x)}))
summary
## File File.Size Documents Words
## 1 Blogs 0.3 Mb 1000 41890
## 2 News 0.3 Mb 1000 33489
## 3 Twitter 0.1 Mb 1000 12782
rm(blogs, news, twitter)
# Tokenise the news corpus, dropping punctuation, numbers, symbols and English stopwords,
# then count the most frequent single words (the same steps are repeated for the Twitter
# and blog corpora below)
token_news <- tokens(news_corp, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE)
token_news <- tokens_remove(token_news, pattern = stopwords('en'))
one_gram_news <- tokens_ngrams(token_news, n = 1)
top_one_gram_news <- topfeatures(dfm(one_gram_news), 50)
rm(one_gram_news)
top_one_gram_news
## said new one year two just first like time also
## 259 80 65 63 62 59 58 54 49 48
## people last police get now can state home years game
## 46 46 45 44 43 43 43 41 38 37
## city three make says percent even many going four got
## 35 34 33 33 33 32 31 31 31 30
## much made good right team think day million school county
## 30 30 29 29 29 28 28 28 28 28
## another next play may left come know see well since
## 27 27 27 27 26 26 26 25 25 24
t_gram_news <- tokens_ngrams(token_news, n = 3)
top_t_gram_news <- topfeatures(dfm(t_gram_news), 50)
rm(t_gram_news,token_news)
top_t_gram_news
## osama_bin_laden make_situation_worse
## 3 2
## cinco_de_mayo faces_years_prison
## 2 2
## chief_executive_officer buy_health_insurance
## 2 2
## world_war_ii people_new_jersey
## 2 2
## see_light_day new_york_jets
## 2 2
## first_four_innings allowed_one_run
## 2 2
## offensive_line_coach new_york_post
## 2 2
## lakers_first_points oh_god_amazing
## 2 2
## oh_god_oh god_oh_god
## 2 2
## county_prosecutor's_office little_bit_now
## 2 2
## unrestricted_free_agents averaging_points_rebounds
## 2 2
## cleveland_clinic_innovations new_york_times
## 2 2
## sen_john_mccain chandler_centennial_chihuahua
## 2 2
## just_feel_right rose_almost_percent
## 2 2
## home_alone_apparently great_lakes_fisheries
## 1 1
## st_louis_plant island_president_viewers
## 1 1
## soul_train_soulful louis_plant_close
## 1 1
## plant_close_die close_die_old
## 1 1
## die_old_age baker's_38-page_private
## 1 1
## old_age_workers 38-page_private_placement
## 1 1
## clothes_puddled_floor age_workers_making
## 1 1
## president_viewers_likely private_placement_memorandum
## 1 1
## workers_making_cars puddled_floor_stepped
## 1 1
## viewers_likely_find placement_memorandum_says
## 1 1
## floor_stepped_night making_cars_since
## 1 1
barplot(height = top_one_gram_news, names.arg = names(top_one_gram_news),
las = 3, col = "blue", main = "Most common single words of news data")
par(mar=c(8,4,4,4))
barplot(height = top_t_gram_news, names.arg = names(top_t_gram_news),
las = 3, col = "red", main = "Most common 3-gram word series of news data")
token_twitter <- tokens(twitter_corp, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE)
token_twitter <- tokens_remove(token_twitter, pattern = stopwords('en'))
one_gram_twitter <- tokens_ngrams(token_twitter, n = 1)
top_one_gram_twitter <- topfeatures(dfm(one_gram_twitter), 50)
rm(one_gram_twitter)
top_one_gram_twitter
## just like one can love know u day get rt
## 62 58 52 50 48 46 43 42 39 37
## new good go time thanks great lol now much see
## 36 36 32 32 30 29 28 27 26 25
## today going night need make follow want tonight got last
## 25 24 24 23 23 22 22 21 21 20
## back really hey right us life next people well still
## 20 20 20 19 19 18 17 17 17 17
## way first haha happy bad work think every never im
## 16 16 15 15 15 14 14 14 14 14
t_gram_twitter <- tokens_ngrams(token_twitter, n = 3)
top_t_gram_twitter <- topfeatures(dfm(t_gram_twitter), 50)
rm(t_gram_twitter,token_twitter)
top_t_gram_twitter
## ur_eyes_ur eyes_ur_eyes
## 2 2
## cinco_de_mayo make_money_money
## 2 2
## little_stage_puppet stage_puppet_theater
## 2 2
## gets_standing_ovation graduate_yeah_time
## 2 1
## yeah_time_well friends_need_new
## 1 1
## time_well_needed well_needed_r
## 1 1
## need_new_music btw_thanks_rt
## 1 1
## smoke_one_put thanks_rt_gonna
## 1 1
## rt_gonna_dc needed_r_r
## 1 1
## gonna_dc_anytime dc_anytime_soon
## 1 1
## anytime_soon_love mc_rib_time
## 1 1
## one_put_smoke soon_love_see
## 1 1
## rib_time_favorite put_smoke_two
## 1 1
## love_see_way time_favorite_whataburger
## 1 1
## see_way_way smoke_two_put
## 1 1
## favorite_whataburger_sandwiches new_music_tell
## 1 1
## way_way_long two_put_smoke
## 1 1
## whataburger_sandwiches_real music_tell_check
## 1 1
## put_smoke_three tell_check_us
## 1 1
## sandwiches_real_life meet_someone_special
## 1 1
## smoke_three_finally check_us_spotify
## 1 1
## someone_special_know three_finally_ask
## 1 1
## us_spotify_free special_know_heart
## 1 1
## spotify_free_awesome yes_girl_-_____-
## 1 1
## finally_ask_u know_heart_beat
## 1 1
barplot(height = top_one_gram_twitter, names.arg = names(top_one_gram_twitter),
las = 3, col = "blue", main = "Most common single words of twitter data")
par(mar=c(8,4,4,4))
barplot(height = top_t_gram_twitter, names.arg = names(top_t_gram_twitter),
las = 3, col = "red", main = "Most common 3-gram word series of twitter data")
token_blogs <- tokens(blogs_corp, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE)
token_blogs <- tokens_remove(token_blogs, pattern = stopwords('en'))
one_gram_blogs <- tokens_ngrams(token_blogs, n = 1)
top_one_gram_blogs <- topfeatures(dfm(one_gram_blogs), 50)
rm(one_gram_blogs)
top_one_gram_blogs
## one like just time can get know day new us good
## 142 137 131 112 99 88 74 73 71 71 68
## now much people really also make think first little back see
## 68 66 62 60 57 56 56 54 53 52 51
## way love right go well even made want work got say
## 51 50 49 49 49 48 46 44 43 42 42
## many world life around need two god year going things today
## 41 41 40 40 40 39 39 39 38 38 37
## still days away lot always said
## 36 36 35 35 34 34
t_gram_blogs <- tokens_ngrams(token_blogs, n = 3)
top_t_gram_blogs <- topfeatures(dfm(t_gram_blogs), 50)
rm(t_gram_blogs,token_blogs)
top_t_gram_blogs
## cricket_world_cup hyun_suk_asks
## 4 3
## team_leader_han rock_paper_scissors
## 3 3
## world_cup_dvd vacant_housing_real
## 3 2
## housing_real_estate exist_days_ago
## 2 2
## chang_min_scoffs weight_loss_challenge
## 2 2
## believers_insist_can love_hate_relationship
## 2 2
## can_borrow_library roman_catholic_code
## 2 2
## catholic_code_canon code_canon_law
## 2 2
## every_cab_run find_iron_tooth
## 2 2
## want_make_sure done_unto_us
## 2 2
## unto_us_believe guns_germs_steel
## 2 2
## star_trek_mug new_york_city
## 2 2
## hound_makes_think late_last_night
## 2 2
## right_around_corner chocolate_ice_cream
## 2 2
## high_school_championships tenant_personal_financial
## 2 2
## personal_financial_information god_love_god
## 2 2
## makes_one_wonder hide_books_get
## 2 2
## books_get_chores get_chores_done
## 2 2
## every_single_day love_spending_time
## 2 2
## pieces_may_end hibiscus_aguas_fresca
## 1 1
## tend_either_incredibly may_end_stitching
## 1 1
## end_stitching_onto either_incredibly_cautious
## 1 1
## stitching_onto_canvas incredibly_cautious_stupidly
## 1 1
## cautious_stupidly_rash even_though_new
## 1 1
## stupidly_rash_something though_new_computer
## 1 1
barplot(height = top_one_gram_blogs, names.arg = names(top_one_gram_blogs),
las = 3, col = "blue", main = "Most common single words of blogs data")
par(mar=c(8,4,4,4))
barplot(height = top_t_gram_blogs, names.arg = names(top_t_gram_blogs),
las = 3, col = "red", main = "Most common 3-gram word series of blogs data")
Summary

When building the Shiny app I will compare the frequency of words and phrases using more of the data. Initially I selected only the first 1,000 lines from each dataset, but I am considering rerunning the analysis with 2,000 or 3,000 lines and comparing the frequency of those words to gauge the accuracy of the prediction.
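As a rough illustration of that planned comparison, the sketch below reads a larger sample, rebuilds the top unigrams with the same quanteda steps used above, and measures how much of the 1,000-line top-50 list survives at 3,000 lines. The helper `top_words` and the overlap measure are my own assumptions, not part of the analysis above.

top_words <- function(path, n_lines, n_top = 50) {
  # Read a sample of n_lines, tokenise, drop stopwords, and return the top unigrams
  lines <- readLines(path, n_lines, warn = FALSE, encoding = "UTF-8")
  toks <- tokens(corpus(lines), remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE)
  toks <- tokens_remove(toks, pattern = stopwords("en"))
  topfeatures(dfm(toks), n_top)
}

news_path <- "~/Downloads/final/en_US/en_US.news.txt"
top_1000 <- top_words(news_path, 1000)
top_3000 <- top_words(news_path, 3000)

# Share of the 1,000-line top words that remain in the top 50 at 3,000 lines
mean(names(top_1000) %in% names(top_3000))

A high overlap would suggest the smaller sample already captures the dominant vocabulary; a low overlap would argue for training the prediction model on a larger sample.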