# Small demo vector: two plain words and two strings made entirely of
# characters that are special in regular expressions.
vector <- c("emoticon", ":)", "symbol", "$^$")

# Show every element on its own line.
writeLines(text = vector)
## emoticon
## :)
## symbol
## $^$

# "." stands for any single character, so ".o." is an "o" with one
# character on each side.
str_view(string = vector, pattern = ".o.")
## [1] │ e<mot>i<con>
## [3] │ sym<bol>

# A pattern without metacharacters matches itself literally.
str_view(string = vector, pattern = "emoticon")
## [1] │ <emoticon>

# The closing parenthesis is a metacharacter, so it is escaped to match
# the literal text ":)".
str_view(string = vector, pattern = "\\:\\)")
## [2] │ <:)>

# "$" and "^" are anchors unless escaped; escaping all three characters
# matches the literal text "$^$".
str_view(string = vector, pattern = "\\$\\^\\$")
## [4] │ <$^$>
# Example corpus: 18 short messages containing hashtags, @-mentions, URLs,
# e-mail addresses, emoticons, numbers and upper-case words.
corpus <- c(
  "OMG I looove this movie!!! :D :) #cinema",
  "Visit https://data.org for more info!",
  "@user123 LOL that's crazy XD XD XD",
  "Email me at test_user@mail.com ASAP!!",
  "Working from HOME since 2020...",
  "BUY NOW!!! Only $9.99!",
  "Great job!!! Keep it up :) ;D",
  "So tired of this traffic jam... #monday",
  "New blog post https://myblog.net/post/123",
  "Check this out @friend — unbelievable!!!",
  "SALE starts TODAY!!! LIMITED TIME OFFER!",
  "Working hard or hardly working?",
  "Can't believe it's already 2025!!",
  "Follow us for updates @data_science_team",
  "Contact: info@company.com for details.",
  "RT @newsbot: BREAKING NEWS: Market hits new record highs!!!",
  "LOL this made my day :P ;P",
  "Nothing better than cooooffee in the mooorning"
)
# URLs: either "http://" / "https://" followed by non-whitespace, or
# "www." followed by non-whitespace.
str_view(
  string = corpus,
  pattern = "https?://\\S+|www\\.\\S+",
  match = TRUE
)
## [2] │ Visit <https://data.org> for more info!
## [9] │ New blog post <https://myblog.net/post/123>

# "@" followed by one or more word characters. As the output shows, this
# also hits the "@domain" part of e-mail addresses, not only mentions.
str_view(
  string = corpus,
  pattern = "@\\w+",
  match = TRUE
)
## [3] │ <@user123> LOL that's crazy XD XD XD
## [4] │ Email me at test_user<@mail>.com ASAP!!
## [10] │ Check this out <@friend> — unbelievable!!!
## [14] │ Follow us for updates <@data_science_team>
## [15] │ Contact: info<@company>.com for details.
## [16] │ RT <@newsbot>: BREAKING NEWS: Market hits new record highs!!!

# Runs of at least three consecutive upper-case words (2+ letters each)
# separated only by whitespace.
str_view(
  string = corpus,
  pattern = "\\b[A-Z]{2,}\\b(?:\\s+\\b[A-Z]{2,}\\b){2,}",
  match = TRUE
)
## [3] │ @user123 LOL that's crazy <XD XD XD>
## [11] │ SALE starts TODAY!!! <LIMITED TIME OFFER>!

# A letter repeated three or more times in a row: the back-reference \\1
# must equal the captured letter.
str_view(
  string = corpus,
  pattern = "([a-zA-Z])\\1{2,}",
  match = TRUE
)
## [1] │ OMG I l<ooo>ve this movie!!! :D :) #cinema
## [18] │ Nothing better than c<oooo>ffee in the m<ooo>rning
# Example corpus used as input for the cleaning steps: 18 short messages
# containing hashtags, @-mentions, URLs, e-mail addresses, emoticons,
# numbers and upper-case words.
corpus <- c(
  "OMG I looove this movie!!! :D :) #cinema",
  "Visit https://data.org for more info!",
  "@user123 LOL that's crazy XD XD XD",
  "Email me at test_user@mail.com ASAP!!",
  "Working from HOME since 2020...",
  "BUY NOW!!! Only $9.99!",
  "Great job!!! Keep it up :) ;D",
  "So tired of this traffic jam... #monday",
  "New blog post https://myblog.net/post/123",
  "Check this out @friend — unbelievable!!!",
  "SALE starts TODAY!!! LIMITED TIME OFFER!",
  "Working hard or hardly working?",
  "Can't believe it's already 2025!!",
  "Follow us for updates @data_science_team",
  "Contact: info@company.com for details.",
  "RT @newsbot: BREAKING NEWS: Market hits new record highs!!!",
  "LOL this made my day :P ;P",
  "Nothing better than cooooffee in the mooorning"
)
# a) Drop every word token that contains at least one digit. Only the
#    word characters are removed; surrounding punctuation such as "@",
#    "$" and "." stays behind (see elements 3 and 6 in the output).
corpus_a <- str_remove_all(
  string = corpus,
  pattern = "\\b\\w*\\d\\w*\\b"
)
print(corpus_a)
## [1] "OMG I looove this movie!!! :D :) #cinema"
## [2] "Visit https://data.org for more info!"
## [3] "@ LOL that's crazy XD XD XD"
## [4] "Email me at test_user@mail.com ASAP!!"
## [5] "Working from HOME since ..."
## [6] "BUY NOW!!! Only $.!"
## [7] "Great job!!! Keep it up :) ;D"
## [8] "So tired of this traffic jam... #monday"
## [9] "New blog post https://myblog.net/post/"
## [10] "Check this out @friend — unbelievable!!!"
## [11] "SALE starts TODAY!!! LIMITED TIME OFFER!"
## [12] "Working hard or hardly working?"
## [13] "Can't believe it's already !!"
## [14] "Follow us for updates @data_science_team"
## [15] "Contact: info@company.com for details."
## [16] "RT @newsbot: BREAKING NEWS: Market hits new record highs!!!"
## [17] "LOL this made my day :P ;P"
## [18] "Nothing better than cooooffee in the mooorning"
# b) Drop whole words written entirely in upper-case ASCII letters, two or
#    more letters long. The single-letter word "I" is therefore kept, and
#    the whitespace around removed words is left in place.
corpus_b <- str_remove_all(
  string = corpus,
  pattern = "\\b[A-Z]{2,}\\b"
)
print(corpus_b)
## [1] " I looove this movie!!! :D :) #cinema"
## [2] "Visit https://data.org for more info!"
## [3] "@user123 that's crazy "
## [4] "Email me at test_user@mail.com !!"
## [5] "Working from since 2020..."
## [6] " !!! Only $9.99!"
## [7] "Great job!!! Keep it up :) ;D"
## [8] "So tired of this traffic jam... #monday"
## [9] "New blog post https://myblog.net/post/123"
## [10] "Check this out @friend — unbelievable!!!"
## [11] " starts !!! !"
## [12] "Working hard or hardly working?"
## [13] "Can't believe it's already 2025!!"
## [14] "Follow us for updates @data_science_team"
## [15] "Contact: info@company.com for details."
## [16] " @newsbot: : Market hits new record highs!!!"
## [17] " this made my day :P ;P"
## [18] "Nothing better than cooooffee in the mooorning"
# c) Drop hashtags: "#" followed by one or more word characters. The space
#    that preceded the hashtag is not part of the match and remains.
corpus_c <- str_remove_all(
  string = corpus,
  pattern = "#\\w+"
)
print(corpus_c)
## [1] "OMG I looove this movie!!! :D :) "
## [2] "Visit https://data.org for more info!"
## [3] "@user123 LOL that's crazy XD XD XD"
## [4] "Email me at test_user@mail.com ASAP!!"
## [5] "Working from HOME since 2020..."
## [6] "BUY NOW!!! Only $9.99!"
## [7] "Great job!!! Keep it up :) ;D"
## [8] "So tired of this traffic jam... "
## [9] "New blog post https://myblog.net/post/123"
## [10] "Check this out @friend — unbelievable!!!"
## [11] "SALE starts TODAY!!! LIMITED TIME OFFER!"
## [12] "Working hard or hardly working?"
## [13] "Can't believe it's already 2025!!"
## [14] "Follow us for updates @data_science_team"
## [15] "Contact: info@company.com for details."
## [16] "RT @newsbot: BREAKING NEWS: Market hits new record highs!!!"
## [17] "LOL this made my day :P ;P"
## [18] "Nothing better than cooooffee in the mooorning"
# d) Drop four specific emoticons: ":)", ":D", ":P" and ":-)". Variants
#    starting with ";" (";D", ";P") are not in the pattern and are kept,
#    as elements 7 and 17 of the output show.
corpus_d <- str_remove_all(
  string = corpus,
  pattern = "\\:\\)|\\:D|\\:P|\\:-\\)"
)
print(corpus_d)
## [1] "OMG I looove this movie!!! #cinema"
## [2] "Visit https://data.org for more info!"
## [3] "@user123 LOL that's crazy XD XD XD"
## [4] "Email me at test_user@mail.com ASAP!!"
## [5] "Working from HOME since 2020..."
## [6] "BUY NOW!!! Only $9.99!"
## [7] "Great job!!! Keep it up ;D"
## [8] "So tired of this traffic jam... #monday"
## [9] "New blog post https://myblog.net/post/123"
## [10] "Check this out @friend — unbelievable!!!"
## [11] "SALE starts TODAY!!! LIMITED TIME OFFER!"
## [12] "Working hard or hardly working?"
## [13] "Can't believe it's already 2025!!"
## [14] "Follow us for updates @data_science_team"
## [15] "Contact: info@company.com for details."
## [16] "RT @newsbot: BREAKING NEWS: Market hits new record highs!!!"
## [17] "LOL this made my day ;P"
## [18] "Nothing better than cooooffee in the mooorning"
# Apply all a), b), c), d) tasks to obtain corpus_CLEAN
#
# The four removals are chained in the order a) -> d); each step receives
# the output of the previous one. Only the matched text is deleted, so the
# whitespace and punctuation that surrounded a removed token stay in the
# result.
corpus_CLEAN <- corpus %>%
  str_remove_all("\\b\\w*\\d\\w*\\b") %>%        # a) Remove words with numbers
  str_remove_all("\\b[A-Z]{2,}\\b") %>%          # b) Remove uppercase words (2+ letters)
  str_remove_all("#\\w+") %>%                    # c) Remove hashtags
  str_remove_all("\\:\\)|\\:D|\\:P|\\:-\\)")     # d) Remove specific smiley emojis

# Print the cleaned corpus
print(corpus_CLEAN)
## [1] " I looove this movie!!! "
## [2] "Visit https://data.org for more info!"
## [3] "@ that's crazy "
## [4] "Email me at test_user@mail.com !!"
## [5] "Working from since ..."
## [6] " !!! Only $.!"
## [7] "Great job!!! Keep it up ;D"
## [8] "So tired of this traffic jam... "
## [9] "New blog post https://myblog.net/post/"
## [10] "Check this out @friend — unbelievable!!!"
## [11] " starts !!! !"
## [12] "Working hard or hardly working?"
## [13] "Can't believe it's already !!"
## [14] "Follow us for updates @data_science_team"
## [15] "Contact: info@company.com for details."
## [16] " @newsbot: : Market hits new record highs!!!"
## [17] " this made my day ;P"
## [18] "Nothing better than cooooffee in the mooorning"