GUOJIE TAO

476665

1. EXERCISE

# Four sample strings used as regex targets for this exercise.
vector <- c(
  "emoticon",
  ":)",
  "symbol",
  "$^$"
)
# Show the raw strings, one element per line.
writeLines(vector)
## emoticon
## :)
## symbol
## $^$
  a) any string of 3 characters with the letter "o" in the middle
# Highlight every 3-character substring whose middle character is "o".
str_view(
  string = vector,
  pattern = ".o."
)
## [1] │ e<mot>i<con>
## [3] │ sym<bol>
  b) the expression “emoticon”
# Highlight the literal text "emoticon".
str_view(
  string = vector,
  pattern = "emoticon"
)
## [1] │ <emoticon>
  c) the expression “:)”
# Highlight the literal smiley ":)"; the closing parenthesis is escaped so it
# is not read as a regex group delimiter.
str_view(
  string = vector,
  pattern = "\\:\\)"
)
## [2] │ <:)>
  d) the expression “$^$”
# Highlight the literal text "$^$"; "$" and "^" are regex anchors, so each one
# is escaped to be matched as an ordinary character.
str_view(
  string = vector,
  pattern = "\\$\\^\\$"
)
## [4] │ <$^$>

2. EXERCISE

# Sample corpus: 18 short social-media-style posts containing URLs, @mentions,
# e-mail addresses, hashtags, emoticons, all-caps words, numbers and
# stretched-out words. Used as the search target for the queries below.
corpus <- c(
  "OMG I looove this movie!!! :D :) #cinema",
  "Visit https://data.org for more info!",
  "@user123 LOL that's crazy XD XD XD",
  "Email me at test_user@mail.com ASAP!!",
  "Working from HOME since 2020...",
  "BUY NOW!!! Only $9.99!",
  "Great job!!! Keep it up :) ;D",
  "So tired of this traffic jam... #monday",
  "New blog post https://myblog.net/post/123",
  "Check this out @friend — unbelievable!!!",
  "SALE starts TODAY!!! LIMITED TIME OFFER!",
  "Working hard or hardly working?",
  "Can't believe it's already 2025!!",
  "Follow us for updates @data_science_team",
  "Contact: info@company.com for details.",
  "RT @newsbot: BREAKING NEWS: Market hits new record highs!!!",
  "LOL this made my day :P ;P",
  "Nothing better than cooooffee in the mooorning"
)
  a) all posts containing a URL
# Show only the posts that contain a URL: either "http://" / "https://" or
# "www." followed by a run of non-whitespace characters.
str_view(
  string = corpus,
  pattern = "https?://\\S+|www\\.\\S+",
  match = TRUE
)
## [2] │ Visit <https://data.org> for more info!
## [9] │ New blog post <https://myblog.net/post/123>
  b) all posts containing user mentions (starting with @)
# Show only the posts that contain a user mention. The negative look-behind
# requires that the "@" is NOT directly preceded by a word character, so the
# "@mail" / "@company" parts of e-mail addresses are no longer reported as
# mentions.
str_view(corpus, "(?<!\\w)@\\w+", match = TRUE)
##  [3] │ <@user123> LOL that's crazy XD XD XD
## [10] │ Check this out <@friend> — unbelievable!!!
## [14] │ Follow us for updates <@data_science_team>
## [16] │ RT <@newsbot>: BREAKING NEWS: Market hits new record highs!!!
  c) all posts that contain a sequence of three or more (>= 3) uppercase words
# Show only the posts with a run of at least three whitespace-separated words,
# each consisting of two or more capital letters A-Z (one such word followed
# by two or more further ones).
str_view(
  string = corpus,
  pattern = "\\b[A-Z]{2,}\\b(?:\\s+\\b[A-Z]{2,}\\b){2,}",
  match = TRUE
)
##  [3] │ @user123 LOL that's crazy <XD XD XD>
## [11] │ SALE starts TODAY!!! <LIMITED TIME OFFER>!
  d) all posts where any letter repeats 3 or more times in a row
# Show only the posts where some letter is immediately followed by at least
# two more copies of itself (the back-reference \1 repeats the captured letter).
str_view(
  string = corpus,
  pattern = "([a-zA-Z])\\1{2,}",
  match = TRUE
)
##  [1] │ OMG I l<ooo>ve this movie!!! :D :) #cinema
## [18] │ Nothing better than c<oooo>ffee in the m<ooo>rning

3. EXERCISE

# Re-declares the same 18 posts used in Exercise 2 (identical contents); the
# cleaning steps below all start from this vector.
corpus <- c(
  "OMG I looove this movie!!! :D :) #cinema",
  "Visit https://data.org for more info!",
  "@user123 LOL that's crazy XD XD XD",
  "Email me at test_user@mail.com ASAP!!",
  "Working from HOME since 2020...",
  "BUY NOW!!! Only $9.99!",
  "Great job!!! Keep it up :) ;D",
  "So tired of this traffic jam... #monday",
  "New blog post https://myblog.net/post/123",
  "Check this out @friend — unbelievable!!!",
  "SALE starts TODAY!!! LIMITED TIME OFFER!",
  "Working hard or hardly working?",
  "Can't believe it's already 2025!!",
  "Follow us for updates @data_science_team",
  "Contact: info@company.com for details.",
  "RT @newsbot: BREAKING NEWS: Market hits new record highs!!!",
  "LOL this made my day :P ;P",
  "Nothing better than cooooffee in the mooorning"
)
  a) Remove all words that contain a number
# Step a): delete every run of word characters that includes at least one
# digit. Only the word characters are removed; adjacent punctuation stays.
corpus_a <- str_remove_all(
  string = corpus,
  pattern = "\\b\\w*\\d\\w*\\b"
)
print(corpus_a)
##  [1] "OMG I looove this movie!!! :D :) #cinema"                   
##  [2] "Visit https://data.org for more info!"                      
##  [3] "@ LOL that's crazy XD XD XD"                                
##  [4] "Email me at test_user@mail.com ASAP!!"                      
##  [5] "Working from HOME since ..."                                
##  [6] "BUY NOW!!! Only $.!"                                        
##  [7] "Great job!!! Keep it up :) ;D"                              
##  [8] "So tired of this traffic jam... #monday"                    
##  [9] "New blog post https://myblog.net/post/"                     
## [10] "Check this out @friend — unbelievable!!!"                   
## [11] "SALE starts TODAY!!! LIMITED TIME OFFER!"                   
## [12] "Working hard or hardly working?"                            
## [13] "Can't believe it's already !!"                              
## [14] "Follow us for updates @data_science_team"                   
## [15] "Contact: info@company.com for details."                     
## [16] "RT @newsbot: BREAKING NEWS: Market hits new record highs!!!"
## [17] "LOL this made my day :P ;P"                                 
## [18] "Nothing better than cooooffee in the mooorning"
  b) Remove all words that are written entirely in uppercase letters
# Step b): delete every standalone word made of two or more capital letters
# A-Z. Single capitals such as "I" are kept because of the {2,} quantifier.
corpus_b <- str_remove_all(
  string = corpus,
  pattern = "\\b[A-Z]{2,}\\b"
)
print(corpus_b)
##  [1] " I looove this movie!!! :D :) #cinema"         
##  [2] "Visit https://data.org for more info!"         
##  [3] "@user123  that's crazy   "                     
##  [4] "Email me at test_user@mail.com !!"             
##  [5] "Working from  since 2020..."                   
##  [6] " !!! Only $9.99!"                              
##  [7] "Great job!!! Keep it up :) ;D"                 
##  [8] "So tired of this traffic jam... #monday"       
##  [9] "New blog post https://myblog.net/post/123"     
## [10] "Check this out @friend — unbelievable!!!"      
## [11] " starts !!!   !"                               
## [12] "Working hard or hardly working?"               
## [13] "Can't believe it's already 2025!!"             
## [14] "Follow us for updates @data_science_team"      
## [15] "Contact: info@company.com for details."        
## [16] " @newsbot:  : Market hits new record highs!!!" 
## [17] " this made my day :P ;P"                       
## [18] "Nothing better than cooooffee in the mooorning"
  c) Remove all hashtags
# Step c): delete every hashtag, i.e. "#" followed by one or more word
# characters.
corpus_c <- str_remove_all(
  string = corpus,
  pattern = "#\\w+"
)
print(corpus_c)
##  [1] "OMG I looove this movie!!! :D :) "                          
##  [2] "Visit https://data.org for more info!"                      
##  [3] "@user123 LOL that's crazy XD XD XD"                         
##  [4] "Email me at test_user@mail.com ASAP!!"                      
##  [5] "Working from HOME since 2020..."                            
##  [6] "BUY NOW!!! Only $9.99!"                                     
##  [7] "Great job!!! Keep it up :) ;D"                              
##  [8] "So tired of this traffic jam... "                           
##  [9] "New blog post https://myblog.net/post/123"                  
## [10] "Check this out @friend — unbelievable!!!"                   
## [11] "SALE starts TODAY!!! LIMITED TIME OFFER!"                   
## [12] "Working hard or hardly working?"                            
## [13] "Can't believe it's already 2025!!"                          
## [14] "Follow us for updates @data_science_team"                   
## [15] "Contact: info@company.com for details."                     
## [16] "RT @newsbot: BREAKING NEWS: Market hits new record highs!!!"
## [17] "LOL this made my day :P ;P"                                 
## [18] "Nothing better than cooooffee in the mooorning"
  d) Remove all smiley-style emojis like :), :D, :P, :-)
# Step d): delete smiley-style emoticons. The pattern describes the family
# rather than listing four literals: eyes ":" or ";", an optional "-" nose,
# then a mouth ")", "D" or "P". This also removes the winking variants ";D"
# and ";P" that the literal list left behind in posts 7 and 17.
corpus_d <- str_remove_all(corpus, "[:;]-?[)DP]")
print(corpus_d)
##  [1] "OMG I looove this movie!!!   #cinema"                       
##  [2] "Visit https://data.org for more info!"                      
##  [3] "@user123 LOL that's crazy XD XD XD"                         
##  [4] "Email me at test_user@mail.com ASAP!!"                      
##  [5] "Working from HOME since 2020..."                            
##  [6] "BUY NOW!!! Only $9.99!"                                     
##  [7] "Great job!!! Keep it up  "                                  
##  [8] "So tired of this traffic jam... #monday"                    
##  [9] "New blog post https://myblog.net/post/123"                  
## [10] "Check this out @friend — unbelievable!!!"                   
## [11] "SALE starts TODAY!!! LIMITED TIME OFFER!"                   
## [12] "Working hard or hardly working?"                            
## [13] "Can't believe it's already 2025!!"                          
## [14] "Follow us for updates @data_science_team"                   
## [15] "Contact: info@company.com for details."                     
## [16] "RT @newsbot: BREAKING NEWS: Market hits new record highs!!!"
## [17] "LOL this made my day  "                                     
## [18] "Nothing better than cooooffee in the mooorning"

Apply tasks a), b), c) and d) in sequence to obtain corpus_CLEAN

# Build the fully cleaned corpus by chaining steps a) to d) on the raw posts.
# Each step only deletes the matched text, so the whitespace that surrounded a
# removed token is left in place.
# Step d) uses a character-class pattern (eyes ":" or ";", optional "-" nose,
# mouth ")", "D" or "P") so that ";D" and ";P" are removed along with ":D",
# ":P", ":)" and ":-)".
corpus_CLEAN <- corpus %>%
  str_remove_all("\\b\\w*\\d\\w*\\b") %>% # a) Remove words with numbers
  str_remove_all("\\b[A-Z]{2,}\\b") %>%   # b) Remove uppercase words (2+ letters)
  str_remove_all("#\\w+") %>%             # c) Remove hashtags
  str_remove_all("[:;]-?[)DP]")           # d) Remove smiley-style emoticons

# Print the cleaned corpus
print(corpus_CLEAN)
##  [1] " I looove this movie!!!   "                    
##  [2] "Visit https://data.org for more info!"         
##  [3] "@  that's crazy   "                            
##  [4] "Email me at test_user@mail.com !!"             
##  [5] "Working from  since ..."                       
##  [6] " !!! Only $.!"                                 
##  [7] "Great job!!! Keep it up  "                     
##  [8] "So tired of this traffic jam... "              
##  [9] "New blog post https://myblog.net/post/"        
## [10] "Check this out @friend — unbelievable!!!"      
## [11] " starts !!!   !"                               
## [12] "Working hard or hardly working?"               
## [13] "Can't believe it's already !!"                 
## [14] "Follow us for updates @data_science_team"      
## [15] "Contact: info@company.com for details."        
## [16] " @newsbot:  : Market hits new record highs!!!" 
## [17] " this made my day  "                           
## [18] "Nothing better than cooooffee in the mooorning"