# Twitter Data Cleaning

# Setup ----
# NOTE(review): the working directory is a hard-coded, machine-specific
# absolute path; the relative file name below is resolved against it.
setwd("/Users/subasishdas1/Desktop/tweets/tweets2/02")

library(readxl)
library(tidyverse)

# Read the tweet export from the Excel workbook into `dat`.
dat <- read_excel("autonomouscar.xlsx")

#' Normalise raw tweet text
#'
#' Applies a fixed sequence of stringr transformations: strips URLs, replaces
#' the HTML entity `&amp;` with "and", removes punctuation, drops a leading
#' retweet marker, replaces newlines with spaces, lower-cases the text and
#' trims surrounding whitespace.
#'
#' @param x A character vector (for example a data-frame column).
#' @return A character vector of the same length as `x`.
clean_tweets <- function(x) {
  # Remove only the URL token itself. The previous pattern ended in greedy
  # `(.*)` groups and therefore also deleted all text following a URL up to
  # the end of the line.
  x <- stringr::str_remove_all(x, " ?(f|ht)tps?://\\S+")
  x <- stringr::str_replace_all(x, "&amp;", "and")
  # Punctuation stripping also removes characters such as '@', '#' and '_'.
  x <- stringr::str_remove_all(x, "[[:punct:]]")
  x <- stringr::str_remove_all(x, "^RT:? ")
  # NOTE(review): because '@' and '#' are already gone at this point, the two
  # removals below effectively never match, so mention and hashtag words stay
  # in the text. The order is kept on purpose: this function is also applied
  # to the `hashtags` and `mentions` columns, which rely on those words
  # surviving. Confirm whether tweets should lose them before reordering.
  x <- stringr::str_remove_all(x, "@[[:alnum:]]+")
  x <- stringr::str_remove_all(x, "#[[:alnum:]]+")
  # "pic.twitter.com" with its dots already stripped by the punctuation step.
  x <- stringr::str_remove_all(x, "pictwittercom")
  x <- stringr::str_replace_all(x, "\\n", " ")
  x <- stringr::str_to_lower(x)
  stringr::str_trim(x, "both")
}

# Store a cleaned copy of the tweet text next to the raw `tweet` column.
dat$tweet1 <- clean_tweets(dat$tweet)
###write.csv(dat, "clean02.csv")
# Print the column names; the new `tweet1` column is appended at the end.
names(dat)

##  [1] "id"              "conversation_id" "created_at"      "date"           
##  [5] "time"            "timezone"        "user_id"         "username"       
##  [9] "name"            "place"           "tweet"           "language"       
## [13] "mentions"        "urls"            "photos"          "replies_count"  
## [17] "retweets_count"  "likes_count"     "hashtags"        "cashtags"       
## [21] "link"            "retweet"         "quote_url"       "video"          
## [25] "thumbnail"       "near"            "geo"             "source"         
## [29] "user_rt_id"      "user_rt"         "retweet_id"      "reply_to"       
## [33] "retweet_date"    "translate"       "trans_src"       "trans_dest"     
## [37] "tweet1"

library(DT)

# Show raw and cleaned tweet text side by side. Columns are selected by name
# instead of position so the view does not silently change if the input
# workbook gains, loses or reorders columns.
datatable(dat[, c("tweet", "tweet1")])

# Clean the hashtag column with the same routine and compare.
dat$hashtags1 <- clean_tweets(dat$hashtags)
datatable(dat[, c("hashtags", "hashtags1")])

# Clean the mentions column, then pull out every token that follows
# "screenname " (presumably a `screen_name` key whose underscore was removed
# by the punctuation step -- verify against the raw `mentions` values).
dat$mentions2 <- clean_tweets(dat$mentions)
# The token is matched with [[:alnum:]]+ rather than [:alpha:]*: the
# alphabetic-only pattern produced an empty string for any screen name that
# contains a digit. The result is a list column (one character vector per row).
dat$mentions3 <- stringr::str_extract_all(
  dat$mentions2,
  stringr::regex("(?<=screenname\\s)[[:alnum:]]+\\b")
)
datatable(dat[, c("mentions", "mentions2", "mentions3")])

# Twitter Data Cleaning

# 2020-12-09