# Packages ----
library(readxl)
library(tidyverse)

# Data ----
# The spreadsheet is read by a relative file name, so the working directory
# is pointed at the folder that contains it first.
# NOTE(review): this is an absolute, machine-specific path; the script will
# only run as-is on a machine that has this folder.
setwd("/Users/subasishdas1/Desktop/tweets/tweets2/02")
dat <- read_excel("autonomouscar.xlsx")
#' Normalise raw tweet text
#'
#' Strips links, picture links, a leading retweet marker, @mentions,
#' #hashtags and punctuation, replaces ampersands with "and", turns line
#' breaks into spaces, lower-cases the text and trims surrounding whitespace.
#'
#' The order of the steps matters: everything that is recognised by a
#' punctuation character (`://`, `@`, `#`, `.`) is removed *before* the
#' generic punctuation strip, otherwise those patterns can no longer match.
#'
#' @param x A character vector of tweet texts.
#' @return A character vector of the same length as `x` with the cleaned text.
clean_tweets <- function(x) {
  x %>%
    # Links: `\\S+` stops at the first whitespace, so words that follow a
    # link in the same tweet are kept.
    str_remove_all(" ?(f|ht)tps?://\\S+") %>%
    # Scheme-less picture links, including the media id after the slash.
    str_remove_all(" ?pic\\.twitter\\.com\\S*") %>%
    # Retweet marker at the very start of the tweet.
    str_remove_all("^RT:? ") %>%
    # Mentions and hashtags; `\\w` also covers the underscore that handles
    # and tags may contain.
    str_remove_all("@\\w+") %>%
    str_remove_all("#\\w+") %>%
    # Ampersands, whether HTML-escaped ("&amp;") or literal.
    str_replace_all("&(amp;)?", "and") %>%
    str_remove_all("[[:punct:]]") %>%
    str_replace_all("\\n", " ") %>%
    str_to_lower() %>%
    str_trim(side = "both")
}
# Store the cleaned version of the raw `tweet` text in a new column, `tweet1`.
dat$tweet1 <- clean_tweets(dat$tweet)
# Export of the augmented table, left disabled:
# write.csv(dat, "clean02.csv")
# Print the column names to confirm that `tweet1` was added.
names(dat)
## [1] "id" "conversation_id" "created_at" "date"
## [5] "time" "timezone" "user_id" "username"
## [9] "name" "place" "tweet" "language"
## [13] "mentions" "urls" "photos" "replies_count"
## [17] "retweets_count" "likes_count" "hashtags" "cashtags"
## [21] "link" "retweet" "quote_url" "video"
## [25] "thumbnail" "near" "geo" "source"
## [29] "user_rt_id" "user_rt" "retweet_id" "reply_to"
## [33] "retweet_date" "translate" "trans_src" "trans_dest"
## [37] "tweet1"