quanteda01

Hashtag

# Point the session at the project folder, then load the tweet export.
setwd("C:/Users/subas/Syncplicity/MyProjects_IMP/SafeD_ADV")
library(readxl)
# The workbook path is relative to the working directory set above.
dat <- read_excel("autonomouscar.xlsx")

library(tidyverse)    


# Normalise raw tweet text into lower-case, trimmed plain text.
#
# x: character vector of raw text.
# Returns the transformed character vector; each stringr step below is
# applied to the result of the previous one, in this exact order.
clean_tweets <- function(x) {
  # Strip http/https/ftp links. The trailing `(.*)` is greedy, so the text
  # following a link on the same line is removed together with the link.
  cleaned <- str_remove_all(x, " ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)")
  # Decode the HTML ampersand entity before punctuation is stripped.
  cleaned <- str_replace_all(cleaned, "&amp;", "and")
  cleaned <- str_remove_all(cleaned, "[[:punct:]]")
  # Drop a leading retweet marker.
  cleaned <- str_remove_all(cleaned, "^RT:? ")
  # NOTE(review): the mention and hashtag patterns run after punctuation has
  # been stripped, so "@" and "#" may already be gone at this point and these
  # two steps may never match -- confirm whether this ordering is intended.
  cleaned <- str_remove_all(cleaned, "@[[:alnum:]]+")
  cleaned <- str_remove_all(cleaned, "#[[:alnum:]]+")
  # Literal "pictwittercom" (presumably what is left of pic.twitter.com once
  # the dots have been stripped).
  cleaned <- str_remove_all(cleaned, "pictwittercom")
  # Turn line breaks into single spaces.
  cleaned <- str_replace_all(cleaned, "\\\n", " ")
  cleaned <- str_to_lower(cleaned)
  str_trim(cleaned, "both")
}


library(DT)
# Print the column names of the imported data.
names(dat)

##  [1] "id"              "conversation_id" "created_at"      "date"           
##  [5] "time"            "timezone"        "user_id"         "username"       
##  [9] "name"            "place"           "tweet"           "language"       
## [13] "mentions"        "urls"            "photos"          "replies_count"  
## [17] "retweets_count"  "likes_count"     "hashtags"        "cashtags"       
## [21] "link"            "retweet"         "quote_url"       "video"          
## [25] "thumbnail"       "near"            "geo"             "source"         
## [29] "user_rt_id"      "user_rt"         "retweet_id"      "reply_to"       
## [33] "retweet_date"    "translate"       "trans_src"       "trans_dest"

# Add a cleaned copy of the raw hashtag column.
dat$hashtags1 <- clean_tweets(dat$hashtags)
# Keep only the author and the cleaned hashtags. Columns are selected by
# name rather than by position (previously 8 and 37) so the selection does
# not silently pick the wrong columns if the export layout changes.
dat1 <- dat[, c("username", "hashtags1")]
head(dat1)

## # A tibble: 6 x 2
##   username        hashtags1                   
##   <chr>           <chr>                       
## 1 selfdrivingfeed autonomouscar selfdrivingcar
## 2 selfdrivingfeed autonomouscar selfdrivingcar
## 3 selfdrivingfeed autonomouscar selfdrivingcar
## 4 selfdrivingfeed autonomouscar selfdrivingcar
## 5 selfdrivingfeed autonomouscar selfdrivingcar
## 6 selfdrivingfeed autonomouscar selfdrivingcar

# Show the structure (dimensions and column types) of the two-column subset.
str(dat1)

## tibble [107,315 x 2] (S3: tbl_df/tbl/data.frame)
##  $ username : chr [1:107315] "selfdrivingfeed" "selfdrivingfeed" "selfdrivingfeed" "selfdrivingfeed" ...
##  $ hashtags1: chr [1:107315] "autonomouscar selfdrivingcar" "autonomouscar selfdrivingcar" "autonomouscar selfdrivingcar" "autonomouscar selfdrivingcar" ...

library(tm)
# Print the column names of the subset.
names(dat1)

## [1] "username"  "hashtags1"

## [1] "Year"  "Title"
library(tm)
library(quanteda)
# Build a corpus with one document per row of cleaned hashtags.
corp_tweets <- corpus(dat1$hashtags1)
# Tokenise explicitly before building the document-feature matrix: passing a
# corpus (and tokens() options such as remove_punct) straight to dfm() is
# deprecated in quanteda 3 and no longer supported in quanteda 4, whereas
# tokens() followed by dfm() works across versions.
tweet_dfm <- dfm(tokens(corp_tweets, remove_punct = TRUE))
head(tweet_dfm)

## Document-feature matrix of: 6 documents, 9,804 features (100.0% sparse).
##        features
## docs    autonomouscar selfdrivingcar 5g uber av aurora avs autonomousvehicles
##   text1             1              1  0    0  0      0   0                  0
##   text2             1              1  0    0  0      0   0                  0
##   text3             1              1  0    0  0      0   0                  0
##   text4             1              1  0    0  0      0   0                  0
##   text5             1              1  0    0  0      0   0                  0
##   text6             1              1  0    0  0      0   0                  0
##        features
## docs    drone virtualreality
##   text1     0              0
##   text2     0              0
##   text3     0              0
##   text4     0              0
##   text5     0              0
##   text6     0              0
## [ reached max_nfeat ... 9,794 more features ]

library(quanteda)
###library("quanteda.textplots")
# Hashtag co-occurrence: take the working DFM, record its 40 most frequent
# features, and build the feature co-occurrence matrix.
# NOTE(review): dfm_select() is called without a pattern, so presumably no
# features are filtered out here -- confirm against the quanteda docs.
tag_dfm <- dfm_select(tweet_dfm)
toptag <- tag_dfm %>%
  topfeatures(40) %>%
  names()
tag_fcm <- fcm(tag_dfm)
head(tag_fcm)

## Feature co-occurrence matrix of: 6 by 6 features.
##                 features
## features         autonomouscar selfdrivingcar   5g uber  av aurora
##   autonomouscar             22          71368 1062  633 155     11
##   selfdrivingcar             0              7   18   51  12      0
##   5g                         0              0   11    1   4      0
##   uber                       0              0    0    2   7      2
##   av                         0              0    0    0   1      1
##   aurora                     0              0    0    0   0      0

# Restrict the co-occurrence matrix to the top hashtags and draw the network.
topgat_fcm <- fcm_select(tag_fcm, pattern = toptag)
textplot_network(
  topgat_fcm,
  min_freq = 0.1,
  edge_color = "orange",
  edge_alpha = 0.8,
  edge_size = 2
)

Mention

# Build a corpus with one document per raw tweet text.
corp_tweets <- corpus(dat$tweet)
# Tokenise explicitly before building the document-feature matrix: passing a
# corpus (and tokens() options such as remove_punct) straight to dfm() is
# deprecated in quanteda 3 and no longer supported in quanteda 4, whereas
# tokens() followed by dfm() works across versions.
tweet_dfm <- dfm(tokens(corp_tweets, remove_punct = TRUE))
head(tweet_dfm)

## Document-feature matrix of: 6 documents, 185,794 features (>99.99% sparse).
##        features
## docs    micron technology are we there yet today's adas technologies may
##   text1      1          1   1  1     1   1       1    1            1   1
##   text2      0          0   0  0     0   0       0    0            0   0
##   text3      0          0   0  0     0   0       0    0            0   1
##   text4      0          0   0  0     0   0       0    0            0   0
##   text5      0          0   0  0     0   0       0    0            0   0
##   text6      0          0   0  0     0   0       0    0            0   0
## [ reached max_nfeat ... 185,784 more features ]

# Mention co-occurrence: keep only features matching "@*", record the 50
# most frequent of them, and build the feature co-occurrence matrix.
tag_dfm <- dfm_select(tweet_dfm, pattern = "@*")
topuser <- tag_dfm %>%
  topfeatures(50) %>%
  names()
user_fcm <- fcm(tag_dfm)
head(user_fcm)

## Feature co-occurrence matrix of: 6 by 6 features.
##                  features
## features          @kmmdisc @counterpointtr @neiltwitz @faddy0015 @ingliguori
##   @kmmdisc               0               0          0          0           0
##   @counterpointtr        0               0          1          1           0
##   @neiltwitz             0               0          0          1           0
##   @faddy0015             0               0          0          0           0
##   @ingliguori            0               0          0          0           0
##   @chairmanmdec          0               0          0          0           0
##                  features
## features          @chairmanmdec
##   @kmmdisc                    0
##   @counterpointtr             0
##   @neiltwitz                  0
##   @faddy0015                  0
##   @ingliguori                 1
##   @chairmanmdec               0

# Restrict the co-occurrence matrix to the top mentioned users (overwriting
# user_fcm) and draw the network.
user_fcm <- fcm_select(user_fcm, pattern = topuser)
textplot_network(
  user_fcm,
  min_freq = 0.1,
  edge_alpha = 0.8,
  edge_color = "orange",
  edge_size = 3
)

quanteda01

Subasish Das (@subasish_das)

2020-12-13

Hashtag

Mention