Hashtag Parsing

library(tidyverse)

## -- Attaching packages ---------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.1.1       v purrr   0.3.2  
## v tibble  2.1.1       v dplyr   0.8.0.1
## v tidyr   0.8.3       v stringr 1.4.0  
## v readr   1.3.1       v forcats 0.4.0

## Warning: package 'ggplot2' was built under R version 3.5.3

## Warning: package 'tibble' was built under R version 3.5.3

## Warning: package 'tidyr' was built under R version 3.5.3

## Warning: package 'purrr' was built under R version 3.5.3

## -- Conflicts ------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Read the first IRA tweets file into a data frame.
# NOTE(review): column types are guessed here; the parsing failures reported
# for `tco3_step1` (guessed as logical, but holding URLs) come from that guess.
# Consider passing an explicit `col_types` -- confirm before changing.
library(readr)
IRAhandle_tweets_1_csv <- read_csv(
  file = "C:/Users/Nicholas/Downloads/IRAhandle_tweets_1.csv.txt"
)

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   external_author_id = col_double(),
##   following = col_double(),
##   followers = col_double(),
##   updates = col_double(),
##   retweet = col_double(),
##   new_june_2018 = col_double(),
##   alt_external_id = col_double(),
##   tweet_id = col_double(),
##   tco3_step1 = col_logical()
## )

## See spec(...) for full column specifications.

## Warning: 2803 parsing failures.
##  row        col           expected                   actual                                                     file
## 1149 tco3_step1 1/0/T/F/TRUE/FALSE http://ow.ly/KH2T30a9YGX 'C:/Users/Nicholas/Downloads/IRAhandle_tweets_1.csv.txt'
## 1397 tco3_step1 1/0/T/F/TRUE/FALSE https://goo.gl/hV3VlX    'C:/Users/Nicholas/Downloads/IRAhandle_tweets_1.csv.txt'
## 2153 tco3_step1 1/0/T/F/TRUE/FALSE http://ow.ly/NRHJ30aPVFM 'C:/Users/Nicholas/Downloads/IRAhandle_tweets_1.csv.txt'
## 2164 tco3_step1 1/0/T/F/TRUE/FALSE https://goo.gl/CzsrEU    'C:/Users/Nicholas/Downloads/IRAhandle_tweets_1.csv.txt'
## 4319 tco3_step1 1/0/T/F/TRUE/FALSE http://bit.ly/sonar_2016 'C:/Users/Nicholas/Downloads/IRAhandle_tweets_1.csv.txt'
## .... .......... .................. ........................ ........................................................
## See problems(...) for more details.

# Preview the raw tweet data.
head(IRAhandle_tweets_1_csv)

# Keep only the tweet text, stored as a one-column data frame whose single
# column is named `hashtags_column`.
d <- IRAhandle_tweets_1_csv
hashtags_column <- data.frame(hashtags_column = d$content)
head(hashtags_column)

# Pull out every "#" followed by one or more ASCII letters/digits, flatten the
# per-tweet matches into a single character vector, and count each distinct
# hashtag.
# NOTE(review): purely numeric tags such as "#039" may be HTML entity residue
# (e.g. "&#039;") rather than real hashtags -- confirm against the raw text.
hashtags <- unlist(
  str_extract_all(hashtags_column[["hashtags_column"]], "#[a-zA-Z0-9]{1,}")
)
hashtags.table <- table(hashtags)
head(hashtags.table)

## hashtags
## #016 #021 #038 #039  #04 #082 
##    1    1    2   22    1    2

# Convert the frequency table into a data frame with one row per hashtag
# (columns `hashtags` and `Freq`).
hashtags_df <- as.data.frame(hashtags.table)

# Rank by frequency BEFORE truncating. The table is ordered by hashtag text,
# so taking rows 1:100 first would keep the first 100 hashtags in sort order
# rather than the 100 most frequent ones.
# head() also copes with fewer than 100 distinct hashtags, where indexing
# with 1:100 would pad the result with NA rows.
top100 <- arrange(hashtags_df, desc(Freq))
top100 <- head(top100, 100)
head(top100)

# Stamp every row with the date of this run. The vector is sized from the
# data so it always matches the number of rows actually kept.
date <- rep(Sys.Date(), nrow(top100))
top100 <- cbind(top100, date)
head(top100)

# Horizontal bar chart of the selected hashtags, bars ordered by frequency.
ggplot(top100, aes(reorder(hashtags, Freq), Freq)) +
  geom_col(width = 0.5, position = position_dodge(width = 0.9)) +
  coord_flip() +
  xlab("hashtags")