# set a CRAN mirror -- needed when running in RStudio Cloud
options(repos = "https://cran.rstudio.com/")
# load the tidyverse -- if this errors, check that the tidyverse is installed
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
# file name
file <- "./data/CharlotteTweets20Sample.csv"
tweets <- read_csv(file)
## Parsed with column specification:
## cols(
## body = col_character(),
## postedTime = col_datetime(format = ""),
## actor.id = col_double(),
## displayName = col_character(),
## actor.postedTime = col_datetime(format = ""),
## summary = col_character(),
## friendsCount = col_integer(),
## followersCount = col_integer(),
## statusesCount = col_integer(),
## actor.location.displayName = col_character(),
## generator.displayName = col_character(),
## geo.type = col_character(),
## point_long = col_double(),
## point_lat = col_double(),
## urls.0.expanded_url = col_character(),
## klout_score = col_integer(),
## hashtags = col_character(),
## user_mention_screen_names = col_character()
## )
## spatial / leaflet, see https://rstudio.github.io/leaflet/
# install.packages("leaflet")
library(leaflet); library(stringr)
query <- "beer"
beerTweets <- tweets %>%
  filter(str_detect(body, query)) %>%
  filter(!is.na(point_long))
# note: lng/lat are deliberately crossed -- in this CSV the point_* columns
# appear to be stored in [lat, long] order, so point_long holds the latitude
leaflet(beerTweets) %>%
  addTiles() %>%
  addCircleMarkers(lng = ~point_lat,
                   lat = ~point_long,
                   popup = ~body,
                   stroke = FALSE,
                   fillOpacity = 0.5,
                   radius = 10,
                   clusterOptions = markerClusterOptions())
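# optional variant (not in the original code): the same map on a lighter
# CartoDB basemap, using the provider list bundled with leaflet -- a sketch only
leaflet(beerTweets) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(lng = ~point_lat, lat = ~point_long, popup = ~body,
                   stroke = FALSE, fillOpacity = 0.5, radius = 10,
                   clusterOptions = markerClusterOptions())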
## time series (dygraph), see http://rstudio.github.io/dygraphs/index.html
# install.packages("dygraphs"); install.packages("xts")
library(dygraphs); library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following object is masked from 'package:leaflet':
##
## addLegend
## The following objects are masked from 'package:dplyr':
##
## first, last
counts <- tweets %>%
  mutate(Date = as.Date(postedTime)) %>%
  group_by(Date, geo.type) %>%
  summarise(Count = n()) %>%
  ungroup() %>%
  spread(geo.type, Count, fill = 0, convert = TRUE)
dailyCounts <- xts(
  x = counts[,-1],
  order.by = counts$Date
)
# can convert to weekly, monthly, yearly, etc
weeklyCounts <- apply.weekly(dailyCounts, colSums)
dygraph(dailyCounts, main = "Charlotte Geo-located Tweets") %>%
dyRangeSelector() %>%
dyEvent("2016-02-08", "Super Bowl") %>%
dyEvent("2016-01-25", "NFC Championship")
## visNetwork http://datastorm-open.github.io/visNetwork/
# install.packages("visNetwork"); install.packages("igraph")
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
# how to pull tweets from R with rtweet: see http://rtweet.info/
# rtweets <- rtweet::search_tweets("#rstats", n = 18000, token = twitter_token)
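# building twitter_token (a sketch only -- assumes the pre-1.0 rtweet API and
# your own Twitter app credentials; not run here):
# twitter_token <- rtweet::create_token(
#   app             = "my_app_name",
#   consumer_key    = "YOUR_CONSUMER_KEY",
#   consumer_secret = "YOUR_CONSUMER_SECRET",
#   access_token    = "YOUR_ACCESS_TOKEN",
#   access_secret   = "YOUR_ACCESS_SECRET"
# )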
# read in 17,829 (incl. RT's) tweets pulled June 5, 2018 with "#rstats"
rtweets <- read_csv("./data/rtweets.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## user_id = col_double(),
## created_at = col_datetime(format = ""),
## status_id = col_double(),
## retweet_count = col_integer(),
## favorite_count = col_integer(),
## is_quote_status = col_logical(),
## quote_status_id = col_double(),
## is_retweet = col_logical(),
## retweet_status_id = col_double(),
## in_reply_to_status_status_id = col_double(),
## in_reply_to_status_user_id = col_double(),
## media_id = col_double()
## )
## See spec(...) for full column specifications.
# regular expression -- keep only retweets (text contains "RT @") that mention the term
term <- "tidyverse"
rtweetsTerm <- rtweets %>%
  filter(str_detect(text, "RT @")) %>%
  filter(str_detect(tolower(text), term))
names <- unique(rtweetsTerm$screen_name)
# extract the retweeted screen name from the text and keep only edges where
# both accounts appear in the data
edges <- tibble(node1 = rtweetsTerm$screen_name,
                node2 = gsub('.*RT @([a-zA-Z0-9_]+):? ?.*', "\\1", rtweetsTerm$text)) %>%
  filter(node1 %in% names & node2 %in% names) %>%
  group_by(node1, node2) %>%
  summarise(weights = n())
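# what the regex does, on a made-up example tweet: it pulls out the screen
# name that follows "RT @"
gsub('.*RT @([a-zA-Z0-9_]+):? ?.*', "\\1", "RT @some_user: loving the tidyverse")
# -> "some_user"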
g <- graph_from_data_frame(d=edges, directed=TRUE)
library(visNetwork)
degTotal <- degree(g)
V(g)$size <- 10*log(degTotal) + 10
visIgraph(g) %>%
  visInteraction(navigationButtons = TRUE) %>%
  visEdges(arrows = list(to = list(enabled = TRUE, scaleFactor = 0.5))) %>%
  visIgraphLayout(physics = TRUE)
### tidytext: see https://www.tidytextmining.com/
# install.packages("tidytext")
library(tidytext)
# load stop words
data("stop_words")
# remove urls, lt, amp, gt
replace_reg <- "https://t.co/[A-Za-z\\d]+|http://[A-Za-z\\d]+|&amp;|&lt;|&gt;|RT|https"
tweet_words <- filter(rtweets, is_retweet == FALSE) %>%
  mutate(text = str_replace_all(text, replace_reg, ""),
         Date = as.Date(created_at)) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  count(Date, word, sort = TRUE) %>%
  ungroup() %>%
  bind_tf_idf(word, Date, n)
## Joining, by = "word"
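# bind_tf_idf() treats each Date as a "document":
#   tf     = word count that day / total words that day
#   idf    = ln(number of days / number of days the word appears in)
#   tf_idf = tf * idf, so words frequent on one day but rare across days rank highest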
tweet_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(Date) %>%
  top_n(10) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = as.factor(Date))) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~Date, ncol = 4, scales = "free") +
  coord_flip()
## Selecting by tf_idf
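# quick add-on (a sketch, not part of the original code): tally positive vs.
# negative words per day with tidytext's bing lexicon
tweet_words %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  group_by(Date, sentiment) %>%
  summarise(total = sum(n)) %>%
  spread(sentiment, total, fill = 0)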

#### Advanced
## r2d3 (https://rstudio.github.io/r2d3/)
## if running in RStudio Cloud, these packages are not installed automatically, so run the install statements below first
# pck <- c("devtools", "digest", "htmltools", "htmlwidgets", "jsonlite", "Rcpp", "rstudioapi", "yaml")
# install.packages(pck)
# devtools::install_github("rstudio/r2d3")
library(r2d3)
flare <- read.csv("./d3/flare.csv")
# bubbles
r2d3(data = flare, d3_version = 4, script = "./d3/bubbles.js")
# radial tree (includes radialtree.css)
r2d3(data = flare, d3_version = 4, script = "./d3/radialtree.js")
# circlepacking (includes circlepacking.css)
r2d3(data = jsonlite::read_json("./d3/flare.json"), d3_version = 4, script = "./d3/circlepacking.js")
# streamgraph on random data
r2d3(data=c(), script = "./d3/streamgraph.js")
# many more; see https://rstudio.github.io/r2d3/articles/gallery.html
## collapsible Tree (https://adeelk93.github.io/collapsibleTree/)
# install.packages("collapsibleTree")
library(collapsibleTree)
# prep data
flare2 <- flare %>%
  mutate(id = as.character(id)) %>%
  separate(id, into = c("First","Second","Third","Fourth"))
## Warning: Expected 4 pieces. Additional pieces discarded in 33 rows [195,
## 196, 197, 198, 213, 214, 215, 217, 218, 219, 220, 221, 223, 224, 225, 228,
## 229, 230, 232, 233, ...].
## Warning: Expected 4 pieces. Missing pieces filled with `NA` in 111 rows
## [1, 2, 3, 8, 14, 16, 17, 18, 19, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
## 39, ...].
collapsibleTree(
  flare2,
  hierarchy = c("First", "Second", "Third", "Fourth"),
  width = 800,
  zoomable = TRUE
)
## Warning in CheckNameReservedWord(path, check): Name 'count' is a reserved
## word as defined in NODE_RESERVED_NAMES_CONST. Using 'count2' instead.
## Warning in CheckNameReservedWord(path, check): Name 'Sort' is a reserved
## word as defined in NODE_RESERVED_NAMES_CONST. Using 'Sort2' instead.
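# a related view (a sketch -- assumes collapsibleTreeSummary() from the same
# package, where node colour reflects how many leaves sit under each branch)
collapsibleTreeSummary(
  flare2,
  hierarchy = c("First", "Second", "Third", "Fourth"),
  width = 800
)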