---
title: "Twitter_Text"
author: "Ganis Ardhaning Saputri"
date: "November 12, 2018"
output: html_document
---
Analisis yang akan dilakukan adalah melakukan text mining terhadap beberapa kata yang digunakan dalam media sosial Twitter. Hal ini dapat dilakukan untuk mengetahui seberapa sering kata-kata tersebut digunakan dalam media sosial Twitter dan kata-kata apa saja yang saling berhubungan dengan kata tersebut.

### 1. Retrieve Tweets from Twitter

Tahap awal yang dilakukan adalah memuat beberapa package yang akan digunakan dalam analisis ini.
#load packages
library(rtweet)
library(tidyverse)
Tahap kedua adalah melakukan penginputan token yang digunakan untuk mengakses data dalam Twitter, dimana sebelum mendapatkan token harus meminta perizinan terlebih dahulu untuk mendapatkan data dari Twitter.
# Twitter authentication
# Register the OAuth credentials with rtweet. The four credential objects
# (consumer_key, consumer_secret, access_token, access_secret) are not
# defined in the visible code -- presumably set earlier; verify before running.
# The call is wrapped in invisible() so the returned token object is not
# auto-printed: its printed form includes the app's consumer key, which
# should not end up in a rendered report.
invisible(
  create_token(
    app = "my_twitter_research_app",
    consumer_key = consumer_key,
    consumer_secret = consumer_secret,
    access_token = access_token,
    access_secret = access_secret
  )
)
## <Token>
## <oauth_endpoint>
## request: https://api.twitter.com/oauth/request_token
## authorize: https://api.twitter.com/oauth/authenticate
## access: https://api.twitter.com/oauth/access_token
## <oauth_app> my_twitter_research_app
## key: Lz7QKeXfeJWTM4KBDk4Qe7Uoy
## secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---
# Retrieve tweets
# Search for up to 10,000 English-language tweets matching "#Dollar".
# The language filter is spelled `lang` (the search API query parameter);
# the earlier spelling `langs` is not a documented parameter, so the
# English-only restriction was presumably being ignored.
tweets <- search_tweets(
  "#Dollar",
  n = 10000,
  lang = "en",
  tweet_mode = "extended"
)
## Searching for tweets...
## Finished collecting tweets!
Tahap ketiga adalah melakukan analisis deskriptif terhadap kata yang ingin dianalisis dalam teks mining ini.
## Time series of tweet counts, aggregated into three-hour bins
ts_plot(tweets, by = "3 hours") +
  ggplot2::labs(
    title = "Frequency of #Dollar Twitter statuses from past 9 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet",
    x = NULL,
    y = NULL
  ) +
  # theme_minimal() must come before theme() so the bold title is not reset
  ggplot2::theme_minimal() +
  ggplot2::theme(plot.title = ggplot2::element_text(face = "bold"))
Berdasarkan hasil grafik tersebut, dapat diketahui frekuensi tweet yang memuat kata “Dollar” dalam kurun waktu 9 hari, dengan jumlah tweet yang diagregasi setiap interval 3 jam. Grafik tersebut berfluktuasi, sehingga dapat dikatakan bahwa dalam kurun waktu 9 hari jumlah tweet tentang “Dollar” berubah-ubah dari waktu ke waktu, seperti ditunjukkan dalam grafik tersebut.
# Preview the first six rows of the retrieved tweets
head(tweets, n = 6L)
## # A tibble: 6 x 88
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 293344~ 10619664~ 2018-11-12 12:58:22 fintechna Wow ~ fintrt
## 2 293344~ 10600594~ 2018-11-07 06:40:31 fintechna "For~ fintrt
## 3 105985~ 10619664~ 2018-11-12 12:58:15 sajjadHuss~ Aver~ Twitt~
## 4 300100~ 10619658~ 2018-11-12 12:55:55 StateBank_~ Aver~ Twitt~
## 5 300100~ 10608498~ 2018-11-09 11:01:18 StateBank_~ Aver~ Twitt~
## 6 300100~ 10597661~ 2018-11-06 11:15:20 StateBank_~ Aver~ Twitt~
## # ... with 82 more variables: display_text_width <dbl>,
## # reply_to_status_id <chr>, reply_to_user_id <chr>,
## # reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## # favorite_count <int>, retweet_count <int>, hashtags <list>,
## # symbols <list>, urls_url <list>, urls_t.co <list>,
## # urls_expanded_url <list>, media_url <list>, media_t.co <list>,
## # media_expanded_url <list>, media_type <list>, ext_media_url <list>,
## # ext_media_t.co <list>, ext_media_expanded_url <list>,
## # ext_media_type <chr>, mentions_user_id <list>,
## # mentions_screen_name <list>, lang <chr>, quoted_status_id <chr>,
## # quoted_text <chr>, quoted_created_at <dttm>, quoted_source <chr>,
## # quoted_favorite_count <int>, quoted_retweet_count <int>,
## # quoted_user_id <chr>, quoted_screen_name <chr>, quoted_name <chr>,
## # quoted_followers_count <int>, quoted_friends_count <int>,
## # quoted_statuses_count <int>, quoted_location <chr>,
## # quoted_description <chr>, quoted_verified <lgl>,
## # retweet_status_id <chr>, retweet_text <chr>,
## # retweet_created_at <dttm>, retweet_source <chr>,
## # retweet_favorite_count <int>, retweet_retweet_count <int>,
## # retweet_user_id <chr>, retweet_screen_name <chr>, retweet_name <chr>,
## # retweet_followers_count <int>, retweet_friends_count <int>,
## # retweet_statuses_count <int>, retweet_location <chr>,
## # retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## # place_name <chr>, place_full_name <chr>, place_type <chr>,
## # country <chr>, country_code <chr>, geo_coords <list>,
## # coords_coords <list>, bbox_coords <list>, status_url <chr>,
## # name <chr>, location <chr>, description <chr>, url <chr>,
## # protected <lgl>, followers_count <int>, friends_count <int>,
## # listed_count <int>, statuses_count <int>, favourites_count <int>,
## # account_created_at <dttm>, verified <lgl>, profile_url <chr>,
## # profile_expanded_url <chr>, account_lang <chr>,
## # profile_banner_url <chr>, profile_background_url <chr>,
## # profile_image_url <chr>
Tahap keempat adalah memuat package “tm”.
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
# Build a tm corpus from the tweet texts (a character vector source)
tweet_source <- VectorSource(tweets[["text"]])
myCorpus <- Corpus(tweet_source)
# Lower-case every document; content_transformer() wraps the base function
# so that it can be applied through tm_map()
myCorpus <- tm_map(x = myCorpus, FUN = content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents
# Strip URLs: delete "http" together with every non-whitespace character
# that directly follows it
removeURL <- function(x) {
  gsub(pattern = "http[^[:space:]]*", replacement = "", x = x)
}
# Run the URL remover over every document in the corpus
myCorpus <- tm_map(x = myCorpus, FUN = content_transformer(removeURL))
## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(removeURL)):
## transformation drops documents
# Drop every character that is neither a letter ([:alpha:]) nor whitespace,
# i.e. digits, punctuation and other symbols
removeNumPunct <- function(x) {
  gsub(pattern = "[^[:alpha:][:space:]]*", replacement = "", x = x)
}
# Run the digit/punctuation remover over every document in the corpus
myCorpus <- tm_map(x = myCorpus, FUN = content_transformer(removeNumPunct))
## Warning in tm_map.SimpleCorpus(myCorpus,
## content_transformer(removeNumPunct)): transformation drops documents
# Stopword list: tm's English stopwords with "r" and "big" excluded from
# the list (so those two words are kept in the text), plus five extra terms
myStopwords <- c(
  setdiff(stopwords(kind = "english"), c("r", "big")),
  "use", "see", "used", "via", "amp"
)
# Delete every stopword from each document
myCorpus <- tm_map(x = myCorpus, FUN = removeWords, myStopwords)
## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, myStopwords):
## transformation drops documents
# Collapse the extra whitespace left behind by the removals above
myCorpus <- tm_map(x = myCorpus, FUN = stripWhitespace)
## Warning in tm_map.SimpleCorpus(myCorpus, stripWhitespace): transformation
## drops documents
# Snapshot of the cleaned corpus, kept for later stem completion
myCorpusCopy <- myCorpus
# Term-document matrix; wordLengths sets the accepted term length
# range to 1..Inf
tdm <- TermDocumentMatrix(
  x = myCorpus,
  control = list(wordLengths = c(1, Inf))
)
print(tdm)
## <<TermDocumentMatrix (terms: 9289, documents: 4359)>>
## Non-/sparse entries: 74419/40416332
## Sparsity : 100%
## Maximal term length: 51
## Weighting : term frequency (tf)
# Terms with a frequency of at least 20 in the term-document matrix
freq.terms <- findFreqTerms(tdm, lowfreq = 20)
# Show at most the first 50 terms. head() is used instead of
# freq.terms[1:50], which pads the result with NA when fewer than
# 50 terms qualify.
head(freq.terms, 50)
## [1] "dollar" "due" "euro"
## [4] "exchange" "fiat" "fintech"
## [7] "huge" "join" "now"
## [10] "short" "soon" "st"
## [13] "time" "trade" "us"
## [16] "usd" "aaexchange" "actual"
## [19] "calling" "confirm" "currency"
## [22] "currencyexchange" "differ" "displayed"
## [25] "fluctuations" "foreign" "forex"
## [28] "free" "market" "may"
## [31] "pak" "please" "rates"
## [34] "rupees" "toll" "average"
## [37] "pound" "yen" "federal"
## [40] "high" "interest" "investors"
## [43] "month" "next" "political"
## [46] "rate" "read" "reserve"
## [49] "rise" "rupee"
# Total count of each term across all documents
term.freq <- rowSums(as.matrix(tdm))
# Keep only the terms that occur at least 1000 times
term.freq <- term.freq[term.freq >= 1000]
df <- data.frame(term = names(term.freq), freq = term.freq)
# Horizontal bar chart of the retained terms and their counts
ggplot2::ggplot(df, ggplot2::aes(x = term, y = freq)) +
  ggplot2::geom_col() +
  ggplot2::labs(x = "Terms", y = "Count") +
  ggplot2::coord_flip() +
  ggplot2::theme(axis.text = ggplot2::element_text(size = 7))
Hasil output tersebut menunjukkan frekuensi terbanyak dari kata-kata yang banyak digunakan dalam menulis tweet yang ditulis.
library(wordcloud)
## Loading required package: RColorBrewer
# Dense term-by-document count matrix
m <- as.matrix(tdm)
# Total frequency of each term, most frequent first.
# TRUE/FALSE are spelled out below because T and F are ordinary
# variables that can be reassigned.
word.freq <- sort(rowSums(m), decreasing = TRUE)
# Palette: the 9-colour "BuGn" scheme with its first four colours dropped
pal <- brewer.pal(9, "BuGn")[-(1:4)]
# Word cloud of all terms with a total frequency of at least 300
wordcloud(
  words = names(word.freq),
  freq = word.freq,
  min.freq = 300,
  random.order = FALSE,
  colors = pal
)
Gambar tersebut menunjukkan kata-kata yang berhubungan dengan kata “Dollar” yang sering ditulis orang-orang dalam tweet.