#rm(list = ls())
# load twitter library
library(rtweet)
#Plotting library
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.1
#data processing/cleaning/pipelining library
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# text mining library
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.5.1
# network graph construction and plotting packages
library(igraph)
## Warning: package 'igraph' was built under R version 3.5.1
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
## Warning: package 'ggraph' was built under R version 3.5.1
library("NLP")
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library("syuzhet")
## Warning: package 'syuzhet' was built under R version 3.5.1
##
## Attaching package: 'syuzhet'
## The following object is masked from 'package:rtweet':
##
## get_tokens
library("tm")
## Warning: package 'tm' was built under R version 3.5.1
library("SnowballC")
library("stringi")
library("topicmodels")
## Warning: package 'topicmodels' was built under R version 3.5.1
library("ROAuth")
## Warning: package 'ROAuth' was built under R version 3.5.1
library(widyr)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:igraph':
##
## crossing
# Twitter app credentials ----
# Secrets must not live in source control. They are read from environment
# variables (for example set in ~/.Renviron). The values that used to be
# hard-coded here should be treated as leaked and regenerated.
appname <- "ANLY 545 Twitter data analytics"

# Return the value of a required environment variable, stopping with a clear
# error when it is unset or empty.
read_twitter_env <- function(var) {
  value <- Sys.getenv(var, unset = "")
  if (!nzchar(value)) {
    stop("Environment variable ", var, " is not set", call. = FALSE)
  }
  value
}

## api key
key <- read_twitter_env("TWITTER_CONSUMER_KEY")
## api secret
secret <- read_twitter_env("TWITTER_CONSUMER_SECRET")
# create token
twitter_token <- create_token(
  app = appname,
  consumer_key = key,
  consumer_secret = secret,
  access_token = read_twitter_env("TWITTER_ACCESS_TOKEN"),
  access_secret = read_twitter_env("TWITTER_ACCESS_SECRET")
)
library(dplyr)
library(tidyr)
# Search for up to 500 tweets matching "#job" (no retweet filter is applied
# in this call) and preview the first three rows.
job_tweets <- search_tweets(q = "#job", n = 500)
head(job_tweets, 3)
## # A tibble: 3 x 88
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 791258~ 10524169~ 2018-10-17 04:31:56 RHFAJobs This~ Caree~
## 2 464681~ 10524168~ 2018-10-17 04:31:47 weareteamt~ We'r~ Caree~
## 3 464681~ 10524023~ 2018-10-17 03:34:03 weareteamt~ Can ~ Caree~
## # ... with 82 more variables: display_text_width <dbl>,
## # reply_to_status_id <chr>, reply_to_user_id <chr>,
## # reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## # favorite_count <int>, retweet_count <int>, hashtags <list>,
## # symbols <list>, urls_url <list>, urls_t.co <list>,
## # urls_expanded_url <list>, media_url <list>, media_t.co <list>,
## # media_expanded_url <list>, media_type <list>, ext_media_url <list>,
## # ext_media_t.co <list>, ext_media_expanded_url <list>,
## # ext_media_type <chr>, mentions_user_id <list>,
## # mentions_screen_name <list>, lang <chr>, quoted_status_id <chr>,
## # quoted_text <chr>, quoted_created_at <dttm>, quoted_source <chr>,
## # quoted_favorite_count <int>, quoted_retweet_count <int>,
## # quoted_user_id <chr>, quoted_screen_name <chr>, quoted_name <chr>,
## # quoted_followers_count <int>, quoted_friends_count <int>,
## # quoted_statuses_count <int>, quoted_location <chr>,
## # quoted_description <chr>, quoted_verified <lgl>,
## # retweet_status_id <chr>, retweet_text <chr>,
## # retweet_created_at <dttm>, retweet_source <chr>,
## # retweet_favorite_count <int>, retweet_retweet_count <int>,
## # retweet_user_id <chr>, retweet_screen_name <chr>, retweet_name <chr>,
## # retweet_followers_count <int>, retweet_friends_count <int>,
## # retweet_statuses_count <int>, retweet_location <chr>,
## # retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## # place_name <chr>, place_full_name <chr>, place_type <chr>,
## # country <chr>, country_code <chr>, geo_coords <list>,
## # coords_coords <list>, bbox_coords <list>, status_url <chr>,
## # name <chr>, location <chr>, description <chr>, url <chr>,
## # protected <lgl>, followers_count <int>, friends_count <int>,
## # listed_count <int>, statuses_count <int>, favourites_count <int>,
## # account_created_at <dttm>, verified <lgl>, profile_url <chr>,
## # profile_expanded_url <chr>, account_lang <chr>,
## # profile_banner_url <chr>, profile_background_url <chr>,
## # profile_image_url <chr>
# Repeat the "#job" search with retweets excluded; the result replaces the
# previous contents of job_tweets.
job_tweets <- search_tweets(
  "#job",
  n = 500,
  include_rts = FALSE
)
# Show the first two rows
head(job_tweets, 2)
## # A tibble: 2 x 88
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 791258~ 10524169~ 2018-10-17 04:31:56 RHFAJobs This~ Caree~
## 2 464681~ 10524168~ 2018-10-17 04:31:47 weareteamt~ We'r~ Caree~
## # ... with 82 more variables: display_text_width <dbl>,
## # reply_to_status_id <chr>, reply_to_user_id <chr>,
## # reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## # favorite_count <int>, retweet_count <int>, hashtags <list>,
## # symbols <list>, urls_url <list>, urls_t.co <list>,
## # urls_expanded_url <list>, media_url <list>, media_t.co <list>,
## # media_expanded_url <list>, media_type <list>, ext_media_url <list>,
## # ext_media_t.co <list>, ext_media_expanded_url <list>,
## # ext_media_type <chr>, mentions_user_id <list>,
## # mentions_screen_name <list>, lang <chr>, quoted_status_id <chr>,
## # quoted_text <chr>, quoted_created_at <dttm>, quoted_source <chr>,
## # quoted_favorite_count <int>, quoted_retweet_count <int>,
## # quoted_user_id <chr>, quoted_screen_name <chr>, quoted_name <chr>,
## # quoted_followers_count <int>, quoted_friends_count <int>,
## # quoted_statuses_count <int>, quoted_location <chr>,
## # quoted_description <chr>, quoted_verified <lgl>,
## # retweet_status_id <chr>, retweet_text <chr>,
## # retweet_created_at <dttm>, retweet_source <chr>,
## # retweet_favorite_count <int>, retweet_retweet_count <int>,
## # retweet_user_id <chr>, retweet_screen_name <chr>, retweet_name <chr>,
## # retweet_followers_count <int>, retweet_friends_count <int>,
## # retweet_statuses_count <int>, retweet_location <chr>,
## # retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## # place_name <chr>, place_full_name <chr>, place_type <chr>,
## # country <chr>, country_code <chr>, geo_coords <list>,
## # coords_coords <list>, bbox_coords <list>, status_url <chr>,
## # name <chr>, location <chr>, description <chr>, url <chr>,
## # protected <lgl>, followers_count <int>, friends_count <int>,
## # listed_count <int>, statuses_count <int>, favourites_count <int>,
## # account_created_at <dttm>, verified <lgl>, profile_url <chr>,
## # profile_expanded_url <chr>, account_lang <chr>,
## # profile_banner_url <chr>, profile_background_url <chr>,
## # profile_image_url <chr>
# Peek at the first few values of the screen_name column
head(job_tweets[["screen_name"]])
## [1] "RHFAJobs" "weareteamtrump" "weareteamtrump" "weareteamtrump"
## [5] "USSANews" "SanofiUS_Jobs"
# Distinct screen names among the collected tweets
unique(job_tweets[["screen_name"]])
## [1] "RHFAJobs" "weareteamtrump" "USSANews"
## [4] "SanofiUS_Jobs" "dsgnjbs" "EmploiAngouleme"
## [7] "SpectrumCommLLC" "MultiCare_Jobs" "ThermoFisherJob"
## [10] "MercyJobs" "EmploiLille" "Emploi59"
## [13] "selly_zest" "rgiscareers" "ictjob_be"
## [16] "TNG_jobs" "ishnae" "luckysmktjobs"
## [19] "WalserCareers" "jobely" "JoeDRSHR"
## [22] "HVMGCareers" "Trad_Spirits" "joinphilipslght"
## [25] "E2EJobs" "MemorialCareJob" "PSJHCareers"
## [28] "ePlusJobs" "MobileMiniJobs" "dalgsllc"
## [31] "RRDJobs" "SCLocumsJobs" "GodivaJobs"
## [34] "careercast_jobs" "jcrew_hiring" "CareerCastHlth"
## [37] "ChicosCareers" "WHBMCareers" "PacDenCareers"
## [40] "ChristinaFo" "DTEEnergyJobs" "CaspersCareers"
## [43] "WestmorelandCO" "CROSSMARKJobs" "OfficeTeamJobs"
## [46] "scoularcareers" "3leads" "CompassJobBoard"
## [49] "HobbyLobbyJobs" "julesdebene" "ITJob_Columbus"
## [52] "JobSeineMarne" "MatthewCraven12" "tmj_tha_jobs"
## [55] "PlastipakJobs" "alleyagee" "JoinCellular"
## [58] "tmj_inh_finance" "TheGioCetina" "ngchunchit"
## [61] "KSV1870" "wendymanganaro" "AccountempsJobs"
## [64] "RHTechJobs" "PSJHCareersRN" "tmj_rus_it"
## [67] "tmj_sgp_banking" "tmj_ich_manuf" "tmj_hkg_banking"
## [70] "TrojanRecruit" "tmj_lka_jobs" "tmj_mng_jobs"
## [73] "mcguirl" "tmj_ide_green" "EmploiLimoges"
## [76] "StanChartJobs" "tmj_ich_banking" "tmj_inm_it"
## [79] "tmj_mys_banking" "PTGTCareers" "tmj_pak_banking"
## [82] "tmj_chs_jobs" "tmj_inh_adv" "tmj_inp_jobs"
## [85] "tmj_inm_itpm" "tmj_mmr_jobs" "tmj_ide_mgmt"
## [88] "lets_ace" "weareddstep" "tmj_uzb_jobs1"
## [91] "EmploiCoteArmor" "tmj_inh_eng" "tmj_ide_jobs"
## [94] "franknaval" "HlthcareJobsite" "tmj_rus_jobs"
## [97] "tmj_ich_finance" "tmj_vnm_banking" "PGJobs"
## [100] "tmj_asi_jobs" "tmj_inc_banking" "anselmbradford"
## [103] "PMPConnect" "tmj_inc_itpm" "tmj_inb_jobs"
## [106] "seanchiggins" "tmj_inh_jobs" "tmj_inm_jobs"
## [109] "tmj_ich_eng" "tmj_ich_legal" "tmj_ich_jobs"
## [112] "tmj_chs_pharm" "tmj_hkg_jobs" "tmj_ide_it"
## [115] "tmj_inm_cler" "tmj_inc_jobs" "Rengineeringjob"
## [118] "tmj_ich_itjava" "tmj_inb_finance" "tmj_inh_itdb"
## [121] "WorkWithSHC" "tmj_inm_legal" "tmj_inb_eng"
## [124] "tmj_inn_jobs" "tmj_ing_jobs" "tmj_ide_itqa"
## [127] "tmj_inm_adv" "tmj_phi_jobs" "tmj_inh_mgmt"
## [130] "tmj_inm_manuf" "tmj_inh_banking" "tmj_ich_cler"
## [133] "tmj_mys_jobs" "JoeFranscella" "tmj_inc_it"
## [136] "tmj_tha_banking" "tmj_inh_itpm" "EGonzalezHaas"
## [139] "JJamin" "tmj_ich_it" "BarrazaChico"
## [142] "winklerdaniel" "tmj_vnm_jobs" "tmj_ide_itdb"
## [145] "tmj_inh_cstsrv" "tmj_inm_mgmt" "tmj_uae_green"
## [148] "JCI_Jobs" "JaySangra" "nature_careers"
## [151] "recruiter_sj" "tmj_ide_acct" "tmj_inm_hr"
## [154] "tmj_ich_sales" "tmj_brn_jobs" "tmj_ich_mgmt"
## [157] "tmj_kaz_jobs1" "tmj_twn_jobs" "tmj_rus_itqa"
## [160] "tmj_inm_finance" "tmj_ide_media" "tmj_inb_mgmt"
## [163] "tmj_ich_hr" "tmj_inh_sales" "tmj_ide_hr"
## [166] "tmj_inh_it" "phogg96" "tmj_ide_itpm"
## [169] "tmj_chn_banking" "BakerHughesJobs" "tmj_ast_jobs"
## [172] "tmj_chs_recruit" "tmj_inb_hr" "MPajemolin"
## [175] "cititrendsjobs" "tmj_inm_banking" "tmj_ide_sales"
## [178] "tmj_inm_writing" "MikeP_Reed" "farmwork23"
## [181] "tmj_inm_itqa" "design2perform" "EmploiMontpel"
## [184] "EmploiHerault" "JobsCaithness" "tmj_nzw_jobs"
## [187] "PNPersonnel" "EmploiAisne" "MarketSourceJob"
## [190] "Tyrex18943336" "EmploiEpinal" "tombdugan"
## [193] "EmploiPau" "ajstravlin" "emploibelfort"
## [196] "EmploiVendee" "DelNorthCareers" "DanielWaite_NOW"
## [199] "EmploiAquitaine" "pherlha_aguilar" "AAIHS"
## [202] "EmploiBourgogne" "masterartisantz" "RICHMEGAJOBS"
## [205] "0crat" "LiveRecruitment" "LauraGLiveRec"
## [208] "FYTEasily" "MortimerBell" "NadiaLiveRec"
## [211] "LipsonLloydJ" "ManpowerSG" "BarefootStudent"
## [214] "denisempratt" "FourSeasonsJobs" "tmj_nzc_jobs"
## [217] "karotex1" "kunalism" "immiultimate"
## [220] "ADSWCareers" "tmj_hon_green" "blue_red_orange"
## [223] "RSchrishuhn" "WDCourse" "LocalWorkCa"
## [226] "PSIPax" "WayneArmstrong" "tmj_syd_mgmt"
## [229] "MH_Pigis" "tmj_syd_sales" "RivCoJobs"
## [232] "tmj_mel_sales" "tmj_mel_it" "JobYamamon"
## [235] "tmj_syd_cstsrv" "Mariah_Ismail" "tmj_inm_realest"
## [238] "RMSI_jobs" "tmj_syd_finance" "tmj_mel_itdb"
## [241] "voltsgjobs" "ULG_Trades" "tmj_mel_itpm"
## [244] "tmj_syd_manuf" "ultabeautyjobs" "tmj_syd_it"
## [247] "mashia6" "DriveMelton" "GoldenCorralJob"
## [250] "tmj_syd_legal" "CoxPurtellJobs" "BarSpire"
## [253] "RAHomesJobs" "tmj_mel_itqa" "AssignRecruit"
## [256] "ZurichNACareers" "tmj_mel_jobs" "VanaVana2200000"
## [259] "LLU_Careers" "tmj_mel_cstsrv" "tmj_auc_jobs"
## [262] "JimGiammatteo" "CSGICareers" "MyHubIntranet"
## [265] "tmj_mnp_jobs" "tmj_syd_recruit" "COC_Careers"
## [268] "SimoneMahedy" "gmtpeople" "Job2Grow"
## [271] "artisanupdates" "tmj_syd_hr" "tmj_syd_itpm"
## [274] "Beth_Finger" "tmj_mel_pharm" "IQVIAcareers"
## [277] "smcb03" "tmj_syd_retail" "tmj_syd_itdb"
## [280] "tmj_syd_hrta" "inzejob" "t_rendezvous"
## [283] "tmj_gum_jobs" "PandaCareers" "tmj_kor_jobs"
## [286] "Bluisooner" "WorkHendersonNV" "EdHuntr"
## [289] "tmj_syd_cler" "tmj_inh_itqa" "ThePiagentini"
## [292] "peoplebankjobs" "attCAREERS" "JobWindow_Jobs"
## [295] "snaphuntjobs" "musStiforp" "shah_saharsh"
## [298] "TicknerRobin" "rachelebitte" "danridesharleys"
## [301] "StuartGHazell" "tmj_jpn_jobs" "danispeck"
## [304] "LMartin_TX" "2CHRONICLES_714" "Ben_ServiceNow"
## [307] "Lasvegasacs" "SatansXwife" "tmj_ide_itjava"
## [310] "IamEdzM" "majawashington" "TinaLOwens"
## [313] "NursempJobs" "Zhaopin_com" "langly511"
## [316] "LisaJRamos" "RitaBrue" "Lee_James_FL"
## [319] "ChristaMcCabe" "EbohAjeroh" "flowmotor"
## [322] "mkhan004_" "CCSF_Cyber_Club" "RecruitByMark"
## [325] "fuzirbarry" "c_cs" "mj_kernan"
## [328] "find_me_in_cali" "interviewgig" "ZWDcom"
## [331] "MktgJobForce" "tmj_ndo_jobs1" "IIS_Delivers"
## [334] "ITJobs_IL" "Mike_McCown" "aciperski"
## [337] "JesusValdesMX" "matoysumayao" "Kryokelt"
## [340] "MarraBeppe" "tmj_mdv_jobs" "carpalwatch"
## [343] "parityconsult" "EssityCareers" "JennBennetSE"
## [346] "hopeobaker" "thebeebles" "lauraholliday"
## [349] "h0v1k" "tmj_inb_it" "gym_guide"
## [352] "alg_talent" "vmorgangipson" "p2pBianca"
## [355] "tmj_inm_sales" "tmj_HI_EDU" "tmj_HI_ACCT"
## [358] "tmj_mau_acct" "Labs83" "tmj_mau_retail"
# Search for up to 500 user accounts matching the "#job" topic.
# NOTE(review): the query used to be the literal string "job_tweets" (the
# name of the tweets variable in quotes), which looked up accounts matching
# that text rather than the #job topic analysed above -- confirm the intent.
users <- search_users("#job", n = 500)
## Searching for users...
## Finished collecting users!
# Bar chart of how many matched users report each location.
# `location` is the x aesthetic; coord_flip() only changes where it is drawn,
# so the x label must describe the location and the y label the count.
users %>%
  ggplot(aes(x = location)) +
  geom_bar() +
  coord_flip() +
  labs(x = "Location",
       y = "Count",
       title = "Twitter users ")
# Horizontal bar chart of the most frequent user locations: count users per
# location, order the bars by count, drop rows containing NA, keep the top 20.
users %>%
  dplyr::count(location, sort = TRUE) %>%
  mutate(location = reorder(location, n)) %>%
  na.omit() %>%
  top_n(20) %>%
  ggplot(aes(location, n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Twitter users by City/Region ",
    x = "Location",
    y = "Count"
  )
## Selecting by n
# Strip links from the tweet text. The pattern matches "http://" or
# "https://" followed by non-whitespace, so only the link itself is removed.
# (The earlier "http.*" pattern deleted everything after the first link, and
# the follow-up "https.*" substitution could never match anything.)
job_tweets$stripped_text <- gsub("https?://\\S+", "", job_tweets$text)
# Split the stripped text into one word per row in a column named `word`
job_tweets_clean <- job_tweets %>%
  dplyr::select(stripped_text) %>%
  unnest_tokens(word, stripped_text)
# Score every token with syuzhet's get_nrc_sentiment()
mysentiment_job <- get_nrc_sentiment(job_tweets_clean$word)
# Sum each sentiment column and reshape the totals into a two-column data
# frame: the sentiment name and its total Score
Sentimentscores_job <- colSums(mysentiment_job)
Sentimentscores_job <- data.frame(
  sentiment = names(Sentimentscores_job),
  Score = unname(Sentimentscores_job)
)
# Bar chart of the total score per sentiment (legend hidden)
ggplot(Sentimentscores_job, aes(x = sentiment, y = Score)) +
  geom_col(aes(fill = sentiment)) +
  theme(legend.position = "none") +
  labs(
    x = "Sentiments",
    y = "scores",
    title = "Sentiments of people behind the tweets on job"
  )
# Search for up to 500 English-language "#bossday" tweets, excluding
# retweets, then preview the text of the first few.
bossDay_tweets <- search_tweets(
  q = "#bossday",
  n = 500,
  lang = "en",
  include_rts = FALSE
)
head(bossDay_tweets[["text"]])
## [1] "Who run the world? MOMS <U+0001F30D> Happy #BossDay Mamas! <U+0001F495><U+0001F931><U+0001F495> @Regran_ed from @itsAmandaAcosta <U+0001F4F8> #momlife #mommyblogger @ San Diego, California https://t.co/AWrGjn6Xla"
## [2] "WHERE MY REAL FRIENDS AT?? I didnt get any love on this #bossDay. Yall know Im a BOSS <U+0001F644>"
## [3] "Today is #NationalBossDay. We'd like to recognize our boss, Jim Marshall, the Director of @_911TI_. His leadership makes it a joy to serve at #911TI. #BossDay #boss #911dispatcher #911dispatchers https://t.co/APCey30vOa"
## [4] "You may find it hard to sort your boss, for everything else there is Kangaro. Bosses can be inspiring, charming, encouraging...Tag your most memorable boss till date.#BossDay https://t.co/6CR3Qqw8SW"
## [5] "Yo happy boss day to @maggiej_5! Thanks for believing in us and empowering us with the tools necessary to empower our people. You the champ that runs the camp! #bossday"
## [6] "#BossDay Thankful to be part of the Macy Family! @MacyEagles #MacyEagles #LJSD #JoyfulLeaders https://t.co/BMIB7Jidae"
# Strip links from the tweet text. The pattern matches "http://" or
# "https://" followed by non-whitespace, so only the link itself is removed.
# (The earlier "http.*" pattern deleted everything after the first link, and
# the follow-up "https.*" substitution could never match anything.)
bossDay_tweets$stripped_text <- gsub("https?://\\S+", "", bossDay_tweets$text)
# Split the stripped text into one word per row in a column named `word`
bossDay_tweets_clean <- bossDay_tweets %>%
  dplyr::select(stripped_text) %>%
  unnest_tokens(word, stripped_text)
# Plot the 15 most frequent words as a horizontal bar chart.
# `word` is the x aesthetic and `n` the y aesthetic; coord_flip() only changes
# where they are drawn, so x is labelled with the words and y with the count.
# The earlier xlab(NULL) was dropped: labs(x = ...) overrode it anyway.
bossDay_tweets_clean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")
## Selecting by n
# Keep only the tokens that have no match in the stop_words table
bossDay_tweets_clean <- anti_join(bossDay_tweets_clean, stop_words)
## Joining, by = "word"
# Horizontal bar chart of the 15 most frequent remaining words, ordered by
# how often each occurs.
bossDay_tweets_clean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Count of unique words found in tweets",
    subtitle = "Stop words removed from the list",
    x = "Unique words",
    y = "Count"
  )
## Selecting by n
# Tokenise the URL-stripped tweet text into 2-grams (pairs of adjacent words),
# one pair per row in the paired_words column
bossDay_tweets_paired_words <- unnest_tokens(
  dplyr::select(bossDay_tweets, stripped_text),
  paired_words,
  stripped_text,
  token = "ngrams",
  n = 2
)
# Frequency of each word pair, most common first
dplyr::count(bossDay_tweets_paired_words, paired_words, sort = TRUE)
## # A tibble: 7,838 x 2
## paired_words n
## <chr> <int>
## 1 thank you 78
## 2 day to 77
## 3 happy bossday 69
## 4 boss's day 64
## 5 to our 57
## 6 boss day 53
## 7 bosss day 52
## 8 bossday to 46
## 9 you for 42
## 10 to all 40
## # ... with 7,828 more rows
# Split each word pair on the space into word1 and word2
bossDay_tweets_separated_words <- tidyr::separate(
  bossDay_tweets_paired_words,
  paired_words,
  into = c("word1", "word2"),
  sep = " "
)
# Keep only the pairs in which neither word appears in stop_words$word
bossDay_tweets_filtered <- dplyr::filter(
  bossDay_tweets_separated_words,
  !(word1 %in% stop_words$word),
  !(word2 %in% stop_words$word)
)
# Recount the remaining pairs, most frequent first, and preview the result
bossDay_words_counts <- dplyr::count(
  bossDay_tweets_filtered,
  word1,
  word2,
  sort = TRUE
)
head(bossDay_words_counts)
## # A tibble: 6 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 happy bossday 69
## 2 boss's day 64
## 3 boss day 53
## 4 bosss day 52
## 5 bossday happy 36
## 6 happy bosss 36
# Network plot of the word pairs that occur at least 24 times: the pair
# counts become a graph (word1 -> word2), drawn with the "fr" layout, with
# edge transparency and width both mapped to the count n.
bossDay_words_counts %>%
  dplyr::filter(n >= 24) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
  labs(
    title = "Word Network: #bossday",
    subtitle = "Text mining twitter data ",
    x = "",
    y = ""
  )