Let's begin our analysis: I will load the sentiment lexicon file and then look up the user FieloWW on Twitter.
setwd("D:/Cursos/PreparacaoCarreiraCientista/R-Bigdata/Projeto1")
polaridades <- read_csv('parte2/polaridades_pt.csv')
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret);
## [1] "Using direct authentication"
tweets<-userTimeline('FieloWW', n=1000)
df <- twListToDF(tweets)
colnames(df)
## [1] "text" "favorited" "favoriteCount" "replyToSN"
## [5] "created" "truncated" "replyToSID" "id"
## [9] "replyToUID" "statusSource" "screenName" "retweetCount"
## [13] "isRetweet" "retweeted" "longitude" "latitude"
The colnames command gives us a basic idea of the data Twitter returns; at this point we already know we can analyze useful fields such as dates, retweet counts, favorites and so on. I will now store this data so we can analyze it later.
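The saving step itself is not shown here; a minimal sketch, assuming we simply persist the data frame to disk (the file name is illustrative):
saveRDS(df, 'parte2/fielo_tweets.rds')    # persist the raw tweets for later sessions
# df <- readRDS('parte2/fielo_tweets.rds')  # reload when needed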
epCorpus <- corpus(df, text_field='text')
We created an object called a corpus, which holds the post texts and all the document variables. From the corpus we will build a Document-Feature Matrix, from which we will extract the most frequently used tags in the tweets.
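The construction of tweet_dfm used below is not shown in the original; a minimal sketch, assuming quanteda's default tokenizer, which keeps @mentions and #hashtags intact:
tweet_dfm <- dfm(epCorpus, remove_punct = TRUE)  # document-feature matrix over the tweet corpus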
user_dfm <- dfm_select(tweet_dfm, ('@*'))
topuser <- names(topfeatures(user_dfm, 50))
head(topuser,20)
## [1] "@dreamforce" "@fielo" "@nuvelloresearch"
## [4] "@nlbmda" "@capgemini_aust" "@akram_gargouri"
## [7] "@tonyrobbins" "@qz"
Fielo does not mention many users in its tweets, but let's plot a network graph anyway so we can see it visually.
user_fcm <- fcm(user_dfm, tri = F)
user_fcm <- fcm_select(user_fcm, topuser)
textplot_network(user_fcm, min_freq = 0.1, edge_color = 'orange', edge_alpha = 0.8, edge_size = 5)
The only relationship we notice is @tonyrobbins with @dreamforce itself.
Let's now look at the words most used in Fielo's posts.
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)  # helper to strip URLs (not used below; dfm_remove handles the same pattern)
dfmwords <- dfm_remove(tweet_dfm, valuetype='regex', pattern='http[^[:space:]]*')
dfmwords <- dfm_remove(dfmwords,valuetype='regex', pattern='@([A-Za-z0-9_]+)|#([A-Za-z0-9_]+)')
dfmwords <- dfmwords %>% dfm(remove = c(stopwords(),'t.co'), remove_punct = TRUE) %>% dfm_trim(verbose = FALSE)
textplot_wordcloud(dfmwords, min.freq = 5, random.order = FALSE,
rot.per = .6,
colors = RColorBrewer::brewer.pal(8,"Dark2"))
allfeats <- textstat_frequency(dfmwords)
allfeats$feature <- with(allfeats, reorder(feature, -frequency))
ggplot(head(allfeats,20), aes(x=feature, y=frequency, fill=frequency)) + geom_bar(stat="identity") +
xlab("Termos") + ylab("Frequência") + coord_flip() +
theme(axis.text=element_text(size=7))
Identifying and scoring multi-word expressions, that is, adjacent collocations of fixed length, in the texts.
col <- textstat_collocations(epCorpus, size = 3:6, min_count = 3)
head(col)
## collocation count count_nested length lambda z
## 1 the channel https 4 4 3 5.342235 2.797346
## 2 i evaluate my resellers 3 3 4 10.131619 2.083827
## 3 $ 100 amazon card 3 3 4 9.376617 1.763005
## 4 how fielo can help 5 5 4 5.663995 1.458012
## 5 our channel incentive 3 3 3 3.598598 1.392040
## 6 do i evaluate my 3 3 4 6.665004 1.323673
#col <- with(col, reorder(collocation, count))
ggplot(col[order(col$count, decreasing = T),][1:25,],
aes(x=reorder(collocation,count), y=factor(count), fill=factor(count))) + geom_bar(stat="identity") +
xlab("Expressões") + ylab("Frequência") + coord_flip() +
theme(axis.text=element_text(size=7))
tokenInfo <- summary(epCorpus)
tokenInfo <- tokenInfo %>%
mutate(month = format(created, "%m"),year =format(created, "%Y")) %>%
group_by(month,year) %>%
arrange(created) %>%
summarise(total = n())
ggplot(data=tokenInfo[1:12,], aes(x =month, y = total, fill=factor(year))) +
geom_line() + geom_bar(stat="identity") + ylab('Posts') + xlab('Mês')
What happened in June, July and August?
gg <- ggplot(summary(epCorpus)[1:100,], aes(x = as.Date(yday(created), "1970-01-01"), y = Tokens,
color = factor(year(created))))
gg +geom_line() +
scale_x_date(date_breaks="months", date_labels="%b") +
labs(x="Mês",colour="")
library(maps)
user <- getUser("FieloWW")
friends <- user$getFriends()
followers <- user$getFollowers()
followers_df = rbindlist(lapply(followers,as.data.frame))
followers_df<-subset(followers_df, location!="")
#Install key package helpers:
source("https://raw.githubusercontent.com/LucasPuente/geocoding/master/geocode_helpers.R")
#Install modified version of the geocode function
#(that now includes the api_key parameter):
source("https://raw.githubusercontent.com/LucasPuente/geocoding/master/modified_geocode.R")
geocode_apply<-function(x){
geocode(x, source = "google", output = "all", api_key="YOUR_GOOGLE_API_KEY")  # use your own Google Maps Geocoding API key here
}
followers_df$location[followers_df$location %like% 'NATURAL HEALTH'] <- 'NATURAL HEALTH FB GROUP'
geocode_results<-sapply(followers_df$location, geocode_apply, simplify = F)
condition_a <- sapply(geocode_results, function(x) x["status"]=="OK")
geocode_results<-geocode_results[condition_a]
condition_b <- lapply(geocode_results, lapply, length)
condition_b2<-sapply(condition_b, function(x) x["results"]=="1")
geocode_results<-geocode_results[condition_b2]
source("https://raw.githubusercontent.com/LucasPuente/geocoding/master/cleaning_geocoded_results.R")
results_b<-lapply(geocode_results, as.data.frame)
#Then, to simplify things, you should extract out only the columns you need to generate a map of this data.
results_c<-lapply(results_b,function(x) subset(x, select=c("results.formatted_address",
"results.geometry.location")))
results_d<-lapply(results_c,function(x) data.frame(Location=x[1,"results.formatted_address"],
lat=x[1,"results.geometry.location"],
lng=x[2,"results.geometry.location"]))
results_e<-rbindlist(results_d)
results_e$username <- geocode_results$name
results_e$Location <- sapply(results_e$Location, function(x) readr::parse_character(x, locale = readr::locale('pt')))
results_f<-results_e[,Original_Location:=names(results_d)]
require(maps)
require(leaflet)
leaflet() %>%
addTiles() %>%
addCircles(data = results_f, lat = ~ lat, lng = ~ lng, popup = results_f$Location, label=results_f$Original_Location)
Plotting a kwic object produces a lexical dispersion plot that lets us visualize the occurrences of particular terms throughout the text. We call these plots "x-rays" because of their resemblance to the data produced by Amazon's "X-Ray" feature for Kindle books.
head(kwic(epCorpus, pattern = c('management') ))
##
##  [text4, 15]  from our research on incentive | management | best practices. https:
##  [text39, 9]  streamline your lead and opportunity | management | https:// t.co
##  [text44, 9]  need CRM( Customer Relationship | Management | ) https://
##  [text58, 5]  How to nail inventory | management | in the manufacturing industry https
##  [text66, 9]  to optimize lead and opportunity | management | https:// t.co
##  [text77, 8]  struggling with retail and inventory | management | ? Don't worry- there
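The dispersion plot itself is not shown above; a minimal sketch, assuming the quanteda version used here still exports textplot_xray:
textplot_xray(kwic(epCorpus, pattern = 'management'))  # one strip per document, with a tick at each occurrence of the term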
#epCorpus[epCorpus$documents$texts %like% 'saúde']
head(textstat_readability(epCorpus),1)
## ARI ARI.simple Bormuth Bormuth.GP Coleman Coleman.C2
## text1 9.388571 59.35714 -1.594821 10157024 29.12143 36.90714
## Coleman.Liau Coleman.Liau.grade Coleman.Liau.short Dale.Chall
## text1 35.63231 13.30055 13.30095 -38.245
## Dale.Chall.old Dale.Chall.PSK Danielson.Bryan Danielson.Bryan.2
## text1 19.9473 15.443 6.40738 73.09
## Dickes.Steiwer DRP ELF Farr.Jenkins.Paterson Flesch
## text1 -338.9482 259.4821 5 -41.33693 43.09179
## Flesch.PSK Flesch.Kincaid FOG FOG.PSK FOG.NRI FORCAST
## text1 6.847333 9.857381 9.914286 4.230882 -0.0825 12.14286
## FORCAST.RGL Fucks Linsear.Write LIW nWS nWS.2 nWS.3
## text1 11.78714 57 6 34.30952 6.255362 6.43091 5.118707
## nWS.4 RIX Scrabble SMOG SMOG.C SMOG.simple SMOG.de Spache
## text1 5.0158 2.5 1.726415 10.12576 9.940668 9.708204 4.708204 10.1295
## Spache.old Strain Traenkle.Bailer Traenkle.Bailer.2 Wheeler.Smith
## text1 10.9195 5.7 -355.6764 -316.9866 50
## meanSentenceLength meanWordSyllables
## text1 10.5 1.809524
s2 <- textstat_simil(dfmwords, c("manufacturing", "management"), method = "cosine",
margin = "features")
head(as.matrix(s2), 10)
## manufacturing management
## manufacturing 1.0000000 0.1360828
## management 0.1360828 1.0000000
## visit 0.0000000 0.0000000
## nyc 0.0000000 0.0000000
## today 0.0000000 0.0000000
## learn 0.0000000 0.0000000
## platform 0.0000000 0.0000000
## allows 0.0000000 0.0000000
## organizations 0.0000000 0.0000000
## run 0.0000000 0.0000000
Another interesting feature is that we can group terms by topic. I will demonstrate the idea with two toy categories, 'production' and 'values'; it is not a functional classifier, but it shows the kind of resource we can explore.
myDict <- dictionary(list(production = c('partners','management', 'manufacturing','manufacturers','sales' ),
values = c('help','program','guide','loyalty','power','gain', 'learn')))
dfmGroup <- epCorpus %>% dfm(remove = c(stopwords(),'t.co'), dictionary = myDict,
remove_punct = TRUE) %>% dfm_trim(verbose = FALSE)
head(dfmGroup,20)
## Document-feature matrix of: 20 documents, 2 features (67.5% sparse).
## 20 x 2 sparse Matrix of class "dfm"
## features
## docs production values
## text1 0 1
## text2 0 1
## text3 0 0
## text4 1 0
## text5 0 1
## text6 0 2
## text7 0 2
## text8 0 3
## text9 0 0
## text10 1 0
## text11 0 1
## text12 0 0
## text13 0 0
## text14 0 1
## text15 0 1
## text16 0 0
## text17 0 0
## text18 0 0
## text19 1 0
## text20 1 0
feats <- textstat_frequency(dfmGroup)
feats$feature <- with(feats, reorder(feature, -frequency))
ggplot(feats, aes(x=feature, y=frequency, fill=feature)) + geom_bar(stat="identity") +
xlab("Termos") + ylab("Frequência") + coord_flip() +
theme(axis.text=element_text(size=7))
Since Fielo only posts in English, we will use two sentiment lexicons to score the posts:
* AFINN from Finn Årup Nielsen (http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010)
* bing from Bing Liu and collaborators (https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html)
> The main difference is that AFINN scores each word from -5 to 5, while bing simply classifies words as positive or negative.
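A quick way to see that structural difference between the two lexicons (column names as in the tidytext version used here, where AFINN exposes a numeric score column):
head(get_sentiments("afinn"))  # word + score, ranging from -5 to 5
head(get_sentiments("bing"))   # word + sentiment, "positive" or "negative"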
fielodf <- dplyr::tibble(word =df$text, date=df$created, id=rownames(df))
fielotokens <- fielodf %>% unnest_tokens(word,word)
fielotokens
## # A tibble: 2,149 x 3
## word date id
## <chr> <dttm> <chr>
## 1 visit 2017-12-14 13:22:35 1
## 2 fielo 2017-12-14 13:22:35 1
## 3 153 2017-12-14 13:22:35 1
## 4 at 2017-12-14 13:22:35 1
## 5 the 2017-12-14 13:22:35 1
## 6 salesforcetour 2017-12-14 13:22:35 1
## 7 in 2017-12-14 13:22:35 1
## 8 nyc 2017-12-14 13:22:35 1
## 9 today 2017-12-14 13:22:35 1
## 10 learn 2017-12-14 13:22:35 1
## # ... with 2,139 more rows
Let's remove unnecessary words from the texts and keep only the words that appear in the sentiment lexicons.
fielotokensclean <- fielotokens %>% anti_join(data.frame(word=quanteda::stopwords()))
fielotokensclean <- fielotokensclean %>% anti_join(data.frame(word=c('t.co','rt','de')))  # chain from the stopword-cleaned tokens so both filters apply
sentJoin <- fielotokensclean %>%
inner_join(get_sentiments("bing"), by='word')
afinnJoin <- fielotokensclean %>%
inner_join(get_sentiments("afinn"), by='word')
head(sentJoin)
## # A tibble: 6 x 4
## word date id sentiment
## <chr> <dttm> <chr> <chr>
## 1 best 2017-10-31 18:04:09 4 positive
## 2 loyalty 2017-10-28 20:14:10 5 positive
## 3 gain 2017-10-03 16:18:37 6 positive
## 4 win 2017-10-03 16:18:37 6 positive
## 5 gain 2017-09-26 20:58:27 7 positive
## 6 win 2017-09-26 20:58:27 7 positive
sentJoin %>%
count(sentiment) %>%
ggplot(aes(sentiment,n , fill = sentiment)) +
geom_bar(stat = "identity", show.legend = FALSE)
afinnJoin %>%
count(score) %>%
ggplot(aes(score,n , fill = score)) +
geom_bar(stat = "identity", show.legend = FALSE)
What were the frequencies of positive and negative words?
sents <- sentJoin %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0)
comparison.cloud(sents,colors = c("#F8766D", "#00BFC4"),max.words = 100)
afinns <- afinnJoin %>%
count(word, score, sort = TRUE) %>%
acast(word ~ score, value.var = "n", fill = 0)
comparison.cloud(afinns,colors = c("#F8766D", "#00BFC4"),max.words = 100)
bottom10tw <- head(sentJoin %>%
count(id, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(score = positive - negative) %>%
arrange(score),8)['id']
fielodf %>% filter(id %in% as.vector(bottom10tw$id))
## # A tibble: 8 x 3
## word date id
## <chr> <dttm> <chr>
## 1 How to run a killer email marketing campaign ~ 2016-09-30 15:18:33 46
## 2 Smaller tech vendors: here's how to tackle yo~ 2016-08-03 15:13:11 67
## 3 The trick to increasing cross sell and up sel~ 2016-08-01 18:05:59 68
## 4 Tech vendors: here’s what you’re struggling w~ 2016-07-27 19:59:21 70
## 5 Are you struggling with retail and inventory ~ 2016-07-11 20:21:52 77
## 6 Are you struggling with retail and inventory ~ 2016-07-11 15:19:17 78
## 7 The Fielo team @Dreamforce gearing up for the~ 2014-10-13 16:55:25 132
## 8 Worrying...Beijing just sent a chilling messa~ 2014-10-01 16:20:19 136
bottom10tw <- head(afinnJoin %>%
group_by(id, score) %>%
summarise(total=sum(score)) %>%
arrange(total),8)['id']
fielodf %>% filter(id %in% as.vector(bottom10tw$id))
## # A tibble: 7 x 3
## word date id
## <chr> <dttm> <chr>
## 1 Smaller tech vendors: here's how to tackle yo~ 2016-08-03 15:13:11 67
## 2 Tech vendors: here’s what you’re struggling w~ 2016-07-27 19:59:21 70
## 3 Why you’ve got gamification all wrong https:/~ 2016-07-21 15:14:09 73
## 4 Are you struggling with retail and inventory ~ 2016-07-11 20:21:52 77
## 5 Are you struggling with retail and inventory ~ 2016-07-11 15:19:17 78
## 6 Tech vendors struggle with brand compliance -~ 2016-07-08 15:11:16 79
## 7 Worrying...Beijing just sent a chilling messa~ 2014-10-01 16:20:19 136
toptw <- head(sentJoin %>%
count(id, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(score = positive - negative) %>%
arrange(desc(score)),8)['id']
fielodf %>% filter(id %in% as.vector(toptw$id))
## # A tibble: 8 x 3
## word date id
## <chr> <dttm> <chr>
## 1 Take our Channel Incentive & Loyalty Prog~ 2017-09-22 18:54:29 8
## 2 "REWARDING QUICKLY: THE KEY TO A SUCCESSFUL I~ 2016-03-30 15:49:52 122
## 3 "Ready to start building a World Class Channe~ 2016-03-29 17:45:14 123
## 4 Our 4 most valuable pieces of advice for desi~ 2016-03-16 15:37:32 125
## 5 Wanna learn how to create an amazing Loyalty ~ 2016-03-07 19:23:30 127
## 6 Wise words from @tonyrobbins presenting at @D~ 2014-10-13 23:03:24 131
## 7 We'll be @Dreamforce & would love to meet~ 2014-10-07 14:41:58 135
## 8 Excellent presentation by #Carbonite on 5 Mus~ 2014-09-18 18:59:28 137
And the most positive ones (afinn)?
toptw <- head(afinnJoin %>%
group_by(id, score) %>%
summarise(total=sum(score)) %>%
arrange(desc(total)),8)['id']
fielodf %>% filter(id %in% as.vector(toptw$id))
## # A tibble: 8 x 3
## word date id
## <chr> <dttm> <chr>
## 1 Take our Channel Incentive Program Survey for~ 2017-10-03 16:18:37 6
## 2 Take our Channel Incentive Program Survey for~ 2017-09-26 20:58:27 7
## 3 How Fielo can help you improve reseller brand~ 2017-05-05 15:01:49 15
## 4 "A FUN WAY TO UNITE AND ENGAGE YOUR CHANNEL P~ 2016-04-04 18:34:09 120
## 5 Wanna learn how to create an amazing Loyalty ~ 2016-03-07 19:23:30 127
## 6 Been a fantastic journey working with the tea~ 2015-05-15 09:19:35 128
## 7 We'll be @Dreamforce & would love to meet~ 2014-10-07 14:41:58 135
## 8 Excellent presentation by #Carbonite on 5 Mus~ 2014-09-18 18:59:28 137
With this simple analysis we already get good results from the AFINN lexicon. Note that if this were an opinion survey, or even a real-time streaming analysis, for example using Spark with alerts generated by trained machine-learning models, we could potentially reach detection accuracy above 90%. The volume of data is growing at a staggering pace, and the tools that Data Science gives us are here to stay; or rather, here to dominate.