This document includes examples of the Social Media Analysis capabilities of ContextBase. “Social Media Analysis” is the use of Natural Language Processing techniques to determine the effects of text-based posts on Social Media. This can uncover hidden textual characteristics within posts that affect public perceptions.
# Build a term-frequency profile for a sample of general tweets and plot the
# ten most frequent terms.
randomSample <- read.csv("randomSample2016.csv")
# head() rather than [1:2000]: a file with fewer rows must not add NA documents.
randomTweets_corpus <- Corpus(VectorSource(head(randomSample$tweet, 2000)))
# Base R functions are wrapped in content_transformer() so every document
# stays a tm text document; no trailing PlainTextDocument repair pass is needed.
# The ASCII conversion runs first because tolower() errors on invalid
# multibyte input; bytes that cannot be converted become spaces.
randomTweets_corpus <- tm_map(
  randomTweets_corpus,
  content_transformer(function(x) iconv(x, to = "ASCII", sub = " "))
)
randomTweets_corpus <- tm_map(randomTweets_corpus, content_transformer(tolower))
randomTweets_corpus <- tm_map(randomTweets_corpus, removeNumbers)
randomTweets_corpus <- tm_map(randomTweets_corpus, removePunctuation)
randomTweets_corpus <- tm_map(randomTweets_corpus, stripWhitespace)
randomTweets_corpus <- tm_map(randomTweets_corpus, removeWords,
                              stopwords("english"))
random_dtm <- DocumentTermMatrix(randomTweets_corpus)
randomTweets_matrix <- as.matrix(random_dtm)
# Total occurrences of each term across all sampled tweets.
randomF <- colSums(randomTweets_matrix)
# head() keeps at most ten terms and never pads with NA.
randomFtopten <- head(sort(randomF, decreasing = TRUE), 10)
randomFtopten <- data.frame(Words = names(randomFtopten),
                            Freq = as.numeric(randomFtopten))
ggplot(randomFtopten, aes(x = reorder(Words, -Freq), y = Freq)) +
  geom_bar(stat = "identity", col = "orange", fill = "lightblue") +
  labs(x = "Words", y = "Frequency", title = "Top Ten Random Sample Words")
An examination of whether Tweets about the Company are of random topics, or if they contain specific information.
# Twitter API credentials are read from environment variables (for example
# set in ~/.Renviron) so that no secret is stored in this file.
# NOTE(review): the literal keys that used to be written here should be
# treated as exposed and revoked.
consumer_key <- Sys.getenv("TWITTER_CONSUMER_KEY")
consumer_secret <- Sys.getenv("TWITTER_CONSUMER_SECRET")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET")
# Sys.getenv() returns "" for an unset variable; fail early with a clear
# message instead of sending empty credentials to the API.
if (!all(nzchar(c(consumer_key, consumer_secret,
                  access_token, access_secret)))) {
  stop(
    "Twitter credentials missing: set TWITTER_CONSUMER_KEY, ",
    "TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_SECRET.",
    call. = FALSE
  )
}
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
# Fetch up to 1000 English tweets matching 'CVS_Extra', build their
# term-frequency profile, and plot the ten most frequent (stemmed) terms.
publicTweets <- searchTwitter('CVS_Extra', n = 1000, lang = "en")
publicTweets_df <- twListToDF(publicTweets)
cvsPublicTweets_corpus <- Corpus(VectorSource(publicTweets_df$text))
# Base R functions are wrapped in content_transformer() so every document
# stays a tm text document; no trailing PlainTextDocument repair pass is needed.
# sub = " " replaces bytes that cannot be converted; without it iconv() turns
# the whole tweet into NA.
cvsPublicTweets_corpus <- tm_map(
  cvsPublicTweets_corpus,
  content_transformer(function(x) iconv(x, to = "ASCII", sub = " "))
)
cvsPublicTweets_corpus <- tm_map(cvsPublicTweets_corpus, removeNumbers)
cvsPublicTweets_corpus <- tm_map(cvsPublicTweets_corpus, removePunctuation)
cvsPublicTweets_corpus <- tm_map(cvsPublicTweets_corpus, stripWhitespace)
cvsPublicTweets_corpus <- tm_map(cvsPublicTweets_corpus,
                                 content_transformer(tolower))
cvsPublicTweets_corpus <- tm_map(cvsPublicTweets_corpus, removeWords,
                                 stopwords("english"))
cvsPublicTweets_corpus <- tm_map(cvsPublicTweets_corpus, stemDocument)
cvs_dtm <- DocumentTermMatrix(cvsPublicTweets_corpus)
cvsPublicTweets_matrix <- as.matrix(cvs_dtm)
# Total occurrences of each term across all fetched tweets.
cvsPublicF <- colSums(cvsPublicTweets_matrix)
# head() keeps at most ten terms and never pads with NA.
cvsPublicF_topten <- head(sort(cvsPublicF, decreasing = TRUE), 10)
cvsPublicF_topten <- data.frame(Words = names(cvsPublicF_topten),
                                Freq = as.numeric(cvsPublicF_topten))
ggplot(cvsPublicF_topten, aes(x = reorder(Words, -Freq), y = Freq)) +
  geom_bar(stat = "identity", col = "orange", fill = "lightblue") +
  labs(x = "Words", y = "Frequency", title = "Top Ten Public Tweet Words")
# Tabulate the first five associated terms (correlation limit 0.2) for three
# terms of interest in the public-tweet document-term matrix.
wordAssoc1 <- data.frame(findAssocs(cvs_dtm, "today", 0.2))
wordAssoc2 <- data.frame(findAssocs(cvs_dtm, "cvs", 0.2))
wordAssoc3 <- data.frame(findAssocs(cvs_dtm, "target", 0.2))
assocTable <- data.frame(matrix(nrow = 5, ncol = 6))
# NOTE(review): the value columns hold findAssocs() correlations, although
# they are labelled "frequency"; labels kept so the rendered table is unchanged.
names(assocTable) <- c("today", "frequency",
                       "cvs", "frequency",
                       "target", "frequency")
# Every source is cut to exactly five rows: indexing with 1:5 truncates longer
# results and pads shorter ones with NA, so assignment into the 5-row table
# cannot fail on a length mismatch.
assocTable[, 1] <- rownames(wordAssoc1)[1:5]
assocTable[, 2] <- wordAssoc1[1:5, 1]
assocTable[, 3] <- rownames(wordAssoc2)[1:5]
assocTable[, 4] <- wordAssoc2[1:5, 1]
assocTable[, 5] <- rownames(wordAssoc3)[1:5]
assocTable[, 6] <- wordAssoc3[1:5, 1]
kable(assocTable, caption = "Tweet Word Association")
| today | frequency | cvs | frequency | target | frequency |
|---|---|---|---|---|---|
| checked | 0.98 | extra | 0.37 | boycott | 0.35 |
| download | 0.98 | pharmacy | 0.32 | shop | 0.31 |
| mplusplaces | 0.98 | cash | 0.29 | bottles | 0.30 |
| just | 0.95 | earn | 0.29 | eden | 0.27 |
| cvsextra | 0.45 | mysurvey | 0.29 | family | 0.27 |
# Pool the random-sample and public-CVS document-term matrices, total each
# term over the pooled documents, and chart the ten most frequent terms.
Alltweets_dtm <- c(random_dtm, cvs_dtm)
Alltweets_matrix <- as.matrix(Alltweets_dtm)
AlltweetsF <- colSums(Alltweets_matrix)
# The ten largest totals; the terms arrive as row names and are moved into a
# proper column next.
AlltweetsF_topten <- data.frame(sort(AlltweetsF, decreasing = TRUE)[1:10])
AlltweetsF_topten <- setNames(
  data.frame(rownames(AlltweetsF_topten), AlltweetsF_topten[, 1]),
  c("Words", "Freq")
)
# Only terms seen more than once are drawn.
ggplot(
  AlltweetsF_topten[with(AlltweetsF_topten, Freq > 1), ],
  aes(x = reorder(Words, -Freq), y = Freq)
) +
  geom_bar(stat = "identity", colour = "orange", fill = "lightblue") +
  xlab("Words") +
  ylab("Frequency") +
  ggtitle("Top Ten Random+Public Tweet Words")
# Chi-squared test of whether term frequencies depend on the tweet source
# (public CVS tweets vs. the random sample).
# rbind() on the raw vectors would pair counts by position and recycle the
# shorter vector, so a column would mix unrelated terms. Both vectors are
# aligned on the union of their term names instead, with 0 for terms a source
# never used.
# NOTE(review): the public tweets are stemmed and the random sample is not, so
# some terms that should match may still land in separate columns.
NCombined <- local({
  terms <- union(names(cvsPublicF), names(randomF))
  counts <- matrix(
    0,
    nrow = 2, ncol = length(terms),
    dimnames = list(c("cvsPublicF", "randomF"), terms)
  )
  counts["cvsPublicF", names(cvsPublicF)] <- cvsPublicF
  counts["randomF", names(randomF)] <- randomF
  counts
})
chisq.test(NCombined)
##
## Pearson's Chi-squared test
##
## data: NCombined
## X-squared = 17760, df = 8309, p-value < 2.2e-16
As the p-value (< 2.2e-16) is much smaller than the .05 significance level, we reject the null hypothesis that term frequencies are independent of the tweet source; that is, tweets about @CVS_Extra use a different mix of words than general tweets posted to Twitter.
# Fetch up to 1000 tweets from the CVS_Extra timeline, build their
# term-frequency profile, and plot the ten most frequent (stemmed) terms.
cvsCompanytweets <- userTimeline('CVS_Extra', n = 1000)
cvsCompanytweets_df <- twListToDF(cvsCompanytweets)
cvsCompanytweets_corpus <- Corpus(VectorSource(cvsCompanytweets_df$text))
# Base R functions are wrapped in content_transformer() so every document
# stays a tm text document; no trailing PlainTextDocument repair pass is needed.
# sub = " " replaces bytes that cannot be converted; without it iconv() turns
# the whole tweet into NA.
cvsCompanytweets_corpus <- tm_map(
  cvsCompanytweets_corpus,
  content_transformer(function(x) iconv(x, to = "ASCII", sub = " "))
)
cvsCompanytweets_corpus <- tm_map(cvsCompanytweets_corpus, removeNumbers)
cvsCompanytweets_corpus <- tm_map(cvsCompanytweets_corpus, removePunctuation)
cvsCompanytweets_corpus <- tm_map(cvsCompanytweets_corpus, stripWhitespace)
cvsCompanytweets_corpus <- tm_map(cvsCompanytweets_corpus,
                                  content_transformer(tolower))
cvsCompanytweets_corpus <- tm_map(cvsCompanytweets_corpus, removeWords,
                                  stopwords("english"))
cvsCompanytweets_corpus <- tm_map(cvsCompanytweets_corpus, stemDocument)
cvsCompany_dtm <- DocumentTermMatrix(cvsCompanytweets_corpus)
cvsCompanyTweets_matrix <- as.matrix(cvsCompany_dtm)
# Total occurrences of each term across all company tweets.
cvsCompanyF <- colSums(cvsCompanyTweets_matrix)
# head() keeps at most ten terms and never pads with NA.
cvsCompanyF_topten <- head(sort(cvsCompanyF, decreasing = TRUE), 10)
cvsCompanyF_topten <- data.frame(Words = names(cvsCompanyF_topten),
                                 Freq = as.numeric(cvsCompanyF_topten))
ggplot(cvsCompanyF_topten, aes(x = reorder(Words, -Freq), y = Freq)) +
  geom_bar(stat = "identity", col = "orange", fill = "lightblue") +
  labs(x = "Words", y = "Frequency", title = "Top Ten Company Tweet Words")
# Put the public and company term counts side by side for every term that
# appears in both sources.
# The frames are built with typed columns directly: going through cbind()
# would coerce the counts to text.
cvsPublicF_df <- data.frame(Words = names(cvsPublicF),
                            Freq = as.numeric(cvsPublicF),
                            stringsAsFactors = FALSE)
cvsCompanyF_df <- data.frame(Words = names(cvsCompanyF),
                             Freq = as.numeric(cvsCompanyF),
                             stringsAsFactors = FALSE)
# merge() keeps only terms present in both frames (inner join on Words).
cvsCombinedF_df <- merge(cvsPublicF_df, cvsCompanyF_df, by = "Words")
names(cvsCombinedF_df) <- c("Words", "Public", "Company")
# Move the term into the row names so only the two count columns remain.
rownames(cvsCombinedF_df) <- cvsCombinedF_df$Words
cvsCombinedF_df <- cvsCombinedF_df[, 2:3]
head(cvsCombinedF_df, 10)
## Public Company
## accept 3 6
## account 4 7
## address 1 16
## addressed 1 4
## addressing 1 2
## allisonramsing 1 1
## along 5 43
## also 3 6
## always 4 2
## amanda 1 3
# Plot public vs. company frequencies for terms used more than ten times by
# both sources.
# as.character() first so that factor columns convert by label, not by level
# code; numeric columns pass through unchanged.
cvsCombinedF_df$Public <- as.numeric(as.character(cvsCombinedF_df$Public))
cvsCombinedF_df$Company <- as.numeric(as.character(cvsCombinedF_df$Company))
cvsCombinedF_dfnum <- cvsCombinedF_df[cvsCombinedF_df$Public > 10
                                      & cvsCombinedF_df$Company > 10, ]
# head() takes at most ten rows and never adds NA rows when fewer terms pass
# the filter. The terms are copied from the row names into a Words column so
# aes() can map data columns instead of external vectors.
# NOTE(review): these are the first ten qualifying terms in row order, not the
# ten highest counts, despite the plot title.
ggplot(
  data.frame(
    Words = rownames(head(cvsCombinedF_dfnum, 10)),
    head(cvsCombinedF_dfnum, 10)
  ),
  aes(x = Words, group = 1)
) +
  geom_line(stat = "identity", aes(y = Public, color = "Public")) +
  geom_line(stat = "identity", aes(y = Company, color = "Company")) +
  scale_color_manual(name = "Source",
                     values = c(Public = "green", Company = "blue")) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
  labs(x = "Words", y = "Frequency", title = "Top Ten Public+Company Tweet Words")
# Split the shared terms into two k-means clusters on their (Public, Company)
# counts and report the cluster sizes, centres and size ratio.
# The `##` lines are console output captured from an earlier run; cluster
# numbering may differ between runs.
cvsFnum <- cvsCombinedF_df
# kmeans() starts from randomly chosen centres; a fixed seed makes the run
# repeatable.
set.seed(2016)
cvsFnum_k3 <- kmeans(cvsFnum, centers=2)
cat("The amount of terms in Cluster 1 =", cvsFnum_k3$size[1])
## The amount of terms in Cluster 1 = 485
cat("The amount of terms in Cluster 2 =", cvsFnum_k3$size[2])
## The amount of terms in Cluster 2 = 13
cvsFnum_k3$centers
## Public Company
## 1 2.954639 8.742268
## 2 114.000000 187.384615
clusplot(cvsFnum, cvsFnum_k3$cluster, color=TRUE, shade=TRUE, labels=2, lines=0)
# Ratio of the smaller cluster to the larger one. Cluster numbers are
# arbitrary, so the sizes are picked with min()/max() rather than by index.
CompanyTweetProporation <- min(cvsFnum_k3$size) / max(cvsFnum_k3$size)
CompanyTweetProporation
## [1] 0.02680412
# Collect each company tweet's text with its retweet and favorite counts.
cvsCompanyF_decision <- data.frame(
  text = cvsCompanytweets_df$text,
  retweetCount = cvsCompanytweets_df$retweetCount,
  favoriteCount = cvsCompanytweets_df$favoriteCount
)
# Most-retweeted tweets first, then keep only tweets retweeted more than once.
cvsCompanyF_decision <- cvsCompanyF_decision[
  with(cvsCompanyF_decision, order(retweetCount, decreasing = TRUE)),
]
cvsCompanyF_decision <- cvsCompanyF_decision[
  cvsCompanyF_decision$retweetCount > 1,
]
# Model the tweet text as the response, with both counts as predictors.
form <- text ~ .
tree.2 <- rpart(formula = form, data = cvsCompanyF_decision)
# Quick plot of the fitted tree.
prp(tree.2)
By examining the individual terms within random tweets by the public, tweets about CVS by the public, and Tweets from the CVS corporate entity, it is determinable that the Public on Twitter is mostly concerned with obtaining services from CVS, and the value of buying at CVS. The public also seems to have an interest in specific campaigns, and departments, of CVS. CVS’s Company Tweets contain mostly cordial service terms.
When public tweets and CVS company tweets are examined together, it is found that cordiality terms are emphasized more by CVS corporate, and practicality terms are emphasized more by the Public, even though both sides of the conversation make use of both types of terms within tweets.
Kmeans Clustering of all tweets shows that CVS company tweet terms are a very small set of the terms used by the Public. The examination of CVS Social Media terms ends with a three-factor Decision Tree analysis of the Company tweets. This analysis separates the most retweeted Twitter posts by retweet count and favorite count. The tweets that are retweeted often, and then favorited often, usually refer to CVS’s marketing of flu shot services during flu season.