Synopsis

This document includes examples of the Social Media Analysis capabilities of ContextBase. “Social Media Analysis” is the use of Natural Language Processing techniques to determine the effects of text-based posting to Social Media. This can uncover hidden textual characteristics within posts that affect public perceptions.

Analysis of the language used in Twitter Micro-Blogs:

A random sample of tweets is read into the programming environment:

# Load the previously collected random sample of tweets from the working
# directory into a data frame.
randomSample <- read.csv(file = "randomSample2016.csv")

A document-term matrix of term frequencies is constructed:

# Build a corpus from (at most) the first 2000 sampled tweets. head() avoids
# the NA padding that `[1:2000]` produces when the sample is shorter, and
# as.character() guards against the column having been read as a factor.
randomTweets_corpus <- Corpus(VectorSource(
  as.character(head(randomSample$tweet, 2000))
))

# Normalise the text. Plain character functions (tolower, iconv) are wrapped
# in content_transformer() so every document keeps its class; this replaces
# the former trailing tm_map(..., PlainTextDocument) workaround.
randomTweets_corpus <- tm_map(randomTweets_corpus,
                              content_transformer(tolower))
# Characters that cannot be represented in ASCII are replaced by a space.
randomTweets_corpus <- tm_map(
  randomTweets_corpus,
  content_transformer(function(x) iconv(x, to = "ASCII", sub = " "))
)
randomTweets_corpus <- tm_map(randomTweets_corpus, removeNumbers)
randomTweets_corpus <- tm_map(randomTweets_corpus, removePunctuation)
randomTweets_corpus <- tm_map(randomTweets_corpus, stripWhitespace)
randomTweets_corpus <- tm_map(randomTweets_corpus, removeWords,
                              stopwords("english"))

# Rows are tweets, columns are terms, cells are term counts.
random_dtm <- DocumentTermMatrix(randomTweets_corpus)

Using a vector of term frequencies, the frequency of each term in the random Twitter sample is determined, and the top 10 words are displayed:

# Total number of occurrences of every term across the random-sample tweets.
randomTweets_matrix <- as.matrix(random_dtm)
randomF <- colSums(randomTweets_matrix)

# Keep the ten most frequent terms. head() simply returns fewer entries when
# the vocabulary has fewer than ten terms, where `[1:10]` would pad with NA.
random_top <- head(sort(randomF, decreasing = TRUE), 10)
randomFtopten <- data.frame(Words = names(random_top),
                            Freq = as.vector(random_top))

# Bar chart with the bars ordered from most to least frequent term.
ggplot(randomFtopten, aes(x = reorder(Words, -Freq), y = Freq)) +
  geom_bar(stat = "identity", col = "orange", fill = "lightblue") +
  labs(x = "Words", y = "Frequency", title = "Top Ten Random Sample Words")

Analysis of Company Image on Twitter:

An examination of whether Tweets about the Company are of random topics, or if they contain specific information.

Access Twitter API, and search for 1000 Tweets about the Company:

# Twitter API credentials are read from environment variables (for example
# set in ~/.Renviron) instead of being hard-coded, so that secrets are never
# stored in or published with this document. Any credentials that were
# previously embedded here should be revoked and regenerated.
consumer_key <- Sys.getenv("TWITTER_CONSUMER_KEY")
consumer_secret <- Sys.getenv("TWITTER_CONSUMER_SECRET")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET")

# Sys.getenv() returns "" for an unset variable; fail early with a clear
# message rather than with an opaque authentication error.
twitter_credentials <- c(consumer_key, consumer_secret,
                         access_token, access_secret)
if (any(!nzchar(twitter_credentials))) {
  stop("Set TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET, ",
       "TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_SECRET before running.",
       call. = FALSE)
}
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)

# Search for up to 1000 English-language tweets matching the Company handle.
publicTweets <- searchTwitter("CVS_Extra", n = 1000, lang = "en")
publicTweets_df <- twListToDF(publicTweets)

A document-term matrix of Public Tweets, (about the Company), is constructed:

# Corpus of the text of the public tweets about the Company.
cvsPublicTweets_corpus <- Corpus(VectorSource(publicTweets_df$text))

# Convert to ASCII, replacing unconvertible characters with a space. Without
# `sub`, iconv() returns NA for any tweet containing such a character, so the
# whole tweet would be lost. Plain character functions are wrapped in
# content_transformer() so documents keep their class; this replaces the
# former trailing tm_map(..., PlainTextDocument) workaround.
cvsPublicTweets_corpus <- tm_map(
  cvsPublicTweets_corpus,
  content_transformer(function(x) iconv(x, to = "ASCII", sub = " "))
)
cvsPublicTweets_corpus <- tm_map(cvsPublicTweets_corpus, removeNumbers)
cvsPublicTweets_corpus <- tm_map(cvsPublicTweets_corpus, removePunctuation)
cvsPublicTweets_corpus <- tm_map(cvsPublicTweets_corpus, stripWhitespace)
cvsPublicTweets_corpus <- tm_map(cvsPublicTweets_corpus,
                                 content_transformer(tolower))
cvsPublicTweets_corpus <- tm_map(cvsPublicTweets_corpus, removeWords,
                                 stopwords("english"))
cvsPublicTweets_corpus <- tm_map(cvsPublicTweets_corpus, stemDocument)

# Rows are tweets, columns are (stemmed) terms, cells are term counts.
cvs_dtm <- DocumentTermMatrix(cvsPublicTweets_corpus)

Using a vector of term frequencies (within Public Tweets about the Company), the frequency of each term in the Public Twitter sample is determined, and the top 10 words are displayed:

# Total number of occurrences of every term across the public tweets.
cvsPublicTweets_matrix <- as.matrix(cvs_dtm)
cvsPublicF <- colSums(cvsPublicTweets_matrix)

# Keep the ten most frequent terms. head() returns fewer entries when fewer
# than ten terms exist, where `[1:10]` would pad the result with NA.
public_top <- head(sort(cvsPublicF, decreasing = TRUE), 10)
cvsPublicF_topten <- data.frame(Words = names(public_top),
                                Freq = as.vector(public_top))

# Bar chart with the bars ordered from most to least frequent term.
ggplot(cvsPublicF_topten, aes(x = reorder(Words, -Freq), y = Freq)) +
  geom_bar(stat = "identity", col = "orange", fill = "lightblue") +
  labs(x = "Words", y = "Frequency", title = "Top Ten Public Tweet Words")

Words associated with the top ten words are explored:

# Terms associated with each selected word, using 0.2 as the lower
# correlation limit passed to findAssocs().
wordAssoc1 <- data.frame(findAssocs(cvs_dtm, "today", 0.2))
wordAssoc2 <- data.frame(findAssocs(cvs_dtm, "cvs", 0.2))
wordAssoc3 <- data.frame(findAssocs(cvs_dtm, "target", 0.2))

# Five-row summary table: for each word, its associated terms next to the
# value reported by findAssocs().
assocTable <- data.frame(matrix(nrow = 5, ncol = 6))
names(assocTable) <- c("today", "frequency",
                       "cvs", "frequency",
                       "target", "frequency")

# Every word is limited to its first five associations (padded with NA when
# there are fewer). The "today" columns used to be assigned untruncated,
# which fails unless exactly five associations are found.
first_five <- seq_len(5)
assocTable[, 1] <- rownames(wordAssoc1)[first_five]
assocTable[, 2] <- wordAssoc1[first_five, 1]
assocTable[, 3] <- rownames(wordAssoc2)[first_five]
assocTable[, 4] <- wordAssoc2[first_five, 1]
assocTable[, 5] <- rownames(wordAssoc3)[first_five]
assocTable[, 6] <- wordAssoc3[first_five, 1]
kable(assocTable, caption = "Tweet Word Association")

Tweet Word Association
today	frequency	cvs	frequency	target	frequency
checked	0.98	extra	0.37	boycott	0.35
download	0.98	pharmacy	0.32	shop	0.31
mplusplaces	0.98	cash	0.29	bottles	0.30
just	0.95	earn	0.29	eden	0.27
cvsextra	0.45	mysurvey	0.29	family	0.27

The Public Tweets are combined with the Random Sample of Tweets, and a Document-Term Matrix of Term Frequencies is constructed over all tweets:

# Stack the two document-term matrices and total each term over all tweets.
Alltweets_dtm <- c(random_dtm, cvs_dtm)
Alltweets_matrix <- as.matrix(Alltweets_dtm)
AlltweetsF <- colSums(Alltweets_matrix)

# Keep the ten most frequent terms. head() returns fewer entries when fewer
# than ten terms exist, where `[1:10]` would pad the result with NA.
all_top <- head(sort(AlltweetsF, decreasing = TRUE), 10)
AlltweetsF_topten <- data.frame(Words = names(all_top),
                                Freq = as.vector(all_top))

# Only terms occurring more than once are plotted, most frequent first.
ggplot(AlltweetsF_topten[AlltweetsF_topten$Freq > 1, ],
       aes(x = reorder(Words, -Freq), y = Freq)) +
  geom_bar(stat = "identity", col = "orange", fill = "lightblue") +
  labs(x = "Words", y = "Frequency",
       title = "Top Ten Random+Public Tweet Words")

The Vectors of Public Tweets, and Random Tweets, are combined to create a 2×M table:

# Build the 2 x M contingency table over the union of both vocabularies.
# rbind() on the raw frequency vectors pairs counts by position (recycling
# the shorter vector), so a column would mix counts of unrelated terms. Here
# each count is looked up by term name, and a term that a source never uses
# counts as zero for that source.
# NOTE: test statistics obtained from the earlier, unaligned table will not
# match the values produced from this one.
all_terms <- union(names(cvsPublicF), names(randomF))
NCombined <- rbind(
  cvsPublicF = unname(cvsPublicF[all_terms]),
  randomF = unname(randomF[all_terms])
)
NCombined[is.na(NCombined)] <- 0
colnames(NCombined) <- all_terms

A Chi-Squared Test for Independence, on this Table, is performed:

# Pearson chi-squared test on the two-row table, treated as a contingency
# table of tweet source (rows) by term (columns).
chisq.test(x = NCombined)

## 
##  Pearson's Chi-squared test
## 
## data:  NCombined
## X-squared = 17760, df = 8309, p-value < 2.2e-16

Conclusion of Correlation Testing:

As the p-value (< 2.2e-16) is much smaller than the .05 significance level, we reject the null hypothesis that term frequencies are independent of the tweet source; that is, the language of tweets about @CVS_Extra differs significantly from that of general tweets posted to Twitter.

Connection between Public Tweets, and Company Tweets:

Downloading the last 1000 tweets from the Company:

# Fetch up to 1000 statuses from the Company's own timeline and flatten the
# returned list of status objects into a data frame.
cvsCompanytweets <- userTimeline(user = "CVS_Extra", n = 1000)
cvsCompanytweets_df <- twListToDF(twList = cvsCompanytweets)

A document-term matrix of Company Tweets is constructed:

# Corpus of the text of the tweets posted by the Company.
cvsCompanytweets_corpus <- Corpus(VectorSource(cvsCompanytweets_df$text))

# Convert to ASCII, replacing unconvertible characters with a space. Without
# `sub`, iconv() returns NA for any tweet containing such a character, so the
# whole tweet would be lost. Plain character functions are wrapped in
# content_transformer() so documents keep their class; this replaces the
# former trailing tm_map(..., PlainTextDocument) workaround.
cvsCompanytweets_corpus <- tm_map(
  cvsCompanytweets_corpus,
  content_transformer(function(x) iconv(x, to = "ASCII", sub = " "))
)
cvsCompanytweets_corpus <- tm_map(cvsCompanytweets_corpus, removeNumbers)
cvsCompanytweets_corpus <- tm_map(cvsCompanytweets_corpus, removePunctuation)
cvsCompanytweets_corpus <- tm_map(cvsCompanytweets_corpus, stripWhitespace)
cvsCompanytweets_corpus <- tm_map(cvsCompanytweets_corpus,
                                  content_transformer(tolower))
cvsCompanytweets_corpus <- tm_map(cvsCompanytweets_corpus, removeWords,
                                  stopwords("english"))
cvsCompanytweets_corpus <- tm_map(cvsCompanytweets_corpus, stemDocument)

# Rows are tweets, columns are (stemmed) terms, cells are term counts.
cvsCompany_dtm <- DocumentTermMatrix(cvsCompanytweets_corpus)

Using a vector of term frequencies within Company Tweets, the frequency of each term in the Company Twitter sample is determined, and the top 10 words are displayed:

# Total number of occurrences of every term across the Company's tweets.
cvsCompanyTweets_matrix <- as.matrix(cvsCompany_dtm)
cvsCompanyF <- colSums(cvsCompanyTweets_matrix)

# Keep the ten most frequent terms. head() returns fewer entries when fewer
# than ten terms exist, where `[1:10]` would pad the result with NA.
company_top <- head(sort(cvsCompanyF, decreasing = TRUE), 10)
cvsCompanyF_topten <- data.frame(Words = names(company_top),
                                 Freq = as.vector(company_top))

# Bar chart with the bars ordered from most to least frequent term.
ggplot(cvsCompanyF_topten, aes(x = reorder(Words, -Freq), y = Freq)) +
  geom_bar(stat = "identity", col = "orange", fill = "lightblue") +
  labs(x = "Words", y = "Frequency", title = "Top Ten Company Tweet Words")

The Tweets about the Company are combined with the Tweets from the Company, and Kmeans Clustering is applied:

# One row per term with its total count, for each tweet source. The data
# frames are built directly from the named frequency vectors so the counts
# stay numeric; routing them through cbind() with the term names coerced
# them to text.
cvsPublicF_df <- data.frame(Words = names(cvsPublicF),
                            Freq = as.vector(cvsPublicF))

cvsCompanyF_df <- data.frame(Words = names(cvsCompanyF),
                             Freq = as.vector(cvsCompanyF))

# Inner join on the term: only words used by both the public and the Company
# are kept.
cvsCombinedF_df <- merge(cvsPublicF_df, cvsCompanyF_df, by = "Words")
names(cvsCombinedF_df) <- c("Words", "Public", "Company")

# Move the term into the row names, leaving only the two count columns.
rownames(cvsCombinedF_df) <- cvsCombinedF_df$Words
cvsCombinedF_df <- cvsCombinedF_df[, 2:3]
head(cvsCombinedF_df, 10)

##                Public Company
## accept              3       6
## account             4       7
## address             1      16
## addressed           1       4
## addressing          1       2
## allisonramsing      1       1
## along               5      43
## also                3       6
## always              4       2
## amanda              1       3

# Make sure both count columns are numeric (they may arrive as text or as
# factors, hence the as.character() step before as.numeric()).
cvsCombinedF_df$Public <- as.numeric(as.character(cvsCombinedF_df$Public))
cvsCombinedF_df$Company <- as.numeric(as.character(cvsCombinedF_df$Company))

# Terms that occur more than ten times in both sources.
cvsCombinedF_dfnum <- cvsCombinedF_df[cvsCombinedF_df$Public > 10
                                   & cvsCombinedF_df$Company > 10,]

# Plot the first ten of those terms, in the table's existing row order (they
# are not ranked by frequency). head() avoids the all-NA rows that `[1:10, ]`
# adds when fewer than ten terms pass the filter, and the aesthetics refer to
# columns of the plotted data frame instead of external vectors.
plot_terms <- head(cvsCombinedF_dfnum, 10)
plot_terms$Words <- rownames(plot_terms)

ggplot(plot_terms, aes(x = Words, group = 1)) +
  geom_line(aes(y = Public, color = "Public")) +
  geom_line(aes(y = Company, color = "Company")) +
  scale_color_manual(name = "Source",
                     values = c(Public = "green", Company = "blue")) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
  labs(x = "Words", y = "Frequency",
       title = "Top Ten Public+Company Tweet Words")

# k-means starts from randomly chosen centres, so the RNG seed is fixed to
# make the cluster assignment (including which cluster is numbered 1)
# repeatable between runs.
set.seed(2016)
cvsFnum <- cvsCombinedF_df
# Despite the `_k3` suffix in the name, two clusters are requested.
cvsFnum_k3 <- kmeans(cvsFnum, centers = 2)

# Report how many terms were assigned to the first cluster.
cat("The amount of terms in Cluster 1 =", cvsFnum_k3[["size"]][1])

## The amount of terms in Cluster 1 = 485

# Report how many terms were assigned to the second cluster.
cat("The amount of terms in Cluster 2 =", cvsFnum_k3[["size"]][2])

## The amount of terms in Cluster 2 = 13

# Show the centre of each cluster (one row per cluster, one column per
# clustered variable).
cvsFnum_k3[["centers"]]

##       Public    Company
## 1   2.954639   8.742268
## 2 114.000000 187.384615

Cluster Plot

# Plot the terms with their k-means cluster assignment. TRUE is spelled out
# because `T` is an ordinary variable that can be reassigned.
clusplot(cvsFnum, cvsFnum_k3$cluster,
         color = TRUE, shade = TRUE, labels = 2, lines = 0)

The ratio of the number of terms in Cluster 2 to the number of terms in Cluster 1:

# Reciprocal of (size of cluster 1 / size of cluster 2), i.e. how many terms
# cluster 2 holds relative to cluster 1.
cluster_sizes <- cvsFnum_k3$size
CompanyTweetProporation <- 1 / (cluster_sizes[1] / cluster_sizes[2])
CompanyTweetProporation

## [1] 0.02680412

A Classification Tree is constructed from Company tweets:

# Tweet text together with its retweet and favourite counts.
cvsCompanyF_decision <- data.frame(
  text = cvsCompanytweets_df$text,
  retweetCount = cvsCompanytweets_df$retweetCount,
  favoriteCount = cvsCompanytweets_df$favoriteCount
)

# Most retweeted tweets first.
by_retweets <- order(cvsCompanyF_decision$retweetCount, decreasing = TRUE)
cvsCompanyF_decision <- cvsCompanyF_decision[by_retweets, ]

# Keep tweets retweeted more than once. which() drops rows whose retweet
# count is NA; a bare logical index would turn them into all-NA rows.
cvsCompanyF_decision <-
  cvsCompanyF_decision[which(cvsCompanyF_decision$retweetCount > 1), ]

# Model the tweet text from all remaining columns.
# NOTE(review): the response is the tweet text itself, so each distinct tweet
# is its own outcome value -- confirm this is the intended target.
form <- text ~ .
tree.2 <- rpart(form, data = cvsCompanyF_decision)
# fast plot
prp(tree.2)

ContextBase Social Media Analysis

http://contextbase.github.io

All programming by John Akwei, ECMp ERMp Data Scientist

October 20, 2016