DATA: The four data files contain the most common words from the last four quarters of tweets mentioning Donald Trump. The two columns in each file are word and frequency.
DATA FILES: Trump_word_count_Q2_2015-WF1k-20160601110814.csv, Trump_word_count_Q3_2015-WF1k-20160601110415.csv, Trump_word_count_Q4_2015-WF1k-20160601105516.csv, Trump_word_count-Q1_2016.csv
First, we read in the data and do some light cleaning.
#rm(list=ls(all=TRUE))
setwd("/Users/pockeystar/Desktop/test/")
#install packages
#install.packages("plyr", "dplyr", "ggplot2", "wordcloud", "tm", "stringr", "SnowballC", "slam", "xlsx")
#read in data
q2_2015 <- read.csv("/Users/pockeystar/Desktop/test/Trump_word_count_Q2_2015-WF1k-20160601110814.csv",
                    header=FALSE, sep="\t", col.names=c("word", "freq"), stringsAsFactors=FALSE)
q3_2015 <- read.csv("/Users/pockeystar/Desktop/test/Trump_word_count_Q3_2015-WF1k-20160601110415.csv",
                    header=FALSE, sep="\t", col.names=c("word", "freq"), stringsAsFactors=FALSE)
q4_2015 <- read.csv("/Users/pockeystar/Desktop/test/Trump_word_count_Q4_2015-WF1k-20160601105516.csv",
                    header=FALSE, sep="\t", col.names=c("word", "freq"), stringsAsFactors=FALSE)
#the Q1 2016 file contains stray quote characters, so we strip them all
#with "sed" on the command line and save the result as a new _new.csv file:
#LC_CTYPE=C sed 's/\"//g' Trump_word_count-Q1_2016.csv > Trump_word_count-Q1_2016_new.csv
q1_2016 <- read.csv("/Users/pockeystar/Desktop/test/Trump_word_count-Q1_2016_new.csv",
                    header=FALSE, sep="\t", col.names=c("word", "freq"), stringsAsFactors=FALSE)
#show the first few rows of the data
head(q2_2015, 5)
##       word freq
## 1     save  125
## 2 duration  122
## 3     info  121
## 4       06  101
## 5 marriage   81
str(q1_2016)
## 'data.frame':    5000 obs. of  2 variables:
##  $ word: chr  "em" "not" "washington" "court" ...
##  $ freq: int  324 148 142 138 133 127 120 105 104 103 ...
Sorting the data reveals that the same word can appear several times within one data set, so I decided to merge those duplicate entries. (Arguably the cleanest way to further explore and extract word information would be to build a corpus, but a simple aggregation is enough here.) The next step is to combine the four data files into one so the time series can be examined.
#install.packages("plyr", "dplyr", "ggplot2")
library(plyr)
library(dplyr)
library(ggplot2)
#collapse repeated words by summing their frequencies with aggregate
q2_2015 <- aggregate(freq ~ word, data = q2_2015, sum)
q3_2015 <- aggregate(freq ~ word, data = q3_2015, sum)
q4_2015 <- aggregate(freq ~ word, data = q4_2015, sum)
q1_2016 <- aggregate(freq ~ word, data = q1_2016, sum)
#to examine the trend through time, put all four data frames into one and analyze them together.
q23_2015 <- merge(q2_2015, q3_2015, by="word", all=TRUE)
#edit(q23_2015)
q41_20156 <- merge(q4_2015, q1_2016, by="word", all=TRUE)
q1_data <- merge(q23_2015, q41_20156, by="word", all=TRUE)
colnames(q1_data) <- c("word", "Q2", "Q3", "Q4", "Q1")
So far we have a data file we can actually work with to analyze the time series and draw inferences. Checking the data shows a few minor things that could still be improved: for example, some words beginning with an apostrophe could be merged with their plain counterparts (a possible fix is sketched after the plots below). Since most of these appear rarely, they have little impact on the trend analysis. The first step of the analysis is to find the words that appear most frequently over the whole year using sort, and to plot them as a bar chart of frequencies as well as a word cloud.
#sum the whole year, try to focus on the most frequent ones
q1_data$total <- rowSums(q1_data[, 2:5], na.rm = TRUE)
sort_data <- q1_data %>% arrange(desc(total))
head(sort_data, 20) #around 900
##                    word  Q2    Q3  Q4  Q1 total
##  1                 span  NA 14453  NA  NA 14453
##  2                class  NA  4861  78   3  4942
##  3 msofootnotereference  NA  4032  NA  NA  4032
##  4                 href  NA  2042 108 133  2283
##  5                  div  NA  2208  10  NA  2218
##  6                 name  19  2098  11   9  2137
##  7               rights  33  1996  38  59  2126
##  8                  not 110  1193 200 189  1692
##  9                women  10  1473  82   6  1571
## 10                   li  NA  1540  NA  NA  1540
## 11                their  45  1231 157  87  1520
## 12                  was 117  1030 240  95  1482
## 13                  sex  86  1290  23   5  1404
## 14                human   5  1373   8   2  1388
## 15               sexual   2  1070  NA  NA  1072
## 16                   id   2  1054   2   4  1062
## 17                watch  22   935  15   6   978
## 18                  hiv  NA   960  NA  NA   960
## 19             violence  12   889   5   3   909
## 20                 aids  NA   906  NA  NA   906
#plot the words that appeared more than 1000 times over the year
p1 <- ggplot(subset(sort_data, total>1000), aes(word, total))
p1 <- p1 + geom_bar(stat="identity")
p1 <- p1 + theme(axis.text.x=element_text(angle=45, hjust=1))
p1
#word cloud
#install.packages("wordcloud")
library(wordcloud)
#set colors
pal <- brewer.pal(9,"YlGnBu")
pal <- pal[-(1:4)]
set.seed(12345)
wordcloud(sort_data$word, freq=sort_data$total, max.words=100, colors=pal)
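The apostrophe issue mentioned above is left untouched in the original analysis. A minimal sketch of how such words could be normalized before re-aggregating (normalize_words is a hypothetical helper, not part of the original code):
#strip leading straight or curly apostrophes so that e.g. "'trump" and "trump" are counted together
normalize_words <- function(df) {
  df$word <- sub("^['\u2019]+", "", df$word)
  aggregate(freq ~ word, data = df, sum)
}
#q2_2015 <- normalize_words(q2_2015)   #and likewise for the other three quarters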
Since there are only four time points and a single file (Q3 2015) dominates the frequencies, there is no point in building a prediction or time-series model on these data. It is more important to examine the validity and reliability of the data given the sampling strategy: for example, were the words extracted from a very narrow time window, which could explain why so many entries (HTML fragments such as "span", "class" and "href") carry little useful information? In any case, the data show no sign of a relevant trend.
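How lopsided the quarters are can be verified directly by summing each quarter's column (a small check, not part of the original script):
#total word frequency contributed by each quarter's file
colSums(q1_data[, c("Q2", "Q3", "Q4", "Q1")], na.rm = TRUE)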
DATA: A sample of last month's tweets in which Donald Trump or Hillary Clinton was mentioned.
DATA FILES: TrumpClinton.csv.zip
First, read the unzipped data file into R and clean out the Twitter handles.
#first unzip the data file and look at it briefly
#read in data
q2_data <- read.csv("/Users/pockeystar/Desktop/test/TrumpClinton.csv",
                    header=TRUE, sep=";", stringsAsFactors=FALSE)
#install.packages("tm", "stringr")
library(tm)
library(stringr)
#sentiment analysis
#First, let's remove the Twitter handles (i.e., @username) from the tweet texts
#so we can just have the real English words.
q2_data$clean <- str_replace_all(q2_data$content, "@\\w+", "")
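The comment above announces a sentiment analysis, but the script stops after cleaning out the handles. One way the cleaned tweets could be scored, assuming the syuzhet package is available (an assumption; it is not used elsewhere in this analysis):
library(syuzhet)
#score each cleaned tweet with the Bing lexicon; positive values indicate positive tone
q2_data$sentiment <- get_sentiment(q2_data$clean, method = "bing")
summary(q2_data$sentiment)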
Next we apply some standard natural language processing: create a corpus, build a document-term matrix, and use it to examine word frequencies and associations between words.
#We will turn the data into a "corpus", a collection of documents
#containing natural language text that the tm text mining package knows how to deal with.
library(SnowballC)
q2_dataCorpus <- Corpus(VectorSource(q2_data$clean))
#transform all characters to lower case
q2_dataCorpus <- tm_map(q2_dataCorpus, content_transformer(tolower))
#convert the documents back to plain text documents
q2_dataCorpus <- tm_map(q2_dataCorpus, PlainTextDocument)
#remove all punctuation and stopwords. Stopwords are commonly used words in the English language such as I, me, my, etc. You can see the full list of stopwords using stopwords('english').
q2_dataCorpus <- tm_map(q2_dataCorpus, removePunctuation)
q2_dataCorpus <- tm_map(q2_dataCorpus, removeWords, stopwords('english'))
#stemming
q2_dataCorpus <- tm_map(q2_dataCorpus, stemDocument)
#library(wordcloud)
wordcloud(q2_dataCorpus, max.words = 40, random.order = FALSE, colors = rainbow(50))
dtm <- DocumentTermMatrix(q2_dataCorpus)
dtm
## <<DocumentTermMatrix (documents: 125002, terms: 115362)>>
## Non-/sparse entries: 1238532/14419242192
## Sparsity           : 100%
## Maximal term length: 95
## Weighting          : term frequency (tf)
#dtm is extremely sparse because of the high dimensionality of the data, so we focus on the
#most frequent terms and drop the sparsest part of the matrix. A threshold of 0.9999 keeps
#a matrix that is at most 99.99% empty.
dtm <- removeSparseTerms(dtm, 0.9999)
dtm
## <<DocumentTermMatrix (documents: 125002, terms: 6196)>>
## Non-/sparse entries: 1062576/773449816
## Sparsity           : 100%
## Maximal term length: 40
## Weighting          : term frequency (tf)
library(slam)
#freq <- row_sums(dtm, na.rm = T)
#one way to look at the frequency
freq <- sort(col_sums(dtm), decreasing=TRUE)
head(freq, 40)
## trump clinton donald hillari support
## 99845 31780 21765 18544 11441
## vote will https<U+00E2>\u0080<U+0161> amp say
## 7066 6871 6575 6536 6509
## judg via like presid obama
## 5870 5360 5222 5126 4869
## just get attack sander ralli
## 4853 4547 4461 3851 3740
## media peopl san call univers
## 3699 3678 3612 3548 3498
## berni dont democrat make american
## 3481 3430 3346 3277 3268
## jose win trump2016 now california
## 3234 3102 2965 2954 2938
## can endors protest america want
## 2929 2901 2899 2834 2825
findFreqTerms(dtm, lowfreq=3000)
TRUE [1] "american" "amp" "attack" "berni"
TRUE [5] "call" "clinton" "democrat" "donald"
TRUE [9] "dont" "get" "hillari" "https<U+00E2>\u0080<U+0161>"
TRUE [13] "jose" "judg" "just" "like"
TRUE [17] "make" "media" "obama" "peopl"
TRUE [21] "presid" "ralli" "san" "sander"
TRUE [25] "say" "support" "trump" "univers"
TRUE [29] "via" "vote" "will" "win"
#plot the freq
library(ggplot2)
wf <- data.frame(word=names(freq), freq=freq)
p <- ggplot(subset(wf, freq>3000), aes(word, freq))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p
#Term Correlations
findAssocs(dtm, c("trump", "clinton"), corlimit=0.1) # specifying a correlation limit of 0.1
## $trump
## donald univers judg support
## 0.23 0.14 0.12 0.12
##
## $clinton
## hillari sander bill democrat nomin foundat clinch email
## 0.44 0.23 0.19 0.19 0.19 0.16 0.15 0.15
## deleg puerto rico poll
## 0.14 0.13 0.13 0.10
#Hierarchical clustering and k-means clustering do not really work on such a sparse matrix
#library(cluster)
#dtmss <- removeSparseTerms(dtm, 0.999) # a smaller matrix that is at most 99.9% empty space
#but it does not look very promising in this example
#d <- dist(t(dtm), method="euclidean")
#fit <- hclust(d=d, method="ward.D")
#fit
#d <- dist(t(dtmss), method="euclidean")
#kfit <- kmeans(d, 2)
#clusplot(as.matrix(d), kfit$cluster, color=T, shade=T, labels=2, lines=0)
#tdmat <- as.matrix(removeSparseTerms(tdm, sparse=0.99))
#distMatrix <- dist(scale(tdmat))
#fit <- hclust(distMatrix, method="ward.D2")
#sparse matrix clustering does not work well either
#library(sparcl)
#perm.out <- HierarchicalSparseCluster.permute(tdmat, wbounds=c(1.5,2:6),
#                                              nperms=5)
#sparsehc <- HierarchicalSparseCluster(dists=perm.out$dists,
#                                      wbound=perm.out$bestw, method="complete")
Unfortunately, most clustering methods fail here because of the huge dimensionality.
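One workaround, sketched below (not part of the original analysis), is to cluster only the most frequent terms rather than the full matrix, reusing the 3000-occurrence cutoff from the frequency plot above:
#hierarchical clustering restricted to the most frequent terms
top_terms <- findFreqTerms(dtm, lowfreq = 3000)
keep <- which(colnames(dtm) %in% top_terms)
m_top <- as.matrix(dtm[, keep])            #documents x ~32 terms, small enough to densify
d <- dist(t(m_top), method = "euclidean")  #distances between term vectors
fit <- hclust(d, method = "ward.D2")
plot(fit, main = "Clustering of frequent terms")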
Looking at the term correlations, terms such as “univers”, “judg” and “support” are relatively highly correlated with Donald Trump, while “sander”, “bill”, “democrat”, “nomin”, “foundat”, “email”, “deleg”, “puerto”, “rico” and “poll” are highly associated with Hillary Clinton. Going through each word: on the Trump side, “univers” can refer either to “Trump University”, an entrepreneurial initiative dressed in university clothes, or to “universal health care”, the hot health-care topic that every American cares about. “judg” can refer to Trump’s sister Maryanne Trump Barry, a senior United States circuit judge, or to the judges in the lawsuits Trump has faced. “support” is rather straightforward. On the Clinton side, “sander” is the other Democratic candidate and her major competitor, Bernie Sanders, and “bill” is her husband and former US president Bill Clinton, who also plays a crucial role in her campaign. “democrat”, “nomin”, “poll” and “deleg” are ordinary political words referring to the Democratic Party, the nomination, polls and delegates. “foundat” refers to the Clinton Foundation, founded by Bill Clinton and concerned with health care, climate, development and so on. “puerto” and “rico” refer to Puerto Rico, a US territory on which Clinton made a statement and where she claimed victory last week. “email” refers to the infamous Hillary Clinton email scandal published by WikiLeaks, part of which is now available as a data-mining dataset on Kaggle.
DATA: The data file contains quarterly trend data from 2000 to 2014. Each row gives the name and size of a trend; the columns are quarters.
DATA FILES: Trends 2000-2014.xlsx
library(xlsx)
library(tseries)
library(lsr)
#read in data
q3_data <- read.xlsx("/Users/pockeystar/Desktop/test/Trends_2000-2014.xlsx", sheetIndex=1, sheetName=NULL, rowIndex=NULL,
startRow=NULL, endRow=NULL, colIndex=NULL,
as.data.frame=TRUE, header=TRUE, colClasses=NA)
rownames(q3_data) <- make.names(q3_data[, 1], unique=TRUE)
q3_data <- q3_data[, -1]
To look for potentially popular words, we test each word's time series for stationarity and plot the words with the strongest evidence of non-stationarity.
#2. time series analysis
#transpose
q3_data.T <- tFrame(q3_data)
#test stationarity of each word's series with the augmented Dickey-Fuller test
ht <- sapply(q3_data.T, adf.test, alternative="stationary", k=1)
#select the series that are not stationary, i.e. those with p-values larger than 0.01
ht <- data.frame(ht)
ht.T <- tFrame(ht)
Intre <- ht.T[ which(ht.T$p.value > 0.01), ]
High_Intre <- ht.T[ which(ht.T$p.value > 0.5), ]
include <- row.names(High_Intre)
#There are only 7 words that we are interested in.
#[1] "meaningful" "peer.to.peer.lending" "crowdfunding"
#[4] "entertainment" "streaming" "yoga"
#[7] "crypto.currency"
#select back the data of high interest and plot them
q3_data_High_Intre <- q3_data[include, ]
q3_data_High_Intre.T <- tFrame(q3_data_High_Intre)
#to avoid fussing with date formats, we simply use 1 to 60 to represent the 60 quarters
q3_data_High_Intre.T2 <- data.frame(Quarter = 1:60,
meaningful = q3_data_High_Intre.T$meaningful,
peer.to.peer.lending = q3_data_High_Intre.T$peer.to.peer.lending,
crowdfunding = q3_data_High_Intre.T$crowdfunding,
entertainment = q3_data_High_Intre.T$entertainment,
streaming = q3_data_High_Intre.T$streaming,
yoga = q3_data_High_Intre.T$yoga,
crypto.currency = q3_data_High_Intre.T$crypto.currency)
p1 <- ggplot() +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = meaningful, color = "red", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = entertainment, color = "blue", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = peer.to.peer.lending, color = "black", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = crowdfunding, color = "orange", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = streaming, color = "yellow", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = yoga, color = "grey", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = crypto.currency, color = "green", group = 1)) +
xlab('Quarter') +
ylab('Word_Fre')
p1
#to focus on newly rising words, leave out "meaningful" and "entertainment"
p2 <- ggplot() +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = peer.to.peer.lending, color = "black", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = crowdfunding, color = "orange", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = streaming, color = "yellow", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = yoga, color = "grey", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = crypto.currency, color = "green", group = 1)) +
xlab('Quarter') +
ylab('Word_Fre')
p2
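The two plots above add one geom_line call per series and put literal colour names inside aes(), which maps a constant label rather than setting the actual line colour. A more compact alternative (a sketch, assuming the reshape2 package is available) reshapes the data to long format and lets ggplot assign a colour per word:
library(reshape2)
#long format: one row per (Quarter, word) pair
long <- melt(q3_data_High_Intre.T2, id.vars = "Quarter",
             variable.name = "word", value.name = "freq")
ggplot(long, aes(x = Quarter, y = freq, color = word)) +
  geom_line() +
  xlab("Quarter") +
  ylab("Word_Fre")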
From these figures we can see a tremendous rise in words such as “peer to peer lending”, “crowdfunding”, “streaming”, “yoga” and “crypto currency”, all of which boom towards the end of the period, around 2014.