Task 1

The four data files contain the most common words from the last four quarters in tweets mentioning Donald Trump (the source is Twitter). The two columns in each data file are: Word and Frequency.

DATA FILES:

Q2. Are there any latent dimensions in the data?

Q3. Can you create a time series analysis and formulate any predictions based on the data?

Since there are only four time points in the data and the second quarter dominates the frequencies, there is no point in building a prediction or time series model on these data. It is also vital to further examine the validity and reliability of the data given the sampling strategy: for example, if the data were extracted over a very short time frame, many observations may carry little useful information. In any case, the data show no significant sign of any relevant trend.
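If the four quarterly files were merged, a quick check of how stable the word rankings are across quarters would back up this conclusion. Below is a minimal sketch; the file names (q1.csv to q4.csv) and the column names Word and Frequency are assumptions, since the actual file names are not listed above.

#minimal sketch with hypothetical file and column names (q1.csv-q4.csv, Word, Frequency)
files <- c("q1.csv", "q2.csv", "q3.csv", "q4.csv")
quarters <- lapply(files, read.csv, stringsAsFactors = FALSE)
#merge the four word lists on Word; words missing in a quarter get frequency 0
merged <- Reduce(function(x, y) merge(x, y, by = "Word", all = TRUE), quarters)
names(merged) <- c("Word", paste0("Q", 1:4))
merged[is.na(merged)] <- 0
#Spearman rank correlations between quarters: high values mean the ranking of
#words barely changes, i.e. there is no clear trend to forecast
cor(merged[, -1], method = "spearman")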

Task 2

DATA: A sample of last month's tweets in which Donald Trump or Hillary Clinton were mentioned.

DATA FILES: TrumpClinton.csv.zip

Q1. What relevant groups (if any) can the tweeters be divided into?

First, read the unzipped data file into R and clean out the Twitter handles.

#first unzip the data file and look at it briefly 
#read in data
q2_data <- read.csv("/Users/pockeystar/Desktop/test/TrumpClinton.csv", 
                    head=TRUE, sep=";", stringsAsFactors=FALSE)

#install.packages(c("tm", "stringr"))
library(tm)
library(stringr)
#sentiment analysis
#First, let's remove the Twitter handles (i.e., @) from the tweet texts 
#so we can just have the real English words.
q2_data$clean <- str_replace_all(q2_data$content, "@\\w+", "")

Next, we apply natural language processing techniques: we create a corpus and build a document-term matrix to examine word frequencies and associations between words.

#We will turn the data into a "corpus", a collection of documents 
#containing natural language text that the tm text mining package knows how to deal with.

library(SnowballC)
q2_dataCorpus <- Corpus(VectorSource(q2_data$clean))

#transform all characters to lower case
q2_dataCorpus <- tm_map(q2_dataCorpus, content_transformer(tolower))

#convert the documents to plain text format
q2_dataCorpus <- tm_map(q2_dataCorpus, PlainTextDocument)

#remove all punctuation and stopwords. Stopwords are commonly used English words
#such as "I", "me", "my", etc. You can see the full list using stopwords('english').
q2_dataCorpus <- tm_map(q2_dataCorpus, removePunctuation)
q2_dataCorpus <- tm_map(q2_dataCorpus, removeWords, stopwords('english'))

#stemming
q2_dataCorpus <- tm_map(q2_dataCorpus, stemDocument)

library(wordcloud)
wordcloud(q2_dataCorpus, max.words = 40, random.order = FALSE, colors = rainbow(50))

dtm <- DocumentTermMatrix(q2_dataCorpus)
dtm
## <<DocumentTermMatrix (documents: 125002, terms: 115362)>>
## Non-/sparse entries: 1238532/14419242192
## Sparsity           : 100%
## Maximal term length: 95
## Weighting          : term frequency (tf)
#dtm is a very sparse matrix because of the high dimensionality of the data,
#so we focus on the most frequent terms and drop the sparsest part of the matrix.
#Here 0.9999 keeps a matrix that is at most 99.99% empty space.
dtm <- removeSparseTerms(dtm, 0.9999)
dtm
## <<DocumentTermMatrix (documents: 125002, terms: 6196)>>
## Non-/sparse entries: 1062576/773449816
## Sparsity           : 100%
## Maximal term length: 40
## Weighting          : term frequency (tf)
library(slam)
#freq <- row_sums(dtm, na.rm = T)
#one way to look at the frequency
freq <- sort(col_sums(dtm), decreasing=TRUE)   
head(freq, 40)   
##         trump       clinton        donald       hillari       support 
##         99845         31780         21765         18544         11441 
##          vote          will         https           amp           say 
##          7066          6871          6575          6536          6509 
##          judg           via          like        presid         obama 
##          5870          5360          5222          5126          4869 
##          just           get        attack        sander         ralli 
##          4853          4547          4461          3851          3740 
##         media         peopl           san          call       univers 
##          3699          3678          3612          3548          3498 
##         berni          dont      democrat          make      american 
##          3481          3430          3346          3277          3268 
##          jose           win     trump2016           now    california 
##          3234          3102          2965          2954          2938 
##           can        endors       protest       america          want 
##          2929          2901          2899          2834          2825
findFreqTerms(dtm, lowfreq=3000) 
##  [1] "american"      "amp"           "attack"        "berni"        
##  [5] "call"          "clinton"       "democrat"      "donald"       
##  [9] "dont"          "get"           "hillari"       "https"        
## [13] "jose"          "judg"          "just"          "like"         
## [17] "make"          "media"         "obama"         "peopl"        
## [21] "presid"        "ralli"         "san"           "sander"       
## [25] "say"           "support"       "trump"         "univers"      
## [29] "via"           "vote"          "will"          "win"
#plot the freq
library(ggplot2) 
wf <- data.frame(word=names(freq), freq=freq)   
p <- ggplot(subset(wf, freq>3000), aes(word, freq))    
p <- p + geom_bar(stat="identity")   
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))   
p   

#Term Correlations
findAssocs(dtm, c("trump" , "clinton"), corlimit=0.1) # terms correlated with "trump" and "clinton" at r >= 0.1
## $trump
##  donald univers    judg support 
##    0.23    0.14    0.12    0.12 
## 
## $clinton
##  hillari   sander     bill democrat    nomin  foundat   clinch    email 
##     0.44     0.23     0.19     0.19     0.19     0.16     0.15     0.15 
##    deleg   puerto     rico     poll 
##     0.14     0.13     0.13     0.10
#Hierarchical clustering and k-means clustering don't really work for such a sparse matrix
#library(cluster)   
#dtmss <- removeSparseTerms(dtm, 0.999) # This makes a matrix that is at most 99.9% empty space.   
#but the result does not look very promising in this example
#d <- dist(t(dtm), method="euclidean")   
#fit <- hclust(d=d, method="ward.D")   
#fit   

#d <- dist(t(dtmss), method="euclidean")   
#kfit <- kmeans(d, 2)   
#clusplot(as.matrix(d), kfit$cluster, color=T, shade=T, labels=2, lines=0)   

#tdm <- TermDocumentMatrix(q2_dataCorpus)
#tdmat <- as.matrix(removeSparseTerms(tdm, sparse=0.99))
#distMatrix <- dist(scale(tdmat))
#fit <- hclust(distMatrix, method="ward.D2")

#sparse matrix clustering doesn't work well either
#library(sparcl)
#perm.out <- HierarchicalSparseCluster.permute(tdmat, wbounds=c(1.5,2:6),
#                                              nperms=5)
#sparsehc <- HierarchicalSparseCluster(dists=perm.out$dists,
#                                      wbound=perm.out$bestw, method="complete")

Unfortunately, most clustering methods fail here because of the very high dimensionality of the document-term matrix.
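One possible workaround, not attempted on the full data here, is to reduce the dimensionality first, for example with a truncated SVD (latent semantic analysis), and then cluster the tweets in the reduced space. A minimal sketch, assuming the irlba and Matrix packages are available; the number of dimensions (20) and clusters (5) are arbitrary choices for illustration.

library(Matrix)
library(irlba)
#convert the document-term matrix to a sparse Matrix without densifying it
m <- sparseMatrix(i = dtm$i, j = dtm$j, x = dtm$v, dims = dim(dtm))
#truncated SVD: keep 20 latent (LSA) dimensions
svd_fit <- irlba(m, nv = 20)
doc_vecs <- svd_fit$u %*% diag(svd_fit$d)   #document coordinates in the latent space
#k-means on the low-dimensional document vectors
set.seed(1)
km <- kmeans(doc_vecs, centers = 5, nstart = 10)
table(km$cluster)   #size of each tweet cluster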

Q2. Can you create any predictions based on the data?

Looking at the term correlations, we find that terms such as “univers”, “judg” and “support” are relatively highly correlated with Donald Trump, while “sander”, “bill”, “democrat”, “nomin”, “foundat”, “email”, “deleg”, “puerto”, “rico” and “poll” are most strongly associated with Hillary Clinton. We go through each word below.

On the Trump side, “univers” can refer either to “Trump University” or to “universal health care”: the former is an entrepreneurial venture presented as a university, and the latter is the health-care topic that every American cares about. “judg” can refer to Trump's sister Maryanne Trump Barry, a senior United States circuit judge, or to the judge in the lawsuits Trump was facing at the time. “support” is rather straightforward.

On the Clinton side, “sander” is her main competitor for the Democratic nomination, Bernie Sanders, and “bill” is her husband and former US president Bill Clinton, who also plays a crucial role in her campaign. “democrat”, “nomin”, “poll” and “deleg” are simply political words referring to the Democratic Party, the nomination, polls and delegates. “foundat” refers to the Clinton Foundation, founded by Bill Clinton, which works on health care, climate, development and related issues. “puerto” and “rico” refer to Puerto Rico, a US territory on which Clinton has made statements and where she claimed victory last week. “email” refers to the well-known email controversy surrounding Hillary Clinton; part of the released email corpus has since become a data set used in a Kaggle competition.
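These readings can be spot-checked against the cleaned tweet text itself; for example, pulling a few tweets that mention both Trump and “univers” quickly shows which sense dominates. A small sketch using only base R and the q2_data$clean column created earlier (the search patterns are illustrative):

#spot-check an interpretation: which sense of "univers" co-occurs with Trump?
trump_univ <- grepl("trump", q2_data$clean, ignore.case = TRUE) &
              grepl("univers", q2_data$clean, ignore.case = TRUE)
head(q2_data$clean[trump_univ], 5)   #eyeball a few raw tweets
#rough counts for the two candidate senses
sum(grepl("trump university", q2_data$clean, ignore.case = TRUE))
sum(grepl("universal health", q2_data$clean, ignore.case = TRUE))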

Q3. What should Trump and Clinton communicate to maximise their chances in the presidential election based on the data?

Task 3

The data file contains quarterly trend data from 2000 to 2014. Each row contains the name and the size of a trend; the columns are quarters.

DATA FILES: Trends 2000-2014.xlsx

library(xlsx)
library(tseries)
library(lsr)

#read in data
q3_data <- read.xlsx("/Users/pockeystar/Desktop/test/Trends_2000-2014.xlsx", sheetIndex=1, sheetName=NULL, rowIndex=NULL,
          startRow=NULL, endRow=NULL, colIndex=NULL,
          as.data.frame=TRUE, header=TRUE, colClasses=NA)

#use the trend names as row names and drop the name column
rownames(q3_data) <- make.names(q3_data[, 1], unique=TRUE)
q3_data <- q3_data[, -1] 

Q1. What main latent dimensions (if any) are hidden in the material?

To explore the potential dimensions underlying the data, we can perform hierarchical clustering to group the trend words based on their frequencies.

#1 clustering analysis
d <- dist(q3_data, method = "euclidean") # distance matrix
fit <- hclust(d, method="ward.D2") 
plot(fit) # display dendrogram

clusterCut <- cutree(fit, 3)
table(clusterCut)
## clusterCut
##  1  2  3 
## 49 14  3

From the dendrogram we can see that a reasonable number of clusters is 3, and most of the words fall into the first cluster. The third group consists of “experience”, “production” and “growth”; these trends are very large and do not fluctuate much. Group two consists of “meaningful”, “saving”, “sharing”, “sustainability”, “collaboration”, “culture”, “entertainment”, “migration”, “integration”, “economic growth”, “competition”, “transformation”, “innovation” and “efficiency”. These are less frequent words than those in group three, but still not new words.
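The group memberships described above can be listed directly from the cut, which makes it easy to verify which trend words fall into each cluster (this uses only objects already created above):

#list the trend words in each of the three clusters
split(rownames(q3_data), clusterCut)
#average quarterly size per cluster (assumes all remaining columns are numeric),
#to confirm that group 3 holds the largest, most stable trends
sapply(split(as.data.frame(q3_data), clusterCut), function(g) mean(as.matrix(g)))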

Q2. Can you create any predictions based on the data for 2015 and 2016?

To identify potentially rising words, we test each word's time series for stationarity (using the augmented Dickey-Fuller test) and plot the words whose series are most clearly non-stationary, as shown in the figures below.

#2. time series analysis
#transpose
q3_data.T <- tFrame(q3_data)

#test the stationary regarding to each word
ht <- sapply(q3_data.T, adf.test, alternative="stationary", k=1)
#select the series that are not stationary, i.e. those with p-values larger than 0.01
ht <- data.frame(ht)
ht.T <- tFrame(ht)
Intre <- ht.T[ which(ht.T$p.value > 0.01), ]
High_Intre <- ht.T[ which(ht.T$p.value > 0.5), ]
include <- row.names(High_Intre)
#There are only 7 words that we are interested in.
#[1] "meaningful"           "peer.to.peer.lending" "crowdfunding"        
#[4] "entertainment"        "streaming"            "yoga"                
#[7] "crypto.currency" 

#select back the data of high interest and plot them
q3_data_High_Intre <- q3_data[include, ]
q3_data_High_Intre.T <- tFrame(q3_data_High_Intre)
#to avoid date-format issues, we simply use 1 to 60 to represent the 60 quarters
q3_data_High_Intre.T2 <- data.frame(Quarter = 1:60, 
                                    meaningful = q3_data_High_Intre.T$meaningful,
                                    peer.to.peer.lending =  q3_data_High_Intre.T$peer.to.peer.lending,
                                    crowdfunding = q3_data_High_Intre.T$crowdfunding,
                                    entertainment = q3_data_High_Intre.T$entertainment,
                                    streaming = q3_data_High_Intre.T$streaming,
                                    yoga = q3_data_High_Intre.T$yoga,
                                    crypto.currency = q3_data_High_Intre.T$crypto.currency)



p1 <- ggplot() + 
  geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = meaningful, color = "red", group = 1)) +
  geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = entertainment, color = "blue", group = 1))  +
  geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = peer.to.peer.lending, color = "black", group = 1))  +
  geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = crowdfunding, color = "orange", group = 1))  +
  geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = streaming, color = "yellow", group = 1))  +
  geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = yoga, color = "grey", group = 1))  +
  geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = crypto.currency, color = "green", group = 1))  +
  xlab('Quarter') +
  ylab('Word_Fre')

p1

#to focus on the newly rising words, leave out "meaningful" and "entertainment"
p2 <- ggplot() + 
  geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = peer.to.peer.lending, color = "black", group = 1))  +
  geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = crowdfunding, color = "orange", group = 1))  +
  geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = streaming, color = "yellow", group = 1))  +
  geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = yoga, color = "grey", group = 1))  +
  geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = crypto.currency, color = "green", group = 1))  +
  xlab('Quarter') +
  ylab('Word_Fre')

p2

From these figures we can see a tremendous rise in words such as “peer to peer lending”, “crowdfunding”, “streaming”, “yoga” and “crypto currency”, which all take off from around 2014. Based on these trajectories, these words can be expected to continue rising through 2015 and 2016.
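A rough quantitative forecast for 2015 and 2016 can be obtained by fitting a simple time-series model to each of these rising series and projecting eight quarters ahead. A minimal sketch, assuming the forecast package is available; the choice of auto.arima is an illustration, not part of the analysis above.

#sketch: project each rising trend eight quarters ahead (2015 Q1 to 2016 Q4)
library(forecast)
rising <- c("peer.to.peer.lending", "crowdfunding", "streaming", "yoga", "crypto.currency")
forecasts <- lapply(rising, function(w) {
  y <- ts(q3_data_High_Intre.T2[[w]], start = c(2000, 1), frequency = 4)
  forecast(auto.arima(y), h = 8)
})
names(forecasts) <- rising
forecasts$crowdfunding   #point forecasts and prediction intervals for crowdfunding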