DATA: The four data files contain the most common words from the last four quarters of tweets mentioning Donald Trump. The two columns in each file are word and frequency.
DATA FILES: Trump_word_count_Q2_2015-WF1k-20160601110814.csv, Trump_word_count_Q3_2015-WF1k-20160601110415.csv, Trump_word_count_Q4_2015-WF1k-20160601105516.csv, Trump_word_count-Q1_2016.csv
First, we read in the data and do some light cleaning.
#rm(list=ls(all=TRUE))
setwd("/Users/pockeystar/Desktop/test/")
#install packages
#install.packages("plyr", "dplyr", "ggplot2", "wordcloud", "tm", "stringr", "SnowballC", "slam", "xlsx")
#read in data
q2_2015 <- read.csv("/Users/pockeystar/Desktop/test/Trump_word_count_Q2_2015-WF1k-20160601110814.csv",
                    header=FALSE, sep="\t", col.names=c("word", "freq"), stringsAsFactors=FALSE)
q3_2015 <- read.csv("/Users/pockeystar/Desktop/test/Trump_word_count_Q3_2015-WF1k-20160601110415.csv",
                    header=FALSE, sep="\t", col.names=c("word", "freq"), stringsAsFactors=FALSE)
q4_2015 <- read.csv("/Users/pockeystar/Desktop/test/Trump_word_count_Q4_2015-WF1k-20160601105516.csv",
                    header=FALSE, sep="\t", col.names=c("word", "freq"), stringsAsFactors=FALSE)
#the Q1 2016 file contains stray quote characters, so we strip them all
#with "sed" on the command line and save the result as a new _new.csv file:
#LC_CTYPE=C sed 's/\"//g' Trump_word_count-Q1_2016.csv > Trump_word_count-Q1_2016_new.csv
q1_2016 <- read.csv("/Users/pockeystar/Desktop/test/Trump_word_count-Q1_2016_new.csv",
                    header=FALSE, sep="\t", col.names=c("word", "freq"), stringsAsFactors=FALSE)
#show the first few rows of the data
head(q2_2015, 5)
##       word freq
## 1     save  125
## 2 duration  122
## 3     info  121
## 4       06  101
## 5 marriage   81
str(q1_2016)
## 'data.frame':    5000 obs. of  2 variables:
##  $ word: chr  "em" "not" "washington" "court" ...
##  $ freq: int  324 148 142 138 133 127 120 105 104 103 ...
Sorting the data reveals that the same word can appear several times within one data set, so I decided to merge those duplicate entries. (Arguably the cleanest way to further explore and extract word information would be to build a corpus, but a simple aggregation is enough here.) The next step is to combine the four data files into one so the time series can be examined.
#install.packages("plyr", "dplyr", "ggplot2")
library(plyr)
library(dplyr)
library(ggplot2)
#collapse repeated words by summing their frequencies with aggregate
q2_2015 <- aggregate(freq ~ word, data = q2_2015, sum)
q3_2015 <- aggregate(freq ~ word, data = q3_2015, sum)
q4_2015 <- aggregate(freq ~ word, data = q4_2015, sum)
q1_2016 <- aggregate(freq ~ word, data = q1_2016, sum)
#to examine the trend through time, put all four data frames into one and analyze them together.
q23_2015 <- merge(q2_2015, q3_2015, by="word", all=TRUE)
#edit(q23_2015)
q41_20156 <- merge(q4_2015, q1_2016, by="word", all=TRUE)
q1_data <- merge(q23_2015, q41_20156, by="word", all=TRUE)
colnames(q1_data) <- c("word", "Q2", "Q3", "Q4", "Q1")
So far we have a data file we can actually work with to analyze the time series and draw inferences. Checking the data shows a few minor things that could still be improved: for example, some words beginning with an apostrophe could be merged with their plain counterparts (a possible fix is sketched after the plots below). Since most of these appear rarely, they have little impact on the trend analysis. The first step of the analysis is to find the words that appear most frequently over the whole year using sort, and to plot them as a bar chart of frequencies as well as a word cloud.
#sum the whole year, try to focus on the most frequent ones
q1_data$total <- rowSums(q1_data[, 2:5], na.rm = TRUE)
sort_data <- q1_data %>% arrange(desc(total))
head(sort_data, 20) #around 900
##                    word  Q2    Q3  Q4  Q1 total
##  1                 span  NA 14453  NA  NA 14453
##  2                class  NA  4861  78   3  4942
##  3 msofootnotereference  NA  4032  NA  NA  4032
##  4                 href  NA  2042 108 133  2283
##  5                  div  NA  2208  10  NA  2218
##  6                 name  19  2098  11   9  2137
##  7               rights  33  1996  38  59  2126
##  8                  not 110  1193 200 189  1692
##  9                women  10  1473  82   6  1571
## 10                   li  NA  1540  NA  NA  1540
## 11                their  45  1231 157  87  1520
## 12                  was 117  1030 240  95  1482
## 13                  sex  86  1290  23   5  1404
## 14                human   5  1373   8   2  1388
## 15               sexual   2  1070  NA  NA  1072
## 16                   id   2  1054   2   4  1062
## 17                watch  22   935  15   6   978
## 18                  hiv  NA   960  NA  NA   960
## 19             violence  12   889   5   3   909
## 20                 aids  NA   906  NA  NA   906
#plot the words that appeared more than 1000 times over the year
p1 <- ggplot(subset(sort_data, total>1000), aes(word, total))
p1 <- p1 + geom_bar(stat="identity")
p1 <- p1 + theme(axis.text.x=element_text(angle=45, hjust=1))
p1
#word cloud
#install.packages("wordcloud")
library(wordcloud)
#set colors
pal <- brewer.pal(9,"YlGnBu")
pal <- pal[-(1:4)]
set.seed(12345)
wordcloud(sort_data$word, freq=sort_data$total, max.words=100, colors=pal)
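The apostrophe issue mentioned above is left untouched in the original analysis. A minimal sketch of how such words could be normalized before re-aggregating (normalize_words is a hypothetical helper, not part of the original code):
#strip leading straight or curly apostrophes so that e.g. "'trump" and "trump" are counted together
normalize_words <- function(df) {
  df$word <- sub("^['\u2019]+", "", df$word)
  aggregate(freq ~ word, data = df, sum)
}
#q2_2015 <- normalize_words(q2_2015)   #and likewise for the other three quarters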
Since there are only four time points and a single file (Q3 2015) dominates the frequencies, there is no point in building a prediction or time-series model on these data. It is more important to examine the validity and reliability of the data given the sampling strategy: for example, were the words extracted from a very narrow time window, which could explain why so many entries (HTML fragments such as "span", "class" and "href") carry little useful information? In any case, the data show no sign of a relevant trend.
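How lopsided the quarters are can be verified directly by summing each quarter's column (a small check, not part of the original script):
#total word frequency contributed by each quarter's file
colSums(q1_data[, c("Q2", "Q3", "Q4", "Q1")], na.rm = TRUE)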
DATA: A sample of last month's tweets in which Donald Trump or Hillary Clinton was mentioned.
DATA FILES: TrumpClinton.csv.zip
First, read the unzipped data file into R and clean out the Twitter handles.
#first unzip the data file and look at it briefly
#read in data
q2_data <- read.csv("/Users/pockeystar/Desktop/test/TrumpClinton.csv",
                    header=TRUE, sep=";", stringsAsFactors=FALSE)
#install.packages("tm", "stringr")
library(tm)
library(stringr)
#sentiment analysis
#First, let's remove the Twitter handles (i.e., @username) from the tweet texts
#so we can just have the real English words.
q2_data$clean <- str_replace_all(q2_data$content, "@\\w+", "")
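The comment above announces a sentiment analysis, but the script stops after cleaning out the handles. One way the cleaned tweets could be scored, assuming the syuzhet package is available (an assumption; it is not used elsewhere in this analysis):
library(syuzhet)
#score each cleaned tweet with the Bing lexicon; positive values indicate positive tone
q2_data$sentiment <- get_sentiment(q2_data$clean, method = "bing")
summary(q2_data$sentiment)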
Next we apply some standard natural language processing: create a corpus, build a document-term matrix, and use it to examine word frequencies and associations between words.
#We will turn the data into a "corpus", a collection of documents
#containing natural language text that the tm text mining package knows how to deal with.
library(SnowballC)
q2_dataCorpus <- Corpus(VectorSource(q2_data$clean))
#transform all characters to lower case
q2_dataCorpus <- tm_map(q2_dataCorpus, content_transformer(tolower))
#convert the documents back to plain text documents
q2_dataCorpus <- tm_map(q2_dataCorpus, PlainTextDocument)
#remove all punctuation and stopwords. Stopwords are commonly used words in the English language such as I, me, my, etc. You can see the full list of stopwords using stopwords('english').
q2_dataCorpus <- tm_map(q2_dataCorpus, removePunctuation)
q2_dataCorpus <- tm_map(q2_dataCorpus, removeWords, stopwords('english'))
#stemming
q2_dataCorpus <- tm_map(q2_dataCorpus, stemDocument)
#library(wordcloud)
wordcloud(q2_dataCorpus, max.words = 40, random.order = FALSE, colors = rainbow(50))
dtm <- DocumentTermMatrix(q2_dataCorpus)
dtm
## <<DocumentTermMatrix (documents: 125002, terms: 115362)>>
## Non-/sparse entries: 1238532/14419242192
## Sparsity           : 100%
## Maximal term length: 95
## Weighting          : term frequency (tf)
#dtm is extremely sparse because of the high dimensionality of the data, so we focus on the
#most frequent terms and drop the sparsest part of the matrix. A threshold of 0.9999 keeps
#a matrix that is at most 99.99% empty.
dtm <- removeSparseTerms(dtm, 0.9999)
dtm
## <<DocumentTermMatrix (documents: 125002, terms: 6196)>>
## Non-/sparse entries: 1062576/773449816
## Sparsity           : 100%
## Maximal term length: 40
## Weighting          : term frequency (tf)
library(slam)
#freq <- row_sums(dtm, na.rm = T)
#one way to look at the frequency
freq <- sort(col_sums(dtm), decreasing=TRUE)
head(freq, 40)
## trump clinton donald hillari support
## 99845 31780 21765 18544 11441
## vote will https<U+00E2>\u0080<U+0161> amp say
## 7066 6871 6575 6536 6509
## judg via like presid obama
## 5870 5360 5222 5126 4869
## just get attack sander ralli
## 4853 4547 4461 3851 3740
## media peopl san call univers
## 3699 3678 3612 3548 3498
## berni dont democrat make american
## 3481 3430 3346 3277 3268
## jose win trump2016 now california
## 3234 3102 2965 2954 2938
## can endors protest america want
## 2929 2901 2899 2834 2825
findFreqTerms(dtm, lowfreq=3000)
TRUE [1] "american" "amp" "attack" "berni"
TRUE [5] "call" "clinton" "democrat" "donald"
TRUE [9] "dont" "get" "hillari" "https<U+00E2>\u0080<U+0161>"
TRUE [13] "jose" "judg" "just" "like"
TRUE [17] "make" "media" "obama" "peopl"
TRUE [21] "presid" "ralli" "san" "sander"
TRUE [25] "say" "support" "trump" "univers"
TRUE [29] "via" "vote" "will" "win"
#plot the freq
library(ggplot2)
wf <- data.frame(word=names(freq), freq=freq)
p <- ggplot(subset(wf, freq>3000), aes(word, freq))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p
#Term Correlations
findAssocs(dtm, c("trump", "clinton"), corlimit=0.1) # specifying a correlation limit of 0.1
## $trump
## donald univers judg support
## 0.23 0.14 0.12 0.12
##
## $clinton
## hillari sander bill democrat nomin foundat clinch email
## 0.44 0.23 0.19 0.19 0.19 0.16 0.15 0.15
## deleg puerto rico poll
## 0.14 0.13 0.13 0.10
#Hierarchical clustering and k-means clustering do not really work on such a sparse matrix
#library(cluster)
#dtmss <- removeSparseTerms(dtm, 0.999) # a smaller matrix that is at most 99.9% empty space
#but it does not look very promising in this example
#d <- dist(t(dtm), method="euclidean")
#fit <- hclust(d=d, method="ward.D")
#fit
#d <- dist(t(dtmss), method="euclidean")
#kfit <- kmeans(d, 2)
#clusplot(as.matrix(d), kfit$cluster, color=T, shade=T, labels=2, lines=0)
#tdmat <- as.matrix(removeSparseTerms(tdm, sparse=0.99))
#distMatrix <- dist(scale(tdmat))
#fit <- hclust(distMatrix, method="ward.D2")
#sparse matrix clustering does not work well either
#library(sparcl)
#perm.out <- HierarchicalSparseCluster.permute(tdmat, wbounds=c(1.5,2:6),
#                                              nperms=5)
#sparsehc <- HierarchicalSparseCluster(dists=perm.out$dists,
#                                      wbound=perm.out$bestw, method="complete")
Unfortunately, most clustering methods fail here because of the huge dimensionality.
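One workaround, sketched below (not part of the original analysis), is to cluster only the most frequent terms rather than the full matrix, reusing the 3000-occurrence cutoff from the frequency plot above:
#hierarchical clustering restricted to the most frequent terms
top_terms <- findFreqTerms(dtm, lowfreq = 3000)
keep <- which(colnames(dtm) %in% top_terms)
m_top <- as.matrix(dtm[, keep])            #documents x ~32 terms, small enough to densify
d <- dist(t(m_top), method = "euclidean")  #distances between term vectors
fit <- hclust(d, method = "ward.D2")
plot(fit, main = "Clustering of frequent terms")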
Looking at the term correlations, terms such as “univers”, “judg” and “support” are relatively highly correlated with Donald Trump, while “sander”, “bill”, “democrat”, “nomin”, “foundat”, “email”, “deleg”, “puerto”, “rico” and “poll” are highly associated with Hillary Clinton. Going through each word: on the Trump side, “univers” can refer either to “Trump University”, an entrepreneurial initiative dressed in university clothes, or to “universal health care”, the hot health-care topic that every American cares about. “judg” can refer to Trump’s sister Maryanne Trump Barry, a senior United States circuit judge, or to the judges in the lawsuits Trump has faced. “support” is rather straightforward. On the Clinton side, “sander” is the other Democratic candidate and her major competitor, Bernie Sanders, and “bill” is her husband and former US president Bill Clinton, who also plays a crucial role in her campaign. “democrat”, “nomin”, “poll” and “deleg” are ordinary political words referring to the Democratic Party, the nomination, polls and delegates. “foundat” refers to the Clinton Foundation, founded by Bill Clinton and concerned with health care, climate, development and so on. “puerto” and “rico” refer to Puerto Rico, a US territory on which Clinton made a statement and where she claimed victory last week. “email” refers to the infamous Hillary Clinton email scandal published by WikiLeaks, part of which is now available as a data-mining dataset on Kaggle.
DATA: The data file contains quarterly trend data from 2000 to 2014. Each row gives the name and size of a trend; the columns are quarters.
DATA FILES: Trends 2000-2014.xlsx
library(xlsx)
library(tseries)
library(lsr)
#read in data
q3_data <- read.xlsx("/Users/pockeystar/Desktop/test/Trends_2000-2014.xlsx", sheetIndex=1, sheetName=NULL, rowIndex=NULL,
startRow=NULL, endRow=NULL, colIndex=NULL,
as.data.frame=TRUE, header=TRUE, colClasses=NA)
rownames(q3_data) <- make.names(q3_data[, 1], unique=TRUE)
q3_data <- q3_data[, -1]
To look for potentially popular words, we test each word's time series for stationarity and plot the words with the strongest evidence of non-stationarity.
#2. time series analysis
#transpose
q3_data.T <- tFrame(q3_data)
#test stationarity of each word's series with the augmented Dickey-Fuller test
ht <- sapply(q3_data.T, adf.test, alternative="stationary", k=1)
#select the series that are not stationary, i.e. those with p-values larger than 0.01
ht <- data.frame(ht)
ht.T <- tFrame(ht)
Intre <- ht.T[ which(ht.T$p.value > 0.01), ]
High_Intre <- ht.T[ which(ht.T$p.value > 0.5), ]
include <- row.names(High_Intre)
#There are only 7 words that we are interested in.
#[1] "meaningful" "peer.to.peer.lending" "crowdfunding"
#[4] "entertainment" "streaming" "yoga"
#[7] "crypto.currency"
#select back the data of high interest and plot them
q3_data_High_Intre <- q3_data[include, ]
q3_data_High_Intre.T <- tFrame(q3_data_High_Intre)
#to avoid fussing with date formats, we simply use 1 to 60 to represent the 60 quarters
q3_data_High_Intre.T2 <- data.frame(Quarter = 1:60,
meaningful = q3_data_High_Intre.T$meaningful,
peer.to.peer.lending = q3_data_High_Intre.T$peer.to.peer.lending,
crowdfunding = q3_data_High_Intre.T$crowdfunding,
entertainment = q3_data_High_Intre.T$entertainment,
streaming = q3_data_High_Intre.T$streaming,
yoga = q3_data_High_Intre.T$yoga,
crypto.currency = q3_data_High_Intre.T$crypto.currency)
p1 <- ggplot() +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = meaningful, color = "red", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = entertainment, color = "blue", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = peer.to.peer.lending, color = "black", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = crowdfunding, color = "orange", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = streaming, color = "yellow", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = yoga, color = "grey", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = crypto.currency, color = "green", group = 1)) +
xlab('Quarter') +
ylab('Word_Fre')
p1
#to focus on newly rising words, leave out "meaningful" and "entertainment"
p2 <- ggplot() +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = peer.to.peer.lending, color = "black", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = crowdfunding, color = "orange", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = streaming, color = "yellow", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = yoga, color = "grey", group = 1)) +
geom_line(data = q3_data_High_Intre.T2, aes(x = Quarter, y = crypto.currency, color = "green", group = 1)) +
xlab('Quarter') +
ylab('Word_Fre')
p2
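The two plots above add one geom_line call per series and put literal colour names inside aes(), which maps a constant label rather than setting the actual line colour. A more compact alternative (a sketch, assuming the reshape2 package is available) reshapes the data to long format and lets ggplot assign a colour per word:
library(reshape2)
#long format: one row per (Quarter, word) pair
long <- melt(q3_data_High_Intre.T2, id.vars = "Quarter",
             variable.name = "word", value.name = "freq")
ggplot(long, aes(x = Quarter, y = freq, color = word)) +
  geom_line() +
  xlab("Quarter") +
  ylab("Word_Fre")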
From these figures we can see a tremendous rise in words such as “peer to peer lending”, “crowdfunding”, “streaming”, “yoga” and “crypto currency”, all of which boom towards the end of the period, around 2014.