Student ID: 1A182901-2
- Set up
rm(list=ls(all=TRUE))
setwd("~/Desktop/R/polimetrics")
library(rtweet)
library(ggplot2)
library(dplyr)
library(quanteda)
library(rtweet)
library(ggplot2)
library(lattice)
library(syuzhet)
library(stm)
library(lubridate)
0.1. In the first time (Run in 11/14 evening)
token <- create_token(
app = "yenchenghsuan",
consumer_key = "VUk6UDizyDpi9xoUvcPqt2gFR",
consumer_secret = "e3X5JwD029oD8WymQgkVOwEdVK75EgrultGUOv2psyqBZcIxYL",
access_token = "1062396813330632704-DBt0K0EKo3BbJExDt2cIMgwELL8xso",
access_secret = "lUrbH9KGsl8C2hi6UTPDAgymEt9o5je4MSHaso4EeLS8t")
#Run in 11/14 evening:
#trumptweet <- get_timeline("realDonaldTrump", n = 3200)
#trumptweet1 <- trumptweet[1:3000,]
#write_as_csv(trumptweet1, "trumptweet.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")
- Import data
trumptweet_n <- read.csv("trumptweet.csv",encoding = "UTF-8")
trumptweet_n$created_at_posixit <- as.POSIXct(trumptweet_n$created_at)
trumptweet_n$created_at <- as.numeric(as.Date(trumptweet_n$created_at))
since <- trumptweet_n$created_at_posixit[3000]
latest <- trumptweet_n$created_at_posixit[1]
trumptweet_n_f <- select(trumptweet_n,
status_id,
created_at,
created_at_posixit,
text)
trumptweet_n_f$text <- as.character(trumptweet_n_f$text)
mycorpus <- corpus(trumptweet_n_f)
cat("Twitter data","\n",paste("From:",since),"\n",paste(" To:",latest))
Twitter data
From: 2018-01-04 11:37:46
To: 2018-11-13 21:35:42
- Dictionaries
tweetdfm <- dfm(mycorpus ,
remove = stopwords("english"),
remove_punct = TRUE,
remove_numbers=TRUE,
tolower = TRUE,
stem = TRUE,
remove_twitter = TRUE,
remove_url = TRUE)
sentiment1 <- dfm(mycorpus ,
remove = stopwords("english"),
remove_punct = TRUE,
remove_numbers=TRUE,
tolower = TRUE,
stem = TRUE,
remove_twitter = TRUE,
remove_url = TRUE,
dictionary = data_dictionary_LSD2015[1:2])
dictfile <- tempfile()
download.file("https://provalisresearch.com/Download/LaverGarry.zip", dictfile, mode = "wb")
unzip(dictfile, exdir = (td <- tempdir()))
lgdict <- dictionary(file = paste(td, "LaverGarry.cat", sep = "/"))
sentiment2 <- dfm(mycorpus ,
remove = stopwords("english"),
remove_punct = TRUE,
remove_numbers=TRUE,
tolower = TRUE,
remove_twitter = TRUE,
remove_url = TRUE,
dictionary = lgdict)
sentimentdf <- convert(sentiment1 , to="data.frame")
sentimentlg <- convert(sentiment2 , to="data.frame")
sentimentdf$value <- sentimentdf$posit-sentimentdf$negat
sentimentdf$Date <- as.Date(trumptweet_n_f$created_at_posixit)
sentimentdf$time <- trumptweet_n_f$created_at_posixit
sentimentdf$month <- month(sentimentdf$Date)
sentimentlg_s <- select(sentimentlg,VALUES.LIBERAL,VALUES.CONSERVATIVE)
sentimentlg_s$Date <- as.Date(trumptweet_n_f$created_at_posixit)
sentimentlg_s$time <- trumptweet_n_f$created_at_posixit
sentimentlg_s$month <- month(sentimentlg_s$Date)
lib <- aggregate(sentimentlg_s$VALUES.LIBERAL, by=list(Category=sentimentlg_s$month), FUN=sum)
con <- aggregate(sentimentlg_s$VALUES.CONSERVATIVE, by=list(Category=sentimentlg_s$month), FUN=sum)
libcon <- data.frame(month=lib$Category,
lib=lib$x,
con=con$x)
2.1. Plot dictionaries’ result
2.1.B. Sentiment by month
g2 <- ggplot(sentimentdf,aes(x=reorder(month,Date),y=value,group=month))+
geom_boxplot()+
xlab("Month")+
ylab("Value")+
theme_grey()+
ggtitle("Sentiment by month")+
theme(plot.title = element_text(hjust = 0.5,size=20))
g2

- By applying sentiment dictionary in quanteda, this box plot showed the sentiment of tweets by month
- Overall, the median of tweets was positive.
- Even though the imagination of Trump was he often used twitter to threaten and blame his opponent, the general results were more positive tweets than negative. The possible explanation was he also used a lot of positive words to strengthen and defend his actions.
2.1.C. Lib-con by month
g3 <- ggplot(libcon,aes(x=month,y=con,color="Conservative"))+
geom_line()+
geom_point()+
geom_line(aes(y=lib,color="Liberal"))+
geom_point(aes(y=lib,color="Liberal"))+
scale_x_continuous(breaks = 1:11)+
ylab("Frequency")+
theme_grey()+
ggtitle("Lib-con by month")+
theme(plot.title = element_text(hjust = 0.5,size=18))+
labs(color="")
g3

- By applying the Laver and Garry dictionary, the strong tendency that Trump was a solid republican and conservative person was stable through almost the whole 2018.
- (Since he was first US leader who used twitter a lot, I had no other presidents to compare.)
- STM
3.1. SearchK at the fisrt time (NOT run it here)
#K <-c(4,5,6,7,8,9)
#storage <- searchK(Dfm_stm$documents,
# Dfm_stm$vocab,
# K = K,
# max.em.its = 50,
# prevalence = ~ created_at,
# data = Dfm_stm$meta,
# init.type = "Spectral")
#plot(storage$results$semcoh,
# storage$results$exclus,
# xlab= "Semantic coherence",
# ylab= "Exclusivity",
# col= "blue",
# pch = 19,
# cex = 1,
# lty = "solid",
# lwd = 2)
#text(storage$results$semcoh,
# storage$results$exclus,
# labels=storage$results$K,
# cex= 1,
# pos=2)
- The searchK function could not converge in this analysis. Under this restirction, the results suggested that K=7 would be a bwtter choice.
3.2. STM
Dfm_stm <- convert(tweetdfm, to = "stm")
stmFitted1 <- stm(Dfm_stm$documents,
Dfm_stm$vocab,
K = 7,
max.em.its = 100,
prevalence = ~ created_at,
data = Dfm_stm$meta,
init.type = "Spectral")
3.3. Result of Topic Prevalence
3.3.A. Summary of topics
par(mfrow=c(1,1),mar=c(5.1, 4.1, 4.1, 2.1))
plot(stmFitted1, type = "summary", labeltype = c("frex"))

- I would name the following topics:
- Topic 1: Economic issue
- Topic 4: Trade war
- Topic 5: Conflict with democrats
- Topic 6: Midterm election
3.3.B. Wordcloud of Topic 4
par(mfrow=c(1,1))
cloud(stmFitted1, topic = 4)

- Focusing on the topic 4, we could see more terms about protective trade policy.
3.3.C. Topic Prevalence across time
w <- labelTopics(stmFitted1, topics = 1, n = 3, frexweight = 0.5)
w_df <- as.data.frame(w$frex)
w_df$sum <- paste(w_df$V1,w_df$V2,w_df$V3,sep = " / ")
prep <- estimateEffect(1:7 ~ created_at,
stmFitted1,
meta = Dfm_stm$meta,
uncertainty = "Global")
par(mfrow=c(2,2),mar=c(2,2,2,2))
for(i in c(1,4,5,6)){
x <- w_df$sum[i]
plot(prep, "created_at", method = "continuous", topics = i,
model = stmFitted1, printlegend = FALSE, xaxt = "n", xlab = "Date",ylab = "")
seq <- seq(from = min(Dfm_stm$meta$created_at), to = max(Dfm_stm$meta$created_at))
axis(1, at = seq, labels = c(as.Date(seq,origin="1970/1/1")))
title(paste("Topic",i,x))
abline(h=0, col="#99CCFF")
abline(h=0.1, col="#336699")
abline(h=0.2, col="#003366")
}

- All 7 topics had the expected propotion above zero significantly regarless of the date.
- Plotting those four topics I have named, three of them decreased across time, even the trade-war-related topic decreased graduallly. On the other hand, election-related topic increase a lot as the mid-term election coming (11/6).
---
title: "Home Assignment 6"
output: html_notebook
author: Yen Cheng Hsuan
---
####Student ID: 1A182901-2
***
>0. Set up

```{r, message=FALSE, warning=FALSE}
rm(list=ls(all=TRUE))
setwd("~/Desktop/R/polimetrics")
library(rtweet)
library(ggplot2)
library(dplyr)
library(quanteda)
library(rtweet)
library(ggplot2)
library(lattice)
library(syuzhet)
library(stm)
library(lubridate)
```

>0.1. In the first time (Run in 11/14 evening)

```{r,  message=FALSE, warning=FALSE}
token <- create_token(
  app = "yenchenghsuan",
  consumer_key = "VUk6UDizyDpi9xoUvcPqt2gFR", 
  consumer_secret = "e3X5JwD029oD8WymQgkVOwEdVK75EgrultGUOv2psyqBZcIxYL", 
  access_token = "1062396813330632704-DBt0K0EKo3BbJExDt2cIMgwELL8xso",
  access_secret = "lUrbH9KGsl8C2hi6UTPDAgymEt9o5je4MSHaso4EeLS8t")

#Run in 11/14 evening:
  #trumptweet <- get_timeline("realDonaldTrump", n = 3200)
  #trumptweet1 <- trumptweet[1:3000,]
  #write_as_csv(trumptweet1, "trumptweet.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")
```

>1. Import data

```{r}
trumptweet_n <- read.csv("trumptweet.csv",encoding = "UTF-8")
trumptweet_n$created_at_posixit <- as.POSIXct(trumptweet_n$created_at)
trumptweet_n$created_at <- as.numeric(as.Date(trumptweet_n$created_at))
since <- trumptweet_n$created_at_posixit[3000]
latest <- trumptweet_n$created_at_posixit[1]
trumptweet_n_f <- select(trumptweet_n,
                         status_id,
                         created_at,
                         created_at_posixit,
                         text)
trumptweet_n_f$text <- as.character(trumptweet_n_f$text)
mycorpus <- corpus(trumptweet_n_f)
```

```{r}
cat("Twitter data","\n",paste("From:",since),"\n",paste("  To:",latest))
```

***

>2. Dictionaries

```{r results="hide", message=FALSE, warning=FALSE}
tweetdfm <- dfm(mycorpus ,
                remove = stopwords("english"),
                remove_punct = TRUE,
                remove_numbers=TRUE, 
                tolower = TRUE,
                stem = TRUE,
                remove_twitter = TRUE, 
                remove_url = TRUE)

sentiment1 <- dfm(mycorpus ,
                 remove = stopwords("english"),
                 remove_punct = TRUE,
                 remove_numbers=TRUE, 
                 tolower = TRUE,
                 stem = TRUE,
                 remove_twitter = TRUE, 
                 remove_url = TRUE, 
                 dictionary = data_dictionary_LSD2015[1:2])

dictfile <- tempfile()
download.file("https://provalisresearch.com/Download/LaverGarry.zip", dictfile, mode = "wb")
unzip(dictfile, exdir = (td <- tempdir()))
lgdict <- dictionary(file = paste(td, "LaverGarry.cat", sep = "/"))
sentiment2 <- dfm(mycorpus ,
                 remove = stopwords("english"),
                 remove_punct = TRUE,
                 remove_numbers=TRUE, 
                 tolower = TRUE,
                 remove_twitter = TRUE, 
                 remove_url = TRUE, 
                 dictionary = lgdict)

sentimentdf <- convert(sentiment1 , to="data.frame")
sentimentlg <- convert(sentiment2 , to="data.frame")
sentimentdf$value <- sentimentdf$posit-sentimentdf$negat
sentimentdf$Date <- as.Date(trumptweet_n_f$created_at_posixit)
sentimentdf$time <- trumptweet_n_f$created_at_posixit
sentimentdf$month <- month(sentimentdf$Date)

sentimentlg_s <- select(sentimentlg,VALUES.LIBERAL,VALUES.CONSERVATIVE)
sentimentlg_s$Date <- as.Date(trumptweet_n_f$created_at_posixit)
sentimentlg_s$time <- trumptweet_n_f$created_at_posixit
sentimentlg_s$month <- month(sentimentlg_s$Date)

lib <- aggregate(sentimentlg_s$VALUES.LIBERAL, by=list(Category=sentimentlg_s$month), FUN=sum)
con <- aggregate(sentimentlg_s$VALUES.CONSERVATIVE, by=list(Category=sentimentlg_s$month), FUN=sum)
libcon <- data.frame(month=lib$Category,
                     lib=lib$x,
                     con=con$x)
```

>2.1. Plot dictionaries' result

####2.1.A. Trump's Tweets by month
```{r}
par(mfrow=c(1,1),mar=c(5.1, 4.1, 4.1, 2.1))
g1 <- ggplot(sentimentdf, aes(month))+
  scale_x_continuous(breaks = 1:11)+
  geom_bar(width = 0.8,fill = "#99CC66")+
  theme_grey()+
  ggtitle("Trump's Tweets by month")+
  theme(plot.title = element_text(hjust = 0.5,size=20))
g1
```

* This is the total observations of the following analysis, Donald Trump personal twitter had posted over 100 tweets every month. After April, Trump posted more than 300 tweets per month.

####2.1.B. Sentiment by month
```{r}
g2 <- ggplot(sentimentdf,aes(x=reorder(month,Date),y=value,group=month))+
  geom_boxplot()+
  xlab("Month")+
  ylab("Value")+
  theme_grey()+
  ggtitle("Sentiment by month")+
  theme(plot.title = element_text(hjust = 0.5,size=20))
g2
```

* By applying sentiment dictionary in quanteda, this box plot showed the sentiment of tweets by month
    1. Overall, the median of tweets was positive.
    2. Even though the imagination of Trump was he often used twitter to threaten and blame his opponent, the general results were more positive tweets than negative. The possible explanation was he also used a lot of positive words to strengthen and defend his actions.

####2.1.C. Lib-con by month
```{r}
g3 <- ggplot(libcon,aes(x=month,y=con,color="Conservative"))+
  geom_line()+
  geom_point()+
  geom_line(aes(y=lib,color="Liberal"))+
  geom_point(aes(y=lib,color="Liberal"))+
  scale_x_continuous(breaks = 1:11)+
  ylab("Frequency")+
  theme_grey()+
  ggtitle("Lib-con by month")+
  theme(plot.title = element_text(hjust = 0.5,size=18))+
  labs(color="") 
g3

```

* By applying the Laver and Garry dictionary, the strong tendency that Trump was a solid republican and conservative person was stable through almost the whole 2018.
* (Since he was first US leader who used twitter a lot, I had no other presidents to compare.)

***

>3. STM

>3.1. SearchK at the fisrt time (NOT run it here)
```{r, eval=F, echo=T}
#K <-c(4,5,6,7,8,9)
#storage  <- searchK(Dfm_stm$documents,
#                    Dfm_stm$vocab,
#                    K = K,
#                    max.em.its = 50,
#                    prevalence = ~ created_at,
#                    data = Dfm_stm$meta,
#                    init.type = "Spectral")
#plot(storage$results$semcoh,
#     storage$results$exclus,
#     xlab= "Semantic coherence",
#     ylab= "Exclusivity",
#     col= "blue",
#     pch = 19,
#     cex = 1,
#     lty = "solid",
#     lwd = 2)
#text(storage$results$semcoh,
#     storage$results$exclus,
#     labels=storage$results$K,
#     cex= 1,
#     pos=2)
```

* The searchK function could not converge in this analysis. Under this restirction, the results suggested that K=7 would be a bwtter choice.

>3.2. STM
```{r results="hide", message=FALSE, warning=FALSE}
Dfm_stm <- convert(tweetdfm, to = "stm")
stmFitted1 <- stm(Dfm_stm$documents,
                  Dfm_stm$vocab,
                  K = 7,
                  max.em.its = 100,
                  prevalence = ~ created_at,
                  data = Dfm_stm$meta,
                  init.type = "Spectral")
```

>3.3. Result of Topic Prevalence

####3.3.A. Summary of topics
```{r}
par(mfrow=c(1,1),mar=c(5.1, 4.1, 4.1, 2.1))
plot(stmFitted1, type = "summary", labeltype = c("frex"))
```

* I would name the following topics: 
    1. Topic 1: Economic issue
    2. Topic 4: Trade war
    3. Topic 5: Conflict with democrats
    4. Topic 6: Midterm election

####3.3.B. Wordcloud of Topic 4
```{r, message=FALSE, warning=FALSE}
par(mfrow=c(1,1))
cloud(stmFitted1, topic = 4)
```

* Focusing on the topic 4, we could see more terms about protective trade policy.

####3.3.C. Topic Prevalence across time
```{r, message=FALSE, warning=FALSE}
w <- labelTopics(stmFitted1, topics = 1, n = 3, frexweight = 0.5)
w_df <- as.data.frame(w$frex)
w_df$sum <- paste(w_df$V1,w_df$V2,w_df$V3,sep = " / ")

prep <- estimateEffect(1:7 ~ created_at, 
                       stmFitted1, 
                       meta = Dfm_stm$meta, 
                       uncertainty = "Global")
par(mfrow=c(2,2),mar=c(2,2,2,2))
for(i in c(1,4,5,6)){
  x <- w_df$sum[i]
  plot(prep, "created_at", method = "continuous", topics = i,
       model = stmFitted1, printlegend = FALSE, xaxt = "n", xlab = "Date",ylab = "")
  seq <- seq(from = min(Dfm_stm$meta$created_at), to = max(Dfm_stm$meta$created_at))
  axis(1, at = seq, labels = c(as.Date(seq,origin="1970/1/1")))
  title(paste("Topic",i,x))
  abline(h=0, col="#99CCFF")
  abline(h=0.1, col="#336699")
  abline(h=0.2, col="#003366")
}
```

* All 7 topics had the expected propotion above zero significantly regarless of the date.
* Plotting those four topics I have named, three of them decreased across time, even the trade-war-related topic decreased graduallly. On the other hand, election-related topic increase a lot as the mid-term election coming (11/6).

***