Student ID: 1A182901-2
- Set up
rm(list=ls(all=TRUE))
setwd("~/Desktop/R/polimetrics")
library(quanteda)
library(readtext)
library(gridExtra)
library(ggplot2)
library(reshape2)
library(manifestoR)
library(e1071)
library(caTools)
library(randomForest)
library(caret)
library(stringr)
library(cowplot)
library(knitr)
library(kableExtra)
- Dictionaries
mp_setapikey(key.file = NULL, key = "e51dc314a21ce75bd8221fcb338e2ee5")
cmp <- mp_maindataset()
uk <- cmp[ which(cmp$countryname=="United Kingdom" & cmp$date == 201505 ),]
available_uk <- mp_availability( countryname=="United Kingdom" & date == 201505 )
uk2015 <- mp_corpus(available_uk )
qt_uk2015 <- corpus(uk2015)
1.1. Sentiment
dfm_uk1 <- dfm(qt_uk2015, tolower=TRUE,remove_punct = TRUE,remove_numbers=TRUE, remove = stopwords("english"))
dfm_uk2 <- dfm_group(dfm_uk1, 'party')
tokens <- ntoken(dfm_uk2)
dfm_uk <- dfm(qt_uk2015,
tolower=TRUE,
remove_punct = TRUE,
remove_numbers=TRUE,
dictionary = data_dictionary_LSD2015[1:2],
group=c("party"))
dfm_uk@Dimnames$docs <- uk$partyname
df_uk <- convert(dfm_uk,to="data.frame")
colnames(df_uk) <- c("party","n","p")
df_uk$tokens <- tokens
df_uk$rel <- df_uk$p-df_uk$n
df_uk$relp <- df_uk$rel/df_uk$tokens
p1 <- ggplot(df_uk,aes(x=reorder(party,rel),y=rel,fill=party))+
geom_bar(position="dodge",stat="identity",width = 0.5)+
xlab("")+
ylab("Absolute terms")+
theme(legend.position="none")+
scale_fill_brewer(palette="Set3")+
coord_flip()
p2 <- ggplot(df_uk,aes(x=reorder(party,relp),y=relp,fill=party))+
geom_bar(position="dodge",stat="identity",width = 0.5)+
xlab("")+
ylab("Relative terms")+
theme(legend.position="none")+
scale_fill_brewer(palette="Set3")+
coord_flip()
plot_grid(p1,p2,nrow = 2)

- This graph combines the absolute and relative sentiment scores of the UK 2015 manifestos.
- The graph above suggests several observations:
- All the manifestos contained more positive terms than negative terms.
- The ruling party, the Conservative Party, used more positive words than its opposition, the Labour Party.
- According to both results, the Liberal Democrats’ manifesto was the most positive one.
- The relative positions did change once we considered the length of the manifestos.
1.2. Liberal versus Conservative
##1-2
dictfile <- tempfile()
download.file("https://provalisresearch.com/Download/LaverGarry.zip", dictfile, mode = "wb")
unzip(dictfile, exdir = (td <- tempdir()))
lgdict <- dictionary(file = paste(td, "LaverGarry.cat", sep = "/"))
lg_dfm <- dfm(qt_uk2015,
tolower=TRUE,
remove_punct = TRUE,
remove_numbers=TRUE,
group=c("party"),
dictionary = lgdict)
Dictionary <-convert(lg_dfm, to="data.frame")
names(Dictionary )[20] <- "Conservative"
names(Dictionary )[21] <- "Liberal"
Dictionary_res <- data.frame(Party=uk$partyname,
Conservative=Dictionary$Conservative,
Liberal=Dictionary$Liberal)
Dictionary_res$tokens <- tokens
Dictionary_res$absolute <- Dictionary_res$Liberal-Dictionary_res$Conservative
Dictionary_res$relative <- Dictionary_res$absolute/Dictionary_res$tokens
p5 <- ggplot(Dictionary_res,aes(x=Liberal,y=Conservative))+
geom_point()+
geom_text(aes(label = Party),hjust=.5, vjust=-1,size = 2)
p5

- This graph shows the liberal and conservative terms used by the manifestos.
- Surprisingly, the two did not have a negative relationship. On the contrary, there was even a somewhat positive relationship.
- One possible explanation is that the author of a manifesto needs to draw comparisons, no matter which values he or she holds.
- The major parties in the UK, the Conservative Party and the Labour Party, used many liberal-conservative terms.
- Unsurprisingly, the Liberal Democrats and the Green Party used more liberal terms in their manifestos.
p3 <- ggplot(Dictionary_res,aes(x=reorder(Party,absolute),y=absolute,fill=Party))+
geom_bar(position="dodge",stat="identity",width = 0.5)+
geom_hline(yintercept=0)+
xlab("")+
ylab("Absolute terms")+
theme(legend.position="none")+
scale_fill_brewer(palette="Set3")+
coord_flip()
p4 <- ggplot(Dictionary_res,aes(x=reorder(Party,relative),y=relative,fill=Party))+
geom_bar(position="dodge",stat="identity",width = 0.5)+
geom_hline(yintercept=0)+
xlab("")+
ylab("Relative terms")+
theme(legend.position="none")+
scale_fill_brewer(palette="Set3")+
coord_flip()
plot_grid(p3,p4,nrow = 2)

- This graph shows the absolute and relative liberal-conservative scores for the UK 2015 election.
- A positive value means the party used more liberal words than conservative words, and vice versa.
- There are several conclusions:
- Several positions changed once we considered the length of the manifestos.
- The two major parties in the UK both used more conservative terms than liberal terms, especially the Conservative Party.
- The UK Independence Party used many conservative terms. This result reflects the characteristics of Brexit.
- Classifiers
#2
trump1 <- read.csv("trump-orig3.csv", stringsAsFactors=FALSE)
trump2 <- read.csv("trump_tweets2.csv", stringsAsFactors=FALSE)
trump1_1 <- read.csv("trump-orig3.csv", stringsAsFactors=FALSE)
trump2_1 <- read.csv("trump_tweets2.csv", stringsAsFactors=FALSE)
2.1. 2-fold validation of RandomForest
trump1$text <- str_replace_all(trump1_1$text, "[^[:alnum:]]", " ")
trump1$setv <- as.integer(as.factor(trump1_1$Sentiment))
trump1$Sentiment[trump1$setv==1] <- "negative"
trump1$Sentiment[trump1$setv==2] <- "neutral"
trump1$Sentiment[trump1$setv==3] <- "positive"
trump1$Sentiment <- as.factor(trump1$Sentiment)
cv <- corpus(trump1)
dfm_trump_cv <- dfm(cv,
remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https")),
remove_punct = TRUE,
remove_numbers=TRUE,
tolower = TRUE,
remove_symbols=TRUE,
remove_twitter = TRUE,
remove_separators=TRUE,
remove_url = TRUE)
dfm_trump_n_cv <- dfm_trim(dfm_trump_cv , min_docfreq= 0.05)
data_cv <- as.data.frame(as.matrix(dfm_trump_n_cv))
colnames(data_cv ) <- make.names(colnames(data_cv ))
data_cv$sentiment<- trump1$Sentiment
set.seed(123)
to_train <- sample(1:472, 236, replace = FALSE)
train_cv <- data_cv[to_train, ]
test_cv <- data_cv[-to_train, ]
set.seed(123)
system.time(RF_cv <- randomForest(sentiment~ ., data=train_cv, type="classification"))
user system elapsed
49.104 0.830 51.288
predictRF_cv <- predict(RF_cv, newdata=test_cv)
x2_cv <- as.matrix(table("Predictions"= predictRF_cv, "Actual"=test_cv$sentiment))
acc_cv <- sum(diag(x2_cv)) / sum(x2_cv)
conf.mat_cv <- confusionMatrix( predictRF_cv, test_cv$sentiment)
set.seed(123)
system.time(RF2_cv <- randomForest(sentiment~ ., data=test_cv, type="classification"))
user system elapsed
48.756 0.887 51.046
predictRF2_cv <- predict(RF2_cv, newdata=train_cv)
x2_2_cv <- as.matrix(table("Predictions"= predictRF2_cv, "Actual"=train_cv$sentiment))
acc_alt_cv <- sum(diag(x2_2_cv)) / sum(x2_2_cv)
conf.mat2_cv <- confusionMatrix( predictRF2_cv, train_cv$sentiment)
res1_cv <- as.data.frame(conf.mat_cv$byClass)
res2_cv <- as.data.frame(conf.mat2_cv$byClass)
2.2. Prediction: RandomForest
trump <- select(trump1_1,text)
trump_t <- select(trump2_1,text)
trump <- as.data.frame(rbind(trump,trump_t))
trump$text <- str_replace_all(trump$text, "[^[:alnum:]]", " ")
tt <- corpus(trump)
dfm_trump_n <- dfm(tt,
remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https")),
remove_punct = TRUE,
remove_numbers=TRUE,
tolower = TRUE,
remove_symbols=TRUE,
remove_twitter = TRUE,
remove_separators=TRUE,
remove_url = TRUE)
dfm_trump_n_t <- dfm_trim(dfm_trump_n , min_docfreq= 0.05)
data_t <- as.data.frame(as.matrix(dfm_trump_n_t))
colnames(data_t ) <- make.names(colnames(data_t ))
train <- data_t[1:472, ]
trump1$setv <- as.integer(as.factor(trump1_1$Sentiment))
trump1$Sentiment[trump1$setv==1] <- "negative"
trump1$Sentiment[trump1$setv==2] <- "neutral"
trump1$Sentiment[trump1$setv==3] <- "positive"
trump1$Sentiment <- as.factor(trump1$Sentiment)
train$sentiment <- trump1$Sentiment
test <- data_t[473:1472, ]
set.seed(123)
system.time(RF <- randomForest(sentiment~ ., data=train, type="classification"))
user system elapsed
385.031 5.918 425.355
predictRF <- predict(RF, newdata=test)
res_rf <- as.data.frame(table(predictRF))
res_rf$Prob <- res_rf$Freq/sum(res_rf$Freq)
2.3. Prediction: Naive Bayes
ttc2 <- corpus(trump2)
dfm_trump2_n <- dfm(ttc2,
remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https")),
remove_punct = TRUE,
remove_numbers=TRUE,
tolower = TRUE,
remove_symbols=TRUE,
remove_twitter = TRUE,
remove_separators=TRUE,
remove_url = TRUE)
dfm_trump2_n_t <- dfm_trim(dfm_trump2_n , min_docfreq= 0.05)
trump1$setv <- as.integer(as.factor(trump1_1$Sentiment))
trump1$Sentiment[trump1$setv==1] <- "negative"
trump1$Sentiment[trump1$setv==2] <- "neutral"
trump1$Sentiment[trump1$setv==3] <- "positive"
trump1$Sentiment <- as.factor(trump1$Sentiment)
trump1$text <- str_replace_all(trump1$text, "[^[:alnum:]]", " ")
tto <- corpus(trump1)
dfm_trump1 <- dfm(tto,
remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https")),
remove_punct = TRUE,
remove_numbers=TRUE,
tolower = TRUE,
remove_symbols=TRUE,
remove_twitter = TRUE,
remove_separators=TRUE,
remove_url = TRUE)
dfm_trump1_t <- dfm_trim(dfm_trump1, min_docfreq= 0.05)
data_t1 <- as.data.frame(as.matrix(dfm_trump1_t))
colnames(data_t1) <- make.names(colnames(data_t1 ))
data_t1$sentiment<- trump1$Sentiment
nb <- textmodel_nb(dfm_trump1_t ,docvars(dfm_trump1_t, "Sentiment"), distribution = c("multinomial"))
dfm_trump2_n_t <- dfm_select(dfm_trump2_n_t, dfm_trump1_t)
predicted_nb <- predict(nb, dfm_trump2_n_t)
res_nb <- as.data.frame(table(predicted_nb))
res_nb$Prob <- res_nb$Freq/sum(res_nb$Freq)
2.4. Results 2.4.1. Accuracy of validation
A. Table 1
table(test_cv$sentiment, predictRF_cv) %>%
kable() %>%
kable_styling()
| negative |
81 |
11 |
0 |
| neutral |
61 |
27 |
1 |
| positive |
38 |
16 |
1 |
- This is the first result of cross-validation.
B. Table 2
table(train_cv$sentiment, predictRF2_cv)%>%
kable() %>%
kable_styling()
| negative |
70 |
28 |
1 |
| neutral |
42 |
47 |
0 |
| positive |
25 |
20 |
3 |
- This is the second result of cross-validation.
C. Accuracy
ac_cv <- (acc_cv+acc_alt_cv)/2
ac_cv
[1] 0.4851695
D. Precision and Recall
res_all_cv <- data.frame(
precision1=res1_cv$Precision,
recall1=res1_cv$Recall,
precision2=res2_cv$Precision,
recall2=res2_cv$Recall
)
rownames(res_all_cv) <- rownames(res1_cv)
res_all_cv %>%
kable() %>%
kable_styling()
| Class: negative |
0.45 |
0.8804348 |
0.5109489 |
0.7070707 |
| Class: neutral |
0.50 |
0.3033708 |
0.4947368 |
0.5280899 |
| Class: positive |
0.50 |
0.0181818 |
0.7500000 |
0.0625000 |
- From the accuracy, precision, and recall results of the 2-fold cross-validation, there are some conclusions.
- This model cannot successfully predict positive tweets.
- The recall for the negative class was the highest; the model performed better at identifying negative tweets.
2.4.2. Prediction of RandomForest
res_rf%>%
kable() %>%
kable_styling()
| negative |
944 |
0.944 |
| neutral |
56 |
0.056 |
| positive |
0 |
0.000 |
- As in the cross-validation in 2.4.1, the Random Forest model again could not identify positive tweets in prediction.
2.4.3. Prediction of Naive Bayes
res_nb%>%
kable() %>%
kable_styling()
| negative |
459 |
0.459 |
| neutral |
304 |
0.304 |
| positive |
237 |
0.237 |
- The Naive Bayes model showed a largely different result compared to the RF model.
- Since I did not cross-validate this model, its accuracy and the model choice need further discussion.
---
title: "Home Assignment 5"
output: html_notebook
author: Yen Cheng Hsuan
---
####Student ID: 1A182901-2
***
>0. Set up

```{r, message=FALSE, warning=FALSE}
# Start from an empty global environment: removes every object, including
# hidden ones (all=TRUE). Attached packages and options are not reset by this.
rm(list=ls(all=TRUE))
# Hard-coded, machine-specific working directory; the CSV files read later in
# this notebook are given as relative paths and are resolved against it.
setwd("~/Desktop/R/polimetrics")
library(quanteda)
library(readtext)
library(gridExtra)
library(ggplot2)
library(reshape2)
library(manifestoR)
library(e1071)
library(caTools)
library(randomForest)
library(caret)
library(stringr)
library(cowplot)
library(knitr)
library(kableExtra)
```
>1. Dictionaries

```{r, message=FALSE, warning=FALSE}
# Register the Manifesto Project API key. A key supplied through the
# MANIFESTO_API_KEY environment variable takes precedence, so the notebook can
# be run with a different key without editing the source. The embedded key is
# kept only as a backward-compatible fallback.
# NOTE(review): a key committed to source should be treated as exposed --
# consider rotating it and removing the fallback.
manifesto_api_key <- Sys.getenv("MANIFESTO_API_KEY")
if (!nzchar(manifesto_api_key)) {
  manifesto_api_key <- "e51dc314a21ce75bd8221fcb338e2ee5"
}
mp_setapikey(key.file = NULL, key = manifesto_api_key)

# Main dataset, restricted to the rows for the United Kingdom dated 201505.
cmp <- mp_maindataset()
uk <- cmp[which(cmp$countryname == "United Kingdom" & cmp$date == 201505), ]

# Documents available for the same country/date, fetched as a manifestoR
# corpus and then passed through corpus() for use with dfm() below.
available_uk <- mp_availability(countryname == "United Kingdom" & date == 201505)
uk2015 <- mp_corpus(available_uk)
qt_uk2015 <- corpus(uk2015)
```
>1.1. Sentiment

```{r, message=FALSE, warning=FALSE}
# Token totals per party (lower-cased; punctuation, numbers and English stop
# words removed). Used below to normalise the sentiment balance by length.
dfm_uk1 <- dfm(qt_uk2015,
               tolower = TRUE,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove = stopwords("english"))
dfm_uk2 <- dfm_group(dfm_uk1, "party")
tokens <- ntoken(dfm_uk2)

# Counts of the first two categories of the LSD2015 dictionary, one row per
# party. `groups` is spelled out instead of relying on partial matching.
dfm_uk <- dfm(qt_uk2015,
              tolower = TRUE,
              remove_punct = TRUE,
              remove_numbers = TRUE,
              dictionary = data_dictionary_LSD2015[1:2],
              groups = "party")

# Label each grouped document with its party name by looking the group value
# up in the main dataset, rather than assuming that the rows of `uk` happen to
# be in the same order (and number) as the grouped documents.
party_codes <- docnames(dfm_uk)
party_labels <- uk$partyname[match(party_codes, uk$party)]
if (anyNA(party_labels)) {
  stop("No party name found in `uk` for: ",
       paste(party_codes[is.na(party_labels)], collapse = ", "),
       call. = FALSE)
}
docnames(dfm_uk) <- party_labels

# Columns after conversion: document name, first category, second category.
df_uk <- convert(dfm_uk, to = "data.frame")
colnames(df_uk) <- c("party", "n", "p")
df_uk$tokens <- tokens
df_uk$rel <- df_uk$p - df_uk$n           # p minus n, in raw counts
df_uk$relp <- df_uk$rel / df_uk$tokens   # same balance per token

# Layers shared by both horizontal bar charts.
sentiment_bar_layers <- list(
  geom_bar(position = "dodge", stat = "identity", width = 0.5),
  xlab(""),
  theme(legend.position = "none"),
  scale_fill_brewer(palette = "Set3"),
  coord_flip()
)
p1 <- ggplot(df_uk, aes(x = reorder(party, rel), y = rel, fill = party)) +
  sentiment_bar_layers +
  ylab("Absolute terms")
p2 <- ggplot(df_uk, aes(x = reorder(party, relp), y = relp, fill = party)) +
  sentiment_bar_layers +
  ylab("Relative terms")
plot_grid(p1, p2, nrow = 2)

```
* This graph combines the absolute and relative sentiment scores of the UK 2015 manifestos.
* The graph above suggests several observations:
    1. All the manifestos contained more positive terms than negative terms.
    2. The ruling party, the Conservative Party, used more positive words than its opposition, the Labour Party.
    3. According to both results, the Liberal Democrats' manifesto was the most positive one.
    4. The relative positions did change once we considered the length of the manifestos.
    

>1.2. Liberal versus Conservative

```{r results="hide"}
# Download the Laver-Garry archive to a temporary file, unpack it into the
# session's temporary directory and load the .cat file it contains.
td <- tempdir()
dictfile <- tempfile()
download.file(
  url = "https://provalisresearch.com/Download/LaverGarry.zip",
  destfile = dictfile,
  mode = "wb"
)
unzip(zipfile = dictfile, exdir = td)
lgdict <- dictionary(file = file.path(td, "LaverGarry.cat"))

```
```{r, message=FALSE, warning=FALSE}

# Laver-Garry dictionary counts, one row per party. `groups` is spelled out
# instead of relying on partial argument matching.
lg_dfm <- dfm(qt_uk2015,
              tolower = TRUE,
              remove_punct = TRUE,
              remove_numbers = TRUE,
              groups = "party",
              dictionary = lgdict)
Dictionary <- convert(lg_dfm, to = "data.frame")

# Locate the two VALUES categories by name instead of by column position, so
# a change in the dictionary layout cannot silently pick the wrong columns.
# If the expected names are absent, fall back to the positions the original
# code used (20 and 21) and say so.
value_keys <- c(Conservative = "VALUES.CONSERVATIVE", Liberal = "VALUES.LIBERAL")
key_cols <- match(value_keys, names(Dictionary))
if (anyNA(key_cols)) {
  warning("Laver-Garry VALUES columns not found by name; ",
          "falling back to column positions 20 and 21.", call. = FALSE)
  key_cols <- c(20L, 21L)
}
names(Dictionary)[key_cols] <- names(value_keys)

# Resolve party names from the grouped document names (first column of the
# converted data frame) rather than assuming `uk` shares their row order.
party_codes <- Dictionary[[1]]
party_labels <- uk$partyname[match(party_codes, uk$party)]
if (anyNA(party_labels)) {
  stop("No party name found in `uk` for: ",
       paste(party_codes[is.na(party_labels)], collapse = ", "),
       call. = FALSE)
}

Dictionary_res <- data.frame(Party = party_labels,
                             Conservative = Dictionary$Conservative,
                             Liberal = Dictionary$Liberal)
Dictionary_res$tokens <- tokens
# Liberal minus conservative counts, raw and per token.
Dictionary_res$absolute <- Dictionary_res$Liberal - Dictionary_res$Conservative
Dictionary_res$relative <- Dictionary_res$absolute / Dictionary_res$tokens

# Scatter of liberal against conservative counts, labelled by party.
p5 <- ggplot(Dictionary_res, aes(x = Liberal, y = Conservative)) +
  geom_point() +
  geom_text(aes(label = Party), hjust = .5, vjust = -1, size = 2)
p5
```

* This graph shows the liberal and conservative terms used by the manifestos.
1. Surprisingly, the two did not have a negative relationship. On the contrary, there was even a somewhat positive relationship.
    + One possible explanation is that the author of a manifesto needs to draw comparisons, no matter which values he or she holds.
2. The major parties in the UK, the Conservative Party and the Labour Party, used many liberal-conservative terms.
3. Unsurprisingly, the Liberal Democrats and the Green Party used more liberal terms in their manifestos.

```{r}
# Layers common to both horizontal bar charts; the zero line separates the
# positive from the negative side of the liberal-minus-conservative balance.
lib_con_bar_layers <- list(
  geom_bar(position = "dodge", stat = "identity", width = 0.5),
  geom_hline(yintercept = 0),
  xlab(""),
  theme(legend.position = "none"),
  scale_fill_brewer(palette = "Set3"),
  coord_flip()
)

# Raw-count balance, parties ordered by that balance.
p3 <- ggplot(
  Dictionary_res,
  aes(x = reorder(Party, absolute), y = absolute, fill = Party)
) +
  lib_con_bar_layers +
  ylab("Absolute terms")

# Per-token balance, parties ordered by that balance.
p4 <- ggplot(
  Dictionary_res,
  aes(x = reorder(Party, relative), y = relative, fill = Party)
) +
  lib_con_bar_layers +
  ylab("Relative terms")

plot_grid(p3, p4, nrow = 2)
```
* This graph shows the absolute and relative liberal-conservative scores for the UK 2015 election.
* A positive value means the party used more liberal words than conservative words, and vice versa.
* There are several conclusions:
    1. Several positions changed once we considered the length of the manifestos.
    2. The two major parties in the UK both used more conservative terms than liberal terms, especially the Conservative Party.
    3. The UK Independence Party used many conservative terms. This result reflects the characteristics of Brexit.

***
>2. Classifiers

```{r}
# Section 2 inputs. Each CSV is parsed once; the *_1 objects are untouched
# copies of the raw data, kept because trump1 is modified in place by the
# chunks that follow.
trump1 <- read.csv("trump-orig3.csv", stringsAsFactors = FALSE)
trump2 <- read.csv("trump_tweets2.csv", stringsAsFactors = FALSE)
trump1_1 <- trump1
trump2_1 <- trump2
```

>2.1. 2-fold validation of RandomForest

```{r}
# Replace every non-alphanumeric character of the labelled tweets by a space.
trump1$text <- str_replace_all(trump1_1$text, "[^[:alnum:]]", " ")

# Recode the raw labels: factor level 1/2/3 of the original column becomes
# "negative"/"neutral"/"positive".
# NOTE(review): this relies on the raw labels sorting in that order -- confirm
# against the CSV.
sentiment_labels <- c("negative", "neutral", "positive")
trump1$setv <- as.integer(as.factor(trump1_1$Sentiment))
trump1$Sentiment <- factor(sentiment_labels[trump1$setv],
                           levels = sentiment_labels)

# Document-feature matrix of the labelled tweets. Besides English stop words,
# a fixed list of short fragments is dropped.
noise_terms <- c("amp", "rt", "tco", "co", "u", "t", "s", "ed", "https")
cv <- corpus(trump1)
dfm_trump_cv <- dfm(cv,
                    remove = c(stopwords("english"), noise_terms),
                    remove_punct = TRUE,
                    remove_numbers = TRUE,
                    tolower = TRUE,
                    remove_symbols = TRUE,
                    remove_twitter = TRUE,
                    remove_separators = TRUE,
                    remove_url = TRUE)

# NOTE(review): whether 0.05 is read as a proportion or as a count depends on
# the installed quanteda version (see `docfreq_type`) -- confirm.
dfm_trump_n_cv <- dfm_trim(dfm_trump_cv, min_docfreq = 0.05)
data_cv <- as.data.frame(as.matrix(dfm_trump_n_cv))
colnames(data_cv) <- make.names(colnames(data_cv))
data_cv$sentiment <- trump1$Sentiment

# Split the rows into two halves. Sizes come from the data instead of the
# literals 472/236; for 472 rows the draw is the same under the same seed.
set.seed(123)
n_docs <- nrow(data_cv)
to_train <- sample(seq_len(n_docs), n_docs %/% 2, replace = FALSE)
train_cv <- data_cv[to_train, ]
test_cv <- data_cv[-to_train, ]

# Fold 1: fit on the sampled half, evaluate on the remaining half.
set.seed(123)
system.time(RF_cv <- randomForest(sentiment ~ ., data = train_cv, type = "classification"))
predictRF_cv <- predict(RF_cv, newdata = test_cv)
x2_cv <- as.matrix(table("Predictions" = predictRF_cv, "Actual" = test_cv$sentiment))
acc_cv <- sum(diag(x2_cv)) / sum(x2_cv)  # share of predictions on the diagonal

conf.mat_cv <- confusionMatrix(predictRF_cv, test_cv$sentiment)

# Fold 2: roles swapped -- fit on the held-out half, evaluate on the first.
set.seed(123)
system.time(RF2_cv <- randomForest(sentiment ~ ., data = test_cv, type = "classification"))
predictRF2_cv <- predict(RF2_cv, newdata = train_cv)
x2_2_cv <- as.matrix(table("Predictions" = predictRF2_cv, "Actual" = train_cv$sentiment))
acc_alt_cv <- sum(diag(x2_2_cv)) / sum(x2_2_cv)

conf.mat2_cv <- confusionMatrix(predictRF2_cv, train_cv$sentiment)

# Per-class statistics of each fold as data frames (used for the
# precision/recall table later in the notebook).
res1_cv <- as.data.frame(conf.mat_cv$byClass)
res2_cv <- as.data.frame(conf.mat2_cv$byClass)
```
>2.2. Prediction: RandomForest

```{r}
# Stack the text of the labelled tweets on top of the unlabelled ones so both
# sets share one feature space. Base subsetting is used because select() is
# not provided by any package this notebook attaches.
trump <- trump1_1["text"]
trump_t <- trump2_1["text"]
trump <- as.data.frame(rbind(trump, trump_t))
trump$text <- str_replace_all(trump$text, "[^[:alnum:]]", " ")
tt <- corpus(trump)

# Same tokenisation settings and fragment list as in the validation step.
noise_terms <- c("amp", "rt", "tco", "co", "u", "t", "s", "ed", "https")
dfm_trump_n <- dfm(tt,
                   remove = c(stopwords("english"), noise_terms),
                   remove_punct = TRUE,
                   remove_numbers = TRUE,
                   tolower = TRUE,
                   remove_symbols = TRUE,
                   remove_twitter = TRUE,
                   remove_separators = TRUE,
                   remove_url = TRUE)

dfm_trump_n_t <- dfm_trim(dfm_trump_n, min_docfreq = 0.05)
data_t <- as.data.frame(as.matrix(dfm_trump_n_t))

colnames(data_t) <- make.names(colnames(data_t))

# The first nrow(trump1_1) rows are the labelled tweets and the rest are the
# tweets to classify. The boundary is derived from the data rather than the
# literals 1:472 / 473:1472.
is_labelled <- seq_len(nrow(data_t)) <= nrow(trump1_1)
train <- data_t[is_labelled, ]

# Recode the raw labels (factor level 1/2/3 -> negative/neutral/positive).
sentiment_labels <- c("negative", "neutral", "positive")
trump1$setv <- as.integer(as.factor(trump1_1$Sentiment))
trump1$Sentiment <- factor(sentiment_labels[trump1$setv],
                           levels = sentiment_labels)

train$sentiment <- trump1$Sentiment

test <- data_t[!is_labelled, ]

# Fit on all labelled tweets and classify the unlabelled ones; res_rf holds
# the count and share of each predicted class.
set.seed(123)
system.time(RF <- randomForest(sentiment ~ ., data = train, type = "classification"))
predictRF <- predict(RF, newdata = test)
res_rf <- as.data.frame(table(predictRF))
res_rf$Prob <- res_rf$Freq / sum(res_rf$Freq)
```
>2.3. Prediction: Naive Bayes
```{r}

noise_terms <- c("amp", "rt", "tco", "co", "u", "t", "s", "ed", "https")

# Tweets to classify. Their text gets the same character clean-up as the
# training tweets below, so both sets are tokenised alike; previously only
# the training text was cleaned. The clean-up is applied to a copy, leaving
# trump2 itself unchanged.
trump2_clean <- trump2
trump2_clean$text <- str_replace_all(trump2_clean$text, "[^[:alnum:]]", " ")
ttc2 <- corpus(trump2_clean)
dfm_trump2_n <- dfm(ttc2,
                    remove = c(stopwords("english"), noise_terms),
                    remove_punct = TRUE,
                    remove_numbers = TRUE,
                    tolower = TRUE,
                    remove_symbols = TRUE,
                    remove_twitter = TRUE,
                    remove_separators = TRUE,
                    remove_url = TRUE)
dfm_trump2_n_t <- dfm_trim(dfm_trump2_n, min_docfreq = 0.05)

# Recode the raw labels (factor level 1/2/3 -> negative/neutral/positive).
sentiment_labels <- c("negative", "neutral", "positive")
trump1$setv <- as.integer(as.factor(trump1_1$Sentiment))
trump1$Sentiment <- factor(sentiment_labels[trump1$setv],
                           levels = sentiment_labels)

# Labelled tweets: clean-up, corpus and document-feature matrix.
trump1$text <- str_replace_all(trump1$text, "[^[:alnum:]]", " ")
tto <- corpus(trump1)
dfm_trump1 <- dfm(tto,
                  remove = c(stopwords("english"), noise_terms),
                  remove_punct = TRUE,
                  remove_numbers = TRUE,
                  tolower = TRUE,
                  remove_symbols = TRUE,
                  remove_twitter = TRUE,
                  remove_separators = TRUE,
                  remove_url = TRUE)
dfm_trump1_t <- dfm_trim(dfm_trump1, min_docfreq = 0.05)

# Data-frame view of the training matrix with its labels (not used by the
# model fit below).
data_t1 <- as.data.frame(as.matrix(dfm_trump1_t))
colnames(data_t1) <- make.names(colnames(data_t1))
data_t1$sentiment <- trump1$Sentiment

# Multinomial Naive Bayes on the labelled tweets; the new tweets are reduced
# to the training features before prediction.
nb <- textmodel_nb(dfm_trump1_t,
                   docvars(dfm_trump1_t, "Sentiment"),
                   distribution = "multinomial")
dfm_trump2_n_t <- dfm_select(dfm_trump2_n_t, dfm_trump1_t)

# Count and share of each predicted class.
predicted_nb <- predict(nb, dfm_trump2_n_t)
res_nb <- as.data.frame(table(predicted_nb))
res_nb$Prob <- res_nb$Freq / sum(res_nb$Freq)
```
>2.4. Results
>2.4.1. Accuracy of validation

>A. Table 1
```{r, message=FALSE, warning=FALSE}

# Cross-tabulation for the first fold: rows are the actual labels of the
# held-out half, columns the random forest predictions.
fold1_table <- table(test_cv$sentiment, predictRF_cv)
kable_styling(kable(fold1_table))

```
* This is the first result of cross-validation.

>B. Table 2
```{r, message=FALSE, warning=FALSE}
# Cross-tabulation for the second fold: rows are the actual labels of the
# first half, columns the predictions of the model fitted on the other half.
fold2_table <- table(train_cv$sentiment, predictRF2_cv)
kable_styling(kable(fold2_table))
```
* This is the second result of cross-validation.

>C. Accuracy
```{r}
# Average accuracy over the two folds.
ac_cv <- 0.5 * (acc_cv + acc_alt_cv)
print(ac_cv)
```
>D. Precision and Recall
```{r, message=FALSE, warning=FALSE}

# Per-class precision and recall of both folds side by side; row names are
# the class labels taken from the first fold's table.
res_all_cv <- data.frame(
  precision1 = res1_cv[["Precision"]],
  recall1 = res1_cv[["Recall"]],
  precision2 = res2_cv[["Precision"]],
  recall2 = res2_cv[["Recall"]],
  row.names = rownames(res1_cv)
)

kable_styling(kable(res_all_cv))
```

* From the accuracy, precision, and recall results of the 2-fold cross-validation, there are some conclusions.
    1. This model cannot successfully predict positive tweets.
    2. The recall for the negative class was the highest; the model performed better at identifying negative tweets.

>2.4.2. Prediction of RandomForest
```{r, message=FALSE, warning=FALSE}
# Predicted class counts and shares from the random forest.
kable_styling(kable(res_rf))
```
* As in the cross-validation in 2.4.1, the Random Forest model again could not identify positive tweets in prediction.


>2.4.3. Prediction of Naive Bayes
```{r, message=FALSE, warning=FALSE}
# Predicted class counts and shares from the Naive Bayes model.
kable_styling(kable(res_nb))
```

* The Naive Bayes model showed a largely different result compared to the RF model.
* Since I did not cross-validate this model, its accuracy and the model choice need further discussion.

***