First, load the required packages.

Second, import the data. We have two datasets of tweets by German-speaking politicians: one from Germany and one from Austria.

# Germany: tab-separated export of the German politicians' Twitter accounts
twitter_Germany <- read_delim(
  file = "twitterDuitsePoliticiAccount.csv",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)

## Rows: 284198 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (8): id, type, author, text, sender, url, keywords, mentions
## date (1): datePublished
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# View(twitter_Germany)

# Austria: tab-separated export of the Austrian politicians' Twitter accounts
twitter_Austria <- read_delim(
  file = "twitterOostenrijksePoliticiAccount.csv",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)

## Rows: 188936 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (8): id, type, author, text, sender, url, keywords, mentions
## date (1): datePublished
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# View(twitter_Austria)

Third, select the time period of the study and the required variables.

# Study period: 2022-01-01 through 2022-04-03, both days included.
# Both bounds are applied explicitly as Date values so the selection matches
# the stated period regardless of how far the raw exports extend.
study_start <- as.Date("2022-01-01")
study_end <- as.Date("2022-04-03")

# Keep only tweets published inside the study period, then keep only the
# variables used by the later analysis steps.
tweets_Germany <- twitter_Germany %>%
  filter(datePublished >= study_start, datePublished <= study_end) %>%
  select(author, text, sender, datePublished, keywords, mentions)

tweets_Austria <- twitter_Austria %>%
  filter(datePublished >= study_start, datePublished <= study_end) %>%
  select(author, text, sender, datePublished, keywords, mentions)

The tweet authors (Sender)

# Normalise the German sender names: collapse whitespace, upper-case them and
# drop missing values.
Sender_freq_g <- tweets_Germany$sender %>%
  str_squish() %>%
  toupper() %>%
  na.omit() %>%
  unlist() %>%
  str_squish()

# Number of tweets per sender, most active first
Senders <- data.frame(sort(table(Sender_freq_g), decreasing = TRUE))

# Rename the columns for display and print the table
df <- data.frame(
  Sender = Senders[["Sender_freq_g"]],
  Freq = Senders[["Freq"]]
)
kable(df, caption = "Germany")

Germany
Sender	Freq
SPD PARTEIVORSTAND 🇪🇺	1697
CDU/CSU	1156
FDP	1008
JOANA COTAR	665
CSU	623
CEM ÖZDEMIR	587
CDU DEUTSCHLANDS	540
DIE LINKE	520
ALTERNATIVE FÜR 🇩🇪 DEUTSCHLAND	449
MARKUS SÖDER	407
CHRISTIAN LINDNER	359
KATRIN GÖRING-ECKARDT	299
BÜNDNIS 90/DIE GRÜNEN	276
JANINE WISSLER	236
DIETMAR BARTSCH	188
SUSANNE HENNIG-WELLSOW	114
AUßENMINISTERIN ANNALENA BAERBOCK	98
SAHRA WAGENKNECHT	82
ALICE WEIDEL	79
AMIRA MOHAMED ALI	71
OLAF SCHOLZ	71
ARMIN LASCHET	68
TINO CHRUPALLA	18
MARTIN SCHULZ	12

# Normalise the Austrian sender names: collapse whitespace, upper-case them and
# drop missing values.
Sender_freq_a <- tweets_Austria$sender %>%
  str_squish() %>%
  toupper() %>%
  na.omit() %>%
  unlist() %>%
  str_squish()

# Number of tweets per sender, most active first
Senders <- data.frame(sort(table(Sender_freq_a), decreasing = TRUE))

# Rename the columns for display and print the table
df <- data.frame(
  Sender = Senders[["Sender_freq_a"]],
  Freq = Senders[["Freq"]]
)
kable(df, caption = "Austria")

Austria
Sender	Freq
RUDI ANSCHOBER	2277
PETER PILZ	1353
DAS NEUE ÖSTERREICH	560
BEATE MEINL-REISINGER	522
SPÖ	496
FPÖ	282
MATTHIAS STROLZ	276
DIE GRÜNEN	236
WERNER KOGLER	199
HAGEN REINHOLD, MDB	143
NORBERT HOFER	62
PAMELA RENDI-WAGNER	36
SEBASTIAN KURZ	17
MANFRED HAIMBUCHNER	13

Plot how frequently Ukraine was hashtagged on Twitter.

# Flag, per tweet, which hashtag slots mention Ukraine and count them.
#
# keywords:  character vector of comma-separated hashtags, one element per tweet.
# dates:     publication dates, parallel to `keywords`.
# date_col:  name given to the date column of the result.
# count_col: name given to the per-tweet count column of the result.
# n_slots:   number of pieces each keyword string is split into.
# pattern:   regular expression matched against the upper-cased hashtag.
#
# Returns a tibble with logical columns X1..X<n_slots> (does that slot match?),
# the date column, and the count column (number of matching slots; NA when the
# tweet's keywords are missing).
#
# NOTE(review): per the stringr documentation a fixed-width split appears to put
# all remaining text in the last piece, so a tweet with more than `n_slots`
# hashtags would contribute at most 1 from that slot -- confirm if such tweets
# matter for the analysis.
flag_ukraine_hashtags <- function(keywords, dates, date_col, count_col,
                                  n_slots = 10, pattern = "UKRAIN.*") {
  pieces <- str_split_fixed(keywords, ",", n_slots)
  colnames(pieces) <- paste0("X", seq_len(n_slots))

  slots <- data.frame(pieces)
  slots[] <- lapply(slots, function(slot) {
    str_detect(toupper(str_squish(slot)), pattern)
  })

  flags <- tibble(slots)
  flags[[date_col]] <- dates
  # Adding logical columns yields the integer number of matching slots.
  flags[[count_col]] <- Reduce(`+`, slots)
  flags
}

# Germany
hashtags_Germany <- flag_ukraine_hashtags(
  tweets_Germany$keywords,
  tweets_Germany$datePublished,
  date_col = "tweets_Germany.datePublished",
  count_col = "Count_g"
)

# Austria
hashtags_Austria <- flag_ukraine_hashtags(
  tweets_Austria$keywords,
  tweets_Austria$datePublished,
  date_col = "tweets_Austria.datePublished",
  count_col = "Count_a"
)

Plotting the number of UKRAINE hashtags per day

# Daily totals of Ukraine-related hashtags per country
Agg_hashtags_g <- aggregate(Count_g ~ tweets_Germany.datePublished, data = hashtags_Germany, FUN = sum)
Agg_hashtags_a <- aggregate(Count_a ~ tweets_Austria.datePublished, data = hashtags_Austria, FUN = sum)

# Size the y-axis from both series so neither line is clipped
count_range <- range(Agg_hashtags_g$Count_g, Agg_hashtags_a$Count_a)

# Columns are referenced by their full names (Count_g / Count_a) instead of
# relying on `$` partial matching of "Count".
plot(
  Agg_hashtags_g$tweets_Germany.datePublished,
  Agg_hashtags_g$Count_g,
  type = "l",
  ylim = count_range,
  xlab = "Date",
  ylab = "Number of hashtaging UKRAINE"
)
lines(
  Agg_hashtags_a$tweets_Austria.datePublished,
  Agg_hashtags_a$Count_a,
  col = "red",
  type = "l"
)
legend("topleft", legend = c("Germany", "Austria"),
       col = c("Black", "Red"), lty = 1, cex = 0.8)

# Word cloud

# Germany
# Split each tweet's keyword string into individual hashtags (upper-cased,
# surplus whitespace removed, missing keyword strings dropped).
keywords_g <- na.omit(toupper(str_squish(tweets_Germany$keywords)))
hashtags_freq_g <- str_squish(unlist(str_split(keywords_g, ",")))

# Treat every hashtag as its own document; the row sums of the term-document
# matrix are then the total frequency of each term.
docs <- Corpus(VectorSource(hashtags_freq_g))
dtm <- TermDocumentMatrix(docs)
words <- dtm %>%
  as.matrix() %>%
  rowSums() %>%
  sort(decreasing = TRUE)
df <- data.frame(word = names(words), freq = words)

set.seed(1234) # fixed seed so the word cloud layout is reproducible
wordcloud(
  words = df$word,
  freq = df$freq,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

# Table of the 30 most frequent hashtags
kable(df[1:30, ], caption = "Germany")

Germany
	word	freq
ukraine	ukraine	624
afd	afd	303
bundestag	bundestag	205
impfpflicht	impfpflicht	194
corona	corona	188
ampel	ampel	166
russland	russland	134
putin	putin	126
bundesversammlung	bundesversammlung	118
bundesregierung	bundesregierung	107
teamcdu	teamcdu	104
3k22	3k22	92
cdupt22	cdupt22	91
steinmeier	steinmeier	85
aufinsneue	aufinsneue	82
cdu	cdu	81
deutschland	deutschland	80
bundespräsident	bundespräsident	78
saarland	saarland	78
europa	europa	65
dbdk22	dbdk22	64
freiheit	freiheit	63
bundeswehr	bundeswehr	60
omikron	omikron	59
spd	spd	57
habeck	habeck	55
inflation	inflation	54
bayern	bayern	51
scholz	scholz	51
standwithukraine	standwithukraine	48

# Keep the first 15 rows of the German hashtag table for the bar plot
df1 <- df[seq_len(15), ]


# Austria
# Split each tweet's keyword string into individual hashtags (upper-cased,
# surplus whitespace removed, missing keyword strings dropped).
keywords_a <- na.omit(toupper(str_squish(tweets_Austria$keywords)))
hashtags_freq_a <- str_squish(unlist(str_split(keywords_a, ",")))

# Treat every hashtag as its own document; the row sums of the term-document
# matrix are then the total frequency of each term.
docs <- Corpus(VectorSource(hashtags_freq_a))
dtm <- TermDocumentMatrix(docs)
words <- dtm %>%
  as.matrix() %>%
  rowSums() %>%
  sort(decreasing = TRUE)
df <- data.frame(word = names(words), freq = words)

set.seed(1234) # fixed seed so the word cloud layout is reproducible
wordcloud(
  words = df$word,
  freq = df$freq,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

# Table of the 30 most frequent hashtags
kable(df[1:30, ], caption = "Austria")

Austria
	word	freq
ukraine	ukraine	147
bmichats	bmichats	114
övp	övp	91
sobotka	sobotka	79
oevpua	oevpua	58
zib2	zib2	44
oenr	oenr	44
rotesfoyer	rotesfoyer	40
oevpkorruptionsua	oevpkorruptionsua	37
putin	putin	36
longcovid	longcovid	31
oevp	oevp	28
russland	russland	27
breaking	breaking	23
covid19	covid19	22
omikron	omikron	22
standwithukraine	standwithukraine	20
kloibmüller	kloibmüller	20
imzentrum	imzentrum	19
wksta	wksta	19
einland	einland	17
hessenthaler	hessenthaler	17
yeswecare	yeswecare	16
covid	covid	16
covid19at	covid19at	15
neutralität	neutralität	15
sideletter	sideletter	15
nehammer	nehammer	13
weremember	weremember	13
wolf	wolf	12

# Keep the first 15 rows of the Austrian hashtag table for the bar plot
df2 <- df[1:15, ]

# Draw both countries side by side. par() returns the previous settings, which
# are restored afterwards so the two-panel layout does not leak into later plots.
old_par <- par(mfrow = c(1, 2))
barplot(df1$freq, names.arg = df1$word, las = 2, col = 2, ylab = "Word frequency", xlab = "Hashtags", main = "Germany")

barplot(df2$freq, names.arg = df2$word, las = 2, col = 3, ylab = "Word frequency", xlab = "Hashtags", main = "Austria")
par(old_par)

Tweets analysis

# Create vectors containing only the tweet text
Text_g <- tweets_Germany$text
Text_a <- tweets_Austria$text

# Strip Twitter-specific noise from raw tweet text.
#
# text: character vector of raw tweets.
# Returns the cleaned, upper-cased text with missing tweets dropped.
clean_tweet_text <- function(text) {
  text <- gsub("#\\S*", "", text)       # hashtags
  text <- gsub("http\\S*", "", text)    # links (http and https)
  text <- gsub("@\\S*", "", text)       # mentions
  # Remove only the HTML entity "&amp;"; deleting every "amp" would also
  # damage ordinary words such as "Kampf" or "Wahlkampf".
  text <- gsub("&amp;", " ", text, fixed = TRUE)
  # Replace line breaks with a space so the words on either side stay separate.
  text <- gsub("[\r\n]", " ", text)
  text <- gsub("[[:punct:]]", "", text) # punctuation
  text <- gsub("\\d", "", text)         # digits
  na.omit(toupper(str_squish(text)))
}

# Build a document-feature matrix from cleaned text, dropping German and
# English stop words.
tweet_dfm <- function(text) {
  corpus(text) %>%
    tokens(remove_punct = TRUE) %>%
    dfm() %>%
    dfm_remove(stopwords("german")) %>%
    dfm_remove(stopwords("english"))
}

# Germany
Text_g <- clean_tweet_text(Text_g)
ger <- tweet_dfm(Text_g)
textplot_wordcloud(ger, max_words = 200)

# Austria
Text_a <- clean_tweet_text(Text_a)
aus <- tweet_dfm(Text_a)
textplot_wordcloud(aus, max_words = 200)

# Only words occurring more often than this are shown in the bar plots
min_freq <- 200

# Germany
# Total frequency of every feature, most frequent first
words <- sort(colSums(ger), decreasing = TRUE)
df <- data.frame(word = names(words), freq = words)
df <- df[df$freq > min_freq, ]
barplot(df$freq, names.arg = df$word, las = 2, col = 2, main = "Germany")

# Austria
words <- sort(colSums(aus), decreasing = TRUE)
df <- data.frame(word = names(words), freq = words)
df <- df[df$freq > min_freq, ]
barplot(df$freq, names.arg = df$word, las = 2, col = 2, main = "Austria")

# Sentiment Analysis

# Germany
# Re-encode the cleaned tweets with iconv's default (locale) settings, then
# score them against the German NRC lexicon.
tg <- iconv(Text_g, from = "", to = "")
s1 <- get_nrc_sentiment(tg, language = "german")

## Warning: `spread_()` was deprecated in tidyr 1.2.0.
## Please use `spread()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

# Column totals of the NRC sentiment scores over all German tweets
barplot(
  colSums(s1),
  las = 2,
  col = rainbow(10),
  ylab = 'Count',
  main = 'Sentiment Scores Tweets'
)

# Per-tweet sentiment value. The `language` argument is only honoured by the
# "nrc" method; with method = "syuzhet" the German tweets would be scored
# against the default English lexicon, so "nrc" is used here.
values_g <- get_sentiment(Text_g, method = "nrc", language = "german")
simple_plot(values_g)

# Austria
# Re-encode the cleaned tweets, then score them against the German NRC lexicon
ta <- iconv(Text_a)
s2 <- get_nrc_sentiment(ta, language = "german")

# Column totals of the NRC sentiment scores over all Austrian tweets
barplot(
  colSums(s2),
  las = 2,
  col = rainbow(10),
  ylab = 'Count',
  main = 'Sentiment Scores Tweets'
)

# Per-tweet sentiment value. The `language` argument is only honoured by the
# "nrc" method; with method = "syuzhet" the German-language tweets would be
# scored against the default English lexicon, so "nrc" is used here.
values_a <- get_sentiment(Text_a, method = "nrc", language = "german")
simple_plot(values_a)

Latent Dirichlet Allocation (LDA)

# Germany
# Fit a 10-topic LDA model on the German document-feature matrix and list the
# first 10 terms of every topic.
# NOTE(review): `1/1:10` parses as 1 / (1:10), i.e. a length-10 vector --
# confirm that a vector-valued alpha is what is intended here.
ger_topicmodels <- convert(ger, to = "topicmodels")
lda_g <- LDA(
  ger_topicmodels,
  k = 10,
  control = list(seed = 123, alpha = 1 / (1:10))
)
terms(lda_g, 10)

##       Topic 1       Topic 2       Topic 3         Topic 4     Topic 5   
##  [1,] "bayern"      "herzlichen"  "dass"          "unsere"    "leben"   
##  [2,] "zukunft"     "glückwunsch" "scholz"        "heute"     "frauen"  
##  [3,] "minister"    "land"        "olaf"          "opfer"     "freiheit"
##  [4,] "ja"          "unsere"      "ganze"         "müssen"    "dass"    
##  [5,] "danke"       "frankwalter" "entscheidung"  "menschen"  "immer"   
##  [6,] "gemeinsam"   "liebe"       "bundeskanzler" "jahre"     "unsere"  
##  [7,] "fortschritt" "dass"        "robert"        "tag"       "kinder"  
##  [8,] "ukraine"     "milliarden"  "bedeutet"      "jahren"    "macht"   
##  [9,] "deutschland" "dank"        "deutschen"     "dass"      "debatte" 
## [10,] "berlin"      "zeiten"      "deutschland"   "vergessen" "heute"   
##       Topic 6      Topic 7 Topic 8       Topic 9    Topic 10              
##  [1,] "krieg"      "heute" "müssen"      "uhr"      "mehr"                
##  [2,] "ukraine"    "gute"  "bm"          "ab"       "menschen"            
##  [3,] "presseinfo" "mehr"  "geht"        "live"     "müssen"              
##  [4,] "europa"     "wahl"  "dass"        "brauchen" "fordert"             
##  [5,] "menschen"   "dass"  "brauchen"    "mehr"     "bundesfinanzminister"
##  [6,] "stehen"     "abend" "menschen"    "heute"    "dass"                
##  [7,] "dass"       "danke" "sagt"        "dabei"    "maßnahmen"           
##  [8,] "heute"      "spd"   "darum"       "statt"    "euro"                
##  [9,] "putin"      "neuen" "mehr"        "beim"     "braucht"             
## [10,] "russland"   "neue"  "deutschland" "geht"     "bürger"

# Austria
# Fit a 10-topic LDA model on the Austrian document-feature matrix and list the
# first 10 terms of every topic.
# NOTE(review): `1/1:10` parses as 1 / (1:10), i.e. a length-10 vector --
# confirm that a vector-valued alpha is what is intended here.
aus_topicmodels <- convert(aus, to = "topicmodels")
lda_a <- LDA(
  aus_topicmodels,
  k = 10,
  control = list(seed = 123, alpha = 1 / (1:10))
)
terms(lda_a, 10)

##       Topic 1     Topic 2      Topic 3     Topic 4      Topic 5      
##  [1,] "heute"     "heute"      "danke"     "dass"       "menschen"   
##  [2,] "dass"      "geht"       "regierung" "menschen"   "österreich" 
##  [3,] "immer"     "mehr"       "dass"      "europa"     "mehr"       
##  [4,] "nehammer"  "sobotka"    "via"       "unsere"     "dass"       
##  [5,] "menschen"  "wien"       "ukraine"   "ukraine"    "regierung"  
##  [6,] "regierung" "euro"       "heute"     "österreich" "geht"       
##  [7,] "övp"       "österreich" "wurde"     "demokratie" "tun"        
##  [8,] "wurde"     "teil"       "gerade"    "krieg"      "impfpflicht"
##  [9,] "tag"       "h"          "gute"      "angriff"    "müssen"     
## [10,] "d"         "wolfgang"   "övp"       "gibt"       "ja"         
##       Topic 6        Topic 7      Topic 8      Topic 9     Topic 10    
##  [1,] "interview"    "dass"       "dass"       "russian"   "menschen"  
##  [2,] "abhängigkeit" "ukraine"    "heute"      "ukraine"   "regierung" 
##  [3,] "jahren"       "heute"      "unsere"     "russia"    "viele"     
##  [4,] "dass"         "sanktionen" "mehr"       "people"    "dass"      
##  [5,] "regierung"    "mehr"       "övp"        "kyiv"      "immer"     
##  [6,] "sagt"         "nie"        "russland"   "ukrainian" "mehr"      
##  [7,] "seit"         "sicherheit" "österreich" "us"        "övp"       
##  [8,] "russischem"   "wirklich"   "schon"      "breaking"  "österreich"
##  [9,] "dr"           "pandemie"   "ganz"       "now"       "müssen"    
## [10,] "wäre"         "österreich" "chats"      "new"       "seit"

Big Data assignment Notebook

First, load the required packages.

Second, import the data. We have two datasets of tweets by German-speaking politicians: one from Germany and one from Austria.

Third, select the time period of the study and the required variables.

The tweet authors (Sender)

Plot how frequently Ukraine was hashtagged on Twitter.

Plotting the number of UKRAINE hashtags per day

Tweets analysis

Latent Dirichlet Allocation (LDA)