Description

Data source

anthems <- read.csv('C:\\Users\\17327\\anthems.csv')
knitr::kable(head(anthems,3), "simple")  
Country alpha.2 alpha.3 Continent Anthem
Albania AL ALB Europe Around our flag we stand united, With one wish and one goal, A sacred oath we bestow upon it Proclaiming loyalty for our salvation. From war abstains only he, Who a traitor is born, He who is a true man is not frightened, But dies a warrior to the cause. With weapons in our hands a-brandished, We will defend our fatherland, Our sacred rights we’ll not relinquish, The foe has no place in our land. For God himself proclaimed The nations of the earth shall wane, And yet will live, will thrive Albania. For you, for you we fight. O Flag, flag, you sacred symbol Upon you we now swear For Albania, our dear fatherland For honour and your glory. Brave man is named and honoured The one who sacrificed himself for the fatherland Forever he will be remembered On earth and under as a saint!
Armenia AM ARM Europe Our Fatherland, free, independent, That has for centuries lived, Is now summoning its sons To the free, independent Armenia. Here is a flag for you, my brother, That I have sewn by hand Over the sleepless nights, And bathed in my tears. Look at it, tricolored, A valuable symbol for us. Let it shine against the enemy. Let you, Armenia, be glorious forever. Death is the same everywhere, A man dies but once, Blessed is the one that dies For the freedom of his nation.
Austria AT AUT Europe Land of mountains, land by the river, Land of fields, land of cathedrals, Land of hammers, with a promising future! Home to great daughters and sons, People highly gifted for beautiful arts, Much-praised Austria! Strongly feuded for, fiercely hard-fought for, Thou liest in the middle of the continent Like a strong heart, Since the early days of the ancestors thou hast Borne the burden of a high mission, Much-tried Austria. Bravely towards the new ages See us striding, free, and faithful, Assiduous and full of hope, Unified, let us in jolly choirs Pledge allegiance to thee, Fatherland Much-beloved Austria.

Extraction of anthems

my_func <- function(){
  x <- list()
  for (k in 1:190){
    x[[k]] <- anthems[k,5]
  }
  x
}

data <- my_func()

head(data,1)
## [[1]]
## [1] "Around our flag we stand united, With one wish and one goal, A sacred oath we bestow upon it Proclaiming loyalty for our salvation. From war abstains only he, Who a traitor is born, He who is a true man is not frightened, But dies a warrior to the cause. With weapons in our hands a-brandished, We will defend our fatherland, Our sacred rights weâââ\200šÂ¬Ã¢â\200žÂ¢ll not relinquish, The foe has no place in our land. For God himself proclaimed The nations of the earth shall wane, And yet will live, will thrive Albania. For you, for you we fight. O Flag, flag, you sacred symbol Upon you we now swear For Albania, our dear fatherland For honour and your glory. Brave man is named and honoured The one who sacrificed himself for the fatherland Forever he will be remembered On earth and under as a saint!"

Top 3 words in each Anthem

library(qdapRegex)
library(textclean)
library(tidyverse)
library(factoextra)
library(xfun)
library(cluster)
library(stringr)
library(tm)
library(dplyr)
library(tidytext)
stopwords <- read.csv("C:\\Users\\17327\\stopwords.csv")
clean_data <- function(data) {
  data <- tolower(trimws(data))
  data <- removeWords(data, c(stopwords("english")))
  stopwords <- head(sort(unlist(stopwords), decreasing=TRUE), 986)
  stopwords <- removePunctuation(stopwords)
  data <- removeWords(data, stopwords)
  data <- removePunctuation(data)
  data <- stemDocument(data, language = "english")

  corpus <- Corpus(VectorSource(data))
  dtm <- as.matrix(DocumentTermMatrix(corpus))
  
  freq <- colSums(dtm)
  freq <- as.data.frame(sort(freq, decreasing = TRUE))
  data2 <- freq
  data2 <- tibble::rownames_to_column(data2,"word")
  colnames(data2) <- c("word","count")
  head(data2$word, 3)
}

clean_data(anthems[1,5])
## [1] "fatherland" "flag"       "sacr"

Clean, process and cluster the data

Tf(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
Idf = (Total number of documents) / (Number of documents with word t in it)
Tf-Idf = Tf x Idf
stopwords <- read.csv("C:\\Users\\17327\\stopwords.csv")
cluster_data <- function() {
  
  anthem <- my_func()
  corpus <- VectorSource(anthem)
  corpus <- Corpus(corpus)
  anthem <- tm::tm_map(corpus, tolower)
  anthem <- tm::tm_map(anthem, removeWords, stopwords("english"))
  stopwords <- head(sort(unlist(stopwords), decreasing=TRUE), 986)
  stopwords <- removePunctuation(stopwords)
  anthem <- tm::tm_map(anthem, removeWords, stopwords)
  anthem <- tm::tm_map(anthem, removePunctuation)
  anthem <- tm::tm_map(anthem, stemDocument)
  anthem <- tm::tm_map(anthem, removeWords, c("listanthem"))
  anthem <- tm::tm_map(anthem, removeWords, "http[A-Za-z]+")
  anthem <- tm::tm_map(anthem, removeWords, "www.[A-Za-z]+")
  anthem <- tm::tm_map(anthem, removeWords, c("afa201rpafad", "aa200o", "a200oeend", "200end", 
                                              "200", "thi", "arab","oer", "wait"))
  anthem <- tm::tm_map(anthem, rm_non_ascii)
  anthem <- tm::tm_map(anthem, rm_nchar_words, 1)
  anthem <- tm::tm_map(anthem, rm_nchar_words, 2)
  anthem <- tm::tm_map(anthem, removeWords, stopwords)
  
  dtm <- DocumentTermMatrix(anthem)
  dtm <- removeSparseTerms(dtm, sparse = 0.98)
  dtm.tfidf <- weightTfIdf(dtm)
  dtm.tfidf
  tfidf.matrix <- t(as.matrix(dtm.tfidf)) 
  print(fviz_nbclust(tfidf.matrix, kmeans, method = "wss"))
  clustering.kmeans <- kmeans(tfidf.matrix, centers = 5)
}

cluster <- cluster_data()

knitr::kable(head(cluster$cluster, 20), "simple") 
x
bestow 1
born 1
brave 2
dear 5
defend 1
die 5
earth 1
fatherland 5
fight 1
flag 1
foe 1
forev 1
glori 5
goal 1
god 5
hand 5
honour 1
land 5
live 5
loyalti 1

Label the data based on the clustered info

final_label <- function(doc, cluster_kmeans) {
  result <- as.data.frame(cluster_kmeans$cluster)
  result <- tibble::rownames_to_column(result, "word")
  colnames(result) <- c("word","cluster")
  I <- result %>%
    filter(cluster == 1)
  I <- I$word
  II <- result %>%
    filter(cluster == 2)
  II <- II$word
  III <- result %>%
    filter(cluster == 3)
  III <- III$word
  IV <- result %>%
    filter(cluster == 4)
  IV <- IV$word
  V <- result %>%
    filter(cluster == 5)
  V <- V$word
  
  count1 <- 0
  count2 <- 0
  count3 <- 0
  count4 <- 0
  count5 <- 0
  clean_doc <- clean_data(doc)
  for (word in clean_doc) {
    ifelse((is.element(word, I)),count1 <- count1+1, 
           ifelse((is.element(word, II)), count2 <- count2+1,
                  ifelse((is.element(word, III)), count3 <- count3+1,
                         ifelse((is.element(word, IV)), count4 <- count4+1,
                                ifelse((is.element(word, V)), count5 <- count5+1,FALSE)
                         )
                  )
           )
    )
    count <- max(count1, count2, count3, count4, count5)
  }
  labels = c()
  ifelse((count1 == count), labels <- return("1"),
         ifelse((count2 == count), labels <- return("2"),
                ifelse((count3 == count), labels <- return("3"),
                       ifelse((count4 == count), labels <- return("4"),
                              ifelse((count5 == count), labels <- return("5"),)
                              )
                       )
                )
         )
}

Assign the labels to the different countries

main <- function(corpora) {
  step3 <- cluster_data()
  labels <- c()
  for (k in 1:190) {
    country <- anthems[k,5]
    labels <- append(labels, final_label(country, step3))
  }
  
  corpus <- as.data.frame(corpora)
  corpus["labels"] <- labels
  corpus[c(1,4,6)]
}

knitr::kable(head(main(anthems),20), "simple") 

Country Continent labels
Albania Europe 1
Armenia Europe 5
Austria Europe 5
Azerbaijan Europe 5
Belarus Europe 5
Belgium Europe 5
Bosnia and Herzegovina Europe 5
Bulgaria Europe 5
Croatia Europe 5
Cyprus Europe 4
Czechia Europe 5
Denmark Europe 1
Estonia Europe 5
Finland Europe 5
France Europe 1
Georgia Europe 1
Germany Europe 5
Greece Europe 4
Hungary Europe 5
Iceland Europe 1

Interactive map on Tableau

Snapshot of map

Tableau map

Source