# Load the raw tweet export and preview the first six rows.
ik <- read_csv(file = "IK_tweets.csv")
head(ik, n = 6L)
## # A tibble: 6 × 4
##   `_id` Date                User         Tweets                                 
##   <dbl> <dttm>              <chr>        <chr>                                  
## 1     1 2022-10-30 14:07:08 ImranKhanPTI Shocked &amp; deeply saddened by the t…
## 2     2 2022-10-29 17:37:22 ImranKhanPTI For all those spreading rumours about …
## 3     3 2022-10-28 18:43:41 ImranKhanPTI Congratulations to Abid Zuberi on his …
## 4     4 2022-10-24 11:28:55 ImranKhanPTI We saw a foreign abetted regime change…
## 5     5 2022-10-24 11:28:54 ImranKhanPTI &amp; safeguarded against excesses by …
## 6     6 2022-10-24 11:28:53 ImranKhanPTI Arshad Sharif's murder has sent shockw…
# Column names of the raw data.
names(ik)
## [1] "_id"    "Date"   "User"   "Tweets"
# Reduce the date-time stamp to a calendar date.
ik$Date <- as.Date(ik$Date)
library(mice)
# Tabulate the missing-data patterns. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
md.pattern(ik, rotate.names = TRUE)
##  /\     /\
## {  `---'  }
## {  O   O  }
## ==>  V <==  No need for mice. This data set is completely observed.
##  \  \|/  /
##   `-----'

##      _id Date User Tweets  
## 1000   1    1    1      1 0
##        0    0    0      0 0
library(Amelia)
# Per-column descriptive statistics (n, mean, sd, median, range, skew, ...).
psych::describe(x = ik)
##         vars    n  mean     sd median trimmed    mad min  max range skew
## _id        1 1000 500.5 288.82  500.5   500.5 370.65   1 1000   999    0
## Date       2 1000   NaN     NA     NA     NaN     NA Inf -Inf  -Inf   NA
## User*      3 1000   1.0   0.00    1.0     1.0   0.00   1    1     0  NaN
## Tweets*    4 1000 499.5 288.54  499.5   499.5 370.65   1  995   994    0
##         kurtosis   se
## _id        -1.20 9.13
## Date          NA   NA
## User*        NaN 0.00
## Tweets*    -1.21 9.12
# Visualise missing data as a map of observed vs. missing cells.
missmap(obj = ik, margins = c(2, 4))

#libraries for tweet analysis

library(tidyverse)
library(plotly)
library(DT)
library(tidytext)
library(ggrepel)
library(lubridate)
library(scales)
library(janitor)
library(RColorBrewer)
theme_set(theme_light())

# Compact overview of column types and leading values.
ik %>% glimpse()
## Rows: 1,000
## Columns: 4
## $ `_id`  <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, …
## $ Date   <date> 2022-10-30, 2022-10-29, 2022-10-28, 2022-10-24, 2022-10-24, 20…
## $ User   <chr> "ImranKhanPTI", "ImranKhanPTI", "ImranKhanPTI", "ImranKhanPTI",…
## $ Tweets <chr> "Shocked &amp; deeply saddened by the terrible accident that le…
# Frequency of each distinct tweet text before normalisation.
datatable(ik %>% count(Tweets, sort = TRUE), caption = NULL, options = list(dom = "t"))
# Change text to lower case. Only character columns are touched: the
# superseded `mutate_all()` also pushed `_id` and `Date` through
# `str_to_lower()`, silently turning both into character vectors.
ik <- ik %>% mutate(across(where(is.character), str_to_lower))

# Check again
datatable(ik %>% count(Tweets, sort = TRUE), caption = NULL, options = list(dom = "t"))
# Check missing data: number of NAs per column. `vapply()` pins the result
# to one integer per column, whereas `sapply()` picks its return type from
# the input.
vapply(ik, function(x) sum(is.na(x)), integer(1))
##    _id   Date   User Tweets 
##      0      0      0      0
#no missing data is found
#create new variables Year, Month, Day and The number of characters and number of words!

#tweets per year and month
#create new variable (year, month, day)

# Derive calendar fields and tweet-length measures for every tweet.
ik_tw <- ik %>%
  mutate(Date = ymd(Date)) %>%
  mutate(
    year  = year(Date),
    month = month(Date, label = TRUE),
    day   = wday(Date, label = TRUE)
  ) %>%
  mutate(
    NumChar  = nchar(Tweets),
    NumWords = str_count(Tweets, pattern = "\\w+")
  )

# One logical per column: TRUE when that column contains at least one NA.
ik_tw %>% map_dfc(anyNA)
## # A tibble: 1 × 9
##   `_id` Date  User  Tweets year  month day   NumChar NumWords
##   <lgl> <lgl> <lgl> <lgl>  <lgl> <lgl> <lgl> <lgl>   <lgl>   
## 1 FALSE FALSE FALSE FALSE  FALSE FALSE FALSE FALSE   FALSE
# Inspect the enriched table, including the newly derived columns.
ik_tw %>% glimpse()
## Rows: 1,000
## Columns: 9
## $ `_id`    <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"…
## $ Date     <date> 2022-10-30, 2022-10-29, 2022-10-28, 2022-10-24, 2022-10-24, …
## $ User     <chr> "imrankhanpti", "imrankhanpti", "imrankhanpti", "imrankhanpti…
## $ Tweets   <chr> "shocked &amp; deeply saddened by the terrible accident that …
## $ year     <dbl> 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2…
## $ month    <ord> Oct, Oct, Oct, Oct, Oct, Oct, Oct, Oct, Oct, Oct, Oct, Oct, O…
## $ day      <ord> Sun, Sat, Fri, Mon, Mon, Mon, Mon, Mon, Mon, Thu, Wed, Sun, S…
## $ NumChar  <int> 278, 288, 65, 283, 296, 280, 59, 283, 280, 280, 274, 246, 238…
## $ NumWords <int> 50, 57, 10, 46, 41, 44, 10, 44, 50, 53, 46, 41, 39, 46, 36, 4…
# Median number of tweets per year and per month label. `count()` selects
# the column it needs, so no preceding `select()` is required.
med_yr <- ik_tw %>%
  count(year) %>%
  summarise(median_tweets_perYear = round(median(n), 2))

med_month <- ik_tw %>%
  count(month) %>%
  summarise(median_tweets_perMonth = round(median(n), 2))

# Assigned on its own line instead of inside the datatable() call, so the
# creation of `Median_Yr_Month` is not hidden in an argument.
Median_Yr_Month <- cbind(med_yr, med_month)
datatable(Median_Yr_Month,
          caption = NULL, options = list(dom = "t"))
# Tweets per year.
datatable(ik_tw %>% count(year),
          caption = NULL, options = list(dom = "t"))
# Tweets per month label.
datatable(ik_tw %>% count(month),
          caption = NULL, options = list(dom = "t"))
# The 15 busiest days (ties kept). `slice_max()` names the ordering column
# explicitly; the superseded `top_n(15)` guessed it from the last column.
datatable(ik_tw %>% count(Date, sort = TRUE) %>% slice_max(n, n = 15),
          caption = NULL, options = list(dom = "t"))

# Let's dig into the tweets of 2021-05-01.
ik_tw %>%
  filter(Date == "2021-05-01") %>%
  select(Date, Tweets) %>%
  datatable(caption = NULL, options = list(dom = "t"))

Data Summary

Number of character vs Number of words by year (scatter plot)

# Interactive scatter plot: characters per tweet against words per tweet,
# coloured by year.
ggplotly(
  p = ik_tw %>%
    select(NumChar, NumWords, year) %>%
    ggplot(mapping = aes(x = NumChar, y = NumWords, col = as.factor(year))) +
    geom_point() +
    labs(
      title = "Number of character vs Number of words by year",
      x = "Number of Characters",
      y = "Number of words",
      col = "year"
    ),
  tooltip = c("NumChar", "year")
)

The number of characters is highly correlated with the number of words!

Number of characters used (Histogram)

# Interactive histogram of tweet length in characters (30 bins).
ggplotly(
  p = ik_tw %>%
    select(NumChar) %>%
    ggplot(mapping = aes(x = NumChar)) +
    geom_histogram(bins = 30, fill = "darkred") +
    labs(
      title = "Number of characters used",
      x = "Number of characters"
    ),
  tooltip = c("NumChar")
)

The number of characters increased over time, from the start to the end of his time in government.

###Number of characters by year (boxplot)

# Interactive boxplot of tweet length in characters, one box per year.
ggplotly(
  p = ik_tw %>%
    select(NumChar, year) %>%
    ggplot(mapping = aes(x = as.factor(year), y = NumChar)) +
    geom_boxplot(mapping = aes(fill = as.factor(year))) +
    labs(
      title = "Number of characters used per year",
      x = "Year",
      y = "Number of Characters",
      fill = "year"
    )
)

It is also evident that the number of characters used increased from 2019 to 2022.

## Countries mentioned in IK's tweets (create a new variable, country_mentioned)

# Country keywords searched for in the (lower-cased) tweet text.
# `str_extract()` returns the first match only, and at a given position the
# alternatives are tried left to right, so the longer "saudi ?arabia" must
# come before the bare "saudi".
# Fixes over the previous pattern: "malausia" corrected to "malaysia";
# "saudi arabia" is now actually produced (before, "saudi" always won and
# the later "saudiarabia" alternative was unreachable).
country_keywords <- c(
  "china", "chinese", "russia", "iran", "france", "germany", "ukraine",
  "saudi ?arabia", "saudi", "australia", "canada", "malaysia", " arabia",
  "mexico", "india", "afghanistan", "bangladesh", "uae", "dubai",
  "israel", "iraq"
)

ik_tw <- ik_tw %>%
  mutate(
    country_mentioned = str_extract(
      Tweets,
      pattern = paste(country_keywords, collapse = "|")
    ),
    # Fold spelling variants into one label per country. Tweets without a
    # match stay NA.
    # NOTE(review): a bare " arabia" is folded into "saudi arabia" as well;
    # confirm that is the intended reading.
    country_mentioned = case_when(
      country_mentioned == "chinese" ~ "china",
      country_mentioned %in%
        c("saudi", "saudiarabia", "saudi arabia", " arabia") ~ "saudi arabia",
      TRUE ~ country_mentioned
    )
  )


# Frequency table of the extracted country labels (NA = no country found).
datatable(ik_tw %>% count(country_mentioned, sort = TRUE),
          caption = NULL, options = list(dom = "t"))

I have listed the country names most relevant to Pakistan, but will decide which ones to keep after looking at the frequency table.

###Countries are mentioned in IKs tweeter feed

# Horizontal bar chart: number of tweets per mentioned country, ordered by
# count, with the count printed next to each bar.
ik_tw %>%
  filter(!is.na(country_mentioned)) %>%
  count(country_mentioned, name = "Count") %>%
  ggplot(
    mapping = aes(
      x = fct_reorder(country_mentioned, Count),
      y = Count,
      label = Count,
      fill = country_mentioned
    )
  ) +
  geom_col() +
  geom_text(hjust = -0.2) +
  coord_flip() +
  theme(legend.position = "none") +
  labs(
    title = "Countries are mentioned in IK's tweeter feed",
    subtitle = "Bar plot for countries mentioned in IK's tweets",
    caption = "IK's Twitter feed (2018-2022)",
    x = "Country Name",
    y = "Number of tweets"
  )

The top 5 are India, followed by Afghanistan, China, Israel and Saudi Arabia.

###Number of Tweets by country per year (Time series)

# Countries followed over time. Entries must match the values stored in
# `country_mentioned` exactly, otherwise they are silently filtered out.
SixCountries <- c("india","afghanistan", "china", "israel", "saudi arabia", "uae")

# Tweets per country per year, one facet per country.
ik_tw %>% 
  # `%in%` is FALSE for NA, so no separate is.na() filter is needed.
  filter(country_mentioned %in% SixCountries) %>% 
  group_by(year, country_mentioned) %>% 
  # Drop the grouping explicitly instead of leaving the result grouped by year.
  summarise(YCount = n(), .groups = "drop") %>% 
  arrange(year) %>% 
  ggplot(aes(x = year, y = YCount, col = country_mentioned))+
  geom_line(size = 1.5)+
  # Reference line at the mean of YCount over all plotted rows.
  geom_hline(
    aes(yintercept = mean(YCount)), size = 1.2, col= "red", alpha = 0.2)+
  geom_text(aes(label = YCount), vjust = -0.7, col ="black")+
  # `labels` is spelled out in full; the previous `label =` only worked
  # through partial argument matching.
  scale_y_continuous(expand = c(0,100), labels = label_number(suffix = "Tweets"))+
  facet_wrap(vars(country_mentioned))+
  theme(legend.position = "none")+
  scale_color_brewer(palette = "Set2")+
  labs(
    title = "Number of tweets by Country per year (Time series)",
    subtitle = "Facet wrap plot with the Mean",
    caption = "IK's twitter feeds (2018-2022)",
    # The x axis shows the year, not the country (countries are the facets).
    x = "Year", y = "Number of Tweets"
  )

###Number of characters used by country per year

# Total characters tweeted per country per year, one line per country.
ik_tw %>% 
  # `%in%` is FALSE for NA, so no separate is.na() filter is needed.
  filter(country_mentioned %in% SixCountries) %>% 
  # A single aggregation replaces the previous two-stage one (sum per
  # country/NumChar/year, then sum again per country/year); the totals are
  # the same.
  group_by(country_mentioned, year) %>% 
  summarise(GTotalChar = sum(NumChar), .groups = "drop") %>% 
  arrange(year) %>% 
  ggplot(aes(x = year, y = GTotalChar, col = country_mentioned))+
  geom_line(size = 1.5)+
  # Reference line at the mean of GTotalChar over all plotted rows.
  geom_hline(aes(yintercept = mean(GTotalChar)), size = 1.2, col = "red", alpha = 0.2)+
  geom_text(aes(label = comma(GTotalChar, accuracy = 1)), vjust = -0.7, col="black")+
  # `labels` is spelled out in full; the previous `label =` only worked
  # through partial argument matching.
  scale_y_continuous(limits = c(0,80000), labels = label_number(suffix = "Char"))+
  theme(legend.position = "top")+
  scale_color_brewer(palette = "Set2")+
  # Labels corrected: they had been copied from the tweet-count plot, but
  # this chart shows characters, has year on x, and is not faceted.
  labs(
    title = "Number of characters by Country per year (Time series)",
    subtitle = "Line plot with the Mean",
    caption = "IK's twitter feeds (2018-2022)",
    x = "Year", y = "Number of characters"
  )

###Mean character and Mean Tweets used

# Yearly tweet count and character volume (in thousands) per tracked country,
# plus each country's rounded mean of both and labels marking its peak year.
MeanTweetByCountry <- ik_tw %>%
  filter(!is.na(country_mentioned) & country_mentioned %in% SixCountries) %>%
  group_by(year, country_mentioned) %>%
  summarise(
    YCount = n(),
    totalNchar = round(sum(NumChar / 1000), 1)
  ) %>%
  arrange(year) %>%
  group_by(country_mentioned) %>%
  # Per-country means across years, rounded to whole numbers.
  mutate(
    mean_YCount = round(mean(YCount), 0),
    mean_totalNchar = round(mean(totalNchar), 0)
  ) %>%
  # Only the per-country maximum gets a label; every other row is blank.
  mutate(
    labelYCount = ifelse(YCount == max(YCount), YCount, ''),
    labeltotalNchar = ifelse(
      totalNchar == max(totalNchar),
      paste(totalNchar, "kch"),
      ''
    )
  )

# Yearly tweet counts per country (coloured line) with two horizontal
# reference lines per facet: mean tweet count (orange) and mean character
# volume in thousands (black). Peak values are labelled.
ggplot(MeanTweetByCountry)+
  geom_line(aes(x = year, y = YCount, col = country_mentioned), size = 1.5)+
  geom_text(aes(x = year, y = YCount, label = labelYCount), size = 3, vjust = -0.5)+
  # The colour is set outside aes(): inside aes() the string "orange" was
  # mapped as a data value, which added an "orange" legend entry and drew
  # the line in a palette colour instead of orange.
  geom_hline(aes(yintercept = mean_YCount), col = "orange", size = 1)+
  geom_text(
    aes(x = year, y = totalNchar, label = labeltotalNchar),
    size = 3, vjust = -0.5 )+
  geom_hline(aes(yintercept = mean_totalNchar), col = "black", size = 1)+
 facet_wrap(vars(country_mentioned), scale = "free_y", ncol = 3)+
  theme(legend.position = "top")+
  labs(
    title = "Mean character and mean tweets used!, by country, per year (Time series)",
    subtitle = "Facet wrap plot (Mean chacter and mean tweets used!)",
    caption = "IK's tweeter feed (2018-2022)",
    # The x axis shows the year; countries are the facets.
    x = "Year", y = "Number of characters"
  )

## Companies and TV and Social Medias and others

GoodYear, Apple, Google, Facebook, Twitter, CNN, Fox, ABC and NBC

# Narrative keywords searched for in the (lower-cased) tweet text. Only the
# first match per tweet is kept by str_extract().
# Fixes over the previous pattern:
#   * " PPP " is now " ppp "; the upper-case form could never match text
#     that had been lower-cased.
#   * the extra " government" alternative is removed; it matched one
#     character earlier than "government" and so split the same word into
#     two categories.
# NOTE(review): "plmn " looks like a typo for "pmln" (PML-N) - confirm.
# NOTE(review): short keywords such as "eid", "geo" and "nation" also match
# inside longer words - confirm that is acceptable.
narrative_keywords <- c(
  "government", "eid", "governance", "media", "social media", "opposition",
  "corruption", "nro", "petrol", "diesel", "sharif", "bhutto", "nani",
  "plmn ", " ppp ", " ary", "tanzeem saazi", "pakistan military",
  "establishment", "geo", "sindh", "people", "govt", "nation", "prayer",
  "condolence", "sadaf", "zardari", "maryum", "kashmir", "solidarity",
  "covid", "regime"
)

ik_tw %>% 
  mutate(
    # Strip the padding spaces some keywords carry, so labels are clean.
    narratives_mentioned = str_trim(
      str_extract(Tweets, pattern = paste(narrative_keywords, collapse = "|"))
    )
  ) %>% 
  count(narratives_mentioned, sort =TRUE) %>% 
  filter(!is.na(narratives_mentioned)) %>% 
  ggplot(aes(x = n , y = fct_reorder(narratives_mentioned, n)))+
  geom_col(aes(fill = narratives_mentioned))+
  geom_text(aes(label = n), hjust = -0.1)+
  theme(legend.position = "none")+
  scale_fill_ordinal()+
  labs(
    title = "Popular narratives discussed",
    subtitle = "Plot, what are the most common narratives talked about",
    caption = "IK's tweeter feed (2018-2022)",
    # Axis labels corrected: x is the tweet count, y is the narrative.
    x = "Number of tweets",
    y = "Narrative"
  )

## Text Mining: Word, Bigram, Trigram and Quadgram.

Using the tidytext package, I’m going to find the most frequently used words in his tweets, starting with single words and ending with quadgrams, via bigrams and trigrams.

Word

Although a single word (monogram) cannot give a complete picture of the meaning behind it, because we need to see the context in which it is used, I will start with monograms just to get an idea of the words most used by Imran Khan!

So then I will use the unnest_tokens and anti_join function from the tidytext package.

# Columns available for the text-mining steps.
names(ik_tw)
##  [1] "_id"               "Date"              "User"             
##  [4] "Tweets"            "year"              "month"            
##  [7] "day"               "NumChar"           "NumWords"         
## [10] "country_mentioned"
# Split tweets into single words, drop stop words, and chart the 20 most
# frequent words as a horizontal bar plot.
ik_tw %>%
  select(Tweets) %>%
  unnest_tokens(output = word, input = Tweets) %>%
  anti_join(get_stopwords()) %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 20) %>%
  ggplot(mapping = aes(x = fct_reorder(word, n), y = n, fill = word)) +
  geom_col() +
  coord_flip() +
  theme(legend.position = "none") +
  labs(
    title = "The most words used",
    subtitle = "Bar plot, The most words used",
    caption = "Ik's tweeter feed 2018-2022",
    x = "word",
    y = "Count"
  )

###Bigram

# Most frequent two-word sequences, faceted by year. The 30 most frequent
# (bigram, year) pairs overall are shown.
ik_tw %>%
  select(Tweets, year, Date) %>%
  unnest_tokens(Bigram, Tweets, token = "ngrams", n = 2) %>%
  # Tweets shorter than two words yield an NA token; drop it so it does not
  # end up as the bigram "NA NA".
  filter(!is.na(Bigram)) %>%
  separate(Bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
  unite(Bigram, word1, word2, sep = " ") %>%
  # Remove the tokenised URL prefix of shortened links.
  filter(Bigram != "https t.co") %>%
  count(Bigram, year, sort = TRUE) %>%
  slice_max(n, n = 30) %>%
  # Order the bars by count within each year facet.
  mutate(Bigram = reorder_within(Bigram, n, year)) %>%
  ggplot(aes(x = Bigram, y = n, fill = Bigram)) +
  geom_col() +
  coord_flip() +
  theme(legend.position = "none") +
  facet_wrap(vars(year), scales = "free_y", ncol = 2) +
  # Only the bigram axis carries reorder_within() labels. The count axis is
  # continuous, so the former scale_y_reordered() is dropped.
  scale_x_reordered() +
  labs(
    title = "The most word used (Bigram)",
    subtitle = "Bar plot and facet wrap, The most words used (Bigram)",
    caption = "IK's tweeter feed 2018-2022",
    x ="Bigram", y = "Count"
  )

2018 - terrorist attack, condemn; 2019 - human rights, indian occupation, hindu supremacy, ethnic cleansing, international community; 2020 - covid 19, human rights, modi govt, hindu supremacy, flag operations, false flag; 2021 - sri lanka, kashmir, unsc resolution, amp prayer, 10 years, indian occupation, covid, condolence, climate change; 2022 - imported govt, regime change, change conspiracy, strongly condemn, haqeeqi azadi, prophet, modi, sovereignty amp, amp prayers

###Trigram

# Most frequent three-word sequences, faceted by year. The 30 most frequent
# (trigram, year) pairs overall are shown.
ik_tw %>%
  select(Tweets, year, Date) %>%
  unnest_tokens(Trigram, Tweets, token = "ngrams", n = 3) %>%
  # Tweets shorter than three words yield an NA token; drop it so it does
  # not end up as the trigram "NA NA NA".
  filter(!is.na(Trigram)) %>%
  separate(Trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word3 %in% stop_words$word
  ) %>%
  unite(Trigram, word1, word2, word3, sep = " ") %>%
  # Drop trigrams built from link fragments. The previous test
  # `Trigram != "https t.co"` compared against a two-word string and so
  # could never exclude a three-word trigram.
  filter(!str_detect(Trigram, "https|t\\.co")) %>%
  count(Trigram, year, sort = TRUE) %>%
  slice_max(n, n = 30) %>%
  # Order the bars by count within each year facet.
  mutate(Trigram = reorder_within(Trigram, n, year)) %>%
  ggplot(aes(x = Trigram, y = n, fill = Trigram)) +
  geom_col() +
  coord_flip() +
  theme(legend.position = "none") +
  facet_wrap(vars(year), scales = "free_y", ncol = 2) +
  # Only the trigram axis carries reorder_within() labels. The count axis is
  # continuous, so the former scale_y_reordered() is dropped.
  scale_x_reordered() +
  labs(
    title = "The most word used (Trigram)",
    subtitle = "Bar plot and facet wrap, The most words used (Trigram)",
    caption = "IK's tweeter feed 2018-2022",
    x ="Trigram", y = "Count"
  )

###Quadgram

# Most frequent four-word sequences, faceted by year. The 30 most frequent
# (quadgram, year) pairs overall are shown.
ik_tw %>%
  select(Tweets, year, Date) %>%
  unnest_tokens(Quadgram, Tweets, token = "ngrams", n = 4) %>%
  # Tweets shorter than four words yield an NA token; drop it so it does
  # not end up as the quadgram "NA NA NA NA".
  filter(!is.na(Quadgram)) %>%
  separate(Quadgram, c("word1", "word2", "word3", "word4"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word3 %in% stop_words$word,
    !word4 %in% stop_words$word
  ) %>%
  unite(Quadgram, word1, word2, word3, word4, sep = " ") %>%
  # Drop quadgrams built from link fragments. The previous test
  # `Quadgram != "https t.co"` compared against a two-word string and so
  # could never exclude a four-word quadgram.
  filter(!str_detect(Quadgram, "https|t\\.co")) %>%
  count(Quadgram, year, sort = TRUE) %>%
  slice_max(n, n = 30) %>%
  # Order the bars by count within each year facet.
  mutate(Quadgram = reorder_within(Quadgram, n, year)) %>%
  ggplot(aes(x = Quadgram, y = n, fill = Quadgram)) +
  geom_col() +
  coord_flip() +
  theme(legend.position = "none") +
  facet_wrap(vars(year), scales = "free_y", ncol = 2) +
  # Only the quadgram axis carries reorder_within() labels. The count axis
  # is continuous, so the former scale_y_reordered() is dropped.
  scale_x_reordered() +
  labs(
    title = "The most word used (Quadgram)",
    subtitle = "Bar plot and facet wrap, The most words used (Quadgram)",
    caption = "IK's tweeter feed 2018-2022",
    x ="Quadgram", y = "Count"
  )

The crux for each year is summarised below: 2018 - Justice Ijazul Ahsan’s house incident; 2019 - ethnic cleansing amp genocide, hindu supremacy, rss, hind etc.; 2020 - condolences to victim families, racist hindu, human rights; 2021 - corruption, hindu, rss, etc.; 2021 - corruption, money laundering, ruling elites, kp local govt elections; 2022 - fair and free elections, regime change conspiracy, UAE prince, kashmiri leader, etc.