# Load the tweet export. read_csv() is called through its namespace because
# library(tidyverse) is only attached further down in this script.
ik <- readr::read_csv("IK_tweets.csv")
# Preview the first rows to confirm the columns were parsed as expected.
head(ik)
## # A tibble: 6 × 4
## `_id` Date User Tweets
## <dbl> <dttm> <chr> <chr>
## 1 1 2022-10-30 14:07:08 ImranKhanPTI Shocked & deeply saddened by the t…
## 2 2 2022-10-29 17:37:22 ImranKhanPTI For all those spreading rumours about …
## 3 3 2022-10-28 18:43:41 ImranKhanPTI Congratulations to Abid Zuberi on his …
## 4 4 2022-10-24 11:28:55 ImranKhanPTI We saw a foreign abetted regime change…
## 5 5 2022-10-24 11:28:54 ImranKhanPTI & safeguarded against excesses by …
## 6 6 2022-10-24 11:28:53 ImranKhanPTI Arshad Sharif's murder has sent shockw…
# List the column names of the raw tweet table.
names(ik)
## [1] "_id" "Date" "User" "Tweets"
# Reduce the timestamp column to a calendar date (time of day is dropped).
ik$Date <- as.Date(ik$Date)
library(mice)
# Tabulate and plot the missing-data pattern. TRUE is spelled out because
# the shorthand T is an ordinary variable that can be reassigned.
md.pattern(ik, rotate.names = TRUE)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## _id Date User Tweets
## 1000 1 1 1 1 0
## 0 0 0 0 0
# Attach Amelia (presumably for the missmap() call further down).
library(Amelia)
# Descriptive statistics for every column of the tweet table; psych is
# called through its namespace instead of being attached.
psych::describe(ik)
## vars n mean sd median trimmed mad min max range skew
## _id 1 1000 500.5 288.82 500.5 500.5 370.65 1 1000 999 0
## Date 2 1000 NaN NA NA NaN NA Inf -Inf -Inf NA
## User* 3 1000 1.0 0.00 1.0 1.0 0.00 1 1 0 NaN
## Tweets* 4 1000 499.5 288.54 499.5 499.5 370.65 1 995 994 0
## kurtosis se
## _id -1.20 9.13
## Date NA NA
## User* NaN 0.00
## Tweets* -1.21 9.12
# Visualise missing data: draw a missingness map of the tweet table with
# custom plot margins.
missmap(ik, margins = c(2,4))
#libraries for tweet analysis
library(tidyverse)
library(plotly)
library(DT)
library(tidytext)
library(ggrepel)
library(lubridate)
library(scales)
library(janitor)
library(RColorBrewer)
# Make theme_light() the default ggplot2 theme for all plots that follow.
theme_set(theme_light())
# Compact overview of the column types and the first values of each column.
glimpse(ik)
## Rows: 1,000
## Columns: 4
## $ `_id` <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, …
## $ Date <date> 2022-10-30, 2022-10-29, 2022-10-28, 2022-10-24, 2022-10-24, 20…
## $ User <chr> "ImranKhanPTI", "ImranKhanPTI", "ImranKhanPTI", "ImranKhanPTI",…
## $ Tweets <chr> "Shocked & deeply saddened by the terrible accident that le…
# Frequency table of tweet texts before any normalisation.
datatable(
  ik %>% count(Tweets, sort = TRUE),
  caption = NULL, options = list(dom = "t")
)
# Change text to lower case. Only character columns are touched, so `_id`
# stays numeric and `Date` stays a Date instead of being coerced to character.
ik <- ik %>% mutate(across(where(is.character), str_to_lower))
# Check the frequency table again after lower-casing.
datatable(
  ik %>% count(Tweets, sort = TRUE),
  caption = NULL, options = list(dom = "t")
)
# Check missing data: number of NA values per column. vapply() guarantees an
# integer vector whatever the input looks like.
vapply(ik, function(x) sum(is.na(x)), integer(1))
## _id Date User Tweets
## 0 0 0 0
#no missing data is found
# Derive per-tweet features from the posting date and the tweet text:
#   year, month, day : calendar year, month label and weekday label
#   NumChar          : number of characters in the tweet
#   NumWords         : number of matches of the regex \w+ in the tweet
ik_tw <- mutate(
  ik,
  Date = ymd(Date),
  year = year(Date),
  month = month(Date, label = TRUE),
  day = wday(Date, label = TRUE),
  NumChar = nchar(Tweets),
  NumWords = str_count(Tweets, "\\w+")
)
# One logical per column: TRUE when the column contains at least one NA.
ik_tw %>% map_dfc(anyNA)
## # A tibble: 1 × 9
## `_id` Date User Tweets year month day NumChar NumWords
## <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
## 1 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Inspect the enriched table, including the newly derived columns.
ik_tw %>% glimpse()
## Rows: 1,000
## Columns: 9
## $ `_id` <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"…
## $ Date <date> 2022-10-30, 2022-10-29, 2022-10-28, 2022-10-24, 2022-10-24, …
## $ User <chr> "imrankhanpti", "imrankhanpti", "imrankhanpti", "imrankhanpti…
## $ Tweets <chr> "shocked & deeply saddened by the terrible accident that …
## $ year <dbl> 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2…
## $ month <ord> Oct, Oct, Oct, Oct, Oct, Oct, Oct, Oct, Oct, Oct, Oct, Oct, O…
## $ day <ord> Sun, Sat, Fri, Mon, Mon, Mon, Mon, Mon, Mon, Thu, Wed, Sun, S…
## $ NumChar <int> 278, 288, 65, 283, 296, 280, 59, 283, 280, 280, 274, 246, 238…
## $ NumWords <int> 50, 57, 10, 46, 41, 44, 10, 44, 50, 53, 46, 41, 39, 46, 36, 4…
# Median number of tweets per calendar year and per calendar month.
med_yr <- ik_tw %>%
  count(year) %>%
  summarise(median_tweets_perYear = round(median(n), 2))
med_month <- ik_tw %>%
  count(month) %>%
  summarise(median_tweets_perMonth = round(median(n), 2))
# Assign first, then display, so the assignment is not hidden inside the
# datatable() call.
Median_Yr_Month <- cbind(med_yr, med_month)
datatable(Median_Yr_Month, caption = NULL, options = list(dom = "t"))
# Number of tweets per year.
datatable(ik_tw %>% count(year), caption = NULL, options = list(dom = "t"))
# Number of tweets per month.
datatable(ik_tw %>% count(month), caption = NULL, options = list(dom = "t"))
# The 15 busiest days (ties kept). slice_max() names the ranking column
# explicitly, whereas top_n(15) silently picked the last column.
datatable(
  ik_tw %>% count(Date, sort = TRUE) %>% slice_max(n, n = 15),
  caption = NULL, options = list(dom = "t")
)
# Drill down into the tweets of 2021-05-01; compare Date with Date rather
# than relying on implicit string coercion.
datatable(
  ik_tw %>% select(Date, Tweets) %>% filter(Date == as.Date("2021-05-01")),
  caption = NULL, options = list(dom = "t")
)
# Interactive scatter plot: characters per tweet against words per tweet,
# with points coloured by year.
(
  ik_tw %>%
    select(NumChar, NumWords, year) %>%
    ggplot(aes(x = NumChar, y = NumWords, col = as.factor(year))) +
    geom_point() +
    labs(
      title = "Number of character vs Number of words by year",
      x = "Number of Characters",
      y = "Number of words",
      col = "year"
    )
) %>%
  ggplotly(tooltip = c("NumChar", "year"))
The number of characters is highly correlated with the number of words!
# Interactive histogram (30 bins) of the number of characters per tweet.
(
  ik_tw %>%
    select(NumChar) %>%
    ggplot(aes(x = NumChar)) +
    geom_histogram(bins = 30, fill = "darkred") +
    labs(
      title = "Number of characters used",
      x = "Number of characters"
    )
) %>%
  ggplotly(tooltip = c("NumChar"))
The number of characters per tweet increased over time, from the start to the end of his time in government.
###Number of characters by year (boxplot)
# Interactive box plot of characters per tweet, one box per year.
(
  ik_tw %>%
    select(NumChar, year) %>%
    ggplot(aes(x = as.factor(year), y = NumChar)) +
    geom_boxplot(aes(fill = as.factor(year))) +
    labs(
      title = "Number of characters used per year",
      x = "Year",
      y = "Number of Characters",
      fill = "year"
    )
) %>%
  ggplotly()
It is also evident that the number of characters used increased from 2019 to 2022.
##Countries mentioned in IKs tweets #create a new variable country_mentioned
# Tag every tweet with the first country keyword found in its text (NA when
# none is found). Matching is plain substring matching, so "india" also hits
# "indian".
#
# Order matters inside the alternation: the longer Saudi spellings must come
# before the bare "saudi", otherwise "saudi" always wins and the value
# "saudi arabia" (which the later country filter expects) is never produced.
country_pattern <- paste(
  c(
    "china", "chinese", "russia", "iran", "france", "germany", "ukraine",
    "saudi arabia", "saudiarabia", "saudi", "arabia",
    "australia", "canada", "malaysia", "mexico", "india", "afghanistan",
    "bangladesh", "uae", "dubai", "israel", "iraq"
  ),
  collapse = "|"
)
# Spelling variants that are folded into one canonical country name.
country_aliases <- c(
  chinese = "china",
  saudiarabia = "saudi arabia",
  saudi = "saudi arabia",
  arabia = "saudi arabia"
)
ik_tw <- ik_tw %>%
  mutate(
    country_mentioned = str_extract(Tweets, pattern = country_pattern),
    # Look up the alias; values without an alias (and NA) are kept as they are.
    country_mentioned = coalesce(
      unname(country_aliases[country_mentioned]),
      country_mentioned
    )
  )
# Frequency table of the countries found (NA = no country keyword matched).
datatable(
  ik_tw %>% count(country_mentioned, sort = TRUE),
  caption = NULL, options = list(dom = "t")
)
I have listed the countries most relevant to Pakistan, but will decide which ones to keep after looking at the frequency table.
###Countries are mentioned in IKs tweeter feed
# Horizontal bar chart: number of tweets per mentioned country, sorted by
# count, with the count printed next to each bar.
ik_tw %>%
  filter(!is.na(country_mentioned)) %>%
  count(country_mentioned, name = "Count") %>%
  ggplot(
    aes(
      x = fct_reorder(country_mentioned, Count),
      y = Count,
      label = Count,
      fill = country_mentioned
    )
  ) +
  geom_col() +
  geom_text(hjust = -0.2) +
  coord_flip() +
  theme(legend.position = "none") +
  labs(
    title = "Countries are mentioned in IK's tweeter feed",
    subtitle = "Bar plot for countries mentioned in IK's tweets",
    caption = "IK's Twitter feed (2018-2022)",
    x = "Country Name",
    y = "Number of tweets"
  )
The top five are India, followed by Afghanistan, China, Israel and
Saudi Arabia.
###Number of Tweets by country per year (Time series)
# Countries kept for the per-year comparisons below.
SixCountries <- c("india", "afghanistan", "china", "israel", "saudi arabia", "uae")
# Tweets per year for each selected country, one facet per country.
# %in% is never TRUE for NA, so no separate is.na() filter is needed, and
# count() returns an ungrouped table sorted by year and country.
ik_tw %>%
  filter(country_mentioned %in% SixCountries) %>%
  count(year, country_mentioned, name = "YCount") %>%
  ggplot(aes(x = year, y = YCount, col = country_mentioned)) +
  geom_line(size = 1.5) +
  # mean(YCount) is evaluated over the whole table, so every facet shows the
  # same overall mean line.
  geom_hline(
    aes(yintercept = mean(YCount)),
    size = 1.2, col = "red", alpha = 0.2
  ) +
  geom_text(aes(label = YCount), vjust = -0.7, col = "black") +
  # NOTE(review): expand = c(0, 100) pads the axis by 100 units on each side;
  # confirm that this much padding is intended.
  scale_y_continuous(
    expand = c(0, 100),
    labels = label_number(suffix = "Tweets")
  ) +
  facet_wrap(vars(country_mentioned)) +
  theme(legend.position = "none") +
  scale_color_brewer(palette = "Set2") +
  labs(
    title = "Number of tweets by Country per year (Time series)",
    subtitle = "Facet wrap plot with the Mean",
    caption = "IK's twitter feeds (2018-2022)",
    # The x axis shows the year, not the country.
    x = "Year", y = "Number of tweets"
  )
###Number of characters used by country per year
# Total characters tweeted per year for each selected country, drawn as one
# line per country in a single panel.
# The total is computed in a single grouped sum; the original two-stage
# aggregation (by NumChar value first, then by country and year) produced
# the same totals.
ik_tw %>%
  filter(country_mentioned %in% SixCountries) %>%
  group_by(country_mentioned, year) %>%
  summarise(GTotalChar = sum(NumChar), .groups = "drop") %>%
  arrange(year) %>%
  ggplot(aes(x = year, y = GTotalChar, col = country_mentioned)) +
  geom_line(size = 1.5) +
  # Overall mean of the yearly totals across all countries.
  geom_hline(
    aes(yintercept = mean(GTotalChar)),
    size = 1.2, col = "red", alpha = 0.2
  ) +
  geom_text(
    aes(label = comma(GTotalChar, accuracy = 1)),
    vjust = -0.7, col = "black"
  ) +
  # Totals outside 0-80000 fall outside the axis limits and are not drawn.
  scale_y_continuous(
    limits = c(0, 80000),
    labels = label_number(suffix = "Char")
  ) +
  theme(legend.position = "top") +
  scale_color_brewer(palette = "Set2") +
  labs(
    # Title and axis labels describe this plot (characters, not tweet counts;
    # a single panel, not facets).
    title = "Number of characters used by Country per year (Time series)",
    subtitle = "Line plot with the overall mean",
    caption = "IK's twitter feeds (2018-2022)",
    x = "Year", y = "Number of characters"
  )
###Mean character and Mean Tweets used
# Per country and year: tweet count (YCount) and total characters in
# thousands (totalNchar). Per country, the rounded means of both measures are
# added, plus text labels that are non-empty only at the country's maximum.
MeanTweetByCountry <- ik_tw %>%
  filter(
    !is.na(country_mentioned),
    country_mentioned %in% SixCountries
  ) %>%
  group_by(year, country_mentioned) %>%
  summarise(
    YCount = n(),
    totalNchar = round(sum(NumChar / 1000), 1)
  ) %>%
  arrange(year) %>%
  group_by(country_mentioned) %>%
  mutate(
    across(
      c(YCount, totalNchar),
      ~ round(mean(.x), 0),
      .names = "mean_{.col}"
    ),
    labelYCount = ifelse(YCount == max(YCount), YCount, ''),
    labeltotalNchar = ifelse(
      totalNchar == max(totalNchar),
      paste(totalNchar, "kch"),
      ''
    )
  )
# One facet per country: yearly tweet counts as a line, the country's mean
# tweet count (orange) and mean kilo-characters (black) as horizontal lines,
# and text labels at each country's maximum.
ggplot(MeanTweetByCountry) +
  geom_line(aes(x = year, y = YCount, col = country_mentioned), size = 1.5) +
  geom_text(
    aes(x = year, y = YCount, label = labelYCount),
    size = 3, vjust = -0.5
  ) +
  # The colour is a constant, so it is set outside aes(); inside aes() the
  # string "orange" was treated as a data category and added to the legend.
  geom_hline(aes(yintercept = mean_YCount), col = "orange", size = 1) +
  geom_text(
    aes(x = year, y = totalNchar, label = labeltotalNchar),
    size = 3, vjust = -0.5
  ) +
  geom_hline(aes(yintercept = mean_totalNchar), col = "black", size = 1) +
  facet_wrap(vars(country_mentioned), scales = "free_y", ncol = 3) +
  theme(legend.position = "top") +
  labs(
    title = "Mean character and mean tweets used!, by country, per year (Time series)",
    subtitle = "Facet wrap plot (Mean character and mean tweets used!)",
    caption = "IK's tweeter feed (2018-2022)",
    # The x axis shows the year; the y axis carries both tweet counts and
    # characters in thousands.
    x = "Year", y = "Tweets per year / characters (thousands)"
  )
## Companies and TV and Social Medias and others
# Count tweets by the first narrative keyword found in their text and draw
# the counts as a horizontal bar chart.
# - The tweets are lower-cased earlier in the script, so the party keyword
#   must be lower case (" ppp ") to be able to match at all.
# - Some alternatives are padded with spaces to avoid hits inside longer
#   words; str_squish() strips that padding from the extracted value so that
#   " government" and "government" are counted as one narrative.
# NOTE(review): "plmn " looks like a misspelling of "pmln"; confirm.
ik_tw %>%
  mutate(
    narratives_mentioned = str_squish(
      str_extract(
        Tweets,
        pattern = "government|eid|governance|media|social media|opposition|corruption|nro|petrol|diesel|sharif|bhutto|nani|plmn | ppp | ary|tanzeem saazi|pakistan military|establishment|geo|sindh|people|govt|nation|prayer|condolence|sadaf| government|zardari|maryum|kashmir|solidarity|covid|regime"
      )
    )
  ) %>%
  count(narratives_mentioned, sort = TRUE) %>%
  filter(!is.na(narratives_mentioned)) %>%
  ggplot(aes(x = n, y = fct_reorder(narratives_mentioned, n))) +
  geom_col(aes(fill = narratives_mentioned)) +
  geom_text(aes(label = n), hjust = -0.1) +
  theme(legend.position = "none") +
  scale_fill_ordinal() +
  labs(
    title = "Popular narratives discussed",
    subtitle = "Plot, what are the most common narratives talked about",
    caption = "IK's tweeter feed (2018-2022)",
    # x shows the tweet count and y the narrative keyword.
    x = "Number of tweets",
    y = "Narrative"
  )
## Text Mining: Word, Bigram, Trigram and Quadgram.
Using the tidytext package, I am going to find the words used most often in his tweets, starting with single words and ending with quadgrams, by way of bigrams and trigrams.
Although a single word (monogram) cannot give a complete picture of the meaning behind it, because we need to see the context in which it is used, I will start with monograms just to get an idea of the words Imran Khan uses most often.
I will then use the unnest_tokens function from the tidytext package together with anti_join.
# List the columns available for the text-mining steps.
names(ik_tw)
## [1] "_id" "Date" "User"
## [4] "Tweets" "year" "month"
## [7] "day" "NumChar" "NumWords"
## [10] "country_mentioned"
# Split the tweets into single words, drop stop words, and plot the 20 most
# frequent words (ties kept) as a horizontal bar chart.
ik_tw %>%
  select(Tweets) %>%
  unnest_tokens(output = word, input = Tweets) %>%
  anti_join(get_stopwords()) %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 20) %>%
  ggplot(aes(x = fct_reorder(word, n), y = n, fill = word)) +
  geom_col() +
  coord_flip() +
  theme(legend.position = "none") +
  labs(
    title = "The most words used",
    subtitle = "Bar plot, The most words used",
    caption = "Ik's tweeter feed 2018-2022",
    x = "word",
    y = "Count"
  )
###Bigram
# Most frequent two-word phrases, faceted by year.
# Bigrams containing a stop word, and the URL fragment "https t.co", are
# dropped before counting.
# NOTE(review): slice_max() keeps the overall top 30 rows, not the top 30 of
# each year; confirm that this is the intended selection.
ik_tw %>%
  select(Tweets, year, Date) %>%
  unnest_tokens(Bigram, Tweets, token = "ngrams", n = 2) %>%
  separate(Bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
  unite(Bigram, word1, word2, sep = " ") %>%
  filter(Bigram != "https t.co") %>%
  count(Bigram, year, sort = TRUE) %>%
  # reorder_within() already orders the phrases by count inside each year,
  # so no further fct_reorder() is needed in aes().
  mutate(Bigram = reorder_within(Bigram, n, year)) %>%
  slice_max(n, n = 30) %>%
  ggplot(aes(x = Bigram, y = n, fill = Bigram)) +
  geom_col() +
  coord_flip() +
  theme(legend.position = "none") +
  facet_wrap(vars(year), scales = "free_y", ncol = 2) +
  # Only the axis holding the reordered factor gets the *_reordered scale;
  # the count axis is continuous and keeps its default numeric scale.
  scale_x_reordered() +
  labs(
    title = "The most word used (Bigram)",
    subtitle = "Bar plot and facet wrap, The most words used (Bigram)",
    caption = "IK's tweeter feed 2018-2022",
    x = "Bigram", y = "Count"
  )
2018 - terrorist attack, condemn; 2019 - human rights, indian occupation, hindu supremacy, ethnic cleansing, international community; 2020 - covid 19, human rights, modi govt, hindu supremacy, flag operations, false flag; 2021 - sri lanka, kashmir, unsc resolution, amp prayer, 10 years, indian occupation, covid, condolence, climate change; 2022 - imported govt, regime change, change conspiracy, strongly condemn, haqeeqi azadi, prophet, modi, sovereignty amp, amp prayers
#Trigram
# Most frequent three-word phrases, faceted by year.
# Trigrams containing a stop word or a URL fragment are dropped before
# counting.
# NOTE(review): slice_max() keeps the overall top 30 rows, not the top 30 of
# each year; confirm that this is the intended selection.
ik_tw %>%
  select(Tweets, year, Date) %>%
  unnest_tokens(Trigram, Tweets, token = "ngrams", n = 3) %>%
  separate(Trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word3 %in% stop_words$word
  ) %>%
  unite(Trigram, word1, word2, word3, sep = " ") %>%
  # A three-word string can never equal "https t.co", so URL fragments are
  # detected anywhere inside the phrase instead.
  filter(!str_detect(Trigram, "\\b(https|t\\.co)\\b")) %>%
  count(Trigram, year, sort = TRUE) %>%
  # reorder_within() already orders the phrases by count inside each year.
  mutate(Trigram = reorder_within(Trigram, n, year)) %>%
  slice_max(n, n = 30) %>%
  ggplot(aes(x = Trigram, y = n, fill = Trigram)) +
  geom_col() +
  coord_flip() +
  theme(legend.position = "none") +
  facet_wrap(vars(year), scales = "free_y", ncol = 2) +
  # Only the axis holding the reordered factor gets the *_reordered scale;
  # the count axis is continuous and keeps its default numeric scale.
  scale_x_reordered() +
  labs(
    title = "The most word used (Trigram)",
    subtitle = "Bar plot and facet wrap, The most words used (Trigram)",
    caption = "IK's tweeter feed 2018-2022",
    x = "Trigram", y = "Count"
  )
###Quadgram
# Most frequent four-word phrases, faceted by year.
# Quadgrams containing a stop word or a URL fragment are dropped before
# counting.
# NOTE(review): slice_max() keeps the overall top 30 rows, not the top 30 of
# each year; confirm that this is the intended selection.
ik_tw %>%
  select(Tweets, year, Date) %>%
  unnest_tokens(Quadgram, Tweets, token = "ngrams", n = 4) %>%
  separate(Quadgram, c("word1", "word2", "word3", "word4"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word3 %in% stop_words$word,
    !word4 %in% stop_words$word
  ) %>%
  unite(Quadgram, word1, word2, word3, word4, sep = " ") %>%
  # A four-word string can never equal "https t.co", so URL fragments are
  # detected anywhere inside the phrase instead.
  filter(!str_detect(Quadgram, "\\b(https|t\\.co)\\b")) %>%
  count(Quadgram, year, sort = TRUE) %>%
  # reorder_within() already orders the phrases by count inside each year.
  mutate(Quadgram = reorder_within(Quadgram, n, year)) %>%
  slice_max(n, n = 30) %>%
  ggplot(aes(x = Quadgram, y = n, fill = Quadgram)) +
  geom_col() +
  coord_flip() +
  theme(legend.position = "none") +
  facet_wrap(vars(year), scales = "free_y", ncol = 2) +
  # Only the axis holding the reordered factor gets the *_reordered scale;
  # the count axis is continuous and keeps its default numeric scale.
  scale_x_reordered() +
  labs(
    title = "The most word used (Quadgram)",
    subtitle = "Bar plot and facet wrap, The most words used (Quadgram)",
    caption = "IK's tweeter feed 2018-2022",
    x = "Quadgram", y = "Count"
  )
The crux for each year is summarised below: 2018 - Justice Ijazul Ahsan’s house incident; 2019 - ethnic cleansing amp genocide, hindu supremacy, rss, hind etc.; 2020 - condolences to victims' families, racist hindu, human rights; 2021 - corruption, hindu, rss, etc.; 2021 - corruption, money laundering, ruling elites, kp local govt elections; 2022 - fair and free elections, regime change conspiracy, UAE prince, kashmiri leader, etc.