First, we load all the files from the `dataset` folder and combine them into a single dataset.
We calculate the mean of RTs for each year, based on the column RT Count.
To do this, we need to separate original tweets from RTs and replies, and then calculate the average per year.
# Per-year extraction of tweets that are not retweets ----
#
# Every yearly table goes through exactly the same rule, so the rule lives in
# one helper instead of fourteen copy-pasted pipelines.

# Keep the rows of `tweets` that are not retweets and reduce the table to the
# columns used by the retweet analysis.
#
# A row is kept when its `referenced_tweets` value
#   * is NA,
#   * contains the text 'NULL', or
#   * does not contain the marker `type = "retweeted"`.
# Replies and quote tweets therefore stay in the result; only retweets are
# dropped.
# NOTE(review): assumes `referenced_tweets` is (or coerces to) text in which a
# retweet shows up as `type = "retweeted"` -- confirm against the loading step.
#
# tweets: one year's table; must provide the columns id, created_at, text,
#         author_id, retweet_count and referenced_tweets.
# Returns the filtered table restricted to those six columns.
keep_non_retweets <- function(tweets) {
  tweets %>%
    filter(
      is.na(referenced_tweets) |
        grepl('NULL', referenced_tweets) |
        !grepl('type = \\"retweeted\\"', referenced_tweets)
    ) %>%
    select(id, created_at, text, author_id, retweet_count, referenced_tweets)
}

# The original note says 2009 contains no RTs; the filter is applied anyway so
# that every year is processed identically.
RT2009 <- keep_non_retweets(final_2009)
RT2010 <- keep_non_retweets(final_2010)
RT2011 <- keep_non_retweets(final_2011)
RT2012 <- keep_non_retweets(final_2012)
RT2013 <- keep_non_retweets(final_2013)
RT2014 <- keep_non_retweets(final_2014)
RT2015 <- keep_non_retweets(final_2015)
RT2016 <- keep_non_retweets(final_2016)
RT2017 <- keep_non_retweets(final_2017)
RT2018 <- keep_non_retweets(final_2018)
RT2019 <- keep_non_retweets(final_2019)
RT2020 <- keep_non_retweets(final_2020)
RT2021 <- keep_non_retweets(final_2021)
# 2022 uses the same "not a retweet" rule as all other years. Its third
# condition used to test for `type = "replied_to"` instead, which dropped
# quote tweets for this year only.
RT2022 <- keep_non_retweets(final_2022)
# Stack the fourteen yearly tables and discard rows without a retweet count.
allRTs <- filter(
  rbind(
    RT2009, RT2010, RT2011, RT2012, RT2013, RT2014, RT2015,
    RT2016, RT2017, RT2018, RT2019, RT2020, RT2021, RT2022
  ),
  !is.na(retweet_count)
)
# Passing the whole table as the single argument `value` prefixes every column
# name with "value." -- hence `value.retweet_count` below.
df <- data.frame(value = allRTs)
# Frequency table: one row per distinct retweet count, with the number of
# tweets having that count in column `n`.
freq_table <- count(df, value.retweet_count)
# SAMPLING USING PERCENTILE
# Running total of tweets across the frequency table, in row order.
# NOTE(review): the threshold lookup below assumes rows are ordered by
# ascending retweet count, as the printed output shows -- confirm.
cumulative_frequency <- mutate(freq_table, cumulative_n = cumsum(n))

# Number of tweets overall, and the cumulative count that marks the 90% point.
total_tweets <- sum(cumulative_frequency$n)
percentile_threshold <- 0.90 * total_tweets

# First row whose running total reaches the 90% point; its retweet count is
# the cut-off value.
rows_at_or_past_threshold <- filter(
  cumulative_frequency,
  cumulative_n >= percentile_threshold
)
threshold_row <- slice(rows_at_or_past_threshold, 1)
threshold_value <- threshold_row$value.retweet_count

# Frequency rows at or above the cut-off retweet count.
top_tweets <- filter(
  cumulative_frequency,
  value.retweet_count >= threshold_value
)

# Show the resulting table.
print(top_tweets)
## value.retweet_count n cumulative_n
## 1 5 42 3439
## 2 6 42 3481
## 3 7 38 3519
## 4 8 20 3539
## 5 9 17 3556
## 6 10 13 3569
## 7 11 20 3589
## 8 12 12 3601
## 9 13 10 3611
## 10 14 8 3619
## 11 15 13 3632
## 12 16 12 3644
## 13 17 6 3650
## 14 18 4 3654
## 15 19 5 3659
## 16 20 6 3665
## 17 21 5 3670
## 18 22 9 3679
## 19 23 3 3682
## 20 24 5 3687
## 21 25 5 3692
## 22 26 4 3696
## 23 27 3 3699
## 24 28 5 3704
## 25 29 1 3705
## 26 30 1 3706
## 27 31 3 3709
## 28 32 3 3712
## 29 33 3 3715
## 30 34 3 3718
## 31 35 2 3720
## 32 36 1 3721
## 33 37 2 3723
## 34 38 2 3725
## 35 39 3 3728
## 36 40 2 3730
## 37 41 3 3733
## 38 42 4 3737
## 39 43 3 3740
## 40 44 3 3743
## 41 46 2 3745
## 42 50 3 3748
## 43 51 1 3749
## 44 52 1 3750
## 45 55 3 3753
## 46 58 2 3755
## 47 60 1 3756
## 48 61 1 3757
## 49 62 1 3758
## 50 63 1 3759
## 51 67 1 3760
## 52 69 1 3761
## 53 73 1 3762
## 54 74 2 3764
## 55 75 2 3766
## 56 76 2 3768
## 57 79 2 3770
## 58 88 1 3771
## 59 93 1 3772
## 60 94 1 3773
## 61 98 1 3774
## 62 99 1 3775
## 63 101 1 3776
## 64 102 1 3777
## 65 104 1 3778
## 66 106 1 3779
## 67 111 1 3780
## 68 117 1 3781
## 69 118 1 3782
## 70 122 1 3783
## 71 127 1 3784
## 72 131 1 3785
## 73 145 1 3786
## 74 153 1 3787
## 75 160 1 3788
## 76 161 1 3789
## 77 166 1 3790
## 78 169 1 3791
## 79 182 1 3792
## 80 187 1 3793
## 81 312 1 3794
## 82 318 1 3795
## 83 347 1 3796
## 84 358 1 3797
## 85 429 1 3798
## 86 615 1 3799
## 87 653 1 3800
## 88 667 1 3801
## 89 961 1 3802
# SAMPLING USING QUARTILE
# Upper-quartile cut-off taken over the retweet-count column of the frequency
# table.
# NOTE(review): `freq_table` holds one row per distinct retweet count, so this
# is the 75% quantile of the distinct values, not weighted by how many tweets
# have each count -- confirm that this is the intended threshold.
quantile_threshold <- quantile(
  freq_table[["value.retweet_count"]],
  probs = 0.75
)

# Frequency rows whose retweet count reaches the cut-off.
quantile_data <- filter(freq_table, value.retweet_count >= quantile_threshold)

# Show the resulting table.
print(quantile_data)
## value.retweet_count n
## 1 106 1
## 2 111 1
## 3 117 1
## 4 118 1
## 5 122 1
## 6 127 1
## 7 131 1
## 8 145 1
## 9 153 1
## 10 160 1
## 11 161 1
## 12 166 1
## 13 169 1
## 14 182 1
## 15 187 1
## 16 312 1
## 17 318 1
## 18 347 1
## 19 358 1
## 20 429 1
## 21 615 1
## 22 653 1
## 23 667 1
## 24 961 1
# SAMPLING USING HIGH IMPACT ABOVE 100RTS
# Fixed cut-off: a retweet count must be strictly greater than this value.
high_impact_threshold <- 100

# Frequency rows whose retweet count exceeds the fixed cut-off.
high_data <- filter(freq_table, value.retweet_count > high_impact_threshold)

# Show the resulting table.
print(high_data)
## value.retweet_count n
## 1 101 1
## 2 102 1
## 3 104 1
## 4 106 1
## 5 111 1
## 6 117 1
## 7 118 1
## 8 122 1
## 9 127 1
## 10 131 1
## 11 145 1
## 12 153 1
## 13 160 1
## 14 161 1
## 15 166 1
## 16 169 1
## 17 182 1
## 18 187 1
## 19 312 1
## 20 318 1
## 21 347 1
## 22 358 1
## 23 429 1
## 24 615 1
## 25 653 1
## 26 667 1
## 27 961 1
We gathered all the data into a single data frame. We then convert some columns and count the tweets per day to identify the days with the most activity.
## # A tibble: 22 × 2
## created_at count
## <date> <int>
## 1 2019-01-24 524
## 2 2022-05-22 491
## 3 2021-05-29 478
## 4 2021-02-02 476
## 5 2021-02-03 457
## 6 2019-06-26 447
## 7 2022-11-03 360
## 8 2019-06-27 345
## 9 2022-05-23 313
## 10 2019-11-12 284
## # ℹ 12 more rows