# Load the complete tweet dataset (the one without geolocation data).
complete.data <- fread(
  "/Volumes/My Book Thunderbolt Duo/CST/Dataset/big.dataset.complete.dataset.raw.csv",
  stringsAsFactors = FALSE # spelled out: `F` is an ordinary, reassignable name
)
# NOTE(review): tweet.id.str is read as a double; tweet IDs of this magnitude
# (~6.5e17) exceed 2^53 and cannot all be represented exactly. Confirm this is
# acceptable before relying on it as a join key.

# Remove the faulty bit: keep only rows 28 to 2677085.
complete.data <- complete.data[28:2677085, ]

# The time value is character (e.g. "Fri Oct 16 13:19:03 +0000 2015"); convert
# it to POSIXct. No `tz` is supplied, so the values are shown in the session's
# time zone. NOTE(review): %a and %b are parsed using the current locale.
complete.data$tweet.time.posix <- as.POSIXct(
  complete.data$tweet.time.str,
  format = "%a %b %d %H:%M:%S %z %Y"
)
It should be noted that this data set is a result of our efforts to identify UK-based tweets. Let’s take a quick peek at the complete data set.
# Show the structure (columns, types, leading values) of the complete dataset.
utils::str(complete.data)
## Classes 'data.table' and 'data.frame': 2677058 obs. of 50 variables:
## $ V1 : chr "28" "29" "30" "31" ...
## $ X : int 4 7 24 44 47 54 57 88 94 108 ...
## $ tweet.id.str : num 6.55e+17 6.55e+17 6.55e+17 6.55e+17 6.55e+17 ...
## $ tweet.text.str : chr "@brianaeden_xo @loveaIwayskari @haydnsayble @polygondola Jew and a half but I agree" "RT @AviMayer: I should note: there is absolutely no dispute over how the Jewish holy site \"\"caught fire.\"\" "| __truncated__ "RT @AviMayer: I should note: there is absolutely no dispute over how the Jewish holy site \"\"caught fire.\"\" "| __truncated__ "RT @MichaelLee2009: Burning #Christian churches in #israel justified, Right wing #Jewish leader #Gopstein says "| __truncated__ ...
## $ tweet.time.str : chr "Fri Oct 16 13:19:03 +0000 2015" "Fri Oct 16 13:19:06 +0000 2015" "Fri Oct 16 13:19:17 +0000 2015" "Fri Oct 16 13:19:29 +0000 2015" ...
## $ timestamp.str : num 1.45e+12 1.45e+12 1.45e+12 1.45e+12 1.45e+12 ...
## $ user.id.str : num 3.86e+08 1.37e+08 1.99e+07 1.05e+08 1.95e+07 ...
## $ user.handle.str : chr "sthrmsy" "Mirelle_Byruck" "blairsupporter" "marmite_news" ...
## $ user.name.str : chr "quesadeity" "Mirelle" "Blair Supporter" "marmite_news" ...
## $ user.verified : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ user.followers : int 108 164 2413 4384 121394 1394 21 1483 454 666 ...
## $ user.following : int 113 63 2091 2007 125992 771 40 1992 1069 579 ...
## $ user.status.count : int 6195 23000 142393 41735 100224 9815 1073 61097 5992 5746 ...
## $ user.description.str : chr "בצלם אלוהים" "Smile at the street sleeper, they are worth as much as, the person who has everything. #Israel #JewishLivesMatter" "Want to know why I reckon Labour will be out of office until at least 2025? History..." "I'm for truth, no matter who tells it. I'm for justice, no matter who it's for or against.... Malcolm X" ...
## $ user.location.str : chr NA "Lovely #London My Home Town.." "London, UK" NA ...
## $ user.timezone : chr "London" "London" "London" "London" ...
## $ retweeted.id.str : num NA 6.55e+17 6.55e+17 6.55e+17 6.55e+17 ...
## $ retweeted.text.str : chr NA "I should note: there is absolutely no dispute over how the Jewish holy site \"\"caught fire.\"\" 100-150 Palest"| __truncated__ "I should note: there is absolutely no dispute over how the Jewish holy site \"\"caught fire.\"\" 100-150 Palest"| __truncated__ "Burning #Christian churches in #israel justified, Right wing #Jewish leader #Gopstein says https://t.co/STCzl3e"| __truncated__ ...
## $ retweeted.time.str : chr NA "Fri Oct 16 13:18:06 +0000 2015" "Fri Oct 16 13:18:06 +0000 2015" "Fri Oct 16 13:17:49 +0000 2015" ...
## $ retweeted.favorite.count : int NA 1 2 0 14 NA NA NA 0 1 ...
## $ retweeted.retweet.count : int NA 2 3 1 10 NA NA NA 2 1 ...
## $ retweeted.user.id.str : num NA 1.63e+07 1.63e+07 4.80e+07 2.08e+08 ...
## $ retweeted.user.handle.str : chr NA "AviMayer" "AviMayer" "MichaelLee2009" ...
## $ retweeted.user.name.str : chr NA "Avi Mayer" "Avi Mayer" "Michael Lee" ...
## $ retweeted.user.verified : logi NA FALSE FALSE FALSE FALSE NA ...
## $ retweeted.user.followers : int NA 30394 30394 6663 6107 NA NA NA 3980 2566 ...
## $ retweeted.user.following : int NA 6306 6306 6192 642 NA NA NA 3018 816 ...
## $ retweeted.user.status.count : int NA 52988 52988 74694 9561 NA NA NA 49944 7342 ...
## $ retweeted.user.description.str: chr NA "Just some guy living in Israel, trying to help advance the Jewish people and repair the world. No big whoop. (P"| __truncated__ "Just some guy living in Israel, trying to help advance the Jewish people and repair the world. No big whoop. (P"| __truncated__ "By the way I'm not anti-semitic, on the contrary I support the Palestinians." ...
## $ retweeted.user.location.str : chr NA "Jerusalem, Israel" "Jerusalem, Israel" "Oxford UK" ...
## $ retweeted.user.timezone : chr NA "Jerusalem" "Jerusalem" "London" ...
## $ quoted.id.str : num NA NA NA NA NA NA NA NA NA NA ...
## $ quoted.text.str : chr NA NA NA NA ...
## $ quoted.time.str : chr NA NA NA NA ...
## $ quoted.favorite.count : int NA NA NA NA NA NA NA NA NA NA ...
## $ quoted.retweet.count : int NA NA NA NA NA NA NA NA NA NA ...
## $ quoted.user.id.str : num NA NA NA NA NA NA NA NA NA NA ...
## $ quoted.user.handle.str : chr NA NA NA NA ...
## $ quoted.user.name.str : chr NA NA NA NA ...
## $ quoted.user.verified : logi NA NA NA NA NA NA ...
## $ quoted.user.followers : int NA NA NA NA NA NA NA NA NA NA ...
## $ quoted.user.following : int NA NA NA NA NA NA NA NA NA NA ...
## $ quoted.user.status.count : int NA NA NA NA NA NA NA NA NA NA ...
## $ quoted.user.description.str : chr NA NA NA NA ...
## $ quoted.user.location.str : chr NA NA NA NA ...
## $ quoted.user.timezone : chr NA NA NA NA ...
## $ in.reply.to.status.id.str : num 6.55e+17 NA NA NA NA ...
## $ in.reply.to.user.id.str : num 2.53e+09 NA NA NA NA ...
## $ in.reply.to.screen.name : chr "brianaeden_xo" NA NA NA ...
## $ tweet.time.posix : POSIXct, format: "2015-10-16 14:19:03" "2015-10-16 14:19:06" ...
## - attr(*, ".internal.selfref")=<externalptr>
These are Geo-located tweets, which were parsed separately from the complete data set. Extracting the Geo-locations was painful because the Twitter API stores longitude and latitude as a 2-element list in a column called Geo. It was not possible to unlist them and keep them as 2 separate values in the same row, so I had to convert them to strings and use regex to extract the long and lat values.
It is also useful to note that the Geo-locations are not restricted to UK tweets; they are global. When I match these tweets with our previous data set (UK-based tweets) below, I will (mostly) exclude non-UK Geo-locations.
# Load the geolocated tweets (tweet text, creation time, tweet ID, latitude
# and longitude) that were parsed separately from the complete dataset.
geo.data <- fread(
  "/Users/Macbook/Desktop/coordinates.csv",
  stringsAsFactors = FALSE # spelled out: `F` is an ordinary, reassignable name
)
# Inspect the columns and their types before cleaning them up.
str(geo.data)
## Classes 'data.table' and 'data.frame': 25139 obs. of 5 variables:
## $ text : chr "Palestinian rioters torch Jewish holy site Joseph’s Tomb http://t.co/tzksRLEkKN" "Starting the day right (now that my ankle is better!) @ Jewish Community Center of San Francisco https://t.co/8lEjYbZwLA" "Mysteries surrounding Nazi-looted art far from solved...\nhttp://t.co/4MpU73joEJ http://t.co/DniEJPzHek" "@benshapiro MSNBC: Filthy Jewish Feet...what you need to know and what you should fear, next." ...
## $ created_at: chr "Fri Oct 16 13:50:32 +0000 2015" "Fri Oct 16 14:10:10 +0000 2015" "Fri Oct 16 14:30:10 +0000 2015" "Fri Oct 16 14:51:31 +0000 2015" ...
## $ id_str : chr "655017988165345280" "655022926450716674" "655027961259798529" "655033335241166849" ...
## $ lat : chr "4.828516" "37.7873001" "50.4501" "32.8008058" ...
## $ long : chr "7.003183" "-122.4474182" "30.5234" "-79.8910801" ...
## - attr(*, ".internal.selfref")=<externalptr>
# Reduce the geolocated tweets to the join key plus coordinates, all numeric.
# The ID column is renamed to tweet.id.str so it matches the complete dataset.
# NOTE(review): converting id_str to double cannot represent every ID of this
# size exactly; confirm that is acceptable for the merge on tweet.id.str.
geo.data2 <- geo.data %>%
  select(tweet.id.str = id_str, lat, long) %>% # keep only the necessary columns
  mutate(
    tweet.id.str = as.numeric(tweet.id.str), # adjust data classes
    lat = as.numeric(lat),
    long = as.numeric(long)
  )
Here, I am adding Geo-locations to the existing data set. Geo-locations are only added to tweets whose tweet IDs match with those in complete.data so location matching should be solid.
# Left-join the coordinates onto the complete dataset by tweet ID (every row
# of complete.data is kept; lat/long are NA where there is no match), then
# drop rows that are exact duplicates of an earlier row.
totaldata <- complete.data %>%
  merge(geo.data2, by = "tweet.id.str", all.x = TRUE) %>%
  unique()
Now, we only keep Geo-locations which match our UK-based tweets. There are 3100 such tweets, down from 25139 Geo-located tweets globally.
# Base layer reused by the world map: country outlines from the "world" map
# database on a map projection, clipped to longitudes -180..180 and
# latitudes -55..85.
world <- ggplot() +
  # theme_map() +
  borders(database = "world", fill = "gray80", colour = "gray85") +
  coord_map(ylim = c(-55, 85), xlim = c(-180, 180))
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
# World map of every geolocated tweet in the merged dataset.
# Only rows that actually carry coordinates are handed to geom_point(); the
# vast majority of totaldata has NA lat/long, and passing those rows along
# only makes ggplot2 drop them again with a "missing values" warning.
geomap.world <- world +
  geom_point(
    mapping = aes(x = long, y = lat),
    data = totaldata[!is.na(totaldata$long) & !is.na(totaldata$lat), ],
    colour = "red3", alpha = 0.5, size = 0.1
  ) +
  labs(
    x = "Longitude", y = "Latitude",
    title = "Geolocated Tweets across the World",
    subtitle = "Timespan: Complete Dataset",
    caption = "Social Data Lab"
  ) +
  theme_ipsum_rc()
geomap.world
## Warning: Removed 2673958 rows containing missing values (geom_point).
Based on this map, we have some data points outside the UK but in general most tweets seem to be UK based. This indicates our pattern matching efforts were not pinpoint but successful in general.
Now zooming in and expanding the map of UK. Note that these are tweets from complete time span.
# Base layer reused by the UK maps: the "UK" region of the "world" map
# database, clipped to longitudes -9..2 and latitudes 49.5..59.5.
UK <- ggplot() +
  # theme_map() +
  borders(database = "world", regions = "UK", fill = "gray80", colour = "gray85") +
  coord_map(ylim = c(49.5, 59.5), xlim = c(-9, 2))
# UK map of every geolocated tweet in the merged dataset (complete time span).
# Rows without coordinates are filtered out up front instead of being passed
# to geom_point(), which would only discard them with a "missing values"
# warning.
geomap.uk <- UK +
  geom_point(
    mapping = aes(x = long, y = lat),
    data = totaldata[!is.na(totaldata$long) & !is.na(totaldata$lat), ],
    colour = "red3", alpha = 1, size = 0.1
  ) +
  labs(
    x = "Longitude", y = "Latitude",
    title = "Geolocated Tweets across the UK",
    subtitle = "Timespan: Complete Dataset",
    caption = "Social Data Lab"
  ) +
  theme_ipsum_rc()
geomap.uk
## Warning: Removed 2673958 rows containing missing values (geom_point).
Looks good!
Now, I will visualise a subset. First step is to create the subset.
# Select the 14 days from 25 July to 07 August 2016 inclusive, matching the
# time span stated in the plot subtitles. The range is half-open: it starts at
# midnight on 25 July and stops just before midnight on 08 August, so no part
# of 24 July or 08 August is included. The bounds are converted in the
# session's time zone, the same zone tweet.time.posix is displayed in.
subset.july <- totaldata[
  totaldata$tweet.time.posix >= as.POSIXct("2016-07-25") &
    totaldata$tweet.time.posix < as.POSIXct("2016-08-08"),
]
# Inspect the structure of the subset.
str(subset.july)
## Classes 'data.table' and 'data.frame': 97808 obs. of 52 variables:
## $ tweet.id.str : num 7.57e+17 7.57e+17 7.57e+17 7.57e+17 7.57e+17 ...
## $ V1 : chr "1968910" "1968911" "1968912" "1968913" ...
## $ X : int 30 38 50 54 59 73 86 89 90 98 ...
## $ tweet.text.str : chr "I did an #IN phonebank shift at Euston \"\"war room\"\". About as sinister as the QMU, circa '83. https://t.co/"| __truncated__ "RT @oldandrewuk: Debates we now have in Labour: 1) Should we try to win votes? 2) How much anti-semitism is oka"| __truncated__ "RT @oldandrewuk: Debates we now have in Labour: 1) Should we try to win votes? 2) How much anti-semitism is oka"| __truncated__ "this is so fucking racist https://t.co/AXdTY3kzcN" ...
## $ tweet.time.str : chr "Sat Jul 23 23:00:30 +0000 2016" "Sat Jul 23 23:00:39 +0000 2016" "Sat Jul 23 23:00:51 +0000 2016" "Sat Jul 23 23:00:54 +0000 2016" ...
## $ timestamp.str : num 1.47e+12 1.47e+12 1.47e+12 1.47e+12 1.47e+12 ...
## $ user.id.str : num 1.76e+08 1.47e+08 1.76e+08 4.09e+07 1.43e+09 ...
## $ user.handle.str : chr "thoughtland" "MJGoldin" "JohnJLindsley" "lmartods" ...
## $ user.name.str : chr "Pat Kane" "Michael Goldin" "J. J. Lindsley" "max-amed" ...
## $ user.verified : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ user.followers : int 19184 756 1283 3323 1389 7747 3330 509 321 460 ...
## $ user.following : int 1138 2410 2046 2713 3804 3185 2687 1364 253 409 ...
## $ user.status.count : int 30634 5117 12124 31088 20789 714457 41122 4614 34135 23277 ...
## $ user.description.str : chr "Always a Yesser + for max attainable progress 4 Scotland. Friendly to all green/left/social-democrat parties 4 "| __truncated__ "Working at @JCWInews and doing law school in my spare time | Politics, immigration/asylum, human rights and Isr"| __truncated__ "Policy/campaigns/economics/philosophy/ex-staffer/ex-teacher/CompanyPartner #MixedRace #pluralism #secular #athe"| __truncated__ "Jon Snow & Kaytrnada Fan Account" ...
## $ user.location.str : chr "Glasgow / London" "London, England" "Mostly London - NUFC / Falcons" "NW LDN" ...
## $ user.timezone : chr "Edinburgh" NA "London" "London" ...
## $ retweeted.id.str : num NA 7.57e+17 7.57e+17 NA NA ...
## $ retweeted.text.str : chr NA "Debates we now have in Labour: 1) Should we try to win votes? 2) How much anti-semitism is okay? 3) It is a con"| __truncated__ "Debates we now have in Labour: 1) Should we try to win votes? 2) How much anti-semitism is okay? 3) It is a con"| __truncated__ NA ...
## $ retweeted.time.str : chr NA "Sat Jul 23 09:00:37 +0000 2016" "Sat Jul 23 09:00:37 +0000 2016" NA ...
## $ retweeted.favorite.count : int NA 426 426 NA NA 40 NA NA NA NA ...
## $ retweeted.retweet.count : int NA 553 554 NA NA 17 NA NA NA NA ...
## $ retweeted.user.id.str : num NA 97858519 97858519 NA NA ...
## $ retweeted.user.handle.str : chr NA "oldandrewuk" "oldandrewuk" NA ...
## $ retweeted.user.name.str : chr NA "Andrew Old" "Andrew Old" NA ...
## $ retweeted.user.verified : logi NA FALSE FALSE NA NA FALSE ...
## $ retweeted.user.followers : int NA 13592 13592 NA NA 109607 NA NA NA NA ...
## $ retweeted.user.following : int NA 914 914 NA NA 222 NA NA NA NA ...
## $ retweeted.user.status.count : int NA 98932 98932 NA NA 36404 NA NA NA NA ...
## $ retweeted.user.description.str: chr NA "Teacher. Any attempt to suggest that the word battleground in my blog title refers to my classroom or my studen"| __truncated__ "Teacher. Any attempt to suggest that the word battleground in my blog title refers to my classroom or my studen"| __truncated__ NA ...
## $ retweeted.user.location.str : chr NA NA NA NA ...
## $ retweeted.user.timezone : chr NA NA NA NA ...
## $ quoted.id.str : num NA NA NA NA 7.57e+17 ...
## $ quoted.text.str : chr NA NA NA NA ...
## $ quoted.time.str : chr NA NA NA NA ...
## $ quoted.favorite.count : int NA NA NA NA 8 NA NA NA NA 3 ...
## $ quoted.retweet.count : int NA NA NA NA 15 NA NA NA NA 4 ...
## $ quoted.user.id.str : num NA NA NA NA 3.3e+07 ...
## $ quoted.user.handle.str : chr NA NA NA NA ...
## $ quoted.user.name.str : chr NA NA NA NA ...
## $ quoted.user.verified : logi NA NA NA NA FALSE NA ...
## $ quoted.user.followers : int NA NA NA NA 30190 NA NA NA NA 1784 ...
## $ quoted.user.following : int NA NA NA NA 60 NA NA NA NA 1020 ...
## $ quoted.user.status.count : int NA NA NA NA 24767 NA NA NA NA 41059 ...
## $ quoted.user.description.str : chr NA NA NA NA ...
## $ quoted.user.location.str : chr NA NA NA NA ...
## $ quoted.user.timezone : chr NA NA NA NA ...
## $ in.reply.to.status.id.str : num NA NA NA NA NA ...
## $ in.reply.to.user.id.str : num NA NA NA NA NA ...
## $ in.reply.to.screen.name : chr NA NA NA NA ...
## $ tweet.time.posix : POSIXct, format: "2016-07-24 00:00:30" "2016-07-24 00:00:39" ...
## $ lat : num NA NA NA NA NA ...
## $ long : num NA NA NA NA NA ...
## - attr(*, "sorted")= chr "tweet.id.str"
## - attr(*, ".internal.selfref")=<externalptr>
Next step is to create a time series line graph.
# Daily time series line graph of the subset.
# Every tweet contributes a 1 at its timestamp; apply.daily() then sums those
# ones per day, giving the number of tweets per day.
ts <- xts(
  x = rep(1, times = nrow(subset.july)),
  order.by = subset.july$tweet.time.posix
)
ts.sum <- apply.daily(ts, sum)

# Flatten the xts result into a two-column data frame (date, sum) for ggplot.
ts.sum.df <- setNames(
  data.frame(date = index(ts.sum), coredata(ts.sum)),
  c("date", "sum")
)

ts.plot.subset <- ggplot(ts.sum.df, aes(x = date, y = sum)) +
  geom_line() +
  labs(
    x = "Time (Daily)", y = "Tweet Count",
    title = "Line Graph of Tweet Counts of the Subset ",
    subtitle = "Timespan: 25 July 2016 - 07 August 2016",
    caption = "Social Data Lab"
  ) +
  theme_ipsum_rc()
ts.plot.subset
This is how data is distributed in the 14 day time span.
Now, let’s plot the Geo-located tweets in this timespan.
# Geolocated data from the subset, drawn on the UK base map.
# Rows without coordinates are filtered out up front instead of being passed
# to geom_point(), which would only discard them with a "missing values"
# warning (and repeat that warning every time the plot is drawn again).
geomap.uk.subset <- UK +
  geom_point(
    mapping = aes(x = long, y = lat),
    data = subset.july[!is.na(subset.july$long) & !is.na(subset.july$lat), ],
    colour = "red3", alpha = .5, size = 2
  ) +
  labs(
    x = "Longitude", y = "Latitude",
    title = "Geolocated Tweets across the UK",
    subtitle = "Timespan: 25 July 2016 - 07 August 2016",
    caption = "Social Data Lab"
  ) +
  theme_ipsum_rc()
geomap.uk.subset
## Warning: Removed 97711 rows containing missing values (geom_point).
Lastly, visualise these 2 plots side-by-side.
# Draw the subset's UK map (left) and its daily tweet-count line graph (right)
# next to each other in a two-column layout.
grid.arrange(
  grobs = list(geomap.uk.subset, ts.plot.subset),
  ncol = 2
)
## Warning: Removed 97711 rows containing missing values (geom_point).
The next step is to run the data set through the classifier and work on the comparison.