Adding geo-data to existing dataset

Reading in the complete dataset

# Load the complete tweet data set (it has no geolocation columns yet).
complete.data <- fread(
  "/Volumes/My Book Thunderbolt Duo/CST/Dataset/big.dataset.complete.dataset.raw.csv",
  stringsAsFactors = FALSE
)

# Keep rows 28 to 2677085 only; this removes the faulty part of the raw file.
complete.data <- complete.data[28:2677085, ]

# tweet.time.str is character (e.g. "Fri Oct 16 13:19:03 +0000 2015");
# parse it into POSIXct. The time zone is given explicitly so that the parsed
# values, and any later comparison against date strings, do not depend on the
# time zone of the machine running the script. The former `usetz = FALSE`
# argument was dropped: as.POSIXct() does not use it, so it had no effect.
# NOTE(review): "Europe/London" is assumed from the rendered output, which
# shows UK local time -- confirm. %a and %b also depend on an English LC_TIME.
complete.data[, tweet.time.posix := as.POSIXct(
  tweet.time.str,
  format = "%a %b %d %H:%M:%S %z %Y",
  tz = "Europe/London"
)]

It should be noted that this data set is a result of our efforts to identify UK-based tweets. Let’s take a quick peek at the complete data set.

# Print the structure of the cleaned data set: column names, column types and
# the first few values of each column.
str(complete.data)
## Classes 'data.table' and 'data.frame':   2677058 obs. of  50 variables:
##  $ V1                            : chr  "28" "29" "30" "31" ...
##  $ X                             : int  4 7 24 44 47 54 57 88 94 108 ...
##  $ tweet.id.str                  : num  6.55e+17 6.55e+17 6.55e+17 6.55e+17 6.55e+17 ...
##  $ tweet.text.str                : chr  "@brianaeden_xo @loveaIwayskari @haydnsayble @polygondola Jew and a half but I agree" "RT @AviMayer: I should note: there is absolutely no dispute over how the Jewish holy site \"\"caught fire.\"\" "| __truncated__ "RT @AviMayer: I should note: there is absolutely no dispute over how the Jewish holy site \"\"caught fire.\"\" "| __truncated__ "RT @MichaelLee2009: Burning #Christian churches in #israel justified, Right wing #Jewish leader #Gopstein says "| __truncated__ ...
##  $ tweet.time.str                : chr  "Fri Oct 16 13:19:03 +0000 2015" "Fri Oct 16 13:19:06 +0000 2015" "Fri Oct 16 13:19:17 +0000 2015" "Fri Oct 16 13:19:29 +0000 2015" ...
##  $ timestamp.str                 : num  1.45e+12 1.45e+12 1.45e+12 1.45e+12 1.45e+12 ...
##  $ user.id.str                   : num  3.86e+08 1.37e+08 1.99e+07 1.05e+08 1.95e+07 ...
##  $ user.handle.str               : chr  "sthrmsy" "Mirelle_Byruck" "blairsupporter" "marmite_news" ...
##  $ user.name.str                 : chr  "quesadeity" "Mirelle" "Blair Supporter" "marmite_news" ...
##  $ user.verified                 : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ user.followers                : int  108 164 2413 4384 121394 1394 21 1483 454 666 ...
##  $ user.following                : int  113 63 2091 2007 125992 771 40 1992 1069 579 ...
##  $ user.status.count             : int  6195 23000 142393 41735 100224 9815 1073 61097 5992 5746 ...
##  $ user.description.str          : chr  "בצלם אלוהים" "Smile at the street sleeper, they are worth as much as, the person who has everything. #Israel  #JewishLivesMatter" "Want to know why I reckon Labour will be out of office until at least 2025? History..." "I'm for truth, no matter who tells it. I'm for justice, no matter who it's for or against....  Malcolm X" ...
##  $ user.location.str             : chr  NA "Lovely #London  My Home Town.." "London, UK" NA ...
##  $ user.timezone                 : chr  "London" "London" "London" "London" ...
##  $ retweeted.id.str              : num  NA 6.55e+17 6.55e+17 6.55e+17 6.55e+17 ...
##  $ retweeted.text.str            : chr  NA "I should note: there is absolutely no dispute over how the Jewish holy site \"\"caught fire.\"\" 100-150 Palest"| __truncated__ "I should note: there is absolutely no dispute over how the Jewish holy site \"\"caught fire.\"\" 100-150 Palest"| __truncated__ "Burning #Christian churches in #israel justified, Right wing #Jewish leader #Gopstein says https://t.co/STCzl3e"| __truncated__ ...
##  $ retweeted.time.str            : chr  NA "Fri Oct 16 13:18:06 +0000 2015" "Fri Oct 16 13:18:06 +0000 2015" "Fri Oct 16 13:17:49 +0000 2015" ...
##  $ retweeted.favorite.count      : int  NA 1 2 0 14 NA NA NA 0 1 ...
##  $ retweeted.retweet.count       : int  NA 2 3 1 10 NA NA NA 2 1 ...
##  $ retweeted.user.id.str         : num  NA 1.63e+07 1.63e+07 4.80e+07 2.08e+08 ...
##  $ retweeted.user.handle.str     : chr  NA "AviMayer" "AviMayer" "MichaelLee2009" ...
##  $ retweeted.user.name.str       : chr  NA "Avi Mayer" "Avi Mayer" "Michael Lee" ...
##  $ retweeted.user.verified       : logi  NA FALSE FALSE FALSE FALSE NA ...
##  $ retweeted.user.followers      : int  NA 30394 30394 6663 6107 NA NA NA 3980 2566 ...
##  $ retweeted.user.following      : int  NA 6306 6306 6192 642 NA NA NA 3018 816 ...
##  $ retweeted.user.status.count   : int  NA 52988 52988 74694 9561 NA NA NA 49944 7342 ...
##  $ retweeted.user.description.str: chr  NA "Just some guy living in Israel, trying to help advance the Jewish people and repair the world. No big whoop. (P"| __truncated__ "Just some guy living in Israel, trying to help advance the Jewish people and repair the world. No big whoop. (P"| __truncated__ "By the way I'm not anti-semitic, on the contrary I support the Palestinians." ...
##  $ retweeted.user.location.str   : chr  NA "Jerusalem, Israel" "Jerusalem, Israel" "Oxford UK" ...
##  $ retweeted.user.timezone       : chr  NA "Jerusalem" "Jerusalem" "London" ...
##  $ quoted.id.str                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ quoted.text.str               : chr  NA NA NA NA ...
##  $ quoted.time.str               : chr  NA NA NA NA ...
##  $ quoted.favorite.count         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ quoted.retweet.count          : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ quoted.user.id.str            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ quoted.user.handle.str        : chr  NA NA NA NA ...
##  $ quoted.user.name.str          : chr  NA NA NA NA ...
##  $ quoted.user.verified          : logi  NA NA NA NA NA NA ...
##  $ quoted.user.followers         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ quoted.user.following         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ quoted.user.status.count      : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ quoted.user.description.str   : chr  NA NA NA NA ...
##  $ quoted.user.location.str      : chr  NA NA NA NA ...
##  $ quoted.user.timezone          : chr  NA NA NA NA ...
##  $ in.reply.to.status.id.str     : num  6.55e+17 NA NA NA NA ...
##  $ in.reply.to.user.id.str       : num  2.53e+09 NA NA NA NA ...
##  $ in.reply.to.screen.name       : chr  "brianaeden_xo" NA NA NA ...
##  $ tweet.time.posix              : POSIXct, format: "2015-10-16 14:19:03" "2015-10-16 14:19:06" ...
##  - attr(*, ".internal.selfref")=<externalptr>

Reading in the Geolocations

These are Geo-located tweets which have been parsed separately from the complete data set. Extracting Geo-locations was painful because the Twitter API stores longitude and latitude as a 2-element list in a column called Geo. It was not possible to unlist them and keep them as 2 separate values in the same row, so I had to convert them to strings and use regex to extract the long and lat values.

It is also useful to note that the Geo-locations are not restricted to UK tweets but are global. When I match these tweets with our previous data set (UK-based tweets) below, I will (mostly) exclude non-UK Geo-locations.

# Load the geolocated tweets that were parsed separately from the complete
# data set. `FALSE` is spelled out because `F` is an ordinary variable that
# can be reassigned.
geo.data <- fread(
  "/Users/Macbook/Desktop/coordinates.csv",
  stringsAsFactors = FALSE
)

# Print the structure of the geolocated tweets.
str(geo.data)
## Classes 'data.table' and 'data.frame':   25139 obs. of  5 variables:
##  $ text      : chr  "Palestinian rioters torch Jewish holy site Joseph’s Tomb http://t.co/tzksRLEkKN" "Starting the day right (now that my ankle is better!) @ Jewish Community Center of San Francisco https://t.co/8lEjYbZwLA" "Mysteries surrounding Nazi-looted art far from solved...\nhttp://t.co/4MpU73joEJ http://t.co/DniEJPzHek" "@benshapiro MSNBC: Filthy Jewish Feet...what you need to know and what you should fear, next." ...
##  $ created_at: chr  "Fri Oct 16 13:50:32 +0000 2015" "Fri Oct 16 14:10:10 +0000 2015" "Fri Oct 16 14:30:10 +0000 2015" "Fri Oct 16 14:51:31 +0000 2015" ...
##  $ id_str    : chr  "655017988165345280" "655022926450716674" "655027961259798529" "655033335241166849" ...
##  $ lat       : chr  "4.828516" "37.7873001" "50.4501" "32.8008058" ...
##  $ long      : chr  "7.003183" "-122.4474182" "30.5234" "-79.8910801" ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Build the lookup table used for the join: the tweet id plus its coordinates,
# all converted from character to numeric. Only these three columns are kept.
# NOTE(review): the ids have 18 digits, which is more than a double can hold
# exactly, so distinct ids may collapse onto the same number -- confirm
# whether the join key should stay character on both tables.
geo.data2 <- geo.data %>%
  transmute(
    tweet.id.str = as.numeric(id_str),
    lat = as.numeric(lat),
    long = as.numeric(long)
  )

Adding Geo-locations

Here, I am adding Geo-locations to the existing data set. Geo-locations are only added to tweets whose tweet IDs match with those in complete.data so location matching should be solid.

# Left join: every row of complete.data is kept, and lat/long are filled in
# only where a geolocated tweet has the same tweet.id.str (NA otherwise).
totaldata <- merge(
  x = complete.data,
  y = geo.data2,
  by = "tweet.id.str",
  all.x = TRUE
)

# Drop duplicated rows, keeping the first occurrence of each.
totaldata <- unique(totaldata)

Now, we only keep Geo-locations which match with our UK-based tweets. There are 3100 such tweets, down from 25139 Geo-located tweets globally.

Plotting

Geolocated data worldwide

# Base layer for the world map: country outlines in light grey, drawn with a
# map projection and limited to longitudes -180..180 and latitudes -55..85.
world <- ggplot() +
  # theme_map() +
  borders(
    database = "world",
    colour = "gray85",
    fill = "gray80"
  ) +
  coord_map(xlim = c(-180, 180), ylim = c(-55, 85))
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
# Overlay every tweet in totaldata on the world base map as a small,
# semi-transparent red point. Rows with no coordinates (lat/long are NA)
# cannot be drawn; ggplot drops them and reports how many in a warning.
geomap.world <- world +
  geom_point(
    data = totaldata,
    mapping = aes(x = long, y = lat),
    colour = "red3",
    alpha = 0.5,
    size = 0.1
  ) +
  labs(
    title = "Geolocated Tweets across the World",
    subtitle = "Timespan: Complete Dataset",
    caption = "Social Data Lab",
    x = "Longitude",
    y = "Latitude"
  ) +
  theme_ipsum_rc()

# Render the map.
geomap.world
## Warning: Removed 2673958 rows containing missing values (geom_point).

Based on this map, we have some data points outside the UK but in general most tweets seem to be UK based. This indicates our pattern matching efforts were not pinpoint but successful in general.

Geolocated data UK

Now zooming in on an expanded map of the UK. Note that these are tweets from the complete time span.

# Base layer for the UK maps: only the "UK" region of the world database,
# limited to longitudes -9..2 and latitudes 49.5..59.5.
UK <- ggplot() +
  # theme_map() +
  borders(
    database = "world",
    regions = "UK",
    colour = "gray85",
    fill = "gray80"
  ) +
  coord_map(xlim = c(-9, 2), ylim = c(49.5, 59.5))

# Overlay every tweet in totaldata on the UK base map as a small, opaque red
# point. Rows with no coordinates (lat/long are NA) cannot be drawn; ggplot
# drops them and reports how many in a warning.
geomap.uk <- UK +
  geom_point(
    data = totaldata,
    mapping = aes(x = long, y = lat),
    colour = "red3",
    alpha = 1,
    size = 0.1
  ) +
  labs(
    title = "Geolocated Tweets across the UK",
    subtitle = "Timespan: Complete Dataset",
    caption = "Social Data Lab",
    x = "Longitude",
    y = "Latitude"
  ) +
  theme_ipsum_rc()

# Render the map.
geomap.uk
## Warning: Removed 2673958 rows containing missing values (geom_point).

Looks good!

Visualising a subset

Now, I will visualise a subset. First step is to create the subset.

# Select the 14 days from 25 July 2016 to 07 August 2016 (both inclusive).
# The window is the half-open interval [25 Jul 00:00, 08 Aug 00:00): the
# previous bounds (>= "2016-07-24" and <= "2016-08-08") also included all of
# 24 July and the first instant of 08 August, i.e. 15 days instead of 14.
# The bounds are built as POSIXct with an explicit time zone rather than as
# bare strings, so the window does not depend on the machine's time zone.
# NOTE(review): "Europe/London" is assumed (UK local days) -- confirm.
subset.start <- as.POSIXct("2016-07-25", tz = "Europe/London")
subset.end <- as.POSIXct("2016-08-08", tz = "Europe/London")

subset.july <- totaldata[
  tweet.time.posix >= subset.start & tweet.time.posix < subset.end
]

# Print the structure of the subset.
str(subset.july)
## Classes 'data.table' and 'data.frame':   97808 obs. of  52 variables:
##  $ tweet.id.str                  : num  7.57e+17 7.57e+17 7.57e+17 7.57e+17 7.57e+17 ...
##  $ V1                            : chr  "1968910" "1968911" "1968912" "1968913" ...
##  $ X                             : int  30 38 50 54 59 73 86 89 90 98 ...
##  $ tweet.text.str                : chr  "I did an #IN phonebank shift at Euston \"\"war room\"\". About as sinister as the QMU, circa '83. https://t.co/"| __truncated__ "RT @oldandrewuk: Debates we now have in Labour: 1) Should we try to win votes? 2) How much anti-semitism is oka"| __truncated__ "RT @oldandrewuk: Debates we now have in Labour: 1) Should we try to win votes? 2) How much anti-semitism is oka"| __truncated__ "this is so fucking racist https://t.co/AXdTY3kzcN" ...
##  $ tweet.time.str                : chr  "Sat Jul 23 23:00:30 +0000 2016" "Sat Jul 23 23:00:39 +0000 2016" "Sat Jul 23 23:00:51 +0000 2016" "Sat Jul 23 23:00:54 +0000 2016" ...
##  $ timestamp.str                 : num  1.47e+12 1.47e+12 1.47e+12 1.47e+12 1.47e+12 ...
##  $ user.id.str                   : num  1.76e+08 1.47e+08 1.76e+08 4.09e+07 1.43e+09 ...
##  $ user.handle.str               : chr  "thoughtland" "MJGoldin" "JohnJLindsley" "lmartods" ...
##  $ user.name.str                 : chr  "Pat Kane" "Michael Goldin" "J. J. Lindsley" "max-amed" ...
##  $ user.verified                 : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ user.followers                : int  19184 756 1283 3323 1389 7747 3330 509 321 460 ...
##  $ user.following                : int  1138 2410 2046 2713 3804 3185 2687 1364 253 409 ...
##  $ user.status.count             : int  30634 5117 12124 31088 20789 714457 41122 4614 34135 23277 ...
##  $ user.description.str          : chr  "Always a Yesser + for max attainable progress 4 Scotland. Friendly to all green/left/social-democrat parties 4 "| __truncated__ "Working at @JCWInews and doing law school in my spare time | Politics, immigration/asylum, human rights and Isr"| __truncated__ "Policy/campaigns/economics/philosophy/ex-staffer/ex-teacher/CompanyPartner #MixedRace #pluralism #secular #athe"| __truncated__ "Jon Snow & Kaytrnada Fan Account" ...
##  $ user.location.str             : chr  "Glasgow / London" "London, England" "Mostly London - NUFC / Falcons" "NW LDN" ...
##  $ user.timezone                 : chr  "Edinburgh" NA "London" "London" ...
##  $ retweeted.id.str              : num  NA 7.57e+17 7.57e+17 NA NA ...
##  $ retweeted.text.str            : chr  NA "Debates we now have in Labour: 1) Should we try to win votes? 2) How much anti-semitism is okay? 3) It is a con"| __truncated__ "Debates we now have in Labour: 1) Should we try to win votes? 2) How much anti-semitism is okay? 3) It is a con"| __truncated__ NA ...
##  $ retweeted.time.str            : chr  NA "Sat Jul 23 09:00:37 +0000 2016" "Sat Jul 23 09:00:37 +0000 2016" NA ...
##  $ retweeted.favorite.count      : int  NA 426 426 NA NA 40 NA NA NA NA ...
##  $ retweeted.retweet.count       : int  NA 553 554 NA NA 17 NA NA NA NA ...
##  $ retweeted.user.id.str         : num  NA 97858519 97858519 NA NA ...
##  $ retweeted.user.handle.str     : chr  NA "oldandrewuk" "oldandrewuk" NA ...
##  $ retweeted.user.name.str       : chr  NA "Andrew Old" "Andrew Old" NA ...
##  $ retweeted.user.verified       : logi  NA FALSE FALSE NA NA FALSE ...
##  $ retweeted.user.followers      : int  NA 13592 13592 NA NA 109607 NA NA NA NA ...
##  $ retweeted.user.following      : int  NA 914 914 NA NA 222 NA NA NA NA ...
##  $ retweeted.user.status.count   : int  NA 98932 98932 NA NA 36404 NA NA NA NA ...
##  $ retweeted.user.description.str: chr  NA "Teacher. Any attempt to suggest that the word battleground in my blog title refers to my classroom or my studen"| __truncated__ "Teacher. Any attempt to suggest that the word battleground in my blog title refers to my classroom or my studen"| __truncated__ NA ...
##  $ retweeted.user.location.str   : chr  NA NA NA NA ...
##  $ retweeted.user.timezone       : chr  NA NA NA NA ...
##  $ quoted.id.str                 : num  NA NA NA NA 7.57e+17 ...
##  $ quoted.text.str               : chr  NA NA NA NA ...
##  $ quoted.time.str               : chr  NA NA NA NA ...
##  $ quoted.favorite.count         : int  NA NA NA NA 8 NA NA NA NA 3 ...
##  $ quoted.retweet.count          : int  NA NA NA NA 15 NA NA NA NA 4 ...
##  $ quoted.user.id.str            : num  NA NA NA NA 3.3e+07 ...
##  $ quoted.user.handle.str        : chr  NA NA NA NA ...
##  $ quoted.user.name.str          : chr  NA NA NA NA ...
##  $ quoted.user.verified          : logi  NA NA NA NA FALSE NA ...
##  $ quoted.user.followers         : int  NA NA NA NA 30190 NA NA NA NA 1784 ...
##  $ quoted.user.following         : int  NA NA NA NA 60 NA NA NA NA 1020 ...
##  $ quoted.user.status.count      : int  NA NA NA NA 24767 NA NA NA NA 41059 ...
##  $ quoted.user.description.str   : chr  NA NA NA NA ...
##  $ quoted.user.location.str      : chr  NA NA NA NA ...
##  $ quoted.user.timezone          : chr  NA NA NA NA ...
##  $ in.reply.to.status.id.str     : num  NA NA NA NA NA ...
##  $ in.reply.to.user.id.str       : num  NA NA NA NA NA ...
##  $ in.reply.to.screen.name       : chr  NA NA NA NA ...
##  $ tweet.time.posix              : POSIXct, format: "2016-07-24 00:00:30" "2016-07-24 00:00:39" ...
##  $ lat                           : num  NA NA NA NA NA ...
##  $ long                          : num  NA NA NA NA NA ...
##  - attr(*, "sorted")= chr "tweet.id.str"
##  - attr(*, ".internal.selfref")=<externalptr>

Next step is to create a time series line graph.

# Daily time-series line graph of the subset.
# Each tweet contributes the value 1 at its posting time; summing those values
# per day gives the daily tweet count.
ts <- xts(x = rep(1, nrow(subset.july)), order.by = subset.july$tweet.time.posix)
ts.sum <- apply.daily(ts, FUN = sum)

# Flatten the daily sums into a plain data frame with columns date and sum.
ts.sum.df <- setNames(
  data.frame(index(ts.sum), coredata(ts.sum)),
  c("date", "sum")
)

ts.plot.subset <- ggplot(ts.sum.df, aes(x = date, y = sum)) +
  geom_line() +
  labs(
    title = "Line Graph of Tweet Counts of the Subset ",
    subtitle = "Timespan: 25 July 2016 - 07 August 2016",
    caption = "Social Data Lab",
    x = "Time (Daily)",
    y = "Tweet Count"
  ) +
  theme_ipsum_rc()

# Render the line graph.
ts.plot.subset

This is how data is distributed in the 14 day time span.

Now, let’s plot the Geo-located tweets in this timespan.

# Geolocated tweets of the subset on the UK base map.
# Points are larger and semi-transparent here because far fewer tweets are
# drawn than on the complete-data maps. Rows with no coordinates are dropped
# by ggplot, which reports how many in a warning.
geomap.uk.subset <- UK +
  geom_point(
    data = subset.july,
    mapping = aes(x = long, y = lat),
    colour = "red3",
    alpha = .5,
    size = 2
  ) +
  labs(
    title = "Geolocated Tweets across the UK",
    subtitle = "Timespan: 25 July 2016 - 07 August 2016",
    caption = "Social Data Lab",
    x = "Longitude",
    y = "Latitude"
  ) +
  theme_ipsum_rc()

# Render the map.
geomap.uk.subset
## Warning: Removed 97711 rows containing missing values (geom_point).

And lastly, visualise these 2 plots side-by-side.

# Draw the two plots side by side in a two-column layout: the UK map of the
# subset on the left and the daily tweet-count line graph on the right.
grid.arrange(geomap.uk.subset,ts.plot.subset, ncol=2)
## Warning: Removed 97711 rows containing missing values (geom_point).

The next step is to run the data set through the classifier and work on the comparison.

The End!