library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.4.2
library(maptools) 
## Loading required package: sp
## Checking rgeos availability: TRUE
library(rgdal) 
## Warning: package 'rgdal' was built under R version 3.4.2
## rgdal: version: 1.2-13, (SVN revision 686)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.1.3, released 2017/20/01
##  Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/rgdal/gdal
##  Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
##  Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/rgdal/proj
##  Linking to sp version: 1.2-5
library(RgoogleMaps) # googlemaps API tool
library(classInt) 
library(RColorBrewer) 
library(ggplot2); library(ggmap) 
library(maps);library(mapdata)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(rgeos)
## rgeos version: 0.3-23, (SVN revision 546)
##  GEOS runtime version: 3.6.1-CAPI-1.10.1 r0 
##  Linking to sp version: 1.2-4 
##  Polygon checking: TRUE
library(tidycensus)
## Warning: package 'tidycensus' was built under R version 3.4.2
census_api_key(Sys.getenv("CENSUS_API_KEY"))
## To install your API key for use in future sessions, run this function with `install = TRUE`.
library(tigris) # to pull census shapefiles from the web API
## As of version 0.5.1, tigris does not cache downloaded data by default. To enable caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
## 
## Attaching package: 'tigris'
## The following object is masked from 'package:graphics':
## 
##     plot
library(sf)
## Linking to GEOS 3.6.1, GDAL 2.1.3, proj.4 4.9.3
library(sp)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(nlme)
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse

Bring in the raw file from the shared BARI folder (synced locally via Dropbox) and take a look at it.

a <- fread("~/Dropbox (MIT)/BARI/Beacon-Twitter/somerville.tweets.csv",stringsAsFactors = F,integer64 = "double")
## Read 62.6% of 1070519 rows
## Read 1048575 rows and 8 (of 8) columns from 0.131 GB file in 00:00:03
head(a)
##    V1       lat      lon  timestamp   user_id    tweet_id
## 1:  1 -71.10880 42.36191 1381391996   3108601 3.88152e+17
## 2:  2 -71.78931 42.58596 1381391997 165814089 3.88152e+17
## 3:  3 -71.11263 42.40137 1381391997 772511833 3.88152e+17
## 4:  4 -71.79779 42.23482 1381391999 255788453 3.88152e+17
## 5:  5 -71.11492 42.22880 1381392000 547953245 3.88152e+17
## 6:  6 -71.09988 42.41979 1381392001 408713240 3.88152e+17
##                                                                                                                                             content
## 1:                                                                                       @aparanjape it is! Loving the view. http://t.co/DVJq6Ug82t
## 2:                                                                                                                           Tonight I love forever
## 3: Was wet by 2 bottles of M. Shadows' water &amp; I probably wasn't breathing 2/3rds of Sinister's shredding.\r\r#boston @TheOfficialA7X #a7x #win
## 4:                                                                                @Reggie_Barnes @DeeYoungstah @KimiiiColex3 http://t.co/BUsUkqlXNQ
## 5:                     @Monnnnnnnster happy birthday best frand love you to the moon an back girl _\xd9\xf7\x8d_\xd9\xf7\xcf_\xd9\xf7\xf7_ٍ\xc8_ٍ\xe1
## 6:                                                                           _\xd9\xf7__\xd9\xf7__\xd9\xf7__\xd9\xf7__\xd9\xf7__\xd9\xf7__\xd9\xf7_
##      user_geoid
## 1:            0
## 2: 250277105002
## 3: 250173396003
## 4: 250277329013
## 5: 250214161021
## 6: 250173399004

Basic summary stats:

mean(a$user_geoid!=0)
## [1] 0.5697227

So, it looks like about 56.97% of people tweeting in the greater Boston area had identifiable home locations inside the greater Boston area (i.e., a non-zero user_geoid).

Spatial analyses

In order to do any spatial analysis with these data, we need to turn them into a SpatialPointsDataFrame:

# NB: the raw file's column labels are swapped -- `lat` holds longitudes (~ -71)
# and `lon` holds latitudes (~ 42) -- so cbind(a$lat, a$lon) yields
# (x = longitude, y = latitude), which is the order SpatialPointsDataFrame expects.
tweets.spdf <- SpatialPointsDataFrame(coords = cbind(a$lat, a$lon),
                                      data = a,
                                      proj4string = CRS("+proj=longlat +datum=NAD83 +no_defs +ellps=GRS80 +towgs84=0,0,0")) # plain long/lat coordinates on NAD83

Bring in geographic reference shapefiles and subset them to Somerville area:
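The chunk that builds the Somerville reference layers (somer, somer_tracts, somer_bg) is not shown here, but those objects are used throughout what follows. A minimal sketch of how they could be pulled with tigris, assuming TIGER tracts and block groups for Middlesex County clipped to the Somerville place boundary (the GEOID.1/STATEFP.1 columns referenced later suggest an additional attribute join, e.g. of ACS data, that this sketch does not reproduce):

# Sketch only -- the object names (somer, somer_tracts, somer_bg) are taken from
# their use later in this document; the exact tigris calls are assumptions.
ma_places    <- tigris::places(state = "MA")                              # MA city/town boundaries (sp objects in tigris 0.5.x)
somer        <- ma_places[ma_places$NAME == "Somerville", ]               # Somerville boundary polygon
mid_tracts   <- tigris::tracts(state = "MA", county = "Middlesex")        # Middlesex County census tracts
mid_bg       <- tigris::block_groups(state = "MA", county = "Middlesex")  # Middlesex County block groups
somer_tracts <- mid_tracts[somer, ]                                       # keep tracts that intersect Somerville
somer_bg     <- mid_bg[somer, ]                                           # keep block groups that intersect Somerville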

Map the Census geographies:

somer_base <- get_map(location="Somerville, MA",zoom=13, color="color",maptype="terrain",crop=T)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=Somerville,+MA&zoom=13&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Somerville,%20MA&sensor=false
b0 <- ggmap(somer_base, extent="device",legend="none",maprange=T)
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
(b1 <- b0 + 
      geom_polygon(aes(x=long,y=lat,group=group),data=somer_bg,color = "black",alpha=0.4))
## Regions defined for each Polygons

(b1 <- b0 + 
      geom_polygon(aes(x=long,y=lat,group=group),data=somer_tracts,color = "black",alpha=0.4))
## Regions defined for each Polygons

Look at tweet content within tracts:

tweets_somer <- tweets.spdf[somer,] # spatial subset: keep only tweets located within Somerville

# flag whether the user's home block group falls within Somerville:
tweets_somer@data$user_somerville <- ifelse(tweets_somer@data$user_geoid %in% as.numeric(somer_bg@data$GEOID.1),1,0)
mean(tweets_somer@data$user_somerville) # 65.76%
## [1] 0.657585
tweets_somer@data$user_ctid <- as.numeric(gsub("(250173\\d{5}).*","\\1",tweets_somer@data$user_geoid)) # trim Middlesex County block-group GEOIDs to the 11-digit tract GEOID (non-matching GEOIDs pass through unchanged)

tweets_somer_over <- over(tweets_somer,somer_tracts)
tweets_somer$tweet_geoid <- tweets_somer_over$GEOID.1 # add in the CT ID number of the tweet's location

tweets_somer_tb <- tweets_somer@data %>%
        unnest_tokens(word, content)

sent_dict <- get_sentiments("afinn")     # numeric valence scores, -5 (most negative) to +5 (most positive)
sent_dict_nrc <- get_sentiments("nrc")   # emotion categories (loaded but not used below)

tweets_somer_sent <- tweets_somer_tb %>%
        left_join(sent_dict,by = "word") %>%
        filter(!is.na(score))

sent_sum <- tweets_somer_sent %>%
      group_by(tweet_id) %>%
      summarize(mean_sent = mean(score,na.rm=T))

tweets_somer$mean_sent <- sent_sum$mean_sent[match(tweets_somer@data$tweet_id,sent_sum$tweet_id)]
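
As an aside, the NRC lexicon loaded above is not used in what follows; if per-tract emotion categories were of interest, a sketch reusing the token table built above might look like this (hypothetical, not part of the analysis):

# Hypothetical sketch: tally NRC emotion categories (joy, anger, fear, ...)
# by the tract a tweet was sent from.
tweets_somer_nrc <- tweets_somer_tb %>%
      inner_join(sent_dict_nrc, by = "word") %>%
      count(tweet_geoid, sentiment)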

# how many are actually talking about the City?
length(grep("Somerville",tweets_somer@data$content,ignore.case = T))/length(tweets_somer@data$content)
## Warning in grep("Somerville", tweets_somer@data$content, ignore.case = T):
## input string 1 is invalid in this locale
## Warning in grep("Somerville", tweets_somer@data$content, ignore.case = T):
## input string 25 is invalid in this locale
## Warning in grep("Somerville", tweets_somer@data$content, ignore.case = T):
## input string 26 is invalid in this locale
## Warning in grep("Somerville", tweets_somer@data$content, ignore.case = T):
## input string 27 is invalid in this locale
## Warning in grep("Somerville", tweets_somer@data$content, ignore.case = T):
## input string 36 is invalid in this locale
## [1] 0.02756731
# 2.76%, so probably can't filter by that.

# what about talking about the MBTA?
length(grep("MBTA",tweets_somer@data$content,ignore.case = T))/length(tweets_somer@data$content)
## Warning in grep("MBTA", tweets_somer@data$content, ignore.case = T): input
## string 1 is invalid in this locale
## Warning in grep("MBTA", tweets_somer@data$content, ignore.case = T): input
## string 25 is invalid in this locale
## Warning in grep("MBTA", tweets_somer@data$content, ignore.case = T): input
## string 26 is invalid in this locale
## Warning in grep("MBTA", tweets_somer@data$content, ignore.case = T): input
## string 27 is invalid in this locale
## Warning in grep("MBTA", tweets_somer@data$content, ignore.case = T): input
## string 36 is invalid in this locale
## [1] 0.001289699
# 0.13%, so even less. Hmm.

# what are some example negative and positive tweets?
head(tweets_somer@data %>%
           select(content,tweet_id,mean_sent) %>%
           arrange(mean_sent)
     )
##                                                                                                                                                    content
## 1 ...within 10 miles of our house. And printer ink.Gift certificates in the neighborhood &amp; printer ink for XMas""\r\r""Okay Dad""\r\r#shitmyfathersays
## 2                                                           If your girl tweets Chief Keef lyrics, You know she fuckin all the niggas she smokes weed with
## 3                                                                               Word niggas working at the other job as well today _\xd9\xf7__\xd9\xd4\xce
## 4                                                                                                                     Adil Rami is moving to Milan. YESSSS
## 5                                                                          \x89\xdb\xcf@Guys_Codes: Bitch sit down http://t.co/QTY2DxdTC5\x89۝_\xd9\xf7\xe2
## 6                                                                                                                                          Bitch kum by ah
##      tweet_id mean_sent
## 1 3.88312e+17        -5
## 2 3.88312e+17        -5
## 3 3.88401e+17        -5
## 4 3.88401e+17        -5
## 5 3.89099e+17        -5
## 6 3.89261e+17        -5
neg_ids <- tweets_somer@data %>%
           select(content,tweet_id,mean_sent) %>%
           arrange(mean_sent) %>%
      select(tweet_id) %>%
      slice(1:10)

Hmm, doesn’t seem super informative.

# why does tweet #2 get such a negative score?
tweets_somer_sent[tweets_somer_sent$tweet_id %in% neg_ids$tweet_id,]
##          V1       lat      lon  timestamp   user_id    tweet_id
## 16     1764 -71.09608 42.38657 1381430102 499005332 3.88312e+17
## 85     9490 -71.11770 42.40074 1381451317 506526818 3.88401e+17
## 846   81900 -71.13209 42.40605 1381617797  19878137 3.89099e+17
## 1214 109570 -71.13184 42.40554 1381656361  19878137 3.89261e+17
## 1653 152486 -71.09959 42.37580 1381812675 442282492 3.89916e+17
## 1793 164271 -71.13206 42.40762 1381829220 506526818 3.89986e+17
##        user_geoid user_somerville   user_ctid tweet_geoid   word score
## 16   250173515002               1 25017351500 25017351300 niggas    -5
## 85   250173507002               1 25017350700 25017350500 niggas    -5
## 846  250173507004               1 25017350700 25017350700  bitch    -5
## 1214 250173507004               1 25017350700 25017350700  bitch    -5
## 1653 250173512033               1 25017351203 25017351203   slut    -5
## 1793 250173507002               1 25017350700 25017350700  bitch    -5

Ah, okay. So profanity, including racial slurs and curse words, is what drives the most negative scores.
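
A quick way to confirm this (not run above) is to look those words up directly in the AFINN dictionary:

# Check the raw AFINN scores of the words driving the most negative tweets above.
sent_dict %>% filter(word %in% c("niggas", "bitch", "slut"))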

head(tweets_somer@data %>%
           select(content,tweet_id,mean_sent) %>%
           arrange(desc(mean_sent))
     )
##                                                                                                                           content
## 1                                                                  This season of American Horror Story is going to be so amazing
## 2                                                                                                                 guess who it is
## 3                                                                                                                     Awesome bud
## 4                                                        I made such an amazing call picking up Da'Rell Scott over Brandon Jacobs
## 5                                                                                                      Wow a HR could tie this...
## 6 Honestly the bus ride back was so much fun because the girls really know how to take my mind off of stuff \x89\x9d_\x95\xfc\x8f
##      tweet_id mean_sent
## 1 3.88179e+17         4
## 2 3.88451e+17         4
## 3 3.88451e+17         4
## 4 3.88502e+17         4
## 5 3.88506e+17         4
## 6 3.88835e+17         4

Again, not super informative.

# then summarize data over all words tweeted within each tract:
tracts_sent_sum <- tweets_somer_sent %>%
      group_by(tweet_geoid) %>%
      summarize(mean_sent = mean(score,na.rm=T))

# or averaged at the tweet level:
tracts_sent_sum2 <- tweets_somer@data %>%
      group_by(tweet_geoid) %>%
      summarize(mean_sent = mean(mean_sent,na.rm=T))

somer_tracts@data$avg_sent <- tracts_sent_sum$mean_sent[match(somer_tracts@data$GEOID.1,tracts_sent_sum$tweet_geoid)]
somer_tracts@data$avg_tweet_sent <- tracts_sent_sum2$mean_sent[match(somer_tracts@data$GEOID.1,tracts_sent_sum2$tweet_geoid)]


somer_tracts@data$id <- rownames(somer_tracts@data)
somer_tracts2 <- fortify(somer_tracts,region="id") # separates polygons into individual vertices
somer_tracts2 <- plyr::join(somer_tracts2,somer_tracts@data,by="id")
(b2 <- b0 + 
      geom_polygon(aes(x=long,y=lat,group=group,fill = avg_sent),data=somer_tracts2,alpha=0.8) +
            labs(fill="Avg. Twitter Sentiment"))

ggsave(b2,filename = "mean_sent.pdf")
## Saving 7 x 5 in image

This makes it appear that the Davis Square area is the happiest, while the Prospect Hill/Tufts areas are the least happy.

But what about sentiment averaged at the tweet level? This is slightly different, since each tweet then counts equally instead of each word.

(b3 <- b0 + 
      geom_polygon(aes(x=long,y=lat,group=group,fill = avg_tweet_sent),data=somer_tracts2,alpha=0.8) +
            labs(fill="Avg. Sentiment of Tweets"))

ggsave(b3,filename = "mean_sent_tweets.pdf")
## Saving 7 x 5 in image

Here it looks like the Prospect Hill and Tufts areas are least happy, and Davis Square still looks super happy.

This is just tweets in Somerville, though. What about people who live in Somerville?

# then summarize over all words tweeted by residents of each tract (grouped by home tract):
tracts_sent_sum_residents <- tweets_somer_sent %>%
      group_by(user_ctid) %>%
      summarize(mean_sent = mean(score,na.rm=T))

# or averaged at the tweet level (still grouped by the tweeter's home tract):
tracts_sent_sum2_residents <- tweets_somer@data %>%
      group_by(user_ctid) %>%
      summarize(mean_sent = mean(mean_sent,na.rm=T))

somer_tracts@data$avg_sent_residents <- tracts_sent_sum_residents$mean_sent[match(somer_tracts@data$GEOID.1,tracts_sent_sum_residents$user_ctid)]
somer_tracts@data$avg_tweet_sent_residents <- tracts_sent_sum2_residents$mean_sent[match(somer_tracts@data$GEOID.1,tracts_sent_sum2_residents$user_ctid)]


somer_tracts@data$id <- rownames(somer_tracts@data)
somer_tracts2 <- fortify(somer_tracts,region="id") # separates polygons into individual vertices
somer_tracts2 <- plyr::join(somer_tracts2,somer_tracts@data,by="id")


(b4 <- b0 +
            geom_polygon(aes(x=long,y=lat,group=group,fill = avg_sent_residents),data=somer_tracts2,alpha=0.8) +
            labs(fill="Avg. Sentiment,\nSomerville Residents\nby Home Location"))

ggsave(b4,filename = "mean_sent_residents.pdf")
## Saving 7 x 5 in image

Here, it looks like the Assembly Row and East Somerville areas show the most negative sentiment, while the Davis area shows the most positive.

What’s the correlation between sentiment of tweets in Somerville and tweets sent by Somerville residents?

stats::cor(somer_tracts@data[,c("avg_sent","avg_tweet_sent","avg_sent_residents","avg_tweet_sent_residents")])
##                             avg_sent avg_tweet_sent avg_sent_residents
## avg_sent                  1.00000000     0.86875499         0.19219098
## avg_tweet_sent            0.86875499     1.00000000         0.03527831
## avg_sent_residents        0.19219098     0.03527831         1.00000000
## avg_tweet_sent_residents -0.02394268    -0.04608756         0.93062683
##                          avg_tweet_sent_residents
## avg_sent                              -0.02394268
## avg_tweet_sent                        -0.04608756
## avg_sent_residents                     0.93062683
## avg_tweet_sent_residents               1.00000000

So the tweet-location measures and the resident-home measures are really quite uncorrelated with each other (r ≈ 0.19 at best). Huh. Not sure why that is.
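
One thing worth checking (a sketch, not run here) is whether small per-tract samples of scored tweets are making these means noisy:

# How many sentiment-scored tweets land in each tract, and how many are
# attributed to residents of each tract? Small counts would make the tract means noisy.
tweets_somer_sent %>% distinct(tweet_id, tweet_geoid) %>% count(tweet_geoid, sort = TRUE)
tweets_somer_sent %>% distinct(tweet_id, user_ctid) %>% count(user_ctid, sort = TRUE)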

Output the tract-level data for other uses:

somer_tracts@data$fips_code <- paste0(somer_tracts@data$STATEFP.1,somer_tracts@data$COUNTYFP.1,somer_tracts@data$TRACTCE.1)
# write.csv(somer_tracts@data,"somerville_twitter_tracts.csv",row.names = F)
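
If the polygons themselves are needed downstream, the tract layer with its sentiment attributes could also be written out as a shapefile (a sketch, left commented like the CSV above; note that the ESRI Shapefile driver truncates field names to 10 characters):

# rgdal::writeOGR(somer_tracts, dsn = ".", layer = "somerville_twitter_tracts",
#                 driver = "ESRI Shapefile", overwrite_layer = TRUE)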