options(width=100)
knitr::opts_chunk$set(out.width='1000px',dpi=200,message=FALSE,warning=FALSE)
#load packages and csv file
library(ggplot2)
library(dplyr)
library(gridExtra)
library(grid)
library(RColorBrewer)
library(ggrepel)
library(ggthemes)
library(viridis)
library(tm)
library(wordcloud)
library(tidytext)

1 Data overview & preparation

Preamble:

The data is the "Documenting Hate" (ProPublica) collection of press reports on hate crimes and bias incidents, covering February to August 2017. Each row contains the article title (which embeds the publication timestamp), a summary, the reporting organization, its location, and a URL field.

The data preparation mostly consists of deriving date/time variables from the Article.Title column.

df<-read.csv('20170816_Documenting_Hate - Data.csv',sep=',',stringsAsFactors=F)
#Article.Title starts with an "m/d/yy H:MM" timestamp: extract the date part
df$Date<-sapply(df$Article.Title, function(x) strsplit(x," ")[[1]][1])
df$Date<-as.Date(df$Date,"%m/%d/%y")
df$Day<-sapply(df$Article.Title, function(x) as.numeric(strsplit(strsplit(x," ")[[1]][1],"/")[[1]][2]))
df$Month<-sapply(df$Article.Title, function(x) as.numeric(strsplit(strsplit(x," ")[[1]][1],"/")[[1]][1]))
df$month_name<-sapply(df$Month, function(x) month.name[x])
df$month_name_ordered<-factor(df$month_name, levels =c(month.name))
#week number and (ordered) day of week
df$week<-as.integer(format(df$Date, "%W")) + 1
df$day <- factor(weekdays(df$Date, T), levels = rev(c("Mon", "Tue", "Wed", "Thu","Fri", "Sat", "Sun")))
df$day_ordered <- factor(df$day, levels=c("Mon", "Tue", "Wed", "Thu","Fri","Sat","Sun"))
#hour and minute from the time part of the timestamp
df$Hour<- sapply(df$Article.Title, function(x) as.numeric(strsplit(strsplit(x," ")[[1]][2],":")[[1]][1]))
df$Min<- sapply(df$Article.Title, function(x) as.numeric(strsplit(strsplit(x," ")[[1]][2],":")[[1]][2]))
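As a side note, the same date/time variables could be derived in one pass with as.POSIXct. The following is only a sketch, assuming every Article.Title begins with an "m/d/yy H:MM" timestamp (as the splits above do); it is not used below.

#alternative sketch: parse the leading "m/d/yy H:MM" timestamp in one pass (assumption, not used below)
ts <- sub("^(\\S+ \\S+).*", "\\1", df$Article.Title)
dt <- as.POSIXct(ts, format="%m/%d/%y %H:%M", tz="UTC")
#df$Date <- as.Date(dt); df$Hour <- as.integer(format(dt,"%H")); df$Min <- as.integer(format(dt,"%M"))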

There are a few duplicated rows, for example:

df %>% filter(City=='INFORUM') %>% select(Date,City,Organization)
##         Date    City                                                              Organization
## 1 2017-07-03 INFORUM Civil rights group calls beating of Somali in Fargo 'possible hate crime'
## 2 2017-07-17 INFORUM                                  Nelson: Hate crime legislation is needed
## 3 2017-08-07 INFORUM    Letter: There are at least two good reasons for hate crime legislation
## 4 2017-08-07 INFORUM    Letter: There are at least two good reasons for hate crime legislation
## 5 2017-08-12 INFORUM                        Shaw: Hate crime legislation in ND is long overdue
## 6 2017-08-12 INFORUM                        Shaw: Hate crime legislation in ND is long overdue

so we need to remove the duplicates:

#keep only the first occurrence of each Organization value
df <- df[!duplicated(df$Organization),]
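For reference, the same operation in dplyr would be the following (just a sketch, kept commented since the base R version above is already applied):

#same effect with dplyr: keep the first row per Organization value
#df <- df %>% distinct(Organization, .keep_all = TRUE)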

2 Reports Summary

#count reports per (week, weekday) cell for the calendar heatmap
per_week <- df %>% group_by(week,day) %>% summarize(day_count=n())
pal='B'
ggplot(per_week, aes(x = week, y = day, fill=day_count)) + scale_fill_viridis(
  name="count", 
  option = pal, 
  direction = -1, 
  na.value = "grey",
  limits = c(0, max(per_week$day_count))) +
  geom_tile(color = "white", size = 0.4) +
        scale_x_continuous(
            expand = c(0, 0),
            breaks = seq(1, 52, length = 12),
            labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                       "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) +
  theme_tufte(base_family = "Helvetica") +
  theme(
    plot.title=element_text(face="bold"),
    axis.title = element_blank(),
    axis.ticks = element_blank(),axis.text.y = element_text(size=14),
    axis.text.x = element_text(size=14,hjust=.75),
    legend.position = "bottom",
    legend.key.width = unit(6, "cm"),
    strip.text = element_text(hjust = 0.01, face = "bold", size = 14)) + 
  ggtitle('Weekly Hate crime report')

The 3 days with the most reports were:

#merge(df %>% group_by(Date) %>% summarize(count=n()) %>% arrange(-count) %>% top_n(3), df %>% select(Date,Organization, Keywords), by='Date')
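The full merge is left commented out above to keep the output short; a minimal sketch showing just the 3 busiest days would be:

#sketch: the 3 days with the most reports
df %>% group_by(Date) %>% summarize(count=n()) %>% arrange(-count) %>% head(3)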

3 Which organizations reported the most?

3.1 List of the top 10 organizations that reported on hate crime

#note: in this file the City column actually holds the outlet/organization name
allOrgs <- df %>% group_by(City) %>% summarise(count=n())
#select top 10 Organizations
top10Org <-(allOrgs %>% top_n(10))$City
top10Org
##  [1] "ABC News"                 "Carbonated.tv"            "HuffPost"                
##  [4] "Mic"                      "New York Daily News"      "Newsweek"                
##  [7] "Patch.com"                "U.S. News & World Report" "Washington Post"         
## [10] "Yahoo News"
  • above is the list of the top 10 Organizations that have reported on Hate Crime within the last 7 months
  • We can quickly look at the distribution by Organization
#define a top10 organization factor variable
df$my_org <-ifelse(df$City %in% top10Org, 'top 10 Organizations','Others')

g2<-ggplotGrob(df %>% 
  group_by(month_name_ordered,my_org) %>% 
  summarise(count=n()) %>% 
  ggplot(aes(x=month_name_ordered,y=count,fill = my_org)) + 
  geom_col() + 
  theme_fivethirtyeight() + 
  scale_fill_brewer(name="",palette='Paired') + 
  ggtitle("2017's Monthly Hate crime reports"))

g1<-allOrgs %>% 
  filter(count >  4 * mean(allOrgs$count)) %>% 
  ggplot(aes(x=reorder(City,count),y=count)) + 
  geom_col() + 
  coord_flip() + 
  theme_fivethirtyeight()

3.2 Monthly reports by Organization

The plot shows:

  • main panel: distribution of the Organizations that have reported on hate crime (a lower cut at 4 * mean(count) is applied for visualization purposes)
  • sub-panel: monthly distribution of the top 10 Organizations versus the Others
  • May and June were the 2 months (so far) with the highest number of reports
g1 + annotation_custom(grob = g2, ymin = 20, ymax = 80, xmin = 1, xmax = 20)

4 Charlottesville events

#days<-c('2017-08-12','2017-08-13')
days<-c(as.Date('2017-08-12'),as.Date('2017-08-13'))
charlotte <- df %>% filter(Date %in% days)

#build a cleaned word vector from the Summary field of a data.frame
makeCorpus<-function(mydf){
    wordVec<-c()
    temp<-Corpus(VectorSource(mydf$Summary))
    for(i in 1:length(temp)){
        #clean each document: lowercase, strip numbers/punctuation/stopwords/whitespace
        temp2 <- Corpus(VectorSource(temp[i]$content))
        temp_clean<-temp2 %>% 
        tm_map(content_transformer(tolower)) %>% 
        tm_map(content_transformer(removeNumbers)) %>% 
        tm_map(content_transformer(removePunctuation)) %>% 
        tm_map(content_transformer(removeWords),c(tidytext::stop_words$word,"charlottesville")) %>%
        tm_map(content_transformer(trimws)) %>% 
        tm_map(content_transformer(stripWhitespace))

        #split the cleaned text into words and append them to the running vector
        current<-sort(strsplit(as.character(temp_clean$content),' ')[[1]])
        wordVec<-append(wordVec,current,length(current)) 
        }
        return(wordVec)
}

#build one word vector per day (Saturday 08/12 and Sunday 08/13)
listAll<-list()
for(i in 1:length(days)){
    temp_df<- charlotte %>% filter(Date==days[i])
    listAll[[i]]<-makeCorpus(temp_df)
}

4.1 Wordcloud

set.seed(1234)
par(mfrow=c(1, 2),bg="black")
#one wordcloud per day (Saturday / Sunday), side by side on a black background
for(i in 1:length(days)){
  tdm <- TermDocumentMatrix(Corpus(VectorSource(listAll[[i]])))
  m_tdm <-as.matrix(tdm)
  word.freq<-sort(rowSums(m_tdm), decreasing=T)
  wordcloud(words=names(word.freq),
            freq = word.freq,
            random.order=F,
            colors=viridis::inferno(40),
            max.words=200,scale=c(2,1))
  title(paste0('Most frequent words during ',days[i]),col.main='#46ACC8',cex.main=1)
}

  • Splitting the corpus of words between Saturday and Sunday turns out to be the better choice, because it reveals different themes (a quick side-by-side check of the top words is sketched below):
  • on Saturday, the press was still reporting on the events of the day before, so the most frequent words relate to the removal of the Confederate statue and the neo-Nazis marching through Charlottesville with torches
  • on Sunday, the press reports were about the death of Heather Heyer
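As a quick sanity check of this split, the most frequent words of each day can be compared directly from the word vectors built above (listAll[[1]] holds Saturday's words, listAll[[2]] Sunday's); just a sketch:

#sketch: top 10 words per day from the vectors returned by makeCorpus()
lapply(listAll, function(w) head(sort(table(w), decreasing=TRUE), 10))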

4.2 Timeline

  • we can look at the number of press reports per hour during these 2 days
charlotte %>% 
  dplyr::group_by(day_ordered,Hour,my_org) %>% 
  summarize(count=n()) %>% 
  ggplot(aes(x=factor(Hour),y=count,fill=my_org)) + 
  geom_bar(stat='identity') + 
  scale_fill_brewer(name="",palette='Paired') + 
  scale_x_discrete(limits = as.character(0:23)) + facet_wrap(~day_ordered,ncol=1) + 
  theme_fivethirtyeight() + 
  ggtitle('Timeline of the Press Reports during the Charlottesville events') + 
  theme(legend.position='right',legend.direction='vertical')

5 N-Grams

note: I'm not sure n-grams work well on the Summary feature, because that feature is already a cleaned list of words (stopwords and punctuation removed), so 2-grams or 3-grams extracted from it do not carry real sentence-level meaning.

On the other hand, the X column gives the full summary/header of each press report:

charlotte %>% select(Date,X) %>% head(1)
##         Date
## 1 2017-08-12
##                                                                                                                                                                                                                                                          X
## 1 By Jim Shaw Today at 7:00 am The push is on for hate crime legislation in North Dakota, to which I say... --It's long overdue. North Dakota is shamefully, but not surprisingly, one of only five state states without such laws. Meantime, North Dakota
#corpus built from the full report header (X); bigram/trigram tokenizers based on NLP::ngrams
corpus_all<-VCorpus(VectorSource(df$X))
BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
TrigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
corpus_all_clean <- corpus_all %>% 
  tm_map(content_transformer(tolower)) %>% 
  tm_map(content_transformer(removeNumbers)) %>% 
  tm_map(content_transformer(removePunctuation)) %>%
  tm_map(content_transformer(removeWords),c(tidytext::stop_words$word,"charlottesville"))

tdm_2<- TermDocumentMatrix(corpus_all_clean, control = list(tokenize = BigramTokenizer))
tdm_3 <- TermDocumentMatrix(corpus_all_clean, control = list(tokenize = TrigramTokenizer))

m_tdm_2 <-as.matrix(tdm_2)
m_tdm_3 <-as.matrix(tdm_3)

word.freq.2<-sort(rowSums(m_tdm_2), decreasing=T)
word.freq.3<-sort(rowSums(m_tdm_3), decreasing=T)
listMat<-list()
listMat[[1]]<-word.freq.2
listMat[[2]]<-word.freq.3
set.seed(1234)
par(mfrow=c(1, 2),bg="gray50")
for(i in 1:2){
  wordcloud(words=names(listMat[[i]]),
            freq = listMat[[i]],
            random.order=F,
            colors=viridis::inferno(20),min.freq=10)
  title(paste0('Most frequent ',i+1,'-grams'),col.main='#46ACC8',cex.main=1)
}

6 Geolocation

6.1 Analysis

While we don't have the exact location of each incident, we can infer an approximate location from the URL column, which (for print outlets) gives the State where the newspaper is published.

There are 2 cases to distinguish:

  • when the source has a physical location (a print newspaper)
  • when the source is an online media outlet (TV, web)

So the strategy to analyze this feature is:

  • make a new feature (based on URL) to distinguish both cases
  • a quick lookup to transform the URL value into a state.name, taking care of special cases (a minimal sketch of this step follows; the full function is in 6.2)
  • merge the counts with a map
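The core of the second step is a lookup against R's built-in state.abb/state.name vectors. Below is a minimal sketch with a hypothetical helper (the makeNames() function in 6.2 is the one actually used, and also handles free-text special cases):

#sketch: map 2-letter state codes to lowercase state names (abbr_to_name is a hypothetical helper)
abbr_to_name <- function(x){
  x <- toupper(x)
  ifelse(x == "DC", "district of columbia", tolower(state.name[match(x, state.abb)]))
}
abbr_to_name(c("AZ", "OH", "DC"))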

6.2 Code

df %>% dplyr::select(Date, City, State, URL) %>% head(10)
##          Date                         City           State URL
## 1  2017-02-13                 LGBTQ Nation         Phoenix  AZ
## 2  2017-02-13          New York Daily News                    
## 3  2017-02-13                        FOX19      Cincinnati  OH
## 4  2017-02-13                    89.3 WFPL      Louisville  KY
## 5  2017-02-13                  Daily Comet       Thibodaux  LA
## 6  2017-02-14             Davis Enterprise           Davis  CA
## 7  2017-02-14    Harrisburg Daily Register      Harrisburg  IL
## 8  2017-02-14               GoDanRiver.com        Danville  VA
## 9  2017-02-14    KSBY San Luis Obispo News San Luis Obispo  CA
## 10 2017-02-14 San Diego Gay & Lesbian News       San Diego  CA
#flag each report as 'paper' (URL holds a state code) or 'online' (empty URL), and split the data
df$type_pub<-ifelse(df$URL!="",'paper','online')
df1<-df %>% dplyr::filter(URL!="")
df2<-df %>% dplyr::filter(URL=="")
#define theme and make a pie chart to show the paper/online cases
blank_theme <- theme_fivethirtyeight() + 
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.border = element_blank(),
    panel.grid=element_blank(),
    axis.ticks = element_blank(),
    plot.title=element_text(size=14, face="bold"))

g2<-ggplotGrob(df %>% 
                 group_by(type_pub) %>% 
                 summarize(count=n()) %>% 
                 ggplot(aes(x="",y=count,fill=type_pub)) + geom_bar(width = 1, stat = "identity",color='white',size=.5) +
                 coord_polar("y") + scale_fill_brewer(name="",palette="Set1") + 
                 blank_theme + theme(
                   legend.position='right',
                   legend.direction='vertical',
                   axis.text.x=element_blank(), 
                   legend.text=element_text(size=6),
                   legend.key.size = unit(.2, "cm")))
#normalize a location string to a lowercase state name, handling a few special cases
makeNames<-function(x){
    x<-tolower(x)
    if(nchar(x)==2){
        if(x=='dc'){return('district of columbia')}
            else {return(tolower(state.name[match(x,tolower(state.abb))]))}
    }
    else if(x=='d.c.') {return("district of columbia")}
    else if(x=='venice beach, ca 90291') {return('california')}
    else if(x=='alexandria va') {return('virginia')}
    else if(x=='new york and california') {return('new york')}
    else if(x=='los angeles, california') {return('california')}
    else {return(x)}
}
#per-state counts and percentages, then merge with the US state polygons (map_data needs the maps package)
counter_paper <- df1 %>% select(URL)
counter_paper$region<-sapply(counter_paper$URL, makeNames)
temp<-counter_paper %>% group_by(region) %>% summarise(count=n()) %>% mutate(perc = count/sum(count)) %>% mutate(perc = 100*round(perc,2))
states_map<-map_data("state")
RES<-data.frame(merge(temp, states_map, by='region'))

p<-RES %>% ggplot(aes(map_id = region)) + 
  geom_map(aes(fill = perc), map = states_map,color='black',size=.25) + 
  expand_limits(x = states_map$long, y = states_map$lat) + 
  theme_fivethirtyeight() + 
  scale_fill_gradientn(name='percent',colors = brewer.pal(9,'Blues')) +
  theme(panel.grid.major = element_blank(),
        axis.text=element_blank(),axis.ticks=element_blank(),
        legend.position = c(0.1, 0.125))
p <- p + labs(title="Press reports percentage from Feb 13th, 2017 to Aug 13th, 2017",
              subtitle="reports from paper publications only\n") + 
  theme(plot.title=element_text(face="bold",hjust=.012,vjust=.8,colour="#3C3C3C",size=20),
        plot.subtitle=element_text(size=10, hjust=0, face="italic", color="black")) 

6.3 Map

p + 
  annotation_custom(grob = g2, xmin = -125, xmax = -115, ymin = 23, ymax = 27) + 
  ggplot2::annotate("text", x = -72, y = 38, label = "NY(22%)", size=3,colour = "#41AB5D") + 
  ggplot2::annotate("segment", x = -73, xend = -74, y = 38.5, yend = 42,colour = "#41AB5D") +
  ggplot2::annotate("text", x = -125, y = 35, label = "CAL(16%)", size=3,colour = "#41AB5D") + 
  ggplot2::annotate("segment", x = -124, xend = -120, y = 35.5, yend = 38,colour = "#41AB5D")

  • the online/paper proportion is roughly 20% / 80% (a one-line check is sketched below)
  • given the size and population of these 2 states, it is not surprising to find California and New York in the first places among the states where press reports were written.
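For reference, the online/paper split quoted above can be checked in one line from the type_pub flag (a sketch):

#sketch: share of online vs paper reports
df %>% count(type_pub) %>% mutate(perc = round(100 * n / sum(n), 1))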

7 Next steps

History: