options(width=100)
knitr::opts_chunk$set(out.width='1000px',dpi=200,message=FALSE,warning=FALSE)
#load packages and csv file
library(ggplot2)
library(dplyr)
library(gridExtra)
library(grid)
library(RColorBrewer)
library(ggrepel)
library(ggthemes)
library(viridis)
library(tm)
library(wordcloud)
library(tidytext)
Preamble:
Data is:
* from 13 February 2017 to 13 August 2017
* August 11-12-13 (the Charlottesville events) are included
* the keyword column is the source of the document (mostly a URL), organization is the title of the article
* the summary column seems to be a corpus (already cleaned) of words from the article
* state and city refer to the press office location
The data preparation mostly consists in creating date/time variables.
df<-read.csv('20170816_Documenting_Hate - Data.csv',sep=',',stringsAsFactors=F)
df$Date<-sapply(df$Article.Title, function(x) strsplit(x," ")[[1]][1])
df$Date<-as.Date(df$Date,"%m/%d/%y")
df$Day<-sapply(df$Article.Title, function(x) as.numeric(strsplit(strsplit(x," ")[[1]][1],"/")[[1]][2]))
df$Month<-sapply(df$Article.Title, function(x) as.numeric(strsplit(strsplit(x," ")[[1]][1],"/")[[1]][1]))
df$month_name<-sapply(df$Month, function(x) month.name[x])
df$month_name_ordered<-factor(df$month_name, levels =c(month.name))
df$week<-as.integer(format(df$Date, "%W")) + 1
df$day <- factor(weekdays(df$Date, T), levels = rev(c("Mon", "Tue", "Wed", "Thu","Fri", "Sat", "Sun")))
df$day_ordered <- factor(df$day, levels=c("Mon", "Tue", "Wed", "Thu","Fri","Sat","Sun"))
df$Hour<- sapply(df$Article.Title, function(x) as.numeric(strsplit(strsplit(x," ")[[1]][2],":")[[1]][1]))
df$Min<- sapply(df$Article.Title, function(x) as.numeric(strsplit(strsplit(x," ")[[1]][2],":")[[1]][2]))
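For reference, the same timestamp parsing can be written more compactly with lubridate; this is only a sketch (lubridate is not loaded in the setup chunk) and assumes, like the code above, that every Article.Title starts with an "m/d/yy h:mm" stamp.
library(lubridate)
#alternative: parse the leading "m/d/yy h:mm" stamp of Article.Title in one pass
stamp <- sapply(df$Article.Title, function(x) paste(strsplit(x, " ")[[1]][1:2], collapse = " "))
dt <- mdy_hm(stamp)
#should match the Date column built above
all(as.Date(dt) == df$Date, na.rm = TRUE)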
There are a few duplicated rows, for example:
df %>% filter(City=='INFORUM') %>% select(Date,City,Organization)
## Date City Organization
## 1 2017-07-03 INFORUM Civil rights group calls beating of Somali in Fargo 'possible hate crime'
## 2 2017-07-17 INFORUM Nelson: Hate crime legislation is needed
## 3 2017-08-07 INFORUM Letter: There are at least two good reasons for hate crime legislation
## 4 2017-08-07 INFORUM Letter: There are at least two good reasons for hate crime legislation
## 5 2017-08-12 INFORUM Shaw: Hate crime legislation in ND is long overdue
## 6 2017-08-12 INFORUM Shaw: Hate crime legislation in ND is long overdue
So we need to remove the duplicates:
df <- df[!duplicated(df$Organization),]
per_week <- df %>% group_by(week,day) %>% summarize(day_count=n())
pal='B'
ggplot(per_week, aes(x = week, y = day, fill=day_count)) + scale_fill_viridis(
name="count",
option = pal,
direction = -1,
na.value = "grey",
limits = c(0, max(per_week$day_count))) +
geom_tile(color = "white", size = 0.4) +
scale_x_continuous(
expand = c(0, 0),
breaks = seq(1, 52, length = 12),
labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) +
theme_tufte(base_family = "Helvetica") +
theme(
plot.title=element_text(face="bold"),
axis.title = element_blank(),
axis.ticks = element_blank(),axis.text.y = element_text(size=14),
axis.text.x = element_text(size=14,hjust=.75),
legend.position = "bottom",
legend.key.width = unit(6, "cm"),
strip.text = element_text(hjust = 0.01, face = "bold", size = 14)) +
ggtitle('Weekly Hate crime report')
The 3 top reports were for:
* Bowie State Student a Hate Crime (2017-05-22, 105 counts)
* Virginia Teen Kidnapped, Killed After She Left Mosque (2017-06-19, 105 counts)
* Portland white supremacist Jeremy Christian appears in court (2017-05-31, 97 counts)
#merge(df %>% group_by(Date) %>% summarize(count=n()) %>% arrange(-count) %>% top_n(3), df %>% select(Date,Organization, Keywords), by='Date')
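A working version of the commented merge above would be something like this (a sketch; output not shown):
#count reports per day, keep the 3 busiest days and attach the article titles
top_days <- df %>% group_by(Date) %>% summarize(count = n()) %>% top_n(3, count)
merge(top_days, df %>% select(Date, Organization, Keywords), by = 'Date') %>% arrange(-count)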
Top 10 organizations reported on hate crime:
allOrgs <- df %>% group_by(City) %>% summarise(count=n())
#select top 10 Organizations
top10Org <-(allOrgs %>% top_n(10))$City
top10Org
## [1] "ABC News" "Carbonated.tv" "HuffPost"
## [4] "Mic" "New York Daily News" "Newsweek"
## [7] "Patch.com" "U.S. News & World Report" "Washington Post"
## [10] "Yahoo News"
#define a top10 organization factor variable
df$my_org <-ifelse(df$City %in% top10Org, 'top 10 Organizations','Others')
g2<-ggplotGrob(df %>% group_by(month_name_ordered,my_org) %>% summarise(count=n()) %>%
  ggplot(aes(x=month_name_ordered,y=count,fill = my_org)) + geom_col() + theme_fivethirtyeight() +
  scale_fill_brewer(name="",palette='Paired') + ggtitle("2017's Monthly Hate crime reports"))
g1<-allOrgs %>% filter(count > 4 * mean(allOrgs$count)) %>%
  ggplot(aes(x=reorder(City,count),y=count)) + geom_col() + coord_flip() + theme_fivethirtyeight()
The plot shows:
* the top 10 Organizations and the Others
* May and June were the 2 months (so far) with the highest number of reports
g1 + annotation_custom(grob = g2, ymin = 20, ymax = 80, xmin = 1, xmax = 20)
Aug-12 and Aug-13 only:
#days<-c('2017-08-12','2017-08-13')
days<-c(as.Date('2017-08-12'),as.Date('2017-08-13'))
charlotte <- df %>% filter(Date %in% days)
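A quick count of how many reports fall on each of the two days (a small check, not in the original):
table(charlotte$Date)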
#build a word vector from the (already cleaned) Summary field of a data frame
makeCorpus <- function(mydf){
  wordVec <- c()
  temp <- Corpus(VectorSource(mydf$Summary))
  for(i in 1:length(temp)){
    temp2 <- Corpus(VectorSource(temp[i]$content))
    temp_clean <- temp2 %>%
      tm_map(content_transformer(tolower)) %>%
      tm_map(content_transformer(removeNumbers)) %>%
      tm_map(content_transformer(removePunctuation)) %>%
      tm_map(content_transformer(removeWords), c(tidytext::stop_words$word, "charlottesville")) %>%
      tm_map(content_transformer(trimws)) %>%
      tm_map(content_transformer(stripWhitespace))
    current <- sort(strsplit(as.character(temp_clean$content), ' ')[[1]])
    wordVec <- append(wordVec, current)
  }
  return(wordVec)
}
listAll<-list()
for(i in 1:length(days)){
temp_df<- charlotte %>% filter(Date==days[i])
listAll[[i]]<-makeCorpus(temp_df)
}
set.seed(1234)
par(mfrow=c(1, 2),bg="black")
for(i in 1:length(days)){
tdm <- TermDocumentMatrix(Corpus(VectorSource(listAll[[i]])))
m_tdm <-as.matrix(tdm)
word.freq<-sort(rowSums(m_tdm), decreasing=T)
wordcloud(words=names(word.freq),
freq = word.freq,
random.order=F,
colors=viridis::inferno(40),
max.words=200,scale=c(2,1))
title(paste0('Most frequent words during ',days[i]),col.main='#46ACC8',cex.main=1)
}
Separating Saturday and Sunday is in fact a better solution because it reveals different sentiments:
* on Saturday, the press was reporting on the events of the day before, therefore we find a majority of words related to the removal of the Confederate statue and the neo-Nazis marching in Charlottesville with torches
* on Sunday, the press reports were about the death of Heather Heyer
charlotte %>%
dplyr::group_by(day_ordered,Hour,my_org) %>%
summarize(count=n()) %>%
ggplot(aes(x=factor(Hour),y=count,fill=my_org)) +
geom_col() +
scale_fill_brewer(name="",palette='Paired') +
scale_x_discrete(limits = as.character(0:23)) + facet_wrap(~day_ordered,ncol=1) +
theme_fivethirtyeight() +
ggtitle('Timeline of the Press Reports during the Charlottesville\'s events') +
theme(legend.position='right',legend.direction='vertical')
Note: I'm not sure the n-grams work well on the Summary feature, because this feature is already a cleaned list of words (stopwords and punctuation removed). Thus 2-grams or 3-grams do not carry a real sentence-level meaning.
On the other hand, we have a summary/header of each press report:
charlotte %>% select(Date,X) %>% head(1)
## Date
## 1 2017-08-12
## X
## 1 By Jim Shaw Today at 7:00 am The push is on for hate crime legislation in North Dakota, to which I say... --It's long overdue. North Dakota is shamefully, but not surprisingly, one of only five state states without such laws. Meantime, North Dakota
corpus_all<-VCorpus(VectorSource(df$X))
BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
TrigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
corpus_all_clean <- corpus_all %>%
tm_map(content_transformer(tolower)) %>%
tm_map(content_transformer(removeNumbers)) %>%
tm_map(content_transformer(removePunctuation)) %>%
tm_map(content_transformer(removeWords),c(tidytext::stop_words$word,"charlottesville"))
tdm_2<- TermDocumentMatrix(corpus_all_clean, control = list(tokenize = BigramTokenizer))
tdm_3 <- TermDocumentMatrix(corpus_all_clean, control = list(tokenize = TrigramTokenizer))
m_tdm_2 <-as.matrix(tdm_2)
m_tdm_3 <-as.matrix(tdm_3)
word.freq.2<-sort(rowSums(m_tdm_2), decreasing=T)
word.freq.3<-sort(rowSums(m_tdm_3), decreasing=T)
listMat<-list()
listMat[[1]]<-word.freq.2
listMat[[2]]<-word.freq.3
set.seed(1234)
par(mfrow=c(1, 2),bg="gray50")
for(i in 1:2){
wordcloud(words=names(listMat[[i]]),
freq = listMat[[i]],
random.order=F,
colors=viridis::inferno(20),min.freq=10)
title(paste0('Most frequent ',i+1,'-grams'),col.main='#46ACC8',cex.main=1)
}
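As a cross-check, roughly the same bigram counts could be obtained with tidytext::unnest_tokens on the same X column; a minimal sketch (note that stopwords are not removed here, unlike in the tm pipeline above):
df %>%
  select(X) %>%
  tidytext::unnest_tokens(bigram, X, token = "ngrams", n = 2) %>%
  count(bigram, sort = TRUE) %>%
  head(10)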
While we don't have the exact location of the incident, we can infer its location from the URL column, which gives the State where the newspaper is published.
There are 2 cases to distinguish: paper publications, where the URL column contains the state, and online publications, where it is empty.
So the strategy to analyze this feature is:
* use the (empty or not) URL column to distinguish both cases
* convert the URL into a state.name and take care of special cases
df %>% dplyr::select(Date, City, State, URL) %>% head(10)
## Date City State URL
## 1 2017-02-13 LGBTQ Nation Phoenix AZ
## 2 2017-02-13 New York Daily News
## 3 2017-02-13 FOX19 Cincinnati OH
## 4 2017-02-13 89.3 WFPL Louisville KY
## 5 2017-02-13 Daily Comet Thibodaux LA
## 6 2017-02-14 Davis Enterprise Davis CA
## 7 2017-02-14 Harrisburg Daily Register Harrisburg IL
## 8 2017-02-14 GoDanRiver.com Danville VA
## 9 2017-02-14 KSBY San Luis Obispo News San Luis Obispo CA
## 10 2017-02-14 San Diego Gay & Lesbian News San Diego CA
df$type_pub<-ifelse(df$URL!="",'paper','online')
df1<-df %>% dplyr::filter(URL!="")
df2<-df %>% dplyr::filter(URL=="")
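A quick check of the resulting split (a sketch; the pie chart below shows the same proportions):
round(prop.table(table(df$type_pub)), 2)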
#define theme and make a pie chart to show the paper/online cases
blank_theme <- theme_fivethirtyeight() +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.border = element_blank(),
panel.grid=element_blank(),
axis.ticks = element_blank(),
plot.title=element_text(size=14, face="bold"))
g2<-ggplotGrob(df %>%
group_by(type_pub) %>%
summarize(count=n()) %>%
ggplot(aes(x="",y=count,fill=type_pub)) + geom_bar(width = 1, stat = "identity",color='white',size=.5) +
coord_polar("y") + scale_fill_brewer(name="",palette="Set1") +
blank_theme + theme(
legend.position='right',
legend.direction='vertical',
axis.text.x=element_blank(),
legend.text=element_text(size=6),
legend.key.size = unit(.2, "cm")))
#convert the URL field (state abbreviation or free text) to a lower-case state name
makeNames <- function(x){
  x <- tolower(x)
  if(nchar(x)==2){
    if(x=='dc'){return('district of columbia')}
    else {return(tolower(state.name[match(x,tolower(state.abb))]))}
  }
  else if(x=='d.c.') {return("district of columbia")}
  else if(x=='venice beach, ca 90291') {return('california')}
  else if(x=='alexandria va') {return('virginia')}
  else if(x=='new york and california') {return('new york')}
  else if(x=='los angeles, california') {return('california')}
  else {return(x)}
}
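For example, makeNames handles both the two-letter abbreviations and the special cases (illustrative calls, not part of the original output):
makeNames('AZ')                      # "arizona"
makeNames('d.c.')                    # "district of columbia"
makeNames('Venice Beach, CA 90291')  # "california"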
counter_paper <- df1 %>% select(URL)
counter_paper$region<-sapply(counter_paper$URL, makeNames)
temp<-counter_paper %>% group_by(region) %>% summarise(count=n()) %>% mutate(perc = count/sum(count)) %>% mutate(perc = 100*round(perc,2))
states_map<-map_data("state")
RES<-data.frame(merge(temp, states_map, by='region'))
p<-RES %>% ggplot(aes(map_id = region)) +
geom_map(aes(fill = perc), map = states_map,color='black',size=.25) +
expand_limits(x = states_map$long, y = states_map$lat) +
theme_fivethirtyeight() +
scale_fill_gradientn(name='perc (%)',colors = brewer.pal(9,'Blues')) +
theme(panel.grid.major = element_blank(),
axis.text=element_blank(),axis.ticks=element_blank(),
legend.position = c(0.1, 0.125))
p <- p + labs(title="Press-reports percentage from Feb,13th 2017 to Aug,13th 2017",
subtitle="reports from paper publications only\n") +
theme(plot.title=element_text(face="bold",hjust=.012,vjust=.8,colour="#3C3C3C",size=20),
plot.subtitle=element_text(size=10, hjust=0, face="italic", color="black"))
p +
annotation_custom(grob = g2, xmin = -125, xmax = -115, ymin = 23, ymax = 27) +
ggplot2::annotate("text", x = -72, y = 38, label = "NY(22%)", size=3,colour = "#41AB5D") +
ggplot2::annotate("segment", x = -73, xend = -74, y = 38.5, yend = 42,colour = "#41AB5D") +
ggplot2::annotate("text", x = -125, y = 35, label = "CAL(16%)", size=3,colour = "#41AB5D") +
ggplot2::annotate("segment", x = -124, xend = -120, y = 35.5, yend = 38,colour = "#41AB5D")
* the online/paper split is ~ 20% - 80%
* California and New York are the first places where press reports were written

History: