In 2015, the City of Boston released a new 311 system for reporting non-emergency issues. It is a neat platform, widely used across the city, with apps for both iPhone and Android for maximum access. The data collected through the system informs the city's goal-setting. The 311 dataset for this project is maintained by the Mayor's Office, and is available here.

First things first: download the data. This is a relatively large dataset (0.424 GB) and may take some time to download and parse.

library(data.table)

# Read the full 311 extract directly from the city's open data portal
data <- fread("https://data.cityofboston.gov/api/views/awu8-dc52/rows.csv?accessType=DOWNLOAD")
head(data)   # preview the first few rows
str(data)    # column names and types
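
Since the file is sizable, you may want to cache it locally so re-running the analysis does not re-download it each time. A minimal sketch (the local filename here is arbitrary):

url <- "https://data.cityofboston.gov/api/views/awu8-dc52/rows.csv?accessType=DOWNLOAD"
local <- "boston311.csv"   # hypothetical local cache file

# Download only if no local copy exists yet, then read from disk
if (!file.exists(local)) download.file(url, destfile = local, mode = "wb")
data <- fread(local)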

There is a lot of valuable information here! The Data Dictionary explains what each variable means in further detail.

The next step is data cleaning: parse the date/time columns into a consistent format, and simplify neighborhood names for later plotting.

# Parse the open, target, and closed timestamps into POSIXct (Eastern time)

data$OPEN_DT   <- as.POSIXct(data$OPEN_DT,   format = "%m/%d/%Y %I:%M:%S %p", tz = "EST5EDT")
data$TARGET_DT <- as.POSIXct(data$TARGET_DT, format = "%m/%d/%Y %I:%M:%S %p", tz = "EST5EDT")
data$CLOSED_DT <- as.POSIXct(data$CLOSED_DT, format = "%m/%d/%Y %I:%M:%S %p", tz = "EST5EDT")

# Label requests with a missing neighborhood as "Unspecified"
data$neighborhood[data$neighborhood==""]<-"Unspecified"

# Simplify Neighborhood names
data$neighborhood[data$neighborhood=="Downtown / Financial District"]<-"Downtown"
data$neighborhood[data$neighborhood=="South Boston / South Boston Waterfront"]<-"South Boston"
data$neighborhood[data$neighborhood=="Fenway / Kenmore / Audubon Circle / Longwood"]<-"Fenway-Kenmore"

311 Requests by Neighborhood

For this analysis, we are only interested in requests made after January 1, 2016, so we subset the data and generate an initial plot:

library(dplyr); library(RColorBrewer); library(ggthemes); library(ggplot2);

s2<-subset(data, OPEN_DT>as.POSIXct("2016-01-01 00:00:00"))
s3<-data.frame(s2)   # back to a plain data.frame

cols <- c("SUBJECT", "REASON", "TYPE", "Department", "neighborhood")
s3[cols] <- lapply(s3[cols], factor)   # treat the categorical columns as factors

# Reverse the level order for plotting (assigning to levels() would relabel
# the values themselves, so rebuild the factor instead)
s3$neighborhood <- factor(s3$neighborhood, levels = rev(levels(s3$neighborhood)))
s3_nh1<-data.frame(s3 %>% count(neighborhood, sort=TRUE))
s3_nh1$perc<-round((s3_nh1$n/sum(s3_nh1$n) * 100), digits=2)

f <- function(x) factor(x, levels = rev(unique(x)))   # helper: reverse bar order for coord_flip plots

ggplot(s3_nh1, aes(x=f(neighborhood), y=perc)) + geom_bar(stat="identity") + theme_economist()+coord_flip()+
  ylab("Percent of Total 311 Requests") + xlab("Neighborhood")+
  ggtitle("311 Requests by Neighborhood")+
  theme(plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"), plot.title = element_text(hjust = 0.5))

Thus, the Top 5 neighborhoods by volume of 311 requests are Jamaica Plain, Boston, Beacon Hill, Hyde Park, and Dorchester.
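
Rather than reading the ranking off the chart, the Top 5 can be pulled straight from the count table. A quick sketch (s3_nh1 is already sorted by n; depending on your goal, you may want to drop the "Unspecified" level first):

top5 <- head(subset(s3_nh1, neighborhood != "Unspecified"), 5)
top5   # the five neighborhoods with the most requests, with counts and shares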

311 Requests by Subject

Next, let’s look at the subject distribution for all the 311 requests.

s3_sub<-data.frame(s3 %>% count(SUBJECT, sort=TRUE))
s3_sub$perc<-round(s3_sub$n/nrow(s3) * 100, digits=1)
s3_sub$SUBJECT<-as.character(s3_sub$SUBJECT); 
s3_sub$SUBJECT<-factor(s3_sub$SUBJECT, levels=rev(s3_sub$SUBJECT));

ggplot(data=s3_sub, aes(x=SUBJECT, y=perc)) + theme_economist() +
  geom_bar(position="dodge",stat="identity") + 
  coord_flip() + 
  scale_y_continuous(limits = c(0,55), expand = c(0, 0)) +
  scale_x_discrete(expand=c(0,0))+
  ggtitle("311 Requests by Subject") +
  xlab("Subject") + ylab("Percentage") +
  theme(axis.ticks=element_blank(),  panel.grid.minor=element_blank(), 
        plot.title = element_text(hjust = 0.5), plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"))

We can see that most of the 311 requests come through the Public Works Department, Transportation, and Inspectional Services.
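
The exact shares behind this observation can be read off the top of the count table built above:

head(s3_sub, 3)   # the three subjects with the largest share of requests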

Geographic Variations in 311 Requests

Let’s take a closer look at the 311 request subjects from each neighborhood:

library(dplyr); library(RColorBrewer); library(ggthemes); library(ggplot2); library(stringr);

colourCount = length(unique(s3$SUBJECT))
getPalette = colorRampPalette(brewer.pal(12, "Set3"))

s3_sub_nh<-data.frame(s3 %>% count(SUBJECT, neighborhood, sort=TRUE))
s3_nh<-data.frame(s3 %>% count(neighborhood, sort=TRUE))

s4_sub_nh<-merge(s3_sub_nh, s3_nh, by="neighborhood")
s4_sub_nh$perc<-round(s4_sub_nh$n.x/s4_sub_nh$n.y * 100, digits=1)
s4_sub_nh$n.x<-s4_sub_nh$n.y<-NULL

s4_sub_nh_top5<-subset(s4_sub_nh, neighborhood %in% c("Jamaica Plain", "Boston", "Beacon Hill", "Hyde Park", "Dorchester"))

s4_sub_nh$neighborhood = str_wrap(s4_sub_nh$neighborhood , width = 10)
s4_sub_nh$neighborhood = factor(s4_sub_nh$neighborhood, levels=rev(unique(s4_sub_nh$neighborhood)))

ggplot(s4_sub_nh, aes(x = neighborhood, y = perc, fill = SUBJECT)) +
  theme_economist()+
  geom_bar(stat="identity", width = 0.7) +
  scale_fill_manual(values = getPalette(colourCount))+
  scale_y_continuous(limits = c(0,100), expand = c(0, 0))+
  labs(x = "Neighborhood", y = "Percentage") +
  ggtitle("Subject of 311 Requests for All Neighborhoods")+
  theme_minimal(base_size = 14)+labs(fill="")+theme(legend.position="bottom", plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"),
  plot.title = element_text(hjust = 0.5))+
  coord_flip()

For the Top 5 neighborhoods identified above, let’s zoom into their 311 request subjects:

colourCount = length(unique(s4_sub_nh$SUBJECT))+7
getPalette = colorRampPalette(brewer.pal(8, "Set2"))   # Set2 offers at most 8 base colours

ggplot(s4_sub_nh_top5, aes(x = neighborhood, y = perc, fill = SUBJECT)) +
  theme_economist()+
  geom_bar(stat="identity", width = 0.7) +
  scale_fill_manual(values = getPalette(colourCount))+
  scale_y_continuous(limits = c(0,100), expand = c(0, 0))+
  labs(x = "Neighborhood", y = "Percentage") +
  ggtitle("Subject of 311 Requests for Top 5 Neighborhoods")+
  theme_minimal(base_size = 14)+labs(fill="")+theme(legend.position="bottom", plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"),
  plot.title = element_text(hjust = 0.5))+
  coord_flip()

311 Request Reasons by Neighborhood

There are a total of 54 unique reasons for 311 requests. To make sense of these, we first extract the top 10 reasons for 311 requests from the larger dataset, and plot them by neighborhood.

library(dplyr); library(RColorBrewer); library(ggthemes); library(ggplot2); library(knitr);

s3_rea<-data.frame(s3 %>% count(REASON, sort=TRUE))
s3_rea<-s3_rea$REASON[1:10]

s4<-s3[s3$REASON %in%  s3_rea,]
s4_rea_nh<-data.frame(s4 %>% count(REASON, neighborhood, sort=TRUE))
s4_nh<-data.frame(s3 %>% count(neighborhood, sort=TRUE))

s4_rea_nh<-merge(s4_rea_nh, s4_nh, by="neighborhood")
s4_rea_nh$perc<-round(s4_rea_nh$n.x/s4_rea_nh$n.y * 100, digits=1)
s4_rea_nh$n.x<-s4_rea_nh$n.y<-NULL

s4_rea_nh_top5<-subset(s4_rea_nh, neighborhood %in% c("Jamaica Plain", "Boston", "Beacon Hill", "Hyde Park", "Dorchester"))

s4_rea_nh$neighborhood = factor(s4_rea_nh$neighborhood, levels=rev(unique(s4_rea_nh$neighborhood)))

colourCount = length(unique(s4$REASON))
getPalette = colorRampPalette(brewer.pal(colourCount, "Set3"))

ggplot(s4_rea_nh, aes(x = neighborhood, y = perc, fill = REASON)) +
  theme_economist()+
  geom_bar(stat="identity", width = 0.7) +
  scale_fill_manual(values = getPalette(colourCount))+
  scale_y_continuous(limits = c(0,100), expand = c(0, 0))+
  labs(x = "Neighborhood", y = "Percentage") +
  theme_minimal(base_size = 14)+labs(fill="")+theme(legend.position="bottom")+
  coord_flip()

The top 10 reasons account for most requests in most neighborhoods, but fall short in a few. To better understand the different reasons behind 311 requests, we look at the neighborhoods where less than 75% of 311 requests are captured by the Top 10 reasons. Allston/Brighton, Mattapan, Roslindale, and West Roxbury might face issues quite different from the other neighborhoods, so we subset these neighborhoods and look at the distribution of their 311 request reasons:
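
The 75% cutoff itself can be computed directly. A minimal sketch, reusing s3 (all requests) and the top-10 subset s4 built above:

library(dplyr)

all_nh   <- s3 %>% count(neighborhood)   # requests per neighborhood
top10_nh <- s4 %>% count(neighborhood)   # requests covered by the top 10 reasons

cov <- merge(top10_nh, all_nh, by = "neighborhood")
cov$coverage <- round(cov$n.x / cov$n.y * 100, 1)

subset(cov, coverage < 75)   # neighborhoods poorly captured by the top 10 reasons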

library(dplyr); library(RColorBrewer); library(ggthemes); library(ggplot2); library(knitr);

s3_reason_subset<-subset(s3, neighborhood %in% c("Allston / Brighton", "Mattapan", "Roslindale", "West Roxbury"))

s4_rea_nh<-data.frame(s3_reason_subset %>% count(REASON, neighborhood, sort=TRUE))
s4_nh<-data.frame(s3_reason_subset %>% count(neighborhood, sort=TRUE))

s4_rea_nh<-merge(s4_rea_nh, s4_nh, by="neighborhood")
s4_rea_nh$perc<-round(s4_rea_nh$n.x/s4_rea_nh$n.y * 100, digits=1)
s4_rea_nh$n.x<-s4_rea_nh$n.y<-NULL
s4_rea_nh$neighborhood = factor(s4_rea_nh$neighborhood, levels=rev(unique(s4_rea_nh$neighborhood)))

options(knitr.kable.NA = '')
s4_rea_nh2<-subset(s4_rea_nh, perc>4.00)

w<-reshape(s4_rea_nh2, timevar=c("neighborhood"), idvar=c("REASON"), direction="wide")
names(w)<-gsub("perc.", "", names(w)); names(w)[1]<-"Reason"; row.names(w)<-NULL;
kable(w)
| Reason                           | Allston / Brighton | Mattapan | Roslindale | West Roxbury |
|----------------------------------|-------------------:|---------:|-----------:|-------------:|
| Employee & General Comments      | 36.6               |          |            |              |
| Call Inquiry                     | 20.1               |          |            |              |
| Park Maintenance & Safety        | 17.9               | 8.3      | 11.0       | 15.3         |
| Disability                       | 5.5                |          |            |              |
| Signs & Signals                  | 5.1                |          | 4.2        | 5.0          |
| Health                           |                    | 8.3      |            |              |
| Highway Maintenance              |                    | 8.3      | 8.2        | 10.3         |
| Recycling                        |                    | 25.0     |            |              |
| Sanitation                       |                    | 16.7     | 11.8       | 5.3          |
| Building                         |                    | 25.0     | 11.7       | 7.6          |
| Weights and Measures             |                    | 8.3      |            |              |
| Enforcement & Abandoned Vehicles |                    |          | 10.6       | 11.5         |
| Code Enforcement                 |                    |          | 7.8        | 6.1          |
| Trees                            |                    |          | 5.0        | 4.2          |
| Street Cleaning                  |                    |          | 5.7        | 9.9          |
| Housing                          |                    |          |            | 10.7         |

We can see here that these four neighborhoods have somewhat different priorities for 311 requests than the rest of the city. Allston/Brighton places 311 requests primarily for Employee & General Comments, as well as Call Inquiry and Park Maintenance & Safety. Mattapan residents, on the other hand, mostly report Recycling and Building issues. Roslindale and West Roxbury have a generally even split across Enforcement & Abandoned Vehicles, Building, Sanitation, and Park Maintenance & Safety. However, given that only Allston/Brighton generates a significant number of 311 requests relative to the other neighborhoods, these areas might not be immediate priorities for the City of Boston.
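
As a quick check on those volumes, the four neighborhoods can be looked up in the neighborhood count table built earlier:

subset(s3_nh1, neighborhood %in% c("Allston / Brighton", "Mattapan",
                                   "Roslindale", "West Roxbury"))   # counts and shares of all requests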

We can compare these communities to the Top 5 neighborhoods by volume of 311 requests identified above:

s4_rea_nh_top5$neighborhood = factor(s4_rea_nh_top5$neighborhood, levels=rev(unique(s4_rea_nh_top5$neighborhood)))
w<-reshape(s4_rea_nh_top5, timevar=c("neighborhood"), idvar=c("REASON"), direction="wide")
names(w)<-gsub("perc.", "", names(w)); names(w)[1]<-"Reason"; row.names(w)<-NULL;
kable(w)
| Reason                           | Beacon Hill | Boston | Dorchester | Hyde Park | Jamaica Plain |
|----------------------------------|------------:|-------:|-----------:|----------:|--------------:|
| Enforcement & Abandoned Vehicles | 32.9        | 10.3   | 12.8       | 19.7      | 12.1          |
| Trees                            | 3.3         | 3.1    | 4.2        | 1.7       | 3.3           |
| Building                         | 7.3         | 4.7    | 5.6        | 4.9       | 5.4           |
| Highway Maintenance              | 5.7         | 9.9    | 10.2       | 12.8      | 6.3           |
| Street Lights                    | 3.3         | 4.4    | 4.6        | 5.4       | 4.2           |
| Sanitation                       | 8.2         | 14.5   | 15.3       | 3.1       | 20.5          |
| Housing                          | 2.2         | 6.7    | 3.0        | 0.7       | 4.9           |
| Street Cleaning                  | 10.4        | 11.5   | 10.4       | 19.9      | 13.1          |
| Signs & Signals                  | 5.1         | 6.8    | 8.9        | 9.4       | 4.8           |
| Code Enforcement                 | 10.0        | 8.3    | 8.1        | 8.8       | 7.9           |

Lag Times

The lag time is defined as the difference between a request's actual completion time and its target completion time, so positive values mean a request closed late.

library(knitr); library(gridExtra);

otstat<-round((table(s3$OnTime_Status)/nrow(s3) * 100), digits=1)
otstat[-1]   # On-time completion percentages, dropping the empty level
## 
##  ONTIME OVERDUE 
##    84.7    15.3
nhtotals<-data.frame(s3 %>% count(neighborhood, sort=TRUE))

# Days from open to close, and days late relative to the target date
s3$projecttime<-as.numeric(difftime(s3$CLOSED_DT, s3$OPEN_DT, tz="EST5EDT", units="days"))
s3$lagtime<-as.numeric(difftime(s3$CLOSED_DT, s3$TARGET_DT, tz="EST5EDT", units="days"))

mean(s3$projecttime, na.rm=TRUE) # Average number of days taken to complete a request
## [1] 8.823248
## Check whether ontime is accurate or not
# test<-subset(s3, s3$lagtime>0); table(test$OnTime_Status)
# test<-subset(s3, s3$lagtime<0); table(test$OnTime_Status)

od<-subset(s3, OnTime_Status=="OVERDUE")  
lt_h<-data.frame(od %>% count(neighborhood, sort=TRUE))
lt_all<-merge(lt_h, nhtotals, by="neighborhood"); lt_all$perc<-lt_all$n.x/lt_all$n.y * 100;
lt_all <- lt_all[order(lt_all$perc, decreasing=T),]

lt<-aggregate(lagtime ~ neighborhood, data=od, FUN=mean, na.rm=TRUE)
lt$lagtime<-round(lt$lagtime, digits=2); lt <- lt[order(lt$lagtime, decreasing=T),]

f <- function(x) factor(x, levels = rev(unique(x)))
names(lt)<-c("Neighborhood", "Lag Time in Days"); row.names(lt)<-NULL;

p1<-ggplot(lt, aes(x=f(Neighborhood), y=`Lag Time in Days`))+ geom_bar(stat="identity")+
  theme_economist()+coord_flip()+ggtitle("Mean Lag Time in Days")+
  scale_y_continuous(limit=c(0,65), expand=c(0,0))+
  xlab("Neighborhoods")+ylab("Lag Time in Days")+
  theme(plot.title = element_text(hjust = 0.5), plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"))

p2<-ggplot(lt_all, aes(x=f(neighborhood), y=perc))+ geom_bar(stat="identity")+
  theme_economist()+coord_flip()+ggtitle("% Requests Overdue")+
  scale_y_continuous(limit=c(0,35), expand=c(0,0))+
  xlab("")+ylab("Percentage")+
  theme(plot.title = element_text(hjust = 0.5), plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"))
  
grid.arrange(p1, p2, nrow=1, ncol=2)

How about for the Top 5 Neighborhoods? Let’s take a look:

library(knitr); library(gridExtra);

lt<-subset(lt, Neighborhood %in% c("Jamaica Plain", "Boston", "Beacon Hill", "Hyde Park", "Dorchester"))
lt_all<-subset(lt_all, neighborhood %in% c("Jamaica Plain", "Boston", "Beacon Hill", "Hyde Park", "Dorchester"))

p1<-ggplot(lt, aes(x=f(Neighborhood), y=`Lag Time in Days`))+ geom_bar(stat="identity")+
  theme_economist()+coord_flip()+ggtitle("Mean Lag Time in Days")+
  scale_y_continuous(limit=c(0,65), expand=c(0,0))+
  xlab("Neighborhoods")+ylab("Lag Time in Days")+
  theme(plot.title = element_text(hjust = 0.5), plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"))

p2<-ggplot(lt_all, aes(x=f(neighborhood), y=perc))+ geom_bar(stat="identity")+
  theme_economist()+coord_flip()+ggtitle("% Requests Overdue")+
  scale_y_continuous(limit=c(0,35), expand=c(0,0))+
  xlab("")+ylab("Percentage")+
  theme(plot.title = element_text(hjust = 0.5), plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"))

grid.arrange(p1, p2, nrow=1, ncol=2)

The good news is that about 85% of 311 requests are resolved on or before the target completion time. However, roughly 15% of the City of Boston's 311 requests since 2016 are overdue. Mean lag time is highest in Beacon Hill, Downtown, and Greater Mattapan, although these areas have less than 20% of all requests overdue. Mattapan, Chestnut Hill, and West Roxbury have the highest percentages of requests overdue; given the relatively small volume of requests from these neighborhoods, however, this is not surprising. The Top 5 neighborhoods identified above also experience some of the longest lag times. While a smaller share of requests in these neighborhoods is overdue, Beacon Hill and Jamaica Plain would definitely stand to benefit from more resources and decreased lag times.

By department:

od<-subset(s3, OnTime_Status=="OVERDUE")  
lt_dept<-data.frame(od %>% count(Department, sort=TRUE))
lt_dept_all<-data.frame(s3 %>% count(Department, sort=TRUE))
lt_dept2<-merge(lt_dept, lt_dept_all, by="Department")
lt_dept2$perc<-round((lt_dept2$n.x/lt_dept2$n.y *100), digits=2)
lt_dept2 <- lt_dept2[order(lt_dept2$n.y, decreasing=T),]
lt_dept2$Department<-as.character(lt_dept2$Department)

lt1<-aggregate(lagtime ~ Department, data=od, FUN=mean, na.rm=TRUE)
lt1$lagtime<-round(lt1$lagtime, digits=2); 

lt_dept2<-merge(lt_dept2, lt1, by="Department")

cvi<-read.csv("./CodeValueIndex.csv", header=FALSE, stringsAsFactors=FALSE)
lt_dept3<-merge(lt_dept2, cvi, by.x="Department", by.y="V1"); lt_dept3$Department<-NULL;

lt_dept3<-lt_dept3[,c(5,1,2,3, 4)]
lt_dept3 <- lt_dept3[order(lt_dept3$n.y, decreasing=T),]

names(lt_dept3)<-c("Department", "Overdue Requests", "All Requests", "Percentage of Requests Overdue", "Lag Time (Days)")
row.names(lt_dept3)<-NULL;
kable(lt_dept3)
| Department                             | Overdue Requests | All Requests | Percentage of Requests Overdue | Lag Time (Days) |
|----------------------------------------|-----------------:|-------------:|-------------------------------:|----------------:|
| Public Works                           | 7739             | 120535       | 6.42                           | 28.28           |
| Transportation Department              | 5261             | 50113        | 10.50                          | 22.53           |
| Inspectional Services                  | 14331            | 27970        | 51.24                          | 51.95           |
| Parks                                  | 1445             | 14710        | 9.82                           | 22.40           |
| Information Channel (Not a department) | 5911             | 12691        | 46.58                          | 20.19           |
| Property Management                    | 610              | 3594         | 16.97                          | 31.38           |
| Water and Sewer Commission             | 584              | 1848         | 31.60                          | 11.67           |
| Disability Commission                  | 16               | 539          | 2.97                           | 42.51           |
| Animal Control                         | 20               | 452          | 4.42                           | 0.05            |
| Boston Public Schools                  | 84               | 116          | 72.41                          | 39.45           |
| Disabilities/ADA                       | 7                | 8            | 87.50                          | 42.33           |

Looking at overdue requests by department, we can see that the departments with the largest volume of 311 requests keep pace fairly well: only 6.4% of Public Works requests and 10.5% of Transportation requests are overdue. However, 51.2% of requests for Inspectional Services and 46.6% of requests routed through the Information Channel are overdue. Inspectional Services also has the longest lag time, at approximately 52 days, which might merit further action.
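
To flag such departments programmatically, one option (a sketch using the lt_dept3 table above and an arbitrary 30% cutoff) is:

# Departments where more than 30% of requests are overdue
subset(lt_dept3, `Percentage of Requests Overdue` > 30)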

Timing of 311 Requests

library(dplyr); library(knitr);

s3$date<-as.Date(s3$OPEN_DT)
s3$monthyear<-format(as.Date(s3$date), "%Y-%m")
s3$weekyear<-format(as.Date(s3$date), "%Y-%W")

ggplot(data=s3, aes(x=monthyear))+geom_bar()+theme_economist()+
xlab("Month and Year")+ylab("Count of 311 Requests")+ggtitle("311 Requests by Month")+
scale_x_discrete(labels=c("Jan 16\n", "Feb 16\n", "Mar 16\n", "Apr 16\n", "May 16\n", "Jun 16\n", "Jul 16\n", 
"Aug 16\n", "Sept 16\n", "Oct 16\n", "Nov 16\n", "Dec 16\n", "Jan 17\n", "Feb 17\n(to date)"))+
theme(axis.ticks=element_blank(),  panel.grid.minor=element_blank(), 
        plot.title = element_text(hjust = 0.5), plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"))+
        geom_hline(yintercept = 21000, color="red", linetype="dashed", lwd=2)+
        geom_hline(yintercept = 15000, color="green", linetype="dashed", lwd=2)

s3$month<-months(s3$OPEN_DT)
june_sept<-subset(s3, month=="June" | month=="September")
june_sept2<-data.frame(june_sept %>% count(REASON, sort=TRUE))
june_sept2$perc<-round(june_sept2$n/sum(june_sept2$n) * 100, digits=2); 
june_sept2$n<-NULL; june_sept2<-june_sept2[1:5,]; 
names(june_sept2)<-c("Reason for 311 Request", "Percentage")

kable(june_sept2)
| Reason for 311 Request           | Percentage |
|----------------------------------|-----------:|
| Enforcement & Abandoned Vehicles | 13.49      |
| Sanitation                       | 12.72      |
| Street Cleaning                  | 12.40      |
| Code Enforcement                 | 8.98       |
| Highway Maintenance              | 8.25       |

The City of Boston received the most 311 requests in June and September, and the fewest in January 2016. The winter low is partially attributable to a milder winter. The late-summer/early-fall peak is likely driven by students moving into these neighborhoods and reporting on vehicles, sanitation, and street cleaning, among other things.
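
The peak and trough months can be confirmed by tabulating requests per month (a quick sketch reusing the monthyear column created above, with dplyr already loaded; note that February 2017 is a partial month):

mcounts <- s3 %>% count(monthyear, sort = TRUE)   # requests per month, busiest first
head(mcounts, 2)   # peak months
tail(mcounts, 2)   # quietest months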

Sources of 311 Requests

Now let’s take a look at HOW people submit 311 requests.

src<-data.frame(s3 %>% count(Source, sort=TRUE))

ggplot(src, aes(x=f(Source), y=n)) + geom_bar(stat="identity") + theme_economist()+coord_flip()+
  ylab("Number of 311 Requests") + xlab("Source")+
  ggtitle("311 Requests by Source")+
 theme(axis.ticks=element_blank(),  panel.grid.minor=element_blank(), 
        plot.title = element_text(hjust = 0.5), plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"))

We can thus see that a constituent call is the most frequently used means of submitting a 311 request. The BOS:311 app, formerly known as Citizens Connect, runs a close second. Twitter and Maximo account for the fewest 311 requests. A neighborhood-wise breakdown of sources is below:

s3_hood<-data.frame(s3 %>% count(neighborhood, sort=TRUE))
s3_Source_nh<-data.frame(s3 %>% count(neighborhood, Source, sort=TRUE))
s3_sh<-merge(s3_Source_nh, s3_hood, by="neighborhood"); s3_sh$perc<-round((s3_sh$n.x/s3_sh$n.y)*100, digits=2);

colourCount = length(unique(s3_sh$Source))
getPalette = colorRampPalette(brewer.pal(colourCount, "Set3"))

ggplot(s3_sh, aes(x = neighborhood, y = perc, fill = Source)) +
  theme_economist_white()+
  geom_bar(stat="identity", width = 0.7) +
  scale_fill_manual(values = getPalette(colourCount))+
  scale_y_continuous(limits = c(-1,101), expand = c(0, 0))+
  labs(x = "Neighborhood", y = "Percentage") +
  theme_minimal(base_size = 14)+labs(fill="")+theme(legend.position="bottom", plot.margin = unit(c(1,1,1,1), "cm"))+
  coord_flip()

Closure Reasons

Next, we take a look at closure reasons to see why requests are closed, for a general idea of the topics citizens are reporting.

library(tm); library(SnowballC); library(wordcloud); library(randomcoloR);

# Collapse all closure reasons into one string and strip boilerplate phrases
wcdata<-paste(s3$CLOSURE_REASON, sep="", collapse="")
wcdata<-gsub("Case Closed. Closed date ", "", wcdata, fixed=TRUE)
wcdata<-gsub('[[:digit:]]+', '', wcdata)
wcdata<-gsub("Case Resolved ", "", wcdata, fixed=TRUE)

setwd("I:/City of Boston")
write.table(wcdata, "./temp/closurereason.txt")   # write the cleaned text out for the corpus

# Create a corpus
cname <- file.path("./temp/")   
docs <- Corpus(DirSource(cname))   

docs <- tm_map(docs, removePunctuation) 
docs <- tm_map(docs, removeNumbers)   
docs <- tm_map(docs, stripWhitespace)  # Remove whitespace
docs <- tm_map(docs, content_transformer(tolower))  # convert to lowercase
docs <- tm_map(docs, removeWords, stopwords('english'))

docs <- tm_map(docs, removeWords, c("case closed", "closed date", "case"))   

# Collapse "resolved" to its stem by hand before stemming the rest
docs <- tm_map(docs, content_transformer(function(x) gsub("resolved", "resolv", x)))

# Next we convert all words to their word stem to ensure different forms of word are counted as one
docs <- tm_map(docs, stemDocument)   
docs <- tm_map(docs, PlainTextDocument)   

dtm <- DocumentTermMatrix(docs)   # dtm 

freq <- colSums(as.matrix(dtm))   #length(freq)   

ord <- order(freq)   

dtms <- removeSparseTerms(dtm, 0.1) # This makes a matrix that is 10% empty space, maximum.   
# inspect(dtms)  

# freq[head(ord)]   
# freq[tail(ord)]  
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)   # term frequencies, most frequent first

palette<-distinctColorPalette(length(subset(freq, freq>2000)))
names(palette)<-NULL
wordcloud(names(freq), freq, min.freq=2000, max.words=100, colors=palette, rot.per=0.25, use.r.layout=FALSE,
scale=c(10,1), random.order=FALSE)

Point Density Map

Finally, we will take a look at the point density distribution of the 311 requests to see if there were any geographically significant peaks.

library(ggmap); library(ggplot2); library(animation); library(maps); library(plyr); library(stringr); library(zoo);

# Remove parentheses from Geocoded Location column
data$Geocoded_Location<-gsub("\\(", "", data$Geocoded_Location); 
data$Geocoded_Location<-gsub("\\)", "", data$Geocoded_Location); 

# Split Geocoded Location into two columns: lat and long

points<-str_split_fixed(data$Geocoded_Location, ", ", 2)
data$lat<-as.numeric(points[,1]); data$lon<-as.numeric(points[,2]);

animap<-data.frame(s3$LATITUDE, s3$LONGITUDE, s3$OPEN_DT)
animap$s3.OPEN_DT<-as.yearmon(animap$s3.OPEN_DT)
names(animap)<-c("Latitude", "Longitude", "start")

# Remove "NA" lat/longs
animap<-animap[animap$Latitude!=42.3594,]; animap<-animap[animap$Longitude!=-71.0587,];

# Get Boston map
boston.map = get_map(location = "Boston, MA", zoom = 11, color = "bw")

# set start same as end
#  adjust however you would like
animap$end <- animap$start

timev<-time(zooreg(1:13, as.yearmon("2016-01"), freq = 12))
v = 1:length(timev)

oopt <- animation::ani.options(interval = 1)

detach("package:tm", unload=TRUE); detach("package:NLP", unload=TRUE); library(ggplot2);

test <- function(j)
{ sub<-subset(animap, start==timev[j])

  p <- ggmap(boston.map, extent = "panel", maprange=FALSE) +
    geom_density2d(data = sub, aes(x = Longitude, y = Latitude)) +
    stat_density2d(data = sub, aes(x = Longitude, y = Latitude,  fill = ..level..,
    alpha =    ..level..), size = 0.01, bins = 5, geom = 'polygon') +
    scale_fill_gradient(low = "green", high = "red") +
    scale_alpha(range = c(0.10, 0.50), guide = FALSE) +
    theme(legend.position = "none", axis.title = element_blank(), text = element_text(size = 12))+
    annotate("text", label=format(timev[j]), x = -70.9, y = 42.35, size=5, color = "red")
  ggsave(filename=paste("animap_",j,".png",sep=""), plot=p, width=8, height=8)
  p   # return the plot so FUN2() can print it for the animation
}

FUN2 <- function() {
    lapply(v, function(i) {
    print(test(i))
    ani.pause()
  })
}

FUN2()

# setwd("I:/City of Boston/")
# saveHTML(FUN2(), autoplay = FALSE, loop = FALSE, verbose = FALSE, img.name = "animap", imgdir = "I:/City of Boston/GIF/", 
# htmlfile = "animap.html", single.opts = "'controls': ['first', 'previous', 'play', 'next', 'last', 'loop', 'speed'], 'delayMin': 0")

# saveGIF(FUN2(), movie.name="animap.gif", img.name="Rplot")