In 2015, the City of Boston launched a new 311 system for reporting non-emergency issues around the city. It is a neat, frequently used platform, with apps for both iPhone and Android for maximum accessibility. The data collected from the system informs the city’s goal-setting. The 311 dataset for this project is maintained by the Mayor’s Office, and is available here.
First things first: download the data. This is a relatively large dataset (0.424 GB), so it may take some time to download and parse.
library(data.table)
# fread() parses the large CSV considerably faster than read.csv()
data <- fread("https://data.cityofboston.gov/api/views/awu8-dc52/rows.csv?accessType=DOWNLOAD")
head(data)  # preview the first few rows
str(data)   # inspect column names and types
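Since the file is large, it may be worth caching a local copy on the first run so later runs skip the download. Here is a small sketch that can stand in for the fread() call above; the file name boston311.csv is arbitrary:
# Cache the raw CSV locally; reuse it on subsequent runs
local_copy <- "boston311.csv"  # arbitrary local file name
if (!file.exists(local_copy)) {
  data <- fread("https://data.cityofboston.gov/api/views/awu8-dc52/rows.csv?accessType=DOWNLOAD")
  fwrite(data, local_copy)
} else {
  data <- fread(local_copy)
}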
There is a lot of valuable information here! The Data Dictionary explains what each variable means in further detail.
The next step is data cleaning. Format the date and time to a consistent format, and simplify neighborhood names for further plotting.
# Set consistent date/time format
data$OPEN_DT<-as.POSIXct(data$OPEN_DT, format = "%m/%d/%Y %I:%M:%S %p", tz = "EST5EDT")
data$TARGET_DT<-as.POSIXct(data$TARGET_DT, format = "%m/%d/%Y %I:%M:%S %p", tz = "EST5EDT")
data$CLOSED_DT<-as.POSIXct(data$CLOSED_DT, format = "%m/%d/%Y %I:%M:%S %p", tz = "EST5EDT")
# Recode missing neighborhood names as "Unspecified"
data$neighborhood[data$neighborhood==""]<-"Unspecified"
# Simplify Neighborhood names
data$neighborhood[data$neighborhood=="Downtown / Financial District"]<-"Downtown"
data$neighborhood[data$neighborhood=="South Boston / South Boston Waterfront"]<-"South Boston"
data$neighborhood[data$neighborhood=="Fenway / Kenmore / Audubon Circle / Longwood"]<-"Fenway-Kenmore"
For this analysis, we are only interested in requests made after January 1, 2016, so we subset the data and generate an initial plot:
library(dplyr); library(RColorBrewer); library(ggthemes); library(ggplot2);
s2<-subset(data, OPEN_DT>as.POSIXct("2016-01-01 00:00:00"))
s3<-data.frame(s2)
cols <- c("SUBJECT", "REASON", "TYPE", "Department", "neighborhood")
s3[cols] <- lapply(s3[cols], factor)
s3$neighborhood<-factor(s3$neighborhood, levels=rev(levels(s3$neighborhood)))  # reverse the level order without relabelling the data
s3_nh1<-data.frame(s3 %>% count(neighborhood, sort=TRUE))
s3_nh1$perc<-round((s3_nh1$n/sum(s3_nh1$n) * 100), digits=2)
f <- function(x) factor(x, levels = rev(unique(x)))
ggplot(s3_nh1, aes(x=f(neighborhood), y=perc)) + geom_bar(stat="identity") + theme_economist()+coord_flip()+
ylab("Percent of Total 311 Requests") + xlab("Neighborhood")+
ggtitle("311 Requests by Neighborhood")+
theme(plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"), plot.title = element_text(hjust = 0.5))
Thus, the Top 5 neighborhoods by volume of 311 requests are Jamaica Plain, Boston, Beacon Hill, Hyde Park, and Dorchester.
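As a quick check, the Top 5 can be read straight off the sorted counts, since s3_nh1 was built with sort=TRUE:
head(s3_nh1$neighborhood, 5)  # neighborhoods with the most requests, descending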
Next, let’s look at the subject distribution for all the 311 requests.
s3_sub<-data.frame(s3 %>% count(SUBJECT, sort=TRUE))
s3_sub$perc<-round(s3_sub$n/nrow(s3) * 100, digits=1)
s3_sub$SUBJECT<-as.character(s3_sub$SUBJECT);
s3_sub$SUBJECT<-factor(s3_sub$SUBJECT, levels=rev(s3_sub$SUBJECT));
ggplot(data=s3_sub, aes(x=SUBJECT, y=perc)) + theme_economist() +
geom_bar(position="dodge",stat="identity") +
coord_flip() +
scale_y_continuous(limits = c(0,55), expand = c(0, 0)) +
scale_x_discrete(expand=c(0,0))+
ggtitle("311 Requests by Subject") +
xlab("Subject") + ylab("Percentage") +
theme(axis.ticks=element_blank(), panel.grid.minor=element_blank(),
plot.title = element_text(hjust = 0.5), plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"))
We can see that most of the 311 requests come through the Public Works Department, Transportation, and Inspectional Services.
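The counts behind the plot bear this out; s3_sub is sorted by volume, so its first rows are the top subjects:
head(s3_sub, 3)  # the three most common subjects, with counts and percentages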
Let’s take a closer look at the 311 request subjects from each neighborhood:
library(dplyr); library(RColorBrewer); library(ggthemes); library(ggplot2); library(stringr);
colourCount = length(unique(s3$SUBJECT))
getPalette = colorRampPalette(brewer.pal(12, "Set3"))
s3_sub_nh<-data.frame(s3 %>% count(SUBJECT, neighborhood, sort=TRUE))
s3_nh<-data.frame(s3 %>% count(neighborhood, sort=TRUE))
s4_sub_nh<-merge(s3_sub_nh, s3_nh, by="neighborhood")
s4_sub_nh$perc<-round(s4_sub_nh$n.x/s4_sub_nh$n.y * 100, digits=1)
s4_sub_nh$n.x<-s4_sub_nh$n.y<-NULL
s4_sub_nh_top5<-subset(s4_sub_nh, neighborhood %in% c("Jamaica Plain", "Boston", "Beacon Hill", "Hyde Park", "Dorchester"))
s4_sub_nh$neighborhood = str_wrap(s4_sub_nh$neighborhood , width = 10)
s4_sub_nh$neighborhood = factor(s4_sub_nh$neighborhood, levels=rev(unique(s4_sub_nh$neighborhood)))
ggplot(s4_sub_nh, aes(x = neighborhood, y = perc, fill = SUBJECT)) +
theme_economist()+
geom_bar(stat="identity", width = 0.7) +
scale_fill_manual(values = getPalette(colourCount))+
scale_y_continuous(limits = c(0,100), expand = c(0, 0))+
labs(x = "Neighborhood", y = "Percentage") +
ggtitle("Subject of 311 Requests for All Neighborhoods")+
theme_minimal(base_size = 14)+labs(fill="")+theme(legend.position="bottom", plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"),
plot.title = element_text(hjust = 0.5))+
coord_flip()
For the Top 5 neighborhoods identified above, let’s zoom into their 311 request subjects:
colourCount = length(unique(s4_sub_nh$SUBJECT))+7  # extra colours are harmless with scale_fill_manual
getPalette = colorRampPalette(brewer.pal(8, "Set2"))  # Set2 has at most 8 colours; colorRampPalette interpolates beyond that
ggplot(s4_sub_nh_top5, aes(x = neighborhood, y = perc, fill = SUBJECT)) +
theme_economist()+
geom_bar(stat="identity", width = 0.7) +
scale_fill_manual(values = getPalette(colourCount))+
scale_y_continuous(limits = c(0,100), expand = c(0, 0))+
labs(x = "Neighborhood", y = "Percentage") +
ggtitle("Subject of 311 Requests for Top 5 Neighborhoods")+
theme_minimal(base_size = 14)+labs(fill="")+theme(legend.position="bottom", plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"),
plot.title = element_text(hjust = 0.5))+
coord_flip()
There are a total of 54 unique reasons for 311 requests. To make sense of these, we first extract the top 10 reasons for 311 requests from the larger dataset, and plot them by neighborhood.
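As a sanity check, the number of distinct reasons can be read off the factor levels:
length(levels(s3$REASON))  # 54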
library(dplyr); library(RColorBrewer); library(ggthemes); library(ggplot2); library(knitr);
s3_rea<-data.frame(s3 %>% count(REASON, sort=TRUE))
s3_rea<-s3_rea$REASON[1:10]
s4<-s3[s3$REASON %in% s3_rea,]
s4_rea_nh<-data.frame(s4 %>% count(REASON, neighborhood, sort=TRUE))
s4_nh<-data.frame(s3 %>% count(neighborhood, sort=TRUE))
s4_rea_nh<-merge(s4_rea_nh, s4_nh, by="neighborhood")
s4_rea_nh$perc<-round(s4_rea_nh$n.x/s4_rea_nh$n.y * 100, digits=1)
s4_rea_nh$n.x<-s4_rea_nh$n.y<-NULL
s4_rea_nh_top5<-subset(s4_rea_nh, neighborhood %in% c("Jamaica Plain", "Boston", "Beacon Hill", "Hyde Park", "Dorchester"))
s4_rea_nh$neighborhood = factor(s4_rea_nh$neighborhood, levels=rev(unique(s4_rea_nh$neighborhood)))
colourCount = length(unique(s4$REASON))
getPalette = colorRampPalette(brewer.pal(colourCount, "Set3"))
ggplot(s4_rea_nh, aes(x = neighborhood, y = perc, fill = REASON)) +
theme_economist()+
geom_bar(stat="identity", width = 0.7) +
scale_fill_manual(values = getPalette(colourCount))+
scale_y_continuous(limits = c(0,100), expand = c(0, 0))+
labs(x = "Neighborhood", y = "Percentage") +
theme_minimal(base_size = 14)+labs(fill="")+theme(legend.position="bottom")+
coord_flip()
The Top 10 reasons capture most neighborhoods well, but fall short for a few others. To better understand the different reasons behind 311 requests, we look at neighborhoods where fewer than 75% of 311 requests are captured by the Top 10 reasons: Allston/Brighton, Mattapan, Roslindale, and West Roxbury might face issues quite different from those of the other neighborhoods. As sketched below, we can compute that cutoff directly, then subset these neighborhoods and look at the distribution of 311 reasons in these areas:
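A minimal sketch of the coverage check, assuming s3 and the Top 10 reason vector s3_rea from the previous chunk:
# Percent of each neighborhood's requests covered by the Top 10 reasons
coverage <- s3 %>%
  group_by(neighborhood) %>%
  summarise(pct_top10 = round(mean(REASON %in% s3_rea) * 100, 1))
subset(coverage, pct_top10 < 75)  # neighborhoods poorly captured by the Top 10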
library(dplyr); library(RColorBrewer); library(ggthemes); library(ggplot2); library(knitr);
s3_reason_subset<-subset(s3, neighborhood %in% c("Allston / Brighton", "Mattapan", "Roslindale", "West Roxbury"))
s4_rea_nh<-data.frame(s3_reason_subset %>% count(REASON, neighborhood, sort=TRUE))
s4_nh<-data.frame(s3_reason_subset %>% count(neighborhood, sort=TRUE))
s4_rea_nh<-merge(s4_rea_nh, s4_nh, by="neighborhood")
s4_rea_nh$perc<-round(s4_rea_nh$n.x/s4_rea_nh$n.y * 100, digits=1)
s4_rea_nh$n.x<-s4_rea_nh$n.y<-NULL
s4_rea_nh$neighborhood = factor(s4_rea_nh$neighborhood, levels=rev(unique(s4_rea_nh$neighborhood)))
options(knitr.kable.NA = '')
s4_rea_nh2<-subset(s4_rea_nh, perc>4.00)
w<-reshape(s4_rea_nh2, timevar=c("neighborhood"), idvar=c("REASON"), direction="wide")
names(w)<-gsub("perc.", "", names(w), fixed=TRUE); names(w)[1]<-"Reason"; row.names(w)<-NULL;
kable(w)
Reason | Allston / Brighton | Mattapan | Roslindale | West Roxbury |
---|---|---|---|---|
Employee & General Comments | 36.6 | | | |
Call Inquiry | 20.1 | | | |
Park Maintenance & Safety | 17.9 | 8.3 | 11.0 | 15.3 |
Disability | 5.5 | | | |
Signs & Signals | 5.1 | | 4.2 | 5.0 |
Health | | 8.3 | | |
Highway Maintenance | | 8.3 | 8.2 | 10.3 |
Recycling | | 25.0 | | |
Sanitation | | 16.7 | 11.8 | 5.3 |
Building | | 25.0 | 11.7 | 7.6 |
Weights and Measures | | 8.3 | | |
Enforcement & Abandoned Vehicles | | | 10.6 | 11.5 |
Code Enforcement | | | 7.8 | 6.1 |
Trees | | | 5.0 | 4.2 |
Street Cleaning | | | 5.7 | 9.9 |
Housing | | | | 10.7 |
We can see here that these four neighborhoods have somewhat different priorities for 311 requests than the other neighborhoods. Allston/Brighton’s requests are primarily for Employee & General Comments, Call Inquiry, and Park Maintenance & Safety. Mattapan residents, on the other hand, mostly report Recycling and Building issues. Roslindale and West Roxbury show a generally even split across Enforcement & Abandoned Vehicles, Building, Sanitation, and Park Maintenance & Safety. However, given that only Allston/Brighton generates a significant volume of 311 requests relative to the other neighborhoods, these areas might not be immediate priorities for the City of Boston.
We can compare these communities to the Top 5 neighborhoods by volume of 311 requests identified above:
s4_rea_nh_top5$neighborhood = factor(s4_rea_nh_top5$neighborhood, levels=rev(unique(s4_rea_nh_top5$neighborhood)))
w<-reshape(s4_rea_nh_top5, timevar=c("neighborhood"), idvar=c("REASON"), direction="wide")
names(w)<-gsub("perc.", "", names(w), fixed=TRUE); names(w)[1]<-"Reason"; row.names(w)<-NULL;
kable(w)
Reason | Beacon Hill | Boston | Dorchester | Hyde Park | Jamaica Plain |
---|---|---|---|---|---|
Enforcement & Abandoned Vehicles | 32.9 | 10.3 | 12.8 | 19.7 | 12.1 |
Trees | 3.3 | 3.1 | 4.2 | 1.7 | 3.3 |
Building | 7.3 | 4.7 | 5.6 | 4.9 | 5.4 |
Highway Maintenance | 5.7 | 9.9 | 10.2 | 12.8 | 6.3 |
Street Lights | 3.3 | 4.4 | 4.6 | 5.4 | 4.2 |
Sanitation | 8.2 | 14.5 | 15.3 | 3.1 | 20.5 |
Housing | 2.2 | 6.7 | 3.0 | 0.7 | 4.9 |
Street Cleaning | 10.4 | 11.5 | 10.4 | 19.9 | 13.1 |
Signs & Signals | 5.1 | 6.8 | 8.9 | 9.4 | 4.8 |
Code Enforcement | 10.0 | 8.3 | 8.1 | 8.8 | 7.9 |
Lag time is defined as the difference between a request’s actual completion time and its target completion time; a positive lag time means the request was closed after its target date.
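For example, a request targeted for January 10 but closed on January 13 has a lag time of +3 days:
# Illustrative only: lag time = closed date minus target date
difftime(as.POSIXct("2016-01-13"), as.POSIXct("2016-01-10"), units="days")
## Time difference of 3 days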
library(knitr); library(gridExtra);
otstat<-round((table(s3$OnTime_Status)/nrow(s3) * 100), digits=1)
otstat[-1]  # on-time completion percentages (the empty first level is dropped)
##
## ONTIME OVERDUE
## 84.7 15.3
nhtotals<-data.frame(s3 %>% count(neighborhood, sort=TRUE))
s3$projecttime<-as.numeric(difftime(s3$CLOSED_DT, s3$OPEN_DT, tz="EST5EDT", units="days"))  # days from open to close
s3$lagtime<-as.numeric(difftime(s3$CLOSED_DT, s3$TARGET_DT, tz="EST5EDT", units="days"))    # days past target (negative = early)
mean(s3$projecttime, na.rm=TRUE) # Average number of days taken to complete a request
## [1] 8.823248
## Check whether ontime is accurate or not
# test<-subset(s3, s3$lagtime>0); table(test$OnTime_Status)
# test<-subset(s3, s3$lagtime<0); table(test$OnTime_Status)
od<-subset(s3, OnTime_Status=="OVERDUE")
lt_h<-data.frame(od %>% count(neighborhood, sort=TRUE))
lt_all<-merge(lt_h, nhtotals, by="neighborhood"); lt_all$perc<-lt_all$n.x/lt_all$n.y * 100;
lt_all <- lt_all[order(lt_all$perc, decreasing=T),]
lt<-aggregate(lagtime ~ neighborhood, data=od, FUN=mean, na.rm=TRUE)
lt$lagtime<-round(lt$lagtime, digits=2); lt <- lt[order(lt$lagtime, decreasing=T),]
f <- function(x) factor(x, levels = rev(unique(x)))
names(lt)<-c("Neighborhood", "Lag Time in Days"); row.names(lt)<-NULL;
p1<-ggplot(lt, aes(x=f(Neighborhood), y=`Lag Time in Days`))+ geom_bar(stat="identity")+
theme_economist()+coord_flip()+ggtitle("Mean Lag Time in Days")+
scale_y_continuous(limit=c(0,65), expand=c(0,0))+
xlab("Neighborhoods")+ylab("Lag Time in Days")+
theme(plot.title = element_text(hjust = 0.5), plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"))
p2<-ggplot(lt_all, aes(x=f(neighborhood), y=perc))+ geom_bar(stat="identity")+
theme_economist()+coord_flip()+ggtitle("% Requests Overdue")+
scale_y_continuous(limit=c(0,35), expand=c(0,0))+
xlab("")+ylab("Percentage")+
theme(plot.title = element_text(hjust = 0.5), plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"))
grid.arrange(p1, p2, nrow=1, ncol=2)
How about for the Top 5 Neighborhoods? Let’s take a look:
library(knitr); library(gridExtra);
lt<-subset(lt, Neighborhood %in% c("Jamaica Plain", "Boston", "Beacon Hill", "Hyde Park", "Dorchester"))
lt_all<-subset(lt_all, neighborhood %in% c("Jamaica Plain", "Boston", "Beacon Hill", "Hyde Park", "Dorchester"))
p1<-ggplot(lt, aes(x=f(Neighborhood), y=`Lag Time in Days`))+ geom_bar(stat="identity")+
theme_economist()+coord_flip()+ggtitle("Mean Lag Time in Days")+
scale_y_continuous(limit=c(0,65), expand=c(0,0))+
xlab("Neighborhoods")+ylab("Lag Time in Days")+
theme(plot.title = element_text(hjust = 0.5), plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"))
p2<-ggplot(lt_all, aes(x=f(neighborhood), y=perc))+ geom_bar(stat="identity")+
theme_economist()+coord_flip()+ggtitle("% Requests Overdue")+
scale_y_continuous(limit=c(0,35), expand=c(0,0))+
xlab("")+ylab("Percentage")+
theme(plot.title = element_text(hjust = 0.5), plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"))
grid.arrange(p1, p2, nrow=1, ncol=2)
The good news is that about 85% of 311 requests are resolved on or before the target completion time. However, approximately 15% of the City of Boston’s 311 requests since 2016 are overdue. The mean lag time is highest in Beacon Hill, Downtown, and Greater Mattapan, although each of these areas has less than 20% of its requests overdue. Mattapan, Chestnut Hill, and West Roxbury have the highest percentages of overdue requests; given the relatively small volume of requests from these neighborhoods, however, this is not surprising. The Top 5 neighborhoods identified above also experience some of the longest lag times. While a smaller percentage of requests in these neighborhoods is overdue, Beacon Hill and Jamaica Plain would certainly stand to benefit from more resources and decreased lag times.
By department:
od<-subset(s3, OnTime_Status=="OVERDUE")
lt_dept<-data.frame(od %>% count(Department, sort=TRUE))
lt_dept_all<-data.frame(s3 %>% count(Department, sort=TRUE))
lt_dept2<-merge(lt_dept, lt_dept_all, by="Department")
lt_dept2$perc<-round((lt_dept2$n.x/lt_dept2$n.y *100), digits=2)
lt_dept2 <- lt_dept2[order(lt_dept2$n.y, decreasing=T),]
lt_dept2$Department<-as.character(lt_dept2$Department)
lt1<-aggregate(lagtime ~ Department, data=od, FUN=mean, na.rm=TRUE)
lt1$lagtime<-round(lt1$lagtime, digits=2);
lt_dept2<-merge(lt_dept2, lt1, by="Department")
cvi<-read.csv("./CodeValueIndex.csv", header=FALSE, stringsAsFactors=FALSE)
lt_dept3<-merge(lt_dept2, cvi, by.x="Department", by.y="V1"); lt_dept3$Department<-NULL;
lt_dept3<-lt_dept3[,c(5,1,2,3, 4)]
lt_dept3 <- lt_dept3[order(lt_dept3$n.y, decreasing=T),]
names(lt_dept3)<-c("Department", "Overdue Requests", "All Requests", "Percentage of Requests Overdue", "Lag Time (Days)")
row.names(lt_dept3)<-NULL;
kable(lt_dept3)
Department | Overdue Requests | All Requests | Percentage of Requests Overdue | Lag Time (Days) |
---|---|---|---|---|
Public Works | 7739 | 120535 | 6.42 | 28.28 |
Transportation Department | 5261 | 50113 | 10.50 | 22.53 |
Inspectional Services | 14331 | 27970 | 51.24 | 51.95 |
Parks | 1445 | 14710 | 9.82 | 22.40 |
Information Channel (Not a department) | 5911 | 12691 | 46.58 | 20.19 |
Property Management | 610 | 3594 | 16.97 | 31.38 |
Water and Sewer Commission | 584 | 1848 | 31.60 | 11.67 |
Disability Commission | 16 | 539 | 2.97 | 42.51 |
Animal Control | 20 | 452 | 4.42 | 0.05 |
Boston Public Schools | 84 | 116 | 72.41 | 39.45 |
Disabilities/ADA | 7 | 8 | 87.50 | 42.33 |
Looking at overdue requests by department, we can see that the departments with the largest volumes of 311 requests largely keep up: only 6.4% of Public Works requests and 10.5% of Transportation requests are overdue. However, 51.2% of requests to Inspectional Services and 46.6% of requests through the Information Channel are overdue. Inspectional Services also has the longest lag time, at approximately 52 days, which might merit further action.
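For a quick view of the problem departments, we can filter the summary table directly; the 40% cutoff here is arbitrary:
subset(lt_dept3, `Percentage of Requests Overdue` > 40)  # Inspectional Services, the Information Channel, Boston Public Schools, Disabilities/ADA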
library(dplyr); library(knitr);
s3$date<-as.Date(s3$OPEN_DT)
s3$monthyear<-format(as.Date(s3$date), "%Y-%m")
s3$weekyear<-format(as.Date(s3$date), "%Y-%W")
ggplot(data=s3, aes(x=monthyear))+geom_bar()+theme_economist()+
xlab("Month and Year")+ylab("Count of 311 Requests")+ggtitle("311 Requests by Month")+
scale_x_discrete(labels=c("Jan 16\n", "Feb 16\n", "Mar 16\n", "Apr 16\n", "May 16\n", "Jun 16\n", "Jul 16\n",
"Aug 16\n", "Sept 16\n", "Oct 16\n", "Nov 16\n", "Dec 16\n", "Jan 17\n", "Feb 17\n(to date)"))+
theme(axis.ticks=element_blank(), panel.grid.minor=element_blank(),
plot.title = element_text(hjust = 0.5), plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"))+
geom_hline(yintercept = 21000, color="red", linetype="dashed", lwd=2)+
geom_hline(yintercept = 15000, color="green", linetype="dashed", lwd=2)
s3$month<-months(s3$OPEN_DT)
june_sept<-subset(s3, month=="June" | month=="September")
june_sept2<-data.frame(june_sept %>% count(REASON, sort=TRUE))
june_sept2$perc<-round(june_sept2$n/sum(june_sept2$n) * 100, digits=2);
june_sept2$n<-NULL; june_sept2<-june_sept2[1:5,];
names(june_sept2)<-c("Reason for 311 Request", "Percentage")
kable(june_sept2)
Reason for 311 Request | Percentage |
---|---|
Enforcement & Abandoned Vehicles | 13.49 |
Sanitation | 12.72 |
Street Cleaning | 12.40 |
Code Enforcement | 8.98 |
Highway Maintenance | 8.25 |
The City of Boston received the most 311 requests in June and September, and the fewest in January 2016. The winter low is partially attributable to a milder winter. The late-summer/early-fall peak is likely driven by students moving into these neighborhoods and reporting abandoned vehicles, sanitation, and street cleaning issues, among other things.
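As a quick numerical check on those peaks, we can sort the monthly counts:
head(sort(table(s3$monthyear), decreasing=TRUE), 3)  # busiest months by request volume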
Now let’s take a look at HOW people submit 311 requests.
src<-data.frame(s3 %>% count(Source, sort=TRUE))
ggplot(src, aes(x=f(Source), y=n)) + geom_bar(stat="identity") + theme_economist()+coord_flip()+
ylab("Number of 311 Requests") + xlab("Source")+
ggtitle("311 Requests by Source")+
theme(axis.ticks=element_blank(), panel.grid.minor=element_blank(),
plot.title = element_text(hjust = 0.5), plot.margin = unit(c(0.5, 0.5, 0.5, 0.5), "cm"))
We can thus see that a constituent call is the most frequently used means of submitting a 311 request. The BOS:311 app, formerly known as Citizens Connect, runs a close second. Twitter and Maximo account for the fewest 311 requests. A neighborhood-wise breakdown of sources is below:
s3_hood<-data.frame(s3 %>% count(neighborhood, sort=TRUE))
s3_Source_nh<-data.frame(s3 %>% count(neighborhood, Source, sort=TRUE))
s3_sh<-merge(s3_Source_nh, s3_hood, by="neighborhood"); s3_sh$perc<-round((s3_sh$n.x/s3_sh$n.y)*100, digits=2);
colourCount = length(unique(s3_sh$Source))
getPalette = colorRampPalette(brewer.pal(colourCount, "Set3"))
ggplot(s3_sh, aes(x = neighborhood, y = perc, fill = Source)) +
theme_economist_white()+
geom_bar(stat="identity", width = 0.7) +
scale_fill_manual(values = getPalette(colourCount))+
scale_y_continuous(limits = c(-1,101), expand = c(0, 0))+
labs(x = "Neighborhood", y = "Percentage") +
theme_minimal(base_size = 14)+labs(fill="")+theme(legend.position="bottom", plot.margin = unit(c(1,1,1,1), "cm"))+
coord_flip()
Next, we take a look at closure reasons to see why requests are closed, for a general idea of the topics citizens report.
library(tm); library(SnowballC); library(wordcloud); library(randomcoloR);
# Collapse all closure reasons into one string and strip boilerplate phrases
wcdata<-paste(s3$CLOSURE_REASON, sep="", collapse="")
wcdata<-gsub("Case Closed. Closed date ", "", wcdata, fixed=TRUE)
wcdata<-gsub('[[:digit:]]+', '', wcdata)
wcdata<-gsub("Case Resolved ", "", wcdata, fixed=TRUE)
setwd("I:/City of Boston")
write.table(wcdata, "./temp/closurereason.txt")  # the corpus below is built from this file
# Create a corpus
cname <- file.path("./temp/")
docs <- Corpus(DirSource(cname))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace) # Remove whitespace
docs <- tm_map(docs, content_transformer(tolower)) # convert to lowercase; content_transformer() keeps the corpus structure intact
docs <- tm_map(docs, removeWords, stopwords('english'))
docs <- tm_map(docs, removeWords, c("case closed", "closed date", "case"))
# Normalize "resolved" to its stem so all variants are counted together
docs <- tm_map(docs, content_transformer(function(x) gsub("resolved", "resolv", x)))
# Next we convert all words to their word stem to ensure different forms of word are counted as one
docs <- tm_map(docs, stemDocument)
docs <- tm_map(docs, PlainTextDocument)
dtm <- DocumentTermMatrix(docs) # dtm
freq <- colSums(as.matrix(dtm)) #length(freq)
ord <- order(freq)
dtms <- removeSparseTerms(dtm, 0.1) # keep only terms appearing in at least 90% of documents (at most 10% sparsity)
# inspect(dtms)
# freq[head(ord)]
# freq[tail(ord)]
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)  # term frequencies, most common first
palette<-distinctColorPalette(length(subset(freq, freq>2000)))
names(palette)<-NULL
wordcloud(names(freq), freq, min.freq=2000, max.words=100, colors=palette, rot.per=0.25, use.r.layout=FALSE,
scale=c(10,1), random.order=FALSE)
Finally, we will take a look at the point density distribution of the 311 requests to see if there were any geographically significant peaks.
library(ggmap); library(ggplot2); library(animation); library(maps); library(plyr); library(stringr); library(zoo);
# Remove parentheses from Geocoded Location column
data$Geocoded_Location<-gsub("\\(", "", data$Geocoded_Location);
data$Geocoded_Location<-gsub("\\)", "", data$Geocoded_Location);
# Split Geocoded Location into two columns: lat and long
points<-str_split_fixed(data$Geocoded_Location, ", ", 2)
data$lat<-as.numeric(points[,1]); data$lon<-as.numeric(points[,2]);
animap<-data.frame(s3$LATITUDE, s3$LONGITUDE, s3$OPEN_DT)
animap$s3.OPEN_DT<-as.yearmon(animap$s3.OPEN_DT)
names(animap)<-c("Latitude", "Longitude", "start")
# Remove "NA" lat/longs
animap<-animap[animap$Latitude!=42.3594,]; animap<-animap[animap$Longitude!=-71.0587,];
# Get Boston map
boston.map = get_map(location = "Boston, MA", zoom = 11, color = "bw")
# set start same as end
# adjust however you would like
animap$end <- animap$start
timev<-time(zooreg(1:13, as.yearmon("2016-01"), freq = 12))
v = 1:length(timev)
oopt <- animation::ani.options(interval = 1)
detach("package:tm", unload=TRUE); detach("package:NLP", unload=TRUE); library(ggplot2);
test <- function(j)
{ sub<-subset(animap, start==timev[j])
  p <- ggmap(boston.map, extent = "panel", maprange=FALSE) +
    geom_density2d(data = sub, aes(x = Longitude, y = Latitude)) +
    stat_density2d(data = sub, aes(x = Longitude, y = Latitude, fill = ..level..,
      alpha = ..level..), size = 0.01, bins = 5, geom = 'polygon') +
    scale_fill_gradient(low = "green", high = "red") +
    scale_alpha(range = c(0.10, 0.50), guide = FALSE) +
    theme(legend.position = "none", axis.title = element_blank(), text = element_text(size = 12))+
    annotate("text", label=format(timev[j]), x = -70.9, y = 42.35, size=5, color = "red")
  ggsave(filename=paste("animap_",j,".png",sep=""), plot=p, width=8, height=8)
  p  # return the plot so FUN2() can print it for the animation
}
FUN2 <- function() {
lapply(v, function(i) {
print(test(i))
ani.pause()
})
}
FUN2()
# setwd("I:/City of Boston/")
# saveHTML(FUN2(), autoplay = FALSE, loop = FALSE, verbose = FALSE, img.name = "animap", imgdir = "I:/City of Boston/GIF/",
# htmlfile = "animap.html", single.opts = "'controls': ['first', 'previous', 'play', 'next', 'last', 'loop', 'speed'], 'delayMin': 0")
# saveGIF(FUN2(), movie.name="animap.gif", img.name="Rplot")