Public-facing data portals are increasingly common among local governments. This script explores the content of the open data portal of Austin, TX (http://data.austintexas.gov) by collecting all data provided on the platform through the Socrata API and characterizing data provision and demand.
# Fetch the Socrata catalog listing for one open data portal.
#
# Args:
#   dm:    Portal domain to query (defaults to the Austin, TX portal).
#   limit: Maximum number of catalog entries requested in the call.
#
# Returns:
#   The parsed JSON response with nested fields flattened
#   (`flatten = TRUE`).
#
# Stops with an error when the API answers with an HTTP error status.
get_content <- function(dm = "data.austintexas.gov", limit = 10000) {
  response <- httr::GET(
    "https://api.us.socrata.com/api/catalog/v1",
    # Passing the parameters as a query list lets httr URL-encode them.
    query = list(
      domains = dm,
      # Keep large limits from being rendered as e.g. "1e+05".
      limit = format(limit, scientific = FALSE, trim = TRUE)
    )
  )
  # Fail loudly instead of trying to parse an error response as data.
  httr::stop_for_status(response)
  response %>%
    httr::content("text", encoding = "UTF-8") %>%
    fromJSON(flatten = TRUE)
}
# Pull the catalog listing for the Austin portal
city <- get_content()
# Gather the attributes of interest into a single data frame, one row per
# catalog entry
austin <- local({
  res <- city$results
  tibble(
    dataset = res$resource.name,
    category = res$classification.domain_category,
    tags = res$classification.domain_tags,
    datatype = res$resource.type,
    download = res$resource.download_count,
    pageview_last_week = res$resource.page_views.page_views_last_week,
    pageview_last_month = res$resource.page_views.page_views_last_month,
    pageview_total = res$resource.page_views.page_views_total,
    last_update = res$resource.updatedAt
  )
})
# One-row overview: number of datasets plus the number of distinct
# categories and tags, both compared case-insensitively. Missing categories
# are excluded from the category count.
tibble(
  Number_of_datasets = length(austin$dataset),
  Unique_categories = length(unique(tolower(na.omit(austin$category)))),
  Unique_tags = length(unique(tolower(unlist(austin$tags))))
) %>%
  knitr::kable() %>%
  kableExtra::kable_styling(full_width = FALSE)
| Number_of_datasets | Unique_categories | Unique_tags |
|---|---|---|
| 2398 | 18 | 1601 |
# Per-variable data quality: number of NA entries and the share of non-NA
# entries, formatted as a percentage by show_percent().
tibble(
  Variable = names(austin),
  Missing = map_dbl(austin, function(column) sum(is.na(column))),
  Completeness = show_percent(
    1 - map_dbl(austin, function(column) mean(is.na(column)))
  )
) %>%
  knitr::kable() %>%
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Variable | Missing | Completeness |
|---|---|---|
| dataset | 0 | 100% |
| category | 889 | 62.9% |
| tags | 0 | 100% |
| datatype | 0 | 100% |
| download | 7 | 99.7% |
| pageview_last_week | 0 | 100% |
| pageview_last_month | 0 | 100% |
| pageview_total | 0 | 100% |
| last_update | 0 | 100% |
# Datasets per category, largest first. The percentage is taken over all
# rows of `austin`, so datasets without a category stay in the denominator.
austin$category %>%
  table() %>%
  as.data.frame() %>%
  setNames(c("Category", "Number of datasets")) %>%
  arrange(desc(`Number of datasets`)) %>%
  mutate(
    Percentage = show_percent(`Number of datasets` / nrow(austin))
  ) %>%
  knitr::kable() %>%
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Category | Number of datasets | Percentage |
|---|---|---|
| Public Safety | 295 | 12.3% |
| City Government | 261 | 10.9% |
| Utilities and City Services | 220 | 9.2% |
| Budget and Finance | 170 | 7.1% |
| Health and Community Services | 136 | 5.7% |
| Locations and Maps | 110 | 4.6% |
| Building and Development | 80 | 3.3% |
| Environment | 77 | 3.2% |
| Recreation and Culture | 45 | 1.9% |
| Transportation and Mobility | 42 | 1.8% |
| Housing and Real Estate | 39 | 1.6% |
| Geodata | 14 | 0.6% |
| Government | 6 | 0.3% |
| City Infrastructure | 4 | 0.2% |
| Environmental | 4 | 0.2% |
| Utility | 4 | 0.2% |
| Capital Planning | 1 | 0% |
| Financial | 1 | 0% |
# The 20 most frequent tags: flatten the per-dataset tag lists, count each
# tag, and keep the top of the descending ranking.
table(unlist(austin$tags)) %>%
  as.data.frame() %>%
  setNames(c("Tag", "Number of datasets")) %>%
  arrange(desc(`Number of datasets`)) %>%
  head(20) %>%
  knitr::kable() %>%
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Tag | Number of datasets |
|---|---|
| atcems | 160 |
| annual performance report | 122 |
| indicator | 122 |
| measure | 122 |
| ems | 121 |
| austin | 93 |
| mobility | 92 |
| annual report | 79 |
| development | 67 |
| transportation | 64 |
| water quality | 60 |
| sustainability | 59 |
| 911 | 58 |
| police | 57 |
| eii | 55 |
| ambulance | 54 |
| indicators | 54 |
| ali | 52 |
| energy | 52 |
| bacteria | 50 |
# Share of each datatype among all datasets, largest share first
df <- austin$datatype %>%
  table() %>%
  prop.table() %>%
  as.data.frame() %>%
  setNames(c("Datatype", "Percentage")) %>%
  arrange(desc(Percentage))
# Donut chart: a single stacked bar at x = 2 wrapped into polar coordinates;
# the x limits starting at 0.5 leave the hole in the middle.
ggplot(df, aes(x = 2, y = Percentage, fill = Datatype)) +
  geom_col(width = 1, color = "white") +
  geom_text(
    aes(label = paste0(round(Percentage, 2)*100, "%")),
    position = position_stack(vjust = 0.5)
  ) +
  coord_polar("y", start = 0) +
  xlim(0.5, 2.5) +
  labs(x = "Datatype", y = "Percent") +
  theme(
    legend.position = "right",
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank()
  )
# Horizontal violin plot with an overlaid boxplot of per-dataset download
# counts; all datasets share one dummy group ("datasets"). Missing download
# counts are dropped silently (na.rm = TRUE).
ggplot(austin, aes(x = factor("datasets"), y = download)) +
  geom_violin(trim = FALSE, alpha = 0.5, na.rm = TRUE) +
  geom_boxplot(colour = "black", width = .16, alpha = 0.8, na.rm = TRUE) +
  coord_flip() +
  labs(
    x = element_blank(),
    y = "Count",
    title = "Distribution of Download Numbers"
  ) +
  theme(legend.position = "none")
# Horizontal violin plot with an overlaid boxplot of total page views per
# dataset; all datasets share one dummy group ("datasets").
ggplot(austin, aes(x = factor("datasets"), y = pageview_total)) +
  geom_violin(trim = FALSE, alpha = 0.5) +
  geom_boxplot(colour = "black", width = .16, alpha = 0.8) +
  # NOTE(review): no fill aesthetic is mapped in this plot, so this scale
  # appears to have nothing to act on.
  scale_fill_viridis_d() +
  coord_flip() +
  labs(
    x = element_blank(),
    y = "Count",
    title = "Distribution of Total Pageview Numbers"
  ) +
  theme(legend.position = "none")
Urban studies have been critical of the “bias toward the supply side” and lack of “sufficient attention to the user perspective” in the way that open government data initiatives are implemented (Dawes et al., 2016). There is a chance that cities publish data that are easy to share, but are not necessarily of interest to an audience external to government.
Here we compile the supply (number of datasets under each category) and demand (total number of times that these datasets are downloaded and viewed) of each category, rank the categories by supply, downloads, and pageviews, and visualize the discrepancy between these three rankings in the following graph.
# Categories provided and used
#   provided   = number of datasets published under the category
#   downloaded = total downloads of those datasets
#   viewed     = total page views of those datasets
# Only uncategorized rows are dropped. Datasets with a missing download count
# still count towards `provided` and `viewed`; dropping them with na.omit()
# would undercount the supply of their category.
category_comparison <- austin %>%
  select(category, download, pageview_total) %>%
  filter(!is.na(category)) %>%
  group_by(category) %>%
  summarise(
    provided = n(),
    downloaded = sum(download, na.rm = TRUE),
    viewed = sum(pageview_total, na.rm = TRUE)
  ) %>%
  mutate(
    provided_rank = dense_rank(desc(provided)),
    download_rank = dense_rank(desc(downloaded)),
    # "up" when the demand rank number exceeds the supply rank number by
    # more than one, "down" otherwise.
    # NOTE(review): a larger rank number is a lower position, so the
    # "up"/"down" wording may be inverted - confirm the intended meaning.
    change1 = ifelse(download_rank - provided_rank > 1, "up", "down"),
    pageview_rank = dense_rank(desc(viewed)),
    change2 = ifelse(pageview_rank - provided_rank > 1, "up", "down")
  )
# Preview the first rows without the helper columns
category_comparison %>%
  select(-c(change1, change2)) %>%
  head() %>%
  knitr::kable() %>%
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE)
| category | provided | downloaded | viewed | provided_rank | download_rank | pageview_rank |
|---|---|---|---|---|---|---|
| Budget and Finance | 170 | 83181 | 1612341 | 4 | 7 | 2 |
| Building and Development | 73 | 43807 | 211130 | 8 | 8 | 9 |
| Capital Planning | 1 | 235 | 39 | 15 | 17 | 18 |
| City Government | 261 | 226341 | 698346 | 2 | 2 | 5 |
| City Infrastructure | 4 | 2736 | 1962 | 14 | 12 | 14 |
| Environment | 77 | 29082 | 91637 | 7 | 10 | 11 |
# Category names sorted by descending supply and then reversed, so they can
# be written at increasing y positions on the middle axis.
supply_lable <- category_comparison[
  order(category_comparison$provided, decreasing = TRUE),
]$category %>% rev()
# Label positions are derived from the number of categories instead of a
# hard-coded count.
# NOTE(review): assumes each row (one per category) is drawn with unit
# height, so every axis is n_categories units tall - confirm.
n_categories <- nrow(category_comparison)
rank_labels <- seq(n_categories, 1, -1)
ggplot(
  category_comparison,
  aes(axis1 = pageview_rank, axis2 = provided_rank, axis3 = download_rank)
) +
  geom_alluvium(aes(fill = change1)) +
  geom_alluvium(aes(fill = change2)) +
  scale_x_discrete(expand = c(0.1, 0)) +
  # Axis headings, placed one unit above the stacks
  annotate(
    "text",
    x = c(1, 2, 3), y = n_categories + 1,
    label = c("Rank in Pageview", "Rank in Supply", "Rank in Download"),
    size = 3.5, fontface = 2
  ) +
  # Category names on the middle (supply) axis
  annotate(
    "text",
    x = 2, y = seq(0.6, n_categories, 1), size = 3,
    label = supply_lable
  ) +
  # Rank numbers next to the download (right) and pageview (left) axes
  annotate(
    "text",
    x = 3.2, y = seq(0.5, n_categories, 1), label = rank_labels, size = 3
  ) +
  annotate(
    "text",
    x = 0.8, y = seq(0.5, n_categories, 1), label = rank_labels, size = 3
  ) +
  ggtitle("Change of ranking in categories supplied, downloaded and viewed") +
  theme(
    axis.title = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    legend.position = "none"
  )
The stack in the middle of the graph is the ranking based on the number of datasets in each category. The columns on the left and right are the rankings of page views and number of downloads. The colored curves indicate the change of ranking between each pair of features. Overall, an arch-shaped curve means the category has many datasets in supply but attracts little public interest, while a U-shaped curve indicates that the category has lower priority in supply but attracts more interest.
The category of Public Safety, which includes datasets like Annual Crime Dataset, Austin Fire Station Map, and Traffic Fatalities for each year, is the largest category on this open data portal, probably because these datasets are easy to gather from standard administrative reporting routines. They are not much viewed but are frequently downloaded, probably for research and analysis purposes.
The Environment category includes water quality data for the natural creeks, aquifers and lakes in the Austin area. These data are also easy to share, as they are migrated directly from an existing field sample database and raise fewer privacy concerns, but they rank low in both pageviews and downloads.
The categories that rank high in demand but average in supply include Utilities and City Services, Recreation and Culture, and Transportation and Mobility. At the very least, the mismatch could be explained by the appeal of these data to the communities and the lower technical barriers to using and viewing them. For example, some of these datasets come in standard formats (e.g. 311, GTFS), and others are visualizations provided directly on the data portal (e.g. pool maps).
The format in which open data are provided is consequential in that many formats presuppose ways to access and use the underlying information. Machine-readable data formats (“dataset” on Socrata) invite civic-minded developers to build products and cultivate values around them, while online visualization, data stories (narrative), and mapping are more accessible to the public as they remove technical demands for non-technical users.
Here we compare and contrast the datasets in each format and the frequency that they are downloaded and viewed.
# Supply vs. downloads per datatype, both expressed as a share (in percent)
# of the portal-wide total. The supply share is negated so it is drawn to
# the left of zero and the download share to the right.
# The column name `average_number_of_download` is kept because it is shown
# in the legend; the value is the datatype's share of all downloads, which
# matches how the pageview comparison is computed.
datatype_comparison <- austin %>%
  group_by(datatype) %>%
  summarize(
    average_number_of_download = sum(download, na.rm = TRUE),
    number_of_datasets = n()
  ) %>%
  mutate(
    average_number_of_download =
      average_number_of_download * 100 / sum(austin$download, na.rm = TRUE),
    number_of_datasets = -1 * number_of_datasets * 100 / nrow(austin)
  ) %>%
  gather(
    key = "facet", value = "value",
    average_number_of_download:number_of_datasets
  )
ggplot(
  datatype_comparison,
  aes(x = reorder(datatype, value), y = value, fill = facet)
) +
  # One layer draws both measures; `fill = facet` tells them apart.
  # (`subset =` is not a geom_bar() argument, so the former two layers were
  # identical copies of the full data.)
  geom_col() +
  # Mirror the axis labels so both directions read as positive percentages
  scale_y_continuous(
    breaks = seq(-100, 100, 20),
    labels = as.character(c(seq(100, 0, -20), seq(20, 100, 20)))
  ) +
  labs(
    x = "Datatypes",
    y = "Percent in supply vs Percent in number of download"
  ) +
  coord_flip()
# Supply vs. page views per datatype, both expressed as a share (in percent)
# of the portal-wide total. The supply share is negated so it is drawn to
# the left of zero and the pageview share to the right.
# The column name `average_number_of_pageview` is kept because it is shown
# in the legend; the value is the datatype's share of all page views.
datatype_comparison <- austin %>%
  group_by(datatype) %>%
  summarize(
    average_number_of_pageview = sum(pageview_total, na.rm = TRUE),
    number_of_datasets = n()
  ) %>%
  mutate(
    # Use na.rm in the denominator as well, so one missing value cannot
    # turn every share into NA.
    average_number_of_pageview =
      average_number_of_pageview * 100 /
        sum(austin$pageview_total, na.rm = TRUE),
    number_of_datasets = -1 * number_of_datasets * 100 / nrow(austin)
  ) %>%
  gather(
    key = "facet", value = "value",
    average_number_of_pageview:number_of_datasets
  )
ggplot(
  datatype_comparison,
  aes(x = reorder(datatype, value), y = value, fill = facet)
) +
  # One layer draws both measures; `fill = facet` tells them apart.
  # (`subset =` is not a geom_bar() argument, so the former two layers were
  # identical copies of the full data.)
  geom_col() +
  # Mirror the axis labels so both directions read as positive percentages
  scale_y_continuous(
    breaks = seq(-100, 100, 20),
    labels = as.character(c(seq(100, 0, -20), seq(20, 100, 20)))
  ) +
  labs(
    x = "Datatypes",
    y = "Percent in supply vs Percent in number of pageview"
  ) +
  coord_flip()
Dataset, story, map, and chart are the formats that are in high supply and are also the most popular in terms of being downloaded and viewed. Among them, data in the form of “dataset” (presumably as .csv, .xml, or .rdf files) are built for use and reuse, while map and story that visualize geospatial data are more useful for viewing.