# This program uses plotly to illustrate certain aspects of air pollution
# and accidents in Barcelona city. The data sets come from Portal Open Data BCN,
# the Ajuntament de Barcelona's open data service, and are available at
# https://www.kaggle.com/xvivancos/barcelona-data-sets. There are 16 files in
# the dataset.
# Required packages ----
# dplyr supplies the data verbs and the %>% pipe used below; plotly turns
# ggplot objects into interactive widgets and attaches ggplot2 on load.
# Attaching them masks stats::filter and stats::lag, the base set operations
# (intersect, setdiff, setequal, union), ggplot2::last_plot and
# graphics::layout.
library(dplyr)
library(plotly)
# List every file in `file_location` (defined outside this section) and derive
# a short label for each one: the base file name cut at its first dot, so a
# path such as "some/dir/name.csv" yields "name".
#
# `filenames`  : character vector of full paths, in list.files() order.
# `file_names` : list with one single-string element per path; an empty
#                directory gives an empty list instead of an error.
filenames <- list.files(path = file_location, full.names = TRUE)
file_names <- lapply(basename(filenames), function(leaf) {
  # Drop everything from the first "." onwards (extension and any infix dots).
  sub("[.].*$", "", leaf)
})
# Read every path in `filenames` as a comma-separated file.
# `files` is an unnamed list holding one data frame per file, in the same
# order as `filenames`, so files[[5]] is the data frame read from filenames[5].
# The length follows `filenames` instead of a fixed count of 16, so a missing
# file no longer triggers read.csv(NA).
files <- lapply(filenames, read.csv, sep = ",")
# Accidents ----
# Takes the first data frame in `files`, adds `total_injuries`
# (Mild.injuries + Serious.injuries per row, missing values ignored), draws
# 300 rows with replacement and plots total injuries against `Day`, with one
# facet and one smoother per district.
accidents <- files[[1]]
accidents <- accidents %>%
  mutate(
    total_injuries = rowSums(
      select(., Mild.injuries, Serious.injuries),
      na.rm = TRUE
    )
  )

# Fixed seed so the sample drawn below is reproducible.
set.seed(1000)
accidents_sample <- accidents %>%
  sample_n(300, replace = TRUE)

# `text` is not a ggplot2 aesthetic, so ggplot2 warns "Ignoring unknown
# aesthetics: text"; it is kept because ggplotly() shows it in the tooltip.
p <- ggplot(data = accidents_sample, aes(x = Day, y = total_injuries)) +
  geom_point(
    aes(text = paste("Part of the day:", Part.of.the.day)),
    size = .7
  ) +
  geom_smooth(aes(color = District.Name, fill = District.Name)) +
  facet_wrap(~District.Name)

# No smoothing method is given, so geom_smooth() reports the one it picks
# when the plot is built.
gg <- ggplotly(p)
gg
# Air quality ----
# Takes the second data frame in `files`, adds `total_pollution`
# (O3.Value + NO2.Value + PM10.Value per row, missing values ignored), bins
# it into `level_pollution`, draws 300 rows with replacement and plots total
# pollution against `Date.Time`, with one facet and one smoother per station.
air_quality <- files[[2]]
air_quality <- air_quality %>%
  mutate(
    total_pollution = rowSums(
      select(., O3.Value, NO2.Value, PM10.Value),
      na.rm = TRUE
    )
  )

# Break points at the 0/25/50/75/98 % quantiles of total pollution.
quantile_total_pollution <- quantile(
  air_quality$total_pollution,
  probs = c(0, .25, .5, .75, .98),
  na.rm = TRUE
)

# findInterval() returns, for each row, the index of the last break point
# that is <= the row's total; the index is stored as a factor level.
level_pollution <- factor(
  findInterval(air_quality$total_pollution, quantile_total_pollution)
)
air_quality <- air_quality %>%
  mutate(level_pollution = level_pollution)

# No seed is set in this section, so the sample depends on the RNG state left
# by whatever ran before it.
air_quality_sample <- air_quality %>%
  sample_n(300, replace = TRUE)

# `text` is not a ggplot2 aesthetic, so ggplot2 warns "Ignoring unknown
# aesthetics: text"; it is kept because ggplotly() shows it in the tooltip.
air_quality_plot <- ggplot(
  air_quality_sample,
  aes(x = Date.Time, y = total_pollution)
) +
  geom_point(aes(text = paste("Air Quality:", Air.Quality)), size = .7) +
  geom_smooth(aes(color = Station, fill = Station)) +
  facet_wrap(~Station)

ggplotly(air_quality_plot)