Barcelona_Analysis

# This program uses plotly to illustrate certain aspects of air pollution
# and accidents in Barcelona city. The data sets come from the Portal Open Data
# BCN, the Ajuntament de Barcelona's open data service, and they are available
# at https://www.kaggle.com/xvivancos/barcelona-data-sets. There are 16 files
# in the dataset.

#required packages

library(dplyr); warnings = FALSE

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(plotly); warnings = FALSE

## Loading required package: ggplot2

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

# List the data files and derive a short name for each one.
# `file_location` (the folder holding the data set) must already be defined.

# Full paths of every file in the data folder.
filenames <- list.files(path = file_location, full.names = TRUE)

# Short names: drop the directory part, then everything from the first "."
# onwards (i.e. the ".csv" extension). Kept as a list with one character
# string per file, in the same order as `filenames`. Working on the whole
# vector at once also gives an empty list for an empty folder, where indexing
# with 1:length(...) would have iterated over c(1, 0).
file_names <- as.list(sub("[.].*$", "", basename(filenames)))

# Read every listed file into `files`, a list holding one data frame per .csv
# file in the same order as `filenames` (so files[[5]] is the fifth file).
# The data set is expected to contain 16 files, but the count is taken from
# `filenames` instead of being hard-coded, so a folder with fewer files no
# longer fails on read.csv(NA).
# The list is intentionally left unnamed: naming its elements would prefix the
# column names whenever a one-element slice is passed to data.frame().
files <- lapply(filenames, read.csv, sep = ",")

# Accidents: take the first file of the data set (accident statistics), add a
# total-injuries column, draw a random sample and plot it with plotly.

# `[[` extracts the data frame itself, so no data.frame() round trip is needed.
accidents <- files[[1]]

# Injuries per record = mild + serious, with a missing count treated as 0.
# rowSums() is vectorised and replaces a row-by-row apply() over the data frame.
accidents <- accidents %>%
  mutate(
    total_injuries = rowSums(
      cbind(Mild.injuries, Serious.injuries),
      na.rm = TRUE
    )
  )

# Fixed seed so the random sample below is reproducible.
set.seed(1000)

# 300 records drawn with replacement, so a record may appear more than once.
accidents_sample <- accidents %>% sample_n(300, replace = TRUE)

# Total injuries against Day, with one panel and one smoother per district.
# `text` is not a ggplot2 aesthetic (hence the warning echoed below); it is
# supplied so that ggplotly() can show the part of the day in the hover label.
p <- ggplot(data = accidents_sample, aes(x = Day, y = total_injuries)) +
  geom_point(aes(text = paste("Part of the day:", Part.of.the.day)), size = .7) +
  geom_smooth(aes(color = District.Name, fill = District.Name)) +
  facet_wrap(~District.Name)

## Warning: Ignoring unknown aesthetics: text

# Convert the static ggplot into an interactive plotly widget.
gg <- ggplotly(p)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

gg

# Air quality: compare total pollution against time for the different
# measuring stations of Barcelona city. Takes the second file of the data set,
# adds a total-pollution column and a pollution level, draws a random sample
# and plots it with plotly.

# Total pollution per record = O3 + NO2 + PM10, with a missing reading treated
# as 0. `[[` extracts the data frame directly and rowSums() replaces the
# row-by-row apply() over the data frame.
air_quality <- files[[2]] %>%
  mutate(
    total_pollution = rowSums(
      cbind(O3.Value, NO2.Value, PM10.Value),
      na.rm = TRUE
    )
  )

# Cut points for the pollution levels: minimum, quartiles and 98th percentile.
quantile_total_pollution <- quantile(
  air_quality$total_pollution,
  probs = c(0, .25, .5, .75, .98),
  na.rm = TRUE
)

# Level of each record = index of the last cut point it reaches (1 to 5, since
# the first cut point is the minimum), stored as a factor.
level_pollution <- factor(
  findInterval(air_quality$total_pollution, quantile_total_pollution)
)

# Attach the level to the data; the plot below does not use it.
air_quality <- air_quality %>%
  mutate(level_pollution = level_pollution)

# 300 records drawn with replacement. No seed is set here, so the draw depends
# on the current state of the random number generator.
air_quality_sample <- air_quality %>% sample_n(300, replace = TRUE)

# Total pollution against Date.Time, with one panel and one smoother per
# station. `text` is not a ggplot2 aesthetic (hence the warning echoed below);
# it is supplied so that ggplotly() can show the air quality in the hover label.
air_quality_plot <- ggplot(
  air_quality_sample,
  aes(x = Date.Time, y = total_pollution)
) +
  geom_point(aes(text = paste("Air Quality:", Air.Quality)), size = .7) +
  geom_smooth(aes(color = Station, fill = Station)) +
  facet_wrap(~Station)

## Warning: Ignoring unknown aesthetics: text

ggplotly(air_quality_plot)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Barcelona_Analysis

Mansh Gyawali

March 8, 2019