library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.7
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts --------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readxl)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
pm25_2004_2007<- read_xlsx('1_PM25_2004-2007.xlsx', col_names = TRUE, skip = 1)
pm25_2008_2016<- read_xlsx('2_PM25_2008_2016.xlsx', col_names = TRUE, skip = 1)
pm25_ago2016_dic2016<- read_xls('3_PM2.5_Agost2016-Diciembre2016.xls', col_names = TRUE, col_types = c('date', 'numeric', 'numeric', 'numeric', 'numeric', 'numeric', 'numeric', 'numeric', 'numeric'))
pm25_ene2017_dic2017<- read_xls('4_PM2.5_Enero2017-Diciembre2017.xls', col_names = TRUE, col_types = c('date', 'numeric', 'numeric', 'numeric', 'numeric', 'numeric', 'numeric', 'numeric', 'numeric', 'numeric'))
pm25_ene2018_jun2018<- read_xls('5_PM2.5_Enero2018-Junio2018.xls', col_names = TRUE, col_types = c('date', 'numeric', 'numeric', 'numeric', 'numeric', 'numeric', 'numeric', 'numeric', 'numeric', 'numeric', 'numeric'))
The objective of our work is to merge each data file into a single one, as structure of such files have been changing during years. As it can be seen, data structure for temp file from 2004 to 2007 is very different from that of recent observations starting from August 2016.
head(pm25_2004_2007)
## # A tibble: 6 x 4
## ESTACION MAGNITUD FECHA DATO
## <chr> <chr> <chr> <dbl>
## 1 COTOCOLLAO PM2.5 2005-03-17 11:00:00.0 57.6
## 2 COTOCOLLAO PM2.5 2005-03-17 12:00:00.0 27.7
## 3 COTOCOLLAO PM2.5 2005-03-17 13:00:00.0 14.6
## 4 COTOCOLLAO PM2.5 2005-03-17 14:00:00.0 9.6
## 5 COTOCOLLAO PM2.5 2005-03-17 15:00:00.0 7.73
## 6 COTOCOLLAO PM2.5 2005-03-17 16:00:00.0 10.1
head(pm25_ago2016_dic2016)
## # A tibble: 6 x 9
## X__1 Belisario Carapungo Centro Cotocollao `El Camal`
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NA NA NA NA NA NA
## 2 NA NA NA NA NA NA
## 3 2016-08-16 09:00:00 NA NA NA 18.1 NA
## 4 2016-08-16 10:00:00 NA NA NA 24.8 NA
## 5 2016-08-16 11:00:00 NA NA NA 23.9 NA
## 6 2016-08-16 12:00:00 NA NA NA NA NA
## # ... with 3 more variables: Guamani <dbl>, LosChillos <dbl>,
## # Tumbaco <dbl>
While data in the pm25_2004_2007 appears to be in long format, the other is in wide format. As wide format is the latter it is assumed that this format will be preserved for later data files.
Firstly we are going to drop the MAGNITUD column and re-order the other columns in a more convenient way.
pm25_2004_2007 <- pm25_2004_2007 %>% select(FECHA, ESTACION, DATO)
head(pm25_2004_2007)
## # A tibble: 6 x 3
## FECHA ESTACION DATO
## <chr> <chr> <dbl>
## 1 2005-03-17 11:00:00.0 COTOCOLLAO 57.6
## 2 2005-03-17 12:00:00.0 COTOCOLLAO 27.7
## 3 2005-03-17 13:00:00.0 COTOCOLLAO 14.6
## 4 2005-03-17 14:00:00.0 COTOCOLLAO 9.6
## 5 2005-03-17 15:00:00.0 COTOCOLLAO 7.73
## 6 2005-03-17 16:00:00.0 COTOCOLLAO 10.1
Next thing, we spread the dataset using as key the ESTACION column and DATO as value.
pm25_2004_2007<- pm25_2004_2007 %>% spread(key = ESTACION, value = DATO)
head(pm25_2004_2007)
## # A tibble: 6 x 6
## FECHA BELISARIO CARAPUNGO CENTRO COTOCOLLAO EL_CAMAL
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2004-08-26 15:00:00.0 NA NA 16 NA NA
## 2 2004-08-26 16:00:00.0 NA NA 12 NA NA
## 3 2004-08-26 17:00:00.0 NA NA 13.7 NA 24.7
## 4 2004-08-26 18:00:00.0 NA NA 14.7 NA 19.9
## 5 2004-08-26 19:00:00.0 NA NA 33.7 NA 38.0
## 6 2004-08-26 20:00:00.0 NA NA 39.3 NA 43.8
We are almost done with the dataset from 2004 to 2007. Only thing we need is to parse the FECHA column into date-time format using functions from the lubridate package.
pm25_2004_2007$FECHA<- parse_date_time(pm25_2004_2007$FECHA, orders = 'ymd HMS')
head(pm25_2004_2007)
## # A tibble: 6 x 6
## FECHA BELISARIO CARAPUNGO CENTRO COTOCOLLAO EL_CAMAL
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2004-08-26 15:00:00 NA NA 16 NA NA
## 2 2004-08-26 16:00:00 NA NA 12 NA NA
## 3 2004-08-26 17:00:00 NA NA 13.7 NA 24.7
## 4 2004-08-26 18:00:00 NA NA 14.7 NA 19.9
## 5 2004-08-26 19:00:00 NA NA 33.7 NA 38.0
## 6 2004-08-26 20:00:00 NA NA 39.3 NA 43.8
## Warning: Removed 444 rows containing missing values (position_stack).