Kennedy-Canairios.R

library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(moderndive)
library(skimr)

# **ESTACION KENNEDY VS CANAIRIOS**
# **5 sensores diferentes: PMS7003 & PMSA003 & HPMA115S0 & SPS30 & SNGCJA5**
# **Comparaciones entre sensores de bajo costo y la estación oficial**

# Load the PM2.5 comparison table (official station plus the low-cost
# sensors) from the Excel workbook.
df <- read_excel("C:/Mediciones/KENNEDY_CANAIRIOS.xlsx")
# View() opens a spreadsheet-style viewer and is only meaningful in an
# interactive session, so skip it when the script runs in batch mode.
if (interactive()) {
  View(df)
}

# Show every column with its type and first few values.
df %>%
  glimpse()

## Rows: 803
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ Fecha     <chr> "10-11-2020 24:00", "11-11-2020 01:00", "11-11-2020 02:00...
## $ Oficial   <dbl> 22.0, 15.0, 19.0, 17.0, 11.0, 8.0, 7.0, 9.0, 20.0, 41.0, ...
## $ PMS7003   <dbl> 6.51, 8.59, 10.40, 7.11, 4.76, 7.36, 11.40, 15.40, 15.90,...
## $ PMSA003   <dbl> 6.91, 9.69, 11.90, 7.80, 4.55, 7.64, 11.90, 18.10, 18.50,...
## $ HPMA115S0 <dbl> 4.75, 6.00, 6.64, 4.91, 3.13, 5.24, 5.74, 8.41, 7.45, 6.9...
## $ SPS30     <dbl> 4.51, 5.69, 6.51, 4.71, 3.27, 4.75, 6.02, 8.81, 8.69, 8.0...
## $ SNGCJA5   <dbl> 3.31, 4.46, 5.20, 3.45, 2.24, 3.84, 4.91, 7.37, 7.11, 6.5...

# Print 10 randomly chosen rows as a quick sanity check of the data.
# slice_sample() is the current replacement for the superseded sample_n().
df %>%
  slice_sample(n = 10)

## # A tibble: 10 x 8
##      Num Fecha            Oficial PMS7003 PMSA003 HPMA115S0 SPS30 SNGCJA5
##    <dbl> <chr>              <dbl>   <dbl>   <dbl>     <dbl> <dbl>   <dbl>
##  1   483 01-12-2020 02:00      17   12      13.7       7.96  6.95    5.55
##  2   400 27-11-2020 15:00      20   17.8    22.5      10.7  10.8     9.15
##  3   808 14-12-2020 15:00       8   12.5    15         8.51  7.42    6.18
##  4   298 23-11-2020 09:00      47   15.7    18.6      10.8   9.37    8.13
##  5   591 05-12-2020 14:00      13    2.65    2.02      4.42  2.21    1.44
##  6    73 13-11-2020 24:00      32   27.8    33.5      17.3  14.9    13.3 
##  7    26 12-11-2020 01:00      38   26.5    32.1      13.4  13.8    12.2 
##  8   166 17-11-2020 21:00      17   13.4    15.6       8.94  7.62    6.42
##  9    72 13-11-2020 23:00      30   22.7    27.3      14.5  12.2    10.7 
## 10   528 02-12-2020 23:00      33   17.8    20.5      13.1   9.7     8.46

# Interactive overlay of every PM2.5 series against the row index (Num):
# the five low-cost sensors first, then the official station.
fig <- df %>%
  plot_ly(
    x = ~Num, y = ~PMS7003, name = 'PM2.5 PMS7003',
    type = 'scatter', mode = 'lines+markers'
  ) %>%
  add_trace(y = ~PMSA003, name = 'PM2.5 PMSA003', mode = 'lines+markers') %>%
  add_trace(y = ~HPMA115S0, name = 'PM2.5 HPMA115S0', mode = 'lines+markers') %>%
  add_trace(y = ~SPS30, name = 'PM2.5 SPS30', mode = 'lines+markers') %>%
  add_trace(y = ~SNGCJA5, name = 'PM2.5 SNGCJA5', mode = 'lines+markers') %>%
  add_trace(y = ~Oficial, name = 'PM2.5 Oficial', mode = 'lines+markers')
fig

#Caso 1: SNGCJA5 VS SPS30

# Descriptive statistics for the two columns compared in this case.
select(df, SNGCJA5, SPS30) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.70	54.8	▇▃▁▁▁
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁

# Pearson correlation coefficient between SNGCJA5 and SPS30.

get_correlation(df, formula = SNGCJA5 ~ SPS30)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.998

# Scatter plot of the paired readings with a fitted straight line on top.
df %>%
  ggplot(mapping = aes(x = SNGCJA5, y = SPS30)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and SPS30",
    x = "PM25 SNGCJA5",
    y = "PM25 SPS30"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of SPS30 on SNGCJA5.
score_model <- lm(formula = SPS30 ~ SNGCJA5, data = df)
# Coefficient table: estimates, standard errors and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.772     0.029      26.7       0    0.715    0.829
## 2 SNGCJA5      1.09      0.002     439.        0    1.08     1.09

# Observed values, fitted values and residuals for every row; the outer
# parentheses make the assignment print its result.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID SPS30 SNGCJA5 SPS30_hat residual
##    <int> <dbl>   <dbl>     <dbl>    <dbl>
##  1     1  4.51    3.31      4.38    0.132
##  2     2  5.69    4.46      5.63    0.06 
##  3     3  6.51    5.2       6.44    0.074
##  4     4  4.71    3.45      4.53    0.18 
##  5     5  3.27    2.24      3.21    0.058
##  6     6  4.75    3.84      4.96   -0.205
##  7     7  6.02    4.91      6.12   -0.101
##  8     8  8.81    7.37      8.8     0.01 
##  9     9  8.69    7.11      8.52    0.173
## 10    10  8.06    6.57      7.93    0.131
## # ... with 793 more rows

#Caso 2: SNGCJA5 VS HPMA115S0

# Descriptive statistics for the two columns compared in this case.
select(df, SNGCJA5, HPMA115S0) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.7	54.8	▇▃▁▁▁
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.70	15.4	64.6	▇▂▁▁▁

# Pearson correlation coefficient between SNGCJA5 and HPMA115S0.

get_correlation(df, formula = SNGCJA5 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.968

# Scatter plot of the paired readings with a fitted straight line on top.
df %>%
  ggplot(mapping = aes(x = SNGCJA5, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and HPMA115S0",
    x = "PM25 SNGCJA5",
    y = "PM25 HPMA115S0"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of HPMA115S0 on SNGCJA5.
score_model <- lm(formula = HPMA115S0 ~ SNGCJA5, data = df)
# Coefficient table: estimates, standard errors and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     1.63     0.119      13.7       0     1.39     1.86
## 2 SNGCJA5       1.12     0.01      109.        0     1.1      1.14

# Observed values, fitted values and residuals for every row; the outer
# parentheses make the assignment print its result.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID HPMA115S0 SNGCJA5 HPMA115S0_hat residual
##    <int>     <dbl>   <dbl>         <dbl>    <dbl>
##  1     1      4.75    3.31          5.34   -0.586
##  2     2      6       4.46          6.62   -0.624
##  3     3      6.64    5.2           7.45   -0.813
##  4     4      4.91    3.45          5.49   -0.582
##  5     5      3.13    2.24          4.14   -1.01 
##  6     6      5.24    3.84          5.93   -0.689
##  7     7      5.74    4.91          7.13   -1.39 
##  8     8      8.41    7.37          9.88   -1.48 
##  9     9      7.45    7.11          9.59   -2.14 
## 10    10      6.96    6.57          8.99   -2.03 
## # ... with 793 more rows

#Caso 3: SNGCJA5 VS PMSA003

# Descriptive statistics for the two columns compared in this case.
select(df, SNGCJA5, PMSA003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.7	54.8	▇▃▁▁▁
PMSA003	0	1	24.25	17.10	0.84	11.70	20.90	32.9	138.0	▇▃▁▁▁

# Pearson correlation coefficient between SNGCJA5 and PMSA003.

get_correlation(df, formula = SNGCJA5 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.997

# Scatter plot of the paired readings with a fitted straight line on top.
df %>%
  ggplot(mapping = aes(x = SNGCJA5, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and PMSA003",
    x = "PM25 SNGCJA5",
    y = "PM25 PMSA003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMSA003 on SNGCJA5.
score_model <- lm(formula = PMSA003 ~ SNGCJA5, data = df)
# Coefficient table: estimates, standard errors and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.997     0.089     -11.2       0    -1.17   -0.822
## 2 SNGCJA5      2.62      0.008     342.        0     2.60    2.63

# Observed values, fitted values and residuals for every row; the outer
# parentheses make the assignment print its result.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID PMSA003 SNGCJA5 PMSA003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    6.91    3.31        7.67   -0.756
##  2     2    9.69    4.46       10.7    -0.986
##  3     3   11.9     5.2        12.6    -0.713
##  4     4    7.8     3.45        8.03   -0.232
##  5     5    4.55    2.24        4.87   -0.316
##  6     6    7.64    3.84        9.05   -1.41 
##  7     7   11.9     4.91       11.9     0.046
##  8     8   18.1     7.37       18.3    -0.192
##  9     9   18.5     7.11       17.6     0.888
## 10    10   16.8     6.57       16.2     0.602
## # ... with 793 more rows

#Caso 4: SNGCJA5 VS PMS7003

# Descriptive statistics for the two columns compared in this case.
select(df, SNGCJA5, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.7	54.8	▇▃▁▁▁
PMS7003	0	1	20.51	14.00	1.00	10.50	17.70	27.2	116.0	▇▃▁▁▁

# Pearson correlation coefficient between SNGCJA5 and PMS7003.

get_correlation(df, formula = SNGCJA5 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

# Scatter plot of the paired readings with a fitted straight line on top.
df %>%
  ggplot(mapping = aes(x = SNGCJA5, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and PMS7003",
    x = "PM25 SNGCJA5",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMS7003 on SNGCJA5.
score_model <- lm(formula = PMS7003 ~ SNGCJA5, data = df)
# Coefficient table: estimates, standard errors and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.116     0.089     -1.30   0.193   -0.291    0.059
## 2 SNGCJA5      2.14      0.008    279.     0        2.12     2.15

# Observed values, fitted values and residuals for every row; the outer
# parentheses make the assignment print its result.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID PMS7003 SNGCJA5 PMS7003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    6.51    3.31        6.96   -0.453
##  2     2    8.59    4.46        9.42   -0.832
##  3     3   10.4     5.2        11.0    -0.605
##  4     4    7.11    3.45        7.26   -0.152
##  5     5    4.76    2.24        4.68    0.085
##  6     6    7.36    3.84        8.10   -0.737
##  7     7   11.4     4.91       10.4     1.01 
##  8     8   15.4     7.37       15.6    -0.246
##  9     9   15.9     7.11       15.1     0.81 
## 10    10   14.2     6.57       13.9     0.265
## # ... with 793 more rows

#Caso 5: SNGCJA5 VS Oficial

# Descriptive statistics for the two columns compared in this case.
select(df, SNGCJA5, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.7	54.8	▇▃▁▁▁
Oficial	0	1	25.14	14.37	0.00	16.00	22.00	32.0	124.0	▇▅▁▁▁

# Pearson correlation coefficient between SNGCJA5 and Oficial.

get_correlation(df, formula = SNGCJA5 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.519

# Scatter plot of the paired readings with a fitted straight line on top.
df %>%
  ggplot(mapping = aes(x = SNGCJA5, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and Oficial",
    x = "PM25 SNGCJA5",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on SNGCJA5.
score_model <- lm(formula = Oficial ~ SNGCJA5, data = df)
# Coefficient table: estimates, standard errors and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    14.1      0.775      18.2       0    12.6     15.6 
## 2 SNGCJA5       1.15     0.067      17.2       0     1.01     1.28

# Observed values, fitted values and residuals for every row; the outer
# parentheses make the assignment print its result.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial SNGCJA5 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      22    3.31        17.9     4.11
##  2     2      15    4.46        19.2    -4.20
##  3     3      19    5.2         20.1    -1.05
##  4     4      17    3.45        18.0    -1.05
##  5     5      11    2.24        16.7    -5.66
##  6     6       8    3.84        18.5   -10.5 
##  7     7       7    4.91        19.7   -12.7 
##  8     8       9    7.37        22.5   -13.5 
##  9     9      20    7.11        22.2    -2.24
## 10    10      41    6.57        21.6    19.4 
## # ... with 793 more rows

#Caso 6: SPS30 VS HPMA115S0

# Descriptive statistics for the two columns compared in this case.
select(df, SPS30, HPMA115S0) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.70	15.40	64.6	▇▂▁▁▁

# Pearson correlation coefficient between SPS30 and HPMA115S0.

get_correlation(df, formula = SPS30 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.957

# Scatter plot of the paired readings with a fitted straight line on top.
df %>%
  ggplot(mapping = aes(x = SPS30, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and HPMA115S0",
    x = "PM25 SPS30",
    y = "PM25 HPMA115S0"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of HPMA115S0 on SPS30.
score_model <- lm(formula = HPMA115S0 ~ SPS30, data = df)
# Coefficient table: estimates, standard errors and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.992     0.145      6.84       0    0.707     1.28
## 2 SPS30        1.01      0.011     93.2        0    0.993     1.04

# Observed values, fitted values and residuals for every row; the outer
# parentheses make the assignment print its result.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID HPMA115S0 SPS30 HPMA115S0_hat residual
##    <int>     <dbl> <dbl>         <dbl>    <dbl>
##  1     1      4.75  4.51          5.57   -0.817
##  2     2      6     5.69          6.76   -0.764
##  3     3      6.64  6.51          7.60   -0.956
##  4     4      4.91  4.71          5.77   -0.86 
##  5     5      3.13  3.27          4.31   -1.18 
##  6     6      5.24  4.75          5.81   -0.571
##  7     7      5.74  6.02          7.10   -1.36 
##  8     8      8.41  8.81          9.93   -1.52 
##  9     9      7.45  8.69          9.81   -2.36 
## 10    10      6.96  8.06          9.17   -2.21 
## # ... with 793 more rows

#Caso 7: SPS30 VS PMSA003

# Descriptive statistics for the two columns compared in this case.
select(df, SPS30, PMSA003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁
PMSA003	0	1	24.25	17.10	0.84	11.70	20.90	32.90	138.0	▇▃▁▁▁

# Pearson correlation coefficient between SPS30 and PMSA003.

get_correlation(df, formula = SPS30 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.996

# Scatter plot of the paired readings with a fitted straight line on top.
df %>%
  ggplot(mapping = aes(x = SPS30, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and PMSA003",
    x = "PM25 SPS30",
    y = "PM25 PMSA003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMSA003 on SPS30.
score_model <- lm(formula = PMSA003 ~ SPS30, data = df)
# Coefficient table: estimates, standard errors and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.79     0.097     -28.8       0    -2.98    -2.60
## 2 SPS30         2.40     0.007     330.        0     2.38     2.41

# Observed values, fitted values and residuals for every row; the outer
# parentheses make the assignment print its result.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID PMSA003 SPS30 PMSA003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    6.91  4.51        8.02   -1.11 
##  2     2    9.69  5.69       10.8    -1.16 
##  3     3   11.9   6.51       12.8    -0.916
##  4     4    7.8   4.71        8.50   -0.701
##  5     5    4.55  3.27        5.05   -0.499
##  6     6    7.64  4.75        8.60   -0.957
##  7     7   11.9   6.02       11.6     0.259
##  8     8   18.1   8.81       18.3    -0.229
##  9     9   18.5   8.69       18.0     0.458
## 10    10   16.8   8.06       16.5     0.269
## # ... with 793 more rows

#Caso 8: SPS30 VS PMS7003

# Descriptive statistics for the two columns compared in this case.
select(df, SPS30, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁
PMS7003	0	1	20.51	14.00	1.00	10.50	17.70	27.20	116.0	▇▃▁▁▁

# Pearson correlation coefficient between SPS30 and PMS7003.

get_correlation(df, formula = SPS30 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.992

# Scatter plot of the paired readings with a fitted straight line on top.
df %>%
  ggplot(mapping = aes(x = SPS30, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and PMS7003",
    x = "PM25 SPS30",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMS7003 on SPS30.
score_model <- lm(formula = PMS7003 ~ SPS30, data = df)
# Coefficient table: estimates, standard errors and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.53     0.114     -13.4       0    -1.75    -1.30
## 2 SPS30         1.95     0.009     228.        0     1.94     1.97

# Observed values, fitted values and residuals for every row; the outer
# parentheses make the assignment print its result.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID PMS7003 SPS30 PMS7003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    6.51  4.51        7.28   -0.774
##  2     2    8.59  5.69        9.59   -1    
##  3     3   10.4   6.51       11.2    -0.793
##  4     4    7.11  4.71        7.68   -0.565
##  5     5    4.76  3.27        4.86   -0.101
##  6     6    7.36  4.75        7.75   -0.393
##  7     7   11.4   6.02       10.2     1.16 
##  8     8   15.4   8.81       15.7    -0.287
##  9     9   15.9   8.69       15.5     0.447
## 10    10   14.2   8.06       14.2    -0.022
## # ... with 793 more rows

#Caso 9: SPS30 VS Oficial

# Descriptive statistics for the two columns compared in this case.
select(df, SPS30, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁
Oficial	0	1	25.14	14.37	0.00	16.00	22.00	32.00	124.0	▇▅▁▁▁

# Pearson correlation coefficient between SPS30 and Oficial.

get_correlation(df, formula = SPS30 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.509

# Scatter plot of the paired readings with a fitted straight line on top.
df %>%
  ggplot(mapping = aes(x = SPS30, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and Oficial",
    x = "PM25 SPS30",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on SPS30.
score_model <- lm(formula = Oficial ~ SPS30, data = df)
# Coefficient table: estimates, standard errors and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    13.5      0.819      16.5       0   11.9      15.2 
## 2 SPS30         1.03     0.061      16.7       0    0.908     1.15

# Observed values, fitted values and residuals for every row; the outer
# parentheses make the assignment print its result.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial SPS30 Oficial_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1      22  4.51        18.2     3.82
##  2     2      15  5.69        19.4    -4.40
##  3     3      19  6.51        20.2    -1.24
##  4     4      17  4.71        18.4    -1.39
##  5     5      11  3.27        16.9    -5.91
##  6     6       8  4.75        18.4   -10.4 
##  7     7       7  6.02        19.7   -12.7 
##  8     8       9  8.81        22.6   -13.6 
##  9     9      20  8.69        22.5    -2.48
## 10    10      41  8.06        21.8    19.2 
## # ... with 793 more rows

#Caso 10: HPMA115S0 VS PMSA003

# Descriptive statistics for the two columns compared in this case.
select(df, HPMA115S0, PMSA003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.7	15.4	64.6	▇▂▁▁▁
PMSA003	0	1	24.25	17.10	0.84	11.70	20.9	32.9	138.0	▇▃▁▁▁

# Pearson correlation coefficient between HPMA115S0 and PMSA003.

get_correlation(df, formula = HPMA115S0 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.957

# Scatter plot of the paired readings with a fitted straight line on top.
df %>%
  ggplot(mapping = aes(x = HPMA115S0, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between HPMA115S0 and PMSA003",
    x = "PM25 HPMA115S0",
    y = "PM25 PMSA003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMSA003 on HPMA115S0.
score_model <- lm(formula = PMSA003 ~ HPMA115S0, data = df)
# Coefficient table: estimates, standard errors and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.76     0.337      -8.2       0    -3.43    -2.10
## 2 HPMA115S0     2.17     0.023      93.7       0     2.13     2.22

# Observed values, fitted values and residuals for every row; the outer
# parentheses make the assignment print its result.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID PMSA003 HPMA115S0 PMSA003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    6.91      4.75        7.56   -0.645
##  2     2    9.69      6          10.3    -0.580
##  3     3   11.9       6.64       11.7     0.239
##  4     4    7.8       4.91        7.90   -0.102
##  5     5    4.55      3.13        4.04    0.514
##  6     6    7.64      5.24        8.62   -0.979
##  7     7   11.9       5.74        9.70    2.19 
##  8     8   18.1       8.41       15.5     2.59 
##  9     9   18.5       7.45       13.4     5.08 
## 10    10   16.8       6.96       12.4     4.44 
## # ... with 793 more rows

#Caso 11: HPMA115S0 VS PMS7003

# Descriptive statistics for the two columns compared in this case.
select(df, HPMA115S0, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.7	15.4	64.6	▇▂▁▁▁
PMS7003	0	1	20.51	14.00	1.00	10.50	17.7	27.2	116.0	▇▃▁▁▁

# Pearson correlation coefficient between HPMA115S0 and PMS7003.

get_correlation(df, formula = HPMA115S0 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.963

# Scatter plot of the paired readings with a fitted straight line on top.
df %>%
  ggplot(mapping = aes(x = HPMA115S0, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between HPMA115S0 and PMS7003",
    x = "PM25 HPMA115S0",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMS7003 on HPMA115S0.
score_model <- lm(formula = PMS7003 ~ HPMA115S0, data = df)
# Coefficient table: estimates, standard errors and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.72     0.258     -6.68       0    -2.23    -1.22
## 2 HPMA115S0     1.79     0.018    101.         0     1.75     1.82

# Observed values, fitted values and residuals for every row; the outer
# parentheses make the assignment print its result.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID PMS7003 HPMA115S0 PMS7003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    6.51      4.75        6.77   -0.261
##  2     2    8.59      6           9.01   -0.416
##  3     3   10.4       6.64       10.2     0.249
##  4     4    7.11      4.91        7.06    0.053
##  5     5    4.76      3.13        3.87    0.886
##  6     6    7.36      5.24        7.65   -0.287
##  7     7   11.4       5.74        8.54    2.86 
##  8     8   15.4       8.41       13.3     2.08 
##  9     9   15.9       7.45       11.6     4.30 
## 10    10   14.2       6.96       10.7     3.48 
## # ... with 793 more rows

#Caso 12: HPMA115S0 VS Oficial

# Descriptive statistics for the two columns compared in this case.
select(df, HPMA115S0, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.7	15.4	64.6	▇▂▁▁▁
Oficial	0	1	25.14	14.37	0.00	16.00	22.0	32.0	124.0	▇▅▁▁▁

# Pearson correlation coefficient between HPMA115S0 and Oficial.

get_correlation(df, formula = HPMA115S0 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.514

# Scatter plot of the paired readings with a fitted straight line on top.
df %>%
  ggplot(mapping = aes(x = HPMA115S0, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between HPMA115S0 and Oficial",
    x = "PM25 HPMA115S0",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on HPMA115S0.
score_model <- lm(formula = Oficial ~ HPMA115S0, data = df)
# Coefficient table: estimates, standard errors and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    13.0      0.84       15.4       0   11.3      14.6 
## 2 HPMA115S0     0.98     0.058      17.0       0    0.867     1.09

# Observed values, fitted values and residuals for every row; the outer
# parentheses make the assignment print its result.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial HPMA115S0 Oficial_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1      22      4.75        17.6    4.39 
##  2     2      15      6           18.8   -3.84 
##  3     3      19      6.64        19.5   -0.463
##  4     4      17      4.91        17.8   -0.767
##  5     5      11      3.13        16.0   -5.02 
##  6     6       8      5.24        18.1  -10.1  
##  7     7       7      5.74        18.6  -11.6  
##  8     8       9      8.41        21.2  -12.2  
##  9     9      20      7.45        20.3   -0.257
## 10    10      41      6.96        19.8   21.2  
## # ... with 793 more rows

#Caso 13: PMSA003 VS PMS7003

# Descriptive statistics for the two columns compared in this case.
select(df, PMSA003, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	24.25	17.1	0.84	11.7	20.9	32.9	138	▇▃▁▁▁
PMS7003	0	1	20.51	14.0	1.00	10.5	17.7	27.2	116	▇▃▁▁▁

# Pearson correlation coefficient between PMSA003 and PMS7003.

get_correlation(df, formula = PMSA003 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.998

# Scatter plot of the paired readings with a fitted straight line on top.
df %>%
  ggplot(mapping = aes(x = PMSA003, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between PMSA003 and PMS7003",
    x = "PM25 PMSA003",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMS7003 on PMSA003.
score_model <- lm(formula = PMS7003 ~ PMSA003, data = df)
# Coefficient table: estimates, standard errors and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.705     0.054      13.0       0    0.599    0.811
## 2 PMSA003      0.817     0.002     448.        0    0.813    0.82

# Observed values, fitted values and residuals for every row; the outer
# parentheses make the assignment print its result.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID PMS7003 PMSA003 PMS7003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    6.51    6.91        6.35    0.16 
##  2     2    8.59    9.69        8.62   -0.031
##  3     3   10.4    11.9        10.4    -0.026
##  4     4    7.11    7.8         7.08    0.033
##  5     5    4.76    4.55        4.42    0.338
##  6     6    7.36    7.64        6.95    0.414
##  7     7   11.4    11.9        10.4     0.974
##  8     8   15.4    18.1        15.5    -0.091
##  9     9   15.9    18.5        15.8     0.083
## 10    10   14.2    16.8        14.4    -0.229
## # ... with 793 more rows

#Caso 14: PMSA003 VS Oficial

# Descriptive statistics for the two columns compared in this case.
select(df, PMSA003, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	24.25	17.10	0.84	11.7	20.9	32.9	138	▇▃▁▁▁
Oficial	0	1	25.14	14.37	0.00	16.0	22.0	32.0	124	▇▅▁▁▁

# Pearson correlation coefficient between PMSA003 and Oficial.

get_correlation(df, formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.520

# Scatter plot of the paired readings with a fitted straight line on top.
df %>%
  ggplot(mapping = aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between PMSA003 and Oficial",
    x = "PM25 PMSA003",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on PMSA003.
score_model <- lm(formula = Oficial ~ PMSA003, data = df)
# Coefficient table: estimates, standard errors and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   14.6       0.752      19.4       0   13.1     16.0  
## 2 PMSA003      0.436     0.025      17.2       0    0.387    0.486

# Observed values, fitted values and residuals for every row; the outer
# parentheses make the assignment print its result.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      22    6.91        17.6    4.42 
##  2     2      15    9.69        18.8   -3.79 
##  3     3      19   11.9         19.8   -0.754
##  4     4      17    7.8         18.0   -0.964
##  5     5      11    4.55        16.5   -5.55 
##  6     6       8    7.64        17.9   -9.90 
##  7     7       7   11.9         19.8  -12.8  
##  8     8       9   18.1         22.5  -13.5  
##  9     9      20   18.5         22.6   -2.64 
## 10    10      41   16.8         21.9   19.1  
## # ... with 793 more rows

#Caso 13: PMS7003 VS Oficial

# Summary statistics for the PMS7003 and Oficial columns.
df %>%
  select(PMS7003, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMS7003	0	1	20.51	14.00	1	10.5	17.7	27.2	116	▇▃▁▁▁
Oficial	0	1	25.14	14.37	0	16.0	22.0	32.0	124	▇▅▁▁▁

#**Pearson correlation coefficient original**

# Correlation between the PMS7003 and Oficial columns.
get_correlation(df, formula = PMS7003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.525

# Scatter plot of Oficial against PMS7003 with a straight-line (lm) fit
# drawn on top of the points, without a confidence band.
ggplot(data = df, mapping = aes(x = PMS7003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "PM25 PMS7003",
    y = "PM25 Oficial",
    title = "Relationship between PMS7003 and Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with Oficial as response and PMS7003 as predictor.
score_model <- lm(formula = Oficial ~ PMS7003, data = df)
# Coefficient table; digits = 11 asks for more decimals than the other
# cases, which call get_regression_table() without it.
get_regression_table(score_model, digits = 11)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   14.1      0.766       18.4       0   12.6     15.6  
## 2 PMS7003      0.539    0.0309      17.4       0    0.478    0.599

# Per-row observed value, fitted value (Oficial_hat) and residual;
# the outer parentheses print the tibble after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial PMS7003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      22    6.51        17.6    4.40 
##  2     2      15    8.59        18.7   -3.72 
##  3     3      19   10.4         19.7   -0.697
##  4     4      17    7.11        17.9   -0.925
##  5     5      11    4.76        16.7   -5.66 
##  6     6       8    7.36        18.1  -10.1  
##  7     7       7   11.4         20.2  -13.2  
##  8     8       9   15.4         22.4  -13.4  
##  9     9      20   15.9         22.7   -2.66 
## 10    10      41   14.2         21.7   19.3  
## # ... with 793 more rows

# **ESTACION KENNEDY -1hora VS CANAIRIOS**
# **Prueba con los valores de la estación Kennedy retrasada 1 hora**
# **5 sensores diferentes: PMS7003 & PMSA003 & HPMA115S0 & SPS30 & SNGCJA5**
# **Comparaciones con la estación oficial**


# Load the "_1h" workbook (Kennedy station series shifted by one hour, per
# the section header). This replaces the previously loaded `df`.
df <- read_excel("C:/Mediciones/KENNEDY_CANAIRIOS_1h.xlsx")
# View() opens a GUI data viewer, so only call it in an interactive session;
# batch runs (Rscript, knitting) should not stall or fail on it.
if (interactive()) {
  View(df)
}

# Column names, types and first values of the loaded data.
glimpse(df)

## Rows: 803
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ Fecha     <chr> "10-11-2020 24:00", "11-11-2020 01:00", "11-11-2020 02:00...
## $ Oficial   <dbl> 15.0, 19.0, 17.0, 11.0, 8.0, 7.0, 9.0, 20.0, 41.0, 24.0, ...
## $ PMS7003   <dbl> 6.51, 8.59, 10.40, 7.11, 4.76, 7.36, 11.40, 15.40, 15.90,...
## $ PMSA003   <dbl> 6.91, 9.69, 11.90, 7.80, 4.55, 7.64, 11.90, 18.10, 18.50,...
## $ HPMA115S0 <dbl> 4.75, 6.00, 6.64, 4.91, 3.13, 5.24, 5.74, 8.41, 7.45, 6.9...
## $ SPS30     <dbl> 4.51, 5.69, 6.51, 4.71, 3.27, 4.75, 6.02, 8.81, 8.69, 8.0...
## $ SNGCJA5   <dbl> 3.31, 4.46, 5.20, 3.45, 2.24, 3.84, 4.91, 7.37, 7.11, 6.5...

# Print 10 randomly drawn rows as a quick look at the loaded data.
sample_n(df, size = 10)

## # A tibble: 10 x 8
##      Num Fecha            Oficial PMS7003 PMSA003 HPMA115S0 SPS30 SNGCJA5
##    <dbl> <chr>              <dbl>   <dbl>   <dbl>     <dbl> <dbl>   <dbl>
##  1   162 17-11-2020 17:00      10    5.53    5.55      5.4   3.72    2.81
##  2   663 08-12-2020 14:00      19   32.8    40.8      16.5  18.1    15.6 
##  3   137 16-11-2020 16:00      18   20.4    26.2      10.1  12.3     9.95
##  4   780 13-12-2020 11:00      12    2.21    1.79      3.04  2.07    1.11
##  5   248 21-11-2020 07:00      63   29.3    31.9      16.3  12.6    12.1 
##  6   552 03-12-2020 23:00      19   34.6    41.8      16.8  18.5    15.6 
##  7    63 13-11-2020 14:00      18   12.2    14.6       7.83  7.54    6.06
##  8    19 11-11-2020 18:00       8   24.8    28.5      12.9  11.9    10.6 
##  9   207 19-11-2020 14:00      21   15.7    18.8       9.45  9.2     7.57
## 10   306 23-11-2020 17:00      13   30.1    36.6      15.3  15.7    13.3

# Interactive overlay of the five sensor series and the Oficial series,
# plotted against Num. Traces are added in the same order as before:
# PMS7003, PMSA003, HPMA115S0, SPS30, SNGCJA5, Oficial.
fig <- plot_ly(
  df,
  x = ~Num,
  y = ~PMS7003,
  name = "PM2.5 PMS7003",
  type = "scatter",
  mode = "lines+markers"
) %>%
  add_trace(y = ~PMSA003, name = "PM2.5 PMSA003", mode = "lines+markers") %>%
  add_trace(y = ~HPMA115S0, name = "PM2.5 HPMA115S0", mode = "lines+markers") %>%
  add_trace(y = ~SPS30, name = "PM2.5 SPS30", mode = "lines+markers") %>%
  add_trace(y = ~SNGCJA5, name = "PM2.5 SNGCJA5", mode = "lines+markers") %>%
  add_trace(y = ~Oficial, name = "PM2.5 Oficial", mode = "lines+markers")
fig

#Caso 14: SNGCJA5 VS Oficial-1h

# Summary statistics for the SNGCJA5 and Oficial columns.
df %>%
  select(SNGCJA5, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.7	54.8	▇▃▁▁▁
Oficial	0	1	25.16	14.37	0.00	16.00	22.00	32.0	124.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

# Correlation between the SNGCJA5 and Oficial columns.
get_correlation(df, formula = SNGCJA5 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.646

# Scatter plot of Oficial against SNGCJA5 with a straight-line (lm) fit
# drawn on top of the points, without a confidence band.
ggplot(data = df, mapping = aes(x = SNGCJA5, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "PM25 SNGCJA5",
    y = "PM25 Oficial",
    title = "Relationship between SNGCJA5 and Oficial-1hour"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with Oficial as response and SNGCJA5 as predictor.
score_model <- lm(formula = Oficial ~ SNGCJA5, data = df)
# Coefficient table: estimate, standard error, statistic, p-value and CI.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    11.4      0.692      16.5       0    10.0     12.8 
## 2 SNGCJA5       1.43     0.059      24.0       0     1.31     1.54

# Per-row observed value, fitted value (Oficial_hat) and residual;
# the outer parentheses print the tibble after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial SNGCJA5 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      15    3.31        16.1    -1.12
##  2     2      19    4.46        17.8     1.24
##  3     3      17    5.2         18.8    -1.82
##  4     4      11    3.45        16.3    -5.32
##  5     5       8    2.24        14.6    -6.60
##  6     6       7    3.84        16.9    -9.88
##  7     7       9    4.91        18.4    -9.40
##  8     8      20    7.37        21.9    -1.91
##  9     9      41    7.11        21.5    19.5 
## 10    10      24    6.57        20.8     3.23
## # ... with 793 more rows

#Caso 15: SPS30 VS Oficial-1h

# Summary statistics for the SPS30 and Oficial columns.
df %>%
  select(SPS30, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁
Oficial	0	1	25.16	14.37	0.00	16.00	22.00	32.00	124.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

# Correlation between the SPS30 and Oficial columns.
get_correlation(df, formula = SPS30 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.629

# Scatter plot of Oficial against SPS30 with a straight-line (lm) fit
# drawn on top of the points, without a confidence band.
ggplot(data = df, mapping = aes(x = SPS30, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "PM25 SPS30",
    y = "PM25 Oficial",
    title = "Relationship between SPS30 and Oficial-1hour"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with Oficial as response and SPS30 as predictor.
score_model <- lm(formula = Oficial ~ SPS30, data = df)
# Coefficient table: estimate, standard error, statistic, p-value and CI.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    10.8      0.74       14.6       0     9.36    12.3 
## 2 SPS30         1.27     0.056      22.9       0     1.16     1.38

# Per-row observed value, fitted value (Oficial_hat) and residual;
# the outer parentheses print the tibble after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial SPS30 Oficial_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1      15  4.51        16.5   -1.55 
##  2     2      19  5.69        18.0    0.953
##  3     3      17  6.51        19.1   -2.09 
##  4     4      11  4.71        16.8   -5.80 
##  5     5       8  3.27        15.0   -6.97 
##  6     6       7  4.75        16.9   -9.85 
##  7     7       9  6.02        18.5   -9.47 
##  8     8      20  8.81        22.0   -2.02 
##  9     9      41  8.69        21.9   19.1  
## 10    10      24  8.06        21.1    2.94 
## # ... with 793 more rows

#Caso 16: HPMA115S0 VS Oficial-1h

# Summary statistics for the HPMA115S0 and Oficial columns.
df %>%
  select(HPMA115S0, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.7	15.4	64.6	▇▂▁▁▁
Oficial	0	1	25.16	14.37	0.00	16.00	22.0	32.0	124.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

# Correlation between the HPMA115S0 and Oficial columns.
get_correlation(df, formula = HPMA115S0 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.654

# Scatter plot of Oficial against HPMA115S0 with a straight-line (lm) fit
# drawn on top of the points, without a confidence band.
ggplot(data = df, mapping = aes(x = HPMA115S0, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "PM25 HPMA115S0",
    y = "PM25 Oficial",
    title = "Relationship between HPMA115S0 and Oficial-1hour"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with Oficial as response and HPMA115S0 as predictor.
score_model <- lm(formula = Oficial ~ HPMA115S0, data = df)
# Coefficient table: estimate, standard error, statistic, p-value and CI.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     9.65     0.741      13.0       0     8.20    11.1 
## 2 HPMA115S0     1.25     0.051      24.5       0     1.15     1.35

# Per-row observed value, fitted value (Oficial_hat) and residual;
# the outer parentheses print the tibble after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial HPMA115S0 Oficial_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1      15      4.75        15.6   -0.575
##  2     2      19      6           17.1    1.87 
##  3     3      17      6.64        17.9   -0.931
##  4     4      11      4.91        15.8   -4.77 
##  5     5       8      3.13        13.6   -5.56 
##  6     6       7      5.24        16.2   -9.19 
##  7     7       9      5.74        16.8   -7.81 
##  8     8      20      8.41        20.1   -0.139
##  9     9      41      7.45        18.9   22.1  
## 10    10      24      6.96        18.3    5.67 
## # ... with 793 more rows

#Caso 17: PMSA003 VS Oficial-1h

# Summary statistics for the PMSA003 and Oficial columns.
df %>%
  select(PMSA003, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	24.25	17.10	0.84	11.7	20.9	32.9	138	▇▃▁▁▁
Oficial	0	1	25.16	14.37	0.00	16.0	22.0	32.0	124	▇▅▁▁▁

#**Pearson correlation coefficient original**

# Correlation between the PMSA003 and Oficial columns.
get_correlation(df, formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.647

# Scatter plot of Oficial against PMSA003 with a straight-line (lm) fit
# drawn on top of the points, without a confidence band.
ggplot(data = df, mapping = aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "PM25 PMSA003",
    y = "PM25 Oficial",
    title = "Relationship between PMSA003 and Oficial-1hour"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with Oficial as response and PMSA003 as predictor.
score_model <- lm(formula = Oficial ~ PMSA003, data = df)
# Coefficient table: estimate, standard error, statistic, p-value and CI.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   12.0       0.671      17.8       0     10.6   13.3  
## 2 PMSA003      0.544     0.023      24.0       0      0.5    0.588

# Per-row observed value, fitted value (Oficial_hat) and residual;
# the outer parentheses print the tibble after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      15    6.91        15.7   -0.725
##  2     2      19    9.69        17.2    1.76 
##  3     3      17   11.9         18.4   -1.44 
##  4     4      11    7.8         16.2   -5.21 
##  5     5       8    4.55        14.4   -6.44 
##  6     6       7    7.64        16.1   -9.12 
##  7     7       9   11.9         18.4   -9.44 
##  8     8      20   18.1         21.8   -1.81 
##  9     9      41   18.5         22.0   19.0  
## 10    10      24   16.8         21.1    2.90 
## # ... with 793 more rows

#Caso 18: PMS7003 VS Oficial-1h

# Summary statistics for the PMS7003 and Oficial columns.
df %>%
  select(PMS7003, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMS7003	0	1	20.51	14.00	1	10.5	17.7	27.2	116	▇▃▁▁▁
Oficial	0	1	25.16	14.37	0	16.0	22.0	32.0	124	▇▅▁▁▁

#**Pearson correlation coefficient original**

# Correlation between the PMS7003 and Oficial columns.
get_correlation(df, formula = PMS7003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.657

# Scatter plot of Oficial against PMS7003 with a straight-line (lm) fit
# drawn on top of the points, without a confidence band.
ggplot(data = df, mapping = aes(x = PMS7003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "PM25 PMS7003",
    y = "PM25 Oficial",
    title = "Relationship between PMS7003 and Oficial-1hour"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with Oficial as response and PMS7003 as predictor.
score_model <- lm(formula = Oficial ~ PMS7003, data = df)
# Coefficient table; digits = 11 asks for more decimals than the other
# cases, which call get_regression_table() without it.
get_regression_table(score_model, digits = 11)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   11.3      0.679       16.7       0    9.99    12.7  
## 2 PMS7003      0.674    0.0273      24.7       0    0.621    0.728

# Per-row observed value, fitted value (Oficial_hat) and residual;
# the outer parentheses print the tibble after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial PMS7003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      15    6.51        15.7   -0.712
##  2     2      19    8.59        17.1    1.88 
##  3     3      17   10.4         18.3   -1.34 
##  4     4      11    7.11        16.1   -5.12 
##  5     5       8    4.76        14.5   -6.53 
##  6     6       7    7.36        16.3   -9.28 
##  7     7       9   11.4         19.0  -10.0  
##  8     8      20   15.4         21.7   -1.71 
##  9     9      41   15.9         22.0   19.0  
## 10    10      24   14.2         20.9    3.10 
## # ... with 793 more rows

# **ESTACION KENNEDY -2horas VS CANAIRIOS**
# **Prueba con los valores de la estación Kennedy retrasada 2 horas**
# **5 sensores diferentes: PMS7003 & PMSA003 & HPMA115S0 & SPS30 & SNGCJA5**
# **Comparaciones con la estación oficial**


# Load the "_2h" workbook (Kennedy station series shifted by two hours, per
# the section header). This replaces the previously loaded `df`.
df <- read_excel("C:/Mediciones/KENNEDY_CANAIRIOS_2h.xlsx")
# View() opens a GUI data viewer, so only call it in an interactive session;
# batch runs (Rscript, knitting) should not stall or fail on it.
if (interactive()) {
  View(df)
}

# Column names, types and first values of the loaded data.
glimpse(df)

## Rows: 803
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ Fecha     <chr> "10-11-2020 24:00", "11-11-2020 01:00", "11-11-2020 02:00...
## $ Oficial   <dbl> 19.0, 17.0, 11.0, 8.0, 7.0, 9.0, 20.0, 41.0, 24.0, 23.0, ...
## $ PMS7003   <dbl> 6.51, 8.59, 10.40, 7.11, 4.76, 7.36, 11.40, 15.40, 15.90,...
## $ PMSA003   <dbl> 6.91, 9.69, 11.90, 7.80, 4.55, 7.64, 11.90, 18.10, 18.50,...
## $ HPMA115S0 <dbl> 4.75, 6.00, 6.64, 4.91, 3.13, 5.24, 5.74, 8.41, 7.45, 6.9...
## $ SPS30     <dbl> 4.51, 5.69, 6.51, 4.71, 3.27, 4.75, 6.02, 8.81, 8.69, 8.0...
## $ SNGCJA5   <dbl> 3.31, 4.46, 5.20, 3.45, 2.24, 3.84, 4.91, 7.37, 7.11, 6.5...

# Print 10 randomly drawn rows as a quick look at the loaded data.
sample_n(df, size = 10)

## # A tibble: 10 x 8
##      Num Fecha            Oficial PMS7003 PMSA003 HPMA115S0 SPS30 SNGCJA5
##    <dbl> <chr>              <dbl>   <dbl>   <dbl>     <dbl> <dbl>   <dbl>
##  1   325 24-11-2020 12:00      21   10.3    11.9       7.88  6.47    5.05
##  2   301 23-11-2020 12:00      43   47.1    59        24.3  26.9    22.9 
##  3   338 25-11-2020 01:00      25   17.1    19.8       9.98  9.3     7.66
##  4   507 02-12-2020 02:00      13    4.36    4.13      4.34  3.05    2.13
##  5   518 02-12-2020 13:00      10    1.97    1.47      3.14  1.76    0.93
##  6   273 22-11-2020 08:00      36   29.2    35.3      16.3  15.8    13.6 
##  7   576 04-12-2020 23:00      24   24.4    28.7      16.3  12.6    11.2 
##  8    66 13-11-2020 17:00      20    8.34    7.84      4.86  4.39    3.39
##  9   304 23-11-2020 15:00      14   27.4    33.7      14.1  14.8    12.4 
## 10   390 27-11-2020 05:00      27   16.3    19.8       9.78  9.71    7.66

# Interactive overlay of the five sensor series and the Oficial series,
# plotted against Num. Traces are added in the same order as before:
# PMS7003, PMSA003, HPMA115S0, SPS30, SNGCJA5, Oficial.
fig <- plot_ly(
  df,
  x = ~Num,
  y = ~PMS7003,
  name = "PM2.5 PMS7003",
  type = "scatter",
  mode = "lines+markers"
) %>%
  add_trace(y = ~PMSA003, name = "PM2.5 PMSA003", mode = "lines+markers") %>%
  add_trace(y = ~HPMA115S0, name = "PM2.5 HPMA115S0", mode = "lines+markers") %>%
  add_trace(y = ~SPS30, name = "PM2.5 SPS30", mode = "lines+markers") %>%
  add_trace(y = ~SNGCJA5, name = "PM2.5 SNGCJA5", mode = "lines+markers") %>%
  add_trace(y = ~Oficial, name = "PM2.5 Oficial", mode = "lines+markers")
fig

#Caso 19: SNGCJA5 VS Oficial-2h

# Summary statistics for the SNGCJA5 and Oficial columns.
df %>%
  select(SNGCJA5, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.7	54.8	▇▃▁▁▁
Oficial	0	1	25.18	14.37	0.00	16.00	22.00	32.5	124.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

# Correlation between the SNGCJA5 and Oficial columns.
get_correlation(df, formula = SNGCJA5 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.786

# Scatter plot of Oficial against SNGCJA5 with a straight-line (lm) fit
# drawn on top of the points, without a confidence band.
ggplot(data = df, mapping = aes(x = SNGCJA5, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "PM25 SNGCJA5",
    y = "PM25 Oficial",
    title = "Relationship between SNGCJA5 and Oficial-2hour"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with Oficial as response and SNGCJA5 as predictor.
score_model <- lm(formula = Oficial ~ SNGCJA5, data = df)
# Coefficient table: estimate, standard error, statistic, p-value and CI.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     8.45     0.561      15.1       0     7.35     9.55
## 2 SNGCJA5       1.74     0.048      36.0       0     1.64     1.83

# Per-row observed value, fitted value (Oficial_hat) and residual;
# the outer parentheses print the tibble after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial SNGCJA5 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      19    3.31        14.2    4.81 
##  2     2      17    4.46        16.2    0.811
##  3     3      11    5.2         17.5   -6.47 
##  4     4       8    3.45        14.4   -6.44 
##  5     5       7    2.24        12.3   -5.34 
##  6     6       9    3.84        15.1   -6.11 
##  7     7      20    4.91        17.0    3.03 
##  8     8      41    7.37        21.2   19.8  
##  9     9      24    7.11        20.8    3.21 
## 10    10      23    6.57        19.8    3.15 
## # ... with 793 more rows

#Caso 20: SPS30 VS Oficial-2h

# Summary statistics for the SPS30 and Oficial columns.
df %>%
  select(SPS30, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁
Oficial	0	1	25.18	14.37	0.00	16.00	22.00	32.50	124.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

# Correlation between the SPS30 and Oficial columns.
get_correlation(df, formula = SPS30 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.766

# Scatter plot of Oficial against SPS30 with a straight-line (lm) fit
# drawn on top of the points, without a confidence band.
ggplot(data = df, mapping = aes(x = SPS30, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "PM25 SPS30",
    y = "PM25 Oficial",
    title = "Relationship between SPS30 and Oficial-2hour"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with Oficial as response and SPS30 as predictor.
score_model <- lm(formula = Oficial ~ SPS30, data = df)
# Coefficient table: estimate, standard error, statistic, p-value and CI.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     7.71     0.612      12.6       0     6.51     8.91
## 2 SPS30         1.55     0.046      33.8       0     1.46     1.64

# Per-row observed value, fitted value (Oficial_hat) and residual;
# the outer parentheses print the tibble after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial SPS30 Oficial_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1      19  4.51        14.7    4.30 
##  2     2      17  5.69        16.5    0.475
##  3     3      11  6.51        17.8   -6.80 
##  4     4       8  4.71        15.0   -7.01 
##  5     5       7  3.27        12.8   -5.78 
##  6     6       9  4.75        15.1   -6.07 
##  7     7      20  6.02        17.0    2.96 
##  8     8      41  8.81        21.4   19.6  
##  9     9      24  8.69        21.2    2.83 
## 10    10      23  8.06        20.2    2.80 
## # ... with 793 more rows

#Caso 21: HPMA115S0 VS Oficial-2h

# Summary statistics for the HPMA115S0 and Oficial columns.
df %>%
  select(HPMA115S0, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.7	15.4	64.6	▇▂▁▁▁
Oficial	0	1	25.18	14.37	0.00	16.00	22.0	32.5	124.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

# Correlation between the HPMA115S0 and Oficial columns.
get_correlation(df, formula = HPMA115S0 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.813

# Scatter plot of Oficial against HPMA115S0 with a straight-line (lm) fit
# drawn on top of the points, without a confidence band.
ggplot(data = df, mapping = aes(x = HPMA115S0, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "PM25 HPMA115S0",
    y = "PM25 Oficial",
    title = "Relationship between HPMA115S0 and Oficial-2hour"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with Oficial as response and HPMA115S0 as predictor.
score_model <- lm(formula = Oficial ~ HPMA115S0, data = df)
# Coefficient table: estimate, standard error, statistic, p-value and CI.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     5.91     0.571      10.4       0     4.79     7.03
## 2 HPMA115S0     1.55     0.039      39.5       0     1.47     1.63

# Per-row observed value, fitted value (Oficial_hat) and residual;
# the outer parentheses print the tibble after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial HPMA115S0 Oficial_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1      19      4.75        13.3     5.72
##  2     2      17      6           15.2     1.79
##  3     3      11      6.64        16.2    -5.20
##  4     4       8      4.91        13.5    -5.52
##  5     5       7      3.13        10.8    -3.76
##  6     6       9      5.24        14.0    -5.04
##  7     7      20      5.74        14.8     5.19
##  8     8      41      8.41        18.9    22.1 
##  9     9      24      7.45        17.5     6.54
## 10    10      23      6.96        16.7     6.30
## # ... with 793 more rows

#Caso 22: PMSA003 VS Oficial-2h

# Summary statistics for the PMSA003 and Oficial columns.
df %>%
  select(PMSA003, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	24.25	17.10	0.84	11.7	20.9	32.9	138	▇▃▁▁▁
Oficial	0	1	25.18	14.37	0.00	16.0	22.0	32.5	124	▇▅▁▁▁

#**Pearson correlation coefficient original**

# Correlation between the PMSA003 and Oficial columns.
get_correlation(df, formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.785

# Scatter plot of Oficial against PMSA003 with a straight-line (lm) fit
# drawn on top of the points, without a confidence band.
ggplot(data = df, mapping = aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "PM25 PMSA003",
    y = "PM25 Oficial",
    title = "Relationship between PMSA003 and Oficial-2hour"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with Oficial as response and PMSA003 as predictor.
score_model <- lm(formula = Oficial ~ PMSA003, data = df)
# Coefficient table: estimate, standard error, statistic, p-value and CI.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     9.19     0.546      16.8       0    8.12    10.3  
## 2 PMSA003       0.66     0.018      35.9       0    0.624    0.696

# Per-row observed value, fitted value (Oficial_hat) and residual;
# the outer parentheses print the tibble after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      19    6.91        13.8     5.25
##  2     2      17    9.69        15.6     1.42
##  3     3      11   11.9         17.0    -6.04
##  4     4       8    7.8         14.3    -6.34
##  5     5       7    4.55        12.2    -5.19
##  6     6       9    7.64        14.2    -5.23
##  7     7      20   11.9         17.0     2.96
##  8     8      41   18.1         21.1    19.9 
##  9     9      24   18.5         21.4     2.60
## 10    10      23   16.8         20.3     2.73
## # ... with 793 more rows

#Caso 23: PMS7003 VS Oficial-2h

# Summary statistics for the PMS7003 and Oficial columns.
df %>%
  select(PMS7003, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMS7003	0	1	20.51	14.00	1	10.5	17.7	27.2	116	▇▃▁▁▁
Oficial	0	1	25.18	14.37	0	16.0	22.0	32.5	124	▇▅▁▁▁

#**Pearson correlation coefficient original**

# Correlation between the PMS7003 and Oficial columns.
get_correlation(df, formula = PMS7003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.800

# Scatter plot of Oficial against PMS7003 with a straight-line (lm) fit
# drawn on top of the points, without a confidence band.
ggplot(data = df, mapping = aes(x = PMS7003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "PM25 PMS7003",
    y = "PM25 Oficial",
    title = "Relationship between PMS7003 and Oficial-2hour"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with Oficial as response and PMS7003 as predictor.
score_model <- lm(formula = Oficial ~ PMS7003, data = df)
# Coefficient table; digits = 11 asks for more decimals than the other
# cases, which call get_regression_table() without it.
get_regression_table(score_model, digits = 11)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    8.33     0.540       15.4       0    7.27     9.40 
## 2 PMS7003      0.822    0.0218      37.8       0    0.779    0.864

# Per-row observed value, fitted value (Oficial_hat) and residual;
# the outer parentheses print the tibble after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial PMS7003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      19    6.51        13.7     5.32
##  2     2      17    8.59        15.4     1.61
##  3     3      11   10.4         16.9    -5.88
##  4     4       8    7.11        14.2    -6.18
##  5     5       7    4.76        12.2    -5.24
##  6     6       9    7.36        14.4    -5.38
##  7     7      20   11.4         17.7     2.3 
##  8     8      41   15.4         21.0    20.0 
##  9     9      24   15.9         21.4     2.60
## 10    10      23   14.2         20.0     3.00
## # ... with 793 more rows

Kennedy-Canairios.R

DBB

2021-02-28