library(readxl)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.6     v purrr   0.3.4
## v tibble  3.1.3     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(moderndive)
library(skimr)

# **ESTACION TUNAL Y VS SENSORES BAJO COSTO**
# **5 sensores diferentes: PMS7003 & PMSA003 & HPMA115S0 & SPS30 & SNGCJA5**
# **Comparaciones entre sensores de bajo costo y la estación oficial*

df <- read_excel("C:/Mediciones/TUNAL.xlsx")
View(df)

glimpse(df)
## Rows: 2,213
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1~
## $ Fecha     <chr> "13-11-2020 24:00", "14-11-2020 01:00", "14-11-2020 02:00", ~
## $ Oficial   <dbl> 30, 23, 18, 26, 24, 30, 38, 53, 54, 57, 60, 61, 41, 26, 18, ~
## $ PMS7003   <dbl> 26.70, 19.50, 25.60, 27.30, 32.80, 36.40, 44.20, 47.90, 54.9~
## $ PMSA003   <dbl> 30.70, 21.60, 29.00, 31.60, 38.10, 41.70, 50.30, 54.40, 62.3~
## $ HPMA115S0 <dbl> 17.40, 14.00, 17.20, 18.60, 23.50, 25.10, 30.00, 32.50, 37.1~
## $ SPS30     <dbl> 15.90, 12.40, 15.70, 17.00, 21.00, 22.40, 27.40, 29.70, 34.9~
## $ SNGCJA5   <dbl> 14.40, 10.70, 13.80, 14.70, 18.50, 20.30, 24.80, 26.60, 28.7~
df %>%
  sample_n(size = 10)
## # A tibble: 10 x 8
##      Num Fecha            Oficial PMS7003 PMSA003 HPMA115S0 SPS30 SNGCJA5
##    <dbl> <chr>              <dbl>   <dbl>   <dbl>     <dbl> <dbl>   <dbl>
##  1  2238 15-02-2021 19:00      20  15.9   17.6         0     9.59   8.99 
##  2   516 06-12-2020 01:00      16   1.6    1.12        2.78  2.22   1.37 
##  3   526 06-12-2020 11:00       0   0.977  0.697       2.7   1.64   0.902
##  4   428 02-12-2020 09:00      10   1.35   0.412       3.04  1.2    0.6  
##  5  1530 17-01-2021 07:00      25  24.7   29           0    16.1   13.5  
##  6  2166 12-02-2021 19:00      37  30.8   36.8         0    18.8   16.5  
##  7  1514 16-01-2021 15:00      19  18.9   23.2         0    12      9.75 
##  8   308 27-11-2020 09:00      41  47.2   55.7        31.2  29.3   24.2  
##  9  1039 27-12-2020 20:00       6   5.24   5.1         0     4.13   3.01 
## 10   445 03-12-2020 02:00       2   0.318  0.0758      1.46  1.05   0.455
fig <- plot_ly(df, x = ~Num, y = ~PMS7003, name = 'PM2.5 PMS7003', type = 'scatter', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~PMSA003, name = 'PM2.5 PMSA003', mode = 'lines+markers')
fig <- fig %>% add_trace(y = ~HPMA115S0, name = 'PM2.5 HPMA115S0', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~SPS30, name = 'PM2.5 SPS30', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~SNGCJA5, name = 'PM2.5 SNGCJA5', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~Oficial, name = 'PM2.5 Oficial', mode = 'lines+markers')
fig
#Caso 1: SNGCJA5 VS SPS30

df %>% select(SNGCJA5, SPS30) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
SNGCJA5 0 1 6.97 6.35 0.00 2.08 4.93 10.3 38.6 ▇▃▁▁▁
SPS30 0 1 8.46 7.30 0.01 2.98 5.94 12.0 43.4 ▇▃▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ SPS30)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995
ggplot(df, aes(x = SNGCJA5, y = SPS30)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 SPS30",
       title = "Relationship between SNGCJA5 and SPS30") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(SPS30 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.478     0.023      21.2       0    0.434    0.522
## 2 SNGCJA5      1.14      0.002     479.        0    1.14     1.15
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID SPS30 SNGCJA5 SPS30_hat residual
##    <int> <dbl>   <dbl>     <dbl>    <dbl>
##  1     1  15.9    14.4      17.0   -1.06 
##  2     2  12.4    10.7      12.7   -0.324
##  3     3  15.7    13.8      16.3   -0.572
##  4     4  17      14.7      17.3   -0.302
##  5     5  21      18.5      21.7   -0.651
##  6     6  22.4    20.3      23.7   -1.31 
##  7     7  27.4    24.8      28.9   -1.46 
##  8     8  29.7    26.6      30.9   -1.22 
##  9     9  34.9    28.7      33.3    1.58 
## 10    10  39.9    32.4      37.6    2.34 
## # ... with 2,203 more rows
#Caso 2: SNGCJA5 VS HPMA115S0

df %>% select(SNGCJA5, HPMA115S0) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
SNGCJA5 0 1 6.97 6.35 0 2.08 4.93 10.30 38.6 ▇▃▁▁▁
HPMA115S0 0 1 4.47 7.39 0 0.00 0.00 5.58 43.4 ▇▁▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ HPMA115S0)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.607
ggplot(df, aes(x = SNGCJA5, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 HPMA115S0",
       title = "Relationship between SNGCJA5 and HPMA115S0") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(HPMA115S0 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.448     0.186     -2.41   0.016   -0.812   -0.084
## 2 SNGCJA5      0.706     0.02      35.9    0        0.667    0.745
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID HPMA115S0 SNGCJA5 HPMA115S0_hat residual
##    <int>     <dbl>   <dbl>         <dbl>    <dbl>
##  1     1      17.4    14.4          9.72     7.68
##  2     2      14      10.7          7.11     6.89
##  3     3      17.2    13.8          9.30     7.90
##  4     4      18.6    14.7          9.93     8.67
##  5     5      23.5    18.5         12.6     10.9 
##  6     6      25.1    20.3         13.9     11.2 
##  7     7      30      24.8         17.1     12.9 
##  8     8      32.5    26.6         18.3     14.2 
##  9     9      37.1    28.7         19.8     17.3 
## 10    10      42.1    32.4         22.4     19.7 
## # ... with 2,203 more rows
#Caso 3: SNGCJA5 VS PMSA003

df %>% select(SNGCJA5, PMSA003) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
SNGCJA5 0 1 6.97 6.35 0 2.08 4.93 10.3 38.6 ▇▃▁▁▁
PMSA003 0 1 14.61 15.18 0 2.81 9.11 22.5 89.3 ▇▂▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ PMSA003)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995
ggplot(df, aes(x = SNGCJA5, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 PMSA003",
       title = "Relationship between SNGCJA5 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.97     0.048     -40.8       0    -2.07    -1.88
## 2 SNGCJA5       2.38     0.005     464.        0     2.37     2.39
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID PMSA003 SNGCJA5 PMSA003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    30.7    14.4        32.3    -1.58
##  2     2    21.6    10.7        23.5    -1.88
##  3     3    29      13.8        30.8    -1.85
##  4     4    31.6    14.7        33.0    -1.39
##  5     5    38.1    18.5        42.0    -3.93
##  6     6    41.7    20.3        46.3    -4.61
##  7     7    50.3    24.8        57.0    -6.71
##  8     8    54.4    26.6        61.3    -6.89
##  9     9    62.3    28.7        66.3    -3.99
## 10    10    69.8    32.4        75.1    -5.29
## # ... with 2,203 more rows
#Caso 4: SNGCJA5 VS PMS7003

df %>% select(SNGCJA5, PMS7003) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
SNGCJA5 0 1 6.97 6.35 0 2.08 4.93 10.3 38.6 ▇▃▁▁▁
PMS7003 0 1 12.96 12.73 0 3.20 8.55 19.3 75.7 ▇▂▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ PMS7003)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.996
ggplot(df, aes(x = SNGCJA5, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 PMS7003",
       title = "Relationship between SNGCJA5 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -0.96     0.036     -26.5       0    -1.03   -0.889
## 2 SNGCJA5       2.00     0.004     521.        0     1.99    2.00
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID PMS7003 SNGCJA5 PMS7003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    26.7    14.4        27.8   -1.09 
##  2     2    19.5    10.7        20.4   -0.901
##  3     3    25.6    13.8        26.6   -0.989
##  4     4    27.3    14.7        28.4   -1.09 
##  5     5    32.8    18.5        36.0   -3.17 
##  6     6    36.4    20.3        39.6   -3.16 
##  7     7    44.2    24.8        48.5   -4.35 
##  8     8    47.9    26.6        52.1   -4.24 
##  9     9    54.9    28.7        56.3   -1.43 
## 10    10    60.5    32.4        63.7   -3.22 
## # ... with 2,203 more rows
#Caso 5: SNGCJA5 VS Oficial

df %>% select(SNGCJA5, Oficial) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
SNGCJA5 0 1 6.97 6.35 0 2.08 4.93 10.3 38.6 ▇▃▁▁▁
Oficial 0 1 15.66 11.85 0 7.00 13.00 22.0 78.0 ▇▅▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ Oficial)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.740
ggplot(df, aes(x = SNGCJA5, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 Oficial",
       title = "Relationship between SNGCJA5 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     6.04     0.252      24.0       0     5.54     6.53
## 2 SNGCJA5       1.38     0.027      51.7       0     1.33     1.43
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID Oficial SNGCJA5 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      30    14.4        25.9    4.09 
##  2     2      23    10.7        20.8    2.20 
##  3     3      18    13.8        25.1   -7.08 
##  4     4      26    14.7        26.3   -0.324
##  5     5      24    18.5        31.6   -7.57 
##  6     6      30    20.3        34.1   -4.05 
##  7     7      38    24.8        40.3   -2.26 
##  8     8      53    26.6        42.7   10.3  
##  9     9      54    28.7        45.6    8.35 
## 10    10      57    32.4        50.8    6.25 
## # ... with 2,203 more rows
#Caso 6: SPS30 VS HPMA115S0

df %>% select(SPS30, HPMA115S0) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
SPS30 0 1 8.46 7.30 0.01 2.98 5.94 12.00 43.4 ▇▃▁▁▁
HPMA115S0 0 1 4.47 7.39 0.00 0.00 0.00 5.58 43.4 ▇▁▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ HPMA115S0)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.608
ggplot(df, aes(x = SPS30, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 HPMA115S0",
       title = "Relationship between SPS30 and HPMA115S0") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(HPMA115S0 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.731     0.191     -3.83       0   -1.10    -0.356
## 2 SPS30        0.615     0.017     36.0        0    0.582    0.649
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID HPMA115S0 SPS30 HPMA115S0_hat residual
##    <int>     <dbl> <dbl>         <dbl>    <dbl>
##  1     1      17.4  15.9          9.06     8.35
##  2     2      14    12.4          6.90     7.10
##  3     3      17.2  15.7          8.93     8.27
##  4     4      18.6  17            9.73     8.87
##  5     5      23.5  21           12.2     11.3 
##  6     6      25.1  22.4         13.1     12.0 
##  7     7      30    27.4         16.1     13.9 
##  8     8      32.5  29.7         17.5     15.0 
##  9     9      37.1  34.9         20.7     16.4 
## 10    10      42.1  39.9         23.8     18.3 
## # ... with 2,203 more rows
#Caso 7: SPS30 VS PMSA003

df %>% select(SPS30, PMSA003) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
SPS30 0 1 8.46 7.30 0.01 2.98 5.94 12.0 43.4 ▇▃▁▁▁
PMSA003 0 1 14.61 15.18 0.00 2.81 9.11 22.5 89.3 ▇▂▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ PMSA003)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995
ggplot(df, aes(x = SPS30, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 PMSA003",
       title = "Relationship between SPS30 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.88     0.05      -57.6       0    -2.98    -2.78
## 2 SPS30         2.07     0.004     462.        0     2.06     2.08
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID PMSA003 SPS30 PMSA003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    30.7  15.9        30.0    0.697
##  2     2    21.6  12.4        22.8   -1.16 
##  3     3    29    15.7        29.6   -0.589
##  4     4    31.6  17          32.3   -0.678
##  5     5    38.1  21          40.6   -2.45 
##  6     6    41.7  22.4        43.4   -1.75 
##  7     7    50.3  27.4        53.8   -3.49 
##  8     8    54.4  29.7        58.5   -4.14 
##  9     9    62.3  34.9        69.3   -7.00 
## 10    10    69.8  39.9        79.6   -9.84 
## # ... with 2,203 more rows
#Caso 8: SPS30 VS PMS7003

df %>% select(SPS30, PMS7003) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
SPS30 0 1 8.46 7.30 0.01 2.98 5.94 12.0 43.4 ▇▃▁▁▁
PMS7003 0 1 12.96 12.73 0.00 3.20 8.55 19.3 75.7 ▇▂▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ PMS7003)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995
ggplot(df, aes(x = SPS30, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 PMS7003",
       title = "Relationship between SPS30 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.71     0.041     -41.6       0    -1.79    -1.63
## 2 SPS30         1.73     0.004     471.        0     1.73     1.74
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID PMS7003 SPS30 PMS7003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    26.7  15.9        25.9    0.833
##  2     2    19.5  12.4        19.8   -0.297
##  3     3    25.6  15.7        25.5    0.08 
##  4     4    27.3  17          27.8   -0.475
##  5     5    32.8  21          34.7   -1.91 
##  6     6    36.4  22.4        37.1   -0.741
##  7     7    44.2  27.4        45.8   -1.61 
##  8     8    47.9  29.7        49.8   -1.90 
##  9     9    54.9  34.9        58.8   -3.92 
## 10    10    60.5  39.9        67.5   -6.99 
## # ... with 2,203 more rows
#Caso 9: SPS30 VS Oficial

df %>% select(SPS30, Oficial) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
SPS30 0 1 8.46 7.30 0.01 2.98 5.94 12 43.4 ▇▃▁▁▁
Oficial 0 1 15.66 11.85 0.00 7.00 13.00 22 78.0 ▇▅▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ Oficial)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.730
ggplot(df, aes(x = SPS30, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 Oficial",
       title = "Relationship between SPS30 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     5.64     0.263      21.4       0     5.12     6.15
## 2 SPS30         1.18     0.024      50.3       0     1.14     1.23
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID Oficial SPS30 Oficial_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1      30  15.9        24.5    5.52 
##  2     2      23  12.4        20.3    2.67 
##  3     3      18  15.7        24.2   -6.24 
##  4     4      26  17          25.8    0.216
##  5     5      24  21          30.5   -6.52 
##  6     6      30  22.4        32.2   -2.18 
##  7     7      38  27.4        38.1   -0.11 
##  8     8      53  29.7        40.8   12.2  
##  9     9      54  34.9        47.0    7.00 
## 10    10      57  39.9        52.9    4.08 
## # ... with 2,203 more rows
#Caso 10: HPMA115S0 VS PMSA003

df %>% select(HPMA115S0, PMSA003) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
HPMA115S0 0 1 4.47 7.39 0 0.00 0.00 5.58 43.4 ▇▁▁▁▁
PMSA003 0 1 14.61 15.18 0 2.81 9.11 22.50 89.3 ▇▂▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ PMSA003)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.602
ggplot(df, aes(x = HPMA115S0, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 PMSA003",
       title = "Relationship between HPMA115S0 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     9.08     0.301      30.1       0     8.49     9.67
## 2 HPMA115S0     1.24     0.035      35.4       0     1.17     1.30
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID PMSA003 HPMA115S0 PMSA003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    30.7      17.4        30.6    0.113
##  2     2    21.6      14          26.4   -4.78 
##  3     3    29        17.2        30.3   -1.34 
##  4     4    31.6      18.6        32.1   -0.47 
##  5     5    38.1      23.5        38.1   -0.026
##  6     6    41.7      25.1        40.1    1.60 
##  7     7    50.3      30          46.2    4.14 
##  8     8    54.4      32.5        49.2    5.15 
##  9     9    62.3      37.1        54.9    7.36 
## 10    10    69.8      42.1        61.1    8.68 
## # ... with 2,203 more rows
#Caso 11: HPMA115S0 VS PMS7003

df %>% select(HPMA115S0, PMS7003) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
HPMA115S0 0 1 4.47 7.39 0 0.0 0.00 5.58 43.4 ▇▁▁▁▁
PMS7003 0 1 12.96 12.73 0 3.2 8.55 19.30 75.7 ▇▂▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ PMS7003)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.615
ggplot(df, aes(x = HPMA115S0, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 PMS7003",
       title = "Relationship between HPMA115S0 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     8.22     0.25       32.9       0     7.74     8.71
## 2 HPMA115S0     1.06     0.029      36.6       0     1.00     1.12
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID PMS7003 HPMA115S0 PMS7003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    26.7      17.4        26.6    0.062
##  2     2    19.5      14          23.0   -3.54 
##  3     3    25.6      17.2        26.4   -0.826
##  4     4    27.3      18.6        27.9   -0.608
##  5     5    32.8      23.5        33.1   -0.293
##  6     6    36.4      25.1        34.8    1.61 
##  7     7    44.2      30          40.0    4.23 
##  8     8    47.9      32.5        42.6    5.28 
##  9     9    54.9      37.1        47.5    7.42 
## 10    10    60.5      42.1        52.8    7.72 
## # ... with 2,203 more rows
#Caso 10: HPMA115S0 VS Oficial

df %>% select(HPMA115S0, Oficial) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
HPMA115S0 0 1 4.47 7.39 0 0 0 5.58 43.4 ▇▁▁▁▁
Oficial 0 1 15.66 11.85 0 7 13 22.00 78.0 ▇▅▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ Oficial)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.388
ggplot(df, aes(x = HPMA115S0, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 Oficial",
       title = "Relationship between HPMA115S0 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   12.9       0.272      47.4       0    12.3    13.4  
## 2 HPMA115S0    0.621     0.031      19.8       0     0.56    0.683
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID Oficial HPMA115S0 Oficial_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1      30      17.4        23.7     6.31
##  2     2      23      14          21.6     1.42
##  3     3      18      17.2        23.6    -5.57
##  4     4      26      18.6        24.4     1.56
##  5     5      24      23.5        27.5    -3.48
##  6     6      30      25.1        28.5     1.52
##  7     7      38      30          31.5     6.48
##  8     8      53      32.5        33.1    19.9 
##  9     9      54      37.1        35.9    18.1 
## 10    10      57      42.1        39.0    18.0 
## # ... with 2,203 more rows
#Caso 11: PMSA003 VS PMS7003

df %>% select(PMSA003, PMS7003) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
PMSA003 0 1 14.61 15.18 0 2.81 9.11 22.5 89.3 ▇▂▁▁▁
PMS7003 0 1 12.96 12.73 0 3.20 8.55 19.3 75.7 ▇▂▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ PMS7003)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.999
ggplot(df, aes(x = PMSA003, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 PMS7003",
       title = "Relationship between PMSA003 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.723     0.018      39.4       0    0.687    0.759
## 2 PMSA003      0.837     0.001     963.        0    0.836    0.839
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID PMS7003 PMSA003 PMS7003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    26.7    30.7        26.4    0.267
##  2     2    19.5    21.6        18.8    0.688
##  3     3    25.6    29          25.0    0.59 
##  4     4    27.3    31.6        27.2    0.113
##  5     5    32.8    38.1        32.6    0.169
##  6     6    36.4    41.7        35.6    0.754
##  7     7    44.2    50.3        42.8    1.35 
##  8     8    47.9    54.4        46.3    1.62 
##  9     9    54.9    62.3        52.9    2.00 
## 10    10    60.5    69.8        59.2    1.32 
## # ... with 2,203 more rows
#Caso 12: PMSA003 VS Oficial

df %>% select(PMSA003, Oficial) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
PMSA003 0 1 14.61 15.18 0 2.81 9.11 22.5 89.3 ▇▂▁▁▁
Oficial 0 1 15.66 11.85 0 7.00 13.00 22.0 78.0 ▇▅▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ Oficial)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.743
ggplot(df, aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 Oficial",
       title = "Relationship between PMSA003 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     7.18     0.234      30.7       0    6.72     7.64 
## 2 PMSA003       0.58     0.011      52.3       0    0.559    0.602
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      30    30.7        25.0    5.00 
##  2     2      23    21.6        19.7    3.28 
##  3     3      18    29          24.0   -6.01 
##  4     4      26    31.6        25.5    0.481
##  5     5      24    38.1        29.3   -5.29 
##  6     6      30    41.7        31.4   -1.38 
##  7     7      38    50.3        36.4    1.63 
##  8     8      53    54.4        38.8   14.2  
##  9     9      54    62.3        43.3   10.7  
## 10    10      57    69.8        47.7    9.31 
## # ... with 2,203 more rows
#Caso 13: PMS7003 VS Oficial

df %>% select(PMS7003, Oficial) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
PMS7003 0 1 12.96 12.73 0 3.2 8.55 19.3 75.7 ▇▂▁▁▁
Oficial 0 1 15.66 11.85 0 7.0 13.00 22.0 78.0 ▇▅▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMS7003 ~ Oficial)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.743
ggplot(df, aes(x = PMS7003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMS7003", y = "PM25 Oficial",
       title = "Relationship between PMS7003 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMS7003, data = df)
#**Get regression table original:**
get_regression_table(score_model, digits = 11)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    6.69     0.241       27.8       0    6.22     7.17 
## 2 PMS7003      0.692    0.0132      52.2       0    0.666    0.718
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID Oficial PMS7003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      30    26.7        25.2    4.84 
##  2     2      23    19.5        20.2    2.82 
##  3     3      18    25.6        24.4   -6.40 
##  4     4      26    27.3        25.6    0.42 
##  5     5      24    32.8        29.4   -5.38 
##  6     6      30    36.4        31.9   -1.88 
##  7     7      38    44.2        37.3    0.729
##  8     8      53    47.9        39.8   13.2  
##  9     9      54    54.9        44.7    9.33 
## 10    10      57    60.5        48.5    8.45 
## # ... with 2,203 more rows
# **ESTACION TUNAL -1hora VS SENSORES BAJO COSTO**
# **Prueba con los valores de la estacion Tunal retrasada 1 hora
# **5 sensores diferentes: PMS7003 & PMSA003 & HPMA115S0 & SPS30 & SNGCJA5**
# **Comparaciones con la estación oficial*


df <- read_excel("C:/Mediciones/TUNAL_1h.xlsx")
View(df)

glimpse(df)
## Rows: 2,213
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1~
## $ Fecha     <chr> "13-11-2020 24:00", "14-11-2020 01:00", "14-11-2020 02:00", ~
## $ Oficial   <dbl> 23, 18, 26, 24, 30, 38, 53, 54, 57, 60, 61, 41, 26, 18, 17, ~
## $ PMS7003   <dbl> 26.70, 19.50, 25.60, 27.30, 32.80, 36.40, 44.20, 47.90, 54.9~
## $ PMSA003   <dbl> 30.70, 21.60, 29.00, 31.60, 38.10, 41.70, 50.30, 54.40, 62.3~
## $ HPMA115S0 <dbl> 17.40, 14.00, 17.20, 18.60, 23.50, 25.10, 30.00, 32.50, 37.1~
## $ SPS30     <dbl> 15.90, 12.40, 15.70, 17.00, 21.00, 22.40, 27.40, 29.70, 34.9~
## $ SNGCJA5   <dbl> 14.40, 10.70, 13.80, 14.70, 18.50, 20.30, 24.80, 26.60, 28.7~
df %>%
  sample_n(size = 10)
## # A tibble: 10 x 8
##      Num Fecha            Oficial PMS7003 PMSA003 HPMA115S0 SPS30 SNGCJA5
##    <dbl> <chr>              <dbl>   <dbl>   <dbl>     <dbl> <dbl>   <dbl>
##  1    48 15-11-2020 23:00      11   10.8    12.1       8.15  7.61    6.14
##  2   152 20-11-2020 07:00      46   41.4    47.5      25.2  23.4    20.6 
##  3  1128 31-12-2020 13:00      17   26.9    32.8       0    18.6    15.1 
##  4   874 20-12-2020 23:00       7    4.31    3.55      4.03  3.51    2.42
##  5  1722 25-01-2021 07:00      34   22      25.6       0    12.8    12   
##  6  1919 02-02-2021 12:00      29   31.7    36.3       0    22.3    18   
##  7  1924 02-02-2021 17:00      10    2.49    1.93      0     2.67    2.03
##  8    41 15-11-2020 16:00      16   18.4    21.3      12.7  11.9     9.48
##  9  2078 09-02-2021 03:00      11    7.31    7.39      0     5.41    4.59
## 10  1637 21-01-2021 18:00      28   30.3    36.4       0    18.6    15.7
fig <- plot_ly(df, x = ~Num, y = ~PMS7003, name = 'PM2.5 PMS7003', type = 'scatter', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~PMSA003, name = 'PM2.5 PMSA003', mode = 'lines+markers')
fig <- fig %>% add_trace(y = ~HPMA115S0, name = 'PM2.5 HPMA115S0', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~SPS30, name = 'PM2.5 SPS30', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~SNGCJA5, name = 'PM2.5 SNGCJA5', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~Oficial, name = 'PM2.5 Oficial', mode = 'lines+markers')
fig
#Caso 14: SNGCJA5 VS Oficial-1h

df %>% select(SNGCJA5, Oficial) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
SNGCJA5 0 1 6.97 6.35 0 2.08 4.93 10.3 38.6 ▇▃▁▁▁
Oficial 0 1 15.66 11.85 0 7.00 13.00 22.0 78.0 ▇▅▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ Oficial)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.896
ggplot(df, aes(x = SNGCJA5, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 Oficial",
       title = "Relationship between SNGCJA5 and Oficial-1hour") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     4.01     0.167      24.1       0     3.68     4.34
## 2 SNGCJA5       1.67     0.018      94.6       0     1.64     1.71
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID Oficial SNGCJA5 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      23    14.4        28.1   -5.07 
##  2     2      18    10.7        21.9   -3.89 
##  3     3      26    13.8        27.1   -1.07 
##  4     4      24    14.7        28.6   -4.57 
##  5     5      30    18.5        34.9   -4.92 
##  6     6      38    20.3        37.9    0.069
##  7     7      53    24.8        45.5    7.55 
##  8     8      54    26.6        48.5    5.54 
##  9     9      57    28.7        52.0    5.03 
## 10    10      60    32.4        58.2    1.85 
## # ... with 2,203 more rows
#Caso 15: SPS30 VS Oficial-1

df %>% select(SPS30, Oficial) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
SPS30 0 1 8.46 7.30 0.01 2.98 5.94 12 43.4 ▇▃▁▁▁
Oficial 0 1 15.66 11.85 0.00 7.00 13.00 22 78.0 ▇▅▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ Oficial)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.873
ggplot(df, aes(x = SPS30, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 Oficial",
       title = "Relationship between SPS30 and Oficial-1hour") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     3.68     0.188      19.6       0     3.31     4.05
## 2 SPS30         1.42     0.017      84.0       0     1.38     1.45
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID Oficial SPS30 Oficial_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1      23  15.9        26.2   -3.20 
##  2     2      18  12.4        21.2   -3.24 
##  3     3      26  15.7        25.9    0.084
##  4     4      24  17          27.8   -3.76 
##  5     5      30  21          33.4   -3.42 
##  6     6      38  22.4        35.4    2.60 
##  7     7      53  27.4        42.5   10.5  
##  8     8      54  29.7        45.7    8.26 
##  9     9      57  34.9        53.1    3.89 
## 10    10      60  39.9        60.2   -0.187
## # ... with 2,203 more rows
#Caso 16: HPMA115S0 VS Oficial-1h

df %>% select(HPMA115S0, Oficial) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
HPMA115S0 0 1 4.47 7.39 0 0 0 5.58 43.4 ▇▁▁▁▁
Oficial 0 1 15.66 11.85 0 7 13 22.00 78.0 ▇▅▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ Oficial)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.460
ggplot(df, aes(x = HPMA115S0, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 Oficial",
       title = "Relationship between HPMA115S0 and Oficial-1hour") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   12.4       0.262      47.3       0   11.8     12.9  
## 2 HPMA115S0    0.738     0.03       24.4       0    0.679    0.797
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID Oficial HPMA115S0 Oficial_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1      23      17.4        25.2   -2.20 
##  2     2      18      14          22.7   -4.69 
##  3     3      26      17.2        25.1    0.948
##  4     4      24      18.6        26.1   -2.08 
##  5     5      30      23.5        29.7    0.299
##  6     6      38      25.1        30.9    7.12 
##  7     7      53      30          34.5   18.5  
##  8     8      54      32.5        36.3   17.7  
##  9     9      57      37.1        39.7   17.3  
## 10    10      60      42.1        43.4   16.6  
## # ... with 2,203 more rows
#Caso 17: PMSA003 VS Oficial-1h

df %>% select(PMSA003, Oficial) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
PMSA003 0 1 14.61 15.18 0 2.81 9.11 22.5 89.3 ▇▂▁▁▁
Oficial 0 1 15.66 11.85 0 7.00 13.00 22.0 78.0 ▇▅▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ Oficial)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.891
ggplot(df, aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 Oficial",
       title = "Relationship between PMSA003 and Oficial-1hour") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    5.50      0.159      34.6       0    5.18      5.81
## 2 PMSA003      0.696     0.008      92.3       0    0.681     0.71
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      23    30.7        26.9   -3.85 
##  2     2      18    21.6        20.5   -2.52 
##  3     3      26    29          25.7    0.331
##  4     4      24    31.6        27.5   -3.48 
##  5     5      30    38.1        32.0   -2.00 
##  6     6      38    41.7        34.5    3.50 
##  7     7      53    50.3        40.5   12.5  
##  8     8      54    54.4        43.3   10.7  
##  9     9      57    62.3        48.8    8.17 
## 10    10      60    69.8        54.0    5.95 
## # ... with 2,203 more rows
#Caso 18: PMS7003 VS Oficial-1h

df %>% select(PMS7003, Oficial) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
PMS7003 0 1 12.96 12.73 0 3.2 8.55 19.3 75.7 ▇▂▁▁▁
Oficial 0 1 15.66 11.85 0 7.0 13.00 22.0 78.0 ▇▅▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMS7003 ~ Oficial)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.894
ggplot(df, aes(x = PMS7003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMS7003", y = "PM25 Oficial",
       title = "Relationship between PMS7003 and Oficial-1hour") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMS7003, data = df)
#**Get regression table original:**
get_regression_table(score_model, digits = 11)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    4.87    0.161        30.2       0    4.56     5.19 
## 2 PMS7003      0.832   0.00887      93.8       0    0.815    0.850
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID Oficial PMS7003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      23    26.7        27.1    -4.10
##  2     2      18    19.5        21.1    -3.10
##  3     3      26    25.6        26.2    -0.18
##  4     4      24    27.3        27.6    -3.60
##  5     5      30    32.8        32.2    -2.17
##  6     6      38    36.4        35.2     2.83
##  7     7      53    44.2        41.7    11.3 
##  8     8      54    47.9        44.7     9.26
##  9     9      57    54.9        50.6     6.43
## 10    10      60    60.5        55.2     4.77
## # ... with 2,203 more rows
# **ESTACION TUNAL -2horas VS SENSORES BAJO COSTO**
# **Prueba con los valores de la estacion Tunal retrasada 2 horas
# **5 sensores diferentes: PMS7003 & PMSA003 & HPMA115S0 & SPS30 & SNGCJA5**
# **Comparaciones con la estación oficial*


df <- read_excel("C:/Mediciones/TUNAL_2h.xlsx")
View(df)

glimpse(df)
## Rows: 2,213
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1~
## $ Fecha     <chr> "13-11-2020 24:00", "14-11-2020 01:00", "14-11-2020 02:00", ~
## $ Oficial   <dbl> 18, 26, 24, 30, 38, 53, 54, 57, 60, 61, 41, 26, 18, 17, 26, ~
## $ PMS7003   <dbl> 26.70, 19.50, 25.60, 27.30, 32.80, 36.40, 44.20, 47.90, 54.9~
## $ PMSA003   <dbl> 30.70, 21.60, 29.00, 31.60, 38.10, 41.70, 50.30, 54.40, 62.3~
## $ HPMA115S0 <dbl> 17.40, 14.00, 17.20, 18.60, 23.50, 25.10, 30.00, 32.50, 37.1~
## $ SPS30     <dbl> 15.90, 12.40, 15.70, 17.00, 21.00, 22.40, 27.40, 29.70, 34.9~
## $ SNGCJA5   <dbl> 14.40, 10.70, 13.80, 14.70, 18.50, 20.30, 24.80, 26.60, 28.7~
df %>%
  sample_n(size = 10)
## # A tibble: 10 x 8
##      Num Fecha            Oficial PMS7003 PMSA003 HPMA115S0 SPS30 SNGCJA5
##    <dbl> <chr>              <dbl>   <dbl>   <dbl>     <dbl> <dbl>   <dbl>
##  1   166 20-11-2020 21:00       7   7.49     8.1       6.06  5.03   4.56 
##  2  1149 01-01-2021 10:00       0   0.345    0.23      0     1.14   0.331
##  3  1169 02-01-2021 06:00      25  12.4     14.5       0     7.95   7.12 
##  4    97 17-11-2020 24:00      17  17.6     20        12.3  11.2    9.57 
##  5  1734 25-01-2021 19:00      10   3.66     3.42      0     3.04   2.45 
##  6  1899 01-02-2021 16:00       7   2.34     1.96      0     2.43   1.63 
##  7  2198 14-02-2021 03:00      24  23.3     27.5       0    14.6   12.4  
##  8   910 22-12-2020 11:00       6   2.78     2.4       3.22  2.24   1.62 
##  9  1502 16-01-2021 03:00      16   8.18     8.41      0     6.24   4.85 
## 10  1884 01-02-2021 01:00       8   2.8      0.97      3.72  2.42   1.5
fig <- plot_ly(df, x = ~Num, y = ~PMS7003, name = 'PM2.5 PMS7003', type = 'scatter', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~PMSA003, name = 'PM2.5 PMSA003', mode = 'lines+markers')
fig <- fig %>% add_trace(y = ~HPMA115S0, name = 'PM2.5 HPMA115S0', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~SPS30, name = 'PM2.5 SPS30', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~SNGCJA5, name = 'PM2.5 SNGCJA5', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~Oficial, name = 'PM2.5 Oficial', mode = 'lines+markers')
fig
#Caso 19: SNGCJA5 VS Oficial-2h

df %>% select(SNGCJA5, Oficial) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
SNGCJA5 0 1 6.97 6.35 0 2.08 4.93 10.3 38.6 ▇▃▁▁▁
Oficial 0 1 15.66 11.85 0 7.00 13.00 22.0 78.0 ▇▅▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ Oficial)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.704
ggplot(df, aes(x = SNGCJA5, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 Oficial",
       title = "Relationship between SNGCJA5 and Oficial-2hour") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     6.50     0.266      24.4       0     5.98     7.02
## 2 SNGCJA5       1.31     0.028      46.6       0     1.26     1.37
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID Oficial SNGCJA5 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      18    14.4        25.4   -7.41 
##  2     2      26    10.7        20.6    5.45 
##  3     3      24    13.8        24.6   -0.625
##  4     4      30    14.7        25.8    4.19 
##  5     5      38    18.5        30.8    7.20 
##  6     6      53    20.3        33.2   19.8  
##  7     7      54    24.8        39.1   14.9  
##  8     8      57    26.6        41.4   15.6  
##  9     9      60    28.7        44.2   15.8  
## 10    10      61    32.4        49.1   11.9  
## # ... with 2,203 more rows
#Caso 20: SPS30 VS Oficial-2

df %>% select(SPS30, Oficial) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
SPS30 0 1 8.46 7.30 0.01 2.98 5.94 12 43.4 ▇▃▁▁▁
Oficial 0 1 15.66 11.85 0.00 7.00 13.00 22 78.0 ▇▅▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ Oficial)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.683
ggplot(df, aes(x = SPS30, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 Oficial",
       title = "Relationship between SPS30 and Oficial-2hour") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     6.29     0.282      22.3       0     5.74     6.84
## 2 SPS30         1.11     0.025      43.9       0     1.06     1.16
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID Oficial SPS30 Oficial_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1      18  15.9        23.9   -5.90 
##  2     2      26  12.4        20.0    5.98 
##  3     3      24  15.7        23.7    0.323
##  4     4      30  17          25.1    4.88 
##  5     5      38  21          29.5    8.45 
##  6     6      53  22.4        31.1   21.9  
##  7     7      54  27.4        36.6   17.4  
##  8     8      57  29.7        39.2   17.8  
##  9     9      60  34.9        44.9   15.1  
## 10    10      61  39.9        50.5   10.5  
## # ... with 2,203 more rows
#Caso 21: HPMA115S0 VS Oficial-2h

df %>% select(HPMA115S0, Oficial) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
HPMA115S0 0 1 4.47 7.39 0 0 0 5.58 43.4 ▇▁▁▁▁
Oficial 0 1 15.66 11.85 0 7 13 22.00 78.0 ▇▅▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ Oficial)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.355
ggplot(df, aes(x = HPMA115S0, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 Oficial",
       title = "Relationship between HPMA115S0 and Oficial-2hour") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   13.1       0.275      47.6       0   12.6     13.6  
## 2 HPMA115S0    0.569     0.032      17.9       0    0.507    0.632
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID Oficial HPMA115S0 Oficial_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1      18      17.4        23.0    -5.01
##  2     2      26      14          21.1     4.92
##  3     3      24      17.2        22.9     1.10
##  4     4      30      18.6        23.7     6.30
##  5     5      38      23.5        26.5    11.5 
##  6     6      53      25.1        27.4    25.6 
##  7     7      54      30          30.2    23.8 
##  8     8      57      32.5        31.6    25.4 
##  9     9      60      37.1        34.2    25.8 
## 10    10      61      42.1        37.1    23.9 
## # ... with 2,203 more rows
#Caso 22: PMSA003 VS Oficial-2h

df %>% select(PMSA003, Oficial) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
PMSA003 0 1 14.61 15.18 0 2.81 9.11 22.5 89.3 ▇▂▁▁▁
Oficial 0 1 15.66 11.85 0 7.00 13.00 22.0 78.0 ▇▅▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ Oficial)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.691
ggplot(df, aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 Oficial",
       title = "Relationship between PMSA003 and Oficial-2hour") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    7.77      0.253      30.8       0    7.28     8.27 
## 2 PMSA003      0.539     0.012      45.0       0    0.516    0.563
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      18    30.7        24.3   -6.34 
##  2     2      26    21.6        19.4    6.57 
##  3     3      24    29          23.4    0.581
##  4     4      30    31.6        24.8    5.18 
##  5     5      38    38.1        28.3    9.67 
##  6     6      53    41.7        30.3   22.7  
##  7     7      54    50.3        34.9   19.1  
##  8     8      57    54.4        37.1   19.9  
##  9     9      60    62.3        41.4   18.6  
## 10    10      61    69.8        45.4   15.6  
## # ... with 2,203 more rows
#Caso 23: PMS7003 VS Oficial-2h

df %>% select(PMS7003, Oficial) %>% skim()
Data summary
Name Piped data
Number of rows 2213
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
PMS7003 0 1 12.96 12.73 0 3.2 8.55 19.3 75.7 ▇▂▁▁▁
Oficial 0 1 15.66 11.85 0 7.0 13.00 22.0 78.0 ▇▅▁▁▁
#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMS7003 ~ Oficial)
## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.694
ggplot(df, aes(x = PMS7003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMS7003", y = "PM25 Oficial",
       title = "Relationship between PMS7003 and Oficial-2hour") +  
  
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMS7003, data = df)
#**Get regression table original:**
get_regression_table(score_model, digits = 11)
## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    7.28     0.259       28.1       0    6.77     7.79 
## 2 PMS7003      0.646    0.0142      45.4       0    0.618    0.674
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 2,213 x 5
##       ID Oficial PMS7003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      18    26.7        24.5   -6.54 
##  2     2      26    19.5        19.9    6.12 
##  3     3      24    25.6        23.8    0.174
##  4     4      30    27.3        24.9    5.08 
##  5     5      38    32.8        28.5    9.52 
##  6     6      53    36.4        30.8   22.2  
##  7     7      54    44.2        35.8   18.2  
##  8     8      57    47.9        38.2   18.8  
##  9     9      60    54.9        42.8   17.2  
## 10    10      61    60.5        46.4   14.6  
## # ... with 2,203 more rows