Tunal-Canairios.R

library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(moderndive)
library(skimr)

# **1 hour late station ESTACION TUNAL VS CANAIRIOS**
# **5 different sensors: PMS7003 & PMSA003 & HPMA115S0 & SPS30 & SNGCJA5**

# Read the 1h comparison workbook into `df`.
# NOTE(review): the path is absolute and machine-specific; confirm it exists
# on the machine that runs this script.
df <- read_excel("C:/Mediciones/TUNAL_ESTACION_CANAIRIOS_1h.xlsx")

# View() opens an interactive data viewer, so only call it when a user is
# actually driving the session (skipped under Rscript / batch rendering).
if (interactive()) {
  View(df)
}

# Column names, types and leading values
glimpse(df)

## Rows: 2,213
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ Fecha     <chr> "13-11-2020 24:00", "14-11-2020 01:00", "14-11-2020 02:00...
## $ Oficial   <dbl> 23, 18, 26, 24, 30, 38, 53, 54, 57, 60, 61, 41, 26, 18, 1...
## $ PMS7003   <dbl> 26.70, 19.50, 25.60, 27.30, 32.80, 36.40, 44.20, 47.90, 5...
## $ PMSA003   <dbl> 30.70, 21.60, 29.00, 31.60, 38.10, 41.70, 50.30, 54.40, 6...
## $ HPMA115S0 <dbl> 17.40, 14.00, 17.20, 18.60, 23.50, 25.10, 30.00, 32.50, 3...
## $ SPS30     <dbl> 15.90, 12.40, 15.70, 17.00, 21.00, 22.40, 27.40, 29.70, 3...
## $ SNGCJA5   <dbl> 14.40, 10.70, 13.80, 14.70, 18.50, 20.30, 24.80, 26.60, 2...

# Print 10 randomly chosen rows as a quick sanity check of the data.
# slice_sample() is the current dplyr verb; sample_n() is superseded.
df %>%
  slice_sample(n = 10)

## # A tibble: 10 x 8
##      Num Fecha            Oficial PMS7003 PMSA003 HPMA115S0 SPS30 SNGCJA5
##    <dbl> <chr>              <dbl>   <dbl>   <dbl>     <dbl> <dbl>   <dbl>
##  1  1390 11-01-2021 11:00       2  0.0219  0.0146      0     0.7   0.0146
##  2   276 26-11-2020 01:00      23 38.9    44.7        25    23.3  20.8   
##  3     6 14-11-2020 05:00      38 36.4    41.7        25.1  22.4  20.3   
##  4   983 25-12-2020 12:00      14 15.5    17.9         0    10     7.54  
##  5   480 04-12-2020 13:00      18  5.91    6.36        5.35  3.59  2.71  
##  6   230 23-11-2020 13:00      23 25.5    29.7        17.3  15.9  13.4   
##  7  1481 15-01-2021 06:00      21  9.81   10.5         0     6.02  5.67  
##  8  1905 01-02-2021 22:00      10  2.55    2.03        0     2.78  2.01  
##  9   182 21-11-2020 13:00       0  0.589   0.489       1.54  1.28  0.362 
## 10  1761 26-01-2021 22:00      11  6.27    6.57        0     5.07  3.97

# Interactive overlay of the PM2.5 series of the five sensors and of the
# official station, all plotted against the row index Num.
fig <- df %>%
  plot_ly(
    x = ~Num, y = ~PMS7003, name = "PM2.5 PMS7003",
    type = "scatter", mode = "lines+markers"
  ) %>%
  add_trace(y = ~PMSA003, name = "PM2.5 PMSA003", mode = "lines+markers") %>%
  add_trace(y = ~HPMA115S0, name = "PM2.5 HPMA115S0", mode = "lines+markers") %>%
  add_trace(y = ~SPS30, name = "PM2.5 SPS30", mode = "lines+markers") %>%
  add_trace(y = ~SNGCJA5, name = "PM2.5 SNGCJA5", mode = "lines+markers") %>%
  add_trace(y = ~Oficial, name = "PM2.5 Oficial", mode = "lines+markers")
fig

#Caso 1: SNGCJA5 VS SPS30

# Descriptive statistics for the two columns compared in this case
df %>%
  select(SNGCJA5, SPS30) %>%
  skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.97	6.35	0.00	2.08	4.93	10.3	38.6	▇▃▁▁▁
SPS30	0	1	8.46	7.30	0.01	2.98	5.94	12.0	43.4	▇▃▁▁▁

# Pearson correlation between the two columns
get_correlation(df, formula = SNGCJA5 ~ SPS30)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

# Scatter plot of the pair with a least-squares line (no confidence band)
ggplot(data = df, mapping = aes(x = SNGCJA5, y = SPS30)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and SPS30",
    x = "PM25 SNGCJA5",
    y = "PM25 SPS30"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of SPS30 on SNGCJA5
score_model <- lm(formula = SPS30 ~ SNGCJA5, data = df)
# Coefficient table: estimates, standard errors and confidence bounds
score_model %>% get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.478     0.023      21.2       0    0.434    0.522
## 2 SNGCJA5      1.14      0.002     479.        0    1.14     1.15

# Observed value, fitted value and residual for each row of the fitted model
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 2,213 x 5
##       ID SPS30 SNGCJA5 SPS30_hat residual
##    <int> <dbl>   <dbl>     <dbl>    <dbl>
##  1     1  15.9    14.4      17.0   -1.06 
##  2     2  12.4    10.7      12.7   -0.324
##  3     3  15.7    13.8      16.3   -0.572
##  4     4  17      14.7      17.3   -0.302
##  5     5  21      18.5      21.7   -0.651
##  6     6  22.4    20.3      23.7   -1.31 
##  7     7  27.4    24.8      28.9   -1.46 
##  8     8  29.7    26.6      30.9   -1.22 
##  9     9  34.9    28.7      33.3    1.58 
## 10    10  39.9    32.4      37.6    2.34 
## # ... with 2,203 more rows

#Caso 2: SNGCJA5 VS HPMA115S0

# Descriptive statistics for the two columns compared in this case
df %>%
  select(SNGCJA5, HPMA115S0) %>%
  skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.97	6.35	0	2.08	4.93	10.30	38.6	▇▃▁▁▁
HPMA115S0	0	1	4.47	7.39	0	0.00	0.00	5.58	43.4	▇▁▁▁▁

# Pearson correlation between the two columns
get_correlation(df, formula = SNGCJA5 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.607

# Scatter plot of the pair with a least-squares line (no confidence band)
ggplot(data = df, mapping = aes(x = SNGCJA5, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and HPMA115S0",
    x = "PM25 SNGCJA5",
    y = "PM25 HPMA115S0"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of HPMA115S0 on SNGCJA5
score_model <- lm(formula = HPMA115S0 ~ SNGCJA5, data = df)
# Coefficient table: estimates, standard errors and confidence bounds
score_model %>% get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.448     0.186     -2.41   0.016   -0.812   -0.084
## 2 SNGCJA5      0.706     0.02      35.9    0        0.667    0.745

# Observed value, fitted value and residual for each row of the fitted model
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 2,213 x 5
##       ID HPMA115S0 SNGCJA5 HPMA115S0_hat residual
##    <int>     <dbl>   <dbl>         <dbl>    <dbl>
##  1     1      17.4    14.4          9.72     7.68
##  2     2      14      10.7          7.11     6.89
##  3     3      17.2    13.8          9.30     7.90
##  4     4      18.6    14.7          9.93     8.67
##  5     5      23.5    18.5         12.6     10.9 
##  6     6      25.1    20.3         13.9     11.2 
##  7     7      30      24.8         17.1     12.9 
##  8     8      32.5    26.6         18.3     14.2 
##  9     9      37.1    28.7         19.8     17.3 
## 10    10      42.1    32.4         22.4     19.7 
## # ... with 2,203 more rows

#Caso 3: SNGCJA5 VS PMSA003

# Descriptive statistics for the two columns compared in this case
df %>%
  select(SNGCJA5, PMSA003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.97	6.35	0	2.08	4.93	10.3	38.6	▇▃▁▁▁
PMSA003	0	1	14.61	15.18	0	2.81	9.11	22.5	89.3	▇▂▁▁▁

# Pearson correlation between the two columns
get_correlation(df, formula = SNGCJA5 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

# Scatter plot of the pair with a least-squares line (no confidence band)
ggplot(data = df, mapping = aes(x = SNGCJA5, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and PMSA003",
    x = "PM25 SNGCJA5",
    y = "PM25 PMSA003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMSA003 on SNGCJA5
score_model <- lm(formula = PMSA003 ~ SNGCJA5, data = df)
# Coefficient table: estimates, standard errors and confidence bounds
score_model %>% get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.97     0.048     -40.8       0    -2.07    -1.88
## 2 SNGCJA5       2.38     0.005     464.        0     2.37     2.39

# Observed value, fitted value and residual for each row of the fitted model
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 2,213 x 5
##       ID PMSA003 SNGCJA5 PMSA003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    30.7    14.4        32.3    -1.58
##  2     2    21.6    10.7        23.5    -1.88
##  3     3    29      13.8        30.8    -1.85
##  4     4    31.6    14.7        33.0    -1.39
##  5     5    38.1    18.5        42.0    -3.93
##  6     6    41.7    20.3        46.3    -4.61
##  7     7    50.3    24.8        57.0    -6.71
##  8     8    54.4    26.6        61.3    -6.89
##  9     9    62.3    28.7        66.3    -3.99
## 10    10    69.8    32.4        75.1    -5.29
## # ... with 2,203 more rows

#Caso 4: SNGCJA5 VS PMS7003

# Descriptive statistics for the two columns compared in this case
df %>%
  select(SNGCJA5, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.97	6.35	0	2.08	4.93	10.3	38.6	▇▃▁▁▁
PMS7003	0	1	12.96	12.73	0	3.20	8.55	19.3	75.7	▇▂▁▁▁

# Pearson correlation between the two columns
get_correlation(df, formula = SNGCJA5 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.996

# Scatter plot of the pair with a least-squares line (no confidence band)
ggplot(data = df, mapping = aes(x = SNGCJA5, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and PMS7003",
    x = "PM25 SNGCJA5",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMS7003 on SNGCJA5
score_model <- lm(formula = PMS7003 ~ SNGCJA5, data = df)
# Coefficient table: estimates, standard errors and confidence bounds
score_model %>% get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -0.96     0.036     -26.5       0    -1.03   -0.889
## 2 SNGCJA5       2.00     0.004     521.        0     1.99    2.00

# Observed value, fitted value and residual for each row of the fitted model
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 2,213 x 5
##       ID PMS7003 SNGCJA5 PMS7003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    26.7    14.4        27.8   -1.09 
##  2     2    19.5    10.7        20.4   -0.901
##  3     3    25.6    13.8        26.6   -0.989
##  4     4    27.3    14.7        28.4   -1.09 
##  5     5    32.8    18.5        36.0   -3.17 
##  6     6    36.4    20.3        39.6   -3.16 
##  7     7    44.2    24.8        48.5   -4.35 
##  8     8    47.9    26.6        52.1   -4.24 
##  9     9    54.9    28.7        56.3   -1.43 
## 10    10    60.5    32.4        63.7   -3.22 
## # ... with 2,203 more rows

#Caso 5: SNGCJA5 VS Oficial

# Descriptive statistics for the two columns compared in this case
df %>%
  select(SNGCJA5, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.97	6.35	0	2.08	4.93	10.3	38.6	▇▃▁▁▁
Oficial	0	1	15.66	11.85	0	7.00	13.00	22.0	78.0	▇▅▁▁▁

# Pearson correlation between the two columns
get_correlation(df, formula = SNGCJA5 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.896

# Scatter plot of the pair with a least-squares line (no confidence band)
ggplot(data = df, mapping = aes(x = SNGCJA5, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and Oficial",
    x = "PM25 SNGCJA5",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on SNGCJA5
score_model <- lm(formula = Oficial ~ SNGCJA5, data = df)
# Coefficient table: estimates, standard errors and confidence bounds
score_model %>% get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     4.01     0.167      24.1       0     3.68     4.34
## 2 SNGCJA5       1.67     0.018      94.6       0     1.64     1.71

# Observed value, fitted value and residual for each row of the fitted model
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 2,213 x 5
##       ID Oficial SNGCJA5 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      23    14.4        28.1   -5.07 
##  2     2      18    10.7        21.9   -3.89 
##  3     3      26    13.8        27.1   -1.07 
##  4     4      24    14.7        28.6   -4.57 
##  5     5      30    18.5        34.9   -4.92 
##  6     6      38    20.3        37.9    0.069
##  7     7      53    24.8        45.5    7.55 
##  8     8      54    26.6        48.5    5.54 
##  9     9      57    28.7        52.0    5.03 
## 10    10      60    32.4        58.2    1.85 
## # ... with 2,203 more rows

#Caso 6: SPS30 VS HPMA115S0

# Descriptive statistics for the two columns compared in this case
df %>%
  select(SPS30, HPMA115S0) %>%
  skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	8.46	7.30	0.01	2.98	5.94	12.00	43.4	▇▃▁▁▁
HPMA115S0	0	1	4.47	7.39	0.00	0.00	0.00	5.58	43.4	▇▁▁▁▁

# Pearson correlation between the two columns
get_correlation(df, formula = SPS30 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.608

# Scatter plot of the pair with a least-squares line (no confidence band)
ggplot(data = df, mapping = aes(x = SPS30, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and HPMA115S0",
    x = "PM25 SPS30",
    y = "PM25 HPMA115S0"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of HPMA115S0 on SPS30
score_model <- lm(formula = HPMA115S0 ~ SPS30, data = df)
# Coefficient table: estimates, standard errors and confidence bounds
score_model %>% get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.731     0.191     -3.83       0   -1.10    -0.356
## 2 SPS30        0.615     0.017     36.0        0    0.582    0.649

# Observed value, fitted value and residual for each row of the fitted model
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 2,213 x 5
##       ID HPMA115S0 SPS30 HPMA115S0_hat residual
##    <int>     <dbl> <dbl>         <dbl>    <dbl>
##  1     1      17.4  15.9          9.06     8.35
##  2     2      14    12.4          6.90     7.10
##  3     3      17.2  15.7          8.93     8.27
##  4     4      18.6  17            9.73     8.87
##  5     5      23.5  21           12.2     11.3 
##  6     6      25.1  22.4         13.1     12.0 
##  7     7      30    27.4         16.1     13.9 
##  8     8      32.5  29.7         17.5     15.0 
##  9     9      37.1  34.9         20.7     16.4 
## 10    10      42.1  39.9         23.8     18.3 
## # ... with 2,203 more rows

#Caso 7: SPS30 VS PMSA003

# Descriptive statistics for the two columns compared in this case
df %>%
  select(SPS30, PMSA003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	8.46	7.30	0.01	2.98	5.94	12.0	43.4	▇▃▁▁▁
PMSA003	0	1	14.61	15.18	0.00	2.81	9.11	22.5	89.3	▇▂▁▁▁

# Pearson correlation between the two columns
get_correlation(df, formula = SPS30 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

# Scatter plot of the pair with a least-squares line (no confidence band)
ggplot(data = df, mapping = aes(x = SPS30, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and PMSA003",
    x = "PM25 SPS30",
    y = "PM25 PMSA003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMSA003 on SPS30
score_model <- lm(formula = PMSA003 ~ SPS30, data = df)
# Coefficient table: estimates, standard errors and confidence bounds
score_model %>% get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.88     0.05      -57.6       0    -2.98    -2.78
## 2 SPS30         2.07     0.004     462.        0     2.06     2.08

# Observed value, fitted value and residual for each row of the fitted model
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 2,213 x 5
##       ID PMSA003 SPS30 PMSA003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    30.7  15.9        30.0    0.697
##  2     2    21.6  12.4        22.8   -1.16 
##  3     3    29    15.7        29.6   -0.589
##  4     4    31.6  17          32.3   -0.678
##  5     5    38.1  21          40.6   -2.45 
##  6     6    41.7  22.4        43.4   -1.75 
##  7     7    50.3  27.4        53.8   -3.49 
##  8     8    54.4  29.7        58.5   -4.14 
##  9     9    62.3  34.9        69.3   -7.00 
## 10    10    69.8  39.9        79.6   -9.84 
## # ... with 2,203 more rows

#Caso 8: SPS30 VS PMS7003

# Descriptive statistics for the two columns compared in this case
df %>%
  select(SPS30, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	8.46	7.30	0.01	2.98	5.94	12.0	43.4	▇▃▁▁▁
PMS7003	0	1	12.96	12.73	0.00	3.20	8.55	19.3	75.7	▇▂▁▁▁

# Pearson correlation between the two columns
get_correlation(df, formula = SPS30 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

# Scatter plot of the pair with a least-squares line (no confidence band)
ggplot(data = df, mapping = aes(x = SPS30, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and PMS7003",
    x = "PM25 SPS30",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMS7003 on SPS30
score_model <- lm(formula = PMS7003 ~ SPS30, data = df)
# Coefficient table: estimates, standard errors and confidence bounds
score_model %>% get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.71     0.041     -41.6       0    -1.79    -1.63
## 2 SPS30         1.73     0.004     471.        0     1.73     1.74

# Observed value, fitted value and residual for each row of the fitted model
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 2,213 x 5
##       ID PMS7003 SPS30 PMS7003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    26.7  15.9        25.9    0.833
##  2     2    19.5  12.4        19.8   -0.297
##  3     3    25.6  15.7        25.5    0.08 
##  4     4    27.3  17          27.8   -0.475
##  5     5    32.8  21          34.7   -1.91 
##  6     6    36.4  22.4        37.1   -0.741
##  7     7    44.2  27.4        45.8   -1.61 
##  8     8    47.9  29.7        49.8   -1.90 
##  9     9    54.9  34.9        58.8   -3.92 
## 10    10    60.5  39.9        67.5   -6.99 
## # ... with 2,203 more rows

#Caso 9: SPS30 VS Oficial

# Descriptive statistics for the two columns compared in this case
df %>%
  select(SPS30, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	8.46	7.30	0.01	2.98	5.94	12	43.4	▇▃▁▁▁
Oficial	0	1	15.66	11.85	0.00	7.00	13.00	22	78.0	▇▅▁▁▁

# Pearson correlation between the two columns
get_correlation(df, formula = SPS30 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.873

# Scatter plot of the pair with a least-squares line (no confidence band)
ggplot(data = df, mapping = aes(x = SPS30, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and Oficial",
    x = "PM25 SPS30",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on SPS30
score_model <- lm(formula = Oficial ~ SPS30, data = df)
# Coefficient table: estimates, standard errors and confidence bounds
score_model %>% get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     3.68     0.188      19.6       0     3.31     4.05
## 2 SPS30         1.42     0.017      84.0       0     1.38     1.45

# Observed value, fitted value and residual for each row of the fitted model
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 2,213 x 5
##       ID Oficial SPS30 Oficial_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1      23  15.9        26.2   -3.20 
##  2     2      18  12.4        21.2   -3.24 
##  3     3      26  15.7        25.9    0.084
##  4     4      24  17          27.8   -3.76 
##  5     5      30  21          33.4   -3.42 
##  6     6      38  22.4        35.4    2.60 
##  7     7      53  27.4        42.5   10.5  
##  8     8      54  29.7        45.7    8.26 
##  9     9      57  34.9        53.1    3.89 
## 10    10      60  39.9        60.2   -0.187
## # ... with 2,203 more rows

#Caso 10: HPMA115S0 VS PMSA003

# Descriptive statistics for the two columns compared in this case
df %>%
  select(HPMA115S0, PMSA003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	4.47	7.39	0	0.00	0.00	5.58	43.4	▇▁▁▁▁
PMSA003	0	1	14.61	15.18	0	2.81	9.11	22.50	89.3	▇▂▁▁▁

# Pearson correlation between the two columns
get_correlation(df, formula = HPMA115S0 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.602

# Scatter plot of the pair with a least-squares line (no confidence band)
ggplot(data = df, mapping = aes(x = HPMA115S0, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between HPMA115S0 and PMSA003",
    x = "PM25 HPMA115S0",
    y = "PM25 PMSA003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMSA003 on HPMA115S0
score_model <- lm(formula = PMSA003 ~ HPMA115S0, data = df)
# Coefficient table: estimates, standard errors and confidence bounds
score_model %>% get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     9.08     0.301      30.1       0     8.49     9.67
## 2 HPMA115S0     1.24     0.035      35.4       0     1.17     1.30

# Observed value, fitted value and residual for each row of the fitted model
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 2,213 x 5
##       ID PMSA003 HPMA115S0 PMSA003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    30.7      17.4        30.6    0.113
##  2     2    21.6      14          26.4   -4.78 
##  3     3    29        17.2        30.3   -1.34 
##  4     4    31.6      18.6        32.1   -0.47 
##  5     5    38.1      23.5        38.1   -0.026
##  6     6    41.7      25.1        40.1    1.60 
##  7     7    50.3      30          46.2    4.14 
##  8     8    54.4      32.5        49.2    5.15 
##  9     9    62.3      37.1        54.9    7.36 
## 10    10    69.8      42.1        61.1    8.68 
## # ... with 2,203 more rows

#Caso 11: HPMA115S0 VS PMS7003

# Descriptive statistics for the two columns compared in this case
df %>%
  select(HPMA115S0, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	4.47	7.39	0	0.0	0.00	5.58	43.4	▇▁▁▁▁
PMS7003	0	1	12.96	12.73	0	3.2	8.55	19.30	75.7	▇▂▁▁▁

# Pearson correlation between the two columns
get_correlation(df, formula = HPMA115S0 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.615

# Scatter plot of the pair with a least-squares line (no confidence band)
ggplot(data = df, mapping = aes(x = HPMA115S0, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between HPMA115S0 and PMS7003",
    x = "PM25 HPMA115S0",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMS7003 on HPMA115S0
score_model <- lm(formula = PMS7003 ~ HPMA115S0, data = df)
# Coefficient table: estimates, standard errors and confidence bounds
score_model %>% get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     8.22     0.25       32.9       0     7.74     8.71
## 2 HPMA115S0     1.06     0.029      36.6       0     1.00     1.12

# Observed value, fitted value and residual for each row of the fitted model
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 2,213 x 5
##       ID PMS7003 HPMA115S0 PMS7003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    26.7      17.4        26.6    0.062
##  2     2    19.5      14          23.0   -3.54 
##  3     3    25.6      17.2        26.4   -0.826
##  4     4    27.3      18.6        27.9   -0.608
##  5     5    32.8      23.5        33.1   -0.293
##  6     6    36.4      25.1        34.8    1.61 
##  7     7    44.2      30          40.0    4.23 
##  8     8    47.9      32.5        42.6    5.28 
##  9     9    54.9      37.1        47.5    7.42 
## 10    10    60.5      42.1        52.8    7.72 
## # ... with 2,203 more rows

#Caso 12: HPMA115S0 VS Oficial

# Descriptive statistics for the two columns compared in this case
df %>%
  select(HPMA115S0, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	4.47	7.39	0	0	0	5.58	43.4	▇▁▁▁▁
Oficial	0	1	15.66	11.85	0	7	13	22.00	78.0	▇▅▁▁▁

# Pearson correlation between the two columns
get_correlation(df, formula = HPMA115S0 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.460

# Scatter plot of the pair with a least-squares line (no confidence band)
ggplot(data = df, mapping = aes(x = HPMA115S0, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between HPMA115S0 and Oficial",
    x = "PM25 HPMA115S0",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on HPMA115S0
score_model <- lm(formula = Oficial ~ HPMA115S0, data = df)
# Coefficient table: estimates, standard errors and confidence bounds
score_model %>% get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   12.4       0.262      47.3       0   11.8     12.9  
## 2 HPMA115S0    0.738     0.03       24.4       0    0.679    0.797

# Observed value, fitted value and residual for each row of the fitted model
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 2,213 x 5
##       ID Oficial HPMA115S0 Oficial_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1      23      17.4        25.2   -2.20 
##  2     2      18      14          22.7   -4.69 
##  3     3      26      17.2        25.1    0.948
##  4     4      24      18.6        26.1   -2.08 
##  5     5      30      23.5        29.7    0.299
##  6     6      38      25.1        30.9    7.12 
##  7     7      53      30          34.5   18.5  
##  8     8      54      32.5        36.3   17.7  
##  9     9      57      37.1        39.7   17.3  
## 10    10      60      42.1        43.4   16.6  
## # ... with 2,203 more rows

#Caso 13: PMSA003 VS PMS7003
# Compares the two Plantower sensors named in the heading. Every statement
# below uses the PMSA003 / PMS7003 pair, following the same pattern as the
# other cases: correlation(first ~ second), plot x = first / y = second,
# regression second ~ first.
# NOTE: rendered output is intentionally not shown here; re-render the script
# to regenerate the tables for this pair.

# Descriptive statistics for the two columns compared in this case
df %>% select(PMSA003, PMS7003) %>% skim()

#**Pearson correlation coefficient original**

df %>%
  get_correlation(formula = PMSA003 ~ PMS7003)

# Scatter plot of the pair with a least-squares line (no confidence band)
ggplot(df, aes(x = PMSA003, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 PMS7003",
       title = "Relationship between PMSA003 and PMS7003") +
  geom_smooth(method = "lm", se = FALSE)

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

# Observed value, fitted value and residual for each row of the fitted model
regression_points <- get_regression_points(score_model)
regression_points

#Caso 14: PMSA003 VS Oficial

# Descriptive statistics for the two columns compared in this case
df %>%
  select(PMSA003, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	14.61	15.18	0	2.81	9.11	22.5	89.3	▇▂▁▁▁
Oficial	0	1	15.66	11.85	0	7.00	13.00	22.0	78.0	▇▅▁▁▁

# Pearson correlation between the two columns
get_correlation(df, formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.891

# Scatter plot of the pair with a least-squares line (no confidence band)
ggplot(data = df, mapping = aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between PMSA003 and Oficial",
    x = "PM25 PMSA003",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on PMSA003
score_model <- lm(formula = Oficial ~ PMSA003, data = df)
# Coefficient table: estimates, standard errors and confidence bounds
score_model %>% get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    5.50      0.159      34.6       0    5.18      5.81
## 2 PMSA003      0.696     0.008      92.3       0    0.681     0.71

# Observed value, fitted value and residual for each row of the fitted model
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 2,213 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      23    30.7        26.9   -3.85 
##  2     2      18    21.6        20.5   -2.52 
##  3     3      26    29          25.7    0.331
##  4     4      24    31.6        27.5   -3.48 
##  5     5      30    38.1        32.0   -2.00 
##  6     6      38    41.7        34.5    3.50 
##  7     7      53    50.3        40.5   12.5  
##  8     8      54    54.4        43.3   10.7  
##  9     9      57    62.3        48.8    8.17 
## 10    10      60    69.8        54.0    5.95 
## # ... with 2,203 more rows

#Caso 15: PMS7003 VS Oficial

# Descriptive statistics for the two columns compared in this case
df %>%
  select(PMS7003, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMS7003	0	1	12.96	12.73	0	3.2	8.55	19.3	75.7	▇▂▁▁▁
Oficial	0	1	15.66	11.85	0	7.0	13.00	22.0	78.0	▇▅▁▁▁

# Pearson correlation between the two columns
df %>%
  get_correlation(formula = PMS7003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.894

# Scatter plot of the pair with a least-squares line (no confidence band).
# The title names PMS7003, the column actually mapped to the x axis.
ggplot(df, aes(x = PMS7003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMS7003", y = "PM25 Oficial",
       title = "Relationship between PMS7003 and Oficial") +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on PMS7003
score_model <- lm(Oficial ~ PMS7003, data = df)
# Coefficient table: estimates, standard errors and confidence bounds
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    4.87      0.161      30.2       0    4.56      5.19
## 2 PMS7003      0.832     0.009      93.8       0    0.815     0.85

# Observed value, fitted value and residual for each row of the fitted model
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 2,213 x 5
##       ID Oficial PMS7003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      23    26.7        27.1    -4.10
##  2     2      18    19.5        21.1    -3.10
##  3     3      26    25.6        26.2    -0.18
##  4     4      24    27.3        27.6    -3.60
##  5     5      30    32.8        32.2    -2.17
##  6     6      38    36.4        35.2     2.83
##  7     7      53    44.2        41.7    11.3 
##  8     8      54    47.9        44.7     9.26
##  9     9      57    54.9        50.6     6.43
## 10    10      60    60.5        55.2     4.77
## # ... with 2,203 more rows

Tunal-Canairios.R

DBB

2021-02-24