Tunal-Canairios.R

library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(moderndive)
library(skimr)

# **ESTACION TUNAL VS CANAIRIOS**
# **5 different sensors: PMS7003 & PMSA003 & HPMA115S0 & SPS30 & SNGCJA5**

df <- read_excel("C:/Mediciones/TUNAL_ESTACION_CANAIRIOS.xlsx")
View(df)

glimpse(df)

## Rows: 2,213
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ Fecha     <chr> "13-11-2020 24:00", "14-11-2020 01:00", "14-11-2020 02:00...
## $ Oficial   <dbl> 30, 23, 18, 26, 24, 30, 38, 53, 54, 57, 60, 61, 41, 26, 1...
## $ PMS7003   <dbl> 26.70, 19.50, 25.60, 27.30, 32.80, 36.40, 44.20, 47.90, 5...
## $ PMSA003   <dbl> 30.70, 21.60, 29.00, 31.60, 38.10, 41.70, 50.30, 54.40, 6...
## $ HPMA115S0 <dbl> 17.40, 14.00, 17.20, 18.60, 23.50, 25.10, 30.00, 32.50, 3...
## $ SPS30     <dbl> 15.90, 12.40, 15.70, 17.00, 21.00, 22.40, 27.40, 29.70, 3...
## $ SNGCJA5   <dbl> 14.40, 10.70, 13.80, 14.70, 18.50, 20.30, 24.80, 26.60, 2...

df %>%
  sample_n(size = 10)

## # A tibble: 10 x 8
##      Num Fecha            Oficial PMS7003 PMSA003 HPMA115S0 SPS30 SNGCJA5
##    <dbl> <chr>              <dbl>   <dbl>   <dbl>     <dbl> <dbl>   <dbl>
##  1  1390 11-01-2021 11:00       4  0.0219  0.0146      0     0.7   0.0146
##  2   463 03-12-2020 20:00      37 47.7    54.8        30.9  28.4  24.9   
##  3   167 20-11-2020 22:00      14  4.64    4.56        4.81  3.75  3.27  
##  4  1115 30-12-2020 24:00      22 15      17.1         0    10.4   8.24  
##  5  1689 23-01-2021 22:00       6  2.48    2.87        0     2.65  1.93  
##  6  1431 13-01-2021 04:00       1  4.64    4.27        0     3.61  3.01  
##  7    82 17-11-2020 09:00      36 27.4    32.2        17.2  16.5  13.4   
##  8   423 02-12-2020 04:00      13 10.4    10.9         7.61  6.44  5.95  
##  9  1303 07-01-2021 20:00       6  3.67    3.4         0     3.13  2.35  
## 10   532 06-12-2020 17:00       1  2.17    1.62        2.67  2.25  1.51

fig <- plot_ly(df, x = ~Num, y = ~PMS7003, name = 'PM2.5 PMS7003', type = 'scatter', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~PMSA003, name = 'PM2.5 PMSA003', mode = 'lines+markers')
fig <- fig %>% add_trace(y = ~HPMA115S0, name = 'PM2.5 HPMA115S0', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~SPS30, name = 'PM2.5 SPS30', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~SNGCJA5, name = 'PM2.5 SNGCJA5', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~Oficial, name = 'PM2.5 Oficial', mode = 'lines+markers')
fig

#Caso 1: SNGCJA5 VS SPS30

df %>% select(SNGCJA5, SPS30) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.97	6.35	0.00	2.08	4.93	10.3	38.6	▇▃▁▁▁
SPS30	0	1	8.46	7.30	0.01	2.98	5.94	12.0	43.4	▇▃▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ SPS30)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

ggplot(df, aes(x = SNGCJA5, y = SPS30)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 SPS30",
       title = "Relationship between SNGCJA5 and SPS30") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(SPS30 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.478     0.023      21.2       0    0.434    0.522
## 2 SNGCJA5      1.14      0.002     479.        0    1.14     1.15

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID SPS30 SNGCJA5 SPS30_hat residual
##    <int> <dbl>   <dbl>     <dbl>    <dbl>
##  1     1  15.9    14.4      17.0   -1.06 
##  2     2  12.4    10.7      12.7   -0.324
##  3     3  15.7    13.8      16.3   -0.572
##  4     4  17      14.7      17.3   -0.302
##  5     5  21      18.5      21.7   -0.651
##  6     6  22.4    20.3      23.7   -1.31 
##  7     7  27.4    24.8      28.9   -1.46 
##  8     8  29.7    26.6      30.9   -1.22 
##  9     9  34.9    28.7      33.3    1.58 
## 10    10  39.9    32.4      37.6    2.34 
## # ... with 2,203 more rows

#Caso 2: SNGCJA5 VS HPMA115S0

df %>% select(SNGCJA5, HPMA115S0) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.97	6.35	0	2.08	4.93	10.30	38.6	▇▃▁▁▁
HPMA115S0	0	1	4.47	7.39	0	0.00	0.00	5.58	43.4	▇▁▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.607

ggplot(df, aes(x = SNGCJA5, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 HPMA115S0",
       title = "Relationship between SNGCJA5 and HPMA115S0") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(HPMA115S0 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.448     0.186     -2.41   0.016   -0.812   -0.084
## 2 SNGCJA5      0.706     0.02      35.9    0        0.667    0.745

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID HPMA115S0 SNGCJA5 HPMA115S0_hat residual
##    <int>     <dbl>   <dbl>         <dbl>    <dbl>
##  1     1      17.4    14.4          9.72     7.68
##  2     2      14      10.7          7.11     6.89
##  3     3      17.2    13.8          9.30     7.90
##  4     4      18.6    14.7          9.93     8.67
##  5     5      23.5    18.5         12.6     10.9 
##  6     6      25.1    20.3         13.9     11.2 
##  7     7      30      24.8         17.1     12.9 
##  8     8      32.5    26.6         18.3     14.2 
##  9     9      37.1    28.7         19.8     17.3 
## 10    10      42.1    32.4         22.4     19.7 
## # ... with 2,203 more rows

#Caso 3: SNGCJA5 VS PMSA003

df %>% select(SNGCJA5, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.97	6.35	0	2.08	4.93	10.3	38.6	▇▃▁▁▁
PMSA003	0	1	14.61	15.18	0	2.81	9.11	22.5	89.3	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

ggplot(df, aes(x = SNGCJA5, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 PMSA003",
       title = "Relationship between SNGCJA5 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.97     0.048     -40.8       0    -2.07    -1.88
## 2 SNGCJA5       2.38     0.005     464.        0     2.37     2.39

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID PMSA003 SNGCJA5 PMSA003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    30.7    14.4        32.3    -1.58
##  2     2    21.6    10.7        23.5    -1.88
##  3     3    29      13.8        30.8    -1.85
##  4     4    31.6    14.7        33.0    -1.39
##  5     5    38.1    18.5        42.0    -3.93
##  6     6    41.7    20.3        46.3    -4.61
##  7     7    50.3    24.8        57.0    -6.71
##  8     8    54.4    26.6        61.3    -6.89
##  9     9    62.3    28.7        66.3    -3.99
## 10    10    69.8    32.4        75.1    -5.29
## # ... with 2,203 more rows

#Caso 4: SNGCJA5 VS PMS7003

df %>% select(SNGCJA5, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.97	6.35	0	2.08	4.93	10.3	38.6	▇▃▁▁▁
PMS7003	0	1	12.96	12.73	0	3.20	8.55	19.3	75.7	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.996

ggplot(df, aes(x = SNGCJA5, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 PMS7003",
       title = "Relationship between SNGCJA5 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -0.96     0.036     -26.5       0    -1.03   -0.889
## 2 SNGCJA5       2.00     0.004     521.        0     1.99    2.00

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID PMS7003 SNGCJA5 PMS7003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    26.7    14.4        27.8   -1.09 
##  2     2    19.5    10.7        20.4   -0.901
##  3     3    25.6    13.8        26.6   -0.989
##  4     4    27.3    14.7        28.4   -1.09 
##  5     5    32.8    18.5        36.0   -3.17 
##  6     6    36.4    20.3        39.6   -3.16 
##  7     7    44.2    24.8        48.5   -4.35 
##  8     8    47.9    26.6        52.1   -4.24 
##  9     9    54.9    28.7        56.3   -1.43 
## 10    10    60.5    32.4        63.7   -3.22 
## # ... with 2,203 more rows

#Caso 5: SNGCJA5 VS Oficial

df %>% select(SNGCJA5, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.97	6.35	0	2.08	4.93	10.3	38.6	▇▃▁▁▁
Oficial	0	1	15.66	11.85	0	7.00	13.00	22.0	78.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.740

ggplot(df, aes(x = SNGCJA5, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 Oficial",
       title = "Relationship between SNGCJA5 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     6.04     0.252      24.0       0     5.54     6.53
## 2 SNGCJA5       1.38     0.027      51.7       0     1.33     1.43

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID Oficial SNGCJA5 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      30    14.4        25.9    4.09 
##  2     2      23    10.7        20.8    2.20 
##  3     3      18    13.8        25.1   -7.08 
##  4     4      26    14.7        26.3   -0.324
##  5     5      24    18.5        31.6   -7.57 
##  6     6      30    20.3        34.1   -4.05 
##  7     7      38    24.8        40.3   -2.26 
##  8     8      53    26.6        42.7   10.3  
##  9     9      54    28.7        45.6    8.35 
## 10    10      57    32.4        50.8    6.25 
## # ... with 2,203 more rows

#Caso 6: SPS30 VS HPMA115S0

df %>% select(SPS30, HPMA115S0) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	8.46	7.30	0.01	2.98	5.94	12.00	43.4	▇▃▁▁▁
HPMA115S0	0	1	4.47	7.39	0.00	0.00	0.00	5.58	43.4	▇▁▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.608

ggplot(df, aes(x = SPS30, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 HPMA115S0",
       title = "Relationship between SPS30 and HPMA115S0") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(HPMA115S0 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.731     0.191     -3.83       0   -1.10    -0.356
## 2 SPS30        0.615     0.017     36.0        0    0.582    0.649

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID HPMA115S0 SPS30 HPMA115S0_hat residual
##    <int>     <dbl> <dbl>         <dbl>    <dbl>
##  1     1      17.4  15.9          9.06     8.35
##  2     2      14    12.4          6.90     7.10
##  3     3      17.2  15.7          8.93     8.27
##  4     4      18.6  17            9.73     8.87
##  5     5      23.5  21           12.2     11.3 
##  6     6      25.1  22.4         13.1     12.0 
##  7     7      30    27.4         16.1     13.9 
##  8     8      32.5  29.7         17.5     15.0 
##  9     9      37.1  34.9         20.7     16.4 
## 10    10      42.1  39.9         23.8     18.3 
## # ... with 2,203 more rows

#Caso 7: SPS30 VS PMSA003

df %>% select(SPS30, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	8.46	7.30	0.01	2.98	5.94	12.0	43.4	▇▃▁▁▁
PMSA003	0	1	14.61	15.18	0.00	2.81	9.11	22.5	89.3	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

ggplot(df, aes(x = SPS30, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 PMSA003",
       title = "Relationship between SPS30 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.88     0.05      -57.6       0    -2.98    -2.78
## 2 SPS30         2.07     0.004     462.        0     2.06     2.08

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID PMSA003 SPS30 PMSA003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    30.7  15.9        30.0    0.697
##  2     2    21.6  12.4        22.8   -1.16 
##  3     3    29    15.7        29.6   -0.589
##  4     4    31.6  17          32.3   -0.678
##  5     5    38.1  21          40.6   -2.45 
##  6     6    41.7  22.4        43.4   -1.75 
##  7     7    50.3  27.4        53.8   -3.49 
##  8     8    54.4  29.7        58.5   -4.14 
##  9     9    62.3  34.9        69.3   -7.00 
## 10    10    69.8  39.9        79.6   -9.84 
## # ... with 2,203 more rows

#Caso 8: SPS30 VS PMS7003

df %>% select(SPS30, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	8.46	7.30	0.01	2.98	5.94	12.0	43.4	▇▃▁▁▁
PMS7003	0	1	12.96	12.73	0.00	3.20	8.55	19.3	75.7	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

ggplot(df, aes(x = SPS30, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 PMS7003",
       title = "Relationship between SPS30 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.71     0.041     -41.6       0    -1.79    -1.63
## 2 SPS30         1.73     0.004     471.        0     1.73     1.74

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID PMS7003 SPS30 PMS7003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    26.7  15.9        25.9    0.833
##  2     2    19.5  12.4        19.8   -0.297
##  3     3    25.6  15.7        25.5    0.08 
##  4     4    27.3  17          27.8   -0.475
##  5     5    32.8  21          34.7   -1.91 
##  6     6    36.4  22.4        37.1   -0.741
##  7     7    44.2  27.4        45.8   -1.61 
##  8     8    47.9  29.7        49.8   -1.90 
##  9     9    54.9  34.9        58.8   -3.92 
## 10    10    60.5  39.9        67.5   -6.99 
## # ... with 2,203 more rows

#Caso 9: SPS30 VS Oficial

df %>% select(SPS30, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	8.46	7.30	0.01	2.98	5.94	12	43.4	▇▃▁▁▁
Oficial	0	1	15.66	11.85	0.00	7.00	13.00	22	78.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.730

ggplot(df, aes(x = SPS30, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 Oficial",
       title = "Relationship between SPS30 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     5.64     0.263      21.4       0     5.12     6.15
## 2 SPS30         1.18     0.024      50.3       0     1.14     1.23

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID Oficial SPS30 Oficial_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1      30  15.9        24.5    5.52 
##  2     2      23  12.4        20.3    2.67 
##  3     3      18  15.7        24.2   -6.24 
##  4     4      26  17          25.8    0.216
##  5     5      24  21          30.5   -6.52 
##  6     6      30  22.4        32.2   -2.18 
##  7     7      38  27.4        38.1   -0.11 
##  8     8      53  29.7        40.8   12.2  
##  9     9      54  34.9        47.0    7.00 
## 10    10      57  39.9        52.9    4.08 
## # ... with 2,203 more rows

#Caso 10: HPMA115S0 VS PMSA003

df %>% select(HPMA115S0, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	4.47	7.39	0	0.00	0.00	5.58	43.4	▇▁▁▁▁
PMSA003	0	1	14.61	15.18	0	2.81	9.11	22.50	89.3	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.602

ggplot(df, aes(x = HPMA115S0, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 PMSA003",
       title = "Relationship between HPMA115S0 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     9.08     0.301      30.1       0     8.49     9.67
## 2 HPMA115S0     1.24     0.035      35.4       0     1.17     1.30

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID PMSA003 HPMA115S0 PMSA003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    30.7      17.4        30.6    0.113
##  2     2    21.6      14          26.4   -4.78 
##  3     3    29        17.2        30.3   -1.34 
##  4     4    31.6      18.6        32.1   -0.47 
##  5     5    38.1      23.5        38.1   -0.026
##  6     6    41.7      25.1        40.1    1.60 
##  7     7    50.3      30          46.2    4.14 
##  8     8    54.4      32.5        49.2    5.15 
##  9     9    62.3      37.1        54.9    7.36 
## 10    10    69.8      42.1        61.1    8.68 
## # ... with 2,203 more rows

#Caso 11: HPMA115S0 VS PMS7003

df %>% select(HPMA115S0, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	4.47	7.39	0	0.0	0.00	5.58	43.4	▇▁▁▁▁
PMS7003	0	1	12.96	12.73	0	3.2	8.55	19.30	75.7	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.615

ggplot(df, aes(x = HPMA115S0, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 PMS7003",
       title = "Relationship between HPMA115S0 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     8.22     0.25       32.9       0     7.74     8.71
## 2 HPMA115S0     1.06     0.029      36.6       0     1.00     1.12

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID PMS7003 HPMA115S0 PMS7003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    26.7      17.4        26.6    0.062
##  2     2    19.5      14          23.0   -3.54 
##  3     3    25.6      17.2        26.4   -0.826
##  4     4    27.3      18.6        27.9   -0.608
##  5     5    32.8      23.5        33.1   -0.293
##  6     6    36.4      25.1        34.8    1.61 
##  7     7    44.2      30          40.0    4.23 
##  8     8    47.9      32.5        42.6    5.28 
##  9     9    54.9      37.1        47.5    7.42 
## 10    10    60.5      42.1        52.8    7.72 
## # ... with 2,203 more rows

#Caso 10: HPMA115S0 VS Oficial

df %>% select(HPMA115S0, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	4.47	7.39	0	0	0	5.58	43.4	▇▁▁▁▁
Oficial	0	1	15.66	11.85	0	7	13	22.00	78.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.388

ggplot(df, aes(x = HPMA115S0, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 Oficial",
       title = "Relationship between HPMA115S0 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   12.9       0.272      47.4       0    12.3    13.4  
## 2 HPMA115S0    0.621     0.031      19.8       0     0.56    0.683

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID Oficial HPMA115S0 Oficial_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1      30      17.4        23.7     6.31
##  2     2      23      14          21.6     1.42
##  3     3      18      17.2        23.6    -5.57
##  4     4      26      18.6        24.4     1.56
##  5     5      24      23.5        27.5    -3.48
##  6     6      30      25.1        28.5     1.52
##  7     7      38      30          31.5     6.48
##  8     8      53      32.5        33.1    19.9 
##  9     9      54      37.1        35.9    18.1 
## 10    10      57      42.1        39.0    18.0 
## # ... with 2,203 more rows

#Caso 11: PMSA003 VS PMS7003

df %>% select(PMSA003, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	14.61	15.18	0	2.81	9.11	22.5	89.3	▇▂▁▁▁
Oficial	0	1	15.66	11.85	0	7.00	13.00	22.0	78.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.743

ggplot(df, aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 Oficial",
       title = "Relationship between PMSA003 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    7.18      0.234      30.7       0    6.72     7.64 
## 2 PMSA003      0.580     0.011      52.3       0    0.559    0.602

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      30    30.7        25.0    5.00 
##  2     2      23    21.6        19.7    3.28 
##  3     3      18    29          24.0   -6.01 
##  4     4      26    31.6        25.5    0.481
##  5     5      24    38.1        29.3   -5.29 
##  6     6      30    41.7        31.4   -1.38 
##  7     7      38    50.3        36.4    1.63 
##  8     8      53    54.4        38.8   14.2  
##  9     9      54    62.3        43.3   10.7  
## 10    10      57    69.8        47.7    9.31 
## # ... with 2,203 more rows

#Caso 12: PMSA003 VS Oficial

df %>% select(PMSA003, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	14.61	15.18	0	2.81	9.11	22.5	89.3	▇▂▁▁▁
Oficial	0	1	15.66	11.85	0	7.00	13.00	22.0	78.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.743

ggplot(df, aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 Oficial",
       title = "Relationship between PMSA003 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    7.18      0.234      30.7       0    6.72     7.64 
## 2 PMSA003      0.580     0.011      52.3       0    0.559    0.602

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      30    30.7        25.0    5.00 
##  2     2      23    21.6        19.7    3.28 
##  3     3      18    29          24.0   -6.01 
##  4     4      26    31.6        25.5    0.481
##  5     5      24    38.1        29.3   -5.29 
##  6     6      30    41.7        31.4   -1.38 
##  7     7      38    50.3        36.4    1.63 
##  8     8      53    54.4        38.8   14.2  
##  9     9      54    62.3        43.3   10.7  
## 10    10      57    69.8        47.7    9.31 
## # ... with 2,203 more rows

#Caso 13: PMS7003 VS Oficial

df %>% select(PMS7003, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMS7003	0	1	12.96	12.73	0	3.2	8.55	19.3	75.7	▇▂▁▁▁
Oficial	0	1	15.66	11.85	0	7.0	13.00	22.0	78.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMS7003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.743

ggplot(df, aes(x = PMS7003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMS7003", y = "PM25 Oficial",
       title = "Relationship between PMSA003 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMS7003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    6.69      0.241      27.8       0    6.22     7.17 
## 2 PMS7003      0.692     0.013      52.2       0    0.666    0.718

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID Oficial PMS7003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      30    26.7        25.2    4.84 
##  2     2      23    19.5        20.2    2.82 
##  3     3      18    25.6        24.4   -6.40 
##  4     4      26    27.3        25.6    0.42 
##  5     5      24    32.8        29.4   -5.38 
##  6     6      30    36.4        31.9   -1.88 
##  7     7      38    44.2        37.3    0.729
##  8     8      53    47.9        39.8   13.2  
##  9     9      54    54.9        44.7    9.33 
## 10    10      57    60.5        48.5    8.45 
## # ... with 2,203 more rows

Tunal-Canairios.R

DBB

2021-02-24