Tunal-Canairios.R

library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(moderndive)
library(skimr)

# **2 hours late station ESTACION TUNAL VS CANAIRIOS**
# **5 different sensors: PMS7003 & PMSA003 & HPMA115S0 & SPS30 & SNGCJA5**

df <- read_excel("C:/Mediciones/TUNAL_ESTACION_CANAIRIOS_2h.xlsx")
View(df)

glimpse(df)

## Rows: 2,213
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ Fecha     <chr> "13-11-2020 24:00", "14-11-2020 01:00", "14-11-2020 02:00...
## $ Oficial   <dbl> 18, 26, 24, 30, 38, 53, 54, 57, 60, 61, 41, 26, 18, 17, 2...
## $ PMS7003   <dbl> 26.70, 19.50, 25.60, 27.30, 32.80, 36.40, 44.20, 47.90, 5...
## $ PMSA003   <dbl> 30.70, 21.60, 29.00, 31.60, 38.10, 41.70, 50.30, 54.40, 6...
## $ HPMA115S0 <dbl> 17.40, 14.00, 17.20, 18.60, 23.50, 25.10, 30.00, 32.50, 3...
## $ SPS30     <dbl> 15.90, 12.40, 15.70, 17.00, 21.00, 22.40, 27.40, 29.70, 3...
## $ SNGCJA5   <dbl> 14.40, 10.70, 13.80, 14.70, 18.50, 20.30, 24.80, 26.60, 2...

df %>%
  sample_n(size = 10)

## # A tibble: 10 x 8
##      Num Fecha            Oficial PMS7003 PMSA003 HPMA115S0 SPS30 SNGCJA5
##    <dbl> <chr>              <dbl>   <dbl>   <dbl>     <dbl> <dbl>   <dbl>
##  1  1597 20-01-2021 02:00       8    2.39    1.82      0     2.55    1.98
##  2   257 25-11-2020 06:00      12    4.18    4.03      3.89  3.19    2.51
##  3    60 16-11-2020 11:00      17    6.68    6.95      5.75  5.24    3.72
##  4  2228 15-02-2021 09:00      35   33      38.5       3.92 21.7    17.7 
##  5  2089 09-02-2021 14:00      39   40.6    48.6       0    26      20.6 
##  6   875 20-12-2020 24:00       7    3.64    2.75      4.05  3.12    2.29
##  7   591 09-12-2020 04:00       8    4.74    4.62      4.43  3.91    3.25
##  8  1304 07-01-2021 21:00      13    5.05    5.15      0     4.11    3.36
##  9  2248 16-02-2021 05:00      53   30.2    34.2       0    17.1    16.7 
## 10  1781 27-01-2021 18:00      12    7.95    8.16      0     5.7     4.84

fig <- plot_ly(df, x = ~Num, y = ~PMS7003, name = 'PM2.5 PMS7003', type = 'scatter', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~PMSA003, name = 'PM2.5 PMSA003', mode = 'lines+markers')
fig <- fig %>% add_trace(y = ~HPMA115S0, name = 'PM2.5 HPMA115S0', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~SPS30, name = 'PM2.5 SPS30', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~SNGCJA5, name = 'PM2.5 SNGCJA5', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~Oficial, name = 'PM2.5 Oficial', mode = 'lines+markers')
fig

#Caso 1: SNGCJA5 VS SPS30

df %>% select(SNGCJA5, SPS30) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.97	6.35	0.00	2.08	4.93	10.3	38.6	▇▃▁▁▁
SPS30	0	1	8.46	7.30	0.01	2.98	5.94	12.0	43.4	▇▃▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ SPS30)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

ggplot(df, aes(x = SNGCJA5, y = SPS30)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 SPS30",
       title = "Relationship between SNGCJA5 and SPS30") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(SPS30 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.478     0.023      21.2       0    0.434    0.522
## 2 SNGCJA5      1.14      0.002     479.        0    1.14     1.15

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID SPS30 SNGCJA5 SPS30_hat residual
##    <int> <dbl>   <dbl>     <dbl>    <dbl>
##  1     1  15.9    14.4      17.0   -1.06 
##  2     2  12.4    10.7      12.7   -0.324
##  3     3  15.7    13.8      16.3   -0.572
##  4     4  17      14.7      17.3   -0.302
##  5     5  21      18.5      21.7   -0.651
##  6     6  22.4    20.3      23.7   -1.31 
##  7     7  27.4    24.8      28.9   -1.46 
##  8     8  29.7    26.6      30.9   -1.22 
##  9     9  34.9    28.7      33.3    1.58 
## 10    10  39.9    32.4      37.6    2.34 
## # ... with 2,203 more rows

#Caso 2: SNGCJA5 VS HPMA115S0

df %>% select(SNGCJA5, HPMA115S0) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.97	6.35	0	2.08	4.93	10.30	38.6	▇▃▁▁▁
HPMA115S0	0	1	4.47	7.39	0	0.00	0.00	5.58	43.4	▇▁▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.607

ggplot(df, aes(x = SNGCJA5, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 HPMA115S0",
       title = "Relationship between SNGCJA5 and HPMA115S0") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(HPMA115S0 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.448     0.186     -2.41   0.016   -0.812   -0.084
## 2 SNGCJA5      0.706     0.02      35.9    0        0.667    0.745

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID HPMA115S0 SNGCJA5 HPMA115S0_hat residual
##    <int>     <dbl>   <dbl>         <dbl>    <dbl>
##  1     1      17.4    14.4          9.72     7.68
##  2     2      14      10.7          7.11     6.89
##  3     3      17.2    13.8          9.30     7.90
##  4     4      18.6    14.7          9.93     8.67
##  5     5      23.5    18.5         12.6     10.9 
##  6     6      25.1    20.3         13.9     11.2 
##  7     7      30      24.8         17.1     12.9 
##  8     8      32.5    26.6         18.3     14.2 
##  9     9      37.1    28.7         19.8     17.3 
## 10    10      42.1    32.4         22.4     19.7 
## # ... with 2,203 more rows

#Caso 3: SNGCJA5 VS PMSA003

df %>% select(SNGCJA5, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.97	6.35	0	2.08	4.93	10.3	38.6	▇▃▁▁▁
PMSA003	0	1	14.61	15.18	0	2.81	9.11	22.5	89.3	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

ggplot(df, aes(x = SNGCJA5, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 PMSA003",
       title = "Relationship between SNGCJA5 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.97     0.048     -40.8       0    -2.07    -1.88
## 2 SNGCJA5       2.38     0.005     464.        0     2.37     2.39

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID PMSA003 SNGCJA5 PMSA003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    30.7    14.4        32.3    -1.58
##  2     2    21.6    10.7        23.5    -1.88
##  3     3    29      13.8        30.8    -1.85
##  4     4    31.6    14.7        33.0    -1.39
##  5     5    38.1    18.5        42.0    -3.93
##  6     6    41.7    20.3        46.3    -4.61
##  7     7    50.3    24.8        57.0    -6.71
##  8     8    54.4    26.6        61.3    -6.89
##  9     9    62.3    28.7        66.3    -3.99
## 10    10    69.8    32.4        75.1    -5.29
## # ... with 2,203 more rows

#Caso 4: SNGCJA5 VS PMS7003

df %>% select(SNGCJA5, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.97	6.35	0	2.08	4.93	10.3	38.6	▇▃▁▁▁
PMS7003	0	1	12.96	12.73	0	3.20	8.55	19.3	75.7	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.996

ggplot(df, aes(x = SNGCJA5, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 PMS7003",
       title = "Relationship between SNGCJA5 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -0.96     0.036     -26.5       0    -1.03   -0.889
## 2 SNGCJA5       2.00     0.004     521.        0     1.99    2.00

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID PMS7003 SNGCJA5 PMS7003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    26.7    14.4        27.8   -1.09 
##  2     2    19.5    10.7        20.4   -0.901
##  3     3    25.6    13.8        26.6   -0.989
##  4     4    27.3    14.7        28.4   -1.09 
##  5     5    32.8    18.5        36.0   -3.17 
##  6     6    36.4    20.3        39.6   -3.16 
##  7     7    44.2    24.8        48.5   -4.35 
##  8     8    47.9    26.6        52.1   -4.24 
##  9     9    54.9    28.7        56.3   -1.43 
## 10    10    60.5    32.4        63.7   -3.22 
## # ... with 2,203 more rows

#Caso 5: SNGCJA5 VS Oficial

df %>% select(SNGCJA5, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.97	6.35	0	2.08	4.93	10.3	38.6	▇▃▁▁▁
Oficial	0	1	15.66	11.85	0	7.00	13.00	22.0	78.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.704

ggplot(df, aes(x = SNGCJA5, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 Oficial",
       title = "Relationship between SNGCJA5 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     6.50     0.266      24.4       0     5.98     7.02
## 2 SNGCJA5       1.31     0.028      46.6       0     1.26     1.37

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID Oficial SNGCJA5 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      18    14.4        25.4   -7.41 
##  2     2      26    10.7        20.6    5.45 
##  3     3      24    13.8        24.6   -0.625
##  4     4      30    14.7        25.8    4.19 
##  5     5      38    18.5        30.8    7.20 
##  6     6      53    20.3        33.2   19.8  
##  7     7      54    24.8        39.1   14.9  
##  8     8      57    26.6        41.4   15.6  
##  9     9      60    28.7        44.2   15.8  
## 10    10      61    32.4        49.1   11.9  
## # ... with 2,203 more rows

#Caso 6: SPS30 VS HPMA115S0

df %>% select(SPS30, HPMA115S0) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	8.46	7.30	0.01	2.98	5.94	12.00	43.4	▇▃▁▁▁
HPMA115S0	0	1	4.47	7.39	0.00	0.00	0.00	5.58	43.4	▇▁▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.608

ggplot(df, aes(x = SPS30, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 HPMA115S0",
       title = "Relationship between SPS30 and HPMA115S0") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(HPMA115S0 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.731     0.191     -3.83       0   -1.10    -0.356
## 2 SPS30        0.615     0.017     36.0        0    0.582    0.649

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID HPMA115S0 SPS30 HPMA115S0_hat residual
##    <int>     <dbl> <dbl>         <dbl>    <dbl>
##  1     1      17.4  15.9          9.06     8.35
##  2     2      14    12.4          6.90     7.10
##  3     3      17.2  15.7          8.93     8.27
##  4     4      18.6  17            9.73     8.87
##  5     5      23.5  21           12.2     11.3 
##  6     6      25.1  22.4         13.1     12.0 
##  7     7      30    27.4         16.1     13.9 
##  8     8      32.5  29.7         17.5     15.0 
##  9     9      37.1  34.9         20.7     16.4 
## 10    10      42.1  39.9         23.8     18.3 
## # ... with 2,203 more rows

#Caso 7: SPS30 VS PMSA003

df %>% select(SPS30, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	8.46	7.30	0.01	2.98	5.94	12.0	43.4	▇▃▁▁▁
PMSA003	0	1	14.61	15.18	0.00	2.81	9.11	22.5	89.3	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

ggplot(df, aes(x = SPS30, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 PMSA003",
       title = "Relationship between SPS30 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.88     0.05      -57.6       0    -2.98    -2.78
## 2 SPS30         2.07     0.004     462.        0     2.06     2.08

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID PMSA003 SPS30 PMSA003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    30.7  15.9        30.0    0.697
##  2     2    21.6  12.4        22.8   -1.16 
##  3     3    29    15.7        29.6   -0.589
##  4     4    31.6  17          32.3   -0.678
##  5     5    38.1  21          40.6   -2.45 
##  6     6    41.7  22.4        43.4   -1.75 
##  7     7    50.3  27.4        53.8   -3.49 
##  8     8    54.4  29.7        58.5   -4.14 
##  9     9    62.3  34.9        69.3   -7.00 
## 10    10    69.8  39.9        79.6   -9.84 
## # ... with 2,203 more rows

#Caso 8: SPS30 VS PMS7003

df %>% select(SPS30, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	8.46	7.30	0.01	2.98	5.94	12.0	43.4	▇▃▁▁▁
PMS7003	0	1	12.96	12.73	0.00	3.20	8.55	19.3	75.7	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

ggplot(df, aes(x = SPS30, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 PMS7003",
       title = "Relationship between SPS30 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.71     0.041     -41.6       0    -1.79    -1.63
## 2 SPS30         1.73     0.004     471.        0     1.73     1.74

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID PMS7003 SPS30 PMS7003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    26.7  15.9        25.9    0.833
##  2     2    19.5  12.4        19.8   -0.297
##  3     3    25.6  15.7        25.5    0.08 
##  4     4    27.3  17          27.8   -0.475
##  5     5    32.8  21          34.7   -1.91 
##  6     6    36.4  22.4        37.1   -0.741
##  7     7    44.2  27.4        45.8   -1.61 
##  8     8    47.9  29.7        49.8   -1.90 
##  9     9    54.9  34.9        58.8   -3.92 
## 10    10    60.5  39.9        67.5   -6.99 
## # ... with 2,203 more rows

#Caso 9: SPS30 VS Oficial

df %>% select(SPS30, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	8.46	7.30	0.01	2.98	5.94	12	43.4	▇▃▁▁▁
Oficial	0	1	15.66	11.85	0.00	7.00	13.00	22	78.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.683

ggplot(df, aes(x = SPS30, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 Oficial",
       title = "Relationship between SPS30 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     6.29     0.282      22.3       0     5.74     6.84
## 2 SPS30         1.11     0.025      43.9       0     1.06     1.16

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID Oficial SPS30 Oficial_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1      18  15.9        23.9   -5.90 
##  2     2      26  12.4        20.0    5.98 
##  3     3      24  15.7        23.7    0.323
##  4     4      30  17          25.1    4.88 
##  5     5      38  21          29.5    8.45 
##  6     6      53  22.4        31.1   21.9  
##  7     7      54  27.4        36.6   17.4  
##  8     8      57  29.7        39.2   17.8  
##  9     9      60  34.9        44.9   15.1  
## 10    10      61  39.9        50.5   10.5  
## # ... with 2,203 more rows

#Caso 10: HPMA115S0 VS PMSA003

df %>% select(HPMA115S0, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	4.47	7.39	0	0.00	0.00	5.58	43.4	▇▁▁▁▁
PMSA003	0	1	14.61	15.18	0	2.81	9.11	22.50	89.3	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.602

ggplot(df, aes(x = HPMA115S0, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 PMSA003",
       title = "Relationship between HPMA115S0 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     9.08     0.301      30.1       0     8.49     9.67
## 2 HPMA115S0     1.24     0.035      35.4       0     1.17     1.30

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID PMSA003 HPMA115S0 PMSA003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    30.7      17.4        30.6    0.113
##  2     2    21.6      14          26.4   -4.78 
##  3     3    29        17.2        30.3   -1.34 
##  4     4    31.6      18.6        32.1   -0.47 
##  5     5    38.1      23.5        38.1   -0.026
##  6     6    41.7      25.1        40.1    1.60 
##  7     7    50.3      30          46.2    4.14 
##  8     8    54.4      32.5        49.2    5.15 
##  9     9    62.3      37.1        54.9    7.36 
## 10    10    69.8      42.1        61.1    8.68 
## # ... with 2,203 more rows

#Caso 11: HPMA115S0 VS PMS7003

df %>% select(HPMA115S0, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	4.47	7.39	0	0.0	0.00	5.58	43.4	▇▁▁▁▁
PMS7003	0	1	12.96	12.73	0	3.2	8.55	19.30	75.7	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.615

ggplot(df, aes(x = HPMA115S0, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 PMS7003",
       title = "Relationship between HPMA115S0 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     8.22     0.25       32.9       0     7.74     8.71
## 2 HPMA115S0     1.06     0.029      36.6       0     1.00     1.12

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID PMS7003 HPMA115S0 PMS7003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    26.7      17.4        26.6    0.062
##  2     2    19.5      14          23.0   -3.54 
##  3     3    25.6      17.2        26.4   -0.826
##  4     4    27.3      18.6        27.9   -0.608
##  5     5    32.8      23.5        33.1   -0.293
##  6     6    36.4      25.1        34.8    1.61 
##  7     7    44.2      30          40.0    4.23 
##  8     8    47.9      32.5        42.6    5.28 
##  9     9    54.9      37.1        47.5    7.42 
## 10    10    60.5      42.1        52.8    7.72 
## # ... with 2,203 more rows

#Caso 10: HPMA115S0 VS Oficial

df %>% select(HPMA115S0, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	4.47	7.39	0	0	0	5.58	43.4	▇▁▁▁▁
Oficial	0	1	15.66	11.85	0	7	13	22.00	78.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.355

ggplot(df, aes(x = HPMA115S0, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 Oficial",
       title = "Relationship between HPMA115S0 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   13.1       0.275      47.6       0   12.6     13.6  
## 2 HPMA115S0    0.569     0.032      17.9       0    0.507    0.632

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID Oficial HPMA115S0 Oficial_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1      18      17.4        23.0    -5.01
##  2     2      26      14          21.1     4.92
##  3     3      24      17.2        22.9     1.10
##  4     4      30      18.6        23.7     6.30
##  5     5      38      23.5        26.5    11.5 
##  6     6      53      25.1        27.4    25.6 
##  7     7      54      30          30.2    23.8 
##  8     8      57      32.5        31.6    25.4 
##  9     9      60      37.1        34.2    25.8 
## 10    10      61      42.1        37.1    23.9 
## # ... with 2,203 more rows

#Caso 11: PMSA003 VS PMS7003

df %>% select(PMSA003, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	14.61	15.18	0	2.81	9.11	22.5	89.3	▇▂▁▁▁
Oficial	0	1	15.66	11.85	0	7.00	13.00	22.0	78.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.691

ggplot(df, aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 Oficial",
       title = "Relationship between PMSA003 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    7.77      0.253      30.8       0    7.28     8.27 
## 2 PMSA003      0.539     0.012      45.0       0    0.516    0.563

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      18    30.7        24.3   -6.34 
##  2     2      26    21.6        19.4    6.57 
##  3     3      24    29          23.4    0.581
##  4     4      30    31.6        24.8    5.18 
##  5     5      38    38.1        28.3    9.67 
##  6     6      53    41.7        30.3   22.7  
##  7     7      54    50.3        34.9   19.1  
##  8     8      57    54.4        37.1   19.9  
##  9     9      60    62.3        41.4   18.6  
## 10    10      61    69.8        45.4   15.6  
## # ... with 2,203 more rows

#Caso 12: PMSA003 VS Oficial

df %>% select(PMSA003, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	14.61	15.18	0	2.81	9.11	22.5	89.3	▇▂▁▁▁
Oficial	0	1	15.66	11.85	0	7.00	13.00	22.0	78.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.691

ggplot(df, aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 Oficial",
       title = "Relationship between PMSA003 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    7.77      0.253      30.8       0    7.28     8.27 
## 2 PMSA003      0.539     0.012      45.0       0    0.516    0.563

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      18    30.7        24.3   -6.34 
##  2     2      26    21.6        19.4    6.57 
##  3     3      24    29          23.4    0.581
##  4     4      30    31.6        24.8    5.18 
##  5     5      38    38.1        28.3    9.67 
##  6     6      53    41.7        30.3   22.7  
##  7     7      54    50.3        34.9   19.1  
##  8     8      57    54.4        37.1   19.9  
##  9     9      60    62.3        41.4   18.6  
## 10    10      61    69.8        45.4   15.6  
## # ... with 2,203 more rows

#Caso 13: PMS7003 VS Oficial

df %>% select(PMS7003, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	2213
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMS7003	0	1	12.96	12.73	0	3.2	8.55	19.3	75.7	▇▂▁▁▁
Oficial	0	1	15.66	11.85	0	7.0	13.00	22.0	78.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMS7003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.694

ggplot(df, aes(x = PMS7003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMS7003", y = "PM25 Oficial",
       title = "Relationship between PMSA003 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMS7003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    7.28      0.259      28.1       0    6.77     7.79 
## 2 PMS7003      0.646     0.014      45.4       0    0.618    0.674

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 2,213 x 5
##       ID Oficial PMS7003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      18    26.7        24.5   -6.54 
##  2     2      26    19.5        19.9    6.12 
##  3     3      24    25.6        23.8    0.174
##  4     4      30    27.3        24.9    5.08 
##  5     5      38    32.8        28.5    9.52 
##  6     6      53    36.4        30.8   22.2  
##  7     7      54    44.2        35.8   18.2  
##  8     8      57    47.9        38.2   18.8  
##  9     9      60    54.9        42.8   17.2  
## 10    10      61    60.5        46.4   14.6  
## # ... with 2,203 more rows

Tunal-Canairios.R

DBB

2021-02-24