Kennedy-Canairios.R

library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(moderndive)
library(skimr)

# **2 hours late station ESTACION KENNEDY VS CANAIRIOS**
# **5 different sensors: PMS7003 & PMSA003 & HPMA115S0 & SPS30 & SNGCJA5**

df <- read_excel("C:/Mediciones/KENNEDY_CONSOLIDADO_final_2h.xlsx")
View(df)

glimpse(df)

## Rows: 803
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ Fecha     <chr> "10-11-2020 24:00", "11-11-2020 01:00", "11-11-2020 02:00...
## $ Oficial   <dbl> 19.0, 17.0, 11.0, 8.0, 7.0, 9.0, 20.0, 41.0, 24.0, 23.0, ...
## $ PMS7003   <dbl> 6.51, 8.59, 10.40, 7.11, 4.76, 7.36, 11.40, 15.40, 15.90,...
## $ PMSA003   <dbl> 6.91, 9.69, 11.90, 7.80, 4.55, 7.64, 11.90, 18.10, 18.50,...
## $ HPMA115S0 <dbl> 4.75, 6.00, 6.64, 4.91, 3.13, 5.24, 5.74, 8.41, 7.45, 6.9...
## $ SPS30     <dbl> 4.51, 5.69, 6.51, 4.71, 3.27, 4.75, 6.02, 8.81, 8.69, 8.0...
## $ SNGCJA5   <dbl> 3.31, 4.46, 5.20, 3.45, 2.24, 3.84, 4.91, 7.37, 7.11, 6.5...

df %>%
  sample_n(size = 10)

## # A tibble: 10 x 8
##      Num Fecha            Oficial PMS7003 PMSA003 HPMA115S0 SPS30 SNGCJA5
##    <dbl> <chr>              <dbl>   <dbl>   <dbl>     <dbl> <dbl>   <dbl>
##  1   676 09-12-2020 03:00      26    21.1    24.7     17    12.2    10.7 
##  2   652 08-12-2020 03:00      47    65.3    77.3     33.7  32.5    28.9 
##  3   743 11-12-2020 22:00      48    43.1    51.5     24    23      19.7 
##  4   733 11-12-2020 12:00      41    49.3    61.7     25.3  28.9    24.8 
##  5   250 21-11-2020 09:00      35    24      27.8     13.4  12      10.3 
##  6   472 30-11-2020 15:00      25    21.6    26.9     12.1  12.1    10   
##  7   661 08-12-2020 12:00      33    11.2    13.7      7.78  6.91    5.41
##  8   654 08-12-2020 05:00      42    45.9    55.3     24.4  23.2    20.3 
##  9   398 27-11-2020 13:00      20    27      34.2     14.3  16.3    13.6 
## 10    55 13-11-2020 06:00      17    12.3    13.5      5.91  6.47    5.26

fig <- plot_ly(df, x = ~Num, y = ~PMS7003, name = 'PM2.5 PMS7003', type = 'scatter', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~PMSA003, name = 'PM2.5 PMSA003', mode = 'lines+markers')
fig <- fig %>% add_trace(y = ~HPMA115S0, name = 'PM2.5 HPMA115S0', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~SPS30, name = 'PM2.5 SPS30', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~SNGCJA5, name = 'PM2.5 SNGCJA5', mode = 'lines+markers') 
fig <- fig %>% add_trace(y = ~Oficial, name = 'PM2.5 Oficial', mode = 'lines+markers')
fig

#Caso 1: SNGCJA5 VS SPS30

df %>% select(SNGCJA5, SPS30) %>% skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.70	54.8	▇▃▁▁▁
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ SPS30)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.998

ggplot(df, aes(x = SNGCJA5, y = SPS30)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 SPS30",
       title = "Relationship between SNGCJA5 and SPS30") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(SPS30 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.772     0.029      26.7       0    0.715    0.829
## 2 SNGCJA5      1.09      0.002     439.        0    1.08     1.09

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 803 x 5
##       ID SPS30 SNGCJA5 SPS30_hat residual
##    <int> <dbl>   <dbl>     <dbl>    <dbl>
##  1     1  4.51    3.31      4.38    0.132
##  2     2  5.69    4.46      5.63    0.06 
##  3     3  6.51    5.2       6.44    0.074
##  4     4  4.71    3.45      4.53    0.18 
##  5     5  3.27    2.24      3.21    0.058
##  6     6  4.75    3.84      4.96   -0.205
##  7     7  6.02    4.91      6.12   -0.101
##  8     8  8.81    7.37      8.8     0.01 
##  9     9  8.69    7.11      8.52    0.173
## 10    10  8.06    6.57      7.93    0.131
## # ... with 793 more rows

#Caso 2: SNGCJA5 VS HPMA115S0

df %>% select(SNGCJA5, HPMA115S0) %>% skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.7	54.8	▇▃▁▁▁
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.70	15.4	64.6	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.968

ggplot(df, aes(x = SNGCJA5, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 HPMA115S0",
       title = "Relationship between SNGCJA5 and HPMA115S0") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(HPMA115S0 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     1.63     0.119      13.7       0     1.39     1.86
## 2 SNGCJA5       1.12     0.01      109.        0     1.1      1.14

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 803 x 5
##       ID HPMA115S0 SNGCJA5 HPMA115S0_hat residual
##    <int>     <dbl>   <dbl>         <dbl>    <dbl>
##  1     1      4.75    3.31          5.34   -0.586
##  2     2      6       4.46          6.62   -0.624
##  3     3      6.64    5.2           7.45   -0.813
##  4     4      4.91    3.45          5.49   -0.582
##  5     5      3.13    2.24          4.14   -1.01 
##  6     6      5.24    3.84          5.93   -0.689
##  7     7      5.74    4.91          7.13   -1.39 
##  8     8      8.41    7.37          9.88   -1.48 
##  9     9      7.45    7.11          9.59   -2.14 
## 10    10      6.96    6.57          8.99   -2.03 
## # ... with 793 more rows

#Caso 3: SNGCJA5 VS PMSA003

df %>% select(SNGCJA5, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.7	54.8	▇▃▁▁▁
PMSA003	0	1	24.25	17.10	0.84	11.70	20.90	32.9	138.0	▇▃▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.997

ggplot(df, aes(x = SNGCJA5, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 PMSA003",
       title = "Relationship between SNGCJA5 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.997     0.089     -11.2       0    -1.17   -0.822
## 2 SNGCJA5      2.62      0.008     342.        0     2.60    2.63

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 803 x 5
##       ID PMSA003 SNGCJA5 PMSA003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    6.91    3.31        7.67   -0.756
##  2     2    9.69    4.46       10.7    -0.986
##  3     3   11.9     5.2        12.6    -0.713
##  4     4    7.8     3.45        8.03   -0.232
##  5     5    4.55    2.24        4.87   -0.316
##  6     6    7.64    3.84        9.05   -1.41 
##  7     7   11.9     4.91       11.9     0.046
##  8     8   18.1     7.37       18.3    -0.192
##  9     9   18.5     7.11       17.6     0.888
## 10    10   16.8     6.57       16.2     0.602
## # ... with 793 more rows

#Caso 4: SNGCJA5 VS PMS7003

df %>% select(SNGCJA5, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.7	54.8	▇▃▁▁▁
PMS7003	0	1	20.51	14.00	1.00	10.50	17.70	27.2	116.0	▇▃▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

ggplot(df, aes(x = SNGCJA5, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 PMS7003",
       title = "Relationship between SNGCJA5 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.116     0.089     -1.30   0.193   -0.291    0.059
## 2 SNGCJA5      2.14      0.008    279.     0        2.12     2.15

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 803 x 5
##       ID PMS7003 SNGCJA5 PMS7003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    6.51    3.31        6.96   -0.453
##  2     2    8.59    4.46        9.42   -0.832
##  3     3   10.4     5.2        11.0    -0.605
##  4     4    7.11    3.45        7.26   -0.152
##  5     5    4.76    2.24        4.68    0.085
##  6     6    7.36    3.84        8.10   -0.737
##  7     7   11.4     4.91       10.4     1.01 
##  8     8   15.4     7.37       15.6    -0.246
##  9     9   15.9     7.11       15.1     0.81 
## 10    10   14.2     6.57       13.9     0.265
## # ... with 793 more rows

#Caso 5: SNGCJA5 VS Oficial

df %>% select(SNGCJA5, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.7	54.8	▇▃▁▁▁
Oficial	0	1	25.18	14.37	0.00	16.00	22.00	32.5	124.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.786

ggplot(df, aes(x = SNGCJA5, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 Oficial",
       title = "Relationship between SNGCJA5 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     8.45     0.561      15.1       0     7.35     9.55
## 2 SNGCJA5       1.74     0.048      36.0       0     1.64     1.83

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 803 x 5
##       ID Oficial SNGCJA5 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      19    3.31        14.2    4.81 
##  2     2      17    4.46        16.2    0.811
##  3     3      11    5.2         17.5   -6.47 
##  4     4       8    3.45        14.4   -6.44 
##  5     5       7    2.24        12.3   -5.34 
##  6     6       9    3.84        15.1   -6.11 
##  7     7      20    4.91        17.0    3.03 
##  8     8      41    7.37        21.2   19.8  
##  9     9      24    7.11        20.8    3.21 
## 10    10      23    6.57        19.8    3.15 
## # ... with 793 more rows

#Caso 6: SPS30 VS HPMA115S0

df %>% select(SPS30, HPMA115S0) %>% skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.70	15.40	64.6	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.957

ggplot(df, aes(x = SPS30, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 HPMA115S0",
       title = "Relationship between SPS30 and HPMA115S0") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(HPMA115S0 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.992     0.145      6.84       0    0.707     1.28
## 2 SPS30        1.01      0.011     93.2        0    0.993     1.04

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 803 x 5
##       ID HPMA115S0 SPS30 HPMA115S0_hat residual
##    <int>     <dbl> <dbl>         <dbl>    <dbl>
##  1     1      4.75  4.51          5.57   -0.817
##  2     2      6     5.69          6.76   -0.764
##  3     3      6.64  6.51          7.60   -0.956
##  4     4      4.91  4.71          5.77   -0.86 
##  5     5      3.13  3.27          4.31   -1.18 
##  6     6      5.24  4.75          5.81   -0.571
##  7     7      5.74  6.02          7.10   -1.36 
##  8     8      8.41  8.81          9.93   -1.52 
##  9     9      7.45  8.69          9.81   -2.36 
## 10    10      6.96  8.06          9.17   -2.21 
## # ... with 793 more rows

#Caso 7: SPS30 VS PMSA003

df %>% select(SPS30, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁
PMSA003	0	1	24.25	17.10	0.84	11.70	20.90	32.90	138.0	▇▃▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.996

ggplot(df, aes(x = SPS30, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 PMSA003",
       title = "Relationship between SPS30 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.79     0.097     -28.8       0    -2.98    -2.60
## 2 SPS30         2.40     0.007     330.        0     2.38     2.41

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 803 x 5
##       ID PMSA003 SPS30 PMSA003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    6.91  4.51        8.02   -1.11 
##  2     2    9.69  5.69       10.8    -1.16 
##  3     3   11.9   6.51       12.8    -0.916
##  4     4    7.8   4.71        8.50   -0.701
##  5     5    4.55  3.27        5.05   -0.499
##  6     6    7.64  4.75        8.60   -0.957
##  7     7   11.9   6.02       11.6     0.259
##  8     8   18.1   8.81       18.3    -0.229
##  9     9   18.5   8.69       18.0     0.458
## 10    10   16.8   8.06       16.5     0.269
## # ... with 793 more rows

#Caso 8: SPS30 VS PMS7003

df %>% select(SPS30, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁
PMS7003	0	1	20.51	14.00	1.00	10.50	17.70	27.20	116.0	▇▃▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.992

ggplot(df, aes(x = SPS30, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 PMS7003",
       title = "Relationship between SPS30 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.53     0.114     -13.4       0    -1.75    -1.30
## 2 SPS30         1.95     0.009     228.        0     1.94     1.97

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 803 x 5
##       ID PMS7003 SPS30 PMS7003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    6.51  4.51        7.28   -0.774
##  2     2    8.59  5.69        9.59   -1    
##  3     3   10.4   6.51       11.2    -0.793
##  4     4    7.11  4.71        7.68   -0.565
##  5     5    4.76  3.27        4.86   -0.101
##  6     6    7.36  4.75        7.75   -0.393
##  7     7   11.4   6.02       10.2     1.16 
##  8     8   15.4   8.81       15.7    -0.287
##  9     9   15.9   8.69       15.5     0.447
## 10    10   14.2   8.06       14.2    -0.022
## # ... with 793 more rows

#Caso 9: SPS30 VS Oficial

df %>% select(SPS30, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁
Oficial	0	1	25.18	14.37	0.00	16.00	22.00	32.50	124.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.766

ggplot(df, aes(x = SPS30, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 Oficial",
       title = "Relationship between SPS30 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     7.71     0.612      12.6       0     6.51     8.91
## 2 SPS30         1.55     0.046      33.8       0     1.46     1.64

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 803 x 5
##       ID Oficial SPS30 Oficial_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1      19  4.51        14.7    4.30 
##  2     2      17  5.69        16.5    0.475
##  3     3      11  6.51        17.8   -6.80 
##  4     4       8  4.71        15.0   -7.01 
##  5     5       7  3.27        12.8   -5.78 
##  6     6       9  4.75        15.1   -6.07 
##  7     7      20  6.02        17.0    2.96 
##  8     8      41  8.81        21.4   19.6  
##  9     9      24  8.69        21.2    2.83 
## 10    10      23  8.06        20.2    2.80 
## # ... with 793 more rows

#Caso 10: HPMA115S0 VS PMSA003

df %>% select(HPMA115S0, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.7	15.4	64.6	▇▂▁▁▁
PMSA003	0	1	24.25	17.10	0.84	11.70	20.9	32.9	138.0	▇▃▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.957

ggplot(df, aes(x = HPMA115S0, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 PMSA003",
       title = "Relationship between HPMA115S0 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.76     0.337      -8.2       0    -3.43    -2.10
## 2 HPMA115S0     2.17     0.023      93.7       0     2.13     2.22

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 803 x 5
##       ID PMSA003 HPMA115S0 PMSA003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    6.91      4.75        7.56   -0.645
##  2     2    9.69      6          10.3    -0.580
##  3     3   11.9       6.64       11.7     0.239
##  4     4    7.8       4.91        7.90   -0.102
##  5     5    4.55      3.13        4.04    0.514
##  6     6    7.64      5.24        8.62   -0.979
##  7     7   11.9       5.74        9.70    2.19 
##  8     8   18.1       8.41       15.5     2.59 
##  9     9   18.5       7.45       13.4     5.08 
## 10    10   16.8       6.96       12.4     4.44 
## # ... with 793 more rows

#Caso 11: HPMA115S0 VS PMS7003

df %>% select(HPMA115S0, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.7	15.4	64.6	▇▂▁▁▁
PMS7003	0	1	20.51	14.00	1.00	10.50	17.7	27.2	116.0	▇▃▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.963

ggplot(df, aes(x = HPMA115S0, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 PMS7003",
       title = "Relationship between HPMA115S0 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.72     0.258     -6.68       0    -2.23    -1.22
## 2 HPMA115S0     1.79     0.018    101.         0     1.75     1.82

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 803 x 5
##       ID PMS7003 HPMA115S0 PMS7003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    6.51      4.75        6.77   -0.261
##  2     2    8.59      6           9.01   -0.416
##  3     3   10.4       6.64       10.2     0.249
##  4     4    7.11      4.91        7.06    0.053
##  5     5    4.76      3.13        3.87    0.886
##  6     6    7.36      5.24        7.65   -0.287
##  7     7   11.4       5.74        8.54    2.86 
##  8     8   15.4       8.41       13.3     2.08 
##  9     9   15.9       7.45       11.6     4.30 
## 10    10   14.2       6.96       10.7     3.48 
## # ... with 793 more rows

#Caso 10: HPMA115S0 VS Oficial

df %>% select(HPMA115S0, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.7	15.4	64.6	▇▂▁▁▁
Oficial	0	1	25.18	14.37	0.00	16.00	22.0	32.5	124.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.813

ggplot(df, aes(x = HPMA115S0, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 Oficial",
       title = "Relationship between HPMA115S0 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     5.91     0.571      10.4       0     4.79     7.03
## 2 HPMA115S0     1.55     0.039      39.5       0     1.47     1.63

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 803 x 5
##       ID Oficial HPMA115S0 Oficial_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1      19      4.75        13.3     5.72
##  2     2      17      6           15.2     1.79
##  3     3      11      6.64        16.2    -5.20
##  4     4       8      4.91        13.5    -5.52
##  5     5       7      3.13        10.8    -3.76
##  6     6       9      5.24        14.0    -5.04
##  7     7      20      5.74        14.8     5.19
##  8     8      41      8.41        18.9    22.1 
##  9     9      24      7.45        17.5     6.54
## 10    10      23      6.96        16.7     6.30
## # ... with 793 more rows

#Caso 11: PMSA003 VS PMS7003

df %>% select(PMSA003, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	24.25	17.10	0.84	11.7	20.9	32.9	138	▇▃▁▁▁
Oficial	0	1	25.18	14.37	0.00	16.0	22.0	32.5	124	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.785

ggplot(df, aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 Oficial",
       title = "Relationship between PMSA003 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     9.19     0.546      16.8       0    8.12    10.3  
## 2 PMSA003       0.66     0.018      35.9       0    0.624    0.696

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 803 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      19    6.91        13.8     5.25
##  2     2      17    9.69        15.6     1.42
##  3     3      11   11.9         17.0    -6.04
##  4     4       8    7.8         14.3    -6.34
##  5     5       7    4.55        12.2    -5.19
##  6     6       9    7.64        14.2    -5.23
##  7     7      20   11.9         17.0     2.96
##  8     8      41   18.1         21.1    19.9 
##  9     9      24   18.5         21.4     2.60
## 10    10      23   16.8         20.3     2.73
## # ... with 793 more rows

#Caso 12: PMSA003 VS Oficial

df %>% select(PMSA003, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	24.25	17.10	0.84	11.7	20.9	32.9	138	▇▃▁▁▁
Oficial	0	1	25.18	14.37	0.00	16.0	22.0	32.5	124	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.785

ggplot(df, aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 Oficial",
       title = "Relationship between PMSA003 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     9.19     0.546      16.8       0    8.12    10.3  
## 2 PMSA003       0.66     0.018      35.9       0    0.624    0.696

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 803 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      19    6.91        13.8     5.25
##  2     2      17    9.69        15.6     1.42
##  3     3      11   11.9         17.0    -6.04
##  4     4       8    7.8         14.3    -6.34
##  5     5       7    4.55        12.2    -5.19
##  6     6       9    7.64        14.2    -5.23
##  7     7      20   11.9         17.0     2.96
##  8     8      41   18.1         21.1    19.9 
##  9     9      24   18.5         21.4     2.60
## 10    10      23   16.8         20.3     2.73
## # ... with 793 more rows

#Caso 13: PMS7003 VS Oficial

df %>% select(PMS7003, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMS7003	0	1	20.51	14.00	1	10.5	17.7	27.2	116	▇▃▁▁▁
Oficial	0	1	25.18	14.37	0	16.0	22.0	32.5	124	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMS7003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.800

ggplot(df, aes(x = PMS7003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMS7003", y = "PM25 Oficial",
       title = "Relationship between PMSA003 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMS7003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    8.34      0.54       15.4       0    7.27     9.40 
## 2 PMS7003      0.822     0.022      37.8       0    0.779    0.864

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 803 x 5
##       ID Oficial PMS7003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      19    6.51        13.7     5.32
##  2     2      17    8.59        15.4     1.61
##  3     3      11   10.4         16.9    -5.88
##  4     4       8    7.11        14.2    -6.18
##  5     5       7    4.76        12.2    -5.24
##  6     6       9    7.36        14.4    -5.38
##  7     7      20   11.4         17.7     2.3 
##  8     8      41   15.4         21.0    20.0 
##  9     9      24   15.9         21.4     2.60
## 10    10      23   14.2         20.0     3.00
## # ... with 793 more rows

Kennedy-Canairios.R

DBB

2021-02-24