Kennedy-Canairios.R

library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(moderndive)
library(skimr)

# **1 hour late station ESTACION KENNEDY VS CANAIRIOS**
# **5 different sensors: PMS7003 & PMSA003 & HPMA115S0 & SPS30 & SNGCJA5**

# Load the consolidated Kennedy measurements workbook into `df`.
df <- read_excel("C:/Mediciones/KENNEDY_CONSOLIDADO_final_1h.xlsx")
# View() opens a GUI data viewer; only call it in an interactive session so
# the script can also be run non-interactively (Rscript, knitr).
if (interactive()) {
  View(df)
}

# Compact overview of every column: name, type and first values.
df %>%
  glimpse()

## Rows: 803
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ Fecha     <chr> "10-11-2020 24:00", "11-11-2020 01:00", "11-11-2020 02:00...
## $ Oficial   <dbl> 15.0, 19.0, 17.0, 11.0, 8.0, 7.0, 9.0, 20.0, 41.0, 24.0, ...
## $ PMS7003   <dbl> 6.51, 8.59, 10.40, 7.11, 4.76, 7.36, 11.40, 15.40, 15.90,...
## $ PMSA003   <dbl> 6.91, 9.69, 11.90, 7.80, 4.55, 7.64, 11.90, 18.10, 18.50,...
## $ HPMA115S0 <dbl> 4.75, 6.00, 6.64, 4.91, 3.13, 5.24, 5.74, 8.41, 7.45, 6.9...
## $ SPS30     <dbl> 4.51, 5.69, 6.51, 4.71, 3.27, 4.75, 6.02, 8.81, 8.69, 8.0...
## $ SNGCJA5   <dbl> 3.31, 4.46, 5.20, 3.45, 2.24, 3.84, 4.91, 7.37, 7.11, 6.5...

# Show 10 randomly chosen rows as a quick sanity check of the data.
sample_n(df, size = 10)

## # A tibble: 10 x 8
##      Num Fecha            Oficial PMS7003 PMSA003 HPMA115S0 SPS30 SNGCJA5
##    <dbl> <chr>              <dbl>   <dbl>   <dbl>     <dbl> <dbl>   <dbl>
##  1   734 11-12-2020 13:00      41   49      61.8      24.7  29.5    24.8 
##  2    72 13-11-2020 23:00      32   22.7    27.3      14.5  12.2    10.7 
##  3   306 23-11-2020 17:00      13   30.1    36.6      15.3  15.7    13.3 
##  4   477 30-11-2020 20:00      11   15.1    18.1       7.45  8.33    6.94
##  5   353 25-11-2020 16:00      10    7.05    7.67      5.72  4.56    3.58
##  6   574 04-12-2020 21:00      27   11.3    11.5       9.3   5.74    5.22
##  7   135 16-11-2020 14:00      24   20.9    26.3      10.5  12.5    10.4 
##  8   596 05-12-2020 19:00      27   19.2    22.5      11.9  10.5     8.91
##  9   736 11-12-2020 15:00      30   36.5    45.1      18.7  20.6    17.8 
## 10   324 24-11-2020 11:00      19   12.7    15         8.91  7.7     6.44

# Interactive line chart of all six PM2.5 series (five sensors plus Oficial)
# plotted against Num, built as a single pipeline.
fig <- plot_ly(
  df,
  x = ~Num,
  y = ~PMS7003,
  name = "PM2.5 PMS7003",
  type = "scatter",
  mode = "lines+markers"
) %>%
  add_trace(y = ~PMSA003, name = "PM2.5 PMSA003", mode = "lines+markers") %>%
  add_trace(y = ~HPMA115S0, name = "PM2.5 HPMA115S0", mode = "lines+markers") %>%
  add_trace(y = ~SPS30, name = "PM2.5 SPS30", mode = "lines+markers") %>%
  add_trace(y = ~SNGCJA5, name = "PM2.5 SNGCJA5", mode = "lines+markers") %>%
  add_trace(y = ~Oficial, name = "PM2.5 Oficial", mode = "lines+markers")
fig

#Caso 1: SNGCJA5 VS SPS30

# Summary statistics for the two series compared in this case.
df %>%
  select(SNGCJA5, SPS30) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.70	54.8	▇▃▁▁▁
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁

#**Pearson correlation coefficient original**

# Correlation coefficient between SNGCJA5 and SPS30.
get_correlation(df, formula = SNGCJA5 ~ SPS30)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.998

# Scatter plot of SPS30 against SNGCJA5 with a fitted straight line (no band).
ggplot(data = df, mapping = aes(x = SNGCJA5, y = SPS30)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and SPS30",
    x = "PM25 SNGCJA5",
    y = "PM25 SPS30"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of SPS30 on SNGCJA5.
score_model <- lm(formula = SPS30 ~ SNGCJA5, data = df)
# Coefficient table with standard errors, p-values and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.772     0.029      26.7       0    0.715    0.829
## 2 SNGCJA5      1.09      0.002     439.        0    1.08     1.09

# Per-observation fitted values and residuals from score_model.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 803 x 5
##       ID SPS30 SNGCJA5 SPS30_hat residual
##    <int> <dbl>   <dbl>     <dbl>    <dbl>
##  1     1  4.51    3.31      4.38    0.132
##  2     2  5.69    4.46      5.63    0.06 
##  3     3  6.51    5.2       6.44    0.074
##  4     4  4.71    3.45      4.53    0.18 
##  5     5  3.27    2.24      3.21    0.058
##  6     6  4.75    3.84      4.96   -0.205
##  7     7  6.02    4.91      6.12   -0.101
##  8     8  8.81    7.37      8.8     0.01 
##  9     9  8.69    7.11      8.52    0.173
## 10    10  8.06    6.57      7.93    0.131
## # ... with 793 more rows

#Caso 2: SNGCJA5 VS HPMA115S0

# Summary statistics for the two series compared in this case.
df %>%
  select(SNGCJA5, HPMA115S0) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.7	54.8	▇▃▁▁▁
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.70	15.4	64.6	▇▂▁▁▁

#**Pearson correlation coefficient original**

# Correlation coefficient between SNGCJA5 and HPMA115S0.
get_correlation(df, formula = SNGCJA5 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.968

# Scatter plot of HPMA115S0 against SNGCJA5 with a fitted straight line.
ggplot(data = df, mapping = aes(x = SNGCJA5, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and HPMA115S0",
    x = "PM25 SNGCJA5",
    y = "PM25 HPMA115S0"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of HPMA115S0 on SNGCJA5.
score_model <- lm(formula = HPMA115S0 ~ SNGCJA5, data = df)
# Coefficient table with standard errors, p-values and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     1.63     0.119      13.7       0     1.39     1.86
## 2 SNGCJA5       1.12     0.01      109.        0     1.1      1.14

# Per-observation fitted values and residuals from score_model.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 803 x 5
##       ID HPMA115S0 SNGCJA5 HPMA115S0_hat residual
##    <int>     <dbl>   <dbl>         <dbl>    <dbl>
##  1     1      4.75    3.31          5.34   -0.586
##  2     2      6       4.46          6.62   -0.624
##  3     3      6.64    5.2           7.45   -0.813
##  4     4      4.91    3.45          5.49   -0.582
##  5     5      3.13    2.24          4.14   -1.01 
##  6     6      5.24    3.84          5.93   -0.689
##  7     7      5.74    4.91          7.13   -1.39 
##  8     8      8.41    7.37          9.88   -1.48 
##  9     9      7.45    7.11          9.59   -2.14 
## 10    10      6.96    6.57          8.99   -2.03 
## # ... with 793 more rows

#Caso 3: SNGCJA5 VS PMSA003

# Summary statistics for the two series compared in this case.
df %>%
  select(SNGCJA5, PMSA003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.7	54.8	▇▃▁▁▁
PMSA003	0	1	24.25	17.10	0.84	11.70	20.90	32.9	138.0	▇▃▁▁▁

#**Pearson correlation coefficient original**

# Correlation coefficient between SNGCJA5 and PMSA003.
get_correlation(df, formula = SNGCJA5 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.997

# Scatter plot of PMSA003 against SNGCJA5 with a fitted straight line.
ggplot(data = df, mapping = aes(x = SNGCJA5, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and PMSA003",
    x = "PM25 SNGCJA5",
    y = "PM25 PMSA003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMSA003 on SNGCJA5.
score_model <- lm(formula = PMSA003 ~ SNGCJA5, data = df)
# Coefficient table with standard errors, p-values and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.997     0.089     -11.2       0    -1.17   -0.822
## 2 SNGCJA5      2.62      0.008     342.        0     2.60    2.63

# Per-observation fitted values and residuals from score_model.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 803 x 5
##       ID PMSA003 SNGCJA5 PMSA003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    6.91    3.31        7.67   -0.756
##  2     2    9.69    4.46       10.7    -0.986
##  3     3   11.9     5.2        12.6    -0.713
##  4     4    7.8     3.45        8.03   -0.232
##  5     5    4.55    2.24        4.87   -0.316
##  6     6    7.64    3.84        9.05   -1.41 
##  7     7   11.9     4.91       11.9     0.046
##  8     8   18.1     7.37       18.3    -0.192
##  9     9   18.5     7.11       17.6     0.888
## 10    10   16.8     6.57       16.2     0.602
## # ... with 793 more rows

#Caso 4: SNGCJA5 VS PMS7003

# Summary statistics for the two series compared in this case.
df %>%
  select(SNGCJA5, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.7	54.8	▇▃▁▁▁
PMS7003	0	1	20.51	14.00	1.00	10.50	17.70	27.2	116.0	▇▃▁▁▁

#**Pearson correlation coefficient original**

# Correlation coefficient between SNGCJA5 and PMS7003.
get_correlation(df, formula = SNGCJA5 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

# Scatter plot of PMS7003 against SNGCJA5 with a fitted straight line.
ggplot(data = df, mapping = aes(x = SNGCJA5, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and PMS7003",
    x = "PM25 SNGCJA5",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMS7003 on SNGCJA5.
score_model <- lm(formula = PMS7003 ~ SNGCJA5, data = df)
# Coefficient table with standard errors, p-values and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.116     0.089     -1.30   0.193   -0.291    0.059
## 2 SNGCJA5      2.14      0.008    279.     0        2.12     2.15

# Per-observation fitted values and residuals from score_model.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 803 x 5
##       ID PMS7003 SNGCJA5 PMS7003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    6.51    3.31        6.96   -0.453
##  2     2    8.59    4.46        9.42   -0.832
##  3     3   10.4     5.2        11.0    -0.605
##  4     4    7.11    3.45        7.26   -0.152
##  5     5    4.76    2.24        4.68    0.085
##  6     6    7.36    3.84        8.10   -0.737
##  7     7   11.4     4.91       10.4     1.01 
##  8     8   15.4     7.37       15.6    -0.246
##  9     9   15.9     7.11       15.1     0.81 
## 10    10   14.2     6.57       13.9     0.265
## # ... with 793 more rows

#Caso 5: SNGCJA5 VS Oficial

# Summary statistics for the two series compared in this case.
df %>%
  select(SNGCJA5, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.7	54.8	▇▃▁▁▁
Oficial	0	1	25.16	14.37	0.00	16.00	22.00	32.0	124.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

# Correlation coefficient between SNGCJA5 and Oficial.
get_correlation(df, formula = SNGCJA5 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.646

# Scatter plot of Oficial against SNGCJA5 with a fitted straight line.
ggplot(data = df, mapping = aes(x = SNGCJA5, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and Oficial",
    x = "PM25 SNGCJA5",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on SNGCJA5.
score_model <- lm(formula = Oficial ~ SNGCJA5, data = df)
# Coefficient table with standard errors, p-values and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    11.4      0.692      16.5       0    10.0     12.8 
## 2 SNGCJA5       1.43     0.059      24.0       0     1.31     1.54

# Per-observation fitted values and residuals from score_model.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 803 x 5
##       ID Oficial SNGCJA5 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      15    3.31        16.1    -1.12
##  2     2      19    4.46        17.8     1.24
##  3     3      17    5.2         18.8    -1.82
##  4     4      11    3.45        16.3    -5.32
##  5     5       8    2.24        14.6    -6.60
##  6     6       7    3.84        16.9    -9.88
##  7     7       9    4.91        18.4    -9.40
##  8     8      20    7.37        21.9    -1.91
##  9     9      41    7.11        21.5    19.5 
## 10    10      24    6.57        20.8     3.23
## # ... with 793 more rows

#Caso 6: SPS30 VS HPMA115S0

# Summary statistics for the two series compared in this case.
df %>%
  select(SPS30, HPMA115S0) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.70	15.40	64.6	▇▂▁▁▁

#**Pearson correlation coefficient original**

# Correlation coefficient between SPS30 and HPMA115S0.
get_correlation(df, formula = SPS30 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.957

# Scatter plot of HPMA115S0 against SPS30 with a fitted straight line.
ggplot(data = df, mapping = aes(x = SPS30, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and HPMA115S0",
    x = "PM25 SPS30",
    y = "PM25 HPMA115S0"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of HPMA115S0 on SPS30.
score_model <- lm(formula = HPMA115S0 ~ SPS30, data = df)
# Coefficient table with standard errors, p-values and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.992     0.145      6.84       0    0.707     1.28
## 2 SPS30        1.01      0.011     93.2        0    0.993     1.04

# Per-observation fitted values and residuals from score_model.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 803 x 5
##       ID HPMA115S0 SPS30 HPMA115S0_hat residual
##    <int>     <dbl> <dbl>         <dbl>    <dbl>
##  1     1      4.75  4.51          5.57   -0.817
##  2     2      6     5.69          6.76   -0.764
##  3     3      6.64  6.51          7.60   -0.956
##  4     4      4.91  4.71          5.77   -0.86 
##  5     5      3.13  3.27          4.31   -1.18 
##  6     6      5.24  4.75          5.81   -0.571
##  7     7      5.74  6.02          7.10   -1.36 
##  8     8      8.41  8.81          9.93   -1.52 
##  9     9      7.45  8.69          9.81   -2.36 
## 10    10      6.96  8.06          9.17   -2.21 
## # ... with 793 more rows

#Caso 7: SPS30 VS PMSA003

# Summary statistics for the two series compared in this case.
df %>%
  select(SPS30, PMSA003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁
PMSA003	0	1	24.25	17.10	0.84	11.70	20.90	32.90	138.0	▇▃▁▁▁

#**Pearson correlation coefficient original**

# Correlation coefficient between SPS30 and PMSA003.
get_correlation(df, formula = SPS30 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.996

# Scatter plot of PMSA003 against SPS30 with a fitted straight line.
ggplot(data = df, mapping = aes(x = SPS30, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and PMSA003",
    x = "PM25 SPS30",
    y = "PM25 PMSA003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMSA003 on SPS30.
score_model <- lm(formula = PMSA003 ~ SPS30, data = df)
# Coefficient table with standard errors, p-values and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.79     0.097     -28.8       0    -2.98    -2.60
## 2 SPS30         2.40     0.007     330.        0     2.38     2.41

# Per-observation fitted values and residuals from score_model.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 803 x 5
##       ID PMSA003 SPS30 PMSA003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    6.91  4.51        8.02   -1.11 
##  2     2    9.69  5.69       10.8    -1.16 
##  3     3   11.9   6.51       12.8    -0.916
##  4     4    7.8   4.71        8.50   -0.701
##  5     5    4.55  3.27        5.05   -0.499
##  6     6    7.64  4.75        8.60   -0.957
##  7     7   11.9   6.02       11.6     0.259
##  8     8   18.1   8.81       18.3    -0.229
##  9     9   18.5   8.69       18.0     0.458
## 10    10   16.8   8.06       16.5     0.269
## # ... with 793 more rows

#Caso 8: SPS30 VS PMS7003

# Summary statistics for the two series compared in this case.
df %>%
  select(SPS30, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁
PMS7003	0	1	20.51	14.00	1.00	10.50	17.70	27.20	116.0	▇▃▁▁▁

#**Pearson correlation coefficient original**

# Correlation coefficient between SPS30 and PMS7003.
get_correlation(df, formula = SPS30 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.992

# Scatter plot of PMS7003 against SPS30 with a fitted straight line.
ggplot(data = df, mapping = aes(x = SPS30, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and PMS7003",
    x = "PM25 SPS30",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMS7003 on SPS30.
score_model <- lm(formula = PMS7003 ~ SPS30, data = df)
# Coefficient table with standard errors, p-values and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.53     0.114     -13.4       0    -1.75    -1.30
## 2 SPS30         1.95     0.009     228.        0     1.94     1.97

# Per-observation fitted values and residuals from score_model.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 803 x 5
##       ID PMS7003 SPS30 PMS7003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    6.51  4.51        7.28   -0.774
##  2     2    8.59  5.69        9.59   -1    
##  3     3   10.4   6.51       11.2    -0.793
##  4     4    7.11  4.71        7.68   -0.565
##  5     5    4.76  3.27        4.86   -0.101
##  6     6    7.36  4.75        7.75   -0.393
##  7     7   11.4   6.02       10.2     1.16 
##  8     8   15.4   8.81       15.7    -0.287
##  9     9   15.9   8.69       15.5     0.447
## 10    10   14.2   8.06       14.2    -0.022
## # ... with 793 more rows

#Caso 9: SPS30 VS Oficial

# Summary statistics for the two series compared in this case.
df %>%
  select(SPS30, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁
Oficial	0	1	25.16	14.37	0.00	16.00	22.00	32.00	124.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

# Correlation coefficient between SPS30 and Oficial.
get_correlation(df, formula = SPS30 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.629

# Scatter plot of Oficial against SPS30 with a fitted straight line.
ggplot(data = df, mapping = aes(x = SPS30, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and Oficial",
    x = "PM25 SPS30",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on SPS30.
score_model <- lm(formula = Oficial ~ SPS30, data = df)
# Coefficient table with standard errors, p-values and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    10.8      0.74       14.6       0     9.36    12.3 
## 2 SPS30         1.27     0.056      22.9       0     1.16     1.38

# Per-observation fitted values and residuals from score_model.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 803 x 5
##       ID Oficial SPS30 Oficial_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1      15  4.51        16.5   -1.55 
##  2     2      19  5.69        18.0    0.953
##  3     3      17  6.51        19.1   -2.09 
##  4     4      11  4.71        16.8   -5.80 
##  5     5       8  3.27        15.0   -6.97 
##  6     6       7  4.75        16.9   -9.85 
##  7     7       9  6.02        18.5   -9.47 
##  8     8      20  8.81        22.0   -2.02 
##  9     9      41  8.69        21.9   19.1  
## 10    10      24  8.06        21.1    2.94 
## # ... with 793 more rows

#Caso 10: HPMA115S0 VS PMSA003

# Summary statistics for the two series compared in this case.
df %>%
  select(HPMA115S0, PMSA003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.7	15.4	64.6	▇▂▁▁▁
PMSA003	0	1	24.25	17.10	0.84	11.70	20.9	32.9	138.0	▇▃▁▁▁

#**Pearson correlation coefficient original**

# Correlation coefficient between HPMA115S0 and PMSA003.
get_correlation(df, formula = HPMA115S0 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.957

# Scatter plot of PMSA003 against HPMA115S0 with a fitted straight line.
ggplot(data = df, mapping = aes(x = HPMA115S0, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between HPMA115S0 and PMSA003",
    x = "PM25 HPMA115S0",
    y = "PM25 PMSA003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMSA003 on HPMA115S0.
score_model <- lm(formula = PMSA003 ~ HPMA115S0, data = df)
# Coefficient table with standard errors, p-values and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.76     0.337      -8.2       0    -3.43    -2.10
## 2 HPMA115S0     2.17     0.023      93.7       0     2.13     2.22

# Per-observation fitted values and residuals from score_model.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 803 x 5
##       ID PMSA003 HPMA115S0 PMSA003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    6.91      4.75        7.56   -0.645
##  2     2    9.69      6          10.3    -0.580
##  3     3   11.9       6.64       11.7     0.239
##  4     4    7.8       4.91        7.90   -0.102
##  5     5    4.55      3.13        4.04    0.514
##  6     6    7.64      5.24        8.62   -0.979
##  7     7   11.9       5.74        9.70    2.19 
##  8     8   18.1       8.41       15.5     2.59 
##  9     9   18.5       7.45       13.4     5.08 
## 10    10   16.8       6.96       12.4     4.44 
## # ... with 793 more rows

#Caso 11: HPMA115S0 VS PMS7003

# Summary statistics for the two series compared in this case.
df %>%
  select(HPMA115S0, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.7	15.4	64.6	▇▂▁▁▁
PMS7003	0	1	20.51	14.00	1.00	10.50	17.7	27.2	116.0	▇▃▁▁▁

#**Pearson correlation coefficient original**

# Correlation coefficient between HPMA115S0 and PMS7003.
get_correlation(df, formula = HPMA115S0 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.963

# Scatter plot of PMS7003 against HPMA115S0 with a fitted straight line.
ggplot(data = df, mapping = aes(x = HPMA115S0, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between HPMA115S0 and PMS7003",
    x = "PM25 HPMA115S0",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMS7003 on HPMA115S0.
score_model <- lm(formula = PMS7003 ~ HPMA115S0, data = df)
# Coefficient table with standard errors, p-values and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.72     0.258     -6.68       0    -2.23    -1.22
## 2 HPMA115S0     1.79     0.018    101.         0     1.75     1.82

# Per-observation fitted values and residuals from score_model.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 803 x 5
##       ID PMS7003 HPMA115S0 PMS7003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    6.51      4.75        6.77   -0.261
##  2     2    8.59      6           9.01   -0.416
##  3     3   10.4       6.64       10.2     0.249
##  4     4    7.11      4.91        7.06    0.053
##  5     5    4.76      3.13        3.87    0.886
##  6     6    7.36      5.24        7.65   -0.287
##  7     7   11.4       5.74        8.54    2.86 
##  8     8   15.4       8.41       13.3     2.08 
##  9     9   15.9       7.45       11.6     4.30 
## 10    10   14.2       6.96       10.7     3.48 
## # ... with 793 more rows

#Caso 12: HPMA115S0 VS Oficial

# Summary statistics for the two series compared in this case.
df %>%
  select(HPMA115S0, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.7	15.4	64.6	▇▂▁▁▁
Oficial	0	1	25.16	14.37	0.00	16.00	22.0	32.0	124.0	▇▅▁▁▁

#**Pearson correlation coefficient original**

# Correlation coefficient between HPMA115S0 and Oficial.
get_correlation(df, formula = HPMA115S0 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.654

# Scatter plot of Oficial against HPMA115S0 with a fitted straight line.
ggplot(data = df, mapping = aes(x = HPMA115S0, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between HPMA115S0 and Oficial",
    x = "PM25 HPMA115S0",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on HPMA115S0.
score_model <- lm(formula = Oficial ~ HPMA115S0, data = df)
# Coefficient table with standard errors, p-values and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     9.65     0.741      13.0       0     8.20    11.1 
## 2 HPMA115S0     1.25     0.051      24.5       0     1.15     1.35

# Per-observation fitted values and residuals from score_model.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 803 x 5
##       ID Oficial HPMA115S0 Oficial_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1      15      4.75        15.6   -0.575
##  2     2      19      6           17.1    1.87 
##  3     3      17      6.64        17.9   -0.931
##  4     4      11      4.91        15.8   -4.77 
##  5     5       8      3.13        13.6   -5.56 
##  6     6       7      5.24        16.2   -9.19 
##  7     7       9      5.74        16.8   -7.81 
##  8     8      20      8.41        20.1   -0.139
##  9     9      41      7.45        18.9   22.1  
## 10    10      24      6.96        18.3    5.67 
## # ... with 793 more rows

#Caso 13: PMSA003 VS PMS7003

# Summary statistics for the PMSA003 / PMS7003 pair named in this case's
# heading. The original selected Oficial here, duplicating the
# PMSA003-vs-Oficial case; re-run the script to refresh the printed output.
df %>%
  select(PMSA003, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	24.25	17.10	0.84	11.7	20.9	32.9	138	▇▃▁▁▁
Oficial	0	1	25.16	14.37	0.00	16.0	22.0	32.0	124	▇▅▁▁▁

#**Pearson correlation coefficient original**

# Correlation coefficient between PMSA003 and PMS7003, the pair named in this
# case's heading. The original correlated PMSA003 with Oficial instead;
# re-run the script to refresh the printed output.
df %>%
  get_correlation(formula = PMSA003 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.647

# Scatter plot of PMS7003 against PMSA003 with a fitted straight line, matching
# this case's heading. The original plotted Oficial on the y axis instead.
ggplot(df, aes(x = PMSA003, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(
    x = "PM25 PMSA003", y = "PM25 PMS7003",
    title = "Relationship between PMSA003 and PMS7003"
  ) +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMS7003 on PMSA003, the pair named in this case's
# heading. The original regressed Oficial on PMSA003 instead; re-run the
# script to refresh the printed output.
score_model <- lm(PMS7003 ~ PMSA003, data = df)
# Coefficient table with standard errors, p-values and confidence bounds.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   12.0       0.671      17.8       0     10.6   13.3  
## 2 PMSA003      0.544     0.023      24.0       0      0.5    0.588

# Per-observation fitted values and residuals from score_model.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 803 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      15    6.91        15.7   -0.725
##  2     2      19    9.69        17.2    1.76 
##  3     3      17   11.9         18.4   -1.44 
##  4     4      11    7.8         16.2   -5.21 
##  5     5       8    4.55        14.4   -6.44 
##  6     6       7    7.64        16.1   -9.12 
##  7     7       9   11.9         18.4   -9.44 
##  8     8      20   18.1         21.8   -1.81 
##  9     9      41   18.5         22.0   19.0  
## 10    10      24   16.8         21.1    2.90 
## # ... with 793 more rows

#Caso 14: PMSA003 VS Oficial

# Summary statistics for the two series compared in this case.
df %>%
  select(PMSA003, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	24.25	17.10	0.84	11.7	20.9	32.9	138	▇▃▁▁▁
Oficial	0	1	25.16	14.37	0.00	16.0	22.0	32.0	124	▇▅▁▁▁

#**Pearson correlation coefficient original**

# Correlation coefficient between PMSA003 and Oficial.
get_correlation(df, formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.647

# Scatter plot of Oficial against PMSA003 with a fitted straight line.
ggplot(data = df, mapping = aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between PMSA003 and Oficial",
    x = "PM25 PMSA003",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on PMSA003.
score_model <- lm(formula = Oficial ~ PMSA003, data = df)
# Coefficient table with standard errors, p-values and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   12.0       0.671      17.8       0     10.6   13.3  
## 2 PMSA003      0.544     0.023      24.0       0      0.5    0.588

# Per-observation fitted values and residuals from score_model.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 803 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      15    6.91        15.7   -0.725
##  2     2      19    9.69        17.2    1.76 
##  3     3      17   11.9         18.4   -1.44 
##  4     4      11    7.8         16.2   -5.21 
##  5     5       8    4.55        14.4   -6.44 
##  6     6       7    7.64        16.1   -9.12 
##  7     7       9   11.9         18.4   -9.44 
##  8     8      20   18.1         21.8   -1.81 
##  9     9      41   18.5         22.0   19.0  
## 10    10      24   16.8         21.1    2.90 
## # ... with 793 more rows

#Caso 15: PMS7003 VS Oficial

# Summary statistics for the two series compared in this case.
df %>%
  select(PMS7003, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMS7003	0	1	20.51	14.00	1	10.5	17.7	27.2	116	▇▃▁▁▁
Oficial	0	1	25.16	14.37	0	16.0	22.0	32.0	124	▇▅▁▁▁

#**Pearson correlation coefficient original**

# Correlation coefficient between PMS7003 and Oficial.
get_correlation(df, formula = PMS7003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.657

# Scatter plot of Oficial against PMS7003 with a fitted straight line.
# The title previously named PMSA003 (copy-paste slip) although the x axis
# is PMS7003; it now matches the plotted variable.
ggplot(df, aes(x = PMS7003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(
    x = "PM25 PMS7003", y = "PM25 Oficial",
    title = "Relationship between PMS7003 and Oficial"
  ) +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on PMS7003.
score_model <- lm(formula = Oficial ~ PMS7003, data = df)
# Coefficient table with standard errors, p-values and confidence bounds.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   11.3       0.679      16.7       0    9.99    12.7  
## 2 PMS7003      0.674     0.027      24.7       0    0.621    0.728

# Per-observation fitted values and residuals from score_model.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 803 x 5
##       ID Oficial PMS7003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      15    6.51        15.7   -0.712
##  2     2      19    8.59        17.1    1.88 
##  3     3      17   10.4         18.3   -1.34 
##  4     4      11    7.11        16.1   -5.12 
##  5     5       8    4.76        14.5   -6.53 
##  6     6       7    7.36        16.3   -9.28 
##  7     7       9   11.4         19.0  -10.0  
##  8     8      20   15.4         21.7   -1.71 
##  9     9      41   15.9         22.0   19.0  
## 10    10      24   14.2         20.9    3.10 
## # ... with 793 more rows

Kennedy-Canairios.R

DBB

2021-02-24