Kennedy-Canairios.R

library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(moderndive)
library(skimr)

# **ESTACION KENNEDY VS CANAIRIOS**
# **5 different sensors: PMS7003 & PMSA003 & HPMA115S0 & SPS30 & SNGCJA5**

# Load the consolidated Kennedy measurements workbook into `df`.
df <- read_excel("C:/Mediciones/KENNEDY_CONSOLIDADO_final.xlsx")

# The spreadsheet-style viewer is only useful (and only reliably available)
# in an interactive session, so skip it when the script is run in batch mode.
if (interactive()) {
  View(df)
}

# Compact overview of column names, types and first values.
glimpse(df)

## Rows: 803
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ Fecha     <chr> "10-11-2020 24:00", "11-11-2020 01:00", "11-11-2020 02:00...
## $ Oficial   <dbl> 22.0, 15.0, 19.0, 17.0, 11.0, 8.0, 7.0, 9.0, 20.0, 41.0, ...
## $ PMS7003   <dbl> 6.51, 8.59, 10.40, 7.11, 4.76, 7.36, 11.40, 15.40, 15.90,...
## $ PMSA003   <dbl> 6.91, 9.69, 11.90, 7.80, 4.55, 7.64, 11.90, 18.10, 18.50,...
## $ HPMA115S0 <dbl> 4.75, 6.00, 6.64, 4.91, 3.13, 5.24, 5.74, 8.41, 7.45, 6.9...
## $ SPS30     <dbl> 4.51, 5.69, 6.51, 4.71, 3.27, 4.75, 6.02, 8.81, 8.69, 8.0...
## $ SNGCJA5   <dbl> 3.31, 4.46, 5.20, 3.45, 2.24, 3.84, 4.91, 7.37, 7.11, 6.5...

# Show 10 randomly chosen rows as a quick sanity check of the data.
# slice_sample() is the current dplyr replacement for the superseded
# sample_n().
df %>%
  slice_sample(n = 10)

## # A tibble: 10 x 8
##      Num Fecha            Oficial PMS7003 PMSA003 HPMA115S0 SPS30 SNGCJA5
##    <dbl> <chr>              <dbl>   <dbl>   <dbl>     <dbl> <dbl>   <dbl>
##  1   223 20-11-2020 06:00      47   35.5    42.7      23.7  18.7    17.1 
##  2   374 26-11-2020 13:00      20    9.22   11         9.76  6.48    5.21
##  3   382 26-11-2020 21:00      12   28.4    34.3      16.9  15.3    13.9 
##  4   207 19-11-2020 14:00      24   15.7    18.8       9.45  9.2     7.57
##  5   603 06-12-2020 02:00      15   11.1    12.2       8.97  6.34    5.33
##  6   385 26-11-2020 24:00      19   22.3    27.2      11.8  12.2    10.5 
##  7   791 13-12-2020 22:00      24    6.35    6.87      6.72  4.15    3.15
##  8   178 18-11-2020 09:00      20   11.3    13.2       7.38  7.23    5.79
##  9   291 23-11-2020 02:00      33   35.7    43.6      19.6  19.5    17   
## 10   145 16-11-2020 24:00       7    3.43    3.31      2.64  2.88    1.59

# Interactive overview: one lines+markers trace per sensor column plus the
# Oficial column, all plotted against the Num column.
fig <- plot_ly(
  df,
  x = ~Num,
  y = ~PMS7003,
  name = "PM2.5 PMS7003",
  type = "scatter",
  mode = "lines+markers"
) %>%
  add_trace(y = ~PMSA003, name = "PM2.5 PMSA003", mode = "lines+markers") %>%
  add_trace(y = ~HPMA115S0, name = "PM2.5 HPMA115S0", mode = "lines+markers") %>%
  add_trace(y = ~SPS30, name = "PM2.5 SPS30", mode = "lines+markers") %>%
  add_trace(y = ~SNGCJA5, name = "PM2.5 SNGCJA5", mode = "lines+markers") %>%
  add_trace(y = ~Oficial, name = "PM2.5 Oficial", mode = "lines+markers")
fig

# Case 1: SNGCJA5 vs SPS30

# Descriptive statistics (skim) for the two columns compared in this case.
select(df, SNGCJA5, SPS30) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.70	54.8	▇▃▁▁▁
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁

# Pearson correlation between SNGCJA5 and SPS30, computed on df as loaded.
get_correlation(df, formula = SNGCJA5 ~ SPS30)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.998

# Scatter plot of SPS30 against SNGCJA5 with a straight-line (lm) fit and
# no confidence band.
ggplot(data = df, mapping = aes(x = SNGCJA5, y = SPS30)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and SPS30",
    x = "PM25 SNGCJA5",
    y = "PM25 SPS30"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with SPS30 as response and SNGCJA5 as predictor.
score_model <- lm(SPS30 ~ SNGCJA5, data = df)

# Coefficient estimates, standard errors and confidence bounds.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.772     0.029      26.7       0    0.715    0.829
## 2 SNGCJA5      1.09      0.002     439.        0    1.08     1.09

# Per-row observed value, fitted value and residual; the outer parentheses
# print the tibble right after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID SPS30 SNGCJA5 SPS30_hat residual
##    <int> <dbl>   <dbl>     <dbl>    <dbl>
##  1     1  4.51    3.31      4.38    0.132
##  2     2  5.69    4.46      5.63    0.06 
##  3     3  6.51    5.2       6.44    0.074
##  4     4  4.71    3.45      4.53    0.18 
##  5     5  3.27    2.24      3.21    0.058
##  6     6  4.75    3.84      4.96   -0.205
##  7     7  6.02    4.91      6.12   -0.101
##  8     8  8.81    7.37      8.8     0.01 
##  9     9  8.69    7.11      8.52    0.173
## 10    10  8.06    6.57      7.93    0.131
## # ... with 793 more rows

# Case 2: SNGCJA5 vs HPMA115S0

# Descriptive statistics (skim) for the two columns compared in this case.
select(df, SNGCJA5, HPMA115S0) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.7	54.8	▇▃▁▁▁
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.70	15.4	64.6	▇▂▁▁▁

# Pearson correlation between SNGCJA5 and HPMA115S0, computed on df as loaded.
get_correlation(df, formula = SNGCJA5 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.968

# Scatter plot of HPMA115S0 against SNGCJA5 with a straight-line (lm) fit
# and no confidence band.
ggplot(data = df, mapping = aes(x = SNGCJA5, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and HPMA115S0",
    x = "PM25 SNGCJA5",
    y = "PM25 HPMA115S0"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with HPMA115S0 as response and SNGCJA5 as
# predictor.
score_model <- lm(HPMA115S0 ~ SNGCJA5, data = df)

# Coefficient estimates, standard errors and confidence bounds.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     1.63     0.119      13.7       0     1.39     1.86
## 2 SNGCJA5       1.12     0.01      109.        0     1.1      1.14

# Per-row observed value, fitted value and residual; the outer parentheses
# print the tibble right after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID HPMA115S0 SNGCJA5 HPMA115S0_hat residual
##    <int>     <dbl>   <dbl>         <dbl>    <dbl>
##  1     1      4.75    3.31          5.34   -0.586
##  2     2      6       4.46          6.62   -0.624
##  3     3      6.64    5.2           7.45   -0.813
##  4     4      4.91    3.45          5.49   -0.582
##  5     5      3.13    2.24          4.14   -1.01 
##  6     6      5.24    3.84          5.93   -0.689
##  7     7      5.74    4.91          7.13   -1.39 
##  8     8      8.41    7.37          9.88   -1.48 
##  9     9      7.45    7.11          9.59   -2.14 
## 10    10      6.96    6.57          8.99   -2.03 
## # ... with 793 more rows

# Case 3: SNGCJA5 vs PMSA003

# Descriptive statistics (skim) for the two columns compared in this case.
select(df, SNGCJA5, PMSA003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.7	54.8	▇▃▁▁▁
PMSA003	0	1	24.25	17.10	0.84	11.70	20.90	32.9	138.0	▇▃▁▁▁

# Pearson correlation between SNGCJA5 and PMSA003, computed on df as loaded.
get_correlation(df, formula = SNGCJA5 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.997

# Scatter plot of PMSA003 against SNGCJA5 with a straight-line (lm) fit and
# no confidence band.
ggplot(data = df, mapping = aes(x = SNGCJA5, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and PMSA003",
    x = "PM25 SNGCJA5",
    y = "PM25 PMSA003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with PMSA003 as response and SNGCJA5 as predictor.
score_model <- lm(PMSA003 ~ SNGCJA5, data = df)

# Coefficient estimates, standard errors and confidence bounds.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.997     0.089     -11.2       0    -1.17   -0.822
## 2 SNGCJA5      2.62      0.008     342.        0     2.60    2.63

# Per-row observed value, fitted value and residual; the outer parentheses
# print the tibble right after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID PMSA003 SNGCJA5 PMSA003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    6.91    3.31        7.67   -0.756
##  2     2    9.69    4.46       10.7    -0.986
##  3     3   11.9     5.2        12.6    -0.713
##  4     4    7.8     3.45        8.03   -0.232
##  5     5    4.55    2.24        4.87   -0.316
##  6     6    7.64    3.84        9.05   -1.41 
##  7     7   11.9     4.91       11.9     0.046
##  8     8   18.1     7.37       18.3    -0.192
##  9     9   18.5     7.11       17.6     0.888
## 10    10   16.8     6.57       16.2     0.602
## # ... with 793 more rows

# Case 4: SNGCJA5 vs PMS7003

# Descriptive statistics (skim) for the two columns compared in this case.
select(df, SNGCJA5, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.7	54.8	▇▃▁▁▁
PMS7003	0	1	20.51	14.00	1.00	10.50	17.70	27.2	116.0	▇▃▁▁▁

# Pearson correlation between SNGCJA5 and PMS7003, computed on df as loaded.
get_correlation(df, formula = SNGCJA5 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

# Scatter plot of PMS7003 against SNGCJA5 with a straight-line (lm) fit and
# no confidence band.
ggplot(data = df, mapping = aes(x = SNGCJA5, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and PMS7003",
    x = "PM25 SNGCJA5",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with PMS7003 as response and SNGCJA5 as predictor.
score_model <- lm(PMS7003 ~ SNGCJA5, data = df)

# Coefficient estimates, standard errors and confidence bounds.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.116     0.089     -1.30   0.193   -0.291    0.059
## 2 SNGCJA5      2.14      0.008    279.     0        2.12     2.15

# Per-row observed value, fitted value and residual; the outer parentheses
# print the tibble right after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID PMS7003 SNGCJA5 PMS7003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    6.51    3.31        6.96   -0.453
##  2     2    8.59    4.46        9.42   -0.832
##  3     3   10.4     5.2        11.0    -0.605
##  4     4    7.11    3.45        7.26   -0.152
##  5     5    4.76    2.24        4.68    0.085
##  6     6    7.36    3.84        8.10   -0.737
##  7     7   11.4     4.91       10.4     1.01 
##  8     8   15.4     7.37       15.6    -0.246
##  9     9   15.9     7.11       15.1     0.81 
## 10    10   14.2     6.57       13.9     0.265
## # ... with 793 more rows

# Case 5: SNGCJA5 vs Oficial

# Descriptive statistics (skim) for the two columns compared in this case.
select(df, SNGCJA5, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	9.64	6.51	0.43	5.04	8.36	12.7	54.8	▇▃▁▁▁
Oficial	0	1	25.14	14.37	0.00	16.00	22.00	32.0	124.0	▇▅▁▁▁

# Pearson correlation between SNGCJA5 and Oficial, computed on df as loaded.
get_correlation(df, formula = SNGCJA5 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.519

# Scatter plot of Oficial against SNGCJA5 with a straight-line (lm) fit and
# no confidence band.
ggplot(data = df, mapping = aes(x = SNGCJA5, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and Oficial",
    x = "PM25 SNGCJA5",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with Oficial as response and SNGCJA5 as predictor.
score_model <- lm(Oficial ~ SNGCJA5, data = df)

# Coefficient estimates, standard errors and confidence bounds.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    14.1      0.775      18.2       0    12.6     15.6 
## 2 SNGCJA5       1.15     0.067      17.2       0     1.01     1.28

# Per-row observed value, fitted value and residual; the outer parentheses
# print the tibble right after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial SNGCJA5 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      22    3.31        17.9     4.11
##  2     2      15    4.46        19.2    -4.20
##  3     3      19    5.2         20.1    -1.05
##  4     4      17    3.45        18.0    -1.05
##  5     5      11    2.24        16.7    -5.66
##  6     6       8    3.84        18.5   -10.5 
##  7     7       7    4.91        19.7   -12.7 
##  8     8       9    7.37        22.5   -13.5 
##  9     9      20    7.11        22.2    -2.24
## 10    10      41    6.57        21.6    19.4 
## # ... with 793 more rows

# Case 6: SPS30 vs HPMA115S0

# Descriptive statistics (skim) for the two columns compared in this case.
select(df, SPS30, HPMA115S0) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.70	15.40	64.6	▇▂▁▁▁

# Pearson correlation between SPS30 and HPMA115S0, computed on df as loaded.
get_correlation(df, formula = SPS30 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.957

# Scatter plot of HPMA115S0 against SPS30 with a straight-line (lm) fit and
# no confidence band.
ggplot(data = df, mapping = aes(x = SPS30, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and HPMA115S0",
    x = "PM25 SPS30",
    y = "PM25 HPMA115S0"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with HPMA115S0 as response and SPS30 as predictor.
score_model <- lm(HPMA115S0 ~ SPS30, data = df)

# Coefficient estimates, standard errors and confidence bounds.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.992     0.145      6.84       0    0.707     1.28
## 2 SPS30        1.01      0.011     93.2        0    0.993     1.04

# Per-row observed value, fitted value and residual; the outer parentheses
# print the tibble right after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID HPMA115S0 SPS30 HPMA115S0_hat residual
##    <int>     <dbl> <dbl>         <dbl>    <dbl>
##  1     1      4.75  4.51          5.57   -0.817
##  2     2      6     5.69          6.76   -0.764
##  3     3      6.64  6.51          7.60   -0.956
##  4     4      4.91  4.71          5.77   -0.86 
##  5     5      3.13  3.27          4.31   -1.18 
##  6     6      5.24  4.75          5.81   -0.571
##  7     7      5.74  6.02          7.10   -1.36 
##  8     8      8.41  8.81          9.93   -1.52 
##  9     9      7.45  8.69          9.81   -2.36 
## 10    10      6.96  8.06          9.17   -2.21 
## # ... with 793 more rows

# Case 7: SPS30 vs PMSA003

# Descriptive statistics (skim) for the two columns compared in this case.
select(df, SPS30, PMSA003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁
PMSA003	0	1	24.25	17.10	0.84	11.70	20.90	32.90	138.0	▇▃▁▁▁

# Pearson correlation between SPS30 and PMSA003, computed on df as loaded.
get_correlation(df, formula = SPS30 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.996

# Scatter plot of PMSA003 against SPS30 with a straight-line (lm) fit and
# no confidence band.
ggplot(data = df, mapping = aes(x = SPS30, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and PMSA003",
    x = "PM25 SPS30",
    y = "PM25 PMSA003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with PMSA003 as response and SPS30 as predictor.
score_model <- lm(PMSA003 ~ SPS30, data = df)

# Coefficient estimates, standard errors and confidence bounds.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.79     0.097     -28.8       0    -2.98    -2.60
## 2 SPS30         2.40     0.007     330.        0     2.38     2.41

# Per-row observed value, fitted value and residual; the outer parentheses
# print the tibble right after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID PMSA003 SPS30 PMSA003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    6.91  4.51        8.02   -1.11 
##  2     2    9.69  5.69       10.8    -1.16 
##  3     3   11.9   6.51       12.8    -0.916
##  4     4    7.8   4.71        8.50   -0.701
##  5     5    4.55  3.27        5.05   -0.499
##  6     6    7.64  4.75        8.60   -0.957
##  7     7   11.9   6.02       11.6     0.259
##  8     8   18.1   8.81       18.3    -0.229
##  9     9   18.5   8.69       18.0     0.458
## 10    10   16.8   8.06       16.5     0.269
## # ... with 793 more rows

# Case 8: SPS30 vs PMS7003

# Descriptive statistics (skim) for the two columns compared in this case.
select(df, SPS30, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁
PMS7003	0	1	20.51	14.00	1.00	10.50	17.70	27.20	116.0	▇▃▁▁▁

# Pearson correlation between SPS30 and PMS7003, computed on df as loaded.
get_correlation(df, formula = SPS30 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.992

# Scatter plot of PMS7003 against SPS30 with a straight-line (lm) fit and
# no confidence band.
ggplot(data = df, mapping = aes(x = SPS30, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and PMS7003",
    x = "PM25 SPS30",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with PMS7003 as response and SPS30 as predictor.
score_model <- lm(PMS7003 ~ SPS30, data = df)

# Coefficient estimates, standard errors and confidence bounds.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.53     0.114     -13.4       0    -1.75    -1.30
## 2 SPS30         1.95     0.009     228.        0     1.94     1.97

# Per-row observed value, fitted value and residual; the outer parentheses
# print the tibble right after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID PMS7003 SPS30 PMS7003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    6.51  4.51        7.28   -0.774
##  2     2    8.59  5.69        9.59   -1    
##  3     3   10.4   6.51       11.2    -0.793
##  4     4    7.11  4.71        7.68   -0.565
##  5     5    4.76  3.27        4.86   -0.101
##  6     6    7.36  4.75        7.75   -0.393
##  7     7   11.4   6.02       10.2     1.16 
##  8     8   15.4   8.81       15.7    -0.287
##  9     9   15.9   8.69       15.5     0.447
## 10    10   14.2   8.06       14.2    -0.022
## # ... with 793 more rows

# Case 9: SPS30 vs Oficial

# Descriptive statistics (skim) for the two columns compared in this case.
select(df, SPS30, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.28	7.11	1.35	6.25	9.89	14.65	62.7	▇▃▁▁▁
Oficial	0	1	25.14	14.37	0.00	16.00	22.00	32.00	124.0	▇▅▁▁▁

# Pearson correlation between SPS30 and Oficial, computed on df as loaded.
get_correlation(df, formula = SPS30 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.509

# Scatter plot of Oficial against SPS30 with a straight-line (lm) fit and
# no confidence band.
ggplot(data = df, mapping = aes(x = SPS30, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and Oficial",
    x = "PM25 SPS30",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with Oficial as response and SPS30 as predictor.
score_model <- lm(Oficial ~ SPS30, data = df)

# Coefficient estimates, standard errors and confidence bounds.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    13.5      0.819      16.5       0   11.9      15.2 
## 2 SPS30         1.03     0.061      16.7       0    0.908     1.15

# Per-row observed value, fitted value and residual; the outer parentheses
# print the tibble right after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial SPS30 Oficial_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1      22  4.51        18.2     3.82
##  2     2      15  5.69        19.4    -4.40
##  3     3      19  6.51        20.2    -1.24
##  4     4      17  4.71        18.4    -1.39
##  5     5      11  3.27        16.9    -5.91
##  6     6       8  4.75        18.4   -10.4 
##  7     7       7  6.02        19.7   -12.7 
##  8     8       9  8.81        22.6   -13.6 
##  9     9      20  8.69        22.5    -2.48
## 10    10      41  8.06        21.8    19.2 
## # ... with 793 more rows

# Case 10: HPMA115S0 vs PMSA003

# Descriptive statistics (skim) for the two columns compared in this case.
select(df, HPMA115S0, PMSA003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.7	15.4	64.6	▇▂▁▁▁
PMSA003	0	1	24.25	17.10	0.84	11.70	20.9	32.9	138.0	▇▃▁▁▁

# Pearson correlation between HPMA115S0 and PMSA003, computed on df as loaded.
get_correlation(df, formula = HPMA115S0 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.957

# Scatter plot of PMSA003 against HPMA115S0 with a straight-line (lm) fit
# and no confidence band.
ggplot(data = df, mapping = aes(x = HPMA115S0, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between HPMA115S0 and PMSA003",
    x = "PM25 HPMA115S0",
    y = "PM25 PMSA003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with PMSA003 as response and HPMA115S0 as
# predictor.
score_model <- lm(PMSA003 ~ HPMA115S0, data = df)

# Coefficient estimates, standard errors and confidence bounds.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.76     0.337      -8.2       0    -3.43    -2.10
## 2 HPMA115S0     2.17     0.023      93.7       0     2.13     2.22

# Per-row observed value, fitted value and residual; the outer parentheses
# print the tibble right after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID PMSA003 HPMA115S0 PMSA003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    6.91      4.75        7.56   -0.645
##  2     2    9.69      6          10.3    -0.580
##  3     3   11.9       6.64       11.7     0.239
##  4     4    7.8       4.91        7.90   -0.102
##  5     5    4.55      3.13        4.04    0.514
##  6     6    7.64      5.24        8.62   -0.979
##  7     7   11.9       5.74        9.70    2.19 
##  8     8   18.1       8.41       15.5     2.59 
##  9     9   18.5       7.45       13.4     5.08 
## 10    10   16.8       6.96       12.4     4.44 
## # ... with 793 more rows

# Case 11: HPMA115S0 vs PMS7003

# Descriptive statistics (skim) for the two columns compared in this case.
select(df, HPMA115S0, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.7	15.4	64.6	▇▂▁▁▁
PMS7003	0	1	20.51	14.00	1.00	10.50	17.7	27.2	116.0	▇▃▁▁▁

# Pearson correlation between HPMA115S0 and PMS7003, computed on df as loaded.
get_correlation(df, formula = HPMA115S0 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.963

# Scatter plot of PMS7003 against HPMA115S0 with a straight-line (lm) fit
# and no confidence band.
ggplot(data = df, mapping = aes(x = HPMA115S0, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between HPMA115S0 and PMS7003",
    x = "PM25 HPMA115S0",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with PMS7003 as response and HPMA115S0 as
# predictor.
score_model <- lm(PMS7003 ~ HPMA115S0, data = df)

# Coefficient estimates, standard errors and confidence bounds.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.72     0.258     -6.68       0    -2.23    -1.22
## 2 HPMA115S0     1.79     0.018    101.         0     1.75     1.82

# Per-row observed value, fitted value and residual; the outer parentheses
# print the tibble right after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID PMS7003 HPMA115S0 PMS7003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    6.51      4.75        6.77   -0.261
##  2     2    8.59      6           9.01   -0.416
##  3     3   10.4       6.64       10.2     0.249
##  4     4    7.11      4.91        7.06    0.053
##  5     5    4.76      3.13        3.87    0.886
##  6     6    7.36      5.24        7.65   -0.287
##  7     7   11.4       5.74        8.54    2.86 
##  8     8   15.4       8.41       13.3     2.08 
##  9     9   15.9       7.45       11.6     4.30 
## 10    10   14.2       6.96       10.7     3.48 
## # ... with 793 more rows

# Case 12: HPMA115S0 vs Oficial
# (numbered by position in the file: this is the twelfth comparison)

# Descriptive statistics (skim) for the two columns compared in this case.
select(df, HPMA115S0, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	12.43	7.54	2.25	7.26	10.7	15.4	64.6	▇▂▁▁▁
Oficial	0	1	25.14	14.37	0.00	16.00	22.0	32.0	124.0	▇▅▁▁▁

# Pearson correlation between HPMA115S0 and Oficial, computed on df as loaded.
get_correlation(df, formula = HPMA115S0 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.514

# Scatter plot of Oficial against HPMA115S0 with a straight-line (lm) fit
# and no confidence band.
ggplot(data = df, mapping = aes(x = HPMA115S0, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between HPMA115S0 and Oficial",
    x = "PM25 HPMA115S0",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with Oficial as response and HPMA115S0 as
# predictor.
score_model <- lm(Oficial ~ HPMA115S0, data = df)

# Coefficient estimates, standard errors and confidence bounds.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    13.0      0.84       15.4       0   11.3      14.6 
## 2 HPMA115S0     0.98     0.058      17.0       0    0.867     1.09

# Per-row observed value, fitted value and residual; the outer parentheses
# print the tibble right after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial HPMA115S0 Oficial_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1      22      4.75        17.6    4.39 
##  2     2      15      6           18.8   -3.84 
##  3     3      19      6.64        19.5   -0.463
##  4     4      17      4.91        17.8   -0.767
##  5     5      11      3.13        16.0   -5.02 
##  6     6       8      5.24        18.1  -10.1  
##  7     7       7      5.74        18.6  -11.6  
##  8     8       9      8.41        21.2  -12.2  
##  9     9      20      7.45        20.3   -0.257
## 10    10      41      6.96        19.8   21.2  
## # ... with 793 more rows

# Case 13: PMSA003 vs PMS7003
# (numbered by position in the file: this is the thirteenth comparison)

## NOTE(review): the results previously recorded under this heading were
## produced by a copy of the PMSA003-vs-Oficial code and did not describe
## this pair, so they were dropped. Re-run the script to regenerate the
## output for PMSA003 vs PMS7003.

# Descriptive statistics (skim) for the two columns compared in this case.
df %>% select(PMSA003, PMS7003) %>% skim()

# Pearson correlation between PMSA003 and PMS7003, computed on df as loaded.
df %>%
  get_correlation(formula = PMSA003 ~ PMS7003)

# Scatter plot of PMS7003 against PMSA003 with a straight-line (lm) fit and
# no confidence band.
ggplot(df, aes(x = PMSA003, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 PMS7003",
       title = "Relationship between PMSA003 and PMS7003") +
  geom_smooth(method = "lm", se = FALSE)

# Simple linear regression with PMS7003 as response and PMSA003 as predictor
# (same orientation as the other cases: second sensor regressed on the first).
score_model <- lm(PMS7003 ~ PMSA003, data = df)

# Coefficient estimates, standard errors and confidence bounds.
get_regression_table(score_model)

# Per-row observed value, fitted value and residual.
regression_points <- get_regression_points(score_model)
regression_points

# Case 14: PMSA003 vs Oficial
# (numbered by position in the file: this is the fourteenth comparison)

# Descriptive statistics (skim) for the two columns compared in this case.
select(df, PMSA003, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	24.25	17.10	0.84	11.7	20.9	32.9	138	▇▃▁▁▁
Oficial	0	1	25.14	14.37	0.00	16.0	22.0	32.0	124	▇▅▁▁▁

# Pearson correlation between PMSA003 and Oficial, computed on df as loaded.
get_correlation(df, formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.520

# Scatter plot of Oficial against PMSA003 with a straight-line (lm) fit and
# no confidence band.
ggplot(data = df, mapping = aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between PMSA003 and Oficial",
    x = "PM25 PMSA003",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with Oficial as response and PMSA003 as predictor.
score_model <- lm(Oficial ~ PMSA003, data = df)

# Coefficient estimates, standard errors and confidence bounds.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   14.6       0.752      19.4       0   13.1     16.0  
## 2 PMSA003      0.436     0.025      17.2       0    0.387    0.486

# Per-row observed value, fitted value and residual; the outer parentheses
# print the tibble right after assigning it.
(regression_points <- get_regression_points(score_model))

## # A tibble: 803 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      22    6.91        17.6    4.42 
##  2     2      15    9.69        18.8   -3.79 
##  3     3      19   11.9         19.8   -0.754
##  4     4      17    7.8         18.0   -0.964
##  5     5      11    4.55        16.5   -5.55 
##  6     6       8    7.64        17.9   -9.90 
##  7     7       7   11.9         19.8  -12.8  
##  8     8       9   18.1         22.5  -13.5  
##  9     9      20   18.5         22.6   -2.64 
## 10    10      41   16.8         21.9   19.1  
## # ... with 793 more rows

# Case 15: PMS7003 vs Oficial
# (numbered by position in the file: this is the fifteenth comparison)

# Descriptive statistics (skim) for the two columns compared in this case.
select(df, PMS7003, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	803
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMS7003	0	1	20.51	14.00	1	10.5	17.7	27.2	116	▇▃▁▁▁
Oficial	0	1	25.14	14.37	0	16.0	22.0	32.0	124	▇▅▁▁▁

# Pearson correlation between PMS7003 and Oficial, computed on df as loaded.
df %>%
  get_correlation(formula = PMS7003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.525

# Scatter plot of Oficial against PMS7003 with a straight-line (lm) fit and
# no confidence band. The title names PMS7003, the sensor actually plotted
# on the x axis.
ggplot(df, aes(x = PMS7003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMS7003", y = "PM25 Oficial",
       title = "Relationship between PMS7003 and Oficial") +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression with Oficial as response and PMS7003 as predictor.
score_model <- lm(Oficial ~ PMS7003, data = df)

# Coefficient estimates, standard errors and confidence bounds.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   14.1       0.766      18.4       0   12.6     15.6  
## 2 PMS7003      0.539     0.031      17.4       0    0.478    0.599

# Per-row observed value, fitted value and residual.
regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 803 x 5
##       ID Oficial PMS7003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1      22    6.51        17.6    4.40 
##  2     2      15    8.59        18.7   -3.72 
##  3     3      19   10.4         19.7   -0.697
##  4     4      17    7.11        17.9   -0.925
##  5     5      11    4.76        16.7   -5.66 
##  6     6       8    7.36        18.1  -10.1  
##  7     7       7   11.4         20.2  -13.2  
##  8     8       9   15.4         22.4  -13.4  
##  9     9      20   15.9         22.7   -2.66 
## 10    10      41   14.2         21.7   19.3  
## # ... with 793 more rows

Kennedy-Canairios.R

DBB

2021-02-24