Paiba-Canairios.R

library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(moderndive)
library(skimr)

# **ESTACION PAIBA VS CANAIRIOS**
# **5 different sensors: PMS7003 & PMSA003 & HPMA115S0 & SPS30 & SNGCJA5**

# Read the comparison workbook into `df`. The path is kept in one named
# variable so it is easy to find and change when the file lives elsewhere.
data_path <- "C:/Mediciones/PAIBA_CANAIRIOS.xlsx"
df <- read_excel(data_path)

# View() opens a data-viewer window, which is only useful in a live session.
# Skip it when the script is rendered or run in batch mode, where a viewer
# may not be available.
if (interactive()) {
  View(df)
}

# Print column names, types and the first values of each column.
glimpse(df)

## Rows: 1,241
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ Fecha     <dttm> 2020-11-07 00:00:00, 2020-11-07 01:00:00, 2020-11-07 02:...
## $ Oficial   <dbl> 15.99, 12.09, 10.31, 13.64, 16.51, 18.62, 18.97, 22.07, 2...
## $ PMS7003   <dbl> 33.574074, 27.057692, 22.096154, 37.192308, 46.180000, 47...
## $ PMSA003   <dbl> 31.666667, 25.134615, 20.115385, 35.923077, 44.300000, 47...
## $ HPMA115S0 <dbl> 19.333333, 14.730769, 12.230769, 19.750000, 24.360000, 25...
## $ SPS30     <dbl> 17.185185, 14.115385, 11.634615, 19.442308, 23.260000, 24...
## $ SNGCJA5   <dbl> 13.259259, 10.634615, 8.673077, 14.846154, 17.900000, 19....

# Show 10 randomly drawn rows as a quick sanity check of the data.
sample_n(df, size = 10)

## # A tibble: 10 x 8
##      Num Fecha               Oficial PMS7003 PMSA003 HPMA115S0 SPS30 SNGCJA5
##    <dbl> <dttm>                <dbl>   <dbl>   <dbl>     <dbl> <dbl>   <dbl>
##  1   542 2020-11-30 21:00:00    8.65   18.7   18.3       11.3  10.5    7.45 
##  2  1216 2020-12-28 23:00:00   11.1     7.62   6.13       5.25  5.09   2.93 
##  3  1095 2020-12-23 22:00:00   12      19.3   20.0       10.6  10.8    7.55 
##  4   700 2020-12-07 11:00:00   32.2    30.2   28.5       16.2  14.9   10.7  
##  5   808 2020-12-11 23:00:00   18.2    18.0   18.5       11.4  10.3    7.87 
##  6  1108 2020-12-24 11:00:00   12.8     5.21   3.86       3.72  3.61   1.68 
##  7   408 2020-11-25 07:00:00    9.7    21.9   22.8       11.3  11.1    8.63 
##  8   255 2020-11-17 14:00:00    9.67   16.8   12.6        9.81  7.93   5.80 
##  9   160 2020-11-13 15:00:00   10.9    17.4   15.5       11.4   8.89   6.82 
## 10   724 2020-12-08 11:00:00   12.7     1.94   0.774      2.32  1.72   0.434

# Overlay every series (the five sensor columns plus Oficial) against the
# row index Num, one lines+markers trace per column, then display the figure.
fig <- plot_ly(
  df,
  x = ~Num, y = ~PMS7003, name = "PM2.5 PMS7003",
  type = "scatter", mode = "lines+markers"
) %>%
  add_trace(y = ~PMSA003, name = "PM2.5 PMSA003", mode = "lines+markers") %>%
  add_trace(y = ~HPMA115S0, name = "PM2.5 HPMA115S0", mode = "lines+markers") %>%
  add_trace(y = ~SPS30, name = "PM2.5 SPS30", mode = "lines+markers") %>%
  add_trace(y = ~SNGCJA5, name = "PM2.5 SNGCJA5", mode = "lines+markers") %>%
  add_trace(y = ~Oficial, name = "PM2.5 Oficial", mode = "lines+markers")
fig

#Caso 1: SNGCJA5 VS SPS30

# Summary statistics (missing count, mean, sd, quantiles, mini histogram)
# for just the two columns being compared in this case.
df %>% select(SNGCJA5, SPS30) %>% skim()

Data summary
Name	Piped data
Number of rows	1241
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	8.87	6.48	0.05	3.80	7.47	12.32	40.80	▇▅▂▁▁
SPS30	0	1	12.25	8.36	1.03	5.78	10.41	16.59	51.67	▇▅▂▁▁

# Pearson correlation coefficient between the two sensors' readings.
# Correlation is symmetric, so which variable sits on which side of the
# formula does not change the value.

df %>% 
  get_correlation(formula = SNGCJA5 ~ SPS30)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

# Scatter of SPS30 against SNGCJA5 (semi-transparent points to show density)
# with a least-squares line drawn on top and no confidence band.
ggplot(data = df, mapping = aes(x = SNGCJA5, y = SPS30)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and SPS30",
    x = "PM25 SNGCJA5",
    y = "PM25 SPS30"
  )

## `geom_smooth()` using formula 'y ~ x'

# Fit a simple linear regression with SPS30 as the response and SNGCJA5 as
# the single predictor. `score_model` is reused (overwritten) by every case.
score_model <- lm(SPS30 ~ SNGCJA5, data = df)
# Print the coefficient table: estimate, standard error, test statistic,
# p-value and confidence-interval bounds for the intercept and the slope.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.875     0.042      20.9       0    0.793    0.957
## 2 SNGCJA5      1.28      0.004     336.        0    1.27     1.29

# One row per observation: observed response, predictor value, fitted value
# (`<response>_hat`) and residual. Overwritten by every later case.
regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,241 x 5
##       ID SPS30 SNGCJA5 SPS30_hat residual
##    <int> <dbl>   <dbl>     <dbl>    <dbl>
##  1     1  17.2   13.3       17.9   -0.688
##  2     2  14.1   10.6       14.5   -0.393
##  3     3  11.6    8.67      12.0   -0.359
##  4     4  19.4   14.8       19.9   -0.465
##  5     5  23.3   17.9       23.8   -0.563
##  6     6  25.0   19.3       25.7   -0.679
##  7     7  23.9   18.4       24.5   -0.56 
##  8     8  27.5   22.1       29.2   -1.76 
##  9     9  28.4   22.9       30.2   -1.75 
## 10    10  39.4   30.7       40.3   -0.885
## # ... with 1,231 more rows

#Caso 2: SNGCJA5 VS HPMA115S0

df %>% select(SNGCJA5, HPMA115S0) %>% skim()

Data summary
Name	Piped data
Number of rows	1241
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	8.87	6.48	0.05	3.80	7.47	12.32	40.80	▇▅▂▁▁
HPMA115S0	0	1	12.88	8.60	1.35	6.15	11.05	17.25	51.89	▇▅▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.990

ggplot(df, aes(x = SNGCJA5, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 HPMA115S0",
       title = "Relationship between SNGCJA5 and HPMA115S0") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(HPMA115S0 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     1.24     0.059      20.9       0     1.12     1.36
## 2 SNGCJA5       1.31     0.005     243.        0     1.30     1.32

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,241 x 5
##       ID HPMA115S0 SNGCJA5 HPMA115S0_hat residual
##    <int>     <dbl>   <dbl>         <dbl>    <dbl>
##  1     1      19.3   13.3           18.6    0.688
##  2     2      14.7   10.6           15.2   -0.469
##  3     3      12.2    8.67          12.6   -0.394
##  4     4      19.8   14.8           20.7   -0.978
##  5     5      24.4   17.9           24.7   -0.377
##  6     6      25     19.3           26.6   -1.62 
##  7     7      24.5   18.4           25.4   -0.926
##  8     8      29.3   22.1           30.3   -0.987
##  9     9      30.8   22.9           31.3   -0.447
## 10    10      42.3   30.7           41.6    0.698
## # ... with 1,231 more rows

#Caso 3: SNGCJA5 VS PMSA003

df %>% select(SNGCJA5, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	1241
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	8.87	6.48	0.05	3.80	7.47	12.32	40.80	▇▅▂▁▁
PMSA003	0	1	22.22	17.04	0.04	8.44	18.74	31.61	108.45	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.996

ggplot(df, aes(x = SNGCJA5, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 PMSA003",
       title = "Relationship between SNGCJA5 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.991     0.074     -13.3       0    -1.14   -0.845
## 2 SNGCJA5      2.62      0.007     387.        0     2.60    2.63

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,241 x 5
##       ID PMSA003 SNGCJA5 PMSA003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    31.7   13.3         33.7    -2.04
##  2     2    25.1   10.6         26.8    -1.70
##  3     3    20.1    8.67        21.7    -1.59
##  4     4    35.9   14.8         37.9    -1.93
##  5     5    44.3   17.9         45.8    -1.55
##  6     6    47.1   19.3         49.6    -2.49
##  7     7    42.2   18.4         47.2    -5.00
##  8     8    52.0   22.1         56.9    -4.90
##  9     9    55.0   22.9         58.9    -3.87
## 10    10    75.5   30.7         79.4    -3.94
## # ... with 1,231 more rows

#Caso 4: SNGCJA5 VS PMS7003

df %>% select(SNGCJA5, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	1241
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	8.87	6.48	0.05	3.80	7.47	12.32	40.80	▇▅▂▁▁
PMS7003	0	1	22.17	15.69	0.70	9.64	19.25	30.73	93.25	▇▆▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.987

ggplot(df, aes(x = SNGCJA5, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 PMS7003",
       title = "Relationship between SNGCJA5 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.984     0.122      8.08       0    0.745     1.22
## 2 SNGCJA5      2.39      0.011    215.         0    2.37      2.41

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,241 x 5
##       ID PMS7003 SNGCJA5 PMS7003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    33.6   13.3         32.7    0.92 
##  2     2    27.1   10.6         26.4    0.673
##  3     3    22.1    8.67        21.7    0.396
##  4     4    37.2   14.8         36.4    0.748
##  5     5    46.2   17.9         43.7    2.44 
##  6     6    47.7   19.3         47.2    0.561
##  7     7    44.4   18.4         45.0   -0.576
##  8     8    52.9   22.1         53.8   -0.889
##  9     9    54.0   22.9         55.6   -1.65 
## 10    10    73.7   30.7         74.4   -0.667
## # ... with 1,231 more rows

#Caso 5: SNGCJA5 VS Oficial

df %>% select(SNGCJA5, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	1241
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	8.87	6.48	0.05	3.80	7.47	12.32	40.80	▇▅▂▁▁
Oficial	0	1	15.38	8.41	2.06	9.51	13.39	19.63	62.22	▇▅▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.564

ggplot(df, aes(x = SNGCJA5, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 Oficial",
       title = "Relationship between SNGCJA5 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    8.88      0.334      26.6       0    8.22     9.54 
## 2 SNGCJA5      0.732     0.03       24.1       0    0.673    0.792

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,241 x 5
##       ID Oficial SNGCJA5 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    16.0   13.3         18.6    -2.6 
##  2     2    12.1   10.6         16.7    -4.58
##  3     3    10.3    8.67        15.2    -4.92
##  4     4    13.6   14.8         19.8    -6.11
##  5     5    16.5   17.9         22.0    -5.48
##  6     6    18.6   19.3         23.0    -4.42
##  7     7    19.0   18.4         22.4    -3.40
##  8     8    22.1   22.1         25.1    -3.00
##  9     9    22.6   22.9         25.6    -3.04
## 10    10    28.7   30.7         31.4    -2.67
## # ... with 1,231 more rows

#Caso 6: SPS30 VS HPMA115S0

df %>% select(SPS30, HPMA115S0) %>% skim()

Data summary
Name	Piped data
Number of rows	1241
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	12.25	8.36	1.03	5.78	10.41	16.59	51.67	▇▅▂▁▁
HPMA115S0	0	1	12.88	8.60	1.35	6.15	11.05	17.25	51.89	▇▅▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.994

ggplot(df, aes(x = SPS30, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 HPMA115S0",
       title = "Relationship between SPS30 and HPMA115S0") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(HPMA115S0 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.357     0.047      7.56       0    0.265     0.45
## 2 SPS30        1.02      0.003    321.         0    1.02      1.03

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,241 x 5
##       ID HPMA115S0 SPS30 HPMA115S0_hat residual
##    <int>     <dbl> <dbl>         <dbl>    <dbl>
##  1     1      19.3  17.2          17.9    1.40 
##  2     2      14.7  14.1          14.8   -0.065
##  3     3      12.2  11.6          12.3   -0.027
##  4     4      19.8  19.4          20.2   -0.494
##  5     5      24.4  23.3          24.1    0.211
##  6     6      25    25.0          25.9   -0.91 
##  7     7      24.5  23.9          24.8   -0.341
##  8     8      29.3  27.5          28.4    0.83 
##  9     9      30.8  28.4          29.5    1.36 
## 10    10      42.3  39.4          40.6    1.63 
## # ... with 1,231 more rows

#Caso 7: SPS30 VS PMSA003

df %>% select(SPS30, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	1241
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	12.25	8.36	1.03	5.78	10.41	16.59	51.67	▇▅▂▁▁
PMSA003	0	1	22.22	17.04	0.04	8.44	18.74	31.61	108.45	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.993

ggplot(df, aes(x = SPS30, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 PMSA003",
       title = "Relationship between SPS30 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.57     0.1       -25.7       0    -2.77    -2.38
## 2 SPS30         2.02     0.007     300.        0     2.01     2.04

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,241 x 5
##       ID PMSA003 SPS30 PMSA003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    31.7  17.2        32.2   -0.552
##  2     2    25.1  14.1        26.0   -0.869
##  3     3    20.1  11.6        21.0   -0.866
##  4     4    35.9  19.4        36.8   -0.865
##  5     5    44.3  23.3        44.5   -0.217
##  6     6    47.1  25.0        48.0   -0.891
##  7     7    42.2  23.9        45.9   -3.67 
##  8     8    52.0  27.5        53.0   -1.06 
##  9     9    55.0  28.4        55.0   -0.031
## 10    10    75.5  39.4        77.2   -1.68 
## # ... with 1,231 more rows

#Caso 8: SPS30 VS PMS7003

df %>% select(SPS30, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	1241
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	12.25	8.36	1.03	5.78	10.41	16.59	51.67	▇▅▂▁▁
PMS7003	0	1	22.17	15.69	0.70	9.64	19.25	30.73	93.25	▇▆▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.991

ggplot(df, aes(x = SPS30, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 PMS7003",
       title = "Relationship between SPS30 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -0.62     0.105     -5.91       0   -0.825   -0.414
## 2 SPS30         1.86     0.007    263.         0    1.85     1.88

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,241 x 5
##       ID PMS7003 SPS30 PMS7003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    33.6  17.2        31.4    2.21 
##  2     2    27.1  14.1        25.6    1.41 
##  3     3    22.1  11.6        21.0    1.06 
##  4     4    37.2  19.4        35.6    1.63 
##  5     5    46.2  23.3        42.7    3.51 
##  6     6    47.7  25.0        45.9    1.85 
##  7     7    44.4  23.9        43.9    0.492
##  8     8    52.9  27.5        50.5    2.42 
##  9     9    54.0  28.4        52.3    1.65 
## 10    10    73.7  39.4        72.7    1.04 
## # ... with 1,231 more rows

#Caso 9: SPS30 VS Oficial

df %>% select(SPS30, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	1241
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	12.25	8.36	1.03	5.78	10.41	16.59	51.67	▇▅▂▁▁
Oficial	0	1	15.38	8.41	2.06	9.51	13.39	19.63	62.22	▇▅▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.559

ggplot(df, aes(x = SPS30, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 Oficial",
       title = "Relationship between SPS30 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    8.49      0.352      24.1       0    7.80     9.18 
## 2 SPS30        0.562     0.024      23.7       0    0.516    0.609

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,241 x 5
##       ID Oficial SPS30 Oficial_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    16.0  17.2        18.2    -2.16
##  2     2    12.1  14.1        16.4    -4.34
##  3     3    10.3  11.6        15.0    -4.72
##  4     4    13.6  19.4        19.4    -5.78
##  5     5    16.5  23.3        21.6    -5.06
##  6     6    18.6  25.0        22.5    -3.92
##  7     7    19.0  23.9        21.9    -2.97
##  8     8    22.1  27.5        23.9    -1.86
##  9     9    22.6  28.4        24.5    -1.89
## 10    10    28.7  39.4        30.6    -1.92
## # ... with 1,231 more rows

#Caso 10: HPMA115S0 VS PMSA003

df %>% select(HPMA115S0, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	1241
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	12.88	8.60	1.35	6.15	11.05	17.25	51.89	▇▅▂▁▁
PMSA003	0	1	22.22	17.04	0.04	8.44	18.74	31.61	108.45	▇▅▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.984

ggplot(df, aes(x = HPMA115S0, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 PMSA003",
       title = "Relationship between HPMA115S0 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.89     0.156     -18.6       0    -3.20    -2.59
## 2 HPMA115S0     1.95     0.01      194.        0     1.93     1.97

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,241 x 5
##       ID PMSA003 HPMA115S0 PMSA003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    31.7      19.3        34.8   -3.12 
##  2     2    25.1      14.7        25.8   -0.685
##  3     3    20.1      12.2        20.9   -0.832
##  4     4    35.9      19.8        35.6    0.32 
##  5     5    44.3      24.4        44.6   -0.288
##  6     6    47.1      25          45.8    1.27 
##  7     7    42.2      24.5        44.8   -2.64 
##  8     8    52.0      29.3        54.2   -2.21 
##  9     9    55.0      30.8        57.2   -2.19 
## 10    10    75.5      42.3        79.5   -4.03 
## # ... with 1,231 more rows

#Caso 11: HPMA115S0 VS PMS7003

df %>% select(HPMA115S0, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	1241
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	12.88	8.60	1.35	6.15	11.05	17.25	51.89	▇▅▂▁▁
PMS7003	0	1	22.17	15.69	0.70	9.64	19.25	30.73	93.25	▇▆▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.990

ggplot(df, aes(x = HPMA115S0, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 PMS7003",
       title = "Relationship between HPMA115S0 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.11     0.111     -10.0       0    -1.33   -0.895
## 2 HPMA115S0     1.81     0.007     253.        0     1.79    1.82

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,241 x 5
##       ID PMS7003 HPMA115S0 PMS7003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    33.6      19.3        33.8   -0.252
##  2     2    27.1      14.7        25.5    1.55 
##  3     3    22.1      12.2        21.0    1.11 
##  4     4    37.2      19.8        34.6    2.61 
##  5     5    46.2      24.4        42.9    3.27 
##  6     6    47.7      25          44.1    3.66 
##  7     7    44.4      24.5        43.1    1.25 
##  8     8    52.9      29.3        51.8    1.11 
##  9     9    54.0      30.8        54.6   -0.612
## 10    10    73.7      42.3        75.3   -1.57 
## # ... with 1,231 more rows

#Caso 12: HPMA115S0 VS Oficial

df %>% select(HPMA115S0, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	1241
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	12.88	8.60	1.35	6.15	11.05	17.25	51.89	▇▅▂▁▁
Oficial	0	1	15.38	8.41	2.06	9.51	13.39	19.63	62.22	▇▅▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.577

ggplot(df, aes(x = HPMA115S0, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 Oficial",
       title = "Relationship between HPMA115S0 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    8.11      0.352      23.0       0     7.42    8.80 
## 2 HPMA115S0    0.564     0.023      24.8       0     0.52    0.609

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,241 x 5
##       ID Oficial HPMA115S0 Oficial_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    16.0      19.3        19.0    -3.02
##  2     2    12.1      14.7        16.4    -4.33
##  3     3    10.3      12.2        15.0    -4.70
##  4     4    13.6      19.8        19.2    -5.61
##  5     5    16.5      24.4        21.8    -5.34
##  6     6    18.6      25          22.2    -3.59
##  7     7    19.0      24.5        21.9    -2.95
##  8     8    22.1      29.3        24.6    -2.55
##  9     9    22.6      30.8        25.5    -2.9 
## 10    10    28.7      42.3        32.0    -3.24
## # ... with 1,231 more rows

#Caso 13: PMSA003 VS PMS7003

# NOTE: this section used to repeat the PMSA003 vs Oficial analysis (a
# copy-paste slip), so the PMSA003 / PMS7003 pair named in the heading was
# never analysed. The statements below now use that pair, following the same
# convention as the other cases (first sensor on x, second as the response).
# The console output previously recorded here belonged to PMSA003 vs Oficial
# and is still shown in the "PMSA003 VS Oficial" section below; re-render
# the script to capture the output for this pair.

# Summary statistics for the two columns compared in this case.
df %>% select(PMSA003, PMS7003) %>% skim()

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ PMS7003)

# Scatter of PMS7003 against PMSA003 with a least-squares line (no band).
ggplot(df, aes(x = PMSA003, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 PMS7003",
       title = "Relationship between PMSA003 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

# Per-observation observed value, fitted value and residual for this model.
regression_points <- get_regression_points(score_model)
regression_points

#Caso 14: PMSA003 VS Oficial

df %>% select(PMSA003, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	1241
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	22.22	17.04	0.04	8.44	18.74	31.61	108.45	▇▅▁▁▁
Oficial	0	1	15.38	8.41	2.06	9.51	13.39	19.63	62.22	▇▅▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.580

ggplot(df, aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 Oficial",
       title = "Relationship between PMSA003 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    9.00      0.32       28.1       0    8.38     9.63 
## 2 PMSA003      0.287     0.011      25.1       0    0.264    0.309

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,241 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    16.0    31.7        18.1    -2.09
##  2     2    12.1    25.1        16.2    -4.12
##  3     3    10.3    20.1        14.8    -4.46
##  4     4    13.6    35.9        19.3    -5.66
##  5     5    16.5    44.3        21.7    -5.20
##  6     6    18.6    47.1        22.5    -3.89
##  7     7    19.0    42.2        21.1    -2.13
##  8     8    22.1    52.0        23.9    -1.83
##  9     9    22.6    55.0        24.8    -2.18
## 10    10    28.7    75.5        30.6    -1.93
## # ... with 1,231 more rows

#Caso 15: PMS7003 VS Oficial

df %>% select(PMS7003, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	1241
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMS7003	0	1	22.17	15.69	0.70	9.64	19.25	30.73	93.25	▇▆▂▁▁
Oficial	0	1	15.38	8.41	2.06	9.51	13.39	19.63	62.22	▇▅▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMS7003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.574

# Scatter of the Oficial column against the PMS7003 readings with a
# least-squares line (no confidence band). The title now names PMS7003, the
# sensor actually plotted; it previously said PMSA003, left over from the
# preceding case.
ggplot(df, aes(x = PMS7003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMS7003", y = "PM25 Oficial",
       title = "Relationship between PMS7003 and Oficial") +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMS7003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    8.55      0.339      25.2       0    7.89     9.22 
## 2 PMS7003      0.308     0.012      24.7       0    0.283    0.332

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,241 x 5
##       ID Oficial PMS7003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    16.0    33.6        18.9    -2.89
##  2     2    12.1    27.1        16.9    -4.79
##  3     3    10.3    22.1        15.4    -5.04
##  4     4    13.6    37.2        20.0    -6.36
##  5     5    16.5    46.2        22.8    -6.25
##  6     6    18.6    47.7        23.2    -4.62
##  7     7    19.0    44.4        22.2    -3.24
##  8     8    22.1    52.9        24.8    -2.76
##  9     9    22.6    54.0        25.2    -2.57
## 10    10    28.7    73.7        31.2    -2.52
## # ... with 1,231 more rows

Paiba-Canairios.R

DBB

2021-02-24