Paiba-Canairios.R

library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(moderndive)
library(skimr)

# **November ESTACION PAIBA VS CANAIRIOS**
# **5 different sensors: PMS7003 & PMSA003 & HPMA115S0 & SPS30 & SNGCJA5**

# Load the November PAIBA vs. CANAIRIOS measurement table from the Excel workbook.
# NOTE(review): the path is absolute and machine-specific; adjust it when the
# script is run on another computer.
df <- read_excel("C:/Mediciones/PAIBA_CANAIRIOS_NOV.xlsx")
# Open the spreadsheet-style viewer only in interactive sessions, so that batch
# runs (Rscript, report rendering) do not depend on a GUI data viewer.
if (interactive()) {
  View(df)
}

glimpse(df)

## Rows: 544
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ Fecha     <dttm> 2020-11-07 00:00:00, 2020-11-07 01:00:00, 2020-11-07 02:...
## $ Oficial   <dbl> 15.99, 12.09, 10.31, 13.64, 16.51, 18.62, 18.97, 22.07, 2...
## $ PMS7003   <dbl> 33.574074, 27.057692, 22.096154, 37.192308, 46.180000, 47...
## $ PMSA003   <dbl> 31.666667, 25.134615, 20.115385, 35.923077, 44.300000, 47...
## $ HPMA115S0 <dbl> 19.333333, 14.730769, 12.230769, 19.750000, 24.360000, 25...
## $ SPS30     <dbl> 17.185185, 14.115385, 11.634615, 19.442308, 23.260000, 24...
## $ SNGCJA5   <dbl> 13.259259, 10.634615, 8.673077, 14.846154, 17.900000, 19....

# Print 10 randomly chosen rows as a quick sanity check of the imported data
sample_n(df, size = 10)

## # A tibble: 10 x 8
##      Num Fecha               Oficial PMS7003 PMSA003 HPMA115S0 SPS30 SNGCJA5
##    <dbl> <dttm>                <dbl>   <dbl>   <dbl>     <dbl> <dbl>   <dbl>
##  1   424 2020-11-25 23:00:00   14.7    30.3    34.2      18.5  17.6    14.1 
##  2   351 2020-11-22 22:00:00    6.77    8.45    7.38      5.76  5.24    3.35
##  3    42 2020-11-08 17:00:00   12.0    27.5    25.2      14.4  13.9    10.4 
##  4    54 2020-11-09 05:00:00   17.2    44.5    40.2      26.1  23.8    17.6 
##  5   370 2020-11-23 17:00:00   13.5    37.3    40.3      21.4  20.5    15.4 
##  6    21 2020-11-07 20:00:00    7.67   11.4    10.6       7.69  6.89    5.05
##  7   184 2020-11-14 15:00:00   13.2    25.1    23.5      13.2  12.1     9.60
##  8    50 2020-11-09 01:00:00   17.8    44.5    42.9      25.2  23.5    18.1 
##  9   137 2020-11-12 16:00:00   14.0    37.5    33.9      21.2  18.3    14.1 
## 10   211 2020-11-15 18:00:00    4.16    5.45    3.78      3.25  3.45    2.04

# Interactive overlay of all six PM2.5 series against the row index (Num):
# the five sensor columns first, then the Oficial series, each drawn as
# lines with markers.
fig <- plot_ly(
  df,
  x = ~Num, y = ~PMS7003,
  name = 'PM2.5 PMS7003', type = 'scatter', mode = 'lines+markers'
) %>%
  add_trace(y = ~PMSA003, name = 'PM2.5 PMSA003', mode = 'lines+markers') %>%
  add_trace(y = ~HPMA115S0, name = 'PM2.5 HPMA115S0', mode = 'lines+markers') %>%
  add_trace(y = ~SPS30, name = 'PM2.5 SPS30', mode = 'lines+markers') %>%
  add_trace(y = ~SNGCJA5, name = 'PM2.5 SNGCJA5', mode = 'lines+markers') %>%
  add_trace(y = ~Oficial, name = 'PM2.5 Oficial', mode = 'lines+markers')
# Evaluate the object at top level so the figure is displayed
fig

#Caso 1: SNGCJA5 VS SPS30

df %>% select(SNGCJA5, SPS30) %>% skim()

Data summary
Name	Piped data
Number of rows	544
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	10.07	6.36	0.05	5.29	9.15	13.71	36.96	▇▇▃▁▁
SPS30	0	1	13.47	8.07	1.03	7.38	12.33	18.29	44.85	▇▇▃▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ SPS30)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

ggplot(df, aes(x = SNGCJA5, y = SPS30)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 SPS30",
       title = "Relationship between SNGCJA5 and SPS30") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
# Simple linear regression with SPS30 as the response and SNGCJA5 as the predictor.
# score_model is reassigned by every later case, so it only holds the latest fit.
score_model <- lm(SPS30 ~ SNGCJA5, data = df)
#**Get regression table original:**
# Coefficient table: estimate, standard error, test statistic, p-value and
# confidence-interval bounds for the intercept and the slope.
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     0.75     0.062      12.0       0    0.628    0.873
## 2 SNGCJA5       1.26     0.005     241.        0    1.25     1.27

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 544 x 5
##       ID SPS30 SNGCJA5 SPS30_hat residual
##    <int> <dbl>   <dbl>     <dbl>    <dbl>
##  1     1  17.2   13.3       17.5   -0.323
##  2     2  14.1   10.6       14.2   -0.076
##  3     3  11.6    8.67      11.7   -0.077
##  4     4  19.4   14.8       19.5   -0.071
##  5     5  23.3   17.9       23.4   -0.113
##  6     6  25.0   19.3       25.2   -0.203
##  7     7  23.9   18.4       24.0   -0.101
##  8     8  27.5   22.1       28.7   -1.23 
##  9     9  28.4   22.9       29.7   -1.21 
## 10    10  39.4   30.7       39.6   -0.203
## # ... with 534 more rows

#Caso 2: SNGCJA5 VS HPMA115S0

df %>% select(SNGCJA5, HPMA115S0) %>% skim()

Data summary
Name	Piped data
Number of rows	544
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	10.07	6.36	0.05	5.29	9.15	13.71	36.96	▇▇▃▁▁
HPMA115S0	0	1	14.32	8.53	1.42	8.05	13.04	18.80	51.08	▇▇▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.988

ggplot(df, aes(x = SNGCJA5, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 HPMA115S0",
       title = "Relationship between SNGCJA5 and HPMA115S0") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(HPMA115S0 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.972     0.106       9.2       0    0.765     1.18
## 2 SNGCJA5      1.33      0.009     149.        0    1.31      1.34

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 544 x 5
##       ID HPMA115S0 SNGCJA5 HPMA115S0_hat residual
##    <int>     <dbl>   <dbl>         <dbl>    <dbl>
##  1     1      19.3   13.3           18.6    0.776
##  2     2      14.7   10.6           15.1   -0.346
##  3     3      12.2    8.67          12.5   -0.244
##  4     4      19.8   14.8           20.7   -0.912
##  5     5      24.4   17.9           24.7   -0.352
##  6     6      25     19.3           26.6   -1.61 
##  7     7      24.5   18.4           25.4   -0.908
##  8     8      29.3   22.1           30.3   -1.02 
##  9     9      30.8   22.9           31.3   -0.489
## 10    10      42.3   30.7           41.7    0.549
## # ... with 534 more rows

#Caso 3: SNGCJA5 VS PMSA003

df %>% select(SNGCJA5, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	544
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	10.07	6.36	0.05	5.29	9.15	13.71	36.96	▇▇▃▁▁
PMSA003	0	1	24.69	16.22	0.04	12.43	22.70	34.49	95.43	▇▇▃▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.996

ggplot(df, aes(x = SNGCJA5, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 PMSA003",
       title = "Relationship between SNGCJA5 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.889     0.123     -7.22       0    -1.13   -0.647
## 2 SNGCJA5      2.54      0.01     246.         0     2.52    2.56

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 544 x 5
##       ID PMSA003 SNGCJA5 PMSA003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    31.7   13.3         32.8   -1.13 
##  2     2    25.1   10.6         26.1   -0.997
##  3     3    20.1    8.67        21.1   -1.03 
##  4     4    35.9   14.8         36.8   -0.91 
##  5     5    44.3   17.9         44.6   -0.292
##  6     6    47.1   19.3         48.2   -1.12 
##  7     7    42.2   18.4         45.9   -3.71 
##  8     8    52.0   22.1         55.3   -3.33 
##  9     9    55.0   22.9         57.2   -2.24 
## 10    10    75.5   30.7         77.2   -1.71 
## # ... with 534 more rows

#Caso 4: SNGCJA5 VS PMS7003

df %>% select(SNGCJA5, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	544
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	10.07	6.36	0.05	5.29	9.15	13.71	36.96	▇▇▃▁▁
PMS7003	0	1	24.80	15.38	0.70	13.55	22.36	33.59	87.28	▇▇▃▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.985

ggplot(df, aes(x = SNGCJA5, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 PMS7003",
       title = "Relationship between SNGCJA5 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     0.79     0.211      3.74       0    0.375     1.21
## 2 SNGCJA5       2.38     0.018    134.         0    2.35      2.42

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 544 x 5
##       ID PMS7003 SNGCJA5 PMS7003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    33.6   13.3         32.4    1.16 
##  2     2    27.1   10.6         26.2    0.907
##  3     3    22.1    8.67        21.5    0.623
##  4     4    37.2   14.8         36.2    0.998
##  5     5    46.2   17.9         43.5    2.70 
##  6     6    47.7   19.3         46.9    0.827
##  7     7    44.4   18.4         44.7   -0.313
##  8     8    52.9   22.1         53.5   -0.612
##  9     9    54.0   22.9         55.3   -1.37 
## 10    10    73.7   30.7         74.1   -0.357
## # ... with 534 more rows

#Caso 5: SNGCJA5 VS Oficial

df %>% select(SNGCJA5, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	544
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	10.07	6.36	0.05	5.29	9.15	13.71	36.96	▇▇▃▁▁
Oficial	0	1	11.63	5.26	2.06	7.79	11.01	14.70	33.11	▅▇▃▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.962

ggplot(df, aes(x = SNGCJA5, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 Oficial",
       title = "Relationship between SNGCJA5 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    3.61      0.115      31.4       0    3.38     3.84 
## 2 SNGCJA5      0.797     0.01       82.5       0    0.778    0.816

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 544 x 5
##       ID Oficial SNGCJA5 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    16.0   13.3         14.2    1.81 
##  2     2    12.1   10.6         12.1    0.003
##  3     3    10.3    8.67        10.5   -0.213
##  4     4    13.6   14.8         15.4   -1.80 
##  5     5    16.5   17.9         17.9   -1.37 
##  6     6    18.6   19.3         19.0   -0.4  
##  7     7    19.0   18.4         18.3    0.679
##  8     8    22.1   22.1         21.2    0.836
##  9     9    22.6   22.9         21.8    0.751
## 10    10    28.7   30.7         28.1    0.609
## # ... with 534 more rows

#Caso 6: SPS30 VS HPMA115S0

df %>% select(SPS30, HPMA115S0) %>% skim()

Data summary
Name	Piped data
Number of rows	544
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	13.47	8.07	1.03	7.38	12.33	18.29	44.85	▇▇▃▁▁
HPMA115S0	0	1	14.32	8.53	1.42	8.05	13.04	18.80	51.08	▇▇▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.994

ggplot(df, aes(x = SPS30, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 HPMA115S0",
       title = "Relationship between SPS30 and HPMA115S0") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(HPMA115S0 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.172     0.081      2.13   0.033    0.014    0.331
## 2 SPS30        1.05      0.005    204.     0        1.04     1.06

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 544 x 5
##       ID HPMA115S0 SPS30 HPMA115S0_hat residual
##    <int>     <dbl> <dbl>         <dbl>    <dbl>
##  1     1      19.3  17.2          18.2    1.11 
##  2     2      14.7  14.1          15.0   -0.267
##  3     3      12.2  11.6          12.4   -0.161
##  4     4      19.8  19.4          20.6   -0.843
##  5     5      24.4  23.3          24.6   -0.242
##  6     6      25    25.0          26.4   -1.41 
##  7     7      24.5  23.9          25.3   -0.812
##  8     8      29.3  27.5          29.0    0.261
##  9     9      30.8  28.4          30.0    0.767
## 10    10      42.3  39.4          41.5    0.737
## # ... with 534 more rows

#Caso 7: SPS30 VS PMSA003

df %>% select(SPS30, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	544
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	13.47	8.07	1.03	7.38	12.33	18.29	44.85	▇▇▃▁▁
PMSA003	0	1	24.69	16.22	0.04	12.43	22.70	34.49	95.43	▇▇▃▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.991

ggplot(df, aes(x = SPS30, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 PMSA003",
       title = "Relationship between SPS30 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.15     0.181     -11.8       0    -2.51    -1.79
## 2 SPS30         1.99     0.012     172.        0     1.97     2.02

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 544 x 5
##       ID PMSA003 SPS30 PMSA003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    31.7  17.2        32.1   -0.416
##  2     2    25.1  14.1        26.0   -0.833
##  3     3    20.1  11.6        21.0   -0.911
##  4     4    35.9  19.4        36.6   -0.656
##  5     5    44.3  23.3        44.2    0.117
##  6     6    47.1  25.0        47.6   -0.501
##  7     7    42.2  23.9        45.5   -3.31 
##  8     8    52.0  27.5        52.6   -0.592
##  9     9    55.0  28.4        54.5    0.471
## 10    10    75.5  39.4        76.3   -0.824
## # ... with 534 more rows

#Caso 8: SPS30 VS PMS7003

df %>% select(SPS30, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	544
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	13.47	8.07	1.03	7.38	12.33	18.29	44.85	▇▇▃▁▁
PMS7003	0	1	24.80	15.38	0.70	13.55	22.36	33.59	87.28	▇▇▃▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.989

ggplot(df, aes(x = SPS30, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 PMS7003",
       title = "Relationship between SPS30 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -0.6      0.191     -3.14   0.002   -0.975   -0.225
## 2 SPS30         1.88     0.012    155.     0        1.86     1.91

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 544 x 5
##       ID PMS7003 SPS30 PMS7003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    33.6  17.2        31.8    1.78 
##  2     2    27.1  14.1        26.0    1.05 
##  3     3    22.1  11.6        21.3    0.765
##  4     4    37.2  19.4        36.0    1.14 
##  5     5    46.2  23.3        43.2    2.94 
##  6     6    47.7  25.0        46.5    1.23 
##  7     7    44.4  23.9        44.5   -0.102
##  8     8    52.9  27.5        51.2    1.74 
##  9     9    54.0  28.4        53.0    0.946
## 10    10    73.7  39.4        73.6    0.075
## # ... with 534 more rows

#Caso 9: SPS30 VS Oficial

df %>% select(SPS30, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	544
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	13.47	8.07	1.03	7.38	12.33	18.29	44.85	▇▇▃▁▁
Oficial	0	1	11.63	5.26	2.06	7.79	11.01	14.70	33.11	▅▇▃▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.955

ggplot(df, aes(x = SPS30, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 Oficial",
       title = "Relationship between SPS30 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    3.24      0.13       24.9       0    2.98     3.50 
## 2 SPS30        0.623     0.008      75.1       0    0.607    0.639

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 544 x 5
##       ID Oficial SPS30 Oficial_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    16.0  17.2        13.9    2.04 
##  2     2    12.1  14.1        12.0    0.056
##  3     3    10.3  11.6        10.5   -0.179
##  4     4    13.6  19.4        15.4   -1.71 
##  5     5    16.5  23.3        17.7   -1.22 
##  6     6    18.6  25.0        18.8   -0.183
##  7     7    19.0  23.9        18.1    0.823
##  8     8    22.1  27.5        20.3    1.72 
##  9     9    22.6  28.4        21.0    1.63 
## 10    10    28.7  39.4        27.8    0.936
## # ... with 534 more rows

#Caso 10: HPMA115S0 VS PMSA003

df %>% select(HPMA115S0, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	544
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	14.32	8.53	1.42	8.05	13.04	18.80	51.08	▇▇▂▁▁
PMSA003	0	1	24.69	16.22	0.04	12.43	22.70	34.49	95.43	▇▇▃▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.979

ggplot(df, aes(x = HPMA115S0, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 PMSA003",
       title = "Relationship between HPMA115S0 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.98     0.277     -7.15       0    -2.52    -1.43
## 2 HPMA115S0     1.86     0.017    112.         0     1.83     1.89

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 544 x 5
##       ID PMSA003 HPMA115S0 PMSA003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    31.7      19.3        34.0   -2.35 
##  2     2    25.1      14.7        25.4   -0.313
##  3     3    20.1      12.2        20.8   -0.678
##  4     4    35.9      19.8        34.8    1.13 
##  5     5    44.3      24.4        43.4    0.925
##  6     6    47.1      25          44.6    2.54 
##  7     7    42.2      24.5        43.6   -1.42 
##  8     8    52.0      29.3        52.5   -0.568
##  9     9    55.0      30.8        55.4   -0.411
## 10    10    75.5      42.3        76.7   -1.25 
## # ... with 534 more rows

#Caso 11: HPMA115S0 VS PMS7003

df %>% select(HPMA115S0, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	544
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	14.32	8.53	1.42	8.05	13.04	18.80	51.08	▇▇▂▁▁
PMS7003	0	1	24.80	15.38	0.70	13.55	22.36	33.59	87.28	▇▇▃▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.987

ggplot(df, aes(x = HPMA115S0, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 PMS7003",
       title = "Relationship between HPMA115S0 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.683     0.211     -3.24   0.001    -1.10   -0.269
## 2 HPMA115S0    1.78      0.013    141.     0         1.75    1.80

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 544 x 5
##       ID PMS7003 HPMA115S0 PMS7003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    33.6      19.3        33.7   -0.135
##  2     2    27.1      14.7        25.5    1.54 
##  3     3    22.1      12.2        21.1    1.02 
##  4     4    37.2      19.8        34.5    2.74 
##  5     5    46.2      24.4        42.7    3.53 
##  6     6    47.7      25          43.8    3.93 
##  7     7    44.4      24.5        42.9    1.52 
##  8     8    52.9      29.3        51.4    1.51 
##  9     9    54.0      30.8        54.1   -0.171
## 10    10    73.7      42.3        74.5   -0.807
## # ... with 534 more rows

#Caso 12: HPMA115S0 VS Oficial

df %>% select(HPMA115S0, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	544
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	14.32	8.53	1.42	8.05	13.04	18.8	51.08	▇▇▂▁▁
Oficial	0	1	11.63	5.26	2.06	7.79	11.01	14.7	33.11	▅▇▃▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.959

ggplot(df, aes(x = HPMA115S0, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 Oficial",
       title = "Relationship between HPMA115S0 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    3.16      0.126      25.2       0    2.91     3.41 
## 2 HPMA115S0    0.592     0.008      78.5       0    0.577    0.606

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 544 x 5
##       ID Oficial HPMA115S0 Oficial_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    16.0      19.3        14.6    1.39 
##  2     2    12.1      14.7        11.9    0.215
##  3     3    10.3      12.2        10.4   -0.086
##  4     4    13.6      19.8        14.8   -1.20 
##  5     5    16.5      24.4        17.6   -1.06 
##  6     6    18.6      25          18.0    0.67 
##  7     7    19.0      24.5        17.6    1.32 
##  8     8    22.1      29.3        20.5    1.59 
##  9     9    22.6      30.8        21.4    1.2  
## 10    10    28.7      42.3        28.2    0.542
## # ... with 534 more rows

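#Caso 13: PMSA003 VS PMS7003 -- NOTE: the code and output below actually compare PMSA003 with Oficial (identical to the following case); replace Oficial with PMS7003 and re-run to obtain this comparison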

df %>% select(PMSA003, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	544
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	24.69	16.22	0.04	12.43	22.70	34.49	95.43	▇▇▃▁▁
Oficial	0	1	11.63	5.26	2.06	7.79	11.01	14.70	33.11	▅▇▃▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.954

ggplot(df, aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 Oficial",
       title = "Relationship between PMSA003 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     3.99     0.123      32.4       0    3.75     4.23 
## 2 PMSA003       0.31     0.004      74.3       0    0.301    0.318

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 544 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    16.0    31.7        13.8    2.19 
##  2     2    12.1    25.1        11.8    0.318
##  3     3    10.3    20.1        10.2    0.092
##  4     4    13.6    35.9        15.1   -1.47 
##  5     5    16.5    44.3        17.7   -1.20 
##  6     6    18.6    47.1        18.6    0.043
##  7     7    19.0    42.2        17.1    1.91 
##  8     8    22.1    52.0        20.1    1.99 
##  9     9    22.6    55.0        21.0    1.58 
## 10    10    28.7    75.5        27.4    1.35 
## # ... with 534 more rows

#Caso 14: PMSA003 VS Oficial

df %>% select(PMSA003, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	544
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	24.69	16.22	0.04	12.43	22.70	34.49	95.43	▇▇▃▁▁
Oficial	0	1	11.63	5.26	2.06	7.79	11.01	14.70	33.11	▅▇▃▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.954

ggplot(df, aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 Oficial",
       title = "Relationship between PMSA003 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     3.99     0.123      32.4       0    3.75     4.23 
## 2 PMSA003       0.31     0.004      74.3       0    0.301    0.318

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 544 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    16.0    31.7        13.8    2.19 
##  2     2    12.1    25.1        11.8    0.318
##  3     3    10.3    20.1        10.2    0.092
##  4     4    13.6    35.9        15.1   -1.47 
##  5     5    16.5    44.3        17.7   -1.20 
##  6     6    18.6    47.1        18.6    0.043
##  7     7    19.0    42.2        17.1    1.91 
##  8     8    22.1    52.0        20.1    1.99 
##  9     9    22.6    55.0        21.0    1.58 
## 10    10    28.7    75.5        27.4    1.35 
## # ... with 534 more rows

#Caso 15: PMS7003 VS Oficial

df %>% select(PMS7003, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	544
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMS7003	0	1	24.80	15.38	0.70	13.55	22.36	33.59	87.28	▇▇▃▁▁
Oficial	0	1	11.63	5.26	2.06	7.79	11.01	14.70	33.11	▅▇▃▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMS7003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.952

# Scatter plot of PMS7003 (x) against the Oficial series (y) with a fitted
# least-squares line and no confidence band.
# The title names PMS7003, the column actually plotted on the x axis.
ggplot(df, aes(x = PMS7003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMS7003", y = "PM25 Oficial",
       title = "Relationship between PMS7003 and Oficial") +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMS7003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    3.56      0.131      27.1       0    3.30     3.81 
## 2 PMS7003      0.326     0.004      72.4       0    0.317    0.335

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 544 x 5
##       ID Oficial PMS7003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    16.0    33.6        14.5    1.50 
##  2     2    12.1    27.1        12.4   -0.281
##  3     3    10.3    22.1        10.8   -0.444
##  4     4    13.6    37.2        15.7   -2.03 
##  5     5    16.5    46.2        18.6   -2.09 
##  6     6    18.6    47.7        19.1   -0.482
##  7     7    19.0    44.4        18.0    0.95 
##  8     8    22.1    52.9        20.8    1.28 
##  9     9    22.6    54.0        21.1    1.46 
## 10    10    28.7    73.7        27.6    1.14 
## # ... with 534 more rows

Paiba-Canairios.R

DBB

2021-02-24