Paiba-Canairios.R

library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(moderndive)
library(skimr)

# **December ESTACION PAIBA VS CANAIRIOS**
# **5 different sensors: PMS7003 & PMSA003 & HPMA115S0 & SPS30 & SNGCJA5**

# Load the December measurements (official station plus the five sensors)
# from the Excel workbook into a tibble.
df <- read_excel("C:/Mediciones/PAIBA_CANAIRIOS_DIC.xlsx")

# View() opens a GUI data viewer, so only call it in interactive sessions;
# this keeps non-interactive runs (Rscript, knitr) from failing or popping
# up a window.
if (interactive()) {
  View(df)
}

# Compact overview of column names, types and leading values.
glimpse(df)

## Rows: 697
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ Fecha     <dttm> 2020-12-01 00:00:00, 2020-12-01 01:00:00, 2020-12-01 02:...
## $ Oficial   <dbl> 8.12, 7.27, 10.32, 9.91, 12.52, 13.31, 14.70, 21.35, 23.8...
## $ PMS7003   <dbl> 7.218182, 8.224138, 11.259259, 14.290909, 15.464286, 13.2...
## $ PMSA003   <dbl> 5.363636, 6.724138, 10.407407, 14.545455, 15.285714, 12.4...
## $ HPMA115S0 <dbl> 5.345455, 5.413793, 7.425926, 9.490909, 9.982143, 8.19642...
## $ SPS30     <dbl> 4.363636, 4.862069, 6.740741, 8.654545, 9.178571, 7.28571...
## $ SNGCJA5   <dbl> 2.600000, 3.068966, 4.666667, 6.236364, 6.642857, 5.30357...

# Peek at 10 randomly chosen rows. slice_sample() supersedes sample_n()
# (dplyr >= 1.0.0); the draw is random, so the rows differ between runs.
df %>%
  slice_sample(n = 10)

## # A tibble: 10 x 8
##      Num Fecha               Oficial PMS7003 PMSA003 HPMA115S0 SPS30 SNGCJA5
##    <dbl> <dttm>                <dbl>   <dbl>   <dbl>     <dbl> <dbl>   <dbl>
##  1   436 2020-12-19 03:00:00   11.5     8.74    7.40      5.43  5.34    3.36
##  2   428 2020-12-18 19:00:00   29.1    34.5    35.6      18.9  18.4    13.3 
##  3   245 2020-12-11 04:00:00   24.4    38.7    43.3      23.8  23.4    17.3 
##  4   129 2020-12-06 08:00:00    8.07    3.71    2.8       2.67  2.56    1.18
##  5    78 2020-12-04 05:00:00   21.7    21.5    22.9      10.9  10.9     8.80
##  6   621 2020-12-26 20:00:00   12.3    12.6    11.9       8.29  8.25    5.14
##  7    71 2020-12-03 22:00:00   22.7    27.9    29.9      16.5  15.9    11.5 
##  8    95 2020-12-04 22:00:00    9.73    4.11    3.85      3.36  2.87    1.55
##  9   148 2020-12-07 03:00:00   11.2     8.67    7.76      5.44  5.28    3.57
## 10   622 2020-12-26 21:00:00   12.0     5.74    3.65      4.09  3.93    2.04

# Overlay the PM2.5 series of every sensor and the official station on a
# single interactive plotly chart, using the row counter `Num` as x axis.
# The first series is created by plot_ly(); each add_trace() appends one
# more series that inherits the x mapping.
fig <- df %>%
  plot_ly(
    x = ~Num,
    y = ~PMS7003,
    name = 'PM2.5 PMS7003',
    type = 'scatter',
    mode = 'lines+markers'
  ) %>%
  add_trace(y = ~PMSA003, name = 'PM2.5 PMSA003', mode = 'lines+markers') %>%
  add_trace(y = ~HPMA115S0, name = 'PM2.5 HPMA115S0', mode = 'lines+markers') %>%
  add_trace(y = ~SPS30, name = 'PM2.5 SPS30', mode = 'lines+markers') %>%
  add_trace(y = ~SNGCJA5, name = 'PM2.5 SNGCJA5', mode = 'lines+markers') %>%
  add_trace(y = ~Oficial, name = 'PM2.5 Oficial', mode = 'lines+markers')

# Printing the object renders the chart.
fig

#Caso 1: SNGCJA5 VS SPS30

df %>% select(SNGCJA5, SPS30) %>% skim()

Data summary
Name	Piped data
Number of rows	697
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	7.94	6.43	0.10	2.96	6.56	10.75	40.80	▇▃▁▁▁
SPS30	0	1	11.29	8.46	1.17	4.85	9.30	15.11	51.67	▇▃▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ SPS30)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

ggplot(df, aes(x = SNGCJA5, y = SPS30)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 SPS30",
       title = "Relationship between SNGCJA5 and SPS30") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(SPS30 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.907     0.051      17.9       0    0.807     1.01
## 2 SNGCJA5      1.31      0.005     263.        0    1.30      1.32

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 697 x 5
##       ID SPS30 SNGCJA5 SPS30_hat residual
##    <int> <dbl>   <dbl>     <dbl>    <dbl>
##  1     1  4.36    2.6       4.31    0.056
##  2     2  4.86    3.07      4.92   -0.06 
##  3     3  6.74    4.67      7.01   -0.271
##  4     4  8.65    6.24      9.06   -0.411
##  5     5  9.18    6.64      9.60   -0.418
##  6     6  7.29    5.30      7.84   -0.559
##  7     7  6.33    4.72      7.09   -0.755
##  8     8 10.8     8.52     12.1    -1.23 
##  9     9 16.7    12.5      17.3    -0.598
## 10    10 16.9    12.4      17.1    -0.225
## # ... with 687 more rows

#Caso 2: SNGCJA5 VS HPMA115S0

df %>% select(SNGCJA5, HPMA115S0) %>% skim()

Data summary
Name	Piped data
Number of rows	697
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	7.94	6.43	0.10	2.96	6.56	10.75	40.80	▇▃▁▁▁
HPMA115S0	0	1	11.76	8.49	1.35	5.31	9.60	15.41	51.89	▇▃▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.991

ggplot(df, aes(x = SNGCJA5, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 HPMA115S0",
       title = "Relationship between SNGCJA5 and HPMA115S0") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(HPMA115S0 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     1.38     0.07       19.9       0     1.25     1.52
## 2 SNGCJA5       1.31     0.007     192.        0     1.29     1.32

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 697 x 5
##       ID HPMA115S0 SNGCJA5 HPMA115S0_hat residual
##    <int>     <dbl>   <dbl>         <dbl>    <dbl>
##  1     1      5.34    2.6           4.78    0.563
##  2     2      5.41    3.07          5.40    0.018
##  3     3      7.43    4.67          7.48   -0.059
##  4     4      9.49    6.24          9.54   -0.047
##  5     5      9.98    6.64         10.1    -0.087
##  6     6      8.20    5.30          8.32   -0.121
##  7     7      7.31    4.72          7.56   -0.248
##  8     8     11.7     8.52         12.5    -0.784
##  9     9     17      12.5          17.8    -0.752
## 10    10     17.8    12.4          17.6     0.214
## # ... with 687 more rows

#Caso 3: SNGCJA5 VS PMSA003

df %>% select(SNGCJA5, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	697
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	7.94	6.43	0.10	2.96	6.56	10.75	40.80	▇▃▁▁▁
PMSA003	0	1	20.29	17.42	0.18	6.54	16.23	28.48	108.45	▇▃▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.998

ggplot(df, aes(x = SNGCJA5, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 PMSA003",
       title = "Relationship between SNGCJA5 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.14     0.069     -16.5       0    -1.28    -1.01
## 2 SNGCJA5       2.70     0.007     398.        0     2.69     2.71

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 697 x 5
##       ID PMSA003 SNGCJA5 PMSA003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    5.36    2.6         5.88   -0.515
##  2     2    6.72    3.07        7.15   -0.422
##  3     3   10.4     4.67       11.5    -1.05 
##  4     4   14.5     6.24       15.7    -1.16 
##  5     5   15.3     6.64       16.8    -1.51 
##  6     6   12.5     5.30       13.2    -0.7  
##  7     7   11.5     4.72       11.6    -0.15 
##  8     8   22.8     8.52       21.9     0.89 
##  9     9   34.1    12.5        32.7     1.46 
## 10    10   32.8    12.4        32.3     0.439
## # ... with 687 more rows

#Caso 4: SNGCJA5 VS PMS7003

df %>% select(SNGCJA5, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	697
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	7.94	6.43	0.10	2.96	6.56	10.75	40.80	▇▃▁▁▁
PMS7003	0	1	20.12	15.64	0.96	7.84	16.61	27.35	93.25	▇▃▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.988

ggplot(df, aes(x = SNGCJA5, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 PMS7003",
       title = "Relationship between SNGCJA5 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     1.07     0.147      7.28       0     0.78     1.36
## 2 SNGCJA5       2.40     0.014    167.         0     2.37     2.43

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 697 x 5
##       ID PMS7003 SNGCJA5 PMS7003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    7.22    2.6         7.31   -0.092
##  2     2    8.22    3.07        8.44   -0.212
##  3     3   11.3     4.67       12.3    -1.01 
##  4     4   14.3     6.24       16.0    -1.75 
##  5     5   15.5     6.64       17.0    -1.55 
##  6     6   13.3     5.30       13.8    -0.515
##  7     7   11.9     4.72       12.4    -0.471
##  8     8   22.1     8.52       21.5     0.578
##  9     9   32.0    12.5        31.1     0.897
## 10    10   30.2    12.4        30.8    -0.606
## # ... with 687 more rows

#Caso 5: SNGCJA5 VS Oficial

df %>% select(SNGCJA5, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	697
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	7.94	6.43	0.10	2.96	6.56	10.75	40.80	▇▃▁▁▁
Oficial	0	1	18.30	9.22	3.47	11.12	16.86	23.87	62.22	▇▇▃▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.606

ggplot(df, aes(x = SNGCJA5, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 Oficial",
       title = "Relationship between SNGCJA5 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   11.4       0.442      25.8       0   10.5     12.3  
## 2 SNGCJA5      0.869     0.043      20.1       0    0.784    0.954

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 697 x 5
##       ID Oficial SNGCJA5 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    8.12    2.6         13.7   -5.54 
##  2     2    7.27    3.07        14.1   -6.80 
##  3     3   10.3     4.67        15.5   -5.13 
##  4     4    9.91    6.24        16.8   -6.91 
##  5     5   12.5     6.64        17.2   -4.65 
##  6     6   13.3     5.30        16.0   -2.70 
##  7     7   14.7     4.72        15.5   -0.805
##  8     8   21.4     8.52        18.8    2.55 
##  9     9   23.9    12.5         22.3    1.59 
## 10    10   22.9    12.4         22.2    0.745
## # ... with 687 more rows

#Caso 6: SPS30 VS HPMA115S0

df %>% select(SPS30, HPMA115S0) %>% skim()

Data summary
Name	Piped data
Number of rows	697
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.29	8.46	1.17	4.85	9.3	15.11	51.67	▇▃▁▁▁
HPMA115S0	0	1	11.76	8.49	1.35	5.31	9.6	15.41	51.89	▇▃▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

ggplot(df, aes(x = SPS30, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 HPMA115S0",
       title = "Relationship between SPS30 and HPMA115S0") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(HPMA115S0 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    0.483     0.053      9.05       0    0.378    0.588
## 2 SPS30        0.999     0.004    264.         0    0.992    1.01

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 697 x 5
##       ID HPMA115S0 SPS30 HPMA115S0_hat residual
##    <int>     <dbl> <dbl>         <dbl>    <dbl>
##  1     1      5.34  4.36          4.84    0.503
##  2     2      5.41  4.86          5.34    0.074
##  3     3      7.43  6.74          7.22    0.209
##  4     4      9.49  8.65          9.13    0.362
##  5     5      9.98  9.18          9.65    0.33 
##  6     6      8.20  7.29          7.76    0.435
##  7     7      7.31  6.33          6.81    0.504
##  8     8     11.7  10.8          11.3     0.448
##  9     9     17    16.7          17.2    -0.151
## 10    10     17.8  16.9          17.4     0.442
## # ... with 687 more rows

#Caso 7: SPS30 VS PMSA003

df %>% select(SPS30, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	697
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.29	8.46	1.17	4.85	9.30	15.11	51.67	▇▃▁▁▁
PMSA003	0	1	20.29	17.42	0.18	6.54	16.23	28.48	108.45	▇▃▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.995

ggplot(df, aes(x = SPS30, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 PMSA003",
       title = "Relationship between SPS30 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.83     0.114     -24.7       0    -3.05    -2.60
## 2 SPS30         2.05     0.008     253.        0     2.03     2.06

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 697 x 5
##       ID PMSA003 SPS30 PMSA003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    5.36  4.36        6.11   -0.747
##  2     2    6.72  4.86        7.13   -0.407
##  3     3   10.4   6.74       11.0    -0.571
##  4     4   14.5   8.65       14.9    -0.353
##  5     5   15.3   9.18       16.0    -0.686
##  6     6   12.5   7.29       12.1     0.387
##  7     7   11.5   6.33       10.1     1.33 
##  8     8   22.8  10.8        19.3     3.43 
##  9     9   34.1  16.7        31.3     2.78 
## 10    10   32.8  16.9        31.8     0.998
## # ... with 687 more rows

#Caso 8: SPS30 VS PMS7003

df %>% select(SPS30, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	697
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.29	8.46	1.17	4.85	9.30	15.11	51.67	▇▃▁▁▁
PMS7003	0	1	20.12	15.64	0.96	7.84	16.61	27.35	93.25	▇▃▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.993

ggplot(df, aes(x = SPS30, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 PMS7003",
       title = "Relationship between SPS30 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -0.602     0.117     -5.17       0   -0.831   -0.374
## 2 SPS30        1.84      0.008    222.         0    1.82     1.85

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 697 x 5
##       ID PMS7003 SPS30 PMS7003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    7.22  4.36        7.41   -0.19 
##  2     2    8.22  4.86        8.32   -0.099
##  3     3   11.3   6.74       11.8    -0.512
##  4     4   14.3   8.65       15.3    -0.994
##  5     5   15.5   9.18       16.2    -0.782
##  6     6   13.3   7.29       12.8     0.514
##  7     7   11.9   6.33       11.0     0.917
##  8     8   22.1  10.8        19.3     2.84 
##  9     9   32.0  16.7        30.0     1.99 
## 10    10   30.2  16.9        30.4    -0.197
## # ... with 687 more rows

#Caso 9: SPS30 VS Oficial

df %>% select(SPS30, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	697
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	11.29	8.46	1.17	4.85	9.30	15.11	51.67	▇▃▁▁▁
Oficial	0	1	18.30	9.22	3.47	11.12	16.86	23.87	62.22	▇▇▃▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.573

ggplot(df, aes(x = SPS30, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 Oficial",
       title = "Relationship between SPS30 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   11.2       0.478      23.5       0   10.3     12.2  
## 2 SPS30        0.625     0.034      18.4       0    0.559    0.692

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 697 x 5
##       ID Oficial SPS30 Oficial_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1    8.12  4.36        14.0   -5.85 
##  2     2    7.27  4.86        14.3   -7.01 
##  3     3   10.3   6.74        15.5   -5.13 
##  4     4    9.91  8.65        16.6   -6.74 
##  5     5   12.5   9.18        17.0   -4.46 
##  6     6   13.3   7.29        15.8   -2.48 
##  7     7   14.7   6.33        15.2   -0.498
##  8     8   21.4  10.8         18.0    3.35 
##  9     9   23.9  16.7         21.7    2.20 
## 10    10   22.9  16.9         21.8    1.11 
## # ... with 687 more rows

#Caso 10: HPMA115S0 VS PMSA003

df %>% select(HPMA115S0, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	697
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	11.76	8.49	1.35	5.31	9.60	15.41	51.89	▇▃▁▁▁
PMSA003	0	1	20.29	17.42	0.18	6.54	16.23	28.48	108.45	▇▃▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.988

ggplot(df, aes(x = HPMA115S0, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 PMSA003",
       title = "Relationship between HPMA115S0 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -3.55     0.171     -20.7       0    -3.88    -3.21
## 2 HPMA115S0     2.03     0.012     172.        0     2.00     2.05

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 697 x 5
##       ID PMSA003 HPMA115S0 PMSA003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    5.36      5.34        7.29   -1.92 
##  2     2    6.72      5.41        7.43   -0.703
##  3     3   10.4       7.43       11.5    -1.10 
##  4     4   14.5       9.49       15.7    -1.15 
##  5     5   15.3       9.98       16.7    -1.40 
##  6     6   12.5       8.20       13.1    -0.586
##  7     7   11.5       7.31       11.3     0.192
##  8     8   22.8      11.7        20.3     2.51 
##  9     9   34.1      17          30.9     3.21 
## 10    10   32.8      17.8        32.5     0.23 
## # ... with 687 more rows

#Caso 11: HPMA115S0 VS PMS7003

df %>% select(HPMA115S0, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	697
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	11.76	8.49	1.35	5.31	9.60	15.41	51.89	▇▃▁▁▁
PMS7003	0	1	20.12	15.64	0.96	7.84	16.61	27.35	93.25	▇▃▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.993

ggplot(df, aes(x = HPMA115S0, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 PMS7003",
       title = "Relationship between HPMA115S0 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.39     0.119     -11.7       0    -1.62    -1.15
## 2 HPMA115S0     1.83     0.008     224.        0     1.81     1.84

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 697 x 5
##       ID PMS7003 HPMA115S0 PMS7003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    7.22      5.34        8.39   -1.17 
##  2     2    8.22      5.41        8.52   -0.291
##  3     3   11.3       7.43       12.2    -0.935
##  4     4   14.3       9.49       16.0    -1.68 
##  5     5   15.5       9.98       16.9    -1.40 
##  6     6   13.3       8.20       13.6    -0.318
##  7     7   11.9       7.31       12.0    -0.048
##  8     8   22.1      11.7        20.1     2.02 
##  9     9   32.0      17          29.7     2.32 
## 10    10   30.2      17.8        31.2    -0.955
## # ... with 687 more rows

#Caso 12: HPMA115S0 VS Oficial

df %>% select(HPMA115S0, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	697
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	11.76	8.49	1.35	5.31	9.60	15.41	51.89	▇▃▁▁▁
Oficial	0	1	18.30	9.22	3.47	11.12	16.86	23.87	62.22	▇▇▃▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.616

ggplot(df, aes(x = HPMA115S0, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 Oficial",
       title = "Relationship between HPMA115S0 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   10.4       0.471      22.2       0    9.50    11.4  
## 2 HPMA115S0    0.669     0.032      20.6       0    0.605    0.733

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 697 x 5
##       ID Oficial HPMA115S0 Oficial_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1    8.12      5.34        14.0   -5.88 
##  2     2    7.27      5.41        14.0   -6.78 
##  3     3   10.3       7.43        15.4   -5.08 
##  4     4    9.91      9.49        16.8   -6.87 
##  5     5   12.5       9.98        17.1   -4.59 
##  6     6   13.3       8.20        15.9   -2.60 
##  7     7   14.7       7.31        15.3   -0.621
##  8     8   21.4      11.7         18.3    3.07 
##  9     9   23.9      17           21.8    2.07 
## 10    10   22.9      17.8         22.3    0.576
## # ... with 687 more rows

#Caso 13: PMSA003 VS PMS7003

# PMSA003 vs PMS7003.
# This case previously repeated the PMSA003-vs-Oficial analysis verbatim
# (copy-paste), so the PMSA003/PMS7003 pair was never actually analysed.
# The code below follows the layout of the other cases: the first sensor
# is the predictor (x) and the second is the response (y).
#
# NOTE: the rendered output that used to follow each statement belonged to
# the PMSA003-vs-Oficial analysis and has been removed; re-run the script
# to regenerate the output for this pair.

# Summary statistics for the two sensors compared in this case.
df %>% select(PMSA003, PMS7003) %>% skim()

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ PMS7003)

# Scatter plot with a fitted least-squares line (no confidence band).
ggplot(df, aes(x = PMSA003, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 PMS7003",
       title = "Relationship between PMSA003 and PMS7003") +
  geom_smooth(method = "lm", se = FALSE)

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

# Observed values, fitted values and residuals for every row.
regression_points <- get_regression_points(score_model)
regression_points

#Caso 14: PMSA003 VS Oficial

df %>% select(PMSA003, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	697
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	20.29	17.42	0.18	6.54	16.23	28.48	108.45	▇▃▁▁▁
Oficial	0	1	18.30	9.22	3.47	11.12	16.86	23.87	62.22	▇▇▃▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.607

ggplot(df, aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 Oficial",
       title = "Relationship between PMSA003 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   11.8       0.427      27.6       0   10.9     12.6  
## 2 PMSA003      0.322     0.016      20.2       0    0.290    0.353

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 697 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    8.12    5.36        13.5   -5.37 
##  2     2    7.27    6.72        13.9   -6.66 
##  3     3   10.3    10.4         15.1   -4.80 
##  4     4    9.91   14.5         16.4   -6.54 
##  5     5   12.5    15.3         16.7   -4.16 
##  6     6   13.3    12.5         15.8   -2.47 
##  7     7   14.7    11.5         15.5   -0.758
##  8     8   21.4    22.8         19.1    2.26 
##  9     9   23.9    34.1         22.7    1.12 
## 10    10   22.9    32.8         22.3    0.605
## # ... with 687 more rows

#Caso 15: PMS7003 VS Oficial

df %>% select(PMS7003, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	697
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMS7003	0	1	20.12	15.64	0.96	7.84	16.61	27.35	93.25	▇▃▂▁▁
Oficial	0	1	18.30	9.22	3.47	11.12	16.86	23.87	62.22	▇▇▃▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMS7003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.614

# Scatter plot of the official station readings against PMS7003 with a
# fitted least-squares line (no confidence band).
# The title previously named PMSA003 (left over from the preceding case)
# although the plotted sensor is PMS7003; it now matches the axes.
ggplot(df, aes(x = PMS7003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMS7003", y = "PM25 Oficial",
       title = "Relationship between PMS7003 and Oficial") +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMS7003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   11.0       0.45       24.5       0   10.1     11.9  
## 2 PMS7003      0.362     0.018      20.5       0    0.327    0.397

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 697 x 5
##       ID Oficial PMS7003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1    8.12    7.22        13.6   -5.50 
##  2     2    7.27    8.22        14.0   -6.72 
##  3     3   10.3    11.3         15.1   -4.77 
##  4     4    9.91   14.3         16.2   -6.28 
##  5     5   12.5    15.5         16.6   -4.09 
##  6     6   13.3    13.3         15.8   -2.51 
##  7     7   14.7    11.9         15.3   -0.634
##  8     8   21.4    22.1         19.0    2.34 
##  9     9   23.9    32.0         22.6    1.27 
## 10    10   22.9    30.2         21.9    0.964
## # ... with 687 more rows

Paiba-Canairios.R

DBB

2021-02-24