Ferias-Canairios.R

library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(moderndive)
library(skimr)

# **ESTACION KENNEDY VS CANAIRIOS**
# **5 different sensors: PMS7003 & PMSA003 & HPMA115S0 & SPS30 & SNGCJA5**

# Read the measurements workbook into `df`.
df <- read_excel(path = "C:/Mediciones/FERIAS_CANAIRIOS.xlsx")
# View(df)

# Compact listing of every column with its type and first values.
df %>%
  glimpse()

## Rows: 1,115
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ Fecha     <chr> "12-11-2020 24:00", "13-11-2020 01:00", "13-11-2020 02:00...
## $ Oficial   <dbl> 3.7, 0.3, 5.9, 8.1, 3.7, 4.0, 4.8, 6.0, 8.3, 9.8, 16.5, 6...
## $ PMS7003   <dbl> 0.00, 0.29, 0.71, 1.40, 3.12, 4.16, 6.13, 7.61, 8.40, 20....
## $ PMSA003   <dbl> 0.0169, 0.0000, 1.6100, 0.4100, 1.7200, 2.6500, 4.0400, 5...
## $ HPMA115S0 <dbl> 18.6, 19.1, 19.4, 19.7, 20.4, 20.6, 22.0, 22.8, 23.7, 28....
## $ SPS30     <dbl> 0.90, 1.02, 1.25, 1.78, 2.60, 3.00, 4.11, 4.94, 5.48, 11....
## $ SNGCJA5   <dbl> 0.00, 0.18, 0.45, 0.90, 1.51, 1.81, 2.86, 3.39, 3.48, 8.2...

# Preview ten randomly chosen rows. slice_sample() is the current dplyr verb;
# sample_n() is superseded as of dplyr 1.0.0.
df %>%
  slice_sample(n = 10)

## # A tibble: 10 x 8
##      Num Fecha            Oficial PMS7003 PMSA003 HPMA115S0 SPS30 SNGCJA5
##    <dbl> <chr>              <dbl>   <dbl>   <dbl>     <dbl> <dbl>   <dbl>
##  1  1208 12-01-2021 10:00    10    0.0364  0.0182      23.2  1.05   12.1 
##  2    98 17-11-2020 02:00     7.2  7.21    5.31        25.4  5.28    3.97
##  3   909 23-12-2020 03:00    21    6.57    4.43        25.3  4.29    3.25
##  4  1203 12-01-2021 05:00     7    0.491   0.151       22.2  1.15   15.7 
##  5  1115 08-01-2021 16:00    23   14.4    13.3         31.6  9.62    6.96
##  6   538 04-12-2020 19:00    10.3 12.4    11.4         28.1  7.25    6.5 
##  7   780 17-12-2020 05:00    10    4.09    3.12        23.9  2.96    2.02
##  8  1021 28-12-2020 03:00    14    7.76    6.21        27.5  5.9     4.31
##  9   400 29-11-2020 16:00     3.6  1.05    0.0545      21.4  1.13    1.27
## 10  1040 05-01-2021 15:00    20   15      14.8         30.5  9.23    7.77

# Interactive overlay of the five sensor columns and the Oficial column,
# all plotted against the observation index `Num` as lines with markers.
fig <- df %>%
  plot_ly(
    x = ~Num, y = ~PMS7003, name = "PM2.5 PMS7003",
    type = "scatter", mode = "lines+markers"
  ) %>%
  add_trace(y = ~PMSA003, name = "PM2.5 PMSA003", mode = "lines+markers") %>%
  add_trace(y = ~HPMA115S0, name = "PM2.5 HPMA115S0", mode = "lines+markers") %>%
  add_trace(y = ~SPS30, name = "PM2.5 SPS30", mode = "lines+markers") %>%
  add_trace(y = ~SNGCJA5, name = "PM2.5 SNGCJA5", mode = "lines+markers") %>%
  add_trace(y = ~Oficial, name = "PM2.5 Oficial", mode = "lines+markers")
fig

#Caso 1: SNGCJA5 VS SPS30

# Descriptive statistics for the SNGCJA5 and SPS30 columns.
df %>%
  select(SNGCJA5, SPS30) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1115
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.69	9.75	0.00	1.53	4.81	9.76	236.87	▇▁▁▁▁
SPS30	0	1	7.32	6.66	0.08	1.97	5.52	10.65	51.50	▇▂▁▁▁

# Pearson correlation coefficient between SNGCJA5 and SPS30.
get_correlation(df, formula = SNGCJA5 ~ SPS30)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.496

# Scatter plot of SPS30 against SNGCJA5 with a linear-model smoother
# (confidence ribbon disabled).
df %>%
  ggplot(mapping = aes(x = SNGCJA5, y = SPS30)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and SPS30",
    x = "PM25 SNGCJA5",
    y = "PM25 SPS30"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of SPS30 on SNGCJA5.
score_model <- lm(SPS30 ~ SNGCJA5, data = df)
# Coefficient table for the fitted model.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    5.05      0.21       24.0       0    4.64     5.46 
## 2 SNGCJA5      0.339     0.018      19.1       0    0.304    0.374

# Observed values, fitted values and residuals for each row of the fit.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 1,115 x 5
##       ID SPS30 SNGCJA5 SPS30_hat residual
##    <int> <dbl>   <dbl>     <dbl>    <dbl>
##  1     1  0.9     0         5.05   -4.15 
##  2     2  1.02    0.18      5.11   -4.09 
##  3     3  1.25    0.45      5.20   -3.96 
##  4     4  1.78    0.9       5.36   -3.58 
##  5     5  2.6     1.51      5.56   -2.96 
##  6     6  3       1.81      5.66   -2.66 
##  7     7  4.11    2.86      6.02   -1.91 
##  8     8  4.94    3.39      6.20   -1.26 
##  9     9  5.48    3.48      6.23   -0.751
## 10    10 11.7     8.23      7.84    3.86 
## # ... with 1,105 more rows

#Caso 2: SNGCJA5 VS HPMA115S0

# Descriptive statistics for the SNGCJA5 and HPMA115S0 columns.
df %>%
  select(SNGCJA5, HPMA115S0) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1115
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.69	9.75	0.0	1.53	4.81	9.76	236.87	▇▁▁▁▁
HPMA115S0	0	1	28.48	7.27	18.6	23.10	26.40	31.05	77.70	▇▂▁▁▁

# Pearson correlation coefficient between SNGCJA5 and HPMA115S0.
get_correlation(df, formula = SNGCJA5 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.500

# Scatter plot of HPMA115S0 against SNGCJA5 with a linear-model smoother
# (confidence ribbon disabled).
df %>%
  ggplot(mapping = aes(x = SNGCJA5, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and HPMA115S0",
    x = "PM25 SNGCJA5",
    y = "PM25 HPMA115S0"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of HPMA115S0 on SNGCJA5.
score_model <- lm(HPMA115S0 ~ SNGCJA5, data = df)
# Coefficient table for the fitted model.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   26.0       0.229     114.        0   25.5     26.4  
## 2 SNGCJA5      0.373     0.019      19.2       0    0.335    0.411

# Observed values, fitted values and residuals for each row of the fit.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 1,115 x 5
##       ID HPMA115S0 SNGCJA5 HPMA115S0_hat residual
##    <int>     <dbl>   <dbl>         <dbl>    <dbl>
##  1     1      18.6    0             26.0   -7.39 
##  2     2      19.1    0.18          26.1   -6.96 
##  3     3      19.4    0.45          26.2   -6.76 
##  4     4      19.7    0.9           26.3   -6.63 
##  5     5      20.4    1.51          26.6   -6.16 
##  6     6      20.6    1.81          26.7   -6.07 
##  7     7      22      2.86          27.1   -5.06 
##  8     8      22.8    3.39          27.3   -4.46 
##  9     9      23.7    3.48          27.3   -3.59 
## 10    10      28.6    8.23          29.1   -0.459
## # ... with 1,105 more rows

#Caso 3: SNGCJA5 VS PMSA003

# Descriptive statistics for the SNGCJA5 and PMSA003 columns.
df %>%
  select(SNGCJA5, PMSA003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1115
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.69	9.75	0	1.53	4.81	9.76	236.87	▇▁▁▁▁
PMSA003	0	1	10.46	11.99	0	0.85	6.48	16.20	89.00	▇▂▁▁▁

# Pearson correlation coefficient between SNGCJA5 and PMSA003.
get_correlation(df, formula = SNGCJA5 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.503

# Scatter plot of PMSA003 against SNGCJA5 with a linear-model smoother
# (confidence ribbon disabled).
df %>%
  ggplot(mapping = aes(x = SNGCJA5, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and PMSA003",
    x = "PM25 SNGCJA5",
    y = "PM25 PMSA003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMSA003 on SNGCJA5.
score_model <- lm(PMSA003 ~ SNGCJA5, data = df)
# Coefficient table for the fitted model.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    6.33      0.376      16.8       0    5.59     7.07 
## 2 SNGCJA5      0.618     0.032      19.4       0    0.556    0.681

# Observed values, fitted values and residuals for each row of the fit.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 1,115 x 5
##       ID PMSA003 SNGCJA5 PMSA003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1   0.017    0           6.33    -6.31
##  2     2   0        0.18        6.44    -6.44
##  3     3   1.61     0.45        6.61    -5.00
##  4     4   0.41     0.9         6.88    -6.47
##  5     5   1.72     1.51        7.26    -5.54
##  6     6   2.65     1.81        7.45    -4.80
##  7     7   4.04     2.86        8.10    -4.06
##  8     8   5.35     3.39        8.42    -3.07
##  9     9   6.25     3.48        8.48    -2.23
## 10    10  18.5      8.23       11.4      7.08
## # ... with 1,105 more rows

#Caso 4: SNGCJA5 VS PMS7003

# Descriptive statistics for the SNGCJA5 and PMS7003 columns.
df %>%
  select(SNGCJA5, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1115
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.69	9.75	0	1.53	4.81	9.76	236.87	▇▁▁▁▁
PMS7003	0	1	11.60	11.87	0	1.94	8.34	17.90	91.80	▇▂▁▁▁

# Pearson correlation coefficient between SNGCJA5 and PMS7003.
get_correlation(df, formula = SNGCJA5 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.496

# Scatter plot of PMS7003 against SNGCJA5 with a linear-model smoother
# (confidence ribbon disabled).
df %>%
  ggplot(mapping = aes(x = SNGCJA5, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and PMS7003",
    x = "PM25 SNGCJA5",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMS7003 on SNGCJA5.
score_model <- lm(PMS7003 ~ SNGCJA5, data = df)
# Coefficient table for the fitted model.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    7.56      0.374      20.2       0    6.83     8.30 
## 2 SNGCJA5      0.604     0.032      19.1       0    0.542    0.666

# Observed values, fitted values and residuals for each row of the fit.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 1,115 x 5
##       ID PMS7003 SNGCJA5 PMS7003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1   0        0           7.56    -7.56
##  2     2   0.290    0.18        7.67    -7.38
##  3     3   0.71     0.45        7.84    -7.13
##  4     4   1.4      0.9         8.11    -6.71
##  5     5   3.12     1.51        8.48    -5.36
##  6     6   4.16     1.81        8.66    -4.50
##  7     7   6.13     2.86        9.29    -3.16
##  8     8   7.61     3.39        9.61    -2.00
##  9     9   8.4      3.48        9.67    -1.27
## 10    10  20.2      8.23       12.5      7.66
## # ... with 1,105 more rows

#Caso 5: SNGCJA5 VS Oficial

# Descriptive statistics for the SNGCJA5 and Oficial columns.
df %>%
  select(SNGCJA5, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1115
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.69	9.75	0	1.53	4.81	9.76	236.87	▇▁▁▁▁
Oficial	0	1	12.80	8.32	0	6.50	11.20	17.70	55.40	▇▆▂▁▁

# Pearson correlation coefficient between SNGCJA5 and Oficial.
get_correlation(df, formula = SNGCJA5 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.275

# Scatter plot of Oficial against SNGCJA5 with a linear-model smoother
# (confidence ribbon disabled).
df %>%
  ggplot(mapping = aes(x = SNGCJA5, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and Oficial",
    x = "PM25 SNGCJA5",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on SNGCJA5.
score_model <- lm(Oficial ~ SNGCJA5, data = df)
# Coefficient table for the fitted model.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   11.2       0.291     38.7        0   10.7     11.8  
## 2 SNGCJA5      0.234     0.025      9.53       0    0.186    0.283

# Observed values, fitted values and residuals for each row of the fit.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 1,115 x 5
##       ID Oficial SNGCJA5 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1     3.7    0           11.2    -7.53
##  2     2     0.3    0.18        11.3   -11.0 
##  3     3     5.9    0.45        11.3    -5.44
##  4     4     8.1    0.9         11.4    -3.34
##  5     5     3.7    1.51        11.6    -7.89
##  6     6     4      1.81        11.7    -7.66
##  7     7     4.8    2.86        11.9    -7.10
##  8     8     6      3.39        12.0    -6.03
##  9     9     8.3    3.48        12.0    -3.75
## 10    10     9.8    8.23        13.2    -3.36
## # ... with 1,105 more rows

#Caso 6: SPS30 VS HPMA115S0

# Descriptive statistics for the SPS30 and HPMA115S0 columns.
df %>%
  select(SPS30, HPMA115S0) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1115
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	7.32	6.66	0.08	1.97	5.52	10.65	51.5	▇▂▁▁▁
HPMA115S0	0	1	28.48	7.27	18.60	23.10	26.40	31.05	77.7	▇▂▁▁▁

# Pearson correlation coefficient between SPS30 and HPMA115S0.
get_correlation(df, formula = SPS30 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.983

# Scatter plot of HPMA115S0 against SPS30 with a linear-model smoother
# (confidence ribbon disabled).
df %>%
  ggplot(mapping = aes(x = SPS30, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and HPMA115S0",
    x = "PM25 SPS30",
    y = "PM25 HPMA115S0"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of HPMA115S0 on SPS30.
score_model <- lm(HPMA115S0 ~ SPS30, data = df)
# Coefficient table for the fitted model.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    20.6      0.059      348.       0    20.5     20.7 
## 2 SPS30         1.07     0.006      179.       0     1.06     1.08

# Observed values, fitted values and residuals for each row of the fit.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 1,115 x 5
##       ID HPMA115S0 SPS30 HPMA115S0_hat residual
##    <int>     <dbl> <dbl>         <dbl>    <dbl>
##  1     1      18.6  0.9           21.6    -3.00
##  2     2      19.1  1.02          21.7    -2.62
##  3     3      19.4  1.25          22.0    -2.57
##  4     4      19.7  1.78          22.5    -2.84
##  5     5      20.4  2.6           23.4    -3.02
##  6     6      20.6  3             23.8    -3.25
##  7     7      22    4.11          25.0    -3.04
##  8     8      22.8  4.94          25.9    -3.13
##  9     9      23.7  5.48          26.5    -2.81
## 10    10      28.6 11.7           33.2    -4.59
## # ... with 1,105 more rows

#Caso 7: SPS30 VS PMSA003

# Descriptive statistics for the SPS30 and PMSA003 columns.
df %>%
  select(SPS30, PMSA003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1115
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	7.32	6.66	0.08	1.97	5.52	10.65	51.5	▇▂▁▁▁
PMSA003	0	1	10.46	11.99	0.00	0.85	6.48	16.20	89.0	▇▂▁▁▁

# Pearson correlation coefficient between SPS30 and PMSA003.
get_correlation(df, formula = SPS30 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.996

# Scatter plot of PMSA003 against SPS30 with a linear-model smoother
# (confidence ribbon disabled).
df %>%
  ggplot(mapping = aes(x = SPS30, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and PMSA003",
    x = "PM25 SPS30",
    y = "PM25 PMSA003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMSA003 on SPS30.
score_model <- lm(PMSA003 ~ SPS30, data = df)
# Coefficient table for the fitted model.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.65     0.048     -55.2       0    -2.75    -2.56
## 2 SPS30         1.79     0.005     369.        0     1.78     1.80

# Observed values, fitted values and residuals for each row of the fit.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 1,115 x 5
##       ID PMSA003 SPS30 PMSA003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1   0.017  0.9       -1.04     1.06 
##  2     2   0      1.02      -0.826    0.826
##  3     3   1.61   1.25      -0.414    2.02 
##  4     4   0.41   1.78       0.536   -0.126
##  5     5   1.72   2.6        2.01    -0.286
##  6     6   2.65   3          2.72    -0.073
##  7     7   4.04   4.11       4.71    -0.672
##  8     8   5.35   4.94       6.2     -0.85 
##  9     9   6.25   5.48       7.17    -0.918
## 10    10  18.5   11.7       18.3      0.183
## # ... with 1,105 more rows

#Caso 8: SPS30 VS PMS7003

# Descriptive statistics for the SPS30 and PMS7003 columns.
df %>%
  select(SPS30, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1115
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	7.32	6.66	0.08	1.97	5.52	10.65	51.5	▇▂▁▁▁
PMS7003	0	1	11.60	11.87	0.00	1.94	8.34	17.90	91.8	▇▂▁▁▁

# Pearson correlation coefficient between SPS30 and PMS7003.
get_correlation(df, formula = SPS30 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.996

# Scatter plot of PMS7003 against SPS30 with a linear-model smoother
# (confidence ribbon disabled).
df %>%
  ggplot(mapping = aes(x = SPS30, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and PMS7003",
    x = "PM25 SPS30",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMS7003 on SPS30.
score_model <- lm(PMS7003 ~ SPS30, data = df)
# Coefficient table for the fitted model.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.38     0.049     -28.1       0    -1.48    -1.29
## 2 SPS30         1.78     0.005     357.        0     1.76     1.78

# Observed values, fitted values and residuals for each row of the fit.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 1,115 x 5
##       ID PMS7003 SPS30 PMS7003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1   0      0.9        0.213   -0.213
##  2     2   0.290  1.02       0.426   -0.136
##  3     3   0.71   1.25       0.834   -0.124
##  4     4   1.4    1.78       1.78    -0.375
##  5     5   3.12   2.6        3.23    -0.11 
##  6     6   4.16   3          3.94     0.219
##  7     7   6.13   4.11       5.91     0.219
##  8     8   7.61   4.94       7.38     0.226
##  9     9   8.4    5.48       8.34     0.057
## 10    10  20.2   11.7       19.4      0.817
## # ... with 1,105 more rows

#Caso 9: SPS30 VS Oficial

# Descriptive statistics for the SPS30 and Oficial columns.
df %>%
  select(SPS30, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1115
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	7.32	6.66	0.08	1.97	5.52	10.65	51.5	▇▂▁▁▁
Oficial	0	1	12.80	8.32	0.00	6.50	11.20	17.70	55.4	▇▆▂▁▁

# Pearson correlation coefficient between SPS30 and Oficial.
get_correlation(df, formula = SPS30 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.565

# Scatter plot of Oficial against SPS30 with a linear-model smoother
# (confidence ribbon disabled).
df %>%
  ggplot(mapping = aes(x = SPS30, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and Oficial",
    x = "PM25 SPS30",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on SPS30.
score_model <- lm(Oficial ~ SPS30, data = df)
# Coefficient table for the fitted model.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    7.64      0.306      25.0       0    7.04     8.24 
## 2 SPS30        0.705     0.031      22.8       0    0.645    0.766

# Observed values, fitted values and residuals for each row of the fit.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 1,115 x 5
##       ID Oficial SPS30 Oficial_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1     3.7  0.9         8.27   -4.57 
##  2     2     0.3  1.02        8.36   -8.06 
##  3     3     5.9  1.25        8.52   -2.62 
##  4     4     8.1  1.78        8.89   -0.793
##  5     5     3.7  2.6         9.47   -5.77 
##  6     6     4    3           9.75   -5.75 
##  7     7     4.8  4.11       10.5    -5.74 
##  8     8     6    4.94       11.1    -5.12 
##  9     9     8.3  5.48       11.5    -3.20 
## 10    10     9.8 11.7        15.9    -6.09 
## # ... with 1,105 more rows

#Caso 10: HPMA115S0 VS PMSA003

# Descriptive statistics for the HPMA115S0 and PMSA003 columns.
df %>%
  select(HPMA115S0, PMSA003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1115
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	28.48	7.27	18.6	23.10	26.40	31.05	77.7	▇▂▁▁▁
PMSA003	0	1	10.46	11.99	0.0	0.85	6.48	16.20	89.0	▇▂▁▁▁

# Pearson correlation coefficient between HPMA115S0 and PMSA003.
get_correlation(df, formula = HPMA115S0 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.984

# Scatter plot of PMSA003 against HPMA115S0 with a linear-model smoother
# (confidence ribbon disabled).
df %>%
  ggplot(mapping = aes(x = HPMA115S0, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between HPMA115S0 and PMSA003",
    x = "PM25 HPMA115S0",
    y = "PM25 PMSA003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMSA003 on HPMA115S0.
score_model <- lm(PMSA003 ~ HPMA115S0, data = df)
# Coefficient table for the fitted model.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -35.8      0.256     -140.       0   -36.3    -35.2 
## 2 HPMA115S0     1.62     0.009      186.       0     1.60     1.64

# Observed values, fitted values and residuals for each row of the fit.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 1,115 x 5
##       ID PMSA003 HPMA115S0 PMSA003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1   0.017      18.6      -5.58      5.59
##  2     2   0          19.1      -4.76      4.76
##  3     3   1.61       19.4      -4.28      5.89
##  4     4   0.41       19.7      -3.79      4.2 
##  5     5   1.72       20.4      -2.65      4.37
##  6     6   2.65       20.6      -2.33      4.98
##  7     7   4.04       22        -0.058     4.10
##  8     8   5.35       22.8       1.24      4.11
##  9     9   6.25       23.7       2.7       3.55
## 10    10  18.5        28.6      10.6       7.85
## # ... with 1,105 more rows

#Caso 11: HPMA115S0 VS PMS7003

# Descriptive statistics for the HPMA115S0 and PMS7003 columns.
df %>%
  select(HPMA115S0, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1115
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	28.48	7.27	18.6	23.10	26.40	31.05	77.7	▇▂▁▁▁
PMS7003	0	1	11.60	11.87	0.0	1.94	8.34	17.90	91.8	▇▂▁▁▁

# Pearson correlation coefficient between HPMA115S0 and PMS7003.
get_correlation(df, formula = HPMA115S0 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.978

# Scatter plot of PMS7003 against HPMA115S0 with a linear-model smoother
# (confidence ribbon disabled).
df %>%
  ggplot(mapping = aes(x = HPMA115S0, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between HPMA115S0 and PMS7003",
    x = "PM25 HPMA115S0",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of PMS7003 on HPMA115S0.
score_model <- lm(PMS7003 ~ HPMA115S0, data = df)
# Coefficient table for the fitted model.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -33.9      0.298     -114.       0   -34.5    -33.3 
## 2 HPMA115S0     1.60     0.01       158.       0     1.58     1.62

# Observed values, fitted values and residuals for each row of the fit.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 1,115 x 5
##       ID PMS7003 HPMA115S0 PMS7003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1   0          18.6       -4.18     4.18
##  2     2   0.290      19.1       -3.39     3.68
##  3     3   0.71       19.4       -2.91     3.62
##  4     4   1.4        19.7       -2.43     3.83
##  5     5   3.12       20.4       -1.31     4.43
##  6     6   4.16       20.6       -0.99     5.15
##  7     7   6.13       22          1.25     4.88
##  8     8   7.61       22.8        2.52     5.08
##  9     9   8.4        23.7        3.96     4.44
## 10    10  20.2        28.6       11.8      8.41
## # ... with 1,105 more rows

#Caso 12: HPMA115S0 VS Oficial

# Descriptive statistics for the HPMA115S0 and Oficial columns.
df %>%
  select(HPMA115S0, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1115
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	28.48	7.27	18.6	23.1	26.4	31.05	77.7	▇▂▁▁▁
Oficial	0	1	12.80	8.32	0.0	6.5	11.2	17.70	55.4	▇▆▂▁▁

# Pearson correlation coefficient between HPMA115S0 and Oficial.
get_correlation(df, formula = HPMA115S0 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.557

# Scatter plot of Oficial against HPMA115S0 with a linear-model smoother
# (confidence ribbon disabled).
df %>%
  ggplot(mapping = aes(x = HPMA115S0, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between HPMA115S0 and Oficial",
    x = "PM25 HPMA115S0",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on HPMA115S0.
score_model <- lm(Oficial ~ HPMA115S0, data = df)
# Coefficient table for the fitted model.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -5.35      0.837     -6.40       0   -7.00    -3.71 
## 2 HPMA115S0    0.637     0.028     22.4        0    0.581    0.693

# Observed values, fitted values and residuals for each row of the fit.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 1,115 x 5
##       ID Oficial HPMA115S0 Oficial_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1     3.7      18.6        6.5    -2.8  
##  2     2     0.3      19.1        6.82   -6.52 
##  3     3     5.9      19.4        7.01   -1.11 
##  4     4     8.1      19.7        7.20    0.899
##  5     5     3.7      20.4        7.65   -3.95 
##  6     6     4        20.6        7.78   -3.78 
##  7     7     4.8      22          8.67   -3.87 
##  8     8     6        22.8        9.18   -3.18 
##  9     9     8.3      23.7        9.75   -1.45 
## 10    10     9.8      28.6       12.9    -3.07 
## # ... with 1,105 more rows

#Caso 13: PMSA003 VS PMS7003

# NOTE: this section is headed "PMSA003 VS PMS7003" but previously repeated the
# PMSA003 vs Oficial analysis of the section that follows it, so the
# PMSA003 / PMS7003 pair was never examined. The code below targets the pair
# named in the heading. The printed results that used to sit here belonged to
# PMSA003 vs Oficial (they are still shown in the following section) and have
# been removed; re-run the script to regenerate the output for this pair.

# Descriptive statistics for the PMSA003 and PMS7003 columns.
df %>%
  select(PMSA003, PMS7003) %>%
  skim()

## (output pending: re-run to obtain the summary for PMSA003 and PMS7003)

# Pearson correlation coefficient between PMSA003 and PMS7003.
df %>%
  get_correlation(formula = PMSA003 ~ PMS7003)

## (output pending: re-run to obtain the correlation)

# Scatter plot of PMS7003 against PMSA003 with a linear-model smoother
# (confidence ribbon disabled).
ggplot(df, aes(x = PMSA003, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(
    x = "PM25 PMSA003", y = "PM25 PMS7003",
    title = "Relationship between PMSA003 and PMS7003"
  ) +
  geom_smooth(method = "lm", se = FALSE)

# Simple linear regression of PMS7003 on PMSA003.
score_model <- lm(PMS7003 ~ PMSA003, data = df)
# Coefficient table for the fitted model.
get_regression_table(score_model)

## (output pending: re-run to obtain the regression table)

# Observed values, fitted values and residuals for each row of the fit.
regression_points <- get_regression_points(score_model)
regression_points

## (output pending: re-run to obtain the regression points)

#Caso 14: PMSA003 VS Oficial

# Descriptive statistics for the PMSA003 and Oficial columns.
df %>%
  select(PMSA003, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1115
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	10.46	11.99	0	0.85	6.48	16.2	89.0	▇▂▁▁▁
Oficial	0	1	12.80	8.32	0	6.50	11.20	17.7	55.4	▇▆▂▁▁

# Pearson correlation coefficient between PMSA003 and Oficial.
get_correlation(df, formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.560

# Scatter plot of Oficial against PMSA003 with a linear-model smoother
# (confidence ribbon disabled).
df %>%
  ggplot(mapping = aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between PMSA003 and Oficial",
    x = "PM25 PMSA003",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on PMSA003.
score_model <- lm(Oficial ~ PMSA003, data = df)
# Coefficient table for the fitted model.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    8.73      0.274      31.9       0    8.20     9.27 
## 2 PMSA003      0.389     0.017      22.5       0    0.355    0.422

# Observed values, fitted values and residuals for each row of the fit.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 1,115 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1     3.7   0.017        8.74   -5.04 
##  2     2     0.3   0            8.73   -8.43 
##  3     3     5.9   1.61         9.36   -3.46 
##  4     4     8.1   0.41         8.89   -0.793
##  5     5     3.7   1.72         9.40   -5.70 
##  6     6     4     2.65         9.76   -5.76 
##  7     7     4.8   4.04        10.3    -5.50 
##  8     8     6     5.35        10.8    -4.81 
##  9     9     8.3   6.25        11.2    -2.86 
## 10    10     9.8  18.5         15.9    -6.12 
## # ... with 1,105 more rows

#Caso 15: PMS7003 VS Oficial

# Descriptive statistics for the PMS7003 and Oficial columns.
df %>%
  select(PMS7003, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1115
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMS7003	0	1	11.6	11.87	0	1.94	8.34	17.9	91.8	▇▂▁▁▁
Oficial	0	1	12.8	8.32	0	6.50	11.20	17.7	55.4	▇▆▂▁▁

# Pearson correlation coefficient between PMS7003 and Oficial.
get_correlation(df, formula = PMS7003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.563

# Scatter plot of Oficial against PMS7003 with a linear-model smoother
# (confidence ribbon disabled). The title previously named PMSA003 although
# the x aesthetic and x-axis label are PMS7003; it now matches the data.
ggplot(df, aes(x = PMS7003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(
    x = "PM25 PMS7003", y = "PM25 Oficial",
    title = "Relationship between PMS7003 and Oficial"
  ) +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

# Simple linear regression of Oficial on PMS7003.
score_model <- lm(Oficial ~ PMS7003, data = df)
# Coefficient table for the fitted model.
score_model %>%
  get_regression_table()

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    8.22      0.288      28.5       0     7.66    8.79 
## 2 PMS7003      0.394     0.017      22.7       0     0.36    0.428

# Observed values, fitted values and residuals for each row of the fit.
regression_points <- score_model %>%
  get_regression_points()
regression_points

## # A tibble: 1,115 x 5
##       ID Oficial PMS7003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1     3.7   0            8.22   -4.52 
##  2     2     0.3   0.290        8.34   -8.04 
##  3     3     5.9   0.71         8.50   -2.60 
##  4     4     8.1   1.4          8.78   -0.675
##  5     5     3.7   3.12         9.45   -5.75 
##  6     6     4     4.16         9.86   -5.86 
##  7     7     4.8   6.13        10.6    -5.84 
##  8     8     6     7.61        11.2    -5.22 
##  9     9     8.3   8.4         11.5    -3.24 
## 10    10     9.8  20.2         16.2    -6.39 
## # ... with 1,105 more rows

Ferias-Canairios.R

DBB

2021-02-24