Ferias-Canairios.R

library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(moderndive)
library(skimr)

# **1 hour late ESTACION KENNEDY VS CANAIRIOS**
# **5 different sensors: PMS7003 & PMSA003 & HPMA115S0 & SPS30 & SNGCJA5**

# Read the measurements spreadsheet into a tibble.
# NOTE(review): absolute Windows path -- the script only runs on a machine
# where this exact file exists.
df <- read_excel(path = "C:/Mediciones/FERIAS_CANAIRIOS_1h.xlsx")
# View(df)

# Compact overview: one line per column with its type and first values
df %>% glimpse()

## Rows: 1,114
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ Fecha     <chr> "12-11-2020 24:00", "13-11-2020 01:00", "13-11-2020 02:00...
## $ Oficial   <dbl> 0.3, 5.9, 8.1, 3.7, 4.0, 4.8, 6.0, 8.3, 9.8, 16.5, 6.2, 1...
## $ PMS7003   <dbl> 0.00, 0.29, 0.71, 1.40, 3.12, 4.16, 6.13, 7.61, 8.40, 20....
## $ PMSA003   <dbl> 0.0169, 0.0000, 1.6100, 0.4100, 1.7200, 2.6500, 4.0400, 5...
## $ HPMA115S0 <dbl> 18.6, 19.1, 19.4, 19.7, 20.4, 20.6, 22.0, 22.8, 23.7, 28....
## $ SPS30     <dbl> 0.90, 1.02, 1.25, 1.78, 2.60, 3.00, 4.11, 4.94, 5.48, 11....
## $ SNGCJA5   <dbl> 0.00, 0.18, 0.45, 0.90, 1.51, 1.81, 2.86, 3.39, 3.48, 8.2...

# Show 10 randomly chosen rows as a quick sanity check of the imported data.
# slice_sample() replaces sample_n(), which is superseded since dplyr 1.0.0.
df %>%
  slice_sample(n = 10)

## # A tibble: 10 x 8
##      Num Fecha            Oficial PMS7003 PMSA003 HPMA115S0 SPS30 SNGCJA5
##    <dbl> <chr>              <dbl>   <dbl>   <dbl>     <dbl> <dbl>   <dbl>
##  1   167 19-11-2020 24:00    12.8 29.9    26.9         38   17.2  13.9   
##  2  1099 08-01-2021 02:00    15    7.89    6.28        26.9  5.30  3.35  
##  3  1210 12-01-2021 12:00     8    0.0175  0           23.3  1.02 12.5   
##  4   622 10-12-2020 16:00    35.4 53.3    54.3         57.9 31.8  27.1   
##  5   192 21-11-2020 01:00     2.9  1.37    0.44        22    1.86  1.09  
##  6   478 02-12-2020 22:00     6    0.0345  0           21.3  0.97  0.0517
##  7     7 13-11-2020 06:00     6    6.13    4.04        22    4.11  2.86  
##  8   153 19-11-2020 10:00     4.8  4.91    3.33        23.8  3.82  2.04  
##  9    23 13-11-2020 22:00    22.3 18.5    15.6         29.4 10     8.21  
## 10   204 21-11-2020 13:00     6.9  0.0351  0.0175      22.1  0.94  0

# Interactive overlay of all six PM2.5 series against the row index `Num`:
# the first sensor creates the figure, every further series is added as its
# own lines+markers trace.
fig <- plot_ly(
  df,
  x = ~Num, y = ~PMS7003, name = "PM2.5 PMS7003",
  type = "scatter", mode = "lines+markers"
) %>%
  add_trace(y = ~PMSA003, name = "PM2.5 PMSA003", mode = "lines+markers") %>%
  add_trace(y = ~HPMA115S0, name = "PM2.5 HPMA115S0", mode = "lines+markers") %>%
  add_trace(y = ~SPS30, name = "PM2.5 SPS30", mode = "lines+markers") %>%
  add_trace(y = ~SNGCJA5, name = "PM2.5 SNGCJA5", mode = "lines+markers") %>%
  add_trace(y = ~Oficial, name = "PM2.5 Oficial", mode = "lines+markers")
fig

#Caso 1: SNGCJA5 VS SPS30

# Summary statistics (missing count, mean, sd, quantiles) for the two series
df %>%
  select(SNGCJA5, SPS30) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1114
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.68	9.75	0.00	1.53	4.80	9.75	236.87	▇▁▁▁▁
SPS30	0	1	7.32	6.66	0.08	1.97	5.53	10.67	51.50	▇▂▁▁▁

# Pearson correlation coefficient (original): SNGCJA5 vs SPS30
get_correlation(df, formula = SNGCJA5 ~ SPS30)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.497

# Scatter plot of SPS30 against SNGCJA5 with a least-squares line
# (se = FALSE: no confidence band)
ggplot(data = df, mapping = aes(x = SNGCJA5, y = SPS30)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and SPS30",
    x = "PM25 SNGCJA5",
    y = "PM25 SPS30"
  )

## `geom_smooth()` using formula 'y ~ x'

# Fit the simple linear regression (original): SPS30 explained by SNGCJA5
score_model <- lm(formula = SPS30 ~ SNGCJA5, data = df)
# Regression table: estimate, std. error, statistic, p-value and CI per term
get_regression_table(model = score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    5.06      0.21       24.1       0    4.64     5.47 
## 2 SNGCJA5      0.339     0.018      19.1       0    0.305    0.374

# Observed value, fitted value and residual for every row used by the model;
# the outer parentheses print the result right after assigning it
(regression_points <- get_regression_points(score_model))

## # A tibble: 1,114 x 5
##       ID SPS30 SNGCJA5 SPS30_hat residual
##    <int> <dbl>   <dbl>     <dbl>    <dbl>
##  1     1  0.9     0         5.06   -4.16 
##  2     2  1.02    0.18      5.12   -4.10 
##  3     3  1.25    0.45      5.21   -3.96 
##  4     4  1.78    0.9       5.36   -3.58 
##  5     5  2.6     1.51      5.57   -2.97 
##  6     6  3       1.81      5.67   -2.67 
##  7     7  4.11    2.86      6.03   -1.92 
##  8     8  4.94    3.39      6.21   -1.27 
##  9     9  5.48    3.48      6.24   -0.757
## 10    10 11.7     8.23      7.85    3.85 
## # ... with 1,104 more rows

#Caso 2: SNGCJA5 VS HPMA115S0

# Summary statistics (missing count, mean, sd, quantiles) for the two series
df %>%
  select(SNGCJA5, HPMA115S0) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1114
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.68	9.75	0.0	1.53	4.8	9.75	236.87	▇▁▁▁▁
HPMA115S0	0	1	28.49	7.27	18.6	23.10	26.4	31.08	77.70	▇▂▁▁▁

# Pearson correlation coefficient (original): SNGCJA5 vs HPMA115S0
get_correlation(df, formula = SNGCJA5 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.501

# Scatter plot of HPMA115S0 against SNGCJA5 with a least-squares line
# (se = FALSE: no confidence band)
ggplot(data = df, mapping = aes(x = SNGCJA5, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and HPMA115S0",
    x = "PM25 SNGCJA5",
    y = "PM25 HPMA115S0"
  )

## `geom_smooth()` using formula 'y ~ x'

# Fit the simple linear regression (original): HPMA115S0 explained by SNGCJA5
score_model <- lm(formula = HPMA115S0 ~ SNGCJA5, data = df)
# Regression table: estimate, std. error, statistic, p-value and CI per term
get_regression_table(model = score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   26.0       0.229     114.        0   25.5     26.4  
## 2 SNGCJA5      0.373     0.019      19.3       0    0.335    0.411

# Observed value, fitted value and residual for every row used by the model;
# the outer parentheses print the result right after assigning it
(regression_points <- get_regression_points(score_model))

## # A tibble: 1,114 x 5
##       ID HPMA115S0 SNGCJA5 HPMA115S0_hat residual
##    <int>     <dbl>   <dbl>         <dbl>    <dbl>
##  1     1      18.6    0             26.0   -7.40 
##  2     2      19.1    0.18          26.1   -6.96 
##  3     3      19.4    0.45          26.2   -6.76 
##  4     4      19.7    0.9           26.3   -6.63 
##  5     5      20.4    1.51          26.6   -6.16 
##  6     6      20.6    1.81          26.7   -6.07 
##  7     7      22      2.86          27.1   -5.06 
##  8     8      22.8    3.39          27.3   -4.46 
##  9     9      23.7    3.48          27.3   -3.60 
## 10    10      28.6    8.23          29.1   -0.468
## # ... with 1,104 more rows

#Caso 3: SNGCJA5 VS PMSA003

# Summary statistics (missing count, mean, sd, quantiles) for the two series
df %>%
  select(SNGCJA5, PMSA003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1114
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.68	9.75	0	1.53	4.80	9.75	236.87	▇▁▁▁▁
PMSA003	0	1	10.47	11.99	0	0.85	6.49	16.20	89.00	▇▂▁▁▁

# Pearson correlation coefficient (original): SNGCJA5 vs PMSA003
get_correlation(df, formula = SNGCJA5 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.504

# Scatter plot of PMSA003 against SNGCJA5 with a least-squares line
# (se = FALSE: no confidence band)
ggplot(data = df, mapping = aes(x = SNGCJA5, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and PMSA003",
    x = "PM25 SNGCJA5",
    y = "PM25 PMSA003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Fit the simple linear regression (original): PMSA003 explained by SNGCJA5
score_model <- lm(formula = PMSA003 ~ SNGCJA5, data = df)
# Regression table: estimate, std. error, statistic, p-value and CI per term
get_regression_table(model = score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     6.33     0.376      16.8       0    5.60     7.07 
## 2 SNGCJA5       0.62     0.032      19.5       0    0.557    0.682

# Observed value, fitted value and residual for every row used by the model;
# the outer parentheses print the result right after assigning it
(regression_points <- get_regression_points(score_model))

## # A tibble: 1,114 x 5
##       ID PMSA003 SNGCJA5 PMSA003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1   0.017    0           6.33    -6.32
##  2     2   0        0.18        6.44    -6.44
##  3     3   1.61     0.45        6.61    -5.00
##  4     4   0.41     0.9         6.89    -6.48
##  5     5   1.72     1.51        7.27    -5.55
##  6     6   2.65     1.81        7.45    -4.80
##  7     7   4.04     2.86        8.10    -4.07
##  8     8   5.35     3.39        8.43    -3.08
##  9     9   6.25     3.48        8.49    -2.24
## 10    10  18.5      8.23       11.4      7.07
## # ... with 1,104 more rows

#Caso 4: SNGCJA5 VS PMS7003

# Summary statistics (missing count, mean, sd, quantiles) for the two series
df %>%
  select(SNGCJA5, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1114
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.68	9.75	0	1.53	4.80	9.75	236.87	▇▁▁▁▁
PMS7003	0	1	11.61	11.87	0	1.95	8.37	17.90	91.80	▇▂▁▁▁

# Pearson correlation coefficient (original): SNGCJA5 vs PMS7003
get_correlation(df, formula = SNGCJA5 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.497

# Scatter plot of PMS7003 against SNGCJA5 with a least-squares line
# (se = FALSE: no confidence band)
ggplot(data = df, mapping = aes(x = SNGCJA5, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and PMS7003",
    x = "PM25 SNGCJA5",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Fit the simple linear regression (original): PMS7003 explained by SNGCJA5
score_model <- lm(formula = PMS7003 ~ SNGCJA5, data = df)
# Regression table: estimate, std. error, statistic, p-value and CI per term
get_regression_table(model = score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    7.57      0.374      20.2       0    6.84     8.30 
## 2 SNGCJA5      0.606     0.032      19.1       0    0.543    0.668

# Observed value, fitted value and residual for every row used by the model;
# the outer parentheses print the result right after assigning it
(regression_points <- get_regression_points(score_model))

## # A tibble: 1,114 x 5
##       ID PMS7003 SNGCJA5 PMS7003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1   0        0           7.57    -7.57
##  2     2   0.290    0.18        7.68    -7.39
##  3     3   0.71     0.45        7.84    -7.13
##  4     4   1.4      0.9         8.11    -6.71
##  5     5   3.12     1.51        8.48    -5.36
##  6     6   4.16     1.81        8.67    -4.51
##  7     7   6.13     2.86        9.30    -3.17
##  8     8   7.61     3.39        9.62    -2.01
##  9     9   8.4      3.48        9.68    -1.28
## 10    10  20.2      8.23       12.6      7.65
## # ... with 1,104 more rows

#Caso 5: SNGCJA5 VS Oficial

# Summary statistics (missing count, mean, sd, quantiles) for the two series
df %>%
  select(SNGCJA5, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1114
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.68	9.75	0	1.53	4.8	9.75	236.87	▇▁▁▁▁
Oficial	0	1	12.81	8.32	0	6.50	11.2	17.70	55.40	▇▆▂▁▁

# Pearson correlation coefficient (original): SNGCJA5 vs Oficial
get_correlation(df, formula = SNGCJA5 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.347

# Scatter plot of Oficial against SNGCJA5 with a least-squares line
# (se = FALSE: no confidence band)
ggplot(data = df, mapping = aes(x = SNGCJA5, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SNGCJA5 and Oficial",
    x = "PM25 SNGCJA5",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Fit the simple linear regression (original): Oficial explained by SNGCJA5
score_model <- lm(formula = Oficial ~ SNGCJA5, data = df)
# Regression table: estimate, std. error, statistic, p-value and CI per term
get_regression_table(model = score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   10.8       0.283      38.2       0   10.3     11.4  
## 2 SNGCJA5      0.296     0.024      12.4       0    0.249    0.343

# Observed value, fitted value and residual for every row used by the model;
# the outer parentheses print the result right after assigning it
(regression_points <- get_regression_points(score_model))

## # A tibble: 1,114 x 5
##       ID Oficial SNGCJA5 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1     0.3    0           10.8   -10.5 
##  2     2     5.9    0.18        10.9    -4.98
##  3     3     8.1    0.45        11.0    -2.86
##  4     4     3.7    0.9         11.1    -7.39
##  5     5     4      1.51        11.3    -7.28
##  6     6     4.8    1.81        11.4    -6.56
##  7     7     6      2.86        11.7    -5.68
##  8     8     8.3    3.39        11.8    -3.53
##  9     9     9.8    3.48        11.9    -2.06
## 10    10    16.5    8.23        13.3     3.23
## # ... with 1,104 more rows

#Caso 6: SPS30 VS HPMA115S0

# Summary statistics (missing count, mean, sd, quantiles) for the two series
df %>%
  select(SPS30, HPMA115S0) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1114
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	7.32	6.66	0.08	1.97	5.53	10.67	51.5	▇▂▁▁▁
HPMA115S0	0	1	28.49	7.27	18.60	23.10	26.40	31.08	77.7	▇▂▁▁▁

# Pearson correlation coefficient (original): SPS30 vs HPMA115S0
get_correlation(df, formula = SPS30 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.983

# Scatter plot of HPMA115S0 against SPS30 with a least-squares line
# (se = FALSE: no confidence band)
ggplot(data = df, mapping = aes(x = SPS30, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and HPMA115S0",
    x = "PM25 SPS30",
    y = "PM25 HPMA115S0"
  )

## `geom_smooth()` using formula 'y ~ x'

# Fit the simple linear regression (original): HPMA115S0 explained by SPS30
score_model <- lm(formula = HPMA115S0 ~ SPS30, data = df)
# Regression table: estimate, std. error, statistic, p-value and CI per term
get_regression_table(model = score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    20.6      0.059      347.       0    20.5     20.7 
## 2 SPS30         1.07     0.006      179.       0     1.06     1.08

# Observed value, fitted value and residual for every row used by the model;
# the outer parentheses print the result right after assigning it
(regression_points <- get_regression_points(score_model))

## # A tibble: 1,114 x 5
##       ID HPMA115S0 SPS30 HPMA115S0_hat residual
##    <int>     <dbl> <dbl>         <dbl>    <dbl>
##  1     1      18.6  0.9           21.6    -2.99
##  2     2      19.1  1.02          21.7    -2.62
##  3     3      19.4  1.25          22.0    -2.57
##  4     4      19.7  1.78          22.5    -2.84
##  5     5      20.4  2.6           23.4    -3.02
##  6     6      20.6  3             23.8    -3.25
##  7     7      22    4.11          25.0    -3.04
##  8     8      22.8  4.94          25.9    -3.13
##  9     9      23.7  5.48          26.5    -2.81
## 10    10      28.6 11.7           33.2    -4.59
## # ... with 1,104 more rows

#Caso 7: SPS30 VS PMSA003

# Summary statistics (missing count, mean, sd, quantiles) for the two series
df %>%
  select(SPS30, PMSA003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1114
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	7.32	6.66	0.08	1.97	5.53	10.67	51.5	▇▂▁▁▁
PMSA003	0	1	10.47	11.99	0.00	0.85	6.49	16.20	89.0	▇▂▁▁▁

# Pearson correlation coefficient (original): SPS30 vs PMSA003
get_correlation(df, formula = SPS30 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.996

# Scatter plot of PMSA003 against SPS30 with a least-squares line
# (se = FALSE: no confidence band)
ggplot(data = df, mapping = aes(x = SPS30, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and PMSA003",
    x = "PM25 SPS30",
    y = "PM25 PMSA003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Fit the simple linear regression (original): PMSA003 explained by SPS30
score_model <- lm(formula = PMSA003 ~ SPS30, data = df)
# Regression table: estimate, std. error, statistic, p-value and CI per term
get_regression_table(model = score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.66     0.048     -55.2       0    -2.75    -2.56
## 2 SPS30         1.79     0.005     369.        0     1.78     1.80

# Observed value, fitted value and residual for every row used by the model;
# the outer parentheses print the result right after assigning it
(regression_points <- get_regression_points(score_model))

## # A tibble: 1,114 x 5
##       ID PMSA003 SPS30 PMSA003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1   0.017  0.9       -1.04     1.06 
##  2     2   0      1.02      -0.827    0.827
##  3     3   1.61   1.25      -0.415    2.02 
##  4     4   0.41   1.78       0.535   -0.125
##  5     5   1.72   2.6        2.00    -0.285
##  6     6   2.65   3          2.72    -0.072
##  7     7   4.04   4.11       4.71    -0.672
##  8     8   5.35   4.94       6.20    -0.849
##  9     9   6.25   5.48       7.17    -0.917
## 10    10  18.5   11.7       18.3      0.184
## # ... with 1,104 more rows

#Caso 8: SPS30 VS PMS7003

# Summary statistics (missing count, mean, sd, quantiles) for the two series
df %>%
  select(SPS30, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1114
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	7.32	6.66	0.08	1.97	5.53	10.67	51.5	▇▂▁▁▁
PMS7003	0	1	11.61	11.87	0.00	1.95	8.37	17.90	91.8	▇▂▁▁▁

# Pearson correlation coefficient (original): SPS30 vs PMS7003
get_correlation(df, formula = SPS30 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.996

# Scatter plot of PMS7003 against SPS30 with a least-squares line
# (se = FALSE: no confidence band)
ggplot(data = df, mapping = aes(x = SPS30, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and PMS7003",
    x = "PM25 SPS30",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Fit the simple linear regression (original): PMS7003 explained by SPS30
score_model <- lm(formula = PMS7003 ~ SPS30, data = df)
# Regression table: estimate, std. error, statistic, p-value and CI per term
get_regression_table(model = score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.38     0.049     -28.1       0    -1.48    -1.29
## 2 SPS30         1.78     0.005     356.        0     1.76     1.78

# Observed value, fitted value and residual for every row used by the model;
# the outer parentheses print the result right after assigning it
(regression_points <- get_regression_points(score_model))

## # A tibble: 1,114 x 5
##       ID PMS7003 SPS30 PMS7003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1   0      0.9        0.213   -0.213
##  2     2   0.290  1.02       0.426   -0.136
##  3     3   0.71   1.25       0.834   -0.124
##  4     4   1.4    1.78       1.78    -0.375
##  5     5   3.12   2.6        3.23    -0.111
##  6     6   4.16   3          3.94     0.219
##  7     7   6.13   4.11       5.91     0.219
##  8     8   7.61   4.94       7.38     0.226
##  9     9   8.4    5.48       8.34     0.057
## 10    10  20.2   11.7       19.4      0.817
## # ... with 1,104 more rows

#Caso 9: SPS30 VS Oficial

# Summary statistics (missing count, mean, sd, quantiles) for the two series
df %>%
  select(SPS30, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1114
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	7.32	6.66	0.08	1.97	5.53	10.67	51.5	▇▂▁▁▁
Oficial	0	1	12.81	8.32	0.00	6.50	11.20	17.70	55.4	▇▆▂▁▁

# Pearson correlation coefficient (original): SPS30 vs Oficial
get_correlation(df, formula = SPS30 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.712

# Scatter plot of Oficial against SPS30 with a least-squares line
# (se = FALSE: no confidence band)
ggplot(data = df, mapping = aes(x = SPS30, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between SPS30 and Oficial",
    x = "PM25 SPS30",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Fit the simple linear regression (original): Oficial explained by SPS30
score_model <- lm(formula = Oficial ~ SPS30, data = df)
# Regression table: estimate, std. error, statistic, p-value and CI per term
get_regression_table(model = score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    6.3       0.26       24.2       0    5.79      6.81
## 2 SPS30        0.889     0.026      33.8       0    0.837     0.94

# Observed value, fitted value and residual for every row used by the model;
# the outer parentheses print the result right after assigning it
(regression_points <- get_regression_points(score_model))

## # A tibble: 1,114 x 5
##       ID Oficial SPS30 Oficial_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1     0.3  0.9         7.1    -6.8  
##  2     2     5.9  1.02        7.21   -1.31 
##  3     3     8.1  1.25        7.41    0.689
##  4     4     3.7  1.78        7.88   -4.18 
##  5     5     4    2.6         8.61   -4.61 
##  6     6     4.8  3           8.97   -4.17 
##  7     7     6    4.11        9.95   -3.95 
##  8     8     8.3  4.94       10.7    -2.39 
##  9     9     9.8  5.48       11.2    -1.37 
## 10    10    16.5 11.7        16.7    -0.196
## # ... with 1,104 more rows

#Caso 10: HPMA115S0 VS PMSA003

# Summary statistics (missing count, mean, sd, quantiles) for the two series
df %>%
  select(HPMA115S0, PMSA003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1114
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	28.49	7.27	18.6	23.10	26.40	31.08	77.7	▇▂▁▁▁
PMSA003	0	1	10.47	11.99	0.0	0.85	6.49	16.20	89.0	▇▂▁▁▁

# Pearson correlation coefficient (original): HPMA115S0 vs PMSA003
get_correlation(df, formula = HPMA115S0 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.984

# Scatter plot of PMSA003 against HPMA115S0 with a least-squares line
# (se = FALSE: no confidence band)
ggplot(data = df, mapping = aes(x = HPMA115S0, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between HPMA115S0 and PMSA003",
    x = "PM25 HPMA115S0",
    y = "PM25 PMSA003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Fit the simple linear regression (original): PMSA003 explained by HPMA115S0
score_model <- lm(formula = PMSA003 ~ HPMA115S0, data = df)
# Regression table: estimate, std. error, statistic, p-value and CI per term
get_regression_table(model = score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -35.7      0.256     -140.       0   -36.2    -35.2 
## 2 HPMA115S0     1.62     0.009      186.       0     1.60     1.64

# Observed value, fitted value and residual for every row used by the model;
# the outer parentheses print the result right after assigning it
(regression_points <- get_regression_points(score_model))

## # A tibble: 1,114 x 5
##       ID PMSA003 HPMA115S0 PMSA003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1   0.017      18.6      -5.57      5.59
##  2     2   0          19.1      -4.76      4.76
##  3     3   1.61       19.4      -4.27      5.88
##  4     4   0.41       19.7      -3.79      4.20
##  5     5   1.72       20.4      -2.65      4.37
##  6     6   2.65       20.6      -2.33      4.98
##  7     7   4.04       22        -0.056     4.10
##  8     8   5.35       22.8       1.24      4.11
##  9     9   6.25       23.7       2.70      3.55
## 10    10  18.5        28.6      10.7       7.85
## # ... with 1,104 more rows

#Caso 11: HPMA115S0 VS PMS7003

# Summary statistics (missing count, mean, sd, quantiles) for the two series
df %>%
  select(HPMA115S0, PMS7003) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1114
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	28.49	7.27	18.6	23.10	26.40	31.08	77.7	▇▂▁▁▁
PMS7003	0	1	11.61	11.87	0.0	1.95	8.37	17.90	91.8	▇▂▁▁▁

# Pearson correlation coefficient (original): HPMA115S0 vs PMS7003
get_correlation(df, formula = HPMA115S0 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.978

# Scatter plot of PMS7003 against HPMA115S0 with a least-squares line
# (se = FALSE: no confidence band)
ggplot(data = df, mapping = aes(x = HPMA115S0, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between HPMA115S0 and PMS7003",
    x = "PM25 HPMA115S0",
    y = "PM25 PMS7003"
  )

## `geom_smooth()` using formula 'y ~ x'

# Fit the simple linear regression (original): PMS7003 explained by HPMA115S0
score_model <- lm(formula = PMS7003 ~ HPMA115S0, data = df)
# Regression table: estimate, std. error, statistic, p-value and CI per term
get_regression_table(model = score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -33.9      0.298     -114.       0   -34.5    -33.3 
## 2 HPMA115S0     1.60     0.01       158.       0     1.58     1.62

# Observed value, fitted value and residual for every row used by the model;
# the outer parentheses print the result right after assigning it
(regression_points <- get_regression_points(score_model))

## # A tibble: 1,114 x 5
##       ID PMS7003 HPMA115S0 PMS7003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1   0          18.6      -4.18      4.18
##  2     2   0.290      19.1      -3.38      3.67
##  3     3   0.71       19.4      -2.90      3.61
##  4     4   1.4        19.7      -2.42      3.82
##  5     5   3.12       20.4      -1.30      4.42
##  6     6   4.16       20.6      -0.986     5.15
##  7     7   6.13       22         1.25      4.88
##  8     8   7.61       22.8       2.53      5.08
##  9     9   8.4        23.7       3.97      4.43
## 10    10  20.2        28.6      11.8       8.41
## # ... with 1,104 more rows

#Caso 12: HPMA115S0 VS Oficial

# Summary statistics (missing count, mean, sd, quantiles) for the two series
df %>%
  select(HPMA115S0, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1114
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	28.49	7.27	18.6	23.1	26.4	31.08	77.7	▇▂▁▁▁
Oficial	0	1	12.81	8.32	0.0	6.5	11.2	17.70	55.4	▇▆▂▁▁

# Pearson correlation coefficient (original): HPMA115S0 vs Oficial
get_correlation(df, formula = HPMA115S0 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.711

# Scatter plot of Oficial against HPMA115S0 with a least-squares line
# (se = FALSE: no confidence band)
ggplot(data = df, mapping = aes(x = HPMA115S0, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between HPMA115S0 and Oficial",
    x = "PM25 HPMA115S0",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Fit the simple linear regression (original): Oficial explained by HPMA115S0
score_model <- lm(formula = Oficial ~ HPMA115S0, data = df)
# Regression table: estimate, std. error, statistic, p-value and CI per term
get_regression_table(model = score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept  -10.4       0.709     -14.6       0  -11.8     -8.97 
## 2 HPMA115S0    0.813     0.024      33.7       0    0.766    0.861

# Observed value, fitted value and residual for every row used by the model;
# the outer parentheses print the result right after assigning it
(regression_points <- get_regression_points(score_model))

## # A tibble: 1,114 x 5
##       ID Oficial HPMA115S0 Oficial_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1     0.3      18.6        4.76   -4.46 
##  2     2     5.9      19.1        5.17    0.729
##  3     3     8.1      19.4        5.42    2.68 
##  4     4     3.7      19.7        5.66   -1.96 
##  5     5     4        20.4        6.23   -2.23 
##  6     6     4.8      20.6        6.39   -1.59 
##  7     7     6        22          7.53   -1.53 
##  8     8     8.3      22.8        8.18    0.12 
##  9     9     9.8      23.7        8.91    0.888
## 10    10    16.5      28.6       12.9     3.60 
## # ... with 1,104 more rows

#Caso 13: PMSA003 VS PMS7003

# NOTE: this section used to repeat the PMSA003 VS Oficial analysis (a
# copy-paste of the following case), although its header announces
# PMSA003 VS PMS7003. The code now analyses the announced pair. The output
# previously recorded here belonged to the PMSA003 VS Oficial run, which is
# still shown in full under that case; re-run this section to regenerate the
# summary, correlation, regression table and regression points.

# Summary statistics (missing count, mean, sd, quantiles) for the two series
df %>% select(PMSA003, PMS7003) %>% skim()

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ PMS7003)

# Scatter plot of PMS7003 against PMSA003 with a least-squares line
# (se = FALSE: no confidence band)
ggplot(df, aes(x = PMSA003, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 PMS7003",
       title = "Relationship between PMSA003 and PMS7003") +
  geom_smooth(method = "lm", se = FALSE)

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

# Observed value, fitted value and residual for every row used by the model
regression_points <- get_regression_points(score_model)
regression_points

#Caso 14: PMSA003 VS Oficial

# Summary statistics (missing count, mean, sd, quantiles) for the two series
df %>%
  select(PMSA003, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1114
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	10.47	11.99	0	0.85	6.49	16.2	89.0	▇▂▁▁▁
Oficial	0	1	12.81	8.32	0	6.50	11.20	17.7	55.4	▇▆▂▁▁

# Pearson correlation coefficient (original): PMSA003 vs Oficial
get_correlation(df, formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.711

# Scatter plot of Oficial against PMSA003 with a least-squares line
# (se = FALSE: no confidence band)
ggplot(data = df, mapping = aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between PMSA003 and Oficial",
    x = "PM25 PMSA003",
    y = "PM25 Oficial"
  )

## `geom_smooth()` using formula 'y ~ x'

# Fit the simple linear regression (original): Oficial explained by PMSA003
score_model <- lm(formula = Oficial ~ PMSA003, data = df)
# Regression table: estimate, std. error, statistic, p-value and CI per term
get_regression_table(model = score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    7.64      0.233      32.8       0    7.19     8.1  
## 2 PMSA003      0.493     0.015      33.7       0    0.464    0.522

# Observed value, fitted value and residual for every row used by the model;
# the outer parentheses print the result right after assigning it
(regression_points <- get_regression_points(score_model))

## # A tibble: 1,114 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1     0.3   0.017        7.65   -7.35 
##  2     2     5.9   0            7.64   -1.74 
##  3     3     8.1   1.61         8.44   -0.337
##  4     4     3.7   0.41         7.85   -4.15 
##  5     5     4     1.72         8.49   -4.49 
##  6     6     4.8   2.65         8.95   -4.15 
##  7     7     6     4.04         9.64   -3.64 
##  8     8     8.3   5.35        10.3    -1.98 
##  9     9     9.8   6.25        10.7    -0.925
## 10    10    16.5  18.5         16.8    -0.266
## # ... with 1,104 more rows

#Caso 15: PMS7003 VS Oficial

# Summary statistics (missing count, mean, sd, quantiles) for the two series
df %>%
  select(PMS7003, Oficial) %>%
  skim()

Data summary
Name	Piped data
Number of rows	1114
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMS7003	0	1	11.61	11.87	0	1.95	8.37	17.9	91.8	▇▂▁▁▁
Oficial	0	1	12.81	8.32	0	6.50	11.20	17.7	55.4	▇▆▂▁▁

# Pearson correlation coefficient (original): PMS7003 vs Oficial
get_correlation(df, formula = PMS7003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.713

# Scatter plot of Oficial against PMS7003 with a least-squares line
# (se = FALSE: no confidence band).
# The title previously named PMSA003 (copied from the preceding case) even
# though the plotted x variable and its axis label are PMS7003.
ggplot(df, aes(x = PMS7003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMS7003", y = "PM25 Oficial",
       title = "Relationship between PMS7003 and Oficial") +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

# Fit the simple linear regression (original): Oficial explained by PMS7003
score_model <- lm(formula = Oficial ~ PMS7003, data = df)
# Regression table: estimate, std. error, statistic, p-value and CI per term
get_regression_table(model = score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     7.00     0.244      28.6       0    6.52     7.48 
## 2 PMS7003       0.5      0.015      34.0       0    0.471    0.529

# Observed value, fitted value and residual for every row used by the model;
# the outer parentheses print the result right after assigning it
(regression_points <- get_regression_points(score_model))

## # A tibble: 1,114 x 5
##       ID Oficial PMS7003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1     0.3   0            7.00   -6.70 
##  2     2     5.9   0.290        7.15   -1.25 
##  3     3     8.1   0.71         7.36    0.742
##  4     4     3.7   1.4          7.70   -4.00 
##  5     5     4     3.12         8.56   -4.56 
##  6     6     4.8   4.16         9.08   -4.28 
##  7     7     6     6.13        10.1    -4.07 
##  8     8     8.3   7.61        10.8    -2.51 
##  9     9     9.8   8.4         11.2    -1.40 
## 10    10    16.5  20.2         17.1    -0.598
## # ... with 1,104 more rows

Ferias-Canairios.R

DBB

2021-02-24