Ferias-Canairios.R

library(readxl)
library(tidyverse)

## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(moderndive)
library(skimr)

# **2-hour delay: ESTACION KENNEDY VS CANAIRIOS**
# **5 different sensors: PMS7003 & PMSA003 & HPMA115S0 & SPS30 & SNGCJA5**

# Load the measurements workbook into `df`.
# NOTE(review): hard-coded absolute Windows path - the script only runs on a
# machine where the file exists at exactly this location.
df <- read_excel("C:/Mediciones/FERIAS_CANAIRIOS_2h.xlsx")
#View(df)

glimpse(df)

## Rows: 1,113
## Columns: 8
## $ Num       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...
## $ Fecha     <chr> "12-11-2020 24:00", "13-11-2020 01:00", "13-11-2020 02:00...
## $ Oficial   <dbl> 5.9, 8.1, 3.7, 4.0, 4.8, 6.0, 8.3, 9.8, 16.5, 6.2, 10.2, ...
## $ PMS7003   <dbl> 0.00, 0.29, 0.71, 1.40, 3.12, 4.16, 6.13, 7.61, 8.40, 20....
## $ PMSA003   <dbl> 0.0169, 0.0000, 1.6100, 0.4100, 1.7200, 2.6500, 4.0400, 5...
## $ HPMA115S0 <dbl> 18.6, 19.1, 19.4, 19.7, 20.4, 20.6, 22.0, 22.8, 23.7, 28....
## $ SPS30     <dbl> 0.90, 1.02, 1.25, 1.78, 2.60, 3.00, 4.11, 4.94, 5.48, 11....
## $ SNGCJA5   <dbl> 0.00, 0.18, 0.45, 0.90, 1.51, 1.81, 2.86, 3.39, 3.48, 8.2...

# Print 10 randomly chosen rows as a quick sanity check of the data.
# slice_sample() supersedes sample_n() since dplyr 1.0.0. No seed is set,
# so the rows shown differ from run to run.
df %>%
  slice_sample(n = 10)

## # A tibble: 10 x 8
##      Num Fecha            Oficial PMS7003 PMSA003 HPMA115S0 SPS30 SNGCJA5
##    <dbl> <chr>              <dbl>   <dbl>   <dbl>     <dbl> <dbl>   <dbl>
##  1   623 10-12-2020 17:00    31.7  46     45.3         51   26.1   23.5  
##  2   849 20-12-2020 12:00     6     0.641  0.0469      23.6  1.17   0.219
##  3   683 12-12-2020 22:00     9.7  14.3   12.8         30.6  8.8    7.27 
##  4    75 16-11-2020 02:00    14.6  19.6   18.8         31.9 12.6   10.2  
##  5     4 13-11-2020 03:00     4     1.4    0.41        19.7  1.78   0.9  
##  6   476 02-12-2020 20:00    16.1  15.7   14           31.6  9.59   8.21 
##  7   775 16-12-2020 24:00     6     1.21   0.293       22.4  1.60   0.724
##  8   139 18-11-2020 20:00    15.4  13.3   10.4         27.5  7.76   6.47 
##  9    12 13-11-2020 11:00    16.5  10.8    8.72        25    6.58   4.64 
## 10   370 28-11-2020 10:00     6     2.4    1.1         23.9  2.62   1.5

# Interactive overlay of every PM2.5 series against the row index `Num`:
# one lines+markers trace per sensor column plus the `Oficial` column.
fig <- df %>%
  plot_ly(
    x = ~Num, y = ~PMS7003, name = "PM2.5 PMS7003",
    type = "scatter", mode = "lines+markers"
  ) %>%
  add_trace(y = ~PMSA003, name = "PM2.5 PMSA003", mode = "lines+markers") %>%
  add_trace(y = ~HPMA115S0, name = "PM2.5 HPMA115S0", mode = "lines+markers") %>%
  add_trace(y = ~SPS30, name = "PM2.5 SPS30", mode = "lines+markers") %>%
  add_trace(y = ~SNGCJA5, name = "PM2.5 SNGCJA5", mode = "lines+markers") %>%
  add_trace(y = ~Oficial, name = "PM2.5 Oficial", mode = "lines+markers")
# Print the assembled figure.
fig

#Caso 1: SNGCJA5 VS SPS30

df %>% select(SNGCJA5, SPS30) %>% skim()

Data summary
Name	Piped data
Number of rows	1113
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.67	9.75	0.00	1.53	4.80	9.75	236.87	▇▁▁▁▁
SPS30	0	1	7.33	6.66	0.08	1.97	5.53	10.70	51.50	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ SPS30)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.499

ggplot(df, aes(x = SNGCJA5, y = SPS30)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 SPS30",
       title = "Relationship between SNGCJA5 and SPS30") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(SPS30 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     5.06     0.21       24.1       0    4.65     5.47 
## 2 SNGCJA5       0.34     0.018      19.2       0    0.306    0.375

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,113 x 5
##       ID SPS30 SNGCJA5 SPS30_hat residual
##    <int> <dbl>   <dbl>     <dbl>    <dbl>
##  1     1  0.9     0         5.06   -4.16 
##  2     2  1.02    0.18      5.12   -4.10 
##  3     3  1.25    0.45      5.21   -3.96 
##  4     4  1.78    0.9       5.36   -3.58 
##  5     5  2.6     1.51      5.57   -2.97 
##  6     6  3       1.81      5.67   -2.67 
##  7     7  4.11    2.86      6.03   -1.92 
##  8     8  4.94    3.39      6.21   -1.27 
##  9     9  5.48    3.48      6.24   -0.763
## 10    10 11.7     8.23      7.86    3.84 
## # ... with 1,103 more rows

#Caso 2: SNGCJA5 VS HPMA115S0

df %>% select(SNGCJA5, HPMA115S0) %>% skim()

Data summary
Name	Piped data
Number of rows	1113
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.67	9.75	0.0	1.53	4.8	9.75	236.87	▇▁▁▁▁
HPMA115S0	0	1	28.49	7.27	18.6	23.10	26.4	31.10	77.70	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.502

ggplot(df, aes(x = SNGCJA5, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 HPMA115S0",
       title = "Relationship between SNGCJA5 and HPMA115S0") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(HPMA115S0 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   26.0       0.229     114.        0   25.5     26.4  
## 2 SNGCJA5      0.374     0.019      19.3       0    0.336    0.412

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,113 x 5
##       ID HPMA115S0 SNGCJA5 HPMA115S0_hat residual
##    <int>     <dbl>   <dbl>         <dbl>    <dbl>
##  1     1      18.6    0             26.0   -7.40 
##  2     2      19.1    0.18          26.1   -6.96 
##  3     3      19.4    0.45          26.2   -6.77 
##  4     4      19.7    0.9           26.3   -6.64 
##  5     5      20.4    1.51          26.6   -6.16 
##  6     6      20.6    1.81          26.7   -6.08 
##  7     7      22      2.86          27.1   -5.07 
##  8     8      22.8    3.39          27.3   -4.47 
##  9     9      23.7    3.48          27.3   -3.6  
## 10    10      28.6    8.23          29.1   -0.478
## # ... with 1,103 more rows

#Caso 3: SNGCJA5 VS PMSA003

df %>% select(SNGCJA5, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	1113
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.67	9.75	0	1.53	4.8	9.75	236.87	▇▁▁▁▁
PMSA003	0	1	10.48	11.99	0	0.85	6.5	16.20	89.00	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.505

ggplot(df, aes(x = SNGCJA5, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 PMSA003",
       title = "Relationship between SNGCJA5 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    6.34      0.376      16.9       0    5.60     7.07 
## 2 SNGCJA5      0.621     0.032      19.5       0    0.559    0.684

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,113 x 5
##       ID PMSA003 SNGCJA5 PMSA003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1   0.017    0           6.34    -6.32
##  2     2   0        0.18        6.45    -6.45
##  3     3   1.61     0.45        6.62    -5.01
##  4     4   0.41     0.9         6.90    -6.49
##  5     5   1.72     1.51        7.28    -5.56
##  6     6   2.65     1.81        7.46    -4.81
##  7     7   4.04     2.86        8.11    -4.07
##  8     8   5.35     3.39        8.44    -3.09
##  9     9   6.25     3.48        8.50    -2.25
## 10    10  18.5      8.23       11.4      7.05
## # ... with 1,103 more rows

#Caso 4: SNGCJA5 VS PMS7003

df %>% select(SNGCJA5, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	1113
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.67	9.75	0	1.53	4.8	9.75	236.87	▇▁▁▁▁
PMS7003	0	1	11.62	11.87	0	1.97	8.4	17.90	91.80	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.499

ggplot(df, aes(x = SNGCJA5, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 PMS7003",
       title = "Relationship between SNGCJA5 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    7.57      0.374      20.3       0    6.84     8.31 
## 2 SNGCJA5      0.607     0.032      19.2       0    0.545    0.669

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,113 x 5
##       ID PMS7003 SNGCJA5 PMS7003_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1   0        0           7.57    -7.57
##  2     2   0.290    0.18        7.68    -7.39
##  3     3   0.71     0.45        7.85    -7.14
##  4     4   1.4      0.9         8.12    -6.72
##  5     5   3.12     1.51        8.49    -5.37
##  6     6   4.16     1.81        8.67    -4.51
##  7     7   6.13     2.86        9.31    -3.18
##  8     8   7.61     3.39        9.63    -2.02
##  9     9   8.4      3.48        9.69    -1.29
## 10    10  20.2      8.23       12.6      7.63
## # ... with 1,103 more rows

#Caso 5: SNGCJA5 VS Oficial

df %>% select(SNGCJA5, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	1113
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SNGCJA5	0	1	6.67	9.75	0	1.53	4.8	9.75	236.87	▇▁▁▁▁
Oficial	0	1	12.82	8.31	0	6.50	11.2	17.70	55.40	▇▆▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SNGCJA5 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.405

ggplot(df, aes(x = SNGCJA5, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SNGCJA5", y = "PM25 Oficial",
       title = "Relationship between SNGCJA5 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SNGCJA5, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   10.5       0.276      38.1       0    9.97    11.1  
## 2 SNGCJA5      0.345     0.023      14.8       0    0.299    0.391

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,113 x 5
##       ID Oficial SNGCJA5 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1     5.9    0           10.5    -4.62
##  2     2     8.1    0.18        10.6    -2.48
##  3     3     3.7    0.45        10.7    -6.97
##  4     4     4      0.9         10.8    -6.83
##  5     5     4.8    1.51        11.0    -6.24
##  6     6     6      1.81        11.1    -5.14
##  7     7     8.3    2.86        11.5    -3.20
##  8     8     9.8    3.39        11.7    -1.89
##  9     9    16.5    3.48        11.7     4.78
## 10    10     6.2    8.23        13.4    -7.16
## # ... with 1,103 more rows

#Caso 6: SPS30 VS HPMA115S0

df %>% select(SPS30, HPMA115S0) %>% skim()

Data summary
Name	Piped data
Number of rows	1113
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	7.33	6.66	0.08	1.97	5.53	10.7	51.5	▇▂▁▁▁
HPMA115S0	0	1	28.49	7.27	18.60	23.10	26.40	31.1	77.7	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ HPMA115S0)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.983

ggplot(df, aes(x = SPS30, y = HPMA115S0)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 HPMA115S0",
       title = "Relationship between SPS30 and HPMA115S0") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(HPMA115S0 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    20.6      0.059      347.       0    20.5     20.7 
## 2 SPS30         1.07     0.006      179.       0     1.06     1.08

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,113 x 5
##       ID HPMA115S0 SPS30 HPMA115S0_hat residual
##    <int>     <dbl> <dbl>         <dbl>    <dbl>
##  1     1      18.6  0.9           21.6    -2.99
##  2     2      19.1  1.02          21.7    -2.62
##  3     3      19.4  1.25          22.0    -2.57
##  4     4      19.7  1.78          22.5    -2.84
##  5     5      20.4  2.6           23.4    -3.02
##  6     6      20.6  3             23.8    -3.25
##  7     7      22    4.11          25.0    -3.04
##  8     8      22.8  4.94          25.9    -3.13
##  9     9      23.7  5.48          26.5    -2.81
## 10    10      28.6 11.7           33.2    -4.59
## # ... with 1,103 more rows

#Caso 7: SPS30 VS PMSA003

df %>% select(SPS30, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	1113
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	7.33	6.66	0.08	1.97	5.53	10.7	51.5	▇▂▁▁▁
PMSA003	0	1	10.48	11.99	0.00	0.85	6.50	16.2	89.0	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.996

ggplot(df, aes(x = SPS30, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 PMSA003",
       title = "Relationship between SPS30 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -2.66     0.048     -55.1       0    -2.75    -2.56
## 2 SPS30         1.79     0.005     368.        0     1.78     1.80

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,113 x 5
##       ID PMSA003 SPS30 PMSA003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1   0.017  0.9       -1.04     1.06 
##  2     2   0      1.02      -0.828    0.828
##  3     3   1.61   1.25      -0.416    2.03 
##  4     4   0.41   1.78       0.534   -0.124
##  5     5   1.72   2.6        2.00    -0.284
##  6     6   2.65   3          2.72    -0.071
##  7     7   4.04   4.11       4.71    -0.671
##  8     8   5.35   4.94       6.20    -0.849
##  9     9   6.25   5.48       7.17    -0.917
## 10    10  18.5   11.7       18.3      0.184
## # ... with 1,103 more rows

#Caso 8: SPS30 VS PMS7003

df %>% select(SPS30, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	1113
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	7.33	6.66	0.08	1.97	5.53	10.7	51.5	▇▂▁▁▁
PMS7003	0	1	11.62	11.87	0.00	1.97	8.40	17.9	91.8	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.996

ggplot(df, aes(x = SPS30, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 PMS7003",
       title = "Relationship between SPS30 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    -1.38     0.049     -28.0       0    -1.48    -1.29
## 2 SPS30         1.78     0.005     356.        0     1.76     1.78

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,113 x 5
##       ID PMS7003 SPS30 PMS7003_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1   0      0.9        0.214   -0.214
##  2     2   0.290  1.02       0.427   -0.137
##  3     3   0.71   1.25       0.835   -0.125
##  4     4   1.4    1.78       1.78    -0.376
##  5     5   3.12   2.6        3.23    -0.111
##  6     6   4.16   3          3.94     0.219
##  7     7   6.13   4.11       5.91     0.219
##  8     8   7.61   4.94       7.38     0.225
##  9     9   8.4    5.48       8.34     0.057
## 10    10  20.2   11.7       19.4      0.817
## # ... with 1,103 more rows

#Caso 9: SPS30 VS Oficial

df %>% select(SPS30, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	1113
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
SPS30	0	1	7.33	6.66	0.08	1.97	5.53	10.7	51.5	▇▂▁▁▁
Oficial	0	1	12.82	8.31	0.00	6.50	11.20	17.7	55.4	▇▆▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = SPS30 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.807

ggplot(df, aes(x = SPS30, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 SPS30", y = "PM25 Oficial",
       title = "Relationship between SPS30 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ SPS30, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept     5.44     0.219      24.8       0    5.01      5.87
## 2 SPS30         1.01     0.022      45.5       0    0.964     1.05

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,113 x 5
##       ID Oficial SPS30 Oficial_hat residual
##    <int>   <dbl> <dbl>       <dbl>    <dbl>
##  1     1     5.9  0.9         6.34   -0.445
##  2     2     8.1  1.02        6.47    1.63 
##  3     3     3.7  1.25        6.70   -3.00 
##  4     4     4    1.78        7.23   -3.23 
##  5     5     4.8  2.6         8.06   -3.26 
##  6     6     6    3           8.46   -2.46 
##  7     7     8.3  4.11        9.58   -1.28 
##  8     8     9.8  4.94       10.4    -0.613
##  9     9    16.5  5.48       11.0     5.54 
## 10    10     6.2 11.7        17.2   -11.0  
## # ... with 1,103 more rows

#Caso 10: HPMA115S0 VS PMSA003

df %>% select(HPMA115S0, PMSA003) %>% skim()

Data summary
Name	Piped data
Number of rows	1113
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	28.49	7.27	18.6	23.10	26.4	31.1	77.7	▇▂▁▁▁
PMSA003	0	1	10.48	11.99	0.0	0.85	6.5	16.2	89.0	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ PMSA003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.984

ggplot(df, aes(x = HPMA115S0, y = PMSA003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 PMSA003",
       title = "Relationship between HPMA115S0 and PMSA003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMSA003 ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -35.7      0.256     -139.       0   -36.2    -35.2 
## 2 HPMA115S0     1.62     0.009      186.       0     1.60     1.64

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,113 x 5
##       ID PMSA003 HPMA115S0 PMSA003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1   0.017      18.6      -5.57      5.59
##  2     2   0          19.1      -4.76      4.76
##  3     3   1.61       19.4      -4.27      5.88
##  4     4   0.41       19.7      -3.78      4.19
##  5     5   1.72       20.4      -2.65      4.37
##  6     6   2.65       20.6      -2.32      4.97
##  7     7   4.04       22        -0.053     4.09
##  8     8   5.35       22.8       1.24      4.11
##  9     9   6.25       23.7       2.70      3.55
## 10    10  18.5        28.6      10.7       7.85
## # ... with 1,103 more rows

#Caso 11: HPMA115S0 VS PMS7003

df %>% select(HPMA115S0, PMS7003) %>% skim()

Data summary
Name	Piped data
Number of rows	1113
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	28.49	7.27	18.6	23.10	26.4	31.1	77.7	▇▂▁▁▁
PMS7003	0	1	11.62	11.87	0.0	1.97	8.4	17.9	91.8	▇▂▁▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ PMS7003)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.978

ggplot(df, aes(x = HPMA115S0, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 PMS7003",
       title = "Relationship between HPMA115S0 and PMS7003") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(PMS7003 ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept   -33.9      0.298     -114.       0   -34.5    -33.3 
## 2 HPMA115S0     1.60     0.01       158.       0     1.58     1.62

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,113 x 5
##       ID PMS7003 HPMA115S0 PMS7003_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1   0          18.6      -4.18      4.18
##  2     2   0.290      19.1      -3.38      3.67
##  3     3   0.71       19.4      -2.90      3.61
##  4     4   1.4        19.7      -2.42      3.82
##  5     5   3.12       20.4      -1.30      4.42
##  6     6   4.16       20.6      -0.981     5.14
##  7     7   6.13       22         1.25      4.88
##  8     8   7.61       22.8       2.53      5.08
##  9     9   8.4        23.7       3.97      4.43
## 10    10  20.2        28.6      11.8       8.41
## # ... with 1,103 more rows

#Caso 12: HPMA115S0 VS Oficial

df %>% select(HPMA115S0, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	1113
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
HPMA115S0	0	1	28.49	7.27	18.6	23.1	26.4	31.1	77.7	▇▂▁▁▁
Oficial	0	1	12.82	8.31	0.0	6.5	11.2	17.7	55.4	▇▆▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = HPMA115S0 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.809

ggplot(df, aes(x = HPMA115S0, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 HPMA115S0", y = "PM25 Oficial",
       title = "Relationship between HPMA115S0 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ HPMA115S0, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept  -13.5       0.593     -22.8       0  -14.7    -12.3  
## 2 HPMA115S0    0.924     0.02       45.8       0    0.884    0.963

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,113 x 5
##       ID Oficial HPMA115S0 Oficial_hat residual
##    <int>   <dbl>     <dbl>       <dbl>    <dbl>
##  1     1     5.9      18.6        3.68    2.22 
##  2     2     8.1      19.1        4.14    3.96 
##  3     3     3.7      19.4        4.42   -0.716
##  4     4     4        19.7        4.69   -0.693
##  5     5     4.8      20.4        5.34   -0.54 
##  6     6     6        20.6        5.52    0.475
##  7     7     8.3      22          6.82    1.48 
##  8     8     9.8      22.8        7.56    2.24 
##  9     9    16.5      23.7        8.39    8.11 
## 10    10     6.2      28.6       12.9    -6.72 
## # ... with 1,103 more rows

#Caso 13: PMSA003 VS PMS7003

# Summary statistics for the two sensor columns compared in this case.
df %>% select(PMSA003, PMS7003) %>% skim()

## NOTE: the output previously printed in this section came from
## PMSA003 vs Oficial code (a copy of the next case), not from
## PMSA003 vs PMS7003. Re-run the script to regenerate the results.

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ PMS7003)

# Scatter plot of PMS7003 against PMSA003 with a least-squares line.
ggplot(df, aes(x = PMSA003, y = PMS7003)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 PMS7003",
       title = "Relationship between PMSA003 and PMS7003") +
  geom_smooth(method = "lm", se = FALSE)

#**Fit regression model original:**
# Same convention as the other cases: the y-axis variable is the response.
score_model <- lm(PMS7003 ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

# Observed values, fitted values and residuals for every row.
regression_points <- get_regression_points(score_model)
regression_points

#Caso 14: PMSA003 VS Oficial

df %>% select(PMSA003, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	1113
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMSA003	0	1	10.48	11.99	0	0.85	6.5	16.2	89.0	▇▂▁▁▁
Oficial	0	1	12.82	8.31	0	6.50	11.2	17.7	55.4	▇▆▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMSA003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.806

ggplot(df, aes(x = PMSA003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMSA003", y = "PM25 Oficial",
       title = "Relationship between PMSA003 and Oficial") +  
  
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMSA003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    6.96      0.196      35.5       0    6.58     7.35 
## 2 PMSA003      0.559     0.012      45.3       0    0.534    0.583

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,113 x 5
##       ID Oficial PMSA003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1     5.9   0.017        6.97   -1.07 
##  2     2     8.1   0            6.96    1.14 
##  3     3     3.7   1.61         7.86   -4.16 
##  4     4     4     0.41         7.19   -3.19 
##  5     5     4.8   1.72         7.92   -3.12 
##  6     6     6     2.65         8.44   -2.44 
##  7     7     8.3   4.04         9.22   -0.921
##  8     8     9.8   5.35         9.95   -0.152
##  9     9    16.5   6.25        10.5     6.04 
## 10    10     6.2  18.5         17.3   -11.1  
## # ... with 1,103 more rows

#Caso 15: PMS7003 VS Oficial

df %>% select(PMS7003, Oficial) %>% skim()

Data summary
Name	Piped data
Number of rows	1113
Number of columns	2
_______________________
Column type frequency:
numeric	2
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
PMS7003	0	1	11.62	11.87	0	1.97	8.4	17.9	91.8	▇▂▁▁▁
Oficial	0	1	12.82	8.31	0	6.50	11.2	17.7	55.4	▇▆▂▁▁

#**Pearson correlation coefficient original**

df %>% 
  get_correlation(formula = PMS7003 ~ Oficial)

## # A tibble: 1 x 1
##     cor
##   <dbl>
## 1 0.811

# Scatter plot of the official PM2.5 series against PMS7003 with a
# least-squares line. The title previously named PMSA003 (copy-paste slip)
# although the plotted x variable is PMS7003.
ggplot(df, aes(x = PMS7003, y = Oficial)) +
  geom_point(alpha = 0.2) +
  labs(x = "PM25 PMS7003", y = "PM25 Oficial",
       title = "Relationship between PMS7003 and Oficial") +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

#**Fit regression model original:**
score_model <- lm(Oficial ~ PMS7003, data = df)
#**Get regression table original:**
get_regression_table(score_model)

## # A tibble: 2 x 7
##   term      estimate std_error statistic p_value lower_ci upper_ci
##   <chr>        <dbl>     <dbl>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 intercept    6.22      0.204      30.5       0    5.82     6.62 
## 2 PMS7003      0.567     0.012      46.1       0    0.543    0.592

regression_points <- get_regression_points(score_model)
regression_points

## # A tibble: 1,113 x 5
##       ID Oficial PMS7003 Oficial_hat residual
##    <int>   <dbl>   <dbl>       <dbl>    <dbl>
##  1     1     5.9   0            6.22   -0.321
##  2     2     8.1   0.290        6.39    1.71 
##  3     3     3.7   0.71         6.62   -2.92 
##  4     4     4     1.4          7.02   -3.02 
##  5     5     4.8   3.12         7.99   -3.19 
##  6     6     6     4.16         8.58   -2.58 
##  7     7     8.3   6.13         9.7    -1.4  
##  8     8     9.8   7.61        10.5    -0.74 
##  9     9    16.5   8.4         11.0     5.51 
## 10    10     6.2  20.2         17.7   -11.5  
## # ... with 1,103 more rows

Ferias-Canairios.R

DBB

2021-02-24