#**Linear regression CanAirIO (Humedal La Vaca) vs Oficial Kennedy**
#**12feb2020 7april2020 resolution 1hour**
library(readxl)
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(moderndive)
library(skimr)
library(gapminder)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
EstacionOficialVSEstacionCanAirIO <- read_excel("Estacion Oficial VS Estacion CanAirIO.xlsx")
## New names:
## * `` -> ...1
dfOri <- na.omit(EstacionOficialVSEstacionCanAirIO)
df <- dfOri %>%
select(date, ken, cana)
glimpse(df)
## Rows: 5,336
## Columns: 3
## $ date <chr> "30-10-2019 24:00", "31-10-2019 01:00", "31-10-2019 02:00", "3...
## $ ken <dbl> 48, 48, 25, 21, 24, 23, 35, 50, 39, 66, 86, 68, 27, 21, 16, 14...
## $ cana <dbl> 16.787234, 16.981132, 16.533333, 16.976190, 14.725490, 37.2352...
df %>%
sample_n(size = 10)
## # A tibble: 10 x 3
## date ken cana
## <chr> <dbl> <dbl>
## 1 18-03-2020 10:00 65 84.7
## 2 05-06-2020 22:00 7 6.47
## 3 26-05-2020 14:00 10 3.33
## 4 06-03-2020 19:00 44 65.7
## 5 01-04-2020 05:00 20 23
## 6 26-12-2019 14:00 16 2.59
## 7 11-11-2019 01:00 16 5.39
## 8 12-02-2020 09:00 95 8.98
## 9 15-02-2020 21:00 22 15.3
## 10 25-04-2020 02:00 15 26.1
ggplot(df, aes(x=date)) +
geom_line(aes(y = cana), group = 1, color = "red") +
geom_line(aes(y = ken), group = 2, color = "blue")

#**CanAirIO Sensor boxplot and outliers**
impute_outliersP <- function(x, removeNA = TRUE){
quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
x[x<quantiles[1]] <- NA
x[x>quantiles[2]] <- NA
x
}
imputed_dataP <- impute_outliersP(df$cana)
par(mfrow = c(1,2))
boxplot(df$cana, main = "PM2.5 with outliers CanAirIO",
col = 3)
boxplot(imputed_dataP, main = "PPM2.5 without outliers CanAirIO",col=2)

#**Kennedy Sensor boxplot and outliers**
impute_outliersH <- function(x, removeNA = TRUE){
quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
x[x<quantiles[1]] <- NA
x[x>quantiles[2]] <- NA
x
}
imputed_dataH <- impute_outliersH(df$ken)
par(mfrow = c(1,2))
boxplot(df$ken, main = "PM2.5 with outliers Kennedy",
col = 3)
boxplot(imputed_dataH, main = "PPM2.5 without outliers Kennedy",col=2)

df[c("kenH")] <- imputed_dataH
df[c("canaH")] <- imputed_dataP
dfT <- df %>%
select(date, kenH, canaH)
dfC <- na.omit(dfT)
#**Original data**
df %>% select(ken, cana) %>% skim()
Data summary
| Name |
Piped data |
| Number of rows |
5336 |
| Number of columns |
2 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
2 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| ken |
0 |
1 |
24.46 |
15.19 |
0.00 |
13.00 |
22.00 |
32.00 |
127.0 |
▇▅▁▁▁ |
| cana |
0 |
1 |
22.81 |
18.27 |
0.24 |
9.34 |
18.25 |
31.14 |
233.8 |
▇▁▁▁▁ |
#**Data without outliers**
dfC %>% select(kenH, canaH) %>% skim()
Data summary
| Name |
Piped data |
| Number of rows |
4470 |
| Number of columns |
2 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
2 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| kenH |
0 |
1 |
23.18 |
11.29 |
5.00 |
15.00 |
22.0 |
31.00 |
53.00 |
▆▇▅▃▂ |
| canaH |
0 |
1 |
21.06 |
12.92 |
3.73 |
10.44 |
18.5 |
29.33 |
57.55 |
▇▆▃▂▁ |
#**Pearson correlation coefficient original**
df %>%
get_correlation(formula = ken ~ cana)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.585
#**Pearson correlation coefficient without outliers**
dfC %>%
get_correlation(formula = kenH ~ canaH)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.549
ggplot(df, aes(x = cana, y = ken)) +
geom_point(alpha = 0.2) +
labs(x = "CanAirIO", y = "Kennedy",
title = "Relationship between Kennedy and CanAirIO scores") +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

ggplot(df, aes(x = imputed_dataP, y = imputed_dataH)) +
geom_point(alpha = 0.2) +
labs(x = "CanAirIO", y = "Kennedy",
title = "Relationship between Kennedy and CanAirIO scores") +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 866 rows containing non-finite values (stat_smooth).
## Warning: Removed 866 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(ken ~ cana, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 13.4 0.27 49.5 0 12.8 13.9
## 2 cana 0.487 0.009 52.7 0 0.469 0.505
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 5,336 x 5
## ID ken cana ken_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 48 16.8 21.5 26.5
## 2 2 48 17.0 21.6 26.4
## 3 3 25 16.5 21.4 3.59
## 4 4 21 17.0 21.6 -0.621
## 5 5 24 14.7 20.5 3.47
## 6 6 23 37.2 31.5 -8.48
## 7 7 35 25.1 25.6 9.44
## 8 8 50 24.5 25.3 24.7
## 9 9 39 33.9 29.9 9.12
## 10 10 66 24.6 25.3 40.7
## # ... with 5,326 more rows
#**Fit regression model withput outliers:**
score_modelC <- lm(kenH ~ canaH, data = dfC)
#**Get regression table withput outliers:**
get_regression_table(score_modelC)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 13.1 0.27 48.4 0 12.6 13.6
## 2 canaH 0.48 0.011 43.9 0 0.458 0.501
regression_pointsC <- get_regression_points(score_modelC)
regression_pointsC
## # A tibble: 4,470 x 5
## ID kenH canaH kenH_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 48 16.8 21.1 26.9
## 2 2 48 17.0 21.2 26.8
## 3 3 25 16.5 21.0 3.99
## 4 4 21 17.0 21.2 -0.225
## 5 5 24 14.7 20.1 3.86
## 6 6 23 37.2 30.9 -7.94
## 7 7 35 25.1 25.1 9.90
## 8 8 50 24.5 24.8 25.2
## 9 9 39 33.9 29.4 9.64
## 10 10 27 8.51 17.2 9.84
## # ... with 4,460 more rows
Vken = mutate(df, VALken = 13.4 + (0.487 * cana))
Vrest = mutate(Vken, Valori = ken )
Vfin = mutate (Vrest, VALrest = VALken - ken )
View(Vfin)
#**Conclusion:**
#**Better model original with outliers**
#**Linear regression: y = b0 + b1 * x => VALUEkennedy = 13.4 + 0.487 * VALUEcanairio**