#**Linear regression SML complete Panasonic vs Honeywell**
#**12feb2020 7april2020 resolution 1hour**
library(readxl)
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(moderndive)
library(skimr)
library(gapminder)
Honeywel_PanasonicSML_1hour_nonull <- read_excel("Honeywel&PanasonicSML_1hour_nonull.xlsx")
## New names:
## * `` -> ...6
df <- Honeywel_PanasonicSML_1hour_nonull %>%
select(date, honey, pana)
glimpse(df)
## Rows: 3,273
## Columns: 3
## $ date <chr> "2/12/2020 12:00:00.000000000 AM", "2/12/2020 1:00:00.0000000...
## $ honey <dbl> 21.568966, 18.137931, 16.500000, 16.325000, 18.847458, 29.719...
## $ pana <dbl> 14.350877, 12.413793, 11.137931, 11.153846, 13.051724, 21.708...
df %>%
sample_n(size = 10)
## # A tibble: 10 x 3
## date honey pana
## <chr> <dbl> <dbl>
## 1 6/11/2020 12:00:00.000000000 PM 4.22 1.04
## 2 5/15/2020 2:00:00.000000000 PM 2.35 0.193
## 3 3/21/2020 5:00:00.000000000 AM 40.5 26.1
## 4 3/30/2020 7:00:00.000000000 AM 65.1 41.3
## 5 3/16/2020 12:00:00.000000000 AM 18.4 11.3
## 6 2/22/2020 3:00:00.000000000 PM 20.4 13.5
## 7 6/28/2020 2:00:00.000000000 AM 4.11 0.789
## 8 3/6/2020 7:00:00.000000000 AM 48.7 32.1
## 9 3/9/2020 4:00:00.000000000 PM 23.8 13.1
## 10 3/27/2020 4:00:00.000000000 PM 39.8 22.6
ggplot(df, aes(x=date)) +
geom_line(aes(y = pana), group = 1, color = "red") +
geom_line(aes(y = honey), group = 2, color = "blue")

#**Panasonic Sensor boxplot and outliers**
impute_outliersP <- function(x, removeNA = TRUE){
quantiles <- quantile(x, c(0.25, 0.75), na.rm = removeNA)
x[x<quantiles[1]] <- NA
x[x>quantiles[2]] <- NA
x
}
imputed_dataP <- impute_outliersP(df$pana)
par(mfrow = c(1,2))
boxplot(df$pana, main = "PM2.5 with outliers Panasonic",
col = 3)
boxplot(imputed_dataP, main = "PPM2.5 without outliers Panasonic",col=2)

#**Honeywell Sensor boxplot and outliers**
impute_outliersH <- function(x, removeNA = TRUE){
quantiles <- quantile(x, c(0.25, 0.75), na.rm = removeNA)
x[x<quantiles[1]] <- NA
x[x>quantiles[2]] <- NA
x
}
imputed_dataH <- impute_outliersH(df$honey)
par(mfrow = c(1,2))
boxplot(df$honey, main = "PM2.5 with outliers Honeywell",
col = 3)
boxplot(imputed_dataH, main = "PPM2.5 without outliers Honeywell",col=2)

df[c("honeyH")] <- imputed_dataH
df[c("panaH")] <- imputed_dataP
View(df)
dfT <- df %>%
select(date, honeyH, panaH)
dfC <- na.omit(dfT)
#**Original data**
df %>% select(honey, pana) %>% skim()
Data summary
| Name |
Piped data |
| Number of rows |
3273 |
| Number of columns |
2 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
2 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| honey |
0 |
1 |
13.4 |
11.91 |
0 |
4.12 |
9.17 |
19.47 |
65.07 |
▇▃▂▁▁ |
| pana |
0 |
1 |
7.1 |
7.77 |
0 |
0.94 |
4.17 |
10.98 |
41.26 |
▇▂▁▁▁ |
#**Data without outliers**
dfC %>% select(honeyH, panaH) %>% skim()
Data summary
| Name |
Piped data |
| Number of rows |
1513 |
| Number of columns |
2 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
2 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| honeyH |
0 |
1 |
10.14 |
4.24 |
4.12 |
6.38 |
9.31 |
13.60 |
19.47 |
▇▆▅▃▂ |
| panaH |
0 |
1 |
4.91 |
2.80 |
0.95 |
2.48 |
4.34 |
7.16 |
10.98 |
▇▆▅▃▃ |
#**Pearson correlation coefficient original**
df %>%
get_correlation(formula = honey ~ pana)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.986
#**Pearson correlation coefficient without outliers**
dfC %>%
get_correlation(formula = honeyH ~ panaH)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.949
ggplot(df, aes(x = pana, y = honey)) +
geom_point(alpha = 0.2) +
labs(x = "Panasonic", y = "Honeywell",
title = "Relationship between Honeywell and Panasonic scores") +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

ggplot(df, aes(x = imputed_dataP, y = imputed_dataH)) +
geom_point(alpha = 0.2) +
labs(x = "Panasonic", y = "Honeywell",
title = "Relationship between Honeywell and Panasonic scores") +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1760 rows containing non-finite values (stat_smooth).
## Warning: Removed 1760 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(honey ~ pana, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 2.68 0.048 56.0 0 2.59 2.78
## 2 pana 1.51 0.005 332. 0 1.50 1.52
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 3,273 x 5
## ID honey pana honey_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 21.6 14.4 24.3 -2.78
## 2 2 18.1 12.4 21.4 -3.29
## 3 3 16.5 11.1 19.5 -3.00
## 4 4 16.3 11.2 19.5 -3.20
## 5 5 18.8 13.1 22.4 -3.54
## 6 6 29.7 21.7 35.5 -5.74
## 7 7 42.5 31.5 50.2 -7.72
## 8 8 37.2 27.9 44.8 -7.65
## 9 9 45.7 34.6 54.9 -9.19
## 10 10 50.6 35.6 56.4 -5.88
## # ... with 3,263 more rows
#**Fit regression model withput outliers:**
score_modelC <- lm(honeyH ~ panaH, data = dfC)
#**Get regression table withput outliers:**
get_regression_table(score_modelC)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 3.07 0.069 44.3 0 2.94 3.21
## 2 panaH 1.44 0.012 117. 0 1.42 1.46
regression_pointsC <- get_regression_points(score_modelC)
regression_pointsC
## # A tibble: 1,513 x 5
## ID honeyH panaH honeyH_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 7.93 4.79 9.97 -2.04
## 2 2 6.45 3.32 7.85 -1.40
## 3 3 5.33 2.48 6.65 -1.32
## 4 4 5.12 2.38 6.50 -1.38
## 5 5 7 3.66 8.33 -1.33
## 6 6 8.97 4.97 10.2 -1.25
## 7 7 10.2 5.91 11.6 -1.34
## 8 8 16.3 10.9 18.8 -2.58
## 9 9 14.5 9.84 17.2 -2.77
## 10 10 16.6 10.9 18.7 -2.12
## # ... with 1,503 more rows
View(df)
Vhoney = mutate(df, VALhoney = 2.68 + (1.51 * pana))
Vrest = mutate(Vhoney, Valori = honey )
Vfin = mutate (Vrest, VALrest = VALhoney - honey )
View(Vfin)
#**Conclusion:**
#**Better model original with outliers**
#**Linear regression: y = b0 + b1 * x => VALhoney = 2.68 + 1.51 * Valpana**