#**Linear regression 4 models lcs: Plantower, Honeywell, Nova, Panasonic**
#**12feb2020 7april2020 resolution 1hour**
library(readxl)
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(moderndive)
library(skimr)
library(gapminder)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(readr)
grafana_data_export_4_sensors_1hour <- read_csv("grafana_data_export 4 sensors 1hour.csv", col_types = cols(date = col_datetime(format = "%Y-%m-%d %H:%M:%S")))
View(grafana_data_export_4_sensors_1hour)
df <- na.omit(grafana_data_export_4_sensors_1hour)
glimpse(df)
## Rows: 3,042
## Columns: 5
## $ date <dttm> 2020-02-25 00:00:00, 2020-02-25 01:00:00, 2020-02-25 02:00:0...
## $ plan <dbl> 20.12308, 24.93750, 20.89062, 19.41538, 24.27692, 22.27692, 3...
## $ honey <dbl> 16.51724, 21.60345, 17.74138, 15.50000, 18.79310, 17.00000, 2...
## $ nova <dbl> 11.034483, 18.224138, 14.810345, 12.000000, 14.456140, 12.793...
## $ pana <dbl> 9.614035, 12.701754, 9.896552, 9.122807, 11.551724, 10.413793...
df %>%
sample_n(size = 10)
## # A tibble: 10 x 5
## date plan honey nova pana
## <dttm> <dbl> <dbl> <dbl> <dbl>
## 1 2020-03-03 09:00:00 30.2 24.4 16.8 14.1
## 2 2020-03-28 03:00:00 39.6 31.0 28.5 18.5
## 3 2020-05-01 16:00:00 0.185 1.55 0.714 0.05
## 4 2020-03-29 10:00:00 52.4 39.6 30.9 23.3
## 5 2020-05-24 21:00:00 1.25 3.52 1.47 0.456
## 6 2020-02-29 17:00:00 24.9 19.5 13.4 10.8
## 7 2020-07-03 04:00:00 6.31 7.69 3.54 3.21
## 8 2020-03-31 12:00:00 9.15 9.75 5.79 4.25
## 9 2020-03-24 16:00:00 43.8 34.6 31.6 20.5
## 10 2020-03-23 19:00:00 47.3 33.1 24.1 20.4
ggplot(df, aes(x=date)) +
geom_line(aes(y = plan), group = 1, color = "red") +
geom_line(aes(y = honey), group = 2, color = "blue") +
geom_line(aes(y = nova), group = 3, color = "green") +
geom_line(aes(y = pana), group = 4, color = "yellow")

#**Plantower Sensor boxplot and outliers**
impute_outliersPlan <- function(x, removeNA = TRUE){
quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
x[x<quantiles[1]] <- NA
x[x>quantiles[2]] <- NA
x
}
imputed_dataPlan <- impute_outliersPlan(df$plan)
par(mfrow = c(1,2))
boxplot(df$plan, main = "PM2.5 with outliers Plantower", col = 3)
boxplot(imputed_dataPlan, main = "PPM2.5 without outliers Plantower",col=2)

#**Honeywell Sensor boxplot and outliers**
impute_outliersHoney <- function(x, removeNA = TRUE){
quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
x[x<quantiles[1]] <- NA
x[x>quantiles[2]] <- NA
x
}
imputed_dataHoney <- impute_outliersHoney(df$honey)
par(mfrow = c(1,2))
boxplot(df$honey, main = "PM2.5 with outliers Honey",
col = 3)
boxplot(imputed_dataHoney, main = "PPM2.5 without outliers Honey",col=2)

#**Nova Sensor boxplot and outliers**
impute_outliersNova <- function(x, removeNA = TRUE){
quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
x[x<quantiles[1]] <- NA
x[x>quantiles[2]] <- NA
x
}
imputed_dataNova <- impute_outliersNova(df$nova)
par(mfrow = c(1,2))
boxplot(df$nova, main = "PM2.5 with outliers Nova",
col = 3)
boxplot(imputed_dataNova, main = "PPM2.5 without outliers Nova",col=2)

#**Panasonic Sensor boxplot and outliers**
impute_outliersPana <- function(x, removeNA = TRUE){
quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
x[x<quantiles[1]] <- NA
x[x>quantiles[2]] <- NA
x
}
imputed_dataPana <- impute_outliersPana(df$pana)
par(mfrow = c(1,2))
boxplot(df$pana, main = "PM2.5 with outliers Panasonic",
col = 3)
boxplot(imputed_dataPana, main = "PPM2.5 without outliers Panasonic",col=2)

df[c("plantower")] <- imputed_dataPlan
df[c("honeywell")] <- imputed_dataHoney
df[c("novasen")] <- imputed_dataNova
df[c("panasonic")] <- imputed_dataPana
dfT <- df %>%
select(date, plantower, honeywell, novasen, panasonic)
View(dfT)
dfC <- na.omit(dfT)
#**Original data**
df %>% select(plan, honey,nova,pana) %>% skim()
Data summary
| Name |
Piped data |
| Number of rows |
3042 |
| Number of columns |
4 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
4 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| plan |
0 |
1 |
15.17 |
17.24 |
0 |
1.67 |
7.85 |
23.82 |
91.33 |
▇▂▁▁▁ |
| honey |
0 |
1 |
13.30 |
12.13 |
0 |
3.81 |
8.91 |
19.49 |
65.07 |
▇▃▂▁▁ |
| nova |
0 |
1 |
9.42 |
10.38 |
0 |
1.67 |
5.06 |
13.81 |
55.61 |
▇▂▁▁▁ |
| pana |
0 |
1 |
6.92 |
7.87 |
0 |
0.76 |
3.71 |
10.73 |
41.26 |
▇▂▁▁▁ |
#**Data without outliers**
dfC %>% select(plantower, honeywell, novasen, panasonic) %>% skim()
Data summary
| Name |
Piped data |
| Number of rows |
2629 |
| Number of columns |
4 |
| _______________________ |
|
| Column type frequency: |
|
| numeric |
4 |
| ________________________ |
|
| Group variables |
None |
Variable type: numeric
| plantower |
0 |
1 |
13.36 |
13.31 |
0.08 |
2.22 |
8.14 |
21.68 |
51.81 |
▇▂▂▂▁ |
| honeywell |
0 |
1 |
12.13 |
9.30 |
1.55 |
4.33 |
9.02 |
17.80 |
38.78 |
▇▃▂▂▁ |
| novasen |
0 |
1 |
8.26 |
7.73 |
0.47 |
2.00 |
5.25 |
12.59 |
32.26 |
▇▃▂▁▁ |
| panasonic |
0 |
1 |
6.07 |
6.02 |
0.00 |
1.05 |
3.83 |
9.81 |
23.52 |
▇▂▂▂▁ |
#**Pearson correlation coefficient original**
df %>%
get_correlation(formula = plan ~ honey)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.987
df %>%
get_correlation(formula = plan ~ nova)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.988
df %>%
get_correlation(formula = plan ~ pana)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.993
df %>%
get_correlation(formula = honey ~ nova)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.985
df %>%
get_correlation(formula = honey ~ pana)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.985
df %>%
get_correlation(formula = nova ~ pana)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.985
#**Pearson correlation coefficient without outliers**
dfC %>%
get_correlation(formula = plantower ~ honeywell)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.980
dfC %>%
get_correlation(formula = plantower ~ novasen)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.983
dfC %>%
get_correlation(formula = plantower ~ panasonic)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.989
dfC %>%
get_correlation(formula = honeywell ~ novasen)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.978
dfC %>%
get_correlation(formula = honeywell ~ panasonic)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.976
dfC %>%
get_correlation(formula = novasen ~ panasonic)
## # A tibble: 1 x 1
## cor
## <dbl>
## 1 0.979
ggplot(df, aes(x = plan , y = honey)) +
geom_point(alpha = 0.2) +
labs(x = "Plantower", y = "Honeywell",
title = "Relationship between Plantower and Honeywell scores") +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

ggplot(df, aes(x = imputed_dataPlan, y = imputed_dataHoney)) +
geom_point(alpha = 0.2) +
labs(x = "Plantower", y = "Honeywell",
title = "Relationship between Plantower and Honeywell scores") +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 370 rows containing non-finite values (stat_smooth).
## Warning: Removed 370 rows containing missing values (geom_point).

#**Fit regression model original:**
score_model <- lm(honey ~ plan, data = df)
#**Get regression table original:**
get_regression_table(score_model)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 2.76 0.046 59.8 0 2.67 2.85
## 2 plan 0.695 0.002 345. 0 0.691 0.699
regression_points <- get_regression_points(score_model)
regression_points
## # A tibble: 3,042 x 5
## ID honey plan honey_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 16.5 20.1 16.7 -0.224
## 2 2 21.6 24.9 20.1 1.52
## 3 3 17.7 20.9 17.3 0.467
## 4 4 15.5 19.4 16.2 -0.749
## 5 5 18.8 24.3 19.6 -0.833
## 6 6 17 22.3 18.2 -1.24
## 7 7 22.5 31.6 24.7 -2.20
## 8 8 17.9 24.1 19.5 -1.56
## 9 9 18.0 21.7 17.8 0.213
## 10 10 19.6 24.4 19.7 -0.122
## # ... with 3,032 more rows
#**Fit regression model withput outliers:**
score_modelC <- lm(honeywell ~ plantower, data = dfC)
#**Get regression table withput outliers:**
get_regression_table(score_modelC)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 2.98 0.051 58.1 0 2.88 3.08
## 2 plantower 0.685 0.003 252. 0 0.68 0.69
regression_pointsC <- get_regression_points(score_modelC)
regression_pointsC
## # A tibble: 2,629 x 5
## ID honeywell plantower honeywell_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 16.5 20.1 16.8 -0.243
## 2 2 21.6 24.9 20.1 1.55
## 3 3 17.7 20.9 17.3 0.456
## 4 4 15.5 19.4 16.3 -0.775
## 5 5 18.8 24.3 19.6 -0.812
## 6 6 17 22.3 18.2 -1.24
## 7 7 22.5 31.6 24.6 -2.11
## 8 8 17.9 24.1 19.5 -1.54
## 9 9 18.0 21.7 17.8 0.21
## 10 10 19.6 24.4 19.7 -0.099
## # ... with 2,619 more rows
#Vken = mutate(df, VALken = 13.4 + (0.487 * cana))
#Vrest = mutate(Vken, Valori = ken )
#Vfin = mutate (Vrest, VALrest = VALken - ken )
#View(Vfin)
#**Conclusion:**
#**Better model original with outliers**
#**Linear regression: y = b0 + b1 * x => VALUEkennedy = 13.4 + 0.487 * VALUEcanairio**