# Load the "dplyr" package (install it first with install.packages("dplyr") if it is missing).
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# The pipe operator: %>% is a handy operator that allows for simpler and more natural-looking code.
# The pipe operator takes the output of the expression or function on its left-hand side and uses it as the first argument of the function on its right-hand side.
# This allows for chaining multiple operations together without the need for nested function calls or creating intermediate objects.
# Example: Suppose we want to clean a dataset by removing all observations that contain an NA. Use the dataset "airquality".
# Check dataset: open airquality in the data viewer.
# View() opens a viewer window, which is only useful in an interactive session,
# so it is skipped when the script runs non-interactively (e.g. via Rscript).
if (interactive()) {
  View(airquality)
}
# Conventional way: filter(airquality, !is.na(Ozone) & !is.na(Solar.R))
# We can also write this as:
# Notice that I don't need to specify the first argument in the "filter" function.
airquality %>%
  # Keep only the rows where both Ozone and Solar.R were recorded.
  filter(!is.na(Ozone), !is.na(Solar.R)) %>%
  # Group by month so that the next step works within each month.
  group_by(Month) %>%
  # Normalize the 'Ozone' variable. scale() returns a one-column matrix, which
  # is why the new column prints as `z_Ozone[,1]`.
  # How would the results change if 'Month' was not a group?
  mutate(z_Ozone = scale(Ozone))
## # A tibble: 111 × 7
## # Groups: Month [5]
## Ozone Solar.R Wind Temp Month Day z_Ozone[,1]
## <int> <int> <dbl> <int> <int> <int> <dbl>
## 1 41 190 7.4 67 5 1 0.737
## 2 36 118 8 72 5 2 0.519
## 3 12 149 12.6 74 5 3 -0.530
## 4 18 313 11.5 62 5 4 -0.268
## 5 23 299 8.6 65 5 7 -0.0492
## 6 19 99 13.8 59 5 8 -0.224
## 7 8 19 20.1 61 5 9 -0.705
## 8 16 256 9.7 69 5 12 -0.355
## 9 11 290 9.2 66 5 13 -0.573
## 10 14 274 10.9 68 5 14 -0.442
## # ℹ 101 more rows
# Let us add more operations in the filtered data
# Create a table with the mean and standard deviation of the observation variables by groups.
# Keep only the rows where both Ozone and Solar.R were recorded. Filtering once
# and reusing the result guarantees that every table below uses the same rows.
air_complete <- airquality %>%
  filter(!is.na(Ozone) & !is.na(Solar.R))

# Number of complete observations per month.
air_count <- air_complete %>%
  group_by(Month) %>%
  count()

# Mean and standard deviation of Ozone per month. The summary columns are left
# unnamed on purpose so that they keep the default names `mean(Ozone)` and
# `sd(Ozone)`.
air_transform <- air_complete %>%
  group_by(Month) %>%
  summarise(
    mean(Ozone),
    sd(Ozone)
  )

# Add the monthly mean, the monthly standard deviation and the resulting
# standardized z score of Ozone to every observation. A grouped mutate() pairs
# each row with the statistics of its own month, so the result does not depend
# on the rows being sorted by Month. as.data.frame() converts the tibble
# produced by group_by() back into a plain data frame.
mod_airquality <- air_complete %>%
  group_by(Month) %>%
  mutate(
    meanOzone = mean(Ozone),
    stdOzone = sd(Ozone)
  ) %>%
  ungroup() %>%
  mutate(stand_Ozone = (Ozone - meanOzone) / stdOzone) %>%
  as.data.frame()

# Vectors with the monthly mean and standard deviation of Ozone, one entry per
# row of mod_airquality.
meanOzone <- mod_airquality$meanOzone
stdOzone <- mod_airquality$stdOzone
#Visualizing correlations among the variables of interest. # Select variables of interest using the "select" function.
#Selecting only a subset of variables.
# Restrict the data to the three variables of interest, then drop every row in
# which Ozone or Solar.R is missing.
airquality_subset <- airquality %>%
  select(Ozone, Solar.R, Wind) %>%
  filter(!is.na(Ozone), !is.na(Solar.R))
# Scatterplot matrix: one panel for each pair of the selected variables.
pairs(airquality_subset)

# It seems that Ozone and Solar.R are the only pair of variables that are positively correlated.
# We can check the correlation by using the "cor" function.
# Correlation matrix of the selected variables.
airquality_subset %>%
  cor()
## Ozone Solar.R Wind
## Ozone 1.0000000 0.3483417 -0.6124966
## Solar.R 0.3483417 1.0000000 -0.1271835
## Wind -0.6124966 -0.1271835 1.0000000
# Let us confirm our observations by modelling a regression
# Simple linear regression of Ozone on Wind; print the model summary.
lm(Ozone ~ Wind, data = airquality_subset) %>%
  summary()
##
## Call:
## lm(formula = Ozone ~ Wind, data = airquality_subset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.513 -18.597 -5.035 15.814 88.437
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.0413 7.4724 13.25 < 2e-16 ***
## Wind -5.7288 0.7082 -8.09 9.09e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.42 on 109 degrees of freedom
## Multiple R-squared: 0.3752, Adjusted R-squared: 0.3694
## F-statistic: 65.44 on 1 and 109 DF, p-value: 9.089e-13
# We find confirmation of the negative correlation between Wind and Ozone. Let us now check by including Solar.R as an additional predictor.
# Linear regression of Ozone on both Wind and Solar.R; print the model summary.
lm(Ozone ~ Wind + Solar.R, data = airquality_subset) %>%
  summary()
##
## Call:
## lm(formula = Ozone ~ Wind + Solar.R, data = airquality_subset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.651 -18.164 -5.959 18.514 85.237
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77.24604 9.06751 8.519 1.05e-13 ***
## Wind -5.40180 0.67324 -8.024 1.34e-12 ***
## Solar.R 0.10035 0.02628 3.819 0.000224 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.92 on 108 degrees of freedom
## Multiple R-squared: 0.4495, Adjusted R-squared: 0.4393
## F-statistic: 44.09 on 2 and 108 DF, p-value: 1.003e-14
#What does the regression model tell you? What metric will you observe to comment on the efficacy of the model.
# Fit the two-predictor model once and reuse its summary below.
ozone_fit_summary <- summary(lm(Ozone ~ Wind + Solar.R, airquality_subset))
# Residual variance: the variance left *unexplained* by the model.
resid_var <- ozone_fit_summary$sigma^2
resid_var
## [1] 620.8586
# Variance of the dependent variable.
ozone_var <- var(airquality_subset$Ozone)
ozone_var
## [1] 1107.29
# Therefore, the proportion of variance *explained* by the model is given below.
# Both variances are corrected for their degrees of freedom, so this is the
# adjusted R squared.
(ozone_var - resid_var) / ozone_var
## [1] 0.439299
# Compare it with the R-squared values reported in the model summary above: it equals the adjusted R-squared (0.4393).
# Standardize Ozone over the whole dataset (no grouping by Month) and print the
# result. scale() returns a one-column matrix; as.vector() turns it back into a
# plain numeric column so the data frame does not carry a matrix column.
airquality %>%
  filter(!is.na(Ozone) & !is.na(Solar.R)) %>%
  mutate(Ozone = as.vector(scale(Ozone)))
## Ozone Solar.R Wind Temp Month Day
## 1 -0.03302982 190 7.4 67 5 1
## 2 -0.18328840 118 8.0 72 5 2
## 3 -0.90452961 149 12.6 74 5 3
## 4 -0.72421931 313 11.5 62 5 4
## 5 -0.57396073 299 8.6 65 5 7
## 6 -0.69416759 99 13.8 59 5 8
## 7 -1.02473648 19 20.1 61 5 9
## 8 -0.78432275 256 9.7 69 5 12
## 9 -0.93458133 290 9.2 66 5 13
## 10 -0.84442618 274 10.9 68 5 14
## 11 -0.72421931 65 13.2 58 5 15
## 12 -0.84442618 334 11.5 64 5 16
## 13 -0.24339184 307 12.0 66 5 17
## 14 -1.08483992 78 18.4 57 5 18
## 15 -0.36359870 322 11.5 68 5 19
## 16 -0.93458133 44 9.7 62 5 20
## 17 -1.23509850 8 9.7 59 5 21
## 18 -0.93458133 320 16.6 73 5 22
## 19 -1.14494335 25 9.7 61 5 23
## 20 -0.30349527 92 12.0 61 5 24
## 21 -0.57396073 13 12.0 67 5 28
## 22 0.08717705 252 14.9 81 5 29
## 23 2.19079726 223 5.7 79 5 30
## 24 -0.15323668 279 7.4 76 5 31
## 25 -0.39365042 127 9.7 82 6 7
## 26 0.86852170 291 13.8 90 6 9
## 27 -0.09313325 323 11.5 87 6 10
## 28 -0.57396073 148 8.0 82 6 13
## 29 -0.63406416 191 14.9 77 6 16
## 30 -0.15323668 284 20.7 72 6 17
## 31 -0.66411588 37 9.2 65 6 18
## 32 -0.90452961 120 11.5 73 6 19
## 33 -0.87447790 137 10.3 76 6 20
## 34 2.79183160 269 4.1 84 7 1
## 35 0.20738392 248 9.2 85 7 2
## 36 -0.30349527 236 9.2 81 7 3
## 37 0.65815968 175 4.6 83 7 5
## 38 -0.06308153 314 10.9 83 7 6
## 39 1.04883200 276 5.1 88 7 7
## 40 1.64986635 267 6.3 92 7 8
## 41 1.64986635 272 5.7 92 7 9
## 42 1.28924574 175 7.4 89 7 10
## 43 -0.96463305 264 14.3 73 7 12
## 44 -0.45375386 175 14.9 81 7 13
## 45 -1.05478820 48 14.3 80 7 15
## 46 0.17733221 260 6.9 81 7 16
## 47 -0.21334012 274 10.3 82 7 17
## 48 0.56800453 285 6.3 84 7 18
## 49 1.10893544 187 5.1 87 7 19
## 50 0.62810796 220 11.5 85 7 20
## 51 -0.78432275 7 6.9 74 7 21
## 52 1.13898716 294 8.6 86 7 24
## 53 1.98043524 223 8.0 85 7 25
## 54 -0.66411588 81 8.6 82 7 26
## 55 0.29753907 82 12.0 86 7 27
## 56 1.19909059 213 7.4 88 7 28
## 57 0.23743564 275 7.4 86 7 29
## 58 0.65815968 253 7.4 83 7 30
## 59 0.50790109 254 9.2 81 7 31
## 60 -0.09313325 83 6.9 81 8 1
## 61 -0.99468477 24 13.8 81 8 2
## 62 -0.78432275 77 7.4 82 8 3
## 63 2.40115928 255 4.0 89 8 7
## 64 1.40945261 229 10.3 90 8 8
## 65 2.04053867 207 8.0 90 8 9
## 66 0.05712534 192 11.5 86 8 12
## 67 -0.42370214 273 11.5 82 8 13
## 68 0.68821140 157 9.7 80 8 14
## 69 -0.60401244 71 10.3 77 8 16
## 70 0.50790109 51 6.3 79 8 17
## 71 -0.57396073 115 7.4 76 8 18
## 72 -0.33354699 244 10.9 78 8 19
## 73 0.05712534 190 10.3 78 8 20
## 74 -0.63406416 259 15.5 77 8 21
## 75 -0.99468477 36 14.3 72 8 22
## 76 0.08717705 212 9.7 79 8 24
## 77 3.78353827 238 3.4 81 8 25
## 78 0.92862514 215 8.0 86 8 26
## 79 1.01878029 203 9.7 97 8 28
## 80 2.28095241 225 2.3 94 8 29
## 81 1.25919402 237 6.3 96 8 30
## 82 1.28924574 188 6.3 94 8 31
## 83 1.61981463 167 6.9 91 9 1
## 84 1.07888372 197 5.1 92 9 2
## 85 0.92862514 183 2.8 93 9 3
## 86 1.46955605 189 4.6 93 9 4
## 87 0.14728049 95 7.4 87 9 5
## 88 -0.30349527 92 15.5 84 9 6
## 89 -0.66411588 252 10.9 80 9 7
## 90 -0.57396073 220 10.3 78 9 8
## 91 -0.63406416 230 10.9 75 9 9
## 92 -0.54390901 259 9.7 73 9 10
## 93 0.05712534 236 14.9 81 9 11
## 94 -0.63406416 259 15.5 76 9 12
## 95 -0.42370214 238 6.3 77 9 13
## 96 -0.99468477 24 10.9 71 9 14
## 97 -0.87447790 112 11.5 71 9 15
## 98 0.11722877 237 6.9 78 9 16
## 99 -0.72421931 224 13.8 67 9 17
## 100 -0.87447790 27 10.3 76 9 18
## 101 -0.54390901 238 10.3 68 9 19
## 102 -0.78432275 201 8.0 82 9 20
## 103 -0.87447790 238 12.6 64 9 21
## 104 -0.57396073 14 9.2 71 9 22
## 105 -0.18328840 139 10.3 81 9 23
## 106 -1.05478820 49 10.3 69 9 24
## 107 -0.84442618 20 16.6 63 9 25
## 108 -0.36359870 193 6.9 70 9 26
## 109 -0.84442618 191 14.3 75 9 28
## 110 -0.72421931 131 8.0 76 9 29
## 111 -0.66411588 223 11.5 68 9 30