Preparing Workspace
rm(list = ls()) # Clear environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 533753 28.6 1190298 63.6 NA 669420 35.8
## Vcells 986097 7.6 8388608 64.0 16384 1851779 14.2
cat("\f") # Clear the console
if(!is.null(dev.list())) dev.off() # Clear all plots
## null device
## 1
Installing Packages
packages <- c("psych",
"tidyverse",
"ggplot2",
"lemon",
"gridExtra",
"ggrepel",
"scales",
"dplyr"
)
for (i in 1:length(packages)) {
if (!packages[i] %in% rownames(installed.packages())) {
install.packages(packages[i], dependencies = TRUE)
}
library(packages[i], character.only = TRUE)
}
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'lemon'
##
##
## The following object is masked from 'package:purrr':
##
## %||%
##
##
##
## Attaching package: 'gridExtra'
##
##
## The following object is masked from 'package:dplyr':
##
## combine
##
##
##
## Attaching package: 'scales'
##
##
## The following object is masked from 'package:purrr':
##
## discard
##
##
## The following object is masked from 'package:readr':
##
## col_factor
##
##
## The following objects are masked from 'package:psych':
##
## alpha, rescale
rm(packages)
Retrieving Dataset
data("airquality")
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
Imputing Missing Values
colSums(is.na(airquality))
## Ozone Solar.R Wind Temp Month Day
## 37 7 0 0 0 0
ozone.source.data <- airquality$Ozone #Storing source data
ozone.mean <- mean(airquality$Ozone, na.rm = T) #calculating mean ozone level
airquality$Ozone[is.na(airquality$Ozone)] <- mean(airquality$Ozone, na.rm=TRUE) #Replacing NAs with mean ozone level
sum(is.na(airquality$Ozone)) #Check
## [1] 0
summary(airquality$Ozone)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 21.00 42.13 42.13 46.00 168.00
solar.r.source.data <- airquality$Solar.R #Storing source data
solar.r.mean <- mean(airquality$Solar.R, na.rm = T) #calculating mean ozone level
airquality$Solar.R[is.na(airquality$Solar.R)] <- mean(airquality$Solar.R, na.rm=TRUE) #Replacing NAs with mean ozone level
sum(is.na(airquality$Solar.R)) #Check
## [1] 0
summary(airquality$Solar.R)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.0 120.0 194.0 185.9 256.0 334.0
colSums(is.na(airquality))
## Ozone Solar.R Wind Temp Month Day
## 0 0 0 0 0 0
Estimating Equation:
\[
Ozone \sim \beta_0 + Solar Radiation * \beta_1 + \epsilon
\] \[\epsilon \sim
N(0,\sigma^2)\]
Testing Correlation
cor(airquality$Ozone, airquality$Solar.R) #positive correlation
## [1] 0.3029695
Fitting Linear Model
air.quality.lm <- lm(airquality$Ozone ~ airquality$Solar.R)
summary(air.quality.lm)
##
## Call:
## lm(formula = airquality$Ozone ~ airquality$Solar.R)
##
## Residuals:
## Min 1Q Median 3Q Max
## -44.379 -17.125 -6.332 10.143 120.725
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.75348 5.20054 4.567 1.02e-05 ***
## airquality$Solar.R 0.09883 0.02530 3.907 0.000141 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.44 on 151 degrees of freedom
## Multiple R-squared: 0.09179, Adjusted R-squared: 0.08578
## F-statistic: 15.26 on 1 and 151 DF, p-value: 0.0001409
Plotting Line of Best Fit
plot(x = airquality$Solar.R, y = airquality$Ozone,
main = "Solar Radiation vs. Ozone Level",
xlab = "Solar Radiation",
ylab = "Ozone Level"
)
abline(air.quality.lm, col = "red")

Replicating Slope Parameter
covariance.xy <- cov(airquality$Solar.R, airquality$Ozone)
variance.x <- var(airquality$Solar.R)
beta.1 <- covariance.xy / variance.x
print(beta.1)
## [1] 0.09883118
Replicating Intercept Parameter
beta.0 <- mean(airquality$Ozone) - beta.1 * mean(airquality$Solar.R)
print(beta.0)
## [1] 23.75348
Part II