An Extended Analysis of Predictor Variables, Assumptions, and Model Performance

# Required libraries
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.3

## Warning: package 'ggplot2' was built under R version 4.3.3

## Warning: package 'tidyr' was built under R version 4.3.3

## Warning: package 'readr' was built under R version 4.3.3

## Warning: package 'purrr' was built under R version 4.3.3

## Warning: package 'dplyr' was built under R version 4.3.3

## Warning: package 'stringr' was built under R version 4.3.3

## Warning: package 'forcats' was built under R version 4.3.3

## Warning: package 'lubridate' was built under R version 4.3.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggthemes)

## Warning: package 'ggthemes' was built under R version 4.3.3

# Global display options ----
# scipen = 6 biases number printing towards fixed notation over scientific;
# theme_minimal() becomes the default theme for every ggplot drawn afterwards.
options(scipen = 6)
theme_set(theme_minimal())

# Earthquake catalogue ----
# Read the CSV straight from GitHub into a tibble named `quakes`.
quakes <- read_csv(
  file = "https://raw.githubusercontent.com/leontoddjohnson/datasets/main/data/quakes/quakes.csv"
)

## Rows: 18334 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): magType, net, id, place, type, status, locationSource, magSource
## dbl  (12): latitude, longitude, depth, mag, nst, gap, dmin, rms, horizontalE...
## dttm  (2): time, updated
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Inspect the column names, types, and leading values of `quakes`
quakes |> str()

## spc_tbl_ [18,334 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ time           : POSIXct[1:18334], format: "2013-01-01 03:51:13" "2013-01-01 07:35:49" ...
##  $ latitude       : num [1:18334] -20.81 46.87 -15.84 -1.56 -16.45 ...
##  $ longitude      : num [1:18334] -69.7 151.1 -172.1 127.4 -173.4 ...
##  $ depth          : num [1:18334] 56.1 35 10 26.3 35 129 66.8 15 78.9 12.3 ...
##  $ mag            : num [1:18334] 5.1 5.1 5 5.6 5.1 5 5 5 5.1 5.2 ...
##  $ magType        : chr [1:18334] "mb" "mwb" "mb" "mwb" ...
##  $ nst            : num [1:18334] 64 519 199 139 194 102 177 48 139 235 ...
##  $ gap            : num [1:18334] 109.1 46.6 33 36.5 69.8 ...
##  $ dmin           : num [1:18334] NA NA NA NA NA NA NA NA NA NA ...
##  $ rms            : num [1:18334] NA 0.61 0.93 1.35 0.82 0.86 0.95 0.92 0.93 1.03 ...
##  $ net            : chr [1:18334] "us" "us" "us" "us" ...
##  $ id             : chr [1:18334] "usp000jxpn" "usp000jxpv" "usc000eihe" "usp000jxrn" ...
##  $ updated        : POSIXct[1:18334], format: "2014-11-07 01:49:43" "2022-05-03 16:02:39" ...
##  $ place          : chr [1:18334] "83 km SE of Iquique, Chile" "Kuril Islands" "177 km E of Hihifo, Tonga" "251 km NNW of Ambon, Indonesia" ...
##  $ type           : chr [1:18334] "earthquake" "earthquake" "earthquake" "earthquake" ...
##  $ horizontalError: num [1:18334] NA NA NA NA NA NA NA NA NA NA ...
##  $ depthError     : num [1:18334] NA NA NA 12.8 NA 4.7 4.3 0 8 3.5 ...
##  $ magError       : num [1:18334] NA NA NA NA NA NA NA NA NA NA ...
##  $ magNst         : num [1:18334] 18 NA 109 NA 123 59 114 33 64 56 ...
##  $ status         : chr [1:18334] "reviewed" "reviewed" "reviewed" "reviewed" ...
##  $ locationSource : chr [1:18334] "guc" "us" "us" "us" ...
##  $ magSource      : chr [1:18334] "us" "us" "us" "us" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   time = col_datetime(format = ""),
##   ..   latitude = col_double(),
##   ..   longitude = col_double(),
##   ..   depth = col_double(),
##   ..   mag = col_double(),
##   ..   magType = col_character(),
##   ..   nst = col_double(),
##   ..   gap = col_double(),
##   ..   dmin = col_double(),
##   ..   rms = col_double(),
##   ..   net = col_character(),
##   ..   id = col_character(),
##   ..   updated = col_datetime(format = ""),
##   ..   place = col_character(),
##   ..   type = col_character(),
##   ..   horizontalError = col_double(),
##   ..   depthError = col_double(),
##   ..   magError = col_double(),
##   ..   magNst = col_double(),
##   ..   status = col_character(),
##   ..   locationSource = col_character(),
##   ..   magSource = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

# Preview the first six records to see which variables are available
quakes |> head(n = 6L)

## # A tibble: 6 × 22
##   time                latitude longitude depth   mag magType   nst   gap  dmin
##   <dttm>                 <dbl>     <dbl> <dbl> <dbl> <chr>   <dbl> <dbl> <dbl>
## 1 2013-01-01 03:51:13   -20.8      -69.7  56.1   5.1 mb         64 109.     NA
## 2 2013-01-01 07:35:49    46.9      151.   35     5.1 mwb       519  46.6    NA
## 3 2013-01-02 19:35:15   -15.8     -172.   10     5   mb        199  33      NA
## 4 2013-01-03 00:02:16    -1.56     127.   26.3   5.6 mwb       139  36.5    NA
## 5 2013-01-04 13:13:45   -16.4     -173.   35     5.1 mb        194  69.8    NA
## 6 2013-01-04 17:59:13     1.38     127.  129     5   mb        102  25.7    NA
## # ℹ 13 more variables: rms <dbl>, net <chr>, id <chr>, updated <dttm>,
## #   place <chr>, type <chr>, horizontalError <dbl>, depthError <dbl>,
## #   magError <dbl>, magNst <dbl>, status <chr>, locationSource <chr>,
## #   magSource <chr>

# Correlation analysis ----
# Correlation matrix of depth and magnitude (cor() defaults), followed by
# the row that holds each variable's correlation with depth.
correlation_matrix <- quakes[, c("depth", "mag")] |> cor()
correlation_with_depth <- correlation_matrix["depth", , drop = TRUE]

# Assumptions of the linear model ----
# Scatterplot matrix to eyeball the depth-magnitude relationship.
pairs(~ depth + mag, data = quakes)

# Simple regression of depth on magnitude; the residuals are kept and
# plotted so the shape of their distribution can be inspected.
model <- lm(formula = depth ~ mag, data = quakes)
residuals <- model |> resid()
hist(residuals, main = "Histogram of Residuals")

# Interaction model on R's built-in `quakes` data ----
# This model uses the `stations` column, which exists only in the built-in
# `datasets::quakes` data (1,000 Fiji events); the CSV loaded above calls
# its station count `nst`. The built-in data is bound inside local() so it
# does NOT overwrite the global `quakes` tibble that `model` was fitted on.
# A bare data(quakes) would replace that tibble and make every later
# `quakes$...` reference point at different data.
# The local binding is still called `quakes`, so the printed Call is unchanged.
# NOTE(review): if the intent was to model the CSV data instead, fit
# `depth ~ mag * nst` on the global `quakes` -- confirm with the author.
subgroup_model <- local({
  quakes <- datasets::quakes
  lm(depth ~ mag * stations, data = quakes)
})

# Display the summary of the model
summary(subgroup_model)

## 
## Call:
## lm(formula = depth ~ mag * stations, data = quakes)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -418.12 -174.83  -50.52  200.93  451.75 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1659.0136   148.8416  11.146   <2e-16 ***
## mag          -323.7232    34.2126  -9.462   <2e-16 ***
## stations        4.9619     3.3022   1.503    0.133    
## mag:stations   -0.1095     0.6198  -0.177    0.860    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 203.9 on 996 degrees of freedom
## Multiple R-squared:  0.108,  Adjusted R-squared:  0.1053 
## F-statistic: 40.18 on 3 and 996 DF,  p-value: < 2.2e-16

# Impact of Assumption Violations / Evaluation Metrics ----
# In-sample error metrics for `model` (depth ~ mag).
#
# The observed response is taken from the model frame instead of
# `quakes$depth`, so there is exactly one observed value per fitted row.
# This holds even if lm() dropped rows with missing values or if `quakes`
# has been reassigned since the model was fitted. Comparing `predicted`
# with a vector of a different length would silently recycle the shorter
# one and yield meaningless metrics.
predicted <- predict(model)
observed <- model.response(model.frame(model))
stopifnot(length(predicted) == length(observed))

prediction_error <- predicted - observed
mse <- mean(prediction_error^2)       # mean squared error
mae <- mean(abs(prediction_error))    # mean absolute error
rsquared <- summary(model)$r.squared  # R-squared reported by summary.lm()