R Markdown

names(faults)
##  [1] "ogc_fid"            "accuracy"           "average_dip"       
##  [4] "is_active"          "name"               "dip_dir"           
##  [7] "downthrown_side_id" "last_movement"      "net_slip_rate"     
## [10] "strike_slip_rate"   "vert_slip_rate"     "fs_name"           
## [13] "slip_type"          "average_rake"       "reference"         
## [16] "exposure_quality"   "epistemic_quality"  "shortening_rate"   
## [19] "notes"              "geometry"
# Quick Plot

plot(st_geometry(faults), col = "red", lwd = 1)

parse_tuple_first_chr <- function(x) {
  out <- rep(NA_real_, length(x))
  idx <- which(!is.na(x))
  if (length(idx) == 0) return(out)

  s <- gsub("[()\\s]", "", x[idx])

  out[idx] <- vapply(strsplit(s, ",", fixed = TRUE), function(parts) {
    parts <- parts[parts != ""] 
    nums <- suppressWarnings(as.numeric(parts))
    nums <- nums[!is.na(nums)]
    if (length(nums) >= 1) nums[1] else NA_real_
  }, numeric(1))

  out
}
library(sf)
library(dplyr)

faults_df <- faults %>%
  st_drop_geometry() %>%
  mutate(
    strike_pref = parse_tuple_first_chr(strike_slip_rate),
    vert_pref   = parse_tuple_first_chr(vert_slip_rate),
    short_pref  = parse_tuple_first_chr(shortening_rate),
    net_pref    = parse_tuple_first_chr(net_slip_rate),
    dip_pref    = parse_tuple_first_chr(average_dip)
  )
summary(faults_df$short_pref)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -13.000  -0.100  -0.050   1.037   0.500  75.000     185
quantile(faults_df$short_pref, probs = c(0.05, 0.25, 0.5, 0.75, 0.95), na.rm = TRUE)
##    5%   25%   50%   75%   95% 
## -1.50 -0.10 -0.05  0.50  2.00
library(ggplot2)
library(dplyr)

q1 <- quantile(faults_df$short_pref, 0.25, na.rm = TRUE)
q3 <- quantile(faults_df$short_pref, 0.75, na.rm = TRUE)
iqr <- q3 - q1

lower_bound <- q1 - 1.5 * iqr
upper_bound <- q3 + 1.5 * iqr

# Remove outliers
faults_df_trim <- faults_df %>%
  filter(
    !is.na(short_pref),
    short_pref >= lower_bound,
    short_pref <= upper_bound
  )

# Replot
ggplot(faults_df_trim, aes(short_pref)) +
  geom_histogram(bins = 40, fill = "steelblue", alpha = 0.8) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Distribution of Shortening Rates (Outliers Removed)",
    x = "Shortening Rate (negative = extension)",
    y = "Count"
  )

Now we start to import the earthquake data and do some EDA

library(sf)
library(dplyr)
library(readr)
## Warning: package 'readr' was built under R version 4.5.2
eq_path <- "/Users/leoli/Desktop/UPenn 2025-2026/Quant/DE Shaw Case/central_am_earthquakes.tsv"
eq <- read_tsv(eq_path, show_col_types = FALSE)
names(eq)
##  [1] "Search Parameters"                  "Year"                              
##  [3] "Mo"                                 "Dy"                                
##  [5] "Hr"                                 "Mn"                                
##  [7] "Sec"                                "Tsu"                               
##  [9] "Vol"                                "Location Name"                     
## [11] "Latitude"                           "Longitude"                         
## [13] "Focal Depth (km)"                   "Mag"                               
## [15] "MMI Int"                            "Deaths"                            
## [17] "Death Description"                  "Missing"                           
## [19] "Missing Description"                "Injuries"                          
## [21] "Injuries Description"               "Damage ($Mil)"                     
## [23] "Damage Description"                 "Houses Destroyed"                  
## [25] "Houses Destroyed Description"       "Houses Damaged"                    
## [27] "Houses Damaged Description"         "Total Deaths"                      
## [29] "Total Death Description"            "Total Missing"                     
## [31] "Total Missing Description"          "Total Injuries"                    
## [33] "Total Injuries Description"         "Total Damage ($Mil)"               
## [35] "Total Damage Description"           "Total Houses Destroyed"            
## [37] "Total Houses Destroyed Description" "Total Houses Damaged"              
## [39] "Total Houses Damaged Description"
LON_COL <- "Longitude"   
LAT_COL <- "Latitude"    

Now we need to convert earthquakes into sf points (EPSG:4326)

First inspect and filter out missing coordinates

eq_clean <- eq %>%
  filter(
    !is.na(.data[[LON_COL]]),
    !is.na(.data[[LAT_COL]])
  )
summary(eq_clean)
##  Search Parameters       Year            Mo               Dy       
##  Length:182         Min.   :1565   Min.   : 1.000   Min.   : 1.00  
##  Class :character   1st Qu.:1884   1st Qu.: 4.000   1st Qu.: 6.00  
##  Mode  :character   Median :1950   Median : 7.000   Median :13.00  
##                     Mean   :1920   Mean   : 6.646   Mean   :14.44  
##                     3rd Qu.:1999   3rd Qu.:10.000   3rd Qu.:22.75  
##                     Max.   :2025   Max.   :12.000   Max.   :31.00  
##                                    NA's   :7        NA's   :12     
##        Hr              Mn             Sec             Tsu            Vol      
##  Min.   : 0.00   Min.   : 0.00   Min.   : 0.00   Min.   : 507   Min.   :3551  
##  1st Qu.: 4.75   1st Qu.:13.00   1st Qu.: 9.60   1st Qu.:1748   1st Qu.:3580  
##  Median :10.00   Median :27.00   Median :28.00   Median :5062   Median :3662  
##  Mean   :10.83   Mean   :27.21   Mean   :27.19   Mean   :3668   Mean   :3856  
##  3rd Qu.:17.00   3rd Qu.:42.00   3rd Qu.:44.00   3rd Qu.:5583   3rd Qu.:3664  
##  Max.   :23.00   Max.   :59.00   Max.   :58.90   Max.   :6080   Max.   :4824  
##  NA's   :50      NA's   :51      NA's   :63      NA's   :143    NA's   :177   
##  Location Name         Latitude       Longitude      Focal Depth (km)
##  Length:182         Min.   : 6.50   Min.   :-92.64   Min.   :  1.00  
##  Class :character   1st Qu.:10.11   1st Qu.:-89.37   1st Qu.: 10.50  
##  Mode  :character   Median :13.00   Median :-87.81   Median : 28.00  
##                     Mean   :12.25   Mean   :-87.19   Mean   : 34.38  
##                     3rd Qu.:13.83   3rd Qu.:-84.70   3rd Qu.: 44.50  
##                     Max.   :15.89   Max.   :-77.31   Max.   :135.00  
##                                                      NA's   :76      
##       Mag           MMI Int           Deaths        Death Description
##  Min.   :4.400   Min.   : 4.000   Min.   :    1.0   Min.   :1.000    
##  1st Qu.:5.750   1st Qu.: 6.000   1st Qu.:    2.0   1st Qu.:1.000    
##  Median :6.500   Median : 8.000   Median :    7.0   Median :1.000    
##  Mean   :6.388   Mean   : 7.203   Mean   :  429.4   Mean   :1.831    
##  3rd Qu.:7.150   3rd Qu.: 8.000   3rd Qu.:  300.0   3rd Qu.:3.000    
##  Max.   :7.900   Max.   :11.000   Max.   :10000.0   Max.   :4.000    
##  NA's   :43      NA's   :118      NA's   :121       NA's   :105      
##     Missing    Missing Description    Injuries        Injuries Description
##  Min.   :17    Min.   :1           Min.   :    1.00   Min.   :1.000       
##  1st Qu.:17    1st Qu.:1           1st Qu.:    4.00   1st Qu.:1.000       
##  Median :17    Median :1           Median :   25.00   Median :1.000       
##  Mean   :17    Mean   :1           Mean   :  932.82   Mean   :1.804       
##  3rd Qu.:17    3rd Qu.:1           3rd Qu.:   81.25   3rd Qu.:3.000       
##  Max.   :17    Max.   :1           Max.   :20000.00   Max.   :4.000       
##  NA's   :181   NA's   :181         NA's   :138        NA's   :126         
##  Damage ($Mil)     Damage Description Houses Destroyed
##  Min.   :   0.05   Min.   :1.0        Min.   :     2  
##  1st Qu.:   2.00   1st Qu.:1.0        1st Qu.:     7  
##  Median :  15.00   Median :2.0        Median :    84  
##  Mean   : 307.35   Mean   :2.2        Mean   :  6894  
##  3rd Qu.: 210.00   3rd Qu.:3.0        3rd Qu.:   522  
##  Max.   :2968.00   Max.   :4.0        Max.   :108226  
##  NA's   :161       NA's   :17         NA's   :159     
##  Houses Destroyed Description Houses Damaged   Houses Damaged Description
##  Min.   :1.000                Min.   :     2   Min.   :1.000             
##  1st Qu.:1.000                1st Qu.:    41   1st Qu.:2.000             
##  Median :2.000                Median :    86   Median :2.000             
##  Mean   :2.328                Mean   :  6987   Mean   :2.434             
##  3rd Qu.:3.000                3rd Qu.:   554   3rd Qu.:3.000             
##  Max.   :4.000                Max.   :169632   Max.   :4.000             
##  NA's   :124                  NA's   :153      NA's   :129               
##   Total Deaths     Total Death Description Total Missing
##  Min.   :    1.0   Min.   :1.000           Min.   :17   
##  1st Qu.:    2.0   1st Qu.:1.000           1st Qu.:17   
##  Median :    7.0   Median :1.000           Median :17   
##  Mean   :  390.7   Mean   :1.819           Mean   :17   
##  3rd Qu.:  158.8   3rd Qu.:3.000           3rd Qu.:17   
##  Max.   :10000.0   Max.   :4.000           Max.   :17   
##  NA's   :124       NA's   :110             NA's   :181  
##  Total Missing Description Total Injuries     Total Injuries Description
##  Min.   :1                 Min.   :    1.00   Min.   :1.000             
##  1st Qu.:1                 1st Qu.:    4.25   1st Qu.:1.000             
##  Median :1                 Median :   28.50   Median :1.000             
##  Mean   :1                 Mean   : 1122.46   Mean   :1.828             
##  3rd Qu.:1                 3rd Qu.:  100.00   3rd Qu.:3.000             
##  Max.   :1                 Max.   :20000.00   Max.   :4.000             
##  NA's   :181               NA's   :136        NA's   :124               
##  Total Damage ($Mil) Total Damage Description Total Houses Destroyed
##  Min.   :   0.05     Min.   :1.000            Min.   :     2.0      
##  1st Qu.:   2.00     1st Qu.:1.000            1st Qu.:     7.0      
##  Median :  21.25     Median :2.000            Median :   262.5      
##  Mean   : 322.71     Mean   :2.218            Mean   :  6710.6      
##  3rd Qu.: 244.62     3rd Qu.:3.000            3rd Qu.:   650.8      
##  Max.   :2968.00     Max.   :4.000            Max.   :108226.0      
##  NA's   :162         NA's   :40               NA's   :158           
##  Total Houses Destroyed Description Total Houses Damaged
##  Min.   :1.000                      Min.   :     2.00   
##  1st Qu.:1.000                      1st Qu.:    31.75   
##  Median :2.000                      Median :    75.00   
##  Mean   :2.344                      Mean   :  6724.60   
##  3rd Qu.:3.000                      3rd Qu.:   537.25   
##  Max.   :4.000                      Max.   :169632.00   
##  NA's   :121                        NA's   :152         
##  Total Houses Damaged Description
##  Min.   :1.000                   
##  1st Qu.:1.000                   
##  Median :2.000                   
##  Mean   :2.283                   
##  3rd Qu.:3.000                   
##  Max.   :4.000                   
##  NA's   :129

Now convert into sf

eq_sf <- st_as_sf(
  eq_clean,
  coords = c(LON_COL, LAT_COL),
  crs = 4326,
  remove = FALSE
)
summary(eq_sf)
##  Search Parameters       Year            Mo               Dy       
##  Length:182         Min.   :1565   Min.   : 1.000   Min.   : 1.00  
##  Class :character   1st Qu.:1884   1st Qu.: 4.000   1st Qu.: 6.00  
##  Mode  :character   Median :1950   Median : 7.000   Median :13.00  
##                     Mean   :1920   Mean   : 6.646   Mean   :14.44  
##                     3rd Qu.:1999   3rd Qu.:10.000   3rd Qu.:22.75  
##                     Max.   :2025   Max.   :12.000   Max.   :31.00  
##                                    NA's   :7        NA's   :12     
##        Hr              Mn             Sec             Tsu            Vol      
##  Min.   : 0.00   Min.   : 0.00   Min.   : 0.00   Min.   : 507   Min.   :3551  
##  1st Qu.: 4.75   1st Qu.:13.00   1st Qu.: 9.60   1st Qu.:1748   1st Qu.:3580  
##  Median :10.00   Median :27.00   Median :28.00   Median :5062   Median :3662  
##  Mean   :10.83   Mean   :27.21   Mean   :27.19   Mean   :3668   Mean   :3856  
##  3rd Qu.:17.00   3rd Qu.:42.00   3rd Qu.:44.00   3rd Qu.:5583   3rd Qu.:3664  
##  Max.   :23.00   Max.   :59.00   Max.   :58.90   Max.   :6080   Max.   :4824  
##  NA's   :50      NA's   :51      NA's   :63      NA's   :143    NA's   :177   
##  Location Name         Latitude       Longitude      Focal Depth (km)
##  Length:182         Min.   : 6.50   Min.   :-92.64   Min.   :  1.00  
##  Class :character   1st Qu.:10.11   1st Qu.:-89.37   1st Qu.: 10.50  
##  Mode  :character   Median :13.00   Median :-87.81   Median : 28.00  
##                     Mean   :12.25   Mean   :-87.19   Mean   : 34.38  
##                     3rd Qu.:13.83   3rd Qu.:-84.70   3rd Qu.: 44.50  
##                     Max.   :15.89   Max.   :-77.31   Max.   :135.00  
##                                                      NA's   :76      
##       Mag           MMI Int           Deaths        Death Description
##  Min.   :4.400   Min.   : 4.000   Min.   :    1.0   Min.   :1.000    
##  1st Qu.:5.750   1st Qu.: 6.000   1st Qu.:    2.0   1st Qu.:1.000    
##  Median :6.500   Median : 8.000   Median :    7.0   Median :1.000    
##  Mean   :6.388   Mean   : 7.203   Mean   :  429.4   Mean   :1.831    
##  3rd Qu.:7.150   3rd Qu.: 8.000   3rd Qu.:  300.0   3rd Qu.:3.000    
##  Max.   :7.900   Max.   :11.000   Max.   :10000.0   Max.   :4.000    
##  NA's   :43      NA's   :118      NA's   :121       NA's   :105      
##     Missing    Missing Description    Injuries        Injuries Description
##  Min.   :17    Min.   :1           Min.   :    1.00   Min.   :1.000       
##  1st Qu.:17    1st Qu.:1           1st Qu.:    4.00   1st Qu.:1.000       
##  Median :17    Median :1           Median :   25.00   Median :1.000       
##  Mean   :17    Mean   :1           Mean   :  932.82   Mean   :1.804       
##  3rd Qu.:17    3rd Qu.:1           3rd Qu.:   81.25   3rd Qu.:3.000       
##  Max.   :17    Max.   :1           Max.   :20000.00   Max.   :4.000       
##  NA's   :181   NA's   :181         NA's   :138        NA's   :126         
##  Damage ($Mil)     Damage Description Houses Destroyed
##  Min.   :   0.05   Min.   :1.0        Min.   :     2  
##  1st Qu.:   2.00   1st Qu.:1.0        1st Qu.:     7  
##  Median :  15.00   Median :2.0        Median :    84  
##  Mean   : 307.35   Mean   :2.2        Mean   :  6894  
##  3rd Qu.: 210.00   3rd Qu.:3.0        3rd Qu.:   522  
##  Max.   :2968.00   Max.   :4.0        Max.   :108226  
##  NA's   :161       NA's   :17         NA's   :159     
##  Houses Destroyed Description Houses Damaged   Houses Damaged Description
##  Min.   :1.000                Min.   :     2   Min.   :1.000             
##  1st Qu.:1.000                1st Qu.:    41   1st Qu.:2.000             
##  Median :2.000                Median :    86   Median :2.000             
##  Mean   :2.328                Mean   :  6987   Mean   :2.434             
##  3rd Qu.:3.000                3rd Qu.:   554   3rd Qu.:3.000             
##  Max.   :4.000                Max.   :169632   Max.   :4.000             
##  NA's   :124                  NA's   :153      NA's   :129               
##   Total Deaths     Total Death Description Total Missing
##  Min.   :    1.0   Min.   :1.000           Min.   :17   
##  1st Qu.:    2.0   1st Qu.:1.000           1st Qu.:17   
##  Median :    7.0   Median :1.000           Median :17   
##  Mean   :  390.7   Mean   :1.819           Mean   :17   
##  3rd Qu.:  158.8   3rd Qu.:3.000           3rd Qu.:17   
##  Max.   :10000.0   Max.   :4.000           Max.   :17   
##  NA's   :124       NA's   :110             NA's   :181  
##  Total Missing Description Total Injuries     Total Injuries Description
##  Min.   :1                 Min.   :    1.00   Min.   :1.000             
##  1st Qu.:1                 1st Qu.:    4.25   1st Qu.:1.000             
##  Median :1                 Median :   28.50   Median :1.000             
##  Mean   :1                 Mean   : 1122.46   Mean   :1.828             
##  3rd Qu.:1                 3rd Qu.:  100.00   3rd Qu.:3.000             
##  Max.   :1                 Max.   :20000.00   Max.   :4.000             
##  NA's   :181               NA's   :136        NA's   :124               
##  Total Damage ($Mil) Total Damage Description Total Houses Destroyed
##  Min.   :   0.05     Min.   :1.000            Min.   :     2.0      
##  1st Qu.:   2.00     1st Qu.:1.000            1st Qu.:     7.0      
##  Median :  21.25     Median :2.000            Median :   262.5      
##  Mean   : 322.71     Mean   :2.218            Mean   :  6710.6      
##  3rd Qu.: 244.62     3rd Qu.:3.000            3rd Qu.:   650.8      
##  Max.   :2968.00     Max.   :4.000            Max.   :108226.0      
##  NA's   :162         NA's   :40               NA's   :158           
##  Total Houses Destroyed Description Total Houses Damaged
##  Min.   :1.000                      Min.   :     2.00   
##  1st Qu.:1.000                      1st Qu.:    31.75   
##  Median :2.000                      Median :    75.00   
##  Mean   :2.344                      Mean   :  6724.60   
##  3rd Qu.:3.000                      3rd Qu.:   537.25   
##  Max.   :4.000                      Max.   :169632.00   
##  NA's   :121                        NA's   :152         
##  Total Houses Damaged Description          geometry  
##  Min.   :1.000                    POINT        :182  
##  1st Qu.:1.000                    epsg:4326    :  0  
##  Median :2.000                    +proj=long...:  0  
##  Mean   :2.283                                       
##  3rd Qu.:3.000                                       
##  Max.   :4.000                                       
##  NA's   :129
plot(
  st_geometry(faults),
  col = "red",
  lwd = 1,
  main = "Central America Faults (red) and Earthquakes (blue)"
)

plot(
  st_geometry(eq_sf),
  col = rgb(0, 0, 1, 0.5),  # semi-transparent blue
  pch = 16,
  cex = 0.4,
  add = TRUE
)

MAG_COL <- "Mag"

plot(
  st_geometry(faults),
  col = "red",
  lwd = 1,
  main = "Faults (red) and Earthquakes by Magnitude (blue)"
)

plot(
  st_geometry(eq_sf),
  col = rgb(0, 0, 1, 0.4),
  pch = 16,
  cex = scales::rescale(eq_sf[[MAG_COL]], to = c(0.4, 1.5)),
  add = TRUE
)

Now we project this and find the shortest distance between earthquake and fault for each earthquake

faults_utm <- st_transform(faults, 32616)
eq_utm     <- st_transform(eq_sf, 32616)

nearest_idx <- st_nearest_feature(eq_utm, faults_utm)
nearest_dist_m <- st_distance(eq_utm, faults_utm[nearest_idx, ], by_element = TRUE)

result <- eq_clean %>%
  mutate(
    quake_id = row_number(),
    nearest_fault_row = nearest_idx,
    dist_to_fault_m = as.numeric(nearest_dist_m)
  )
resultFaultAttached <- result %>%
  left_join(
    faults %>% st_drop_geometry() %>% mutate(nearest_fault_row = row_number()),
    by = "nearest_fault_row"
  )
resultFaultAttached
## # A tibble: 182 × 61
##    `Search Parameters`  Year    Mo    Dy    Hr    Mn   Sec   Tsu   Vol
##    <chr>               <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 <NA>                 1565     8    30    NA    NA    NA    NA  3580
##  2 <NA>                 1566     5    NA    NA    NA    NA    NA    NA
##  3 <NA>                 1575     6     2    NA    NA    NA    NA    NA
##  4 <NA>                 1576    NA    NA    NA    NA    NA    NA    NA
##  5 <NA>                 1579     3    16    NA    NA    NA  5646    NA
##  6 <NA>                 1581    NA    NA    NA    NA    NA    NA    NA
##  7 <NA>                 1586    NA    NA    NA    NA    NA    NA    NA
##  8 <NA>                 1594     4    21    NA    NA    NA    NA    NA
##  9 <NA>                 1621     5     2    NA    NA    NA    NA    NA
## 10 <NA>                 1648    NA    NA    NA    NA    NA    NA    NA
## # ℹ 172 more rows
## # ℹ 52 more variables: `Location Name` <chr>, Latitude <dbl>, Longitude <dbl>,
## #   `Focal Depth (km)` <dbl>, Mag <dbl>, `MMI Int` <dbl>, Deaths <dbl>,
## #   `Death Description` <dbl>, Missing <dbl>, `Missing Description` <dbl>,
## #   Injuries <dbl>, `Injuries Description` <dbl>, `Damage ($Mil)` <dbl>,
## #   `Damage Description` <dbl>, `Houses Destroyed` <dbl>,
## #   `Houses Destroyed Description` <dbl>, `Houses Damaged` <dbl>, …

Quick check

summary(result$dist_to_fault_m)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    136.1   8700.9  17419.2  30471.5  43028.9 252665.2
head(result %>% select(quake_id, nearest_fault_row, dist_to_fault_m))
## # A tibble: 6 × 3
##   quake_id nearest_fault_row dist_to_fault_m
##      <int>             <int>           <dbl>
## 1        1                31           8701.
## 2        2                31           8701.
## 3        3                93          16319.
## 4        4                31           8701.
## 5        5               185          31190.
## 6        6                93          16319.
# drop geometry from faults and keep attributes
faults_attr <- faults %>%
  st_drop_geometry() %>%
  mutate(nearest_fault_row = row_number())

# combine earthquake + fault data
reg_df <- result %>%
  left_join(faults_attr, by = "nearest_fault_row")

LM: Magnitude vs. Distance

reg_simple <- result %>%
  select(
    magnitude = all_of(MAG_COL),
    dist_to_fault_m
  ) %>%
  filter(
    !is.na(magnitude),
    !is.na(dist_to_fault_m)
  ) %>%
  mutate(
    dist_km = dist_to_fault_m / 1000
  )
nrow(reg_simple)
## [1] 139
summary(reg_simple)
##    magnitude     dist_to_fault_m       dist_km        
##  Min.   :4.400   Min.   :   136.1   Min.   :  0.1361  
##  1st Qu.:5.750   1st Qu.:  8727.8   1st Qu.:  8.7278  
##  Median :6.500   Median : 19239.3   Median : 19.2393  
##  Mean   :6.388   Mean   : 32371.4   Mean   : 32.3714  
##  3rd Qu.:7.150   3rd Qu.: 46987.0   3rd Qu.: 46.9870  
##  Max.   :7.900   Max.   :252665.2   Max.   :252.6652
lm_simple <- lm(magnitude ~ dist_km, data = reg_simple)
summary(lm_simple)
## 
## Call:
## lm(formula = magnitude ~ dist_km, data = reg_simple)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.08848 -0.58040  0.06461  0.68967  1.64926 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.236726   0.097438  64.007   <2e-16 ***
## dist_km     0.004688   0.001947   2.408   0.0174 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8761 on 137 degrees of freedom
## Multiple R-squared:  0.0406, Adjusted R-squared:  0.0336 
## F-statistic: 5.798 on 1 and 137 DF,  p-value: 0.01737
ggplot(reg_simple, aes(x = dist_km, y = magnitude)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  labs(
    x = "Distance to nearest fault (km)",
    y = "Earthquake magnitude",
    title = "Earthquake Magnitude vs Distance to Nearest Fault"
  )
## `geom_smooth()` using formula = 'y ~ x'

names(reg_df)
##  [1] "Search Parameters"                  "Year"                              
##  [3] "Mo"                                 "Dy"                                
##  [5] "Hr"                                 "Mn"                                
##  [7] "Sec"                                "Tsu"                               
##  [9] "Vol"                                "Location Name"                     
## [11] "Latitude"                           "Longitude"                         
## [13] "Focal Depth (km)"                   "Mag"                               
## [15] "MMI Int"                            "Deaths"                            
## [17] "Death Description"                  "Missing"                           
## [19] "Missing Description"                "Injuries"                          
## [21] "Injuries Description"               "Damage ($Mil)"                     
## [23] "Damage Description"                 "Houses Destroyed"                  
## [25] "Houses Destroyed Description"       "Houses Damaged"                    
## [27] "Houses Damaged Description"         "Total Deaths"                      
## [29] "Total Death Description"            "Total Missing"                     
## [31] "Total Missing Description"          "Total Injuries"                    
## [33] "Total Injuries Description"         "Total Damage ($Mil)"               
## [35] "Total Damage Description"           "Total Houses Destroyed"            
## [37] "Total Houses Destroyed Description" "Total Houses Damaged"              
## [39] "Total Houses Damaged Description"   "quake_id"                          
## [41] "nearest_fault_row"                  "dist_to_fault_m"                   
## [43] "ogc_fid"                            "accuracy"                          
## [45] "average_dip"                        "is_active"                         
## [47] "name"                               "dip_dir"                           
## [49] "downthrown_side_id"                 "last_movement"                     
## [51] "net_slip_rate"                      "strike_slip_rate"                  
## [53] "vert_slip_rate"                     "fs_name"                           
## [55] "slip_type"                          "average_rake"                      
## [57] "reference"                          "exposure_quality"                  
## [59] "epistemic_quality"                  "shortening_rate"                   
## [61] "notes"
reg_joint <- reg_df %>%
  select(
    magnitude = Mag,
    depth_km  = `Focal Depth (km)`,
    dist_m    = dist_to_fault_m
  ) %>%
  filter(
    !is.na(magnitude),
    !is.na(depth_km),
    !is.na(dist_m)
  ) %>%
  mutate(
    dist_km = dist_m / 1000
  )

nrow(reg_joint)
## [1] 106
summary(reg_joint)
##    magnitude        depth_km          dist_m            dist_km        
##  Min.   :4.400   Min.   :  1.00   Min.   :   136.1   Min.   :  0.1361  
##  1st Qu.:5.600   1st Qu.: 10.50   1st Qu.:  8771.9   1st Qu.:  8.7719  
##  Median :6.400   Median : 28.00   Median : 21426.3   Median : 21.4263  
##  Mean   :6.281   Mean   : 34.38   Mean   : 35652.1   Mean   : 35.6521  
##  3rd Qu.:6.975   3rd Qu.: 44.50   3rd Qu.: 54362.2   3rd Qu.: 54.3622  
##  Max.   :7.900   Max.   :135.00   Max.   :252665.2   Max.   :252.6652
lm_joint <- lm(
  magnitude ~ depth_km + dist_km,
  data = reg_joint
)

summary(lm_joint)
## 
## Call:
## lm(formula = magnitude ~ depth_km + dist_km, data = reg_joint)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98688 -0.59763  0.00751  0.67077  1.75065 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.839945   0.138967  42.024   <2e-16 ***
## depth_km    0.007451   0.002923   2.549   0.0123 *  
## dist_km     0.005190   0.002032   2.554   0.0121 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8514 on 103 degrees of freedom
## Multiple R-squared:  0.1361, Adjusted R-squared:  0.1193 
## F-statistic: 8.111 on 2 and 103 DF,  p-value: 0.0005357

Adjust for heteroskedastic

install.packages(“sandwich”) install.packages(“lmtest”)

library(sandwich)
library(lmtest)
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.5.2
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
coeftest(lm_joint, vcov = vcovHC(lm_joint, type = "HC3"))
## 
## t test of coefficients:
## 
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.8399450  0.1366384 42.7401  < 2e-16 ***
## depth_km    0.0074514  0.0029520  2.5242  0.01312 *  
## dist_km     0.0051898  0.0028494  1.8214  0.07145 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

So depth_km is statistically significant

# Partial regression (added-variable) plots
par(mfrow = c(1, 2))

# Effect of depth controlling for dist
avPlot_x <- function(fit, xname) {
  # residualize y and x against the other predictors
  others <- setdiff(names(fit$model)[-1], xname)
  y_res <- resid(lm(fit$model[[1]] ~ ., data = fit$model[others]))
  x_res <- resid(lm(fit$model[[xname]] ~ ., data = fit$model[others]))
  
  plot(x_res, y_res,
       xlab = paste0(xname, " (residualized)"),
       ylab = "magnitude (residualized)",
       main = paste("Added-variable plot:", xname),
       pch = 16, col = rgb(0,0,1,0.4))
  abline(lm(y_res ~ x_res), lwd = 2, col = "black")
}

avPlot_x(lm_joint, "depth_km")
avPlot_x(lm_joint, "dist_km")

par(mfrow = c(1, 1))
# Create prediction grids
depth_grid <- seq(min(reg_joint$depth_km, na.rm = TRUE),
                  max(reg_joint$depth_km, na.rm = TRUE),
                  length.out = 100)

dist_grid <- seq(min(reg_joint$dist_km, na.rm = TRUE),
                 max(reg_joint$dist_km, na.rm = TRUE),
                 length.out = 100)

depth_med <- median(reg_joint$depth_km, na.rm = TRUE)
dist_med  <- median(reg_joint$dist_km,  na.rm = TRUE)

# Predicted vs depth (distance fixed at median)
new_depth <- data.frame(depth_km = depth_grid, dist_km = dist_med)
pred_depth <- predict(lm_joint, newdata = new_depth, interval = "confidence")

plot(depth_grid, pred_depth[, "fit"], type = "l", lwd = 2,
     xlab = "Depth (km)", ylab = "Predicted magnitude",
     main = paste0("Predicted magnitude vs depth (dist_km = ", round(dist_med,1), ")"))
lines(depth_grid, pred_depth[, "lwr"], lty = 2)
lines(depth_grid, pred_depth[, "upr"], lty = 2)

# Predicted vs distance (depth fixed at median)
new_dist <- data.frame(depth_km = depth_med, dist_km = dist_grid)
pred_dist <- predict(lm_joint, newdata = new_dist, interval = "confidence")

plot(dist_grid, pred_dist[, "fit"], type = "l", lwd = 2,
     xlab = "Distance to nearest fault (km)", ylab = "Predicted magnitude",
     main = paste0("Predicted magnitude vs distance (depth_km = ", round(depth_med,1), ")"))
lines(dist_grid, pred_dist[, "lwr"], lty = 2)
lines(dist_grid, pred_dist[, "upr"], lty = 2)

library(dplyr)
library(sf)

# Fault attributes table with a join key
faults_attr <- faults %>%
  st_drop_geometry() %>%
  mutate(nearest_fault_row = row_number())

# Count earthquakes per fault (from your nearest-fault assignment)
eq_counts <- reg_df %>%   # or use `result` if that's the one containing nearest_fault_row
  filter(!is.na(nearest_fault_row)) %>%
  count(nearest_fault_row, name = "n_quakes") %>%
  arrange(desc(n_quakes))

# Join counts back onto fault attributes
faults_ranked <- eq_counts %>%
  left_join(faults_attr, by = "nearest_fault_row") %>%
  arrange(desc(n_quakes))
faults_with_counts <- faults_attr %>%
  left_join(eq_counts, by = "nearest_fault_row") %>%
  mutate(n_quakes = ifelse(is.na(n_quakes), 0L, n_quakes))

# Slip type comparison
faults_with_counts %>%
  group_by(slip_type) %>%
  summarise(
    n_faults = n(),
    mean_quakes = mean(n_quakes),
    median_quakes = median(n_quakes),
    total_quakes = sum(n_quakes)
  ) %>%
  arrange(desc(mean_quakes))
## # A tibble: 16 × 5
##    slip_type           n_faults mean_quakes median_quakes total_quakes
##    <chr>                  <int>       <dbl>         <dbl>        <int>
##  1 Subduction Thrust          1      19                19           19
##  2 Dextral                   15       4.67              4           70
##  3 Sinistral Normal           1       2                 2            2
##  4 Dextral-Normal            13       1.23              0           16
##  5 <NA>                       7       0.857             0            6
##  6 Sinistral-Reverse          5       0.6               0            3
##  7 Normal                    69       0.551             0           38
##  8 Thrust                    21       0.381             0            8
##  9 Normal-Dextral             4       0.25              0            1
## 10 Sinistral Transform        9       0.222             0            2
## 11 Sinistral-Normal          10       0.2               0            2
## 12 Sinistral                 69       0.174             0           12
## 13 Normal-Sinistral           8       0.125             0            1
## 14 Reverse                   25       0.08              0            2
## 15 Reverse-Sinistral          1       0                 0            0
## 16 Spreading Ridge            1       0                 0            0
# Is_Active

faults_with_counts %>%
  group_by(is_active) %>%
  summarise(
    n_faults = n(),
    mean_quakes = mean(n_quakes),
    median_quakes = median(n_quakes),
    total_quakes = sum(n_quakes)
  ) %>%
  arrange(desc(mean_quakes))
## # A tibble: 3 × 5
##   is_active n_faults mean_quakes median_quakes total_quakes
##       <int>    <int>       <dbl>         <int>        <int>
## 1         1      149      1.15               0          171
## 2        NA       69      0.101              0            7
## 3         2       41      0.0976             0            4
parse_tuple_first_chr <- function(x) {
  out <- rep(NA_real_, length(x))
  idx <- which(!is.na(x))
  if (length(idx) == 0) return(out)
  s <- gsub("[()\\s]", "", x[idx])
  out[idx] <- vapply(strsplit(s, ",", fixed = TRUE), function(parts) {
    parts <- parts[parts != ""]
    nums <- suppressWarnings(as.numeric(parts))
    nums <- nums[!is.na(nums)]
    if (length(nums) >= 1) nums[1] else NA_real_
  }, numeric(1))
  out
}

faults_num <- faults_with_counts %>%
  mutate(
    dip_pref    = parse_tuple_first_chr(average_dip),
    net_pref    = parse_tuple_first_chr(net_slip_rate),
    strike_pref = parse_tuple_first_chr(strike_slip_rate),
    vert_pref   = parse_tuple_first_chr(vert_slip_rate),
    short_pref  = parse_tuple_first_chr(shortening_rate)
  )

# Simple correlations with quake counts
cor(faults_num$n_quakes, faults_num$strike_pref, use = "complete.obs")
## [1] 0.1090592
cor(faults_num$n_quakes, faults_num$net_pref,    use = "complete.obs")
## [1] 0.5881437
cor(faults_num$n_quakes, faults_num$vert_pref,   use = "complete.obs")
## [1] 0.5303874
cor(faults_num$n_quakes, faults_num$dip_pref,    use = "complete.obs")
## [1] 0.07939976
cor(faults_num$n_quakes, faults_num$short_pref,  use = "complete.obs")
## [1] 0.7344656
pca_df <- faults_num %>%
  select(
    strike_pref,
    short_pref,
    dip_pref
  ) %>%
  filter(complete.cases(.))

pca_fit <- prcomp(pca_df, scale. = TRUE)
summary(pca_fit)
## Importance of components:
##                          PC1    PC2       PC3
## Standard deviation     1.507 0.8538 7.841e-17
## Proportion of Variance 0.757 0.2430 0.000e+00
## Cumulative Proportion  0.757 1.0000 1.000e+00
pca_fit$rotation
##                    PC1         PC2        PC3
## strike_pref  0.5692008  0.60202852 -0.5599751
## short_pref  -0.6624806 -0.06755787 -0.7460264
## dip_pref     0.4869599 -0.79561146 -0.3603782