Proyecto terremotos

importar librerias

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors

library(gghighlight)
library(leaflet)
library(IRdisplay)
options(scipen = 999)
options(warn = -1)
library(lubridate)
library(viridis)

## Loading required package: viridisLite

library(data.table)

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose

library(caTools)
library(kernlab)

## 
## Attaching package: 'kernlab'
## 
## The following object is masked from 'package:purrr':
## 
##     cross
## 
## The following object is masked from 'package:ggplot2':
## 
##     alpha

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(ROSE)

## Loaded ROSE 0.0-4

list.files(path = "../input")

## character(0)

earthquake <- read_csv("all_month.csv")

## Rows: 8161 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): magType, net, id, place, type, status, locationSource, magSource
## dbl  (12): latitude, longitude, depth, mag, nst, gap, dmin, rms, horizontalE...
## dttm  (2): time, updated
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

glimpse(earthquake)

## Rows: 8,161
## Columns: 22
## $ time            <dttm> 2019-03-11 04:30:18, 2019-03-11 04:24:26, 2019-03-11 …
## $ latitude        <dbl> 33.52100, 61.06940, 38.77333, 65.01810, 37.29130, 35.1…
## $ longitude       <dbl> -116.7945, -151.8329, -122.7435, -148.7292, -117.5088,…
## $ depth           <dbl> 2.82, 105.60, 1.16, 8.60, 6.90, 5.00, 6.20, 48.70, 2.5…
## $ mag             <dbl> 0.44, 1.10, 0.73, 1.60, 0.70, 1.80, 1.70, 1.90, 0.56, …
## $ magType         <chr> "ml", "ml", "md", "ml", "ml", "mb_lg", "ml", "ml", "md…
## $ nst             <dbl> 18, NA, 5, NA, 8, NA, 30, NA, 10, NA, NA, NA, NA, NA, …
## $ gap             <dbl> 53.00, NA, 213.00, NA, 176.36, 64.00, 93.14, NA, 77.00…
## $ dmin            <dbl> 0.009898, NA, 0.011220, NA, 0.375000, 0.219000, 0.1370…
## $ rms             <dbl> 0.1400, 0.2500, 0.0100, 0.5100, 0.2800, 0.5100, 0.3300…
## $ net             <chr> "ci", "ak", "nc", "ak", "nn", "us", "nn", "ak", "nc", …
## $ id              <chr> "ci38263479", "ak01937u587x", "nc73150441", "ak01937u2…
## $ updated         <dttm> 2019-03-11 04:33:55, 2019-03-11 04:27:53, 2019-03-11 …
## $ place           <chr> "11km NE of Aguanga, CA", "51km NW of Nikiski, Alaska"…
## $ type            <chr> "earthquake", "earthquake", "earthquake", "earthquake"…
## $ horizontalError <dbl> 0.25, NA, 1.55, NA, NA, 1.10, NA, NA, 0.42, 5.80, NA, …
## $ depthError      <dbl> 1.02, 1.50, 0.70, 0.20, 24.50, 1.70, 0.70, 1.50, 1.31,…
## $ magError        <dbl> 0.129, NA, 0.230, NA, NA, 0.255, NA, NA, NA, 0.080, NA…
## $ magNst          <dbl> 11, NA, 2, NA, NA, 4, NA, NA, 1, 15, NA, NA, 11, 60, 1…
## $ status          <chr> "automatic", "automatic", "automatic", "automatic", "a…
## $ locationSource  <chr> "ci", "ak", "nc", "ak", "nn", "us", "nn", "ak", "nc", …
## $ magSource       <chr> "ci", "ak", "nc", "ak", "nn", "us", "nn", "ak", "nc", …

head(earthquake)

## # A tibble: 6 × 22
##   time                latitude longi…¹  depth   mag magType   nst   gap     dmin
##   <dttm>                 <dbl>   <dbl>  <dbl> <dbl> <chr>   <dbl> <dbl>    <dbl>
## 1 2019-03-11 04:30:18     33.5  -117.    2.82  0.44 ml         18   53   0.00990
## 2 2019-03-11 04:24:26     61.1  -152.  106.    1.1  ml         NA   NA  NA      
## 3 2019-03-11 04:22:42     38.8  -123.    1.16  0.73 md          5  213   0.0112 
## 4 2019-03-11 04:13:42     65.0  -149.    8.6   1.6  ml         NA   NA  NA      
## 5 2019-03-11 04:10:13     37.3  -118.    6.9   0.7  ml          8  176.  0.375  
## 6 2019-03-11 04:04:12     35.1   -97.6   5     1.8  mb_lg      NA   64   0.219  
## # … with 13 more variables: rms <dbl>, net <chr>, id <chr>, updated <dttm>,
## #   place <chr>, type <chr>, horizontalError <dbl>, depthError <dbl>,
## #   magError <dbl>, magNst <dbl>, status <chr>, locationSource <chr>,
## #   magSource <chr>, and abbreviated variable name ¹longitude

tabla 1

La primera tabla muestra la organización de la toma de datos, donde obtendremos los siguientes datos: la profundidad del evento en kilómetros, la incertidumbre de la profundidad reportada, la distancia horizontal al epicentro, la incertidumbre de la ubicación reportada, la única identificador, la latitud en decimales, la magnitud del sismo y su incertidumbre, el número de estaciones sísmicas utilizadas para calcular la magnitud, etc.

options(repr.plot.width=12, repr.plot.height=6) 

 

ggplot(earthquake, aes(time, mag)) +  

geom_line(color = 'pink')+ theme_bw()+  

ggtitle("Earthquake Magnitude")+ 

xlab('Date')+ 

ylab('Magnitude')+ 

theme(plot.title = element_text(hjust = 0.5))

Grafica 2

En este gráfico puedes ver la correlación (día-magnitud) de la magnitud de los sismos entre febrero 2019 y marzo 2019.

#Location 

earthquake$location <- sub('.*,\\s*','', earthquake$place) 

 

#Time of the day in 'Hour' 

earthquake$hour <- ymd_hms(earthquake$time) 

earthquake$hour <- hour(earthquake$hour) 

 

#Visualizing the number of quakes that have happened at a particular time 

earthquake %>%  

filter(!is.na(mag))%>% 

group_by(hour)%>% 

summarise(count = length(id),max_magnitude = max(mag))%>% 

ggplot(aes(hour,count, color = max_magnitude))+geom_line()+ 

scale_color_viridis(direction = -1)+ 

scale_x_continuous(breaks=seq(0,23,1))+ 

xlab("Time of the day")+ 

ylab("Number of earthquakes")+ 

ggtitle("Number of quakes as per time of the day")+ 

theme_bw()+ 

theme(plot.title = element_text(hjust = 0.5))

Grafico 3

Este gráfico de líneas muestra el número de terremotos que ocurrieron durante un día determinado. Se puede observar que la mayoría de los sismos ocurren a la medianoche, y la hora en la que menos sismos ocurren es entre las 5 y las 7 de la noche. Los sismos más fuertes ocurrieron alrededor de las 8 y 9

earthquake %>%  

group_by(location) %>%  

filter(!(is.na(mag)))%>% 

summarise(Number_of_quakes = length(location),  

          Average_Magnitude = mean(mag))%>% 

mutate(Percent = round(prop.table(Number_of_quakes)*100,2))%>% 

arrange(desc(Number_of_quakes))%>% top_n(25, Number_of_quakes)

## # A tibble: 25 × 4
##    location    Number_of_quakes Average_Magnitude Percent
##    <chr>                  <int>             <dbl>   <dbl>
##  1 CA                      2643             1.02    32.4 
##  2 Alaska                  2551             1.72    31.3 
##  3 Nevada                   480             0.776    5.88
##  4 Utah                     480             1.39     5.88
##  5 Puerto Rico              269             2.37     3.3 
##  6 Hawaii                   241             1.83     2.95
##  7 Montana                  174             1.27     2.13
##  8 Washington               114             1.06     1.4 
##  9 Indonesia                102             4.60     1.25
## 10 California                86             0.980    1.05
## # … with 15 more rows

Grafica 4

En este gráfico puedes ver la cantidad de terremotos en cada país, así como la magnitud promedio de los terremotos en cada país y el porcentaje a nivel mundial.

ggplot(earthquake, aes(mag,depth,color = hour))+ 

geom_jitter(alpha = 0.5)+ 

gghighlight(max(depth>200)| max(mag>4))+ 

scale_x_continuous(breaks=seq(0,9,1))+ 

scale_color_viridis(direction = -1)+ 

theme_bw()+ 

xlab('Magnitude')+ 

ylab('Depth')+ 

ggtitle('Magnitude Vs. Depth in Km')+ 

theme(plot.title = element_text(hjust = 0.5))

Grafico 5

Grafico de dispersión de el tamaño y la profundidad de la parcela para ver si el tamaño es proporcional a la profundidad. Y es posible demostrar que los terremotos más pequeños suelen ocurrir a una profundidad de 200 kilómetros. Y los epicentros de algunos de los terremotos más pequeños estaban a mayores profundidades.

earthquake %>% 

group_by(type)%>% 

summarise(count = length(type))%>% 

mutate(Percent = prop.table(count*100))%>% 

ggplot(aes(type,Percent, fill = type))+ 

geom_col()+theme_bw()+ 

xlab('Type of Seismic Activity')+ 

ylab('Percent')+ 

ggtitle('Percent of Seismic Activity')+ 

theme(plot.title = element_text(hjust = 0.5))

Grafico 6

El cuadro muestra un diagrama de caja que la mayor parte de la actividad sísmica no natural ocurre principalmente en la superficie de la tierra. Los terremotos, por otro lado, ocurren principalmente de 10 a 20 kilómetros debajo de la superficie, pero también pueden ocurrir varios kilómetros debajo de la superficie. Pero eso generalmente no sucede con otros tipos de actividad sísmica.

ggplot(earthquake, aes('',depth, color = type))+geom_boxplot()+ 

scale_y_continuous(limits = c(0, 100))+theme_bw()+ 

xlab('Type of Seismic Activity')+ 

ggtitle('Depth of Seismic Activity')

Grafico 7

Este recuadro muestra que la magnitud de los terremotos naturales se puede correlacionar entre 0 y 7 grados. Los terremotos causados por el hombre son mucho más pequeños, por lo general entre 0 y 2,2 en la escala de Richter.

ggplot(earthquake, aes('',mag, color = type))+geom_boxplot()+ 

scale_y_continuous(limits = c(0, 8))+theme_bw()+ 

xlab('Type of Seismic Activity')+ 

ggtitle('Magnitude of Seismic Activity')

Grafico 8

Como puede ver en este recuadro, los terremotos en ciertos momentos pueden ser causados por explosiones químicas en el área, ya que estadísticamente pocos terremotos naturales ocurren entre la medianoche y las 5:00 a.m.

ggplot(earthquake, aes('',hour, color = type))+geom_boxplot()+ 

scale_y_continuous(limits = c(0, 24))+theme_bw()+ 

xlab('Type of Seismic Activity')+ 

ggtitle('Time of Seismic Activity')

Grafica 9

A continuación, se intenta predecir las causas de los terremotos considerando factores como la profundidad, el tamaño y la ubicación, ya que la sismicidad natural e inducida ha estimado la extensión de los factores anteriores.

#Choosing the relevant columns 

quake <- earthquake[,c(2,3,4,5,24,15)] 

 

#Categorising the type 

quake$type <- ifelse(quake$type == 'earthquake', 'Earthquake', 'Otherquake') 

 

#quake[,c(1:5)] <- scale(quake[,c(1:5)]) 

 

#Converting our target column 'type' into a factor 

quake$type <- factor(quake$type) 

 

#Checking and removing any row having NA 

colSums(is.na(quake))

##  latitude longitude     depth       mag      hour      type 
##         0         0         0         4         0         0

quake <- quake[!is.na(quake$mag),]

head(quake)

## # A tibble: 6 × 6
##   latitude longitude  depth   mag  hour type      
##      <dbl>     <dbl>  <dbl> <dbl> <int> <fct>     
## 1     33.5    -117.    2.82  0.44     4 Earthquake
## 2     61.1    -152.  106.    1.1      4 Earthquake
## 3     38.8    -123.    1.16  0.73     4 Earthquake
## 4     65.0    -149.    8.6   1.6      4 Earthquake
## 5     37.3    -118.    6.9   0.7      4 Earthquake
## 6     35.1     -97.6   5     1.8      4 Earthquake

table(quake$type)

## 
## Earthquake Otherquake 
##       8005        152

# splitting the data between train and test 

set.seed(1234) 

 

indices = sample.split(quake$type, SplitRatio = 0.7) 

 

train = quake[indices,] 

 

test = quake[!(indices),] 

 

head(train)

## # A tibble: 6 × 6
##   latitude longitude  depth   mag  hour type      
##      <dbl>     <dbl>  <dbl> <dbl> <int> <fct>     
## 1     33.5    -117.    2.82  0.44     4 Earthquake
## 2     61.1    -152.  106.    1.1      4 Earthquake
## 3     38.8    -123.    1.16  0.73     4 Earthquake
## 4     65.0    -149.    8.6   1.6      4 Earthquake
## 5     35.1     -97.6   5     1.8      4 Earthquake
## 6     37.3    -117.    6.2   1.7      3 Earthquake

table(train$type)

## 
## Earthquake Otherquake 
##       5604        106

table(test$type)

## 
## Earthquake Otherquake 
##       2401         46

sin tratatar el desequilirbio de clases

Podemos ver claramente los errores en esta predicción. Todos los terremotos se pronosticaron correctamente, lo que hizo que la sensibilidad (verdaderos positivos) fuera muy alta. Sin embargo, ninguno de los “otros terremotos” se predijo correctamente, lo que resultó en una especificidad (negativo verdadero) de 0.

set.seed(1234) 

#Using RBF Kernel 

Model_RBF <- ksvm(type~ ., data = train, scale = FALSE, kernel = "rbfdot") 

Eval_RBF <- predict(Model_RBF, test[,-6]) 

 

  #confusion matrix - RBF Kernel 

  confusionMatrix(Eval_RBF,test$type)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Earthquake Otherquake
##   Earthquake       2401         46
##   Otherquake          0          0
##                                           
##                Accuracy : 0.9812          
##                  95% CI : (0.975, 0.9862) 
##     No Information Rate : 0.9812          
##     P-Value [Acc > NIR] : 0.5391          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 0.00000000003247
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9812          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.9812          
##          Detection Rate : 0.9812          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Earthquake      
##

tratando el equilibrio de clases

Se puede ver claramente un aumento significativo en la especificidad. Aunque se pronosticaron incorrectamente 150 terremotos, 47 de los 55 terremotos restantes en nuestros datos de prueba se pronosticaron correctamente. La introducción del sobremuestreo ayuda mucho a lidiar con el desequilibrio de clases y mejora significativamente nuestra especificidad al 85%.

set.seed(1234) 

over <- ovun.sample(type~., data = train, method="over", N= 11640)$data

set.seed(1234) 

#Using RBF Kernel 

Model_RBF <- ksvm(type~ ., data = over, scale = FALSE, kernel = "rbfdot") 

Eval_RBF <- predict(Model_RBF, test[,-6]) 

 

  #confusion matrix - RBF Kernel 

  confusionMatrix(Eval_RBF,test$type)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   Earthquake Otherquake
##   Earthquake       2285          4
##   Otherquake        116         42
##                                              
##                Accuracy : 0.951              
##                  95% CI : (0.9416, 0.9592)   
##     No Information Rate : 0.9812             
##     P-Value [Acc > NIR] : 1                  
##                                              
##                   Kappa : 0.3941             
##                                              
##  Mcnemar's Test P-Value : <0.0000000000000002
##                                              
##             Sensitivity : 0.9517             
##             Specificity : 0.9130             
##          Pos Pred Value : 0.9983             
##          Neg Pred Value : 0.2658             
##              Prevalence : 0.9812             
##          Detection Rate : 0.9338             
##    Detection Prevalence : 0.9354             
##       Balanced Accuracy : 0.9324             
##                                              
##        'Positive' Class : Earthquake         
##

Mapa 1

Mapa 2