Previsione su eventi siccitosi nel mondo

Caricamento pacchetti:

library(dplyr)
library(ggplot2)
library(forecast)

Parte 1: Dati

Carico le serie storiche dal 1900 al 2021 dei disastri nel mondo scaricate da qui : https://www.kaggle.com/datasets/brsdincer/all-natural-disasters-19002021-eosdis

data <- read.csv("1900_2021_DISASTERS.xlsx - emdat data.csv", stringsAsFactors = T)

Ci sono valori mancanti nel dataset ma le variabili Year,Disaster.Type, Disaster.Subtype e Continent non ne hanno quindi non considero il problema dei valori mancanti.

colSums(is.na(data))

##                       Year                        Seq 
##                          0                          0 
##                      Glide             Disaster.Group 
##                          0                          0 
##          Disaster.Subgroup              Disaster.Type 
##                          0                          0 
##           Disaster.Subtype        Disaster.Subsubtype 
##                          0                          0 
##                 Event.Name                    Country 
##                          0                          0 
##                        ISO                     Region 
##                          0                          0 
##                  Continent                   Location 
##                          0                          0 
##                     Origin             Associated.Dis 
##                          0                          0 
##            Associated.Dis2              OFDA.Response 
##                          0                          0 
##                     Appeal                Declaration 
##                          0                          0 
##           Aid.Contribution              Dis.Mag.Value 
##                      15449                      11180 
##              Dis.Mag.Scale                   Latitude 
##                          0                          0 
##                  Longitude                 Local.Time 
##                          0                          0 
##                River.Basin                 Start.Year 
##                          0                          0 
##                Start.Month                  Start.Day 
##                        387                       3628 
##                   End.Year                  End.Month 
##                          0                        708 
##                    End.Day               Total.Deaths 
##                       3556                       4713 
##                 No.Injured                No.Affected 
##                      12231                       6906 
##                No.Homeless             Total.Affected 
##                      13696                       4509 
## Insured.Damages...000.US..   Total.Damages...000.US.. 
##                      15030                      10881 
##                        CPI                  Adm.Level 
##                        315                          0 
##                Admin1.Code                Admin2.Code 
##                          0                          0 
##              Geo.Locations 
##                          0

Valori delle variabili categoriali nel dataset:

levels(data$Disaster.Subgroup)

## [1] "Biological"        "Climatological"    "Extra-terrestrial"
## [4] "Geophysical"       "Hydrological"      "Meteorological"

print("")

## [1] ""

levels(data$Disaster.Type)

##  [1] "Animal accident"       "Drought"               "Earthquake"           
##  [4] "Epidemic"              "Extreme temperature "  "Flood"                
##  [7] "Fog"                   "Glacial lake outburst" "Impact"               
## [10] "Insect infestation"    "Landslide"             "Mass movement (dry)"  
## [13] "Storm"                 "Volcanic activity"     "Wildfire"

print("")

## [1] ""

levels(data$Disaster.Subtype)

##  [1] ""                                 "Ash fall"                        
##  [3] "Avalanche"                        "Bacterial disease"               
##  [5] "Coastal flood"                    "Cold wave"                       
##  [7] "Convective storm"                 "Drought"                         
##  [9] "Extra-tropical storm"             "Flash flood"                     
## [11] "Forest fire"                      "Grasshopper"                     
## [13] "Ground movement"                  "Heat wave"                       
## [15] "Land fire (Brush, Bush, Pasture)" "Landslide"                       
## [17] "Lava flow"                        "Locust"                          
## [19] "Mudslide"                         "Parasitic disease"               
## [21] "Pyroclastic flow "                "Riverine flood"                  
## [23] "Rockfall"                         "Severe winter conditions"        
## [25] "Subsidence"                       "Tropical cyclone"                
## [27] "Tsunami"                          "Viral disease"

print("")

## [1] ""

levels(data$Disaster.Subsubtype)

##  [1] ""                        "Derecho"                
##  [3] "Hail"                    "Lightning/Thunderstorms"
##  [5] "Mudslide"                "Rain"                   
##  [7] "Sand/Dust storm"         "Severe storm"           
##  [9] "Snow/Ice"                "Storm/Surge"            
## [11] "Sudden subsidence"       "Tornado"                
## [13] "Winter storm/Blizzard"

print("")

## [1] ""

levels(data$Continent)

## [1] "Africa"   "Americas" "Asia"     "Europe"   "Oceania"

print("")

## [1] ""

levels(data$Region)

##  [1] "Australia and New Zealand" "Caribbean"                
##  [3] "Central America"           "Central Asia"             
##  [5] "Eastern Africa"            "Eastern Asia"             
##  [7] "Eastern Europe"            "Melanesia"                
##  [9] "Micronesia"                "Middle Africa"            
## [11] "Northern Africa"           "Northern America"         
## [13] "Northern Europe"           "Polynesia"                
## [15] "Russian Federation"        "South America"            
## [17] "South-Eastern Asia"        "Southern Africa"          
## [19] "Southern Asia"             "Southern Europe"          
## [21] "Western Africa"            "Western Asia"             
## [23] "Western Europe"

Parte 2: Domanda di ricerca

Si vuole predire la frequenza degli eventi siccitosi dal 2022 in poi e valutare se esiste un trend crescente della time series .

Parte 3: EDA

Come si vede dal seguente grafico la siccità (Drought) è al sesto posto come frequenza di disastri dal 1901 al 2021:

df<-data %>%
  group_by(Disaster.Type) %>%
  summarise(total=n()) 

df %>%
mutate(Disaster.Type=reorder(Disaster.Type,total))  %>%
  ggplot(aes(Disaster.Type,total, fill=Disaster.Type)) +
  geom_bar(stat = "identity")+
  coord_flip()+
  geom_text(aes(label=total), hjust=0, size=2)+
  guides(fill="none")

df<-data %>%
  filter(Disaster.Type=="Drought") %>%
  group_by(Country) %>%
  summarise(total=n()) 

df %>%
filter(total>5) %>%  
mutate(Country=reorder(Country,total))  %>%
  ggplot(aes(Country,total, fill=Country)) +
  geom_bar(stat = "identity")+
  coord_flip()+
  geom_text(aes(label=total), hjust=0, size=2)+
  guides(fill="none")+
  ggtitle("Numero totale di eventi siccitosi", subtitle = "per paese dal 1901 al 2021")

Si nota dal seguente grafico un trend crescente degli eventi siccitosi dal 1975 in poi che sembra appiattirsi di recente :

data %>%
  filter(Disaster.Type==c("Drought")) %>%
  group_by(Disaster.Type, Year) %>%
  summarise(total=n(), .groups = "keep") %>%
  ggplot(aes(Year,total, colour=Disaster.Type)) +
  geom_line()

Parte 4: Previsioni

Confronto il modello di forecasting ets con arima calcolando gli errori previsionali dividendo la serie storica in un train e un test. Si nota che il modello arima dà errori minori sul test e quindi risulta migliore:

dis<-data %>%
    filter(Disaster.Type=="Drought") %>%
    group_by(Year) %>%
    summarise(total=n())

#training and test sets
train <- ts(dis[1:64,2])
test <- ts(dis[65:76,2])

fc_ets <- forecast(train,12)
acc_ets <- accuracy(fc_ets, test[1:12])
acc_ets

##                      ME     RMSE      MAE       MPE      MAPE      MASE
## Training set  0.7603952 5.679190 4.447353 -87.55634 117.40467 0.9465650
## Test set     -0.2011196 5.536647 4.604854 -13.89526  33.81144 0.9800872
##                   ACF1
## Training set 0.3002882
## Test set            NA

ar <- auto.arima(train)
fc_ets <- forecast(ar,12)
acc_ets <- accuracy(fc_ets, test[1:12])
acc_ets

##                      ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.62944795 5.728252 4.432243 -44.45134 78.87091 0.9433491
## Test set     0.09220802 5.170176 4.166667 -10.99215 30.12563 0.8868243
##                    ACF1
## Training set 0.05532845
## Test set             NA

Creo una funzione che predica il numero totale di eventi siccitosi nel mondo dal 2022 al 2032 con un intervallo di confidenza all’80% e al 95% utilizzando il modello arima:

forecast_disaster_arima <- function( Disaster) {
  dis<-data %>%
    filter(Disaster.Type==Disaster) %>%
    group_by(Year) %>%
    summarise(total=n()) 
  
  serie <-ts(dis$total)
  ar <- auto.arima(serie)
  fore<-forecast(ar)
  
  plot(fore)
  
  year<-2022:2031
  df_pred <-data.frame(q=99:108)
  p<-predict (fore,newdata=df_pred, se=TRUE)
  cat("Forecasting",Disaster,"\n")
  cbind(as.data.frame(year),p)
}

forecast_disaster_arima("Drought")

## Forecasting Drought