library(dplyr)
library(ggplot2)
library(forecast)
Carico le serie storiche dal 1900 al 2021 dei disastri nel mondo scaricate da qui : https://www.kaggle.com/datasets/brsdincer/all-natural-disasters-19002021-eosdis
data <- read.csv("1900_2021_DISASTERS.xlsx - emdat data.csv", stringsAsFactors = T)
Ci sono valori mancanti nel dataset ma le variabili Year,Disaster.Type, Disaster.Subtype e Continent non ne hanno quindi non considero il problema dei valori mancanti.
colSums(is.na(data))
## Year Seq
## 0 0
## Glide Disaster.Group
## 0 0
## Disaster.Subgroup Disaster.Type
## 0 0
## Disaster.Subtype Disaster.Subsubtype
## 0 0
## Event.Name Country
## 0 0
## ISO Region
## 0 0
## Continent Location
## 0 0
## Origin Associated.Dis
## 0 0
## Associated.Dis2 OFDA.Response
## 0 0
## Appeal Declaration
## 0 0
## Aid.Contribution Dis.Mag.Value
## 15449 11180
## Dis.Mag.Scale Latitude
## 0 0
## Longitude Local.Time
## 0 0
## River.Basin Start.Year
## 0 0
## Start.Month Start.Day
## 387 3628
## End.Year End.Month
## 0 708
## End.Day Total.Deaths
## 3556 4713
## No.Injured No.Affected
## 12231 6906
## No.Homeless Total.Affected
## 13696 4509
## Insured.Damages...000.US.. Total.Damages...000.US..
## 15030 10881
## CPI Adm.Level
## 315 0
## Admin1.Code Admin2.Code
## 0 0
## Geo.Locations
## 0
Valori delle variabili categoriali nel dataset:
levels(data$Disaster.Subgroup)
## [1] "Biological" "Climatological" "Extra-terrestrial"
## [4] "Geophysical" "Hydrological" "Meteorological"
print("")
## [1] ""
levels(data$Disaster.Type)
## [1] "Animal accident" "Drought" "Earthquake"
## [4] "Epidemic" "Extreme temperature " "Flood"
## [7] "Fog" "Glacial lake outburst" "Impact"
## [10] "Insect infestation" "Landslide" "Mass movement (dry)"
## [13] "Storm" "Volcanic activity" "Wildfire"
print("")
## [1] ""
levels(data$Disaster.Subtype)
## [1] "" "Ash fall"
## [3] "Avalanche" "Bacterial disease"
## [5] "Coastal flood" "Cold wave"
## [7] "Convective storm" "Drought"
## [9] "Extra-tropical storm" "Flash flood"
## [11] "Forest fire" "Grasshopper"
## [13] "Ground movement" "Heat wave"
## [15] "Land fire (Brush, Bush, Pasture)" "Landslide"
## [17] "Lava flow" "Locust"
## [19] "Mudslide" "Parasitic disease"
## [21] "Pyroclastic flow " "Riverine flood"
## [23] "Rockfall" "Severe winter conditions"
## [25] "Subsidence" "Tropical cyclone"
## [27] "Tsunami" "Viral disease"
print("")
## [1] ""
levels(data$Disaster.Subsubtype)
## [1] "" "Derecho"
## [3] "Hail" "Lightning/Thunderstorms"
## [5] "Mudslide" "Rain"
## [7] "Sand/Dust storm" "Severe storm"
## [9] "Snow/Ice" "Storm/Surge"
## [11] "Sudden subsidence" "Tornado"
## [13] "Winter storm/Blizzard"
print("")
## [1] ""
levels(data$Continent)
## [1] "Africa" "Americas" "Asia" "Europe" "Oceania"
print("")
## [1] ""
levels(data$Region)
## [1] "Australia and New Zealand" "Caribbean"
## [3] "Central America" "Central Asia"
## [5] "Eastern Africa" "Eastern Asia"
## [7] "Eastern Europe" "Melanesia"
## [9] "Micronesia" "Middle Africa"
## [11] "Northern Africa" "Northern America"
## [13] "Northern Europe" "Polynesia"
## [15] "Russian Federation" "South America"
## [17] "South-Eastern Asia" "Southern Africa"
## [19] "Southern Asia" "Southern Europe"
## [21] "Western Africa" "Western Asia"
## [23] "Western Europe"
Si vuole predire la frequenza degli eventi siccitosi dal 2022 in poi e valutare se esiste un trend crescente della time series .
Come si vede dal seguente grafico la siccità (Drought) è al sesto posto come frequenza di disastri dal 1901 al 2021:
df<-data %>%
group_by(Disaster.Type) %>%
summarise(total=n())
df %>%
mutate(Disaster.Type=reorder(Disaster.Type,total)) %>%
ggplot(aes(Disaster.Type,total, fill=Disaster.Type)) +
geom_bar(stat = "identity")+
coord_flip()+
geom_text(aes(label=total), hjust=0, size=2)+
guides(fill="none")
df<-data %>%
filter(Disaster.Type=="Drought") %>%
group_by(Country) %>%
summarise(total=n())
df %>%
filter(total>5) %>%
mutate(Country=reorder(Country,total)) %>%
ggplot(aes(Country,total, fill=Country)) +
geom_bar(stat = "identity")+
coord_flip()+
geom_text(aes(label=total), hjust=0, size=2)+
guides(fill="none")+
ggtitle("Numero totale di eventi siccitosi", subtitle = "per paese dal 1901 al 2021")
Si nota dal seguente grafico un trend crescente degli eventi siccitosi dal 1975 in poi che sembra appiattirsi di recente :
data %>%
filter(Disaster.Type==c("Drought")) %>%
group_by(Disaster.Type, Year) %>%
summarise(total=n(), .groups = "keep") %>%
ggplot(aes(Year,total, colour=Disaster.Type)) +
geom_line()
Confronto il modello di forecasting ets con arima calcolando gli errori previsionali dividendo la serie storica in un train e un test. Si nota che il modello arima dà errori minori sul test e quindi risulta migliore:
dis<-data %>%
filter(Disaster.Type=="Drought") %>%
group_by(Year) %>%
summarise(total=n())
#training and test sets
train <- ts(dis[1:64,2])
test <- ts(dis[65:76,2])
fc_ets <- forecast(train,12)
acc_ets <- accuracy(fc_ets, test[1:12])
acc_ets
## ME RMSE MAE MPE MAPE MASE
## Training set 0.7603952 5.679190 4.447353 -87.55634 117.40467 0.9465650
## Test set -0.2011196 5.536647 4.604854 -13.89526 33.81144 0.9800872
## ACF1
## Training set 0.3002882
## Test set NA
ar <- auto.arima(train)
fc_ets <- forecast(ar,12)
acc_ets <- accuracy(fc_ets, test[1:12])
acc_ets
## ME RMSE MAE MPE MAPE MASE
## Training set 0.62944795 5.728252 4.432243 -44.45134 78.87091 0.9433491
## Test set 0.09220802 5.170176 4.166667 -10.99215 30.12563 0.8868243
## ACF1
## Training set 0.05532845
## Test set NA
Creo una funzione che predica il numero totale di eventi siccitosi nel mondo dal 2022 al 2032 con un intervallo di confidenza all’80% e al 95% utilizzando il modello arima:
forecast_disaster_arima <- function( Disaster) {
dis<-data %>%
filter(Disaster.Type==Disaster) %>%
group_by(Year) %>%
summarise(total=n())
serie <-ts(dis$total)
ar <- auto.arima(serie)
fore<-forecast(ar)
plot(fore)
year<-2022:2031
df_pred <-data.frame(q=99:108)
p<-predict (fore,newdata=df_pred, se=TRUE)
cat("Forecasting",Disaster,"\n")
cbind(as.data.frame(year),p)
}
forecast_disaster_arima("Drought")
## Forecasting Drought
## year Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 77 2022 13.37966 6.283633 20.47568 2.52722579 24.23209
## 78 2023 13.77162 6.003273 21.53997 1.89095807 25.65229
## 79 2024 13.48123 5.683732 21.27873 1.55598407 25.40648
## 80 2025 13.69637 5.502772 21.88997 1.16534427 26.22739
## 81 2026 13.53698 5.262875 21.81109 0.88282620 26.19114
## 82 2027 13.65506 5.102912 22.20722 0.57567562 26.73445
## 83 2028 13.56758 4.899387 22.23578 0.31072066 26.82445
## 84 2029 13.63239 4.743425 22.52136 0.03788933 27.22690
## 85 2030 13.58438 4.559992 22.60877 -0.21723044 27.38599
## 86 2031 13.61995 4.405431 22.83447 -0.47244172 27.71235