Library

Load the data

aedes <- read.csv("0253828-230224095556074.csv", sep="\t")

Remove the data with missing coordinates

aedes <- aedes[complete.cases(aedes$decimalLatitude, aedes$decimalLongitude), ]

Sample

sample_ae <- aedes %>% sample_n(50000, replace = FALSE)

Convert into spatial points and plot

sp_points_proj <- st_as_sf(sample_ae, coords = c("decimalLongitude", "decimalLatitude"), crs = 4326)
sp_points_proj <- st_transform(sp_points_proj, crs = 4326)
plot(sp_points_proj[,1], main = "")

Generate the speudo absence randomly

Remove the attributs to keep only the coordinates

presence_points <- SpatialPointsDataFrame(
  coords = aedes[, c("decimalLongitude", "decimalLatitude")],
  data = aedes,
  proj4string = CRS("+proj=longlat +datum=WGS84")
)

presence <- aedes[,c("decimalLongitude", "decimalLatitude")]
sample_ae <- presence %>% sample_n(50000, replace = FALSE)
colnames(sample_ae)<-c("longitude","latitude")

Generate 50 000 pseudo absences

num_points <- 50000
min_longitude <- -180
max_longitude <- 180
min_latitude <- -90
max_latitude <- 90
random_points <- data.frame(
  longitude = runif(num_points, min_longitude, max_longitude),
  latitude = runif(num_points, min_latitude, max_latitude)
)
random_points$presence <- 0
sample_ae$presence <- 1

Load the covariates data

list_covar <- list.files("./mood_tutorial/NEO_monthly/", pattern = "TIFF", recursive = T, full.names = T)
length(list_covar)
## [1] 7392
rast(list_covar[20])
## class       : SpatRaster 
## dimensions  : 1800, 3600, 1  (nrow, ncol, nlyr)
## resolution  : 0.1, 0.1  (x, y)
## extent      : -180, 180, -90, 90  (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326) 
## source      : MCD43C3_M_BSA_2001-09.FLOAT.TIFF 
## name        : MCD43C3_M_BSA_2001-09.FLOAT

Choose the covariates

Based on the biology Average Land Surface Temperature (Day), Rainfall, Topography

Selection of one raster for each covariates

# Temperature
temp = rast(grep("MOD_LSTD_CLIM_M", list_covar, value = T)[1])
setMinMax(temp)
NAflag(temp) = 99999
plot(temp)
title(main = "Temperature")

# Rainfall
rainfall = rast(grep("TRMM_3B43M", list_covar, value = T)[1])
setMinMax(rainfall)
NAflag(rainfall) = 99999
plot(rainfall)
title(main = "Rainfall")

# Topography
topo = rast(grep("SRTM_RAMP2_TOPO", list_covar, value = T)[1])
setMinMax(topo)
NAflag(topo) = 99999
plot(topo)
title(main = "Topology")

Exctract value

mosquito <- rbind(sample_ae, random_points)

a <- mosquito[,c("longitude", "latitude")]

a_temp <- raster::extract(temp, a)
colnames(a_temp) <- c("ID", "temperature")
mosquito <- cbind(mosquito, a_temp)

a_rain <- raster::extract(rainfall, a)
colnames(a_rain) <- c("ID", "rainfall")
mosquito <- cbind(mosquito, a_rain)

a_topo <- raster::extract(topo, a)
colnames(a_topo) <- c("ID", "topography")
mosquito <- cbind(mosquito, a_topo)

mosquito <- mosquito[,c("longitude", "latitude", "presence", "temperature", "rainfall", "topography")]
Remove observation with missing data
NA_mosquito <- na.omit(mosquito)

Logistic regression model

model <- glm(presence ~ temperature + rainfall + topography, data = NA_mosquito, family = binomial)
summary(model)
## 
## Call:
## glm(formula = presence ~ temperature + rainfall + topography, 
##     family = binomial, data = NA_mosquito)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4227   0.2416   0.4138   0.4875   2.9522  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.9385136  0.0373852   78.60   <2e-16 ***
## temperature -0.0449537  0.0013297  -33.81   <2e-16 ***
## rainfall     0.0042904  0.0001768   24.27   <2e-16 ***
## topography  -0.0025280  0.0000367  -68.89   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 40501  on 47705  degrees of freedom
## Residual deviance: 31257  on 47702  degrees of freedom
## AIC: 31265
## 
## Number of Fisher Scoring iterations: 5
modEvA::RsqGLM(model)

## $CoxSnell
## [1] 0.1761672
## 
## $Nagelkerke
## [1] 0.3079043
## 
## $McFadden
## [1] 0.2282592
## 
## $Tjur
## [1] 0.2559452
## 
## $sqPearson
## [1] 0.2613938

Log loss

predicted_probabilities <- predict(model, type = "response")
log_loss <- function(y_true, y_pred) {
  -mean(y_true * log(y_pred) + (1 - y_true) * log(1 - y_pred))
}
true_labels <- mosquito$presence  # Vraies valeurs de classe
loss <- log_loss(true_labels, predicted_probabilities)
## Warning in y_true * log(y_pred): la taille d'un objet plus long n'est pas
## multiple de la taille d'un objet plus court
## Warning in (1 - y_true) * log(1 - y_pred): la taille d'un objet plus long n'est
## pas multiple de la taille d'un objet plus court
print(loss)
## [1] 1.283899