# Taller 1 - Ciencia de datos aplicada al análisis epidemiológico
sigatoka <- read_excel("C:/Users/R O G S T R I X/Desktop/Universidad/AE/sigatoka_rich_data_2023_2024.xlsx")
str(sigatoka)
## tibble [9,360 × 7] (S3: tbl_df/tbl/data.frame)
## $ Year : num [1:9360] 2023 2023 2023 2023 2023 ...
## $ Week : num [1:9360] 1 1 1 1 1 1 1 1 1 1 ...
## $ Location_ID: num [1:9360] 1 1 1 1 1 1 1 1 1 1 ...
## $ Latitude : num [1:9360] 7.81 7.81 7.81 7.81 7.81 ...
## $ Longitude : num [1:9360] -76.5 -76.5 -76.5 -76.5 -76.5 ...
## $ Incidence : num [1:9360] 68.8 74.1 68.6 98.6 27.8 ...
## $ Severity : num [1:9360] 4.31 2.86 5.94 5.26 9.85 3.01 1.43 7.57 2.88 4.26 ...
summary(sigatoka)
## Year Week Location_ID Latitude Longitude
## Min. :2023 Min. : 1.00 Min. :1 Min. :7.807 Min. :-76.60
## 1st Qu.:2023 1st Qu.:13.75 1st Qu.:1 1st Qu.:7.814 1st Qu.:-76.60
## Median :2024 Median :26.50 Median :2 Median :7.836 Median :-76.56
## Mean :2024 Mean :26.50 Mean :2 Mean :7.841 Mean :-76.57
## 3rd Qu.:2024 3rd Qu.:39.25 3rd Qu.:3 3rd Qu.:7.873 3rd Qu.:-76.55
## Max. :2024 Max. :52.00 Max. :3 Max. :7.880 Max. :-76.54
## Incidence Severity
## Min. : 0.01 Min. : 0.000
## 1st Qu.:25.54 1st Qu.: 2.480
## Median :50.67 Median : 5.000
## Mean :50.53 Mean : 4.998
## 3rd Qu.:75.72 3rd Qu.: 7.530
## Max. :99.98 Max. :10.000
head(sigatoka)
## # A tibble: 6 × 7
## Year Week Location_ID Latitude Longitude Incidence Severity
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2023 1 1 7.81 -76.5 68.8 4.31
## 2 2023 1 1 7.81 -76.5 74.1 2.86
## 3 2023 1 1 7.81 -76.5 68.6 5.94
## 4 2023 1 1 7.81 -76.5 98.6 5.26
## 5 2023 1 1 7.81 -76.5 27.8 9.85
## 6 2023 1 1 7.81 -76.5 13.4 3.01
sigatoka <- sigatoka %>%
mutate(across(where(is.character), ~trimws(.))) %>% # eliminar espacios
distinct() # eliminar duplicados
table(is.na(sigatoka))
##
## FALSE
## 65520
#sigatoka <- sigatoka %>% mutate(across(everything(), ~ifelse(is.na(.), mean(., na.rm=TRUE), .)))
sigatoka <- sigatoka %>%
mutate(
Fecha = as.Date(paste(Year, Week, 1, sep = "-"), format = "%Y-%U-%u")
)
str(sigatoka)
## tibble [9,360 × 8] (S3: tbl_df/tbl/data.frame)
## $ Year : num [1:9360] 2023 2023 2023 2023 2023 ...
## $ Week : num [1:9360] 1 1 1 1 1 1 1 1 1 1 ...
## $ Location_ID: num [1:9360] 1 1 1 1 1 1 1 1 1 1 ...
## $ Latitude : num [1:9360] 7.81 7.81 7.81 7.81 7.81 ...
## $ Longitude : num [1:9360] -76.5 -76.5 -76.5 -76.5 -76.5 ...
## $ Incidence : num [1:9360] 68.8 74.1 68.6 98.6 27.8 ...
## $ Severity : num [1:9360] 4.31 2.86 5.94 5.26 9.85 3.01 1.43 7.57 2.88 4.26 ...
## $ Fecha : Date[1:9360], format: "2023-01-02" "2023-01-02" ...
fetch_nasa_power_daily <- function(lat, lon, start_date, end_date) {
params <- "T2M,PRECTOTCORR,RH2M,ALLSKY_SFC_SW_DWN"
start <- format(as.Date(start_date), "%Y%m%d")
end <- format(as.Date(end_date), "%Y%m%d")
url <- paste0(
"https://power.larc.nasa.gov/api/temporal/daily/point?parameters=",
params, "&community=AG&latitude=", lat, "&longitude=", lon,
"&start=", start, "&end=", end, "&format=JSON"
)
response <- try(GET(url, timeout(30)), silent = TRUE)
if (inherits(response, "try-error") || response$status_code != 200) {
return(NULL)
}
js <- content(response, as = "parsed", simplifyVector = TRUE)
data <- js$properties$parameter
if (is.null(data)) return(NULL)
# Reestructurar en formato largo por fecha
df <- bind_rows(lapply(names(data), function(var) {
tibble(date = names(data[[var]]), variable = var, value = unlist(data[[var]]))
})) %>%
pivot_wider(names_from = variable, values_from = value) %>%
mutate(date = as.Date(date, format = "%Y%m%d"))
return(df)
}
# Enriquecer base con clima
enrich_with_climate <- function(df_in) {
df_in <- df_in %>% mutate(date_key = Fecha)
clima_rows <- list()
coords <- unique(df_in[, c("Latitude", "Longitude")])
for (i in seq_len(nrow(coords))) {
lat <- coords$Latitude[i]
lon <- coords$Longitude[i]
df_coord <- df_in %>% filter(Latitude == !!lat, Longitude == !!lon)
start <- min(df_coord$Fecha)
end <- max(df_coord$Fecha)
clima_data <- fetch_nasa_power_daily(lat, lon, start, end)
if (!is.null(clima_data)) {
clima_data <- clima_data %>%
mutate(Latitude = lat, Longitude = lon)
clima_rows[[i]] <- clima_data
}
}
df_clima <- bind_rows(clima_rows)
df_out <- df_in %>%
left_join(df_clima, by = c("Fecha" = "date", "Latitude", "Longitude"))
return(df_out)
}
# Aplicar función
sigatoka_enriched <- enrich_with_climate(sigatoka)
# Imputación simple de faltantes
v= "ALLSKY_SFC_SW_DWN"
sigatoka_enriched[[v]][is.na(sigatoka_enriched[[v]])] <-
median(sigatoka_enriched[[v]], na.rm = TRUE)
str(sigatoka_enriched)
## tibble [9,360 × 13] (S3: tbl_df/tbl/data.frame)
## $ Year : num [1:9360] 2023 2023 2023 2023 2023 ...
## $ Week : num [1:9360] 1 1 1 1 1 1 1 1 1 1 ...
## $ Location_ID : num [1:9360] 1 1 1 1 1 1 1 1 1 1 ...
## $ Latitude : num [1:9360] 7.81 7.81 7.81 7.81 7.81 ...
## $ Longitude : num [1:9360] -76.5 -76.5 -76.5 -76.5 -76.5 ...
## $ Incidence : num [1:9360] 68.8 74.1 68.6 98.6 27.8 ...
## $ Severity : num [1:9360] 4.31 2.86 5.94 5.26 9.85 3.01 1.43 7.57 2.88 4.26 ...
## $ Fecha : Date[1:9360], format: "2023-01-02" "2023-01-02" ...
## $ date_key : Date[1:9360], format: "2023-01-02" "2023-01-02" ...
## $ T2M : Named num [1:9360] 23.3 23.3 23.3 23.3 23.3 ...
## ..- attr(*, "names")= chr [1:9360] "" "" "" "" ...
## $ PRECTOTCORR : Named num [1:9360] 5.58 5.58 5.58 5.58 5.58 5.58 5.58 5.58 5.58 5.58 ...
## ..- attr(*, "names")= chr [1:9360] "" "" "" "" ...
## $ RH2M : Named num [1:9360] 90 90 90 90 90 ...
## ..- attr(*, "names")= chr [1:9360] "" "" "" "" ...
## $ ALLSKY_SFC_SW_DWN: Named num [1:9360] 16.9 16.9 16.9 16.9 16.9 ...
## ..- attr(*, "names")= chr [1:9360] "" "" "" "" ...
sig_sf <- sigatoka_enriched %>% filter(!is.na(Latitude) & !is.na(Longitude)) %>% st_as_sf(coords = c("Longitude","Latitude"), crs = 4326, remove = FALSE)
library(viridis)
## Cargando paquete requerido: viridisLite
p_map <- ggplot() +
geom_sf(data = sig_sf, aes(color = Severity, size = Incidence), alpha = 0.8) +
scale_color_viridis(option = "magma", na.value = "grey50") +
theme_minimal() + labs(title = "Severidad / Incidencia por punto")
p_map

library(leaflet)
pal <- colorNumeric("viridis", domain = sig_sf$Severity, na.color = "grey")
leaflet(sig_sf) %>% addProviderTiles("CartoDB.Positron") %>%
addCircleMarkers(~Longitude, ~Latitude, radius = ~ifelse(is.na(Incidence), 3, pmin(Incidence/2 + 2, 10)),
color = ~pal(Severity), stroke = FALSE, fillOpacity = 0.8,
popup = ~paste0("Severidad: ", Severity, "; ", " Incidencia: ", Incidence)) %>%
addLegend("bottomright", pal = pal, values = ~Severity, title = "Severidad")
bb <- st_bbox(sig_sf)
grd <- expand.grid(x = seq(bb["xmin"], bb["xmax"], length.out = 150),
y = seq(bb["ymin"], bb["ymax"], length.out = 150))
coordinates(grd) <- ~x+y; proj4string(grd) <- CRS("+init=epsg:4326")
pts <- sig_sf %>% filter(!is.na(T2M)) %>% as("Spatial")
idw_res <- gstat::idw(formula = T2M ~ 1, locations = pts, newdata = grd)
## [inverse distance weighted interpolation]
r <- raster::rasterFromXYZ(as.data.frame(idw_res)[,c("x","y","var1.pred")])
plot(r, main = "IDW - Predicción T2M")
plot(st_geometry(sig_sf), add = TRUE, pch = 20, cex = 0.6)

ggplot(sigatoka, aes(x = Fecha, y = Incidence, color = factor(Year))) +
geom_line() + geom_point() + #facet_wrap(~Year, scales = "free_y") +
labs(title = "Incidencia por semana por sitio", x = "Fecha", y = "Incidencia")

ggplot(sigatoka, aes(x = Fecha, y = Severity, color = factor(Year))) +
geom_line(linewidth = 1) +
geom_point(alpha = 0.7, size = 2) +
# scale_color_brewer(palette = "Dark2") +
theme_minimal(base_size = 13)+
labs(title = "Evolución temporal de la severidad de Sigatoka", x = "Semana (por fecha)",
y = "Severidad (%)", color = "Año")

#Correlaciones
num_vars <- sigatoka_enriched %>% dplyr::select(where(is.numeric))
corr_matrix_s <- cor(num_vars, use = "pairwise.complete.obs", method = "spearman")
corr_matrix_p <- cor(num_vars, use = "pairwise.complete.obs", method = "pearson")
ggcorrplot(corr_matrix_s, title = "Matriz de correlaciones (Spearman)")

ggcorrplot(corr_matrix_p, title = "Matriz de correlaciones (pearson)")

# par(mfrow = c(1, 1))
#Scatter plot
ggplot(sigatoka_enriched, aes(x = T2M, y = Incidence, , color = factor(Year))) + geom_point(alpha=0.6) +
geom_smooth(method = "loess") + labs(title = "Incidencia vs T2M")
## `geom_smooth()` using formula = 'y ~ x'

#+facet_wrap(~Year)
#Clusters
vars_cluster <- c("Incidence","Severity","T2M","PRECTOTCORR", "ALLSKY_SFC_SW_DWN")
vars_cluster <- intersect(vars_cluster, names(sigatoka_enriched))
clust_df <- sigatoka_enriched %>% dplyr::select(any_of(vars_cluster)) %>% drop_na()
scaled <- scale(clust_df)
km <- kmeans(scaled, centers = 3, nstart = 50)
sigatoka$cluster <- NA
sigatoka$cluster[as.numeric(rownames(clust_df))] <- km$cluster
ggplot(sigatoka, aes(x = Fecha, y = Incidence, color = factor(cluster))) + geom_line()

sigatoka_cluster <- sigatoka %>%
group_by(cluster, Mes = format(Fecha, "%Y-%m")) %>%
summarise(Incidencia_media = mean(Incidence, na.rm = TRUE))
## `summarise()` has grouped output by 'cluster'. You can override using the
## `.groups` argument.
ggplot(sigatoka_cluster, aes(x = Mes, y = Incidencia_media, color = factor(cluster), group = cluster)) +
geom_line(size = 1.2) +
geom_point(size = 2) +
theme_minimal(base_size = 13) +
labs(title = "Evolución mensual de la incidencia por clúster climático",
x = "Mes", y = "Incidencia promedio (%)", color = "Cluster") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# GLM Poisson y diagnostico de sobre-dispersion
m_pois <- glm(Incidence ~ T2M + PRECTOTCORR + ALLSKY_SFC_SW_DWN, family = poisson(link = "log"), data = sigatoka_enriched)
dispersion <- sum(residuals(m_pois, type="pearson")^2)/m_pois$df.residual
print(paste("Dispersion:", round(dispersion,2)))
## [1] "Dispersion: 16.53"
m_nb <- MASS::glm.nb(Incidence ~ T2M + PRECTOTCORR + ALLSKY_SFC_SW_DWN, data = sigatoka_enriched)
print(summary(m_nb))
##
## Call:
## MASS::glm.nb(formula = Incidence ~ T2M + PRECTOTCORR + ALLSKY_SFC_SW_DWN,
## data = sigatoka_enriched, init.theta = 2.05589821, link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.7169402 0.2021314 18.389 <2e-16 ***
## T2M 0.0067393 0.0074494 0.905 0.366
## PRECTOTCORR -0.0001709 0.0007709 -0.222 0.825
## ALLSKY_SFC_SW_DWN 0.0018123 0.0027316 0.663 0.507
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(2.0559) family taken to be 1)
##
## Null deviance: 10334 on 9359 degrees of freedom
## Residual deviance: 10332 on 9356 degrees of freedom
## AIC: 90274
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 2.0559
## Std. Err.: 0.0305
##
## 2 x log-likelihood: -90263.7420
print(summary(m_pois))
##
## Call:
## glm(formula = Incidence ~ T2M + PRECTOTCORR + ALLSKY_SFC_SW_DWN,
## family = poisson(link = "log"), data = sigatoka_enriched)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.7161356 0.0400098 92.881 < 2e-16 ***
## T2M 0.0067659 0.0014737 4.591 4.41e-06 ***
## PRECTOTCORR -0.0001698 0.0001529 -1.111 0.266733
## ALLSKY_SFC_SW_DWN 0.0018183 0.0005405 3.364 0.000768 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 179611 on 9359 degrees of freedom
## Residual deviance: 179575 on 9356 degrees of freedom
## AIC: Inf
##
## Number of Fisher Scoring iterations: 5
# Random Forest
df_ml <- sigatoka_enriched %>% dplyr::select(Incidence, T2M, PRECTOTCORR, RH2M, ALLSKY_SFC_SW_DWN) %>% drop_na()
set.seed(123)
idx <- caret::createDataPartition(df_ml$Incidence, p=0.8, list=FALSE)
train <- df_ml[idx,]; test <- df_ml[-idx,]
rf <- randomForest::randomForest(Incidence ~ ., data = train, ntree = 500)
preds <- predict(rf, test)
print(caret::postResample(preds, test$Incidence))
## RMSE Rsquared MAE
## 2.950655e+01 3.020035e-04 2.552996e+01
varImpPlot(rf)

imp <- data.frame(Variable = rownames(rf$importance),
Importancia = rf$importance[,1])
ggplot(imp, aes(x = reorder(Variable, Importancia), y = Importancia, fill = Variable)) +
geom_col(show.legend = FALSE) +
coord_flip() +
theme_minimal(base_size = 14) +
labs(title = "Importancia de variables climáticas en la incidencia de Sigatoka",
x = "Variable", y = "Importancia (IncNodePurity)")
