# Taller 1 - Ciencia de datos aplicada al análisis epidemiológico

sigatoka <- read_excel("C:/Users/R O G  S T R I X/Desktop/Universidad/AE/sigatoka_rich_data_2023_2024.xlsx")
str(sigatoka)
## tibble [9,360 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Year       : num [1:9360] 2023 2023 2023 2023 2023 ...
##  $ Week       : num [1:9360] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Location_ID: num [1:9360] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Latitude   : num [1:9360] 7.81 7.81 7.81 7.81 7.81 ...
##  $ Longitude  : num [1:9360] -76.5 -76.5 -76.5 -76.5 -76.5 ...
##  $ Incidence  : num [1:9360] 68.8 74.1 68.6 98.6 27.8 ...
##  $ Severity   : num [1:9360] 4.31 2.86 5.94 5.26 9.85 3.01 1.43 7.57 2.88 4.26 ...
summary(sigatoka)
##       Year           Week        Location_ID    Latitude       Longitude     
##  Min.   :2023   Min.   : 1.00   Min.   :1    Min.   :7.807   Min.   :-76.60  
##  1st Qu.:2023   1st Qu.:13.75   1st Qu.:1    1st Qu.:7.814   1st Qu.:-76.60  
##  Median :2024   Median :26.50   Median :2    Median :7.836   Median :-76.56  
##  Mean   :2024   Mean   :26.50   Mean   :2    Mean   :7.841   Mean   :-76.57  
##  3rd Qu.:2024   3rd Qu.:39.25   3rd Qu.:3    3rd Qu.:7.873   3rd Qu.:-76.55  
##  Max.   :2024   Max.   :52.00   Max.   :3    Max.   :7.880   Max.   :-76.54  
##    Incidence        Severity     
##  Min.   : 0.01   Min.   : 0.000  
##  1st Qu.:25.54   1st Qu.: 2.480  
##  Median :50.67   Median : 5.000  
##  Mean   :50.53   Mean   : 4.998  
##  3rd Qu.:75.72   3rd Qu.: 7.530  
##  Max.   :99.98   Max.   :10.000
head(sigatoka)
## # A tibble: 6 × 7
##    Year  Week Location_ID Latitude Longitude Incidence Severity
##   <dbl> <dbl>       <dbl>    <dbl>     <dbl>     <dbl>    <dbl>
## 1  2023     1           1     7.81     -76.5      68.8     4.31
## 2  2023     1           1     7.81     -76.5      74.1     2.86
## 3  2023     1           1     7.81     -76.5      68.6     5.94
## 4  2023     1           1     7.81     -76.5      98.6     5.26
## 5  2023     1           1     7.81     -76.5      27.8     9.85
## 6  2023     1           1     7.81     -76.5      13.4     3.01
sigatoka <- sigatoka %>%
  mutate(across(where(is.character), ~trimws(.))) %>%  # eliminar espacios
  distinct()                                            # eliminar duplicados
table(is.na(sigatoka))
## 
## FALSE 
## 65520
#sigatoka <- sigatoka %>% mutate(across(everything(), ~ifelse(is.na(.), mean(., na.rm=TRUE), .)))

sigatoka <- sigatoka %>%
  mutate(
    Fecha = as.Date(paste(Year, Week, 1, sep = "-"), format = "%Y-%U-%u")
  )
str(sigatoka)
## tibble [9,360 × 8] (S3: tbl_df/tbl/data.frame)
##  $ Year       : num [1:9360] 2023 2023 2023 2023 2023 ...
##  $ Week       : num [1:9360] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Location_ID: num [1:9360] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Latitude   : num [1:9360] 7.81 7.81 7.81 7.81 7.81 ...
##  $ Longitude  : num [1:9360] -76.5 -76.5 -76.5 -76.5 -76.5 ...
##  $ Incidence  : num [1:9360] 68.8 74.1 68.6 98.6 27.8 ...
##  $ Severity   : num [1:9360] 4.31 2.86 5.94 5.26 9.85 3.01 1.43 7.57 2.88 4.26 ...
##  $ Fecha      : Date[1:9360], format: "2023-01-02" "2023-01-02" ...
fetch_nasa_power_daily <- function(lat, lon, start_date, end_date) {
  params <- "T2M,PRECTOTCORR,RH2M,ALLSKY_SFC_SW_DWN"
  start <- format(as.Date(start_date), "%Y%m%d")
  end <- format(as.Date(end_date), "%Y%m%d")
  
  url <- paste0(
    "https://power.larc.nasa.gov/api/temporal/daily/point?parameters=",
    params, "&community=AG&latitude=", lat, "&longitude=", lon,
    "&start=", start, "&end=", end, "&format=JSON"
  )
  
  response <- try(GET(url, timeout(30)), silent = TRUE)
  
  if (inherits(response, "try-error") || response$status_code != 200) {
    return(NULL)
  }
  
  js <- content(response, as = "parsed", simplifyVector = TRUE)
  data <- js$properties$parameter
  
  if (is.null(data)) return(NULL)
  
  # Reestructurar en formato largo por fecha
  df <- bind_rows(lapply(names(data), function(var) {
    tibble(date = names(data[[var]]), variable = var, value = unlist(data[[var]]))
  })) %>%
    pivot_wider(names_from = variable, values_from = value) %>%
    mutate(date = as.Date(date, format = "%Y%m%d"))
  
  return(df)
}

# Enriquecer base con clima
enrich_with_climate <- function(df_in) {
  df_in <- df_in %>% mutate(date_key = Fecha)
  clima_rows <- list()
  
  coords <- unique(df_in[, c("Latitude", "Longitude")])
  
  for (i in seq_len(nrow(coords))) {
    lat <- coords$Latitude[i]
    lon <- coords$Longitude[i]
    
    df_coord <- df_in %>% filter(Latitude == !!lat, Longitude == !!lon)
    start <- min(df_coord$Fecha)
    end <- max(df_coord$Fecha)
    
    clima_data <- fetch_nasa_power_daily(lat, lon, start, end)
    
    if (!is.null(clima_data)) {
      clima_data <- clima_data %>%
        mutate(Latitude = lat, Longitude = lon)
      clima_rows[[i]] <- clima_data
    }
  }
  
  df_clima <- bind_rows(clima_rows)
  df_out <- df_in %>%
    left_join(df_clima, by = c("Fecha" = "date", "Latitude", "Longitude"))
  
  return(df_out)
}

# Aplicar función
sigatoka_enriched <- enrich_with_climate(sigatoka)

# Imputación simple de faltantes
v= "ALLSKY_SFC_SW_DWN"
    sigatoka_enriched[[v]][is.na(sigatoka_enriched[[v]])] <-
      median(sigatoka_enriched[[v]], na.rm = TRUE)

str(sigatoka_enriched)
## tibble [9,360 × 13] (S3: tbl_df/tbl/data.frame)
##  $ Year             : num [1:9360] 2023 2023 2023 2023 2023 ...
##  $ Week             : num [1:9360] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Location_ID      : num [1:9360] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Latitude         : num [1:9360] 7.81 7.81 7.81 7.81 7.81 ...
##  $ Longitude        : num [1:9360] -76.5 -76.5 -76.5 -76.5 -76.5 ...
##  $ Incidence        : num [1:9360] 68.8 74.1 68.6 98.6 27.8 ...
##  $ Severity         : num [1:9360] 4.31 2.86 5.94 5.26 9.85 3.01 1.43 7.57 2.88 4.26 ...
##  $ Fecha            : Date[1:9360], format: "2023-01-02" "2023-01-02" ...
##  $ date_key         : Date[1:9360], format: "2023-01-02" "2023-01-02" ...
##  $ T2M              : Named num [1:9360] 23.3 23.3 23.3 23.3 23.3 ...
##   ..- attr(*, "names")= chr [1:9360] "" "" "" "" ...
##  $ PRECTOTCORR      : Named num [1:9360] 5.58 5.58 5.58 5.58 5.58 5.58 5.58 5.58 5.58 5.58 ...
##   ..- attr(*, "names")= chr [1:9360] "" "" "" "" ...
##  $ RH2M             : Named num [1:9360] 90 90 90 90 90 ...
##   ..- attr(*, "names")= chr [1:9360] "" "" "" "" ...
##  $ ALLSKY_SFC_SW_DWN: Named num [1:9360] 16.9 16.9 16.9 16.9 16.9 ...
##   ..- attr(*, "names")= chr [1:9360] "" "" "" "" ...
  sig_sf <- sigatoka_enriched %>% filter(!is.na(Latitude) & !is.na(Longitude)) %>% st_as_sf(coords = c("Longitude","Latitude"), crs = 4326, remove = FALSE)


library(viridis)
## Cargando paquete requerido: viridisLite
p_map <- ggplot() +
  geom_sf(data = sig_sf, aes(color = Severity, size = Incidence), alpha = 0.8) +
  scale_color_viridis(option = "magma", na.value = "grey50") +
  theme_minimal() + labs(title = "Severidad / Incidencia por punto")
p_map

library(leaflet)
pal <- colorNumeric("viridis", domain = sig_sf$Severity, na.color = "grey")
leaflet(sig_sf) %>% addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(~Longitude, ~Latitude, radius = ~ifelse(is.na(Incidence), 3, pmin(Incidence/2 + 2, 10)),
                   color = ~pal(Severity), stroke = FALSE, fillOpacity = 0.8,
                   popup = ~paste0("Severidad: ", Severity, ";  ", " Incidencia: ", Incidence)) %>%
  addLegend("bottomright", pal = pal, values = ~Severity, title = "Severidad")
  bb <- st_bbox(sig_sf)
  grd <- expand.grid(x = seq(bb["xmin"], bb["xmax"], length.out = 150),
                     y = seq(bb["ymin"], bb["ymax"], length.out = 150))
  coordinates(grd) <- ~x+y; proj4string(grd) <- CRS("+init=epsg:4326")
  
  
  pts <- sig_sf %>% filter(!is.na(T2M)) %>% as("Spatial")
  idw_res <- gstat::idw(formula = T2M ~ 1, locations = pts, newdata = grd)
## [inverse distance weighted interpolation]
  r <- raster::rasterFromXYZ(as.data.frame(idw_res)[,c("x","y","var1.pred")])
  plot(r, main = "IDW - Predicción T2M")
  plot(st_geometry(sig_sf), add = TRUE, pch = 20, cex = 0.6)

ggplot(sigatoka, aes(x = Fecha, y = Incidence, color = factor(Year))) +
  geom_line() + geom_point() + #facet_wrap(~Year, scales = "free_y") +
  labs(title = "Incidencia por semana por sitio", x = "Fecha", y = "Incidencia")

    ggplot(sigatoka, aes(x = Fecha, y = Severity, color = factor(Year))) +
    geom_line(linewidth = 1) +
    geom_point(alpha = 0.7, size = 2) +
    # scale_color_brewer(palette = "Dark2") +
    theme_minimal(base_size = 13)+
      labs(title = "Evolución temporal de la severidad de Sigatoka", x = "Semana (por fecha)", 
           y = "Severidad (%)", color = "Año")

#Correlaciones
    
num_vars <- sigatoka_enriched %>% dplyr::select(where(is.numeric))
corr_matrix_s <- cor(num_vars, use = "pairwise.complete.obs", method = "spearman")
corr_matrix_p <- cor(num_vars, use = "pairwise.complete.obs", method = "pearson")

ggcorrplot(corr_matrix_s, title = "Matriz de correlaciones (Spearman)")

ggcorrplot(corr_matrix_p, title = "Matriz de correlaciones (pearson)")

# par(mfrow = c(1, 1))

#Scatter plot

  ggplot(sigatoka_enriched, aes(x = T2M, y = Incidence, , color = factor(Year))) + geom_point(alpha=0.6) +
    geom_smooth(method = "loess") + labs(title = "Incidencia vs T2M")
## `geom_smooth()` using formula = 'y ~ x'

  #+facet_wrap(~Year)

#Clusters
  vars_cluster <- c("Incidence","Severity","T2M","PRECTOTCORR", "ALLSKY_SFC_SW_DWN")
  vars_cluster <- intersect(vars_cluster, names(sigatoka_enriched))
  clust_df <- sigatoka_enriched %>% dplyr::select(any_of(vars_cluster)) %>% drop_na()
  
    scaled <- scale(clust_df)
    km <- kmeans(scaled, centers = 3, nstart = 50)
    sigatoka$cluster <- NA
    sigatoka$cluster[as.numeric(rownames(clust_df))] <- km$cluster
    ggplot(sigatoka, aes(x = Fecha, y = Incidence, color = factor(cluster))) + geom_line()

  sigatoka_cluster <- sigatoka %>%
  group_by(cluster, Mes = format(Fecha, "%Y-%m")) %>%
  summarise(Incidencia_media = mean(Incidence, na.rm = TRUE))
## `summarise()` has grouped output by 'cluster'. You can override using the
## `.groups` argument.
ggplot(sigatoka_cluster, aes(x = Mes, y = Incidencia_media, color = factor(cluster), group = cluster)) +
  geom_line(size = 1.2) +
  geom_point(size = 2) +
  theme_minimal(base_size = 13) +
  labs(title = "Evolución mensual de la incidencia por clúster climático",
       x = "Mes", y = "Incidencia promedio (%)", color = "Cluster") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

  # GLM Poisson y diagnostico de sobre-dispersion
    m_pois <- glm(Incidence ~ T2M + PRECTOTCORR + ALLSKY_SFC_SW_DWN, family = poisson(link = "log"), data = sigatoka_enriched)
    dispersion <- sum(residuals(m_pois, type="pearson")^2)/m_pois$df.residual
    print(paste("Dispersion:", round(dispersion,2)))
## [1] "Dispersion: 16.53"
      m_nb <- MASS::glm.nb(Incidence ~ T2M + PRECTOTCORR + ALLSKY_SFC_SW_DWN, data = sigatoka_enriched)
      print(summary(m_nb))
## 
## Call:
## MASS::glm.nb(formula = Incidence ~ T2M + PRECTOTCORR + ALLSKY_SFC_SW_DWN, 
##     data = sigatoka_enriched, init.theta = 2.05589821, link = log)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        3.7169402  0.2021314  18.389   <2e-16 ***
## T2M                0.0067393  0.0074494   0.905    0.366    
## PRECTOTCORR       -0.0001709  0.0007709  -0.222    0.825    
## ALLSKY_SFC_SW_DWN  0.0018123  0.0027316   0.663    0.507    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(2.0559) family taken to be 1)
## 
##     Null deviance: 10334  on 9359  degrees of freedom
## Residual deviance: 10332  on 9356  degrees of freedom
## AIC: 90274
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  2.0559 
##           Std. Err.:  0.0305 
## 
##  2 x log-likelihood:  -90263.7420
      print(summary(m_pois))
## 
## Call:
## glm(formula = Incidence ~ T2M + PRECTOTCORR + ALLSKY_SFC_SW_DWN, 
##     family = poisson(link = "log"), data = sigatoka_enriched)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        3.7161356  0.0400098  92.881  < 2e-16 ***
## T2M                0.0067659  0.0014737   4.591 4.41e-06 ***
## PRECTOTCORR       -0.0001698  0.0001529  -1.111 0.266733    
## ALLSKY_SFC_SW_DWN  0.0018183  0.0005405   3.364 0.000768 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 179611  on 9359  degrees of freedom
## Residual deviance: 179575  on 9356  degrees of freedom
## AIC: Inf
## 
## Number of Fisher Scoring iterations: 5
  # Random Forest
  
    df_ml <- sigatoka_enriched %>% dplyr::select(Incidence, T2M, PRECTOTCORR, RH2M, ALLSKY_SFC_SW_DWN) %>% drop_na()
    
      set.seed(123)
      idx <- caret::createDataPartition(df_ml$Incidence, p=0.8, list=FALSE)
      train <- df_ml[idx,]; test <- df_ml[-idx,]
      rf <- randomForest::randomForest(Incidence ~ ., data = train, ntree = 500)
      preds <- predict(rf, test)
      print(caret::postResample(preds, test$Incidence))
##         RMSE     Rsquared          MAE 
## 2.950655e+01 3.020035e-04 2.552996e+01
      varImpPlot(rf)

      imp <- data.frame(Variable = rownames(rf$importance),
                  Importancia = rf$importance[,1])

ggplot(imp, aes(x = reorder(Variable, Importancia), y = Importancia, fill = Variable)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  theme_minimal(base_size = 14) +
  labs(title = "Importancia de variables climáticas en la incidencia de Sigatoka",
       x = "Variable", y = "Importancia (IncNodePurity)")