This R code aims to produce mosquito risk maps using remote sensing data.

Part 1: Using collected variables

1. Investigating correlation among raw predictors (17 raw variables)

library(raster); library(sf)
# Specify the path that contains the raster data layers
path<-"F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Mosquito_Test1"
# List all files with tif extension from above folder
predictors<-list.files(path=path,pattern = "tif$",full.names = T)
# Stack all layers together 
predictors<-stack(predictors)

# Read the shapefile containing mosquito count sampling locations (you can use data processed above shape_final)

Mosquito_sampling<-shapefile("F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Mosquito_Sampling\\Mix_House_Mosquito.shp") # Upload mosquito points
# Convert it to sf object 
Mos_points<-st_as_sf(Mosquito_sampling)

rownames(Mos_points)<-NULL # Reset index 

1.1. Extract variable values at each mosquito sampling locations

# Extract values from raster layers using sampling locations.
Value_Extract<-raster::extract(predictors,Mos_points,df=T)

# Select only varibles from column 2. The first column is ID, and it is useless
my_data <- Value_Extract[, c(2:ncol(Value_Extract))]
# Look at first few rows
head(my_data)

1.2. Looking at correlation among predictors and its histograms.

# Combine corelation coeffieints and tests

library(correlation)
library(psych)

head(correlation::correlation(plot_data, include_factors = TRUE, method = "pearson"))
Parameter1 |     Parameter2 |     r |         95% CI |     t |  df |      p |  Method | n_Obs
---------------------------------------------------------------------------------------------
DEM        |   EVI_Mean_1km | -0.23 | [-0.32, -0.15] | -5.19 | 467 | < .001 | Pearson |   469
DEM        |  EVI_Mean_500m | -0.23 | [-0.32, -0.15] | -5.20 | 467 | < .001 | Pearson |   469
DEM        |  NDVI_Mean_1km | -0.07 | [-0.16,  0.02] | -1.47 | 467 | > .999 | Pearson |   469
DEM        | NDVI_Mean_200m | -0.07 | [-0.16,  0.02] | -1.60 | 467 | > .999 | Pearson |   469
DEM        |  NDVI_Mean_2km | -0.08 | [-0.17,  0.01] | -1.82 | 467 | > .999 | Pearson |   469
DEM        | NDVI_Mean_500m | -0.07 | [-0.15,  0.03] | -1.41 | 467 | > .999 | Pearson |   469
# Plot correlation matrix 
library("PerformanceAnalytics")
chart.Correlation(my_data, histogram=TRUE, pch=19)

# Making histogram
multi.hist(my_data,dcol="red")

1.3. Correlation test

We see that some variables tend to be strong correlated, particularly EVI, NDVI, temperature data. Hence, correlation test is conducted to see if there is a signigicant correlation or not for those variables.

  • Correlation test for EVI variables
EVI_variable<-my_data[,c(2,3,4)]

EVI_test=correlation::correlation(EVI_variable, include_factors = TRUE, method = "pearson")
# setwd("F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Test")
# write.csv(EVI_test,file="EVI_corelation.csv",row.names = F)

EVI_test
Parameter1 | Parameter2 |    r |       95% CI |     t |  df |      p |  Method | n_Obs
--------------------------------------------------------------------------------------
EVI_max    |   EVI_mean | 0.97 | [0.96, 0.97] | 81.15 | 467 | < .001 | Pearson |   469
EVI_max    |    EVI_min | 0.87 | [0.85, 0.89] | 38.54 | 467 | < .001 | Pearson |   469
EVI_mean   |    EVI_min | 0.96 | [0.95, 0.97] | 73.37 | 467 | < .001 | Pearson |   469
  • Correlation test for NDVI variables
library(dplyr)

NDVI_varaible<-my_data %>% select(NDVI_max:NDVI_min)

NDVI_test=correlation::correlation(NDVI_varaible, include_factors = TRUE, method = "pearson")
NDVI_test
Parameter1 | Parameter2 |    r |       95% CI |     t |  df |      p |  Method | n_Obs
--------------------------------------------------------------------------------------
NDVI_max   |  NDVI_mean | 0.96 | [0.95, 0.96] | 69.89 | 467 | < .001 | Pearson |   469
NDVI_max   |   NDVI_min | 0.79 | [0.75, 0.82] | 27.68 | 467 | < .001 | Pearson |   469
NDVI_mean  |   NDVI_min | 0.93 | [0.92, 0.95] | 56.90 | 467 | < .001 | Pearson |   469
# write.csv(NDVI_test,file="NDVI_corelation.csv",row.names = F)
  • Correlation test for Temp variables
Temp_variable<-my_data %>% select(Temp_lag_mean:Temp_min)
Temp_test=correlation::correlation(Temp_variable, include_factors = TRUE, method = "pearson")
Temp_test
Parameter1    | Parameter2 |    r |       95% CI |     t |  df |      p |  Method | n_Obs
-----------------------------------------------------------------------------------------
Temp_lag_mean |   Temp_max | 0.87 | [0.85, 0.89] | 38.66 | 467 | < .001 | Pearson |   469
Temp_lag_mean |  Temp_mean | 0.80 | [0.76, 0.83] | 28.34 | 467 | < .001 | Pearson |   469
Temp_lag_mean |   Temp_min | 0.28 | [0.20, 0.37] |  6.41 | 467 | < .001 | Pearson |   469
Temp_max      |  Temp_mean | 0.86 | [0.83, 0.88] | 35.90 | 467 | < .001 | Pearson |   469
Temp_max      |   Temp_min | 0.34 | [0.25, 0.41] |  7.73 | 467 | < .001 | Pearson |   469
Temp_mean     |   Temp_min | 0.72 | [0.67, 0.76] | 22.47 | 467 | < .001 | Pearson |   469

As the land cover has five classes, so we need to convert it to factors, not integers.

# Encode land cover as factor and number of mosquito (Count) as integer 
my_data$Landcover<-as.factor(my_data$Landcover)
# Add Count column to the dataset
my_data$Count<-Mos_points$Sum_ttl

my_data$Count<-as.integer(my_data$Count) # Convert it to be count data (integer)
# Check overdisspersion for Count variable
print( mean(my_data$Count)); print(var(my_data$Count)) # Our count data is overdispersion as variance is greater than mean
[1] 34.78038
[1] 30698.91
# If someone wants to export data to your local computer
# setwd("F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito")
# write.csv(my_data,"Myfile.csv")
# Summary of our dataset all variables 
# summary(my_data)

Splitting data into training and testing sets

library(caTools)
# Splitting data into training and testing
set.seed(123) 
sample = sample.split(my_data$Count, SplitRatio = 0.8)
train = subset(my_data, sample == TRUE)
test  = subset(my_data, sample == FALSE)
  • Negative binomial model
library(caret)
ne_model<-glm(Count~., data=train, family =  negative.binomial(theta = 1) ) # Set the theta=2 provides better model
glm.fit: algorithm did not converge
predictions<- predict(ne_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 
  • Stepwise selection with negative binomial model
variable_select<-stepAIC(ne_model, trace = F)
glm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not converge
variable_select$anova
Stepwise Model Path 
Analysis of Deviance Table

Initial Model:
Count ~ DEM + EVI_max + EVI_mean + EVI_min + Landcover + NDVI_max + 
    NDVI_mean + NDVI_min + NDWI + Pop_VN + Rain_lag_mean + Rain_max + 
    Rain_mean + Rain_min + Temp_lag_mean + Temp_max + Temp_mean + 
    Temp_min

Final Model:
Count ~ EVI_max + EVI_mean + EVI_min + NDVI_max + NDVI_mean + 
    NDVI_min + NDWI + Pop_VN + Rain_lag_mean + Rain_max + Rain_mean + 
    Rain_min + Temp_lag_mean + Temp_max + Temp_mean

         Step Df  Deviance Resid. Df Resid. Dev       AIC
1                                361  30396.867 32250.929
2 - Landcover  3  7693.381       364  22703.487 24551.548
3       - DEM  1 21664.025       365   1039.461  2885.522

2. Investigating correlation among raw predictors after filtering strongly correlated raw variables and model testing

library(raster); library(sf)
# Specify the path that contains the raster data layers
path<-"F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Mosquito_Test1\\Variable_Selection"
# List all files with tif extension from above folder
predictors<-list.files(path=path,pattern = "tif$",full.names = T)
# Stack all layers together 
predictors<-stack(predictors)

2.1. Extract variable values at each mosquito sampling locations

# Extract values from raster layers using sampling locations.
Value_Extract<-raster::extract(predictors,Mos_points,df=T)

# Select only varibles from column 2. The first column is ID, and it is useless
my_data <- Value_Extract[, c(2:ncol(Value_Extract))]
# Look at first few rows
head(my_data)

2.2. Looking at correlation among predictors and its histograms.

# Combine corelation coeffieints and tests

library(correlation)
library(psych)

head(correlation::correlation(plot_data, include_factors = TRUE, method = "pearson"))
Parameter1 |     Parameter2 |     r |         95% CI |     t |  df |      p |  Method | n_Obs
---------------------------------------------------------------------------------------------
DEM        |   EVI_Mean_1km | -0.23 | [-0.32, -0.15] | -5.19 | 467 | < .001 | Pearson |   469
DEM        |  EVI_Mean_500m | -0.23 | [-0.32, -0.15] | -5.20 | 467 | < .001 | Pearson |   469
DEM        |  NDVI_Mean_1km | -0.07 | [-0.16,  0.02] | -1.47 | 467 | > .999 | Pearson |   469
DEM        | NDVI_Mean_200m | -0.07 | [-0.16,  0.02] | -1.60 | 467 | > .999 | Pearson |   469
DEM        |  NDVI_Mean_2km | -0.08 | [-0.17,  0.01] | -1.82 | 467 | > .999 | Pearson |   469
DEM        | NDVI_Mean_500m | -0.07 | [-0.15,  0.03] | -1.41 | 467 | > .999 | Pearson |   469
# Plot correlation matrix 
library("PerformanceAnalytics")
chart.Correlation(my_data, histogram=TRUE, pch=19)

# Making histogram
multi.hist(my_data,dcol="red")

2.3. Mosquito Risk prediction using negative binomial

  • As the land cover has five classes, so we need to convert it to factors, not integers.
# Encode land cover as factor and number of mosquito (Count) as integer 
my_data$Landcover<-as.factor(my_data$Landcover)
# Add Count column to the dataset
my_data$Count<-Mos_points$Sum_ttl

my_data$Count<-as.integer(my_data$Count) # Convert it to be count data (integer)
# Check overdisspersion for Count variable
print( mean(my_data$Count)); print(var(my_data$Count)) # Our count data is overdispersion as variance is greater than mean
[1] 34.78038
[1] 30698.91
# If someone wants to export data to your local computer
# setwd("F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito")
# write.csv(my_data,"Myfile.csv")
# Summary of our dataset all variables 
# summary(my_data)
  • Splitting data into training and testing sets
library(caTools)
# Splitting data into training and testing
set.seed(123) 
sample = sample.split(my_data$Count, SplitRatio = 0.80)
train = subset(my_data, sample == TRUE)
test  = subset(my_data, sample == FALSE)
  • Negative binomial model
library(caret)
ne_model<-glm(Count~., data=train, family =  negative.binomial(theta = 1) ) # Set the theta=2 provides better model
glm.fit: algorithm did not converge
predictions<- predict(ne_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 
  • Summarize the model
summary(ne_model)

Call:
glm(formula = Count ~ ., family = negative.binomial(theta = 1), 
    data = train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.4496  -1.7864  -1.0716  -0.3327   8.5400  

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -11.087016   8.537302  -1.299 0.194878    
DEM            -0.011971   0.066005  -0.181 0.856181    
EVI_mean        6.541186   3.169654   2.064 0.039750 *  
Landcover2     -1.786185   1.009456  -1.769 0.077649 .  
Landcover3     -0.743459   1.925351  -0.386 0.699615    
Landcover4      0.387634   0.628261   0.617 0.537620    
NDVI_mean      -0.065095   4.540120  -0.014 0.988568    
NDWI            0.702064   2.010056   0.349 0.727083    
Pop_VN         -0.020802   0.005441  -3.823 0.000155 ***
Rain_lag_mean  -0.100331   0.095621  -1.049 0.294749    
Rain_max        0.002300   0.041928   0.055 0.956286    
Rain_mean       0.204739   0.293565   0.697 0.485980    
Rain_min        1.400129   0.990191   1.414 0.158210    
Temp_lag_mean  -0.302510   0.242506  -1.247 0.213033    
Temp_mean       0.665115   0.258315   2.575 0.010420 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for Negative Binomial(1) family taken to be 9.285492)

    Null deviance: 1803.8  on 381  degrees of freedom
Residual deviance: 1071.7  on 367  degrees of freedom
  (4 observations deleted due to missingness)
AIC: 2913.7

Number of Fisher Scoring iterations: 25
Sen_output<-predict(predictors, ne_model, type="response",progress="window")

setwd("F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Predicted_Maps")
The working directory was changed to F:/Research/Research_Cooperation/ILRI_Mosquito_Mapping/Mosquito/Predicted_Maps inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
writeRaster(Sen_output, filename = "Map_Test1.tif")
  • Stepwise selection with negative binomial model
variable_select<-stepAIC(ne_model, trace = F)
glm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not converge
variable_select$anova
Stepwise Model Path 
Analysis of Deviance Table

Initial Model:
Count ~ DEM + EVI_mean + Landcover + NDVI_mean + NDWI + Pop_VN + 
    Rain_lag_mean + Rain_max + Rain_mean + Rain_min + Temp_lag_mean + 
    Temp_mean

Final Model:
Count ~ DEM + EVI_mean + NDVI_mean + NDWI + Pop_VN + Rain_lag_mean + 
    Rain_max + Rain_mean + Rain_min + Temp_lag_mean + Temp_mean


  Step Df Deviance Resid. Df Resid. Dev      AIC
1                        367   1071.675 2913.736

2.4. Mosquito Risk prediction using zero-inflated negative binomial

library(pscl)
package 㤼㸱pscl㤼㸲 was built under R version 4.0.3Classes and Methods for R developed in the
Political Science Computational Laboratory
Department of Political Science
Stanford University
Simon Jackman
hurdle and zeroinfl functions by Achim Zeileis
zero_model<-zeroinfl(Count~., data=train, dist = "negbin",link = "logit")
system is computationally singular: reciprocal condition number = 4.02227e-39FALSE
predictions<- predict(zero_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 

Part 2: Using buffered computed variables

3. Investigating correlation of derived variables using buffer distance and model testing (36 derived variables)

library(raster); library(sf)
# Specify the path that contains the raster data layers
path<-"F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Cover_Ratio"
# List all files with tif extension from above folder
predictors<-list.files(path=path,pattern = "tif$",full.names = T)
# Stack all layers together 
predictors<-stack(predictors)

3.1. Extract variable values at each mosquito sampling locations

# Extract values from raster layers using sampling locations.
Value_Extract<-raster::extract(predictors,Mos_points,df=T)

# Select only varibles from column 2. The first column is ID, and it is useless
my_data <- Value_Extract[, c(2:ncol(Value_Extract))]
# Look at first few rows
head(my_data)

write.csv(my_data,file="Buffered_computed.csv",row.names =F)

3.2. Looking at correlation among predictors and its histograms.

# Combine corelation coeffieints and tests

library(correlation)
library(psych)

head(correlation::correlation(plot_data, include_factors = TRUE, method = "pearson"))
Parameter1 |     Parameter2 |     r |         95% CI |     t |  df |      p |  Method | n_Obs
---------------------------------------------------------------------------------------------
DEM        |   EVI_Mean_1km | -0.23 | [-0.32, -0.15] | -5.19 | 467 | < .001 | Pearson |   469
DEM        |  EVI_Mean_500m | -0.23 | [-0.32, -0.15] | -5.20 | 467 | < .001 | Pearson |   469
DEM        |  NDVI_Mean_1km | -0.07 | [-0.16,  0.02] | -1.47 | 467 | > .999 | Pearson |   469
DEM        | NDVI_Mean_200m | -0.07 | [-0.16,  0.02] | -1.60 | 467 | > .999 | Pearson |   469
DEM        |  NDVI_Mean_2km | -0.08 | [-0.17,  0.01] | -1.82 | 467 | > .999 | Pearson |   469
DEM        | NDVI_Mean_500m | -0.07 | [-0.15,  0.03] | -1.41 | 467 | > .999 | Pearson |   469
# Plot correlation matrix 
library("PerformanceAnalytics")
chart.Correlation(my_data, histogram=TRUE, pch=19)

# Making histogram
multi.hist(my_data,dcol="red")

3.3. Mosquito Risk prediction using negative binomial

# Add Count column to the dataset
my_data$Count<-Mos_points$Sum_ttl

my_data$Count<-as.integer(my_data$Count) # Convert it to be count data (integer)
head(my_data)
  • Splitting data into training and testing sets
library(caTools)
# Splitting data into training and testing
set.seed(123) 
sample = sample.split(my_data$Count, SplitRatio = 0.80)
train = subset(my_data, sample == TRUE)
test  = subset(my_data, sample == FALSE)
  • Negative binomial model
ne_model<-glm(Count~., data=train, family =  negative.binomial(theta = 1)) # Set the theta=2 provides better model
glm.fit: algorithm did not converge
predictions<- predict(ne_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 
  • Stepwise selection with negative binomial model

There are six variables excluded from the final model, NDVI_200m, NDVI_500m, EVI_1km, EVI_2km, Mean_rain_1km, Urban_200m.

variable_select<-stepAIC(ne_model, trace = F,direction ="backward")
glm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not converge
variable_select$anova
Stepwise Model Path 
Analysis of Deviance Table

Initial Model:
Count ~ DEM + EVI_Mean_1km + EVI_Mean_2km + EVI_Mean_500m + NDVI_Mean_1km + 
    NDVI_Mean_200m + NDVI_Mean_2km + NDVI_Mean_500m + NDWI_1km + 
    NDWI_200m + NDWI_2km + NDWI_500m + Pop_1km + Pop_2km + Rain_lag_1km + 
    Rain_lag_2km + Rain_max_1km + Rain_max_2km + Rain_Mean_1km + 
    Rain_Mean_2km + Rain_min_1km + Rain_min_2km + Rice_1km + 
    Rice_200m + Rice_2km + Rice_500m + Temp_lag_1km + Temp_Mean_1km + 
    Temp_Mean_2km + Urban_1km + Urban_200m + Urban_2km + Urban_500m

Final Model:
Count ~ EVI_Mean_500m + NDVI_Mean_1km + NDVI_Mean_2km + NDWI_1km + 
    NDWI_200m + NDWI_2km + NDWI_500m + Pop_1km + Pop_2km + Rain_lag_1km + 
    Rain_lag_2km + Rain_max_1km + Rain_max_2km + Rain_Mean_2km + 
    Rain_min_1km + Rain_min_2km + Rice_1km + Rice_200m + Rice_2km + 
    Rice_500m + Temp_lag_1km + Temp_Mean_1km + Temp_Mean_2km + 
    Urban_1km + Urban_2km + Urban_500m

              Step Df   Deviance Resid. Df Resid. Dev      AIC
1                                      352   840.8793 2735.734
2 - NDVI_Mean_200m  1 18.4776746       353   822.4016 2715.256
3     - Urban_200m  1  0.8779154       354   821.5237 2712.378
4  - Rain_Mean_1km  1  0.4261576       355   821.9498 2710.804
5 - NDVI_Mean_500m  1  0.5709151       356   822.5208 2709.375
6   - EVI_Mean_1km  1  0.8417544       357   823.3625 2708.217
7   - EVI_Mean_2km  1  1.8423857       358   825.2049 2708.059

3.4. Mosquito Risk prediction using zero-inflated negative binomial

library(pscl)
zero_model<-zeroinfl(Count~., data=train, dist = "negbin",link = "logit")
system is computationally singular: reciprocal condition number = 2.51058e-22FALSE
predictions<- predict(zero_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 

4.0 Investigating correlation of buffer derived variables after removing variables from first stepwise selection and model testing

library(raster); library(sf)
# Specify the path that contains the raster data layers
path<-"F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Cover_Ratio\\Variable_Selection"
# List all files with tif extension from above folder
predictors<-list.files(path=path,pattern = "tif$",full.names = T)
# Stack all layers together 
predictors<-stack(predictors)

4.1. Extract variable values at each mosquito sampling locations

# Extract values from raster layers using sampling locations.
Value_Extract<-raster::extract(predictors,Mos_points,df=T)

# Select only varibles from column 2. The first column is ID, and it is useless
my_data <- Value_Extract[, c(2:ncol(Value_Extract))]
# Look at first few rows
head(my_data)

4.2. Looking at correlation among predictors and its histograms.

# Combine corelation coeffieints and tests

library(correlation)
library(psych)

head(correlation::correlation(plot_data, include_factors = TRUE, method = "pearson"))
Parameter1 |     Parameter2 |     r |         95% CI |     t |  df |      p |  Method | n_Obs
---------------------------------------------------------------------------------------------
DEM        |   EVI_Mean_1km | -0.23 | [-0.32, -0.15] | -5.19 | 467 | < .001 | Pearson |   469
DEM        |  EVI_Mean_500m | -0.23 | [-0.32, -0.15] | -5.20 | 467 | < .001 | Pearson |   469
DEM        |  NDVI_Mean_1km | -0.07 | [-0.16,  0.02] | -1.47 | 467 | > .999 | Pearson |   469
DEM        | NDVI_Mean_200m | -0.07 | [-0.16,  0.02] | -1.60 | 467 | > .999 | Pearson |   469
DEM        |  NDVI_Mean_2km | -0.08 | [-0.17,  0.01] | -1.82 | 467 | > .999 | Pearson |   469
DEM        | NDVI_Mean_500m | -0.07 | [-0.15,  0.03] | -1.41 | 467 | > .999 | Pearson |   469
# Plot correlation matrix 
library("PerformanceAnalytics")
chart.Correlation(my_data, histogram=TRUE, pch=19)

# Making histogram
multi.hist(my_data,dcol="red")

4.3. Mosquito Risk prediction using negative binomial

# Add Count column to the dataset
my_data$Count<-Mos_points$Sum_ttl

my_data$Count<-as.integer(my_data$Count) # Convert it to be count data (integer)
head(my_data)
  • Splitting data into training and testing sets
library(caTools)
# Splitting data into training and testing
set.seed(123) 
sample = sample.split(my_data$Count, SplitRatio = 0.8)
train = subset(my_data, sample == TRUE)
test  = subset(my_data, sample == FALSE)
  • Negative binomial model
library(caret)
ne_model<-glm(Count~., data=train, family =  negative.binomial(theta = 1) ) # Set the theta=2 provides better model
predictions<- predict(ne_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 
  • Stepwise selection with negative binomial model
variable_select<-stepAIC(ne_model, trace = F)
glm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not converge
variable_select$anova
Stepwise Model Path 
Analysis of Deviance Table

Initial Model:
Count ~ DEM + EVI_Mean_500m + NDVI_Mean_1km + NDVI_Mean_2km + 
    NDWI_1km + NDWI_200m + NDWI_2km + NDWI_500m + Pop_1km + Pop_2km + 
    Rain_lag_1km + Rain_lag_2km + Rain_max_1km + Rain_max_2km + 
    Rain_Mean_2km + Rain_min_1km + Rain_min_2km + Rice_1km + 
    Rice_200m + Rice_2km + Rice_500m + Temp_lag_1km + Temp_Mean_1km + 
    Temp_Mean_2km + Urban_1km + Urban_2km + Urban_500m

Final Model:
Count ~ EVI_Mean_500m + NDVI_Mean_1km + NDVI_Mean_2km + NDWI_1km + 
    NDWI_200m + NDWI_2km + NDWI_500m + Pop_1km + Pop_2km + Rain_lag_1km + 
    Rain_lag_2km + Rain_max_1km + Rain_max_2km + Rain_Mean_2km + 
    Rain_min_1km + Rain_min_2km + Rice_1km + Rice_200m + Rice_2km + 
    Rice_500m + Temp_lag_1km + Temp_Mean_1km + Temp_Mean_2km + 
    Urban_1km + Urban_2km + Urban_500m


  Step Df Deviance Resid. Df Resid. Dev      AIC
1                        358   825.2049 2708.059

5.0 Investigating buffer derived variables after removing strongly correlated variables (section 4) and model testing

library(raster); library(sf)
# Specify the path that contains the raster data layers
path<-"F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Cover_Ratio\\Test_Selection\\Test_Stepwise"
# List all files with tif extension from above folder
predictors<-list.files(path=path,pattern = "tif$",full.names = T)
# Stack all layers together 
predictors<-stack(predictors)

5.1. Extract variable values at each mosquito sampling locations

# Extract values from raster layers using sampling locations.
Value_Extract<-raster::extract(predictors,Mos_points,df=T)

# Select only varibles from column 2. The first column is ID, and it is useless
my_data <- Value_Extract[, c(2:ncol(Value_Extract))]
# Look at first few rows
head(my_data)

5.2. Looking at correlation among predictors and its histograms.

# Combine corelation coeffieints and tests

library(correlation)
library(psych)

head(correlation::correlation(plot_data, include_factors = TRUE, method = "pearson"))
Parameter1 |     Parameter2 |     r |         95% CI |     t |  df |      p |  Method | n_Obs
---------------------------------------------------------------------------------------------
DEM        |   EVI_Mean_1km | -0.23 | [-0.32, -0.15] | -5.19 | 467 | < .001 | Pearson |   469
DEM        |  EVI_Mean_500m | -0.23 | [-0.32, -0.15] | -5.20 | 467 | < .001 | Pearson |   469
DEM        |  NDVI_Mean_1km | -0.07 | [-0.16,  0.02] | -1.47 | 467 | > .999 | Pearson |   469
DEM        | NDVI_Mean_200m | -0.07 | [-0.16,  0.02] | -1.60 | 467 | > .999 | Pearson |   469
DEM        |  NDVI_Mean_2km | -0.08 | [-0.17,  0.01] | -1.82 | 467 | > .999 | Pearson |   469
DEM        | NDVI_Mean_500m | -0.07 | [-0.15,  0.03] | -1.41 | 467 | > .999 | Pearson |   469
# Plot correlation matrix 
library("PerformanceAnalytics")
chart.Correlation(my_data, histogram=TRUE, pch=19)

# Making histogram
multi.hist(my_data,dcol="red")

5.3. Mosquito Risk prediction using negative binomial

# Add Count column to the dataset
my_data$Count<-Mos_points$Sum_ttl

my_data$Count<-as.integer(my_data$Count) # Convert it to be count data (integer)
head(my_data)
  • Splitting data into training and testing sets
library(caTools)
# Splitting data into training and testing
set.seed(123) 
sample = sample.split(my_data$Count, SplitRatio = 0.8)
train = subset(my_data, sample == TRUE)
test  = subset(my_data, sample == FALSE)
  • Negative binomial model
library(caret)
ne_model<-glm(Count~., data=train, family =  negative.binomial(theta = 1) ) # Set the theta=1
predictions<- predict(ne_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 
  • Stepwise selection with negative binomial model
variable_select<-stepAIC(ne_model, trace = F)
glm.fit: algorithm did not converge
variable_select$anova
Stepwise Model Path 
Analysis of Deviance Table

Initial Model:
Count ~ DEM + NDVI_Mean_1km + Pop_1km + Rain_lag_1km + Rain_max_1km + 
    Rain_min_1km

Final Model:
Count ~ DEM + NDVI_Mean_1km + Pop_1km + Rain_max_1km + Rain_min_1km


  Step Df Deviance Resid. Df Resid. Dev      AIC
1                        379   1105.799 2946.653
Sen_output<-predict(predictors, ne_model, type="response",progress="window")

setwd("F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Predicted_Maps")
The working directory was changed to F:/Research/Research_Cooperation/ILRI_Mosquito_Mapping/Mosquito/Predicted_Maps inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
writeRaster(Sen_output, filename = "Risk_Map_stepwise.tif", overwrite=T)

5.4. Mosquito Risk prediction using zero-inflated negative binomial

library(pscl)
zero_model<-zeroinfl(Count~., data=train, dist = "negbin",link = "logit")

predictions<- predict(zero_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 

6.0 Investigating buffer derived variables with 1km buffer

library(raster); library(sf)
# Specify the path that contains the raster data layers
path<-"F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Cover_Ratio\\Stepwise_1km"
# List all files with tif extension from above folder
predictors<-list.files(path=path,pattern = "tif$",full.names = T)
# Stack all layers together 
predictors<-stack(predictors)

6.1. Extract variable values at each mosquito sampling locations

# Extract values from raster layers using sampling locations.
Value_Extract<-raster::extract(predictors,Mos_points,df=T)

# Select only varibles from column 2. The first column is ID, and it is useless
my_data <- Value_Extract[, c(2:ncol(Value_Extract))]
# Look at first few rows
head(my_data)

6.2. Looking at correlation among predictors and its histograms.

Note: Rice and NDVI had strong correlation >0.7

# Combine corelation coeffieints and tests

library(correlation)
library(psych)

head(correlation::correlation(plot_data, include_factors = TRUE, method = "pearson"))
Parameter1 |     Parameter2 |     r |         95% CI |     t |  df |      p |  Method | n_Obs
---------------------------------------------------------------------------------------------
DEM        |   EVI_Mean_1km | -0.23 | [-0.32, -0.15] | -5.19 | 467 | < .001 | Pearson |   469
DEM        |  EVI_Mean_500m | -0.23 | [-0.32, -0.15] | -5.20 | 467 | < .001 | Pearson |   469
DEM        |  NDVI_Mean_1km | -0.07 | [-0.16,  0.02] | -1.47 | 467 | > .999 | Pearson |   469
DEM        | NDVI_Mean_200m | -0.07 | [-0.16,  0.02] | -1.60 | 467 | > .999 | Pearson |   469
DEM        |  NDVI_Mean_2km | -0.08 | [-0.17,  0.01] | -1.82 | 467 | > .999 | Pearson |   469
DEM        | NDVI_Mean_500m | -0.07 | [-0.15,  0.03] | -1.41 | 467 | > .999 | Pearson |   469
# Plot correlation matrix 
library("PerformanceAnalytics")
chart.Correlation(my_data, histogram=TRUE, pch=19)

# Making histogram
multi.hist(my_data,dcol="red")

6.3. Mosquito Risk prediction using negative binomial

# Add Count column to the dataset
my_data$Count<-Mos_points$Sum_ttl

my_data$Count<-as.integer(my_data$Count) # Convert it to be count data (integer)
head(my_data)
  • Splitting data into training and testing sets
library(caTools)
# Splitting data into training and testing
set.seed(123) 
sample = sample.split(my_data$Count, SplitRatio = 0.8)
train = subset(my_data, sample == TRUE)
test  = subset(my_data, sample == FALSE)
  • Negative binomial model
library(caret)
ne_model<-glm(Count~., data=train, family =  negative.binomial(theta = 1) ) # Set the theta=1
predictions<- predict(ne_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 
  • Stepwise selection with negative binomial model
variable_select<-stepAIC(ne_model, trace = F)
glm.fit: algorithm did not convergeglm.fit: algorithm did not convergeglm.fit: algorithm did not converge
variable_select$anova
Stepwise Model Path 
Analysis of Deviance Table

Initial Model:
Count ~ DEM + NDVI_Mean_1km + NDWI_1km + Pop_1km + Rain_lag_1km + 
    Rain_max_1km + Rain_min_1km + Rice_1km + Temp_lag_1km

Final Model:
Count ~ DEM + NDVI_Mean_1km + Pop_1km + Rain_lag_1km + Rain_max_1km + 
    Rain_min_1km

        Step Df  Deviance Resid. Df Resid. Dev      AIC
1                               376   1098.774 2945.628
2 - NDWI_1km  1 0.5204301       377   1099.294 2944.149
3 - Rice_1km  1 1.9125103       378   1101.207 2944.061
Sen_output<-predict(predictors, ne_model, type="response",progress="window")

setwd("F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Predicted_Maps")
The working directory was changed to F:/Research/Research_Cooperation/ILRI_Mosquito_Mapping/Mosquito/Predicted_Maps inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
writeRaster(Sen_output, filename = "Risk_Map_1km.tif",overwrite=T)

6.4. Mosquito Risk prediction using zero-inflated negative binomial

7. Test with selected variables like in Section 6, but all 2km buffered.

library(raster); library(sf)
# Specify the path that contains the raster data layers
path<-"F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Cover_Ratio\\2km_buffer"
# List all files with tif extension from above folder
predictors<-list.files(path=path,pattern = "tif$",full.names = T)
# Stack all layers together 
predictors<-stack(predictors)

7.1. Extract variable values at each mosquito sampling locations

# Extract values from raster layers using sampling locations.
Value_Extract<-raster::extract(predictors,Mos_points,df=T)

# Select only varibles from column 2. The first column is ID, and it is useless
my_data <- Value_Extract[, c(2:ncol(Value_Extract))]
# Look at first few rows
head(my_data)

7.2. Looking at correlation among predictors and its histograms.

# Combine corelation coeffieints and tests

library(correlation)
library(psych)

head(correlation::correlation(plot_data, include_factors = TRUE, method = "pearson"))
Parameter1 |     Parameter2 |     r |         95% CI |     t |  df |      p |  Method | n_Obs
---------------------------------------------------------------------------------------------
DEM        |   EVI_Mean_1km | -0.23 | [-0.32, -0.15] | -5.19 | 467 | < .001 | Pearson |   469
DEM        |  EVI_Mean_500m | -0.23 | [-0.32, -0.15] | -5.20 | 467 | < .001 | Pearson |   469
DEM        |  NDVI_Mean_1km | -0.07 | [-0.16,  0.02] | -1.47 | 467 | > .999 | Pearson |   469
DEM        | NDVI_Mean_200m | -0.07 | [-0.16,  0.02] | -1.60 | 467 | > .999 | Pearson |   469
DEM        |  NDVI_Mean_2km | -0.08 | [-0.17,  0.01] | -1.82 | 467 | > .999 | Pearson |   469
DEM        | NDVI_Mean_500m | -0.07 | [-0.15,  0.03] | -1.41 | 467 | > .999 | Pearson |   469
# Plot correlation matrix 
library("PerformanceAnalytics")
chart.Correlation(my_data, histogram=TRUE, pch=19)

# Making histogram
multi.hist(my_data,dcol="red")

7.3. Mosquito Risk prediction using negative binomial

# Add Count column to the dataset
my_data$Count<-Mos_points$Sum_ttl

my_data$Count<-as.integer(my_data$Count) # Convert it to be count data (integer)
head(my_data)
  • Splitting data into training and testing sets
library(caTools)
# Splitting data into training and testing
set.seed(123) 
sample = sample.split(my_data$Count, SplitRatio = 0.8)
train = subset(my_data, sample == TRUE)
test  = subset(my_data, sample == FALSE)
  • Negative binomial model
library(caret)
ne_model<-glm(Count~., data=train, family =  negative.binomial(theta = 1) ) # Set the theta=1
predictions<- predict(ne_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 
  • Stepwise selection with negative binomial model
variable_select<-stepAIC(ne_model, trace = F)

variable_select$anova
Stepwise Model Path 
Analysis of Deviance Table

Initial Model:
Count ~ NDVI_Mean_2km + NDWI_2km + Pop_2km + Rain_Mean_2km + 
    Rice_2km + Temp_Mean_2km

Final Model:
Count ~ NDVI_Mean_2km + Pop_2km + Rain_Mean_2km

             Step Df   Deviance Resid. Df Resid. Dev      AIC
1                                     379   1193.511 3034.366
2 - Temp_Mean_2km  1 0.12233845       380   1193.634 3032.488
3      - NDWI_2km  1 0.09926757       381   1193.733 3030.588
Sen_output<-predict(predictors, ne_model, type="response",progress="window")

setwd("F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Predicted_Maps")
The working directory was changed to F:/Research/Research_Cooperation/ILRI_Mosquito_Mapping/Mosquito/Predicted_Maps inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
writeRaster(Sen_output, filename = "Risk_Map_7.tif",overwrite=T)
---
title: "R Notebook"
output: html_notebook
---

This R code aims to produce mosquito risk maps using remote sensing data.

# Part 1: Using collected variables 

# 1. Investigating correlation among raw predictors (17 raw variables)

```{r}
library(raster); library(sf)
# Specify the path that contains the raster data layers
path<-"F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Mosquito_Test1"
# List all files with tif extension from above folder
predictors<-list.files(path=path,pattern = "tif$",full.names = T)
# Stack all layers together 
predictors<-stack(predictors)

# Read the shapefile containing mosquito count sampling locations (you can use data processed above shape_final)

Mosquito_sampling<-shapefile("F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Mosquito_Sampling\\Mix_House_Mosquito.shp") # Upload mosquito points
# Convert it to sf object 
Mos_points<-st_as_sf(Mosquito_sampling)

rownames(Mos_points)<-NULL # Reset index 

```

### 1.1. Extract variable values at each mosquito sampling locations

```{r}
# Extract values from raster layers using sampling locations.
Value_Extract<-raster::extract(predictors,Mos_points,df=T)

# Select only varibles from column 2. The first column is ID, and it is useless
my_data <- Value_Extract[, c(2:ncol(Value_Extract))]
# Look at first few rows
head(my_data)
```

### 1.2. Looking at correlation among predictors and its histograms.

```{r}
# Combine corelation coeffieints and tests

library(correlation)
library(psych)

head(correlation::correlation(plot_data, include_factors = TRUE, method = "pearson"))
# Plot correlation matrix 
library("PerformanceAnalytics")
chart.Correlation(my_data, histogram=TRUE, pch=19)
# Making histogram
multi.hist(my_data,dcol="red")

```

### 1.3. Correlation test

We see that some variables tend to be strong correlated, particularly EVI, NDVI, temperature data. Hence, correlation test is conducted to see if there is a signigicant correlation or not for those variables.

- Correlation test for `EVI` variables

```{r}
EVI_variable<-my_data[,c(2,3,4)]

EVI_test=correlation::correlation(EVI_variable, include_factors = TRUE, method = "pearson")
# setwd("F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Test")
# write.csv(EVI_test,file="EVI_corelation.csv",row.names = F)

EVI_test
```

- Correlation test for `NDVI` variables

```{r}
library(dplyr)

NDVI_varaible<-my_data %>% select(NDVI_max:NDVI_min)

NDVI_test=correlation::correlation(NDVI_varaible, include_factors = TRUE, method = "pearson")
NDVI_test
# write.csv(NDVI_test,file="NDVI_corelation.csv",row.names = F)
```

- Correlation test for `Temp` variables

```{r}
Temp_variable<-my_data %>% select(Temp_lag_mean:Temp_min)
Temp_test=correlation::correlation(Temp_variable, include_factors = TRUE, method = "pearson")
Temp_test
```

 As the land cover has five classes, so we need to convert it to factors, not integers.

```{r}
# Encode land cover as factor and number of mosquito (Count) as integer 
my_data$Landcover<-as.factor(my_data$Landcover)
# Add Count column to the dataset
my_data$Count<-Mos_points$Sum_ttl

my_data$Count<-as.integer(my_data$Count) # Convert it to be count data (integer)
# Check overdisspersion for Count variable
print( mean(my_data$Count)); print(var(my_data$Count)) # Our count data is overdispersion as variance is greater than mean
# If someone wants to export data to your local computer
# setwd("F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito")
# write.csv(my_data,"Myfile.csv")
# Summary of our dataset all variables 
# summary(my_data)
```


 Splitting data into training and testing sets 

```{r}
library(caTools)
# Splitting data into training and testing
set.seed(123) 
sample = sample.split(my_data$Count, SplitRatio = 0.8)
train = subset(my_data, sample == TRUE)
test  = subset(my_data, sample == FALSE)
```

- Negative binomial model

```{r}
library(caret)
ne_model<-glm(Count~., data=train, family =  negative.binomial(theta = 1) ) # Set the theta=2 provides better model
predictions<- predict(ne_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 
```


- Stepwise selection with negative binomial model

```{r}
variable_select<-stepAIC(ne_model, trace = F)

variable_select$anova
```


# 2. Investigating correlation among raw predictors after filtering strongly correlated raw variables and model testing

```{r}
library(raster); library(sf)
# Specify the path that contains the raster data layers
path<-"F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Mosquito_Test1\\Variable_Selection"
# List all files with tif extension from above folder
predictors<-list.files(path=path,pattern = "tif$",full.names = T)
# Stack all layers together 
predictors<-stack(predictors)

```

### 2.1. Extract variable values at each mosquito sampling locations

```{r}
# Extract values from raster layers using sampling locations.
Value_Extract<-raster::extract(predictors,Mos_points,df=T)

# Select only varibles from column 2. The first column is ID, and it is useless
my_data <- Value_Extract[, c(2:ncol(Value_Extract))]
# Look at first few rows
head(my_data)
```
### 2.2. Looking at correlation among predictors and its histograms.

```{r}
# Combine corelation coeffieints and tests

library(correlation)
library(psych)

head(correlation::correlation(plot_data, include_factors = TRUE, method = "pearson"))
# Plot correlation matrix 
library("PerformanceAnalytics")
chart.Correlation(my_data, histogram=TRUE, pch=19)
# Making histogram
multi.hist(my_data,dcol="red")

```

### 2.3. Mosquito Risk prediction using negative binomial

- As the land cover has five classes, so we need to convert it to factors, not integers.

```{r}
# Encode land cover as factor and number of mosquito (Count) as integer 
my_data$Landcover<-as.factor(my_data$Landcover)
# Add Count column to the dataset
my_data$Count<-Mos_points$Sum_ttl

my_data$Count<-as.integer(my_data$Count) # Convert it to be count data (integer)
# Check overdisspersion for Count variable
print( mean(my_data$Count)); print(var(my_data$Count)) # Our count data is overdispersion as variance is greater than mean
# If someone wants to export data to your local computer
# setwd("F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito")
# write.csv(my_data,"Myfile.csv")
# Summary of our dataset all variables 
# summary(my_data)
```

- Splitting data into training and testing sets 

```{r}
library(caTools)
# Splitting data into training and testing
set.seed(123) 
sample = sample.split(my_data$Count, SplitRatio = 0.80)
train = subset(my_data, sample == TRUE)
test  = subset(my_data, sample == FALSE)
```

- Negative binomial model

```{r}
library(caret)
ne_model<-glm(Count~., data=train, family =  negative.binomial(theta = 1) ) # Set the theta=2 provides better model
predictions<- predict(ne_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 
```

- Summarize the model

```{r}
summary(ne_model)
```

```{r}
#Sen_output<-predict(predictors, ne_model, type="response",progress="window")

#setwd("F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Predicted_Maps")

#writeRaster(Sen_output, filename = "Map_Test1.tif")
```

- Stepwise selection with negative binomial model

```{r}
variable_select<-stepAIC(ne_model, trace = F)

variable_select$anova
```

### 2.4. Mosquito Risk prediction using zero-inflated negative binomial 

```{r}
library(pscl)
zero_model<-zeroinfl(Count~., data=train, dist = "negbin",link = "logit")

predictions<- predict(zero_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 
```

# Part 2: Using buffered computed variables 

# 3. Investigating correlation of derived variables using buffer distance and model testing (36 derived variables)

```{r}
library(raster); library(sf)
# Specify the path that contains the raster data layers
path<-"F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Cover_Ratio"
# List all files with tif extension from above folder
predictors<-list.files(path=path,pattern = "tif$",full.names = T)
# Stack all layers together 
predictors<-stack(predictors)
```

### 3.1. Extract variable values at each mosquito sampling locations

```{r}
# Extract values from raster layers using sampling locations.
Value_Extract<-raster::extract(predictors,Mos_points,df=T)

# Select only varibles from column 2. The first column is ID, and it is useless
my_data <- Value_Extract[, c(2:ncol(Value_Extract))]
# Look at first few rows
head(my_data)

# write.csv(my_data,file="Buffered_computed.csv",row.names =F)
```

### 3.2. Looking at correlation among predictors and its histograms.

```{r}
# Combine corelation coeffieints and tests

library(correlation)
library(psych)

head(correlation::correlation(plot_data, include_factors = TRUE, method = "pearson"))
# Plot correlation matrix 
library("PerformanceAnalytics")
chart.Correlation(my_data, histogram=TRUE, pch=19)
# Making histogram
multi.hist(my_data,dcol="red")

```

### 3.3. Mosquito Risk prediction using negative binomial

```{r}
# Add Count column to the dataset
my_data$Count<-Mos_points$Sum_ttl

my_data$Count<-as.integer(my_data$Count) # Convert it to be count data (integer)
head(my_data)
```

- Splitting data into training and testing sets 

```{r}
library(caTools)
# Splitting data into training and testing
set.seed(123) 
sample = sample.split(my_data$Count, SplitRatio = 0.80)
train = subset(my_data, sample == TRUE)
test  = subset(my_data, sample == FALSE)
```

- Negative binomial model

```{r}
ne_model<-glm(Count~., data=train, family =  negative.binomial(theta = 1)) # Set the theta=2 provides better model
predictions<- predict(ne_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 
```


- Stepwise selection with negative binomial model

There are six variables excluded from the final model, NDVI_200m, NDVI_500m, EVI_1km, EVI_2km, Mean_rain_1km, Urban_200m.

```{r}
variable_select<-stepAIC(ne_model, trace = F,direction ="backward")

variable_select$anova
```


### 3.4. Mosquito Risk prediction using zero-inflated negative binomial 

```{r}
library(pscl)
zero_model<-zeroinfl(Count~., data=train, dist = "negbin",link = "logit")

predictions<- predict(zero_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 
```

# 4.0 Investigating correlation of buffer derived variables after removing variables from first stepwise selection and model testing

```{r}
library(raster); library(sf)
# Specify the path that contains the raster data layers
path<-"F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Cover_Ratio\\Variable_Selection"
# List all files with tif extension from above folder
predictors<-list.files(path=path,pattern = "tif$",full.names = T)
# Stack all layers together 
predictors<-stack(predictors)
```

### 4.1. Extract variable values at each mosquito sampling locations

```{r}
# Extract values from raster layers using sampling locations.
Value_Extract<-raster::extract(predictors,Mos_points,df=T)

# Select only varibles from column 2. The first column is ID, and it is useless
my_data <- Value_Extract[, c(2:ncol(Value_Extract))]
# Look at first few rows
head(my_data)
```

### 4.2. Looking at correlation among predictors and its histograms.

```{r}
# Combine corelation coeffieints and tests

library(correlation)
library(psych)

head(correlation::correlation(plot_data, include_factors = TRUE, method = "pearson"))
# Plot correlation matrix 
library("PerformanceAnalytics")
chart.Correlation(my_data, histogram=TRUE, pch=19)
# Making histogram
multi.hist(my_data,dcol="red")

```

### 4.3. Mosquito Risk prediction using negative binomial

```{r}
# Add Count column to the dataset
my_data$Count<-Mos_points$Sum_ttl

my_data$Count<-as.integer(my_data$Count) # Convert it to be count data (integer)
head(my_data)
```

- Splitting data into training and testing sets 

```{r}
library(caTools)
# Splitting data into training and testing
set.seed(123) 
sample = sample.split(my_data$Count, SplitRatio = 0.8)
train = subset(my_data, sample == TRUE)
test  = subset(my_data, sample == FALSE)
```

- Negative binomial model

```{r}
library(caret)
ne_model<-glm(Count~., data=train, family =  negative.binomial(theta = 1) ) # Set the theta=2 provides better model
predictions<- predict(ne_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 
```

- Stepwise selection with negative binomial model

```{r}
variable_select<-stepAIC(ne_model, trace = F)

variable_select$anova
```

# 5.0 Investigating buffer derived variables after removing strongly correlated variables (section 4) and model testing 

```{r}
library(raster); library(sf)
# Specify the path that contains the raster data layers
path<-"F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Cover_Ratio\\Test_Selection\\Test_Stepwise"
# List all files with tif extension from above folder
predictors<-list.files(path=path,pattern = "tif$",full.names = T)
# Stack all layers together 
predictors<-stack(predictors)
```

### 5.1. Extract variable values at each mosquito sampling locations

```{r}
# Extract values from raster layers using sampling locations.
Value_Extract<-raster::extract(predictors,Mos_points,df=T)

# Select only varibles from column 2. The first column is ID, and it is useless
my_data <- Value_Extract[, c(2:ncol(Value_Extract))]
# Look at first few rows
head(my_data)
```

### 5.2. Looking at correlation among predictors and its histograms.

```{r}
# Combine corelation coeffieints and tests

library(correlation)
library(psych)

head(correlation::correlation(plot_data, include_factors = TRUE, method = "pearson"))
# Plot correlation matrix 
library("PerformanceAnalytics")
chart.Correlation(my_data, histogram=TRUE, pch=19)
# Making histogram
multi.hist(my_data,dcol="red")

```

### 5.3. Mosquito Risk prediction using negative binomial

```{r}
# Add Count column to the dataset
my_data$Count<-Mos_points$Sum_ttl

my_data$Count<-as.integer(my_data$Count) # Convert it to be count data (integer)
head(my_data)
```

- Splitting data into training and testing sets 

```{r}
library(caTools)
# Splitting data into training and testing
set.seed(123) 
sample = sample.split(my_data$Count, SplitRatio = 0.8)
train = subset(my_data, sample == TRUE)
test  = subset(my_data, sample == FALSE)
```

- Negative binomial model

```{r}
library(caret)
ne_model<-glm(Count~., data=train, family =  negative.binomial(theta = 1) ) # Set the theta=1
predictions<- predict(ne_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 
```

- Stepwise selection with negative binomial model

```{r}
variable_select<-stepAIC(ne_model, trace = F)

variable_select$anova
```

```{r}
Sen_output<-predict(predictors, ne_model, type="response",progress="window")

setwd("F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Predicted_Maps")

writeRaster(Sen_output, filename = "Risk_Map_stepwise.tif", overwrite=T)
```

### 5.4. Mosquito Risk prediction using zero-inflated negative binomial 

```{r}
library(pscl)
zero_model<-zeroinfl(Count~., data=train, dist = "negbin",link = "logit")

predictions<- predict(zero_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 
```

# 6.0 Investigating buffer derived variables with 1km buffer 

```{r}
library(raster); library(sf)
# Specify the path that contains the raster data layers
path<-"F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Cover_Ratio\\Stepwise_1km"
# List all files with tif extension from above folder
predictors<-list.files(path=path,pattern = "tif$",full.names = T)
# Stack all layers together 
predictors<-stack(predictors)
```

### 6.1. Extract variable values at each mosquito sampling locations

```{r}
# Extract values from raster layers using sampling locations.
Value_Extract<-raster::extract(predictors,Mos_points,df=T)

# Select only varibles from column 2. The first column is ID, and it is useless
my_data <- Value_Extract[, c(2:ncol(Value_Extract))]
# Look at first few rows
head(my_data)
```

### 6.2. Looking at correlation among predictors and its histograms.

Note: Rice and NDVI had strong correlation >0.7

```{r}
# Combine corelation coeffieints and tests

library(correlation)
library(psych)

head(correlation::correlation(plot_data, include_factors = TRUE, method = "pearson"))
# Plot correlation matrix 
library("PerformanceAnalytics")
chart.Correlation(my_data, histogram=TRUE, pch=19)
# Making histogram
multi.hist(my_data,dcol="red")

```

### 6.3. Mosquito Risk prediction using negative binomial

```{r}
# Add Count column to the dataset
my_data$Count<-Mos_points$Sum_ttl

my_data$Count<-as.integer(my_data$Count) # Convert it to be count data (integer)
head(my_data)
```

- Splitting data into training and testing sets 

```{r}
library(caTools)
# Splitting data into training and testing
set.seed(123) 
sample = sample.split(my_data$Count, SplitRatio = 0.8)
train = subset(my_data, sample == TRUE)
test  = subset(my_data, sample == FALSE)
```

- Negative binomial model

```{r}
library(caret)
ne_model<-glm(Count~., data=train, family =  negative.binomial(theta = 1) ) # Set the theta=1
predictions<- predict(ne_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 
```

- Stepwise selection with negative binomial model

```{r}
variable_select<-stepAIC(ne_model, trace = F)

variable_select$anova
```

```{r}
Sen_output<-predict(predictors, ne_model, type="response",progress="window")

setwd("F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Predicted_Maps")

writeRaster(Sen_output, filename = "Risk_Map_1km.tif",overwrite=T)
```


### 6.4. Mosquito Risk prediction using zero-inflated negative binomial 

```{r}
library(pscl)
zero_model<-zeroinfl(Count~., data=train, dist = "negbin",link = "logit")

predictions<- predict(zero_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 
```

# 7. Test with selected variables like in Section 6, but all 2km buffered.



```{r}
library(raster); library(sf)
# Specify the path that contains the raster data layers
path<-"F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Cover_Ratio\\2km_buffer"
# List all files with tif extension from above folder
predictors<-list.files(path=path,pattern = "tif$",full.names = T)
# Stack all layers together 
predictors<-stack(predictors)
```

### 7.1. Extract variable values at each mosquito sampling locations

```{r}
# Extract values from raster layers using sampling locations.
Value_Extract<-raster::extract(predictors,Mos_points,df=T)

# Select only varibles from column 2. The first column is ID, and it is useless
my_data <- Value_Extract[, c(2:ncol(Value_Extract))]
# Look at first few rows
head(my_data)
```

### 7.2. Looking at correlation among predictors and its histograms.

```{r}
# Combine corelation coeffieints and tests

library(correlation)
library(psych)

head(correlation::correlation(plot_data, include_factors = TRUE, method = "pearson"))
# Plot correlation matrix 
library("PerformanceAnalytics")
chart.Correlation(my_data, histogram=TRUE, pch=19)
# Making histogram
multi.hist(my_data,dcol="red")

```


### 7.3. Mosquito Risk prediction using negative binomial

```{r}
# Add Count column to the dataset
my_data$Count<-Mos_points$Sum_ttl

my_data$Count<-as.integer(my_data$Count) # Convert it to be count data (integer)
head(my_data)
```

- Splitting data into training and testing sets 

```{r}
library(caTools)
# Splitting data into training and testing
set.seed(123) 
sample = sample.split(my_data$Count, SplitRatio = 0.8)
train = subset(my_data, sample == TRUE)
test  = subset(my_data, sample == FALSE)
```

- Negative binomial model

```{r}
library(caret)
ne_model<-glm(Count~., data=train, family =  negative.binomial(theta = 1) ) # Set the theta=1
predictions<- predict(ne_model,newdata=test)

data.frame(R2 = R2(predictions, test$Count), RMSE = RMSE(predictions, test$Count), MAE = MAE(predictions,test$Count)) 
```

- Stepwise selection with negative binomial model

```{r}
variable_select<-stepAIC(ne_model, trace = F)

variable_select$anova
```

```{r}
Sen_output<-predict(predictors, ne_model, type="response",progress="window")

setwd("F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Predicted_Maps")

writeRaster(Sen_output, filename = "Risk_Map_7.tif",overwrite=T)
```