This R code was prepared for the "Mosquito Mapping" training workshop at ILRI in Hanoi. Before we start the workshop, you need to download the data here.
The basic data types in R include: logical, numeric, integer, and character (string).
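As a quick illustration (a minimal sketch with made-up example values), each type can be checked with the class() function:
x_logical<-TRUE
x_numeric<-3.14
x_integer<-5L # The L suffix creates an integer
x_string<-"Hanoi"
class(x_logical) # "logical"
class(x_numeric) # "numeric"
class(x_integer) # "integer"
class(x_string) # "character"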
# Create a vector from 1 to 10
myvector<-c(1:10)
myvector # Display the vector
[1] 1 2 3 4 5 6 7 8 9 10
edu_class<-c("Good","Very good","Average","Weak","Good","Good","Very good","Average","Weak","Good","Good","Very good")
edu_class<-as.factor(edu_class)
edu_class
[1] Good Very good Average Weak Good Good Very good Average Weak Good Good Very good
Levels: Average Good Very good Weak
nlevels(edu_class)# Check the number of factor levels
[1] 4
BMI <- data.frame(
gender = c("Male", "Male","Female"),
height = c(165, 171.5, 167),
weight = c(45,67, 86),
Age = c(34,38,45)
)
BMI
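As an optional illustration (not part of the original example), the body mass index can be computed from the weight and height columns, assuming weight is in kilograms and height in centimetres:
BMI$bmi<-BMI$weight/(BMI$height/100)^2 # BMI = weight (kg) divided by height (m) squared
BMI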
Arithmetic operators: +, -, /, *, ^
num1<-1
num2<-4
num1*num2
[1] 4
print(paste("num1+num2=",num1+num2))
[1] "num1+num2= 5"
num1<-c(1:5)
num2<-c(5:9)
num1+num2
[1] 6 8 10 12 14
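The other arithmetic operators work element-wise in the same way, for example:
num1-num2 # Subtraction
[1] -4 -4 -4 -4 -4
num1^2 # Exponentiation
[1]  1  4  9 16 25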
Relational operators: >, <, >=, <=, ==, !=
v <- c(2,5.5,6,9)
t <- c(8,2.5,14,9)
print(v>t)
[1] FALSE TRUE FALSE FALSE
v <- c(2,5.5,6,9)
t <- c(8,2.5,14,9)
print(v!=t)
[1] TRUE TRUE TRUE FALSE
Logical operators: &, |, !
v <- c(3,1,TRUE,2+3i)
t <- c(4,1,FALSE,2+3i)
print(v==3 & t==4)
[1] TRUE FALSE FALSE FALSE
t <- c(4,TRUE,0,1,FALSE,2+3i)
print(t!=3)
[1] TRUE TRUE TRUE TRUE TRUE TRUE
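For completeness, here is a small sketch of the remaining logical operators, | (element-wise OR) and ! (NOT), using two made-up logical vectors:
a <- c(TRUE, FALSE, TRUE)
b <- c(FALSE, FALSE, TRUE)
print(a|b) # Element-wise OR
[1]  TRUE FALSE  TRUE
print(!a) # Negation
[1] FALSE  TRUE FALSE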
for (i in 1:10){
print(i)
}
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
for (i in 1:10){
if (i>5){
print(i)
} else{
print("Values<=5")
}
}
[1] "Values<=5"
[1] "Values<=5"
[1] "Values<=5"
[1] "Values<=5"
[1] "Values<=5"
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
mymean<-function(n){
mean_value<-sum(n)/length(n)
return(mean_value)
}
mymean(c(20:80))
[1] 50
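You can cross-check the result against R's built-in mean() function:
mean(c(20:80)) # Same result as mymean()
[1] 50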
# Write a function to calculate the mean from scratch
mymean<-function(n){
mysum<-0
mylength<-0
for (i in n){
mysum=mysum+i
mylength=mylength+1
}
mean_value<-mysum/mylength
return(mean_value)
}
mymean(1:10)
[1] 5.5
df<-read.csv("https://raw.githubusercontent.com/tuyenhavan/Mosquito_ILRI/main/Dataset_temp.csv", header=T)
head(df)
library(dplyr)
# Get first five columns from df dataframe
col5<-df %>% dplyr::select(1:5) #Using select function from dplyr package
# Get 4 columns from df
col4<- df[,c(1:4)]
col4<-df[,c(1,2,3,4)] # The same as above
# Select two columns (and the first 10 rows) from df
col2<-df[1:10,c("Forest","Count")]
head(col2)
library(tidyr)
df%>% colnames() # See column names
[1] "Count" "Locations" "Forest" "Lag_temp_center" "Lag_temp_power" "NDVI" "Pop" "Rice"
[9] "Water"
df%>% summarize_all(class) %>% pivot_longer(everything(),names_to = "Variables",values_to = "Data Types") # See data types of each column
# Another way
str(df)
'data.frame': 469 obs. of 9 variables:
$ Count : int 32 4 43 0 11 0 0 49 5 17 ...
$ Locations : chr "Dan Phuong" "Dan Phuong" "Dan Phuong" "Dan Phuong" ...
$ Forest : num 0.0859 0.1211 0.1133 0.1484 0.1016 ...
$ Lag_temp_center: num -4.65 -4.09 -4.18 -3.59 -2.82 ...
$ Lag_temp_power : num 21.65 16.75 17.51 12.89 7.95 ...
$ NDVI : num 0.317 0.338 0.33 0.355 0.339 ...
$ Pop : num 8.24 8.83 8.91 11.31 12.16 ...
$ Rice : num 0.355 0.512 0.465 0.684 0.574 ...
$ Water : num 0 0 0 0 0 0 0 0 0 0 ...
# Some other functions from tidyverse.
library(stringr)
df%>% filter(str_detect(`Locations`, "^D")) %>% head() # Keep only rows whose Locations value starts with "D"
df %>% select(contains("Co"),contains("Po")) %>% head() # Select all columns whose names contain "Co" or "Po"
df %>% mutate(Center=Lag_temp_power-mean(Lag_temp_power)) %>% head() # Create a new variable by subtracting the mean of Lag_temp_power
# Create a new column with two classes (Rice and Non-rice) from Rice column
df %>% mutate(Rice_class=case_when(Rice>0.5~"Rice", TRUE~"Non-rice")) %>% head()
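Another commonly used dplyr pattern, shown here as an optional sketch, is grouping and summarising, for example the mean mosquito count per location:
df %>% group_by(Locations) %>% summarise(Mean_count=mean(Count)) %>% head() # Mean count by location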
Briefly, you now have an understanding of R and, to some extent, its working principles. No single book will make you an expert, so additional reading materials are necessary. I have listed some resources here: R for Data Science by Hadley Wickham, Basic R programming by Tutorialpoint, and Data Wrangling, Exploration, and Analysis with R by the University of British Columbia.
If you have not installed the following packages, please do so.
#install.packages(c("RColorBrewer", "sp", "rgeos", "tmaptools", "sf", "downloader", "rgdal","geojsonio","raster","ggplot2"))
library(raster)
# Define the path that stores our data
path<-"F:\\Training_Material\\Second_Training\\Sub_study"
# Read NDVI raster in R using raster function from raster package
NDVI<-raster(paste(path,"\\","NDVI.tif", sep=""))
# Plot the NDVI
plot(NDVI)
library(fs)
#dir_info(path) # Print out all files in the folder
# List all files with an extension .tif
files<-list.files(path=path,full.names =T, pattern="tif$")
# Stack bands together
layer<-stack(files)
# Plot all bands
plot(layer)
# Reproject raster layer
library(dplyr)
NDVI_UTM<- NDVI %>% projectRaster(., crs = "+proj=utm +zone=48 +datum=WGS84 +units=m +no_defs")
NDVI_UTM
class : RasterLayer
dimensions : 418, 444, 185592 (nrow, ncol, ncell)
resolution : 28, 29.8 (x, y)
extent : 564562, 576994, 2329417, 2341873 (xmin, xmax, ymin, ymax)
crs : +proj=utm +zone=48 +datum=WGS84 +units=m +no_defs +ellps=WGS84 +towgs84=0,0,0
source : memory
names : NDVI
values : -0.1422169, 0.5922366 (min, max)
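If you want to keep the reprojected layer for later use, it can be saved with writeRaster (the output file name below is only an example):
#writeRaster(NDVI_UTM, filename="NDVI_UTM.tif", overwrite=TRUE) # Optionally save the reprojected raster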
# Read mosquito sampling locations
library(sf)
shape<-st_read(paste(path,"\\","Mosquito.shp",sep=""))
Reading layer `Mosquito' from data source `F:\Training_Material\Second_Training\Sub_study\Mosquito.shp' using driver `ESRI Shapefile'
Simple feature collection with 87 features and 5 fields
geometry type: MULTIPOINT
dimension: XY
bbox: xmin: 105.6319 ymin: 21.0775 xmax: 105.7217 ymax: 21.14083
geographic CRS: WGS 84
shape %>% st_geometry() %>% plot() # Plot only the point geometry
#library(raster)
VN<-getData(name="GADM",country="Vietnam",level=1) # GADM levels for Vietnam: 0 = country, 1 = provinces, 2 = districts, 3 = communes
VN_sf<-st_as_sf(VN) # Convert sp class to sf class
plot(VN, main="Vietnam Map") # Plot Vietnam Map
plot(VN_sf) # Plot all attribute columns
HN <- VN_sf %>% filter(VARNAME_1=="Ha Noi")
HN %>% st_geometry() %>% plot(col=3, main="Hanoi Map")
VN2<-getData(name="GADM",country="Vietnam",level=2) # Level 2 = districts
VN2sf<-st_as_sf(VN2) %>% st_transform(st_crs(HN)) # Convert sp class to sf and transform to the CRS of HN (for demonstration only)
# Clip to Hanoi boundary
HN_dist<- VN2sf %>% filter(st_contains(HN,., sparse = F))
although coordinates are longitude/latitude, st_contains assumes that they are planar
HN_dist %>% st_geometry() %>% plot(main="Hanoi District",col=rainbow(30))
# The default CRS of GADM data is geographic coordinates (EPSG:4326)
st_crs(HN_dist)$proj4string
[1] "+proj=longlat +ellps=WGS84 +towgs84=0,0,0,0,0,0,0 +no_defs"
# Convert longlat to UTM
HN_UTM<-HN_dist %>% st_transform(., crs=32648) # 32648 is the EPSG code for UTM Zone 48N, which covers Vietnam
st_crs(HN_UTM)$proj4string
[1] "+proj=utm +zone=48 +datum=WGS84 +units=m +no_defs"
# Get Rainfall data
Rainfall<-getData(name = "worldclim",var="prec",res=2.5,lon=102,lat=22)
# Clipping Vietnam precipitation
crop_VN<-crop(Rainfall,extent(VN)) # Crop the raster to the extent of the shapefile
VN_rain<-mask(crop_VN, VN) # Mask out or clip
plot(VN_rain)
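The worldclim precipitation data come as 12 monthly layers; as an optional illustration (assuming the layers are ordered January to December), a single month can be plotted on its own:
plot(VN_rain[[8]], main="August precipitation") # Assumes the 8th layer corresponds to August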
# Extract values from raster stack layers
# shapefile() comes from the raster package (already loaded)
points<-shapefile("E:\\Python_Tutorials\\HuSuk_Data\\Step1\\Step2\\Mos_Point.shp")
Value<-raster::extract(layer,points,df=T)
Value<-Value[,c(2:ncol(Value))]
library("PerformanceAnalytics")
chart.Correlation(Value, histogram=TRUE, pch=19)
mlayer<-layer[[c(1,2,3,5,6,7,9)]] # Keep a subset of layers (drop layers 4 and 8)
Value<-raster::extract(mlayer,points,df=T)
Value<-Value[,c(2:ncol(Value))]
library("PerformanceAnalytics")
chart.Correlation(Value, histogram=TRUE, pch=19)
Final<-cbind(Count=points$Counts,Value)
Final$Count<-as.integer(Final$Count)
head(Final)
library(caTools)
library(MASS)
library(caret)
set.seed(123)
sampling = sample.split(Final$Count, SplitRatio = 0.8)
train = subset(Final, sampling == TRUE)
test = subset(Final, sampling == FALSE)
NBR<-glm.nb(Count~August_Rain + Pop + Rice + August_Temp+ Forest+ Water, data=train)
glm.fit: algorithm did not converge (warning repeated many times)
alternation limit reached
summary(NBR)
Call:
glm.nb(formula = Count ~ August_Rain + Pop + Rice + August_Temp +
Forest + Water, data = train, init.theta = 0.3827343932,
link = log)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.9007 -1.0622 -0.7856 -0.1473 3.1336
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -18.12144 10.87364 -1.667 0.0956 .
August_Rain 0.03937 0.01681 2.342 0.0192 *
Pop -0.01286 0.01291 -0.996 0.3191
Rice -1.37881 1.26945 -1.086 0.2774
August_Temp 0.05854 0.17728 0.330 0.7413
Forest 4.44428 4.88246 0.910 0.3627
Water -4.80263 5.78660 -0.830 0.4066
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for Negative Binomial(0.3694) family taken to be 1)
Null deviance: 95.651 on 72 degrees of freedom
Residual deviance: 86.264 on 66 degrees of freedom
AIC: 558.13
Number of Fisher Scoring iterations: 25
Theta: 0.3827
Std. Err.: 0.0597
Warning while fitting theta: alternation limit reached
2 x log-likelihood: -542.1320
#model_NB1<-glm(Count~., data=train_NB1, family = negative.binomial(theta = 0.34))
pre_NB1<- predict(NBR,newdata=test, type = "response") # Predict for new data
accuracy_NB1<-data.frame(R2 = R2(pre_NB1, test$Count), RMSE = RMSE(pre_NB1, test$Count), MAE = MAE(pre_NB1,test$Count))
accuracy_NB1
#Sosanh<-data.frame(cbind(Observed=test_NB1$Count,Predicted=pre_NB1))
#setwd("F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Predicted_Maps\\Final_Map")
out_temp<-predict(layer, NBR, type="response",progress="window")
Loading required namespace: tcltk
# Making a copy
tmap<-out_temp
# Set cells outside the study area (background values) to NA
tmap[tmap==tmap[1]]<-NA
no non-missing arguments to min; returning Inf
no non-missing arguments to max; returning -Inf
tmap[tmap>400]<-400 # Cap predicted values greater than 400 at 400
library(sp)
library(fields)
library(RColorBrewer)
#display.brewer.all()
#setwd("F:\\Research\\Research_Cooperation\\ILRI_Mosquito_Mapping\\Mosquito\\Predicted_Maps\\Final_Map")
#writeRaster(tmap, filename = "Risk_map_QGIS.tif", overwrite=T)
North2 <- list("SpatialPolygonsRescale", layout.north.arrow(type=2),
offset = c(105.3, 20.8), scale = 1000, fill="gray", which = 1)
clrs.spec <- colorRampPalette(rev(brewer.pal(11, "Spectral")))
map1<-spplot(tmap, zcol = "layer", col.regions = clrs.spec(180), #tim.colors()
cuts = 180, colorkey = list(space = "right"),
sp.layout=list(North2))
map1