This topic is about create a choropleth map by R. I will use the dataset about Hand-foot-and-mouth (HFM) disease of Thailand R506 surveillance system, not provided, and then create choropleth map to visualize 2022 incidence rate of HFM disease.
library(tidyverse)
library(maptools)
library(RColorBrewer)
library(classInt)
setwd("D:/Data Science/FETP")
hfm <- read.csv("hfm65.csv",skip = 8)
names(hfm)
## [1] "disease" "sex" "agey" "agem" "aged"
## [6] "marietal" "race" "race1" "occupat" "addrcode"
## [11] "metropol" "hospital" "type" "result" "hserv"
## [16] "datesick" "datedefine" "datereach" "datedeath" "organism"
## [21] "complica"
The “addrcode” variable is a value indicate specific area of
Thailand, first two digits of code indicate each province of Thailand.
So we have to extract this first two digits first.
# Extract first two digits of “addrcode”.
hfm<- hfm %>%
mutate( addrcode = abs(as.numeric(addrcode)))%>%
mutate( province = as.numeric(substr(addrcode,1,2)))
unique(hfm$province)
## [1] 10 55 22 57 50 53 30 47 34 85 90 56 58 42 60 84 37 49 16 63 45 74 86 14 76
## [26] 83 36 11 62 41 40 65 73 15 64 93 52 80 19 20 51 96 71 92 13 67 35 24 32 44
## [51] 82 38 21 81 95 31 25 23 91 43 46 12 66 70 77 18 33 48 94 54 27 17 61 39 26
## [76] 5 72 NA
hfm%>%
filter(province==5)%>%
dplyr::select(addrcode)
## addrcode
## 1 5.6e+07
## 2 5.6e+07
## 3 5.6e+07
## 4 5.6e+07
## 5 5.6e+07
## 6 5.6e+07
## 7 5.6e+07
## 8 5.6e+07
## 9 5.6e+07
## 10 5.6e+07
## 11 5.6e+07
## 12 5.6e+07
## 13 5.6e+07
We have NA due to missing data in addrcode column and we have 5 due to the miss recording 56000000 in to 5.6e+007. So we have to recode 5 to 56.
hfm <- hfm%>%
mutate(province = case_when(province == 5~56,
TRUE ~ province))
hfm_count <- hfm%>%
group_by(province)%>%
summarise(case = n())%>%
na.omit()
#na.omit mean we delete records that we do not know where the case come from.
Provicedata <- read_csv("D:/Data Science/FETP/R_spatial/province_for_R506.csv")
head(Provicedata)
## # A tibble: 6 × 14
## ...1 Province HASC ISO FIPS Reg Population Area.km._. Area.mi._.
## <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 Bangkok Metrop… TH.BM 10 TH40 C 8305218 1569 606
## 2 2 Samut Prakan TH.SP 11 TH42 C 1828694 1004 388
## 3 3 Nonthaburi TH.NO 12 TH38 C 1334083 622 240
## 4 4 Pathum Thani TH.PT 13 TH39 C 1327147 1526 589
## 5 5 Phra Nakhon Si… TH.PA 14 TH36 C 870671 2557 987
## 6 6 Ang Thong TH.AT 15 TH35 C 254292 968 374
## # … with 5 more variables: oldFileProvinceName <chr>, province <dbl>,
## # MOPH_Admin_Code <dbl>, ODPC_Code <dbl>, alt_prov_name <chr>
This data set provide a code of each province in Thailand and its basic information i.e. total area, total population.
And calculate incidence per 100,000 population.
hfm_map <-hfm_count%>%
left_join(Provicedata, by = c("province"="province"))%>%
dplyr::select(province, case, Population)%>%
mutate(incidence= case*100000/Population)
hfm_map%>%
group_by(province)%>%
arrange(desc(incidence))
## # A tibble: 76 × 4
## # Groups: province [76]
## province case Population incidence
## <dbl> <int> <dbl> <dbl>
## 1 33 1731 1055980 164.
## 2 91 324 274863 118.
## 3 65 973 912827 107.
## 4 93 405 480976 84.2
## 5 43 379 458772 82.6
## 6 32 834 1122900 74.3
## 7 34 1236 1746790 70.8
## 8 67 595 940076 63.3
## 9 66 312 548242 56.9
## 10 57 647 1172928 55.2
## # … with 66 more rows
Top 3 HFM incidence is province number 33, 91 and 65.
By using ‘readShapeSpatial()’ of ‘maptools’ package.
path <-"D:/Data Science/FETP/R_spatial/province/tha_admbnda_adm1_rtsd_20220121.shp"
countrymap_1 <- readShapeSpatial(path)
countrymap_1@data$ADM1_PCODE
## [1] TH10 TH11 TH12 TH13 TH14 TH15 TH16 TH17 TH18 TH19 TH20 TH21 TH22 TH23 TH24
## [16] TH25 TH26 TH27 TH30 TH31 TH32 TH33 TH34 TH35 TH36 TH37 TH38 TH39 TH40 TH41
## [31] TH42 TH43 TH44 TH45 TH46 TH47 TH48 TH49 TH50 TH51 TH52 TH53 TH54 TH55 TH56
## [46] TH57 TH58 TH60 TH61 TH62 TH63 TH64 TH65 TH66 TH67 TH70 TH71 TH72 TH73 TH74
## [61] TH75 TH76 TH77 TH80 TH81 TH82 TH83 TH84 TH85 TH86 TH90 TH91 TH92 TH93 TH94
## [76] TH95 TH96
## 77 Levels: TH10 TH11 TH12 TH13 TH14 TH15 TH16 TH17 TH18 TH19 TH20 TH21 ... TH96
There is different about province code in shape file that begin with ‘TH’.
#create "ADM1_PCODE" for our dataset before merge.
#add 'TH' before province code.
hfm_map<-hfm_map%>%
mutate(ADM1_PCODE = paste("TH", province, sep = ""))
#merge now
countrymap_1@data <- merge(countrymap_1@data,hfm_map,
by.x = "ADM1_PCODE",
by.y = "ADM1_PCODE",all.x = T)
summary(hfm_map$incidence)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1183 11.3091 24.3860 29.9632 33.6433 163.9236
#So use 0, 10, 20, 30 and more as reference of each interval.
interval <- c(0, 10, 20, 30, Inf )
n_interval <- length(interval)-1 #Define number of interval.
Create color palette for each interval by ‘brewer.pla()’ of ‘RColorBrewer’ package.
Ccolors <- (RColorBrewer::brewer.pal(n_interval, "YlGn"))
#Change colors here by define palette in " ".
By ‘classIntervals()’ from ‘classInt’ package.
incidence_interval_1<-classIntervals(countrymap_1@data$incidence,
n=n_interval,
#n mean number of interval we want.
style="fixed",
fixedBreaks=interval,
#fixedBreaks mean cut point of each interval.
dataPrecision = 3)
Then assign color for each incidence interval with ‘findColours()’.
colcode <- findColours(incidence_interval_1,Ccolors)
#Plot choropleth map
plot(countrymap_1, col=colcode, lwd =1, fin = c(5,6))
#add title
title("Incidence of HFM diseasce in Thiland during Jan to Aug 2022")
#Create legend
par("usr")
## [1] 86.871245 116.109078 5.018956 21.059155
#Use par("usr") to find available x-y coordination of the plot for adjusting legend position.
legend(95,13,
legend=c("> 30","20-30","10-20","0-10"),
rev(names(attr(colcode, "table"))),
fill=rev(attr(colcode, "palette")),
cex=0.7, bty="n", y.intersp = 1)