library(dplyr)
library(data.table)
library(ggplot2)
library(RColorBrewer)
library(rworldmap)
# Point the session at the folder holding the Kaggle Zika data so the relative
# file name used by read.csv() resolves.
# NOTE(review): this absolute path is machine-specific; anyone else running the
# script has to adjust it.
setwd("C:/Data Science Fundation with R/Kraggle/zika-virus-epidemic")
# List the working directory that was just set, instead of repeating the path
# literal (the two copies could otherwise drift apart).
list.files()
[1] "cdc_zika.csv" "examples"
[3] "Rplot.png" "Rplot01.png"
[5] "Rplot02.png" "Rplot03.png"
[7] "Rplot04.png" "Rplot05.png"
[9] "Rplot06.png" "Rplot07.png"
[11] "Rplot08.png" "Rplot09.png"
[13] "Rplot10.png" "Rplot11.png"
[15] "Rplot12.png" "Rplot13.png"
[17] "Rplot14.png" "rsconnect"
[19] "Zika-ZF Yi.r" "Zika-ZF Yi.Rmd"
[21] "Zika Virus Data Visulization_ R Notebook -ZF Yi.nb.html" "Zika Virus Data Visulization_ R Notebook -ZF Yi.Rmd"
# Load the CDC Zika report table from the working directory and convert it to a
# data.table so columns can be added by reference.
zika <- read.csv("cdc_zika.csv", header = TRUE, fill = TRUE, row.names = NULL)
zika <- data.table(zika)
# `location` is a "-"-separated string whose first piece is the country and
# whose second piece (when present) is the province. Some locations split into
# more than two pieces, so keep only the first two; without `keep` the
# assignment receives more values than target columns. Locations without a
# "-" get Province = NA. The trailing [] prints the updated table.
zika[, c("Country", "Province") :=
       tstrsplit(location, "-", fixed = TRUE, keep = 1:2)][]
Warning in `[.data.table`(zika, , `:=`(c("Country", "Province"), tstrsplit(location, :
Supplied 2 columns to be assigned a list (length 4) of values (2 unused)
report_date location location_type data_field data_field_code
1: 3/19/2016 Argentina-Buenos_Aires province cumulative_confirmed_local_cases AR0001
2: 3/19/2016 Argentina-Buenos_Aires province cumulative_probable_local_cases AR0002
3: 3/19/2016 Argentina-Buenos_Aires province cumulative_confirmed_imported_cases AR0003
4: 3/19/2016 Argentina-Buenos_Aires province cumulative_probable_imported_cases AR0004
5: 3/19/2016 Argentina-Buenos_Aires province cumulative_cases_under_study AR0005
---
107615: 6/28/2016 United_States_Virgin_Islands territory confirmed_conjunctivitis VI0017
107616: 6/28/2016 United_States_Virgin_Islands territory confirmed_eyepain VI0018
107617: 6/28/2016 United_States_Virgin_Islands territory confirmed_headache VI0019
107618: 6/28/2016 United_States_Virgin_Islands territory confirmed_malaise VI0020
107619: 6/28/2016 United_States_Virgin_Islands territory zika_no_specimen VI0021
time_period time_period_type value unit Country Province
1: NA NA 0 cases Argentina Buenos_Aires
2: NA NA 0 cases Argentina Buenos_Aires
3: NA NA 2 cases Argentina Buenos_Aires
4: NA NA 1 cases Argentina Buenos_Aires
5: NA NA 127 cases Argentina Buenos_Aires
---
107615: NA NA 7 cases United_States_Virgin_Islands NA
107616: NA NA 13 cases United_States_Virgin_Islands NA
107617: NA NA 14 cases United_States_Virgin_Islands NA
107618: NA NA 5 cases United_States_Virgin_Islands NA
107619: NA NA 2 cases United_States_Virgin_Islands NA
# Parse the month/day/year report dates and derive the report year from them.
zika$report_date <- as.Date(zika$report_date, "%m/%d/%Y")
zika$Year <- as.numeric(format(zika$report_date, format = "%Y"))

# Count report rows per Country and Year. Rows whose date could not be parsed
# have Year = NA and are dropped. Note that `cases` is the number of rows
# (observations), not the sum of the `value` column.
ZikaOc <- zika %>%
  filter(!is.na(Year)) %>%
  group_by(Country, Year) %>%
  summarise(cases = n()) %>%
  ungroup()

# Horizontal bar chart of the row counts per country, one panel per year.
ggplot(ZikaOc, aes(x = Country, y = cases)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~Year)
Globally reported Zika cases (107,619 observations in total) occurred mainly in 12 countries: Argentina (2,091 observations), Brazil (4,253), Colombia (86,889), Dominican Republic (5,716), Ecuador (796), El Salvador (1,000), Guatemala (516), Haiti (52), Mexico (2,894), Nicaragua (125), Panama (148) and the USA (2,962).
# Load rworldmap's bundled `countryExData` table into the current environment,
# then print its structure to see which columns (country name, ISO3 code, ...)
# are available for joining.
data("countryExData", package = "rworldmap", envir = environment())
str(countryExData)
'data.frame': 149 obs. of 80 variables:
$ ISO3V10 : chr "AGO" "ALB" "ARE" "ARG" ...
$ Country : chr "Angola" "Albania" "United Arab Emirates " "Argentina" ...
$ EPI_regions : chr "Sub-Saharan Africa" "Central and Eastern Europ" "Middle East and North Africa" "Latin America and Caribbe" ...
$ GEO_subregion : chr "Southern Africa" "Central Europe" "Arabian Peninsula" "South America" ...
$ Population2005 : num 15941 3130 4496 38747 3016 ...
$ GDP_capita.MRYA : num 2314 4955 22698 13652 5011 ...
$ landlock : int 0 0 0 0 1 0 1 1 1 0 ...
$ landarea : num 1251896 28346 74777 2736296 28273 ...
$ density : num 0.2 34.3 8.7 1.3 30.3 0.3 16.3 14.6 91 71.2 ...
$ EPI : num 39.5 84 64 81.8 77.8 79.8 89.4 72.2 54.7 78.4 ...
$ ENVHEALTH : num 8.9 89.3 89.8 91.1 88 99.3 98.1 76.4 37.6 98.8 ...
$ ECOSYSTEM : num 70.1 78.6 38.2 72.5 67.5 60.4 80.7 67.9 71.7 58 ...
$ ENVHEALTH.1 : num 8.9 89.3 89.8 91.1 88 99.3 98.1 76.4 37.6 98.8 ...
$ AIR_E : num 49.2 99.1 85.1 87.3 99.4 84.9 97 97.7 99.5 50.2 ...
$ WATER_E : num 61.6 96.5 27.1 74.9 28 62.5 79.9 48.5 62.8 52.3 ...
$ BIODIVERSITY : num 58.9 4 36.6 33.6 16 78.1 71.6 29 62.5 10 ...
$ PRODUCTIVE_NATURAL_RESOURCES: num 81.3 79.4 74.1 71.5 82.1 91.8 88.2 85.7 48 76.1 ...
$ CLIMATE : num 74.6 93.4 26.6 82.3 87.2 42.5 79.9 77.1 81.5 69.5 ...
$ DALY_SC : num 0 99.5 98.9 98 98.2 99.6 99.8 93 26.1 99.6 ...
$ WATER_H : num 19.8 91.3 98.8 91.3 83.3 100 100 53.6 44.7 100 ...
$ AIR_H : num 16 66.8 62.4 76.9 72.5 97.9 92.8 66.2 53.5 96 ...
$ AIR_E.1 : num 49.2 99.1 85.1 87.3 99.4 84.9 97 97.7 99.5 50.2 ...
$ WATER_E.1 : num 61.6 96.5 27.1 74.9 28 62.5 79.9 48.5 62.8 52.3 ...
$ BIODIVERSITY.1 : num 58.9 4 36.6 33.6 16 78.1 71.6 29 62.5 10 ...
$ FOREST : num 95.4 100 100 75.9 70.1 100 100 100 0 100 ...
$ FISH : num 87.3 62.5 50 58.8 NA 96.7 NA NA NA 47.4 ...
$ AGRICULTURE : num 61.3 75.6 72.3 79.9 94.2 78.7 76.4 71.4 95.9 80.8 ...
$ CLIMATE.1 : num 74.6 93.4 26.6 82.3 87.2 42.5 79.9 77.1 81.5 69.5 ...
$ ACSAT_pt : num 19.3 89.5 97.7 89.5 80.1 100 100 46.2 25.1 100 ...
$ WATSUP_pt : num 20.2 93.2 100 93.2 86.4 100 100 61 64.3 100 ...
$ DALY_pt : num 0 99.5 98.9 98 98.2 99.6 99.8 93 26.1 99.6 ...
$ INDOOR_pt : num 0 47.4 94.7 94.7 72.2 94.7 94.7 48.4 0 94.7 ...
$ PM10_pt : num 40 70.1 11.2 51.3 59 100 87.8 67 84.1 95.4 ...
$ OZONE_H_pt : num 0 99.1 100 92.4 100 100 99.2 100 99.4 99.7 ...
$ SO2_pt : num 98.4 98.5 70.2 98.8 98.8 69.9 94.4 95.4 99.3 0.6 ...
$ OZONE_E_pt : num 0 99.8 100 75.7 100 100 99.6 100 99.6 99.8 ...
$ WATQI_pt : num 29.4 93 0 76.4 31.7 75.3 59.8 31.7 25.6 59.6 ...
$ WATSTR_pt : num 98.3 90.3 100 100 100 73.4 100 100 63 100 ...
$ WATQI_GEMS.station.data : num NA 93 NA 76.4 NA 75.3 59.8 NA NA 59.6 ...
$ FORGRO_pt : num 95.4 100 100 75.9 70.1 100 100 100 0 100 ...
$ CRI_pt : num 99.7 5.5 100 39.8 37.7 86.1 80.1 46.2 84.1 9.6 ...
$ EFFCON_pt : num 95.7 1.6 2.3 33.9 10.4 79 63 11.9 40.9 11.5 ...
$ AZE_pt : num 0 NA NA 40 0 69.4 NA NA NA NA ...
$ MPAEEZ_pt : num 14 6 1 2 100 78 100 100 100 0 ...
$ EEZTD_pt : num 74.5 25.1 0 17.5 NA 93.5 NA NA NA 0 ...
$ MTI_pt : num 100 100 100 100 NA 100 NA NA NA 94.9 ...
$ IRRSTR_pt : num 97.5 100 51.8 74.6 97 50.7 100 82.9 100 100 ...
$ AGINT_pt : num 100 90.2 100 78.4 94.5 79.6 63.2 91.1 92 87.1 ...
$ AGSUB_pt : num 100 100 100 100 100 99.9 22.8 100 100 22.8 ...
$ BURNED_pt : num 0 78.9 96.1 55.7 79.5 63.3 96 78.4 87.7 98.6 ...
$ PEST_pt : num 9.1 9.1 13.6 90.9 100 100 100 4.5 100 95.5 ...
$ GHGCAP_pt : num 65.8 98.8 38.6 87.1 98 45.4 81.6 88.7 94 77.7 ...
$ CO2IND_pt : num 95 85 32.1 92.7 78.3 76.2 82.3 97.1 100 59.7 ...
$ CO2KWH_pt : num 63 96.3 9 67 85.1 5.9 75.7 45.6 50.5 71.1 ...
$ ACSAT : num 31 91 98 91 83 100 100 54 36 100 ...
$ WATSUP : num 53 96 100 96 92 100 100 77 79 100 ...
$ DALY : num 109 0.3 0.6 1.1 1 0.2 0.1 3.9 41 0.2 ...
$ INDOOR : num 95 50 5 5 26.4 5 5 49 95 5 ...
$ PM10 : num 91.4 55.5 125.6 77.9 68.7 ...
$ OZONE_H : num 4948.8 15.8 0 140.4 0 ...
$ SO2 : num 0.7 0.6 12.6 0.5 0.5 12.7 2.4 1.9 0.3 41.9 ...
$ OZONE_E : num 1.36e+09 6.81e+05 2.63e+01 9.96e+07 0.00 ...
$ WATQI : num 57.5 95.8 39.9 85.8 58.9 85.2 75.9 58.9 55.3 75.7 ...
$ WATQI_GEMS.station.data.1 : num NA 95.8 NA 85.8 NA 85.2 75.9 NA NA 75.7 ...
$ WATSTR : num 5.5 0 41.6 24.1 68.6 45.7 0 31.4 0 49.8 ...
$ FORGRO : num 1 1 1 0.9 0.9 1 1.1 1 0.6 1.1 ...
$ CRI : num 0.5 0 0.5 0.2 0.2 0.4 0.4 0.2 0.4 0 ...
$ EFFCON : num 9.6 0.2 0.2 3.4 1 7.9 6.3 1.2 4.1 1.2 ...
$ AZE : num 0 NA NA 40 0 69.4 NA NA NA NA ...
$ MPAEEZ : num 1.4 0.6 0.1 0.2 10 7.8 10 10 10 0 ...
$ EEZTD : num 0.255 0.749 1 0.825 NA ...
$ MTI : num 0.0016 0 0.0034 0.0044 NA 0.0014 NA NA NA -0.001 ...
$ IRRSTR : num 2.2 0 41 21.6 2.5 41.9 0 14.6 0 0 ...
$ AGINT : num 0 6.2 0 13.7 3.5 12.9 23.3 5.6 5.1 8.2 ...
$ AGSUB : num 0 0 0 0 0 0 36 0 0 36 ...
$ BURNED : num 15.3 2.9 0.5 6 2.8 5 0.5 2.9 1.7 0.2 ...
$ PEST : num 2 2 3 20 22 22 22 1 22 21 ...
$ GHGCAP : num 20 2.9 34.1 8.9 3.3 30.5 11.8 8.1 5.3 13.8 ...
$ CO2IND : num 1.2 1.9 5.5 1.4 2.3 2.5 2.1 1.1 0.8 3.6 ...
$ CO2KWH : num 343 34 844 306 138 873 225 505 459 268 ...
# Build the per-country table that feeds the world map.
#
# ZikaOc holds one row per Country-Year, but the map shows a single value per
# country, so the yearly counts are summed first; otherwise duplicate country
# codes reach joinCountryData2Map().
zika_totals <- aggregate(cases ~ Country, data = ZikaOc, FUN = sum)

# Zika country names separate words with "_" (e.g. "United_States"), while
# countryExData uses spaces and some of its names carry trailing blanks.
# Normalise both sides so multi-word countries are not silently dropped by
# the merge.
zika_totals$Country <- gsub("_", " ", zika_totals$Country, fixed = TRUE)
country_lookup <- countryExData
country_lookup$Country <- trimws(country_lookup$Country)

# Attach the ISO3 codes (column ISO3V10) by country name, then join onto the
# map polygons by ISO3 code.
Test <- merge(country_lookup, zika_totals, by = "Country")
sPDF <- joinCountryData2Map(
  Test,
  joinCode = "ISO3",
  nameJoinColumn = "ISO3V10"
)
11 codes from your data successfully matched countries in the map
0 codes from your data failed to match with a country code in the map
234 codes from the map weren't represented in your data
# Open a plotting window shaped for a world map.
mapDevice()
# Colour each joined country by its `cases` column and add a legend.
mapCountryData(
  sPDF,
  nameColumnToPlot = "cases",
  addLegend = TRUE,
  mapTitle = "Global Zika virus epidemic cases from 2015 to 2016"
)
# Alternative bubble-map rendering, currently disabled:
#mapBubbles(sPDF,nameZSize="cases",nameZColour="Country",colourPalette="PuBuGn",oceanCol="lightblue",landCol="wheat")
Zika virus epidemic cases occurred mainly in North and South America in 2015 and 2016.
# Return the rows of `zika` whose `location` starts with the given country
# name. The pattern is anchored at the start because an unanchored match also
# hits the name anywhere inside another location (a location such as
# "United_States-New_Mexico" would otherwise land in the Mexico subset).
rows_for_country <- function(country_prefix) {
  zika[grepl(paste0("^", country_prefix), zika$location), ]
}

# One subset per country. The "United_States" prefix deliberately still
# matches "United_States_Virgin_Islands", as the original pattern did.
USA <- rows_for_country("United_States")
Mexico <- rows_for_country("Mexico")
Panama <- rows_for_country("Panama")
Nicaragua <- rows_for_country("Nicaragua")
Haiti <- rows_for_country("Haiti")
Guatemala <- rows_for_country("Guatemala")
El_salvador <- rows_for_country("El_Salvador")
Ecuador <- rows_for_country("Ecuador")
Dominican_republic <- rows_for_country("Dominican_Republic")
Colombia <- rows_for_country("Colombia")
Argentina <- rows_for_country("Argentina")
Brazil <- rows_for_country("Brazil")
# Count the report rows per data_field and Year for one country's subset.
# Returns an ungrouped table with columns data_field, Year and Cases, where
# Cases is the number of rows (not the sum of `value`).
count_by_field_year <- function(country_rows) {
  country_rows %>%
    group_by(data_field, Year) %>%
    summarise(Cases = n()) %>%
    ungroup()
}

# Horizontal bar chart of the per-field row counts, one panel per year.
# `country_label` is the prefix of the count-axis title. The former
# scale_fill_hue() layer was dropped: no fill aesthetic is mapped, so it had
# no effect on the plot.
plot_field_counts <- function(field_counts, country_label) {
  ggplot(field_counts, aes(x = data_field, y = Cases)) +
    geom_bar(stat = "identity", colour = "white") +
    facet_wrap(~ Year) +
    coord_flip() +
    labs(
      y = paste0(country_label, ": Reported Zika cases"),
      x = "cases types"
    )
}

# Per-country summaries and plots; the g_* tables and g1..g12 plot objects
# keep their original names.
g_USA <- count_by_field_year(USA)
g1 <- plot_field_counts(g_USA, "USA")

g_Mexico <- count_by_field_year(Mexico)
g2 <- plot_field_counts(g_Mexico, "Mexico")

g_Panama <- count_by_field_year(Panama)
g3 <- plot_field_counts(g_Panama, "Panama")

g_Nicaragua <- count_by_field_year(Nicaragua)
g4 <- plot_field_counts(g_Nicaragua, "Nicaragua")

g_Haiti <- count_by_field_year(Haiti)
g5 <- plot_field_counts(g_Haiti, "Haiti")

g_Guatemala <- count_by_field_year(Guatemala)
g6 <- plot_field_counts(g_Guatemala, "Guatemala")

g_El_salvador <- count_by_field_year(El_salvador)
g7 <- plot_field_counts(g_El_salvador, "El_salvador")

g_Ecuador <- count_by_field_year(Ecuador)
g8 <- plot_field_counts(g_Ecuador, "Ecuador")

g_Dominican_republic <- count_by_field_year(Dominican_republic)
g9 <- plot_field_counts(g_Dominican_republic, "Dominican_republic")

g_Colombia <- count_by_field_year(Colombia)
g10 <- plot_field_counts(g_Colombia, "Colombia")

g_Argentina <- count_by_field_year(Argentina)
g11 <- plot_field_counts(g_Argentina, "Argentina")

g_Brazil <- count_by_field_year(Brazil)
g12 <- plot_field_counts(g_Brazil, "Brazil")
# Render the twelve country plots in order (g1 = USA ... g12 = Brazil).
# Author's note on g1: in the USA data most Zika reports are travel-associated
# or local cases; the local cases were reported mainly from Puerto Rico,
# New York, Florida and the Virgin Islands.
invisible(lapply(
  list(g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12),
  print
))