#install.packages("tidyverse")
#install.packages("ggplot2")
library(tidyverse)
library(readxl)
library(ggplot2)
# Load Data ----
# Read the tab-separated patient metadata file into a data frame; the first
# row of the file supplies the column names.
df <- read.delim(
  file = "20230727_dengue_patient_metadata_combined.tsv",
  header = TRUE,
  sep = "\t"
)
# Examine Data ----
# Column names of the metadata table.
names(df)
## [1] "Virus.name" "Serotype"
## [3] "Genotype" "Accession.ID"
## [5] "Collection.date" "Location"
## [7] "Host" "Additional.location.information"
## [9] "Sampling.strategy" "Gender"
## [11] "Patient.age" "Patient.status"
## [13] "Last.vaccinated" "Last.vaccination.date"
## [15] "Passage" "Specimen"
## [17] "Additional.host.information" "AA.Substitutions"
# Number of records per serotype.
table(df[["Serotype"]])
##
## DENV1 DENV2 DENV3 DENV4
## 16426 14307 7932 3962
# Distinct genotype labels present in the data (NA appears as its own value).
unique(df[["Genotype"]])
## [1] "V" "II" "IV-AsianII"
## [4] "I" "II-Cosmopolitan" NA
## [7] "III" "IV" "V-AsianI"
## [10] "III-Asian-American" "I-American" "unassigned"
## [13] "Others"
# Number of distinct virus names.
length(unique(df[["Virus.name"]]))
## [1] 42621
# location
# Number of distinct values in the Location column.
length(unique(df$Location))
## [1] 1190
# Record counts per location, most frequent first. table() leaves NA out of
# the counts by default, so missing locations never rank here.
sorted_location_counts <- sort(table(df$Location), decreasing = TRUE)
head(sorted_location_counts)
##
## Asia / Singapore Asia / Thailand Asia / India
## 4048 3926 2757
## Asia / Vietnam Asia / China / Guangdong Asia / Taiwan
## 2701 2548 1882
# Grab the top 5 locations. head() returns at most 5 names, whereas `[1:5]`
# pads the result with NA when fewer than 5 locations exist; an NA entry
# would make the %in% filter below also keep rows whose Location is missing.
top_5_locations <- head(names(sorted_location_counts), 5)
# Subset df to only the rows recorded at one of the top 5 locations.
top5df <- df[df$Location %in% top_5_locations, ]
# Plot Data ----
# Shared text styling for the bar plots, kept in a list so it can be appended
# to a ggplot with `+`: small bold navy x tick labels, larger bold y tick
# labels, and size-17 axis titles and (bold) plot title.
gghisto <- list(
  theme(
    axis.text.x = element_text(size = 8, face = "bold", color = "Navyblue"),
    axis.text.y = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 17),
    plot.title = element_text(size = 17, face = "bold")
  )
)
# Stacked bar chart: one bar per genotype, with segments filled by serotype.
ggplot(df, aes(x = Genotype, fill = Serotype)) +
  geom_bar(position = "stack", color = "black") +
  # The x aesthetic is Genotype, so the axis is labelled "Genotype"
  # (it was previously mislabelled "Serotype").
  labs(
    title = "Stacked Bar Plot of Serotype and Genotype",
    x = "Genotype",
    y = "Count"
  ) +
  theme(panel.background = element_rect(fill = "lightblue")) +
  gghisto
# Grouped bar chart: serotypes along the x axis, with one side-by-side bar
# per genotype inside each serotype.
ggplot(data = df, mapping = aes(x = Serotype, fill = Genotype)) +
  geom_bar(position = "dodge") +
  ggtitle("Grouped Bar Plot of Serotype and Genotype") +
  xlab("Serotype") +
  ylab("Count")
# Plot by locations ----
# Grouped bar chart for the top-5 subset: locations along the x axis, with
# one side-by-side bar per serotype inside each location.
ggplot(top5df, aes(x = Location, fill = Serotype)) +
  geom_bar(position = "dodge", color = "black") +
  # Title and x label describe what is actually mapped (Location on x,
  # Serotype as fill); they were previously copied from the
  # Serotype/Genotype plot.
  labs(
    title = "Grouped Bar Plot of Serotype by Location",
    x = "Location",
    y = "Count"
  ) +
  theme(panel.background = element_rect(fill = "lightblue")) +
  gghisto