H2L2C Summer 2023 Yijun Dengue

1. Load the packages used in this analysis

#install.packages("tidyverse")
#install.packages("ggplot2")
library(tidyverse)
library(readxl)
library(ggplot2)

Load Data

# Read the combined dengue patient metadata (tab-separated, with a header row)
df <- read.delim(
  file = "20230727_dengue_patient_metadata_combined.tsv",
  header = TRUE,
  sep = "\t"
)

Examine Data

# List the column names of the metadata table
names(df)

##  [1] "Virus.name"                      "Serotype"                       
##  [3] "Genotype"                        "Accession.ID"                   
##  [5] "Collection.date"                 "Location"                       
##  [7] "Host"                            "Additional.location.information"
##  [9] "Sampling.strategy"               "Gender"                         
## [11] "Patient.age"                     "Patient.status"                 
## [13] "Last.vaccinated"                 "Last.vaccination.date"          
## [15] "Passage"                         "Specimen"                       
## [17] "Additional.host.information"     "AA.Substitutions"

# Count records per serotype
table(df[["Serotype"]])

## 
## DENV1 DENV2 DENV3 DENV4 
## 16426 14307  7932  3962

# Distinct genotype labels present in the data (NA included if present)
unique(df[["Genotype"]])

##  [1] "V"                  "II"                 "IV-AsianII"        
##  [4] "I"                  "II-Cosmopolitan"    NA                  
##  [7] "III"                "IV"                 "V-AsianI"          
## [10] "III-Asian-American" "I-American"         "unassigned"        
## [13] "Others"

# Number of distinct virus names: count the first occurrence of each value
sum(!duplicated(df[["Virus.name"]]))

## [1] 42621

# Location: number of distinct location strings
length(unique(df[["Location"]]))

## [1] 1190

# Tally records per location and order the tally from most to least frequent
sorted_location_counts <- sort(
  table(df[["Location"]]),
  decreasing = TRUE
)

# Show the six most frequent locations (six is head()'s default)
head(sorted_location_counts, n = 6L)

## 
##         Asia / Singapore          Asia / Thailand             Asia / India 
##                     4048                     3926                     2757 
##           Asia / Vietnam Asia / China / Guangdong            Asia / Taiwan 
##                     2701                     2548                     1882

# Grab the names of the top 5 locations from the frequency-sorted tally.
# head() is used rather than [1:5] so that a tally with fewer than five
# locations cannot pad the vector with NA, which `%in%` below would then
# match against rows whose Location is missing.
top_5_locations <- head(names(sorted_location_counts), 5)

# Subset df to only the rows recorded at one of the top 5 locations
top5df <- df[df$Location %in% top_5_locations, ]

Plot Data

# Shared text styling for the bar plots, wrapped in a list so it can be
# appended to a plot with `+`: bold title and axis tick labels, enlarged
# axis titles, and small navy-blue x tick labels.
gghisto <- list(
  theme(
    plot.title = element_text(face = "bold", size = 17),
    axis.title = element_text(size = 17),
    axis.text.x = element_text(color = "Navyblue", size = 8, face = "bold"),
    axis.text.y = element_text(size = 14, face = "bold")
  )
)

# Stacked bar plot: one bar per genotype, segmented (filled) by serotype.
# The x-axis label names the variable actually mapped to x (Genotype).
ggplot(df, aes(x = Genotype, fill = Serotype)) +
  geom_bar(position = "stack", color = "black") +
  labs(
    title = "Stacked Bar Plot of Serotype and Genotype",
    x = "Genotype",
    y = "Count"
  ) +
  theme(panel.background = element_rect(fill = "lightblue")) +
  gghisto

# Grouped bar plot: serotypes along the x axis, with one side-by-side bar
# per genotype within each serotype.
ggplot(data = df, mapping = aes(fill = Genotype, x = Serotype)) +
  geom_bar(position = "dodge") +
  labs(
    x = "Serotype",
    y = "Count",
    title = "Grouped Bar Plot of Serotype and Genotype"
  )

Plot by locations

# Grouped bar plot for the top-5-location subset: locations along the x axis,
# with one side-by-side bar per serotype within each location.
# Title and x-axis label describe the variables actually plotted
# (Location and Serotype); Genotype is not used in this plot.
ggplot(top5df, aes(x = Location, fill = Serotype)) +
  geom_bar(position = "dodge", color = "black") +
  labs(
    title = "Grouped Bar Plot of Serotype by Location",
    x = "Location",
    y = "Count"
  ) +
  theme(panel.background = element_rect(fill = "lightblue")) +
  gghisto