#Working directory
#setwd("~/UConn Courses/Big data science for biologists/Final project/Project")
library(readxl)
library(readr)
# Load the 1st dataset: Biofilm-associated multidrug-resistant and
# methicillin-resistant staphylococcus aureus infections in children.
# The workbook has 25 columns; column 7 and columns 22-24 are read as
# numeric, every other column as text.
mrsa_nepal <- read_excel(
  "Project/MRSA_Nepal/Data.xlsx",
  col_types = c(
    rep("text", 6), "numeric",
    rep("text", 14),
    rep("numeric", 3), "text"
  )
)
# View() needs an interactive session with a data viewer; guard it so the
# script still runs under Rscript or when knitting on a headless machine.
if (interactive()) {
  View(mrsa_nepal)
}
# Load the 2nd dataset (MRSA India CSV).
# NOTE(review): the previous comment repeated the 1st dataset's title here;
# confirm the actual source title of the India dataset.
# The three coded columns (age group, gender, MRSA type) are forced to
# double; all other column types are left for readr to guess.
mrsa_india <- read_csv(
  "Project/MRSA_India/MRSA_DATA__CSV_file.csv",
  col_types = cols(
    `AGEGROUP<1Y=1,1-3=2,4-6=3,>6=4` = col_double(),
    `GENDER MALE=1,FEMALE=2` = col_double(),
    `TYPEMRSA CA=1,HA=2` = col_double()
  )
)
# View() needs an interactive session with a data viewer; guard it so the
# script still runs under Rscript or when knitting on a headless machine.
if (interactive()) {
  View(mrsa_india)
}
# Replace every space in the column names with an underscore; the rest of
# the script refers to columns by these names (e.g. Isolate_number).
colnames(mrsa_nepal) <- gsub(" ", "_", colnames(mrsa_nepal))
# Check whether each raw isolate ID is unique (TRUE would mean a repeat).
duplicates <- duplicated(mrsa_nepal$Isolate_number)
any(duplicates)
## [1] FALSE
# Turn the isolate ID into a number by stripping every non-digit character.
mrsa_nepal$Isolate_number <- as.numeric(
  gsub("\\D", "", mrsa_nepal$Isolate_number)
)
# The uniqueness check above ran on the raw IDs. Stripping non-digits can
# merge IDs that differed only in their non-digit part, and an ID with no
# digits silently becomes NA, so re-check the converted values.
if (anyNA(mrsa_nepal$Isolate_number) ||
  anyDuplicated(mrsa_nepal$Isolate_number) > 0) {
  warning(
    "Isolate_number contains NA or duplicated values after conversion ",
    "to numeric.",
    call. = FALSE
  )
}
# Sort rows by isolate number
library (dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Order the rows by ascending isolate number. The column is referenced by
# its bare name so dplyr resolves it inside the piped data frame rather than
# reaching back to the global `mrsa_nepal` object.
mrsa_nepal <- mrsa_nepal %>%
  arrange(Isolate_number)
library(ggplot2)
# Violin plot of Total_Resistant by Gender: light-grey violins, a diamond
# (shape 23) at each group's mean, and the individual observations drawn as
# centered dots filled by Gender.
ggplot(
  data = mrsa_nepal,
  aes(x = Gender, y = Total_Resistant, fill = Gender)
) +
  geom_violin(trim = FALSE, fill = "lightgrey") +
  labs(
    x = "Gender",
    y = "Total Resistant",
    title = "Violin Plot of Total Resistant by Gender"
  ) +
  # `fun` replaces `fun.y`, which is deprecated as of ggplot2 3.3.0.
  stat_summary(fun = mean, geom = "point", shape = 23, size = 3) +
  geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 1) +
  theme_minimal()
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
# Store Children_category as a factor with an explicit level order so the
# categories appear in this order (rather than alphabetically) in the plots.
mrsa_nepal[["Children_category"]] <- factor(
  x = mrsa_nepal[["Children_category"]],
  levels = c("Neonate", "Infant", "Toddler", "Pre-school", "School")
)
# Violin plot of Total_Resistant by Children_category: violins and dots are
# filled per category, and a diamond (shape 23) outlined in the category
# color marks each group's mean.
ggplot(
  data = mrsa_nepal,
  aes(x = Children_category, y = Total_Resistant, fill = Children_category)
) +
  geom_violin(trim = FALSE) +
  labs(
    x = "Children Category",
    y = "Total Resistant",
    title = "Violin Plot of Total Resistant by Children Category"
  ) +
  # `fun` replaces `fun.y`, which is deprecated as of ggplot2 3.3.0.
  stat_summary(
    fun = mean, geom = "point", shape = 23, size = 3,
    aes(color = Children_category)
  ) +
  # The fill mapping is inherited from ggplot(), so it is not repeated here.
  geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 1) +
  theme_minimal()
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Violin Plot of Total Resistant by Gender
# Same plot, additionally split by gender.
# Violin plot with Children_category on the x-axis (also used as the fill)
# and one facet panel per Gender; a diamond (shape 23) marks each group's
# mean and the individual observations are drawn as centered dots.
ggplot(
  data = mrsa_nepal,
  aes(x = Children_category, y = Total_Resistant, fill = Children_category)
) +
  geom_violin(trim = FALSE) +
  labs(
    x = "Children Category",
    y = "Total Resistant",
    title = "Violin Plot of Total Resistant by Children Category and Gender"
  ) +
  # `fun` replaces `fun.y`, which is deprecated as of ggplot2 3.3.0.
  stat_summary(fun = mean, geom = "point", shape = 23, size = 3) +
  geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 1) +
  theme_minimal() +
  facet_wrap(~ Gender)
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
# V2 of the faceted violin plot: Children_category on the x-axis with a fixed
# fill color per category, one panel per Gender with its own y-axis scale,
# a diamond (shape 23) at each group's mean and the observations as dots.
ggplot(
  mrsa_nepal,
  aes(Children_category, Total_Resistant, fill = Children_category)
) +
  geom_violin(trim = FALSE) +
  geom_point(
    stat = "summary", fun = "mean",
    shape = 23, size = 3,
    position = position_dodge(width = 0.75)
  ) +
  geom_dotplot(
    binaxis = "y", stackdir = "center", dotsize = 1,
    position = position_dodge(width = 0.75)
  ) +
  scale_fill_manual(
    values = c(
      Neonate = "orange",
      Infant = "red",
      Toddler = "blue",
      `Pre-school` = "green",
      School = "purple"
    )
  ) +
  facet_wrap(~ Gender, scales = "free_y") +
  labs(
    title = "V2 Violin Plot of Total Resistant by Children Category and Gender",
    x = "Children Category",
    y = "Total Resistant"
  ) +
  theme_minimal()
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.