# Install and load required packages
# "colorspace" and "knitr" are listed because the script calls
# colorspace::diverge_hcl() and knitr::opts_chunk$set() through `::`,
# which fails when those packages are not installed.
required_packages <- c(
  "readxl", "dplyr", "ggplot2", "gridExtra", "colorspace", "knitr"
)

# Install package `p` (a single package name as a string) only when its
# namespace cannot be loaded; does nothing when it is already available.
install_if_missing <- function(p) {
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
}

# Only the side effect matters here, so the (type-stable) lapply() result is
# discarded instead of relying on sapply()'s input-dependent return shape.
invisible(lapply(required_packages, install_if_missing))
# Load the required packages
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(grid)
# Set knitr options: make echo = TRUE the default chunk option, so chunks
# print their R source alongside the output unless a chunk overrides it
knitr::opts_chunk$set(echo = TRUE)
# Location of the Excel workbook (Sampledata2.xlsx); adjust for your machine
file_path <- "~/Documents/LABSCloud/Sampledata2.xlsx"
# Abort early when the workbook is missing; otherwise read it into `raw`
if (!file.exists(file_path)) {
  stop("File not found. Please check the file path.")
}
raw <- read_excel(file_path)
# Uncomment to inspect the structure of the dataset
# str(raw)
# Keep only the rows with Year after 2012 and label each row with the
# CrimeRate band it falls into (below 250, 250 to 500 inclusive, above 500).
# Rows whose CrimeRate is NA match no band and get RangeGroup = NA.
df <- raw %>%
  dplyr::filter(Year > 2012) %>%
  dplyr::mutate(
    RangeGroup = dplyr::case_when(
      CrimeRate < 250 ~ "CrimeRate < 250",
      CrimeRate >= 250 & CrimeRate <= 500 ~ "250 <= CrimeRate <= 500",
      CrimeRate > 500 ~ "CrimeRate > 500"
    )
  )
# summary(df)
# Plot a histogram of the variable "CrimeRate" (bins of width 100) and fill
# with the variable "Year"; bars for different years sit side by side.
# One fill colour is generated per distinct Year value present in df.
no_yrs <- length(unique(df[["Year"]]))
col1 <- colorspace::diverge_hcl(n = no_yrs)
ggplot(df, aes(x = CrimeRate, fill = as.factor(Year))) +
  geom_histogram(
    position = "dodge",
    binwidth = 100,
    color = "black",
    alpha = 0.7
  ) +
  scale_fill_manual(values = col1) +
  labs(
    title = "Crime Rate Histogram Plots",
    # Label previously read "per 100, 1000 people" (garbled number).
    # NOTE(review): presumably the rate is per 100,000 people - confirm
    # against the data source.
    x = "Crime Rate per 100,000 people",
    y = "Count",
    fill = "Year"
  )
# Scatter Plot
# Per-year summary of CrimeRate: mean, maximum and minimum, ignoring NAs.
# NOTE(review): this summarises `raw` (all years), not the Year > 2012
# subset `df` - confirm that is intended.
df_new <- summarise(
  group_by(raw, Year),
  mean_rate = mean(CrimeRate, na.rm = TRUE),
  max_rate = max(CrimeRate, na.rm = TRUE),
  min_rate = min(CrimeRate, na.rm = TRUE)
)
# Display the summarized data
# df_new
# Plot the yearly mean crime rate as points joined by a line
ggplot(df_new, aes(x = Year, y = mean_rate)) +
  geom_point(color = "blue", size = 1.0) +
  # `linewidth` replaces `size` for lines, which is deprecated since
  # ggplot2 3.4.0 and triggers a lifecycle warning
  geom_line(color = "blue", linewidth = 0.5) +
  labs(
    title = "Mean Crime Rate by Year",
    x = "Year",
    # Label previously read "per 100, 1000 people" (garbled number).
    # NOTE(review): presumably the rate is per 100,000 people - confirm
    # against the data source.
    y = "Mean Crime Rate per 100,000 people in USA"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Boxplot of CrimeRate for each Year in `raw`, one fill colour per year;
# outliers are drawn as red shape-8 markers of size 2.
boxplot_plot <- ggplot(
  data = raw,
  mapping = aes(x = factor(Year), y = CrimeRate, fill = factor(Year))
) +
  geom_boxplot(
    outlier.colour = "red",
    outlier.shape = 8,
    outlier.size = 2
  ) +
  theme_minimal() +
  labs(
    title = "Boxplot of CrimeRate by Year",
    x = "Year",
    y = "CrimeRate",
    fill = "Year"
  )
# Display the plot
boxplot_plot
# Arrange the three plots in one page (stacked in a single column)

# p1: histogram of CrimeRate in `df`, dodged and filled by Year, using the
# `col1` palette computed for the standalone histogram.
p1 <- ggplot(df, aes(x = CrimeRate, fill = as.factor(Year))) +
  geom_histogram(
    position = "dodge",
    binwidth = 100,
    color = "black",
    alpha = 0.7
  ) +
  scale_fill_manual(values = col1) +
  labs(
    title = "Crime Rate Histogram Plots",
    # Label previously read "per 100, 1000 people" (garbled number).
    # NOTE(review): presumably the rate is per 100,000 people - confirm
    # against the data source.
    x = "Crime Rate per 100,000 people",
    y = "Count",
    fill = "Year"
  )

# p2: yearly mean crime rate as points joined by a line.
p2 <- ggplot(df_new, aes(x = Year, y = mean_rate)) +
  geom_point(color = "blue", size = 1.0) +
  # `linewidth` replaces `size` for lines, which is deprecated since
  # ggplot2 3.4.0 and triggers a lifecycle warning
  geom_line(color = "blue", linewidth = 0.5) +
  labs(
    title = "Mean Crime Rate by Year",
    x = "Year",
    y = "Mean Crime Rate per 100,000 people in USA"
  ) +
  theme_minimal()

# p3: reuse the boxplot already built as `boxplot_plot` rather than
# constructing an identical plot a second time.
p3 <- boxplot_plot

grid.arrange(p1, p2, p3, ncol = 1)
# Note: adding the `echo = FALSE` parameter to a code chunk prevents printing
# of the R code that generated the plot. This document sets `echo = TRUE` in
# its knitr options, so the code is shown alongside the plots.