#install.packages("ggplot2")
#install.packages("dplyr")
#install.packages("GGally")
# Infection Data Visualizations in R
# Create the data frame
infections <- c(245, 215, 2076, 5023, 189, 195, 123, 116, 3298, 430, 502, 126, 112, 67, 52, 39, 54, 2356, 6781, 120, 2389, 279, 257, 290, 234, 5689, 261, 672, 205)
ufo2010 <- c(2, 6, 2, 59, 0, 1, 1, 0, 115, 0, 0, 0, 0, 0, 0, 0, 6, 4, 2, 7, 2, 9, 2, 29, 10, 169, 1, 40, 16)
pop <- c(25101, 61912, 33341, 409061, 7481, 18675, 25581, 22286, 459598, 3915, 67197, 34365, 3911, 32122, 31459, 2311, 28350, 101482, 19005, 20679, 36745, 162812, 15927, 251417, 153920, 1554720, 16148, 305455, 37276)
df <- data.frame(infections, ufo2010, pop)
# Load necessary libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# --- 1. Bar Graph: Comparing Infections and UFO Sightings ---
ggplot(df, aes(x = 1:nrow(df))) +
geom_bar(aes(y = infections, fill = "Infections"), stat = "identity", position = "dodge") +
geom_bar(aes(y = ufo2010, fill = "UFO Sightings (2010)"), stat = "identity", position = "dodge", alpha = 0.7) +
scale_fill_manual("Variables", values = c("Infections" = "deepskyblue", "UFO Sightings (2010)" = "palevioletred")) +
labs(x = "Data Point Index", y = "Count", title = "Comparison of Infections and UFO Sightings") +
theme_minimal() +
theme(legend.position = "top")
# Observation: This bar graph compares the number of infections and UFO sightings for each
# data point. The scale of infections is significantly higher than UFO sightings in most cases.
# There are few instances where UFO sightings are non-zero, but their counts are low relative
# to the infection numbers.
#The bar chart demonstrates that infection counts are consistently much larger than UFO sightings. This highlights a strong scale imbalance and emphasizes that UFO sightings are recurring in low numbers from this data index.
# --- 2. Line Chart: Trends in Infections and Population ---
ggplot(df, aes(x = 1:nrow(df))) +
geom_line(aes(y = infections, color = "Infections"), linewidth = 1) +
geom_line(aes(y = pop, color = "Population"), linewidth = 1, linetype = "dashed") +
scale_color_manual("Variables", values = c("Infections" = "light blue", "Population" = "purple")) +
labs(x = "Data Point Index", y = "Count", title = "Trends in Infections and Population") +
theme_minimal() +
theme(legend.position = "top")
#The line chart shows that population values dominate due to their magnitude. Although infections #fluctuate, population trends overshadow them visually.
# Observation: This line chart shows the trends of infections and population across the data points.
# The population values are on a much larger scale than infection counts, making it difficult to
# observe detailed changes in infections on the same plot. However, we can see the overall
# fluctuations of both variables.
# --- 3. Scatter Plot: Relationship between Population and Infections ---
ggplot(df, aes(x = pop, y = infections)) +
geom_point(color = "blue", alpha = 0.6) +
labs(x = "Population", y = "Number of Infections", title = "Relationship between Population and Number of Infections") +
theme_minimal()
# Observation: This scatter plot explores the relationship between population size and the number
# of infections. There doesn't appear to be a strong linear correlation. While some high-population
# areas have high infection counts, this is not consistently the case.
#This plot show the correlation between population and number of infections are not relevant, other factors #may contribute to infection spread because higher population doesn’t consistently resuls higher infections.
# --- 4. Box Plot: Distribution of Infections ---
ggplot(df, aes(y = infections)) +
geom_boxplot(fill = "rosybrown") +
labs(y = "Number of Infections", title = "Distribution of Number of Infections") +
theme_minimal()
# Observation: This box plot summarizes the distribution of the 'infections' variable. It shows the
# median, quartiles, and potential outliers. The plot indicates that the majority of infection
# counts are relatively low, with some higher values identified as outliers.
#The boxplot summarizes the distribution of infection counts, highlighting the median, interquartile range, #and several high-value outliers. This indicates that while most observations are relatively low, a small #number of cases experience extremely high infection levels
# --- 5. Histogram: Frequency Distribution of UFO Sightings ---
ggplot(df, aes(x = ufo2010)) +
geom_histogram(binwidth = 5, fill = "burlywood", color = "tomato", alpha = 0.7) +
labs(x = "Number of UFO Sightings (2010)", y = "Frequency", title = "Frequency Distribution of UFO Sightings (2010)") +
theme_minimal()
#The histogram reveals a heavily right-skewed distribution with most observations near zero. UFO #sightings are infrequent, with only a few extreme outliers
# Observation: This histogram shows the frequency distribution of UFO sightings in 2010. The
# distribution is heavily skewed towards zero, indicating that most data points have very few or
# no reported UFO sightings.
# --- 6. Scatter Plot: Relationship between Population and UFO Sightings ---
ggplot(df, aes(x = pop, y = ufo2010)) +
geom_point(color = "dark green", alpha = 0.6) +
labs(x = "Population", y = "Number of UFO Sightings (2010)", title = "Relationship between Population and UFO Sightings (2010)") +
theme_minimal()
# Observation: This scatter plot examines the relationship between population size and the number
# of UFO sightings. There doesn't seem to be a clear linear relationship between these two variables.
#This scatter plot shows no relation between population and UFOSightings, meaning it’s not depend of population
# --- 7. Scatter Plot: Infections vs. UFOs with Population Size ---
ggplot(df, aes(x = ufo2010, y = infections, size = pop)) +
geom_point(alpha = 0.6, color = "dark blue") +
scale_size_continuous(name = "Population Size") +
labs(x = "Number of UFO Sightings (2010)", y = "Number of Infections", title = "Infections vs. UFO Sightings, Size by Population") +
theme_minimal()
# Observation: This scatter plot shows the relationship between infections and UFO sightings, with
# the size of each point representing the population size. It helps to visualize if areas with higher
# infections or UFO sightings also tend to have larger populations. No strong pattern is immediately
# apparent.
#The scatter plot summarizes the distribution of infection counts, highlighting the median, interquartile #range,and several high-value outliers. This indicates that while most observations are relatively low, a small #number of cases experience extremely high infection levels
# --- 8. Pair Plot: Overview of Relationships ---
library(GGally)
ggpairs(df) +
ggtitle("Pair Plot of Infections, UFO Sightings, and Population") +
theme_minimal()
# Observation: The pair plot provides a matrix of scatter plots for each pair of variables and
# density plots for the distribution of each individual variable. This gives a quick overview of
# potential linear relationships and the shape of the distributions. The distributions of
# infections and UFO sightings appear skewed, and the scatter plots reiterate the lack of strong
# linear correlations observed in the individual plots.
#The pair plot provides a comprehensive overview of all variable relationships. Distributions confirm #skewness, and scatter plots reinforce the absence of strong linear correlations across variables