# load libraries
library(ggplot2)
library(readr)
library(dplyr)

# Read the space-separated infections file into a tibble.
# The column types are declared explicitly (three numeric columns) instead of
# being guessed, so the read is reproducible and readr does not print its
# "Column specification" message.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs as-is on the original author's computer.
infections <- read_table(
  "/Users/gretacapelletti/Downloads/infections1.txt",
  col_types = cols(
    infections = col_double(),
    ufo2010 = col_double(),
    pop = col_double()
  )
)

# quick look at the first rows of the data
head(infections)
# Histogram of the infection counts, drawn with bins that are 500 wide.
ggplot(data = infections, mapping = aes(x = infections)) +
  labs(
    title = "Histogram of Infections",
    x = "Infections",
    y = "Count"
  ) +
  geom_histogram(
    color = "black",
    fill = "lightblue",
    binwidth = 500
  )

# most of the values are under 1000, so the majority of cases are low
# a few bars on the right show really high infection numbers — definitely outliers
# overall, the distribution is skewed to the right (lots of low values, some big ones)
# Add a row index to use as the x-axis.
# seq_len() is used instead of 1:nrow(): for an empty table 1:0 evaluates to
# c(1, 0), which cannot be assigned as a column, while seq_len(0) is empty.
infections$index <- seq_len(nrow(infections))

# Line plot of infections in row order (the index is the row number, not a
# date or time value).
ggplot(infections, aes(x = index, y = infections)) +
  geom_line(color = "red") +
  labs(title = "Infection Trend", x = "Index", y = "Infections")

# infection numbers go up and down a lot
# there are a few big spikes, maybe outliers or hot spots
# nothing super steady — it jumps randomly
# Scatter plot of infections against the ufo2010 column (UFO sightings).
ggplot(data = infections, mapping = aes(x = ufo2010, y = infections)) +
  labs(
    title = "Infections vs UFO Sightings",
    x = "UFO Sightings",
    y = "Infections"
  ) +
  geom_point(color = "blue")


# there's no clear relationship between UFO sightings and infections
# most of the points are low for both, with a few random outliers
# the plot doesn't show a trend, but it was fun to check for a connection
# Scatter plot of infections against population, with both axes drawn on a
# log10 scale.
ggplot(data = infections, mapping = aes(x = pop, y = infections)) +
  geom_point(color = "purple") +
  labs(
    title = "Infections vs Population (log scale)",
    x = "Population (log)",
    y = "Infections (log)"
  ) +
  scale_y_log10() +
  scale_x_log10()


# log scale makes it easier to see patterns in messy data
# The original scatter plot was hard to interpret because one or two really large population values stretched the x-axis.
# Most of the other data points were clustered in one tiny area, making it difficult to spot any trends.
# To fix this, I applied a log scale to both the x-axis (population) and y-axis (infections).
# This transformation doesn’t change the data — it just adjusts how it’s displayed.
# The log scale makes it easier to visualize the relationship between infections and population,
# especially when the data has extreme values or is spread out across a wide range.
# After applying the log scale, we can better see that infections tend to increase as population increases.
# Same log-log scatter plot of infections against population, with a
# linear-model trend line added (no confidence band).
# ggplot2 applies scale transformations before computing statistics, so the
# line is a straight-line fit to the log10-transformed values.
ggplot(data = infections, mapping = aes(x = pop, y = infections)) +
  geom_point(color = "purple") +
  geom_smooth(
    method = "lm",
    color = "black",
    se = FALSE
  ) +
  labs(
    title = "Infections vs Population (Log Scale + Trend Line)",
    x = "Population (log)",
    y = "Infections (log)"
  ) +
  scale_y_log10() +
  scale_x_log10()

# This version shows the relationship between infections and population using log scales.
# Log scale helps adjust for the wide range in values — especially the large outliers.
# The regression line makes it clear that as population increases, infection numbers also tend to rise.
# This visualization is more balanced and easier to interpret than the original scatter plot.
LS0tCnRpdGxlOiAiYXNzaWdubWVudCA0LWluZmVjdGlvbnMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KYGBge3J9CiMgbG9hZCBsaWJyYXJpZXMKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KHJlYWRyKQpsaWJyYXJ5KGRwbHlyKQoKIyByZWFkaW5nIHRoZSBzcGFjZS1zZXBhcmF0ZWQgdHh0IGZpbGUKaW5mZWN0aW9ucyA8LSByZWFkX3RhYmxlKCIvVXNlcnMvZ3JldGFjYXBlbGxldHRpL0Rvd25sb2Fkcy9pbmZlY3Rpb25zMS50eHQiKQoKCiMgcXVpY2sgbG9vayBhdCB0aGUgZGF0YQpoZWFkKGluZmVjdGlvbnMpCgpgYGAKYGBge3J9CmdncGxvdChpbmZlY3Rpb25zLCBhZXMoeCA9IGluZmVjdGlvbnMpKSArCiAgZ2VvbV9oaXN0b2dyYW0oYmlud2lkdGggPSA1MDAsIGZpbGwgPSAibGlnaHRibHVlIiwgY29sb3IgPSAiYmxhY2siKSArCiAgbGFicyh0aXRsZSA9ICJIaXN0b2dyYW0gb2YgSW5mZWN0aW9ucyIsIHggPSAiSW5mZWN0aW9ucyIsIHkgPSAiQ291bnQiKQpgYGAKYGBge3J9CiMgbW9zdCBvZiB0aGUgdmFsdWVzIGFyZSB1bmRlciAxMDAwLCBzbyB0aGUgbWFqb3JpdHkgb2YgY2FzZXMgYXJlIGxvdwojIGEgZmV3IGJhcnMgb24gdGhlIHJpZ2h0IHNob3cgcmVhbGx5IGhpZ2ggaW5mZWN0aW9uIG51bWJlcnMKIyBvdmVyYWxsLCB0aGUgZGlzdHJpYnV0aW9uIGlzIHNrZXdlZCB0byB0aGUgcmlnaHQgKGxvdHMgb2YgbG93IHZhbHVlcywgc29tZSBiaWcgb25lcykKYGBgCgpgYGB7cn0KIyBhZGQgaW5kZXggdG8gdXNlIGFzIHgtYXhpcwppbmZlY3Rpb25zJGluZGV4IDwtIDE6bnJvdyhpbmZlY3Rpb25zKQoKZ2dwbG90KGluZmVjdGlvbnMsIGFlcyh4ID0gaW5kZXgsIHkgPSBpbmZlY3Rpb25zKSkgKwogIGdlb21fbGluZShjb2xvciA9ICJyZWQiKSArCiAgbGFicyh0aXRsZSA9ICJJbmZlY3Rpb24gVHJlbmQiLCB4ID0gIkluZGV4IiwgeSA9ICJJbmZlY3Rpb25zIikKYGBgCmBgYHtyfQojIGluZmVjdGlvbiBudW1iZXJzIGdvIHVwIGFuZCBkb3duIGEgbG90CiMgdGhlcmUgYXJlIGEgZmV3IGJpZyBzcGlrZXMKIyBub3RoaW5nIHN1cGVyIHN0ZWFkeSDigJQgaXQganVtcHMgcmFuZG9tbHkKYGBgCmBgYHtyfQpnZ3Bsb3QoaW5mZWN0aW9ucywgYWVzKHggPSB1Zm8yMDEwLCB5ID0gaW5mZWN0aW9ucykpICsKICBnZW9tX3BvaW50KGNvbG9yID0gImJsdWUiKSArCiAgbGFicyh0aXRsZSA9ICJJbmZlY3Rpb25zIHZzIFVGTyBTaWdodGluZ3MiLCB4ID0gIlVGTyBTaWdodGluZ3MiLCB5ID0gIkluZmVjdGlvbnMiKQpgYGAKYGBge3J9CiMgdGhlcmUncyBubyBjbGVhciByZWxhdGlvbnNoaXAgYmV0d2VlbiBVRk8gc2lnaHRpbmdzIGFuZCBpbmZlY3Rpb25zCiMgbW9zdCBvZiB0aGUgcG9pbnRzIGFyZSBsb3cgZm9yIGJvdGgsIHdpdGggYSBmZXcgcmFuZG9tIG91dGxpZXJzCiMgZG9lc27igJl0IHJlYWxseSBzaG93IGEgdHJlbmQsIGJ1dCBmdW4gdG8gZXhwbG9yZQpgYGAKCmBgYHtyfQpnZ3Bsb3QoaW5mZWN0aW9ucywgYWVzKHggPSBwb3As
IHkgPSBpbmZlY3Rpb25zKSkgKwogIGdlb21fcG9pbnQoY29sb3IgPSAicHVycGxlIikgKwogIHNjYWxlX3hfbG9nMTAoKSArCiAgc2NhbGVfeV9sb2cxMCgpICsKICBsYWJzKHRpdGxlID0gIkluZmVjdGlvbnMgdnMgUG9wdWxhdGlvbiAobG9nIHNjYWxlKSIsIHggPSAiUG9wdWxhdGlvbiAobG9nKSIsIHkgPSAiSW5mZWN0aW9ucyAobG9nKSIpCgojIGxvZyBzY2FsZSBtYWtlcyBpdCBlYXNpZXIgdG8gc2VlIHBhdHRlcm5zIGluIG1lc3N5IGRhdGEKYGBgCmBgYHtyfQojIFRoZSBvcmlnaW5hbCBzY2F0dGVyIHBsb3QgdGhhdCBJIGRpZCB3YXMgaGFyZCB0byBpbnRlcnByZXQgYmVjYXVzZSBvbmUgb3IgdHdvIHJlYWxseSBsYXJnZSBwb3B1bGF0aW9uIHZhbHVlcyBzdHJldGNoZWQgdGhlIHgtYXhpcy4KIyBNb3N0IG9mIHRoZSBvdGhlciBkYXRhIHBvaW50cyB3ZXJlIGNsdXN0ZXJlZCBpbiBvbmUgdGlueSBhcmVhLCBtYWtpbmcgaXQgZGlmZmljdWx0IHRvIHNwb3QgYW55IHRyZW5kcy4KIyBUbyBmaXggdGhpcywgSSBhcHBsaWVkIGEgbG9nIHNjYWxlIHRvIGJvdGggdGhlIHgtYXhpcyAocG9wdWxhdGlvbikgYW5kIHktYXhpcyAoaW5mZWN0aW9ucykuCiMgVGhpcyB0cmFuc2Zvcm1hdGlvbiBkb2VzbuKAmXQgY2hhbmdlIHRoZSBkYXRhIOKAlCBpdCBqdXN0IGFkanVzdHMgaG93IGl04oCZcyBkaXNwbGF5ZWQuCiMgVGhlIGxvZyBzY2FsZSBtYWtlcyBpdCBlYXNpZXIgdG8gdmlzdWFsaXplIHRoZSByZWxhdGlvbnNoaXAgYmV0d2VlbiBpbmZlY3Rpb25zIGFuZCBwb3B1bGF0aW9uLAojIGVzcGVjaWFsbHkgd2hlbiB0aGUgZGF0YSBoYXMgZXh0cmVtZSB2YWx1ZXMgb3IgaXMgc3ByZWFkIG91dCBhY3Jvc3MgYSB3aWRlIHJhbmdlLgojIEFmdGVyIGFwcGx5aW5nIHRoZSBsb2cgc2NhbGUsIHdlIGNhbiBiZXR0ZXIgc2VlIHRoYXQgaW5mZWN0aW9ucyB0ZW5kIHRvIGluY3JlYXNlIGFzIHBvcHVsYXRpb24gaW5jcmVhc2VzLgpgYGAKCmBgYHtyfQpnZ3Bsb3QoaW5mZWN0aW9ucywgYWVzKHggPSBwb3AsIHkgPSBpbmZlY3Rpb25zKSkgKwogIGdlb21fcG9pbnQoY29sb3IgPSAicHVycGxlIikgKwogIGdlb21fc21vb3RoKG1ldGhvZCA9ICJsbSIsIHNlID0gRkFMU0UsIGNvbG9yID0gImJsYWNrIikgKwogIHNjYWxlX3hfbG9nMTAoKSArCiAgc2NhbGVfeV9sb2cxMCgpICsKICBsYWJzKHRpdGxlID0gIkluZmVjdGlvbnMgdnMgUG9wdWxhdGlvbiAoTG9nIFNjYWxlICsgVHJlbmQgTGluZSkiLCB4ID0gIlBvcHVsYXRpb24gKGxvZykiLCB5ID0gIkluZmVjdGlvbnMgKGxvZykiKQoKYGBgCmBgYHtyfQojIFRoaXMgdmVyc2lvbiBzaG93cyB0aGUgcmVsYXRpb25zaGlwIGJldHdlZW4gaW5mZWN0aW9ucyBhbmQgcG9wdWxhdGlvbiB1c2luZyBsb2cgc2NhbGVzLgojIExvZyBzY2FsZSBoZWxwcyBhZGp1c3QgZm9yIHRoZSB3aWRlIHJhbmdlIGluIHZhbHVlcyDigJQgZXNwZWNpYWxseSB0aGUgbGFyZ2Ugb3V0bGllcnMuCiMgVGhlIHJlZ3Jlc3Npb24gbGlu
ZSBtYWtlcyBpdCBjbGVhciB0aGF0IGFzIHBvcHVsYXRpb24gaW5jcmVhc2VzLCBpbmZlY3Rpb24gbnVtYmVycyBhbHNvIHRlbmQgdG8gcmlzZS4KIyBUaGlzIHZpc3VhbGl6YXRpb24gaXMgbW9yZSBiYWxhbmNlZCBhbmQgZWFzaWVyIHRvIGludGVycHJldCB0aGFuIHRoZSBvcmlnaW5hbCBzY2F0dGVyIHBsb3QuCmBgYAoKCg==