# Chapter 5 Tutorial: Exploring Numerical Data
# <March 22>
# <MacGarrigle>
# Step 1: Install and Load Packages -----
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(usdata)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Bring both tutorial datasets into the workspace with a single data() call.
data(list = c("loan50", "county"))

# Step 2: Scatterplots (Section 5.1) -----
# Figure 5.1: scatterplot with one point per row of loan50, total income on
# the x-axis and loan amount on the y-axis.
loan50 %>%
  ggplot(aes(x = total_income, y = loan_amount)) +
  geom_point(colour = "#5f91ac", size = 2) +
  xlab("Total income") +
  ylab("Loan amount") +
  ggtitle("Loan Amount vs. Income")

# Step 3: Dot Plots (Section 5.2) -----
# Figure 5.3: Dot plot of interest rates with the mean marked as a triangle.
# The mean marker is a single fixed point, so it is added with annotate()
# rather than geom_point(aes(...)): a geom layer inherits all rows of loan50
# and would draw one identical triangle per row, which is what made ggplot2
# warn that "All aesthetics have length 1, but the data has 50 rows".
ggplot(loan50, aes(x = interest_rate)) +
  geom_dotplot(fill = "#5f91ac", color = "#5f91ac", binwidth = 1) +
  annotate("point",
           x = mean(loan50$interest_rate), y = 0,
           shape = 17, color = "red", size = 5) +
  labs(x = "Interest rate", title = "Interest Rate Distribution")

# Step 4: Histograms (Section 5.3) -----
# Figure 5.4: interest rates binned into 2.5-wide bars; the shape of the
# histogram shows the right skew.
loan50 %>%
  ggplot(aes(x = interest_rate)) +
  geom_histogram(binwidth = 2.5, fill = "#5f91ac", colour = "white") +
  xlab("Interest rate") +
  ylab("Count") +
  ggtitle("Binned Interest Rates")

# Figure 5.5: kernel density estimate of the interest rates, i.e. a smoothed
# counterpart of the histogram, drawn half-transparent.
ggplot(data = loan50, mapping = aes(x = interest_rate)) +
  geom_density(alpha = 0.5, fill = "#5f91ac")

# Step 5: Box Plots (Section 5.5) -----
# Figure 5.9: horizontal box plot of the interest rates. Only the box plot is
# drawn here; compare it by eye with the dot plot from Step 3.
loan50 %>%
  ggplot(aes(x = interest_rate)) +
  geom_boxplot(colour = "#5f91ac", fill = "white") +
  xlab("Interest rate") +
  ggtitle("Box Plot of Interest Rates")

# Step 6: Robust Statistics (Section 5.6) -----
# Centre and spread of the interest rates, each computed two ways:
# mean / standard deviation next to their robust counterparts median / IQR.
summarise(
  loan50,
  mean_ir   = mean(interest_rate),
  median_ir = median(interest_rate),
  sd_ir     = sd(interest_rate),
  iqr_ir    = IQR(interest_rate)
)
## # A tibble: 1 × 4
##   mean_ir median_ir sd_ir iqr_ir
##     <dbl>     <dbl> <dbl>  <dbl>
## 1    11.6      9.93  5.05   5.76
# Step 7: Transformations (Section 5.7) -----
# Figure 5.11 (raw scale): histogram of 2017 county populations.
# Rows without a usable pop2017 value are filtered out up front instead of
# being dropped by stat_bin() with a warning, and the bin count is stated
# explicitly (30, the value ggplot2 was falling back to) so the plot no longer
# depends on, or complains about, the implicit default.
county %>%
  filter(is.finite(pop2017)) %>%
  ggplot(aes(x = pop2017)) +
  geom_histogram(bins = 30, fill = "#5f91ac") +
  labs(title = "Raw Population (Extreme Skew)")

# Figure 5.11 (log scale): the same county populations after a log10
# transformation.
# Rows whose transformed value would be non-finite (missing or non-positive
# pop2017) are filtered out explicitly rather than dropped by stat_bin() with
# a warning, and the bin count is stated explicitly (30, the value ggplot2
# was falling back to) instead of relying on the implicit default.
county %>%
  filter(is.finite(pop2017), pop2017 > 0) %>%
  ggplot(aes(x = log10(pop2017))) +
  geom_histogram(bins = 30, fill = "#5f91ac") +
  labs(title = "Log10 Transformed Population")

# Package installation is a one-time setup step and is deliberately not run as
# part of this script. The call that used to sit here installed the template
# placeholder "packagename", which is not a real package: it contacted CRAN on
# every run and only produced a "package is not available" warning.
# If the packages loaded in Step 1 are missing, install them once from the
# console, e.g.:
#   install.packages(c("openintro", "tidyverse"))