# Chapter 5 Tutorial: Exploring Numerical Data
# <March 22>
# <MacGarrigle>
# Step 1: Install and Load Packages -----
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(usdata)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load both example datasets into the workspace in a single call
data(list = c("loan50", "county"))
# Step 2: Scatterplots (Section 5.1) -----
# Figure 5.1: Scatterplot of loan amount (y) against total income (x)
loan50 %>%
  ggplot(mapping = aes(total_income, loan_amount)) +
  geom_point(size = 2, colour = "#5f91ac") +
  xlab("Total income") +
  ylab("Loan amount") +
  ggtitle("Loan Amount vs. Income")

# Step 3: Dot Plots (Section 5.2) -----
# Figure 5.3: Dot plot of interest rates with the mean marked as a triangle
# The mean is a single summary value, so it is drawn with annotate() instead
# of geom_point(aes(...)). A geom_point() layer inherits the 50-row loan50
# data, and mapping length-1 aesthetics onto it triggers ggplot2's
# "All aesthetics have length 1, but the data has 50 rows" warning.
ggplot(loan50, aes(x = interest_rate)) +
  geom_dotplot(fill = "#5f91ac", color = "#5f91ac", binwidth = 1) +
  annotate(
    "point",
    x = mean(loan50$interest_rate), y = 0,
    shape = 17, color = "red", size = 5
  ) +
  labs(x = "Interest rate", title = "Interest Rate Distribution")

# Step 4: Histograms (Section 5.3) -----
# Figure 5.4: Histogram of interest rates (2.5-wide bins) showing right skew
loan50 %>%
  ggplot(mapping = aes(interest_rate)) +
  geom_histogram(colour = "white", fill = "#5f91ac", binwidth = 2.5) +
  xlab("Interest rate") +
  ylab("Count") +
  ggtitle("Binned Interest Rates")

# Figure 5.5: Density curve of interest rates (a smoothed histogram),
# drawn with a half-transparent fill
loan50 %>%
  ggplot(mapping = aes(interest_rate)) +
  geom_density(alpha = 0.5, fill = "#5f91ac")

# Step 5: Box Plots (Section 5.5) -----
# Figure 5.9: Horizontal box plot of interest rates
# (the figure is described as a dot plot / box plot comparison; only the
# box plot is drawn here)
loan50 %>%
  ggplot(mapping = aes(interest_rate)) +
  geom_boxplot(colour = "#5f91ac", fill = "white") +
  xlab("Interest rate") +
  ggtitle("Box Plot of Interest Rates")

# Step 6: Robust Statistics (Section 5.6) -----
# Centre and spread of the interest rate, side by side:
# mean / standard deviation next to median / interquartile range
summarise(
  loan50,
  mean_ir   = mean(interest_rate),
  median_ir = median(interest_rate),
  sd_ir     = sd(interest_rate),
  iqr_ir    = IQR(interest_rate)
)
## # A tibble: 1 × 4
## mean_ir median_ir sd_ir iqr_ir
## <dbl> <dbl> <dbl> <dbl>
## 1 11.6 9.93 5.05 5.76
# Step 7: Transformations (Section 5.7) -----
# Figure 5.11: Raw vs. log-transformed population
# bins = 30 is spelled out (it is the value stat_bin() falls back to when no
# binning is given), so the plots are unchanged but the "Pick better value"
# message is gone. na.rm = TRUE silently drops the rows whose pop2017 is
# missing instead of emitting a "Removed ... rows" warning on every render.

# Raw counts: extremely right-skewed
ggplot(county, aes(x = pop2017)) +
  geom_histogram(bins = 30, na.rm = TRUE, fill = "#5f91ac") +
  labs(title = "Raw Population (Extreme Skew)")

# Same variable on a log10 scale
ggplot(county, aes(x = log10(pop2017))) +
  geom_histogram(bins = 30, na.rm = TRUE, fill = "#5f91ac") +
  labs(title = "Log10 Transformed Population")

# Package installation is a one-time, interactive setup step, so it is not
# executed by this script. The template placeholder "packagename" is not a
# real CRAN package; calling install.packages() on it only produced a
# "not available" warning. To install the packages loaded in Step 1, run the
# following once in the console:
# install.packages(
#   c("openintro", "tidyverse"),
#   repos = "https://cran.rstudio.com/"
# )