This analysis investigates the relationship between visitor volumes and professional photography opportunities in US National Parks. By analyzing 41,900 photo metadata records and 3,650 daily visitation observations across 10 major parks, we identify “visual saturation” points where high crowds correlate with diminished photographic uniqueness.
Key Findings:
# Data manipulation and analysis
library(tidyverse) # Core data science package
library(lubridate) # Date/time handling
library(corrplot) # Correlation matrices
# Spatial analysis
library(sf) # Spatial data handling
library(leaflet) # Interactive maps
library(leaflet.extras) # Additional map features
# Visualization
library(ggplot2) # Core plotting
library(viridis) # Color palettes
library(plotly) # Interactive plots
library(patchwork) # Combine plots
library(scales) # Scale functions for plots
library(ggridges) # Ridge plots
# Statistical analysis
library(broom) # Tidy model outputs
library(GGally) # Pair plots
# Tables
library(knitr) # Table formatting
library(kableExtra) # Enhanced tables
# Set theme for all plots
theme_set(theme_minimal(base_size = 12))
cat("✓ All libraries loaded successfully\n")
## ✓ All libraries loaded successfully
# Load photography metadata
photos <- read_csv("photography_metadata.csv",
col_types = cols(
photo_id = col_character(),
park_name = col_character(),
latitude = col_double(),
longitude = col_double(),
capture_datetime = col_datetime(),
year = col_integer(),
month = col_integer(),
day = col_integer(),
hour = col_integer(),
camera_model = col_character(),
lens = col_character(),
focal_length_mm = col_integer(),
aperture_f = col_double(),
iso = col_integer(),
shutter_speed = col_character(),
is_professional = col_logical(),
hotspot_id = col_integer()
))
# Load park visitation data
visitation <- read_csv("park_visitation.csv",
col_types = cols(
park_name = col_character(),
date = col_date(),
year = col_integer(),
month = col_integer(),
day = col_integer(),
day_of_week = col_character(),
is_weekend = col_logical(),
is_holiday = col_logical(),
daily_visitors = col_integer(),
park_latitude = col_double(),
park_longitude = col_double()
))
# Display data summaries
cat("Photography Metadata:\n")
## Photography Metadata:
print(glimpse(photos))
## Rows: 41,900
## Columns: 17
## $ photo_id <chr> "PHO_000001", "PHO_000002", "PHO_000003", "PHO_000004…
## $ park_name <chr> "Yosemite", "Yosemite", "Yosemite", "Yosemite", "Yose…
## $ latitude <dbl> 37.79507, 37.82158, 37.81276, 37.75589, 37.73541, 37.…
## $ longitude <dbl> -119.6147, -119.5731, -119.5906, -119.6556, -119.6579…
## $ capture_datetime <dttm> 2023-08-01 18:15:00, 2023-01-03 06:32:00, 2023-05-19…
## $ year <int> 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023,…
## $ month <int> 8, 1, 5, 5, 9, 2, 4, 12, 5, 4, 7, 6, 3, 4, 5, 8, 7, 1…
## $ day <int> 1, 3, 19, 25, 18, 8, 12, 21, 19, 5, 3, 20, 14, 25, 8,…
## $ hour <int> 18, 6, 6, 7, 5, 18, 6, 17, 19, 6, 18, 23, 5, 5, 19, 5…
## $ camera_model <chr> "Sony A1", "Sony A1", "Sony A1", "Sony A7R V", "Nikon…
## $ lens <chr> "16-35mm", "400mm", "24mm", "14mm", "100-400mm", "100…
## $ focal_length_mm <int> 33, 400, 24, 14, 285, 332, 111, 19, 24, 24, 67, 33, 2…
## $ aperture_f <dbl> 4.6, 12.4, 4.6, 5.9, 10.9, 11.6, 9.5, 4.9, 7.4, 4.7, …
## $ iso <int> 3200, 3200, 800, 400, 1600, 1600, 1600, 100, 3200, 80…
## $ shutter_speed <chr> "1/132", "1/1600", "1/48", "1/14", "1/570", "1/664", …
## $ is_professional <lgl> TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FA…
## $ hotspot_id <int> 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 1, 1, 1, 1, 0, 0,…
## # A tibble: 41,900 × 17
## photo_id park_name latitude longitude capture_datetime year month day
## <chr> <chr> <dbl> <dbl> <dttm> <int> <int> <int>
## 1 PHO_000001 Yosemite 37.8 -120. 2023-08-01 18:15:00 2023 8 1
## 2 PHO_000002 Yosemite 37.8 -120. 2023-01-03 06:32:00 2023 1 3
## 3 PHO_000003 Yosemite 37.8 -120. 2023-05-19 06:55:00 2023 5 19
## 4 PHO_000004 Yosemite 37.8 -120. 2023-05-25 07:05:00 2023 5 25
## 5 PHO_000005 Yosemite 37.7 -120. 2023-09-18 05:59:00 2023 9 18
## 6 PHO_000006 Yosemite 37.8 -120. 2023-02-08 18:05:00 2023 2 8
## 7 PHO_000007 Yosemite 37.8 -120. 2023-04-12 06:17:00 2023 4 12
## 8 PHO_000008 Yosemite 37.7 -120. 2023-12-21 17:14:00 2023 12 21
## 9 PHO_000009 Yosemite 37.9 -120. 2023-05-19 19:20:00 2023 5 19
## 10 PHO_000010 Yosemite 37.9 -120. 2023-04-05 06:35:00 2023 4 5
## # ℹ 41,890 more rows
## # ℹ 9 more variables: hour <int>, camera_model <chr>, lens <chr>,
## # focal_length_mm <int>, aperture_f <dbl>, iso <int>, shutter_speed <chr>,
## # is_professional <lgl>, hotspot_id <int>
cat("\n\nPark Visitation Data:\n")
##
##
## Park Visitation Data:
print(glimpse(visitation))
## Rows: 3,650
## Columns: 11
## $ park_name <chr> "Yosemite", "Yosemite", "Yosemite", "Yosemite", "Yosemi…
## $ date <date> 2023-01-01, 2023-01-02, 2023-01-03, 2023-01-04, 2023-0…
## $ year <int> 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ day_of_week <chr> "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday",…
## $ is_weekend <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FA…
## $ is_holiday <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
## $ daily_visitors <int> 7027, 3855, 3091, 2094, 4041, 3310, 5479, 5764, 4381, 3…
## $ park_latitude <dbl> 37.8651, 37.8651, 37.8651, 37.8651, 37.8651, 37.8651, 3…
## $ park_longitude <dbl> -119.5383, -119.5383, -119.5383, -119.5383, -119.5383, …
## # A tibble: 3,650 × 11
## park_name date year month day day_of_week is_weekend is_holiday
## <chr> <date> <int> <int> <int> <chr> <lgl> <lgl>
## 1 Yosemite 2023-01-01 2023 1 1 Sunday TRUE TRUE
## 2 Yosemite 2023-01-02 2023 1 2 Monday FALSE FALSE
## 3 Yosemite 2023-01-03 2023 1 3 Tuesday FALSE FALSE
## 4 Yosemite 2023-01-04 2023 1 4 Wednesday FALSE FALSE
## 5 Yosemite 2023-01-05 2023 1 5 Thursday FALSE FALSE
## 6 Yosemite 2023-01-06 2023 1 6 Friday FALSE FALSE
## 7 Yosemite 2023-01-07 2023 1 7 Saturday TRUE FALSE
## 8 Yosemite 2023-01-08 2023 1 8 Sunday TRUE FALSE
## 9 Yosemite 2023-01-09 2023 1 9 Monday FALSE FALSE
## 10 Yosemite 2023-01-10 2023 1 10 Tuesday FALSE FALSE
## # ℹ 3,640 more rows
## # ℹ 3 more variables: daily_visitors <int>, park_latitude <dbl>,
## # park_longitude <dbl>
cat("\n\n✓ Data loaded successfully\n")
##
##
## ✓ Data loaded successfully
cat(sprintf(" - Photos: %s records\n", format(nrow(photos), big.mark = ",")))
## - Photos: 41,900 records
cat(sprintf(" - Visitation: %s records\n", format(nrow(visitation), big.mark = ",")))
## - Visitation: 3,650 records
cat(sprintf(" - Parks: %d\n", n_distinct(photos$park_name)))
## - Parks: 10
# Check for missing values
cat("Missing Values:\n")
## Missing Values:
cat("\nPhotography Metadata:\n")
##
## Photography Metadata:
print(colSums(is.na(photos)))
## photo_id park_name latitude longitude
## 0 0 0 0
## capture_datetime year month day
## 0 0 0 0
## hour camera_model lens focal_length_mm
## 0 0 0 0
## aperture_f iso shutter_speed is_professional
## 0 0 0 0
## hotspot_id
## 0
cat("\nVisitation Data:\n")
##
## Visitation Data:
print(colSums(is.na(visitation)))
## park_name date year month day
## 0 0 0 0 0
## day_of_week is_weekend is_holiday daily_visitors park_latitude
## 0 0 0 0 0
## park_longitude
## 0
# Check data ranges
cat("\n\nData Ranges:\n")
##
##
## Data Ranges:
cat(sprintf("Date range: %s to %s\n",
min(photos$capture_datetime),
max(photos$capture_datetime)))
## Date range: 2023-01-01 01:22:00 to 2023-12-28 21:31:00
cat(sprintf("Coordinates: Lat [%.2f, %.2f], Lon [%.2f, %.2f]\n",
min(photos$latitude), max(photos$latitude),
min(photos$longitude), max(photos$longitude)))
## Coordinates: Lat [35.46, 48.86], Lon [-123.75, -68.17]
cat(sprintf("Focal lengths: %d to %d mm\n",
min(photos$focal_length_mm),
max(photos$focal_length_mm)))
## Focal lengths: 14 to 400 mm
cat(sprintf("ISO range: %d to %d\n",
min(photos$iso),
max(photos$iso)))
## ISO range: 100 to 12800
cat("\n✓ Data quality checks complete\n")
##
## ✓ Data quality checks complete
# Aggregate monthly data
monthly_photos <- photos %>%
group_by(park_name, month) %>%
summarise(
photo_count = n(),
avg_focal_length = mean(focal_length_mm),
professional_pct = mean(is_professional) * 100,
.groups = "drop"
)
monthly_visitors <- visitation %>%
group_by(park_name, month) %>%
summarise(
total_visitors = sum(daily_visitors),
avg_daily_visitors = mean(daily_visitors),
.groups = "drop"
)
# Merge datasets
monthly_combined <- monthly_photos %>%
left_join(monthly_visitors, by = c("park_name", "month"))
# Create dual-axis plot for overall trends
overall_monthly <- monthly_combined %>%
group_by(month) %>%
summarise(
total_photos = sum(photo_count),
total_visitors = sum(total_visitors) / 1000, # Scale to thousands
.groups = "drop"
)
# Plot
p1 <- ggplot(overall_monthly, aes(x = month)) +
geom_col(aes(y = total_photos, fill = "Photo Volume"),
alpha = 0.7, position = "identity") +
geom_line(aes(y = total_visitors * 20, color = "Visitor Traffic (1000s)"),
size = 1.5) +
geom_point(aes(y = total_visitors * 20, color = "Visitor Traffic (1000s)"),
size = 3) +
scale_y_continuous(
name = "Photo Count",
sec.axis = sec_axis(~./20, name = "Visitors (Thousands)")
) +
scale_x_continuous(breaks = 1:12,
labels = month.abb) +
scale_fill_manual(values = c("Photo Volume" = "#2E86AB")) +
scale_color_manual(values = c("Visitor Traffic (1000s)" = "#A23B72")) +
labs(
title = "Monthly Photography Volume vs. Visitor Traffic",
subtitle = "All parks combined, 2023",
x = "Month",
fill = NULL,
color = NULL
) +
theme(legend.position = "top")
print(p1)
# Summary table
monthly_summary <- overall_monthly %>%
mutate(
saturation_index = total_photos / total_visitors,
season = case_when(
month %in% c(12, 1, 2) ~ "Winter",
month %in% c(3, 4, 5) ~ "Spring",
month %in% c(6, 7, 8) ~ "Summer",
month %in% c(9, 10, 11) ~ "Fall"
)
)
cat("\nMonthly Statistics:\n")
##
## Monthly Statistics:
monthly_summary %>%
select(Month = month,
Photos = total_photos,
`Visitors (1000s)` = total_visitors,
`Saturation Index` = saturation_index,
Season = season) %>%
kable(digits = 2) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| Month | Photos | Visitors (1000s) | Saturation Index | Season |
|---|---|---|---|---|
| 1 | 1345 | 1134.12 | 1.19 | Winter |
| 2 | 1699 | 1255.25 | 1.35 | Winter |
| 3 | 2368 | 1936.92 | 1.22 | Spring |
| 4 | 3123 | 2424.98 | 1.29 | Spring |
| 5 | 4190 | 3322.24 | 1.26 | Spring |
| 6 | 5195 | 3967.29 | 1.31 | Summer |
| 7 | 6239 | 5085.11 | 1.23 | Summer |
| 8 | 5497 | 4401.26 | 1.25 | Summer |
| 9 | 4719 | 3787.95 | 1.25 | Fall |
| 10 | 3735 | 3093.27 | 1.21 | Fall |
| 11 | 2063 | 1639.56 | 1.26 | Fall |
| 12 | 1727 | 1464.88 | 1.18 | Winter |
# Calculate optimal shooting windows
seasonal_analysis <- monthly_combined %>%
group_by(month) %>%
summarise(
total_photos = sum(photo_count),
total_visitors = sum(total_visitors),
professional_rate = weighted.mean(professional_pct, photo_count),
.groups = "drop"
) %>%
mutate(
photos_per_1k_visitors = (total_photos / total_visitors) * 1000,
crowd_score = scale(total_visitors)[,1],
quality_score = scale(professional_rate)[,1],
opportunity_score = quality_score - crowd_score,
recommendation = case_when(
opportunity_score > 0.5 ~ "Excellent",
opportunity_score > 0 ~ "Good",
opportunity_score > -0.5 ~ "Moderate",
TRUE ~ "Avoid if possible"
)
)
# Visualize opportunity scores
p3 <- ggplot(seasonal_analysis, aes(x = factor(month), y = opportunity_score)) +
geom_col(aes(fill = recommendation), alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed") +
scale_x_discrete(labels = month.abb) +
scale_fill_manual(
values = c(
"Excellent" = "#06D6A0",
"Good" = "#118AB2",
"Moderate" = "#FFD166",
"Avoid if possible" = "#EF476F"
)
) +
labs(
title = "Optimal Shooting Window Analysis",
subtitle = "Opportunity Score = Photography Quality - Crowd Level",
x = "Month",
y = "Opportunity Score",
fill = "Recommendation"
) +
theme(legend.position = "top")
print(p3)
# Top recommendations table
cat("\n\nBest Months for Professional Photography:\n")
##
##
## Best Months for Professional Photography:
seasonal_analysis %>%
arrange(desc(opportunity_score)) %>%
mutate(
Month = month.name[month],
`Opportunity Score` = round(opportunity_score, 2),
`Photos/1K Visitors` = round(photos_per_1k_visitors, 1),
Recommendation = recommendation
) %>%
select(Month, `Opportunity Score`, `Photos/1K Visitors`, Recommendation) %>%
head(6) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| Month | Opportunity Score | Photos/1K Visitors | Recommendation |
|---|---|---|---|
| January | 2.04 | 1.2 | Excellent |
| December | 1.80 | 1.2 | Excellent |
| February | 1.44 | 1.4 | Excellent |
| October | 1.19 | 1.2 | Excellent |
| November | 0.11 | 1.3 | Good |
| July | -0.26 | 1.2 | Moderate |