# Import your POI data (Mini 1 outputs for Binghamton and Albany)
#City selection (Binghamton & Albany; two POI types: park, museum)
# Locations of the saved POI tables (one .rds file per city).
path_bing <- "C:/Users/jenny/OneDrive - Georgia Institute of Technology/Desktop/CP8883/Mini 1/google_poi_binghamton.rds"
path_alb  <- "C:/Users/jenny/OneDrive - Georgia Institute of Technology/Desktop/CP8883/Mini 1/google_poi_albany.rds"

# The Binghamton file is mandatory: stop immediately if it is missing.
stopifnot(file.exists(path_bing))
pois_bing <- readRDS(path_bing)

# The Albany file is optional. When it is absent the script continues with
# Binghamton only, but says so explicitly instead of silently analysing a
# single city.
if (file.exists(path_alb)) {
  pois_alb <- readRDS(path_alb)
} else {
  warning(
    "Albany POI file not found; continuing with Binghamton only: ", path_alb,
    call. = FALSE
  )
  pois_alb <- NULL
}

# Reduce a POI table to the columns used downstream and give every kept
# column the `places.` prefix.
#
# Args:
#   df: data frame containing the columns `id`, `displayName.text`,
#       `formattedAddress`, `types`, `location.latitude`,
#       `location.longitude`, `rating` and `userRatingCount`; `priceLevel`
#       is optional.
#
# Returns:
#   `df` restricted to those columns, renamed to `places.<name>`. The
#   optional price column is returned as `places.priceLevel` so that it
#   follows the same naming scheme as every other column (previously it kept
#   the bare name `priceLevel`, which later `places.priceLevel` checks could
#   never match).
normalize_cols <- function(df) {
  df %>%
    select(
      places.id                 = id,
      places.displayName.text   = displayName.text,
      places.formattedAddress   = formattedAddress,
      places.types              = types,
      places.location.latitude  = location.latitude,
      places.location.longitude = location.longitude,
      places.rating             = rating,
      places.userRatingCount    = userRatingCount,
      # any_of() selects the column only when it exists, so tables without
      # price information pass through without error.
      places.priceLevel         = any_of("priceLevel")
    )
}
# Stack the normalized per-city tables into one data frame. A city whose
# table is NULL (file not loaded) is skipped.
poi_raw <- list(pois_bing, pois_alb) %>%
  Filter(f = Negate(is.null)) %>%
  lapply(normalize_cols) %>%
  bind_rows()

# Preview: first five rows rendered as a compact table
kable_styling(
  kable(head(poi_raw, 5)),
  full_width = FALSE
)
## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
places.id places.displayName.text places.formattedAddress places.types places.location.latitude places.location.longitude places.rating places.userRatingCount
ChIJJwAb9RDv2okR2XbyH_7Riic Recreation Park Beethoven St & Seminary Ave, Binghamton, NY 13905, USA park , tourist_attraction , swimming_pool , sports_activity_location, point_of_interest , establishment 42.09972 -75.93330 4.6 1652
ChIJAQAA9OLu2okRKGBanm19Ikc West End Park 95 Margaret St, Binghamton, NY 13905, USA park , point_of_interest, establishment 42.10192 -75.94838 4.5 78
ChIJ__nmo1_v2okRYwUnzIL7Zn8 Recreation Park Ice Skating Rink 60-100 Beethoven St, Binghamton, NY 13905, USA park , point_of_interest, establishment 42.09885 -75.93467 5.0 1
ChIJVVWVPBDv2okR-wWtbyO0gB4 Wallenburg Park Mendelssohn St, Binghamton, NY 13905, USA park , point_of_interest, establishment 42.10122 -75.93299 NA NA
ChIJAVRE8OLu2okRNl7DP786-C4 West End Park 95 Margaret St, Binghamton, NY 13905, USA park , premise , point_of_interest, establishment , street_address 42.10167 -75.94814 NA NA
#Tidy your data
# Remove duplicated rows: keep the first row for each place id and report
# how many rows that removes.
poi_unique <- distinct(poi_raw, places.id, .keep_all = TRUE)
n_before <- nrow(poi_raw)
n_after  <- nrow(poi_unique)
message(paste0("Rows before de-dup: ", n_before, " | after: ", n_after))
## Rows before de-dup: 408 | after: 131
# Flatten list-columns so that every POI is one plain row.
# Each entry of `places.types` is joined into a single comma-separated string.
poi_flat <- mutate(
  poi_unique,
  places.types = map_chr(
    places.types,
    function(tags) str_c(tags, collapse = ",")
  )
)

# Any list-column other than `places.types` is dropped to keep the table flat.
is_list_col <- vapply(poi_flat, is.list, logical(1))
poi_flat <- poi_flat[, !(is_list_col & names(is_list_col) != "places.types")]

# Quick check of the collapsed strings
poi_flat$places.types %>% head(3)
## [1] "park,tourist_attraction,swimming_pool,sports_activity_location,point_of_interest,establishment"
## [2] "park,point_of_interest,establishment"                                                          
## [3] "park,point_of_interest,establishment"
### Handle missing values
# Columns that must be non-missing for a POI to stay in the analysis; the
# price level only counts when the table actually has that column.
important_cols <- c(
  "places.rating",
  "places.userRatingCount",
  if ("places.priceLevel" %in% names(poi_flat)) "places.priceLevel"
)

poi_dropna <- drop_na(poi_flat, any_of(important_cols))
n_before <- nrow(poi_flat)
n_after  <- nrow(poi_dropna)

message(paste0("Rows before NA-drop (important cols): ", n_before, " | after: ", n_after))
## Rows before NA-drop (important cols): 131 | after: 74
# Filter by location (city boundary)
# Fetch the New York place boundaries, keep the two study cities, and
# reproject to EPSG:4326 so they share a CRS with the POI coordinates.
ny_places <- tigris::places(state = "NY", progress_bar = FALSE) %>%
  filter(is.element(NAME, c("Binghamton", "Albany"))) %>%
  st_transform(crs = 4326)
## Retrieving data for the year 2022
# Turn the POI table into sf points; `remove = FALSE` keeps the original
# longitude/latitude columns alongside the geometry.
poi_sf <- st_as_sf(
  poi_dropna,
  coords = c("places.location.longitude", "places.location.latitude"),
  crs = 4326,
  remove = FALSE
)

# Spatial subset: keep only points that intersect a selected city polygon.
poi_sf_in <- poi_sf[ny_places, , op = st_intersects]
n_before <- nrow(poi_sf)
n_after  <- nrow(poi_sf_in)

message(paste0("Rows before city filter: ", n_before, " | after: ", n_after))
## Rows before city filter: 74 | after: 41
# Cleaned dataset for tabular work: same rows, geometry column removed
poi_clean <- st_drop_geometry(poi_sf_in)

# Show cleaned POI data (first ten rows)
kable_styling(
  kable(head(poi_clean, 10)),
  full_width = FALSE
)
## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
places.id places.displayName.text places.formattedAddress places.types places.location.latitude places.location.longitude places.rating places.userRatingCount
ChIJJwAb9RDv2okR2XbyH_7Riic Recreation Park Beethoven St & Seminary Ave, Binghamton, NY 13905, USA park,tourist_attraction,swimming_pool,sports_activity_location,point_of_interest,establishment 42.09972 -75.93330 4.6 1652
ChIJAQAA9OLu2okRKGBanm19Ikc West End Park 95 Margaret St, Binghamton, NY 13905, USA park,point_of_interest,establishment 42.10192 -75.94838 4.5 78
ChIJ__nmo1_v2okRYwUnzIL7Zn8 Recreation Park Ice Skating Rink 60-100 Beethoven St, Binghamton, NY 13905, USA park,point_of_interest,establishment 42.09885 -75.93467 5.0 1
ChIJm7PolWvv2okRnbwE4r14MxU Roberson Museum 30 Front St, Binghamton, NY 13905, USA museum,planetarium,wedding_venue,tourist_attraction,event_venue,point_of_interest,establishment 42.09390 -75.91858 4.7 170
ChIJJ3brEmnv2okRDe1z0ZtJ9Rw Confluence Park N Shore Dr, Binghamton, NY 13901, USA park,tourist_attraction,point_of_interest,establishment 42.09286 -75.91639 4.5 353
ChIJgWE_TUfv2okRr3Hk9WGkK0E MacArthur Park Vestal Ave, Binghamton, NY 13903, USA park,tourist_attraction,point_of_interest,establishment 42.08618 -75.92157 4.5 217
ChIJ87Gt79Tv2okRjTF9rXGIxDs Greater Binghamton Greenway - Trailhead at McArthur Park Vestal, Denton Rd, Binghamton, NY 13903, USA hiking_area,park,sports_activity_location,point_of_interest,establishment 42.08538 -75.92558 5.0 3
ChIJAZwM2GTv2okRKRS0wKzs4zk Southside to Binghamton University Trailhead 3 Zane Rd, Binghamton, NY 13903, USA hiking_area,park,sports_activity_location,point_of_interest,establishment 42.08905 -75.91679 5.0 1
ChIJNQtPQgDv2okR9PaGYEB3Kwo Duck Duck Goose Island 6 Vermont Ave, Binghamton, NY 13905, USA park,point_of_interest,establishment 42.08836 -75.92774 5.0 2
ChIJ3zUYHXjv2okRCAWqwN_Iqnw Confluence Point Southside Park 8 Conklin Ave, Binghamton, NY 13903, USA hiking_area,park,sports_activity_location,point_of_interest,establishment 42.09141 -75.91532 5.0 1
# Explore and report findings
# 1) Park vs Museum differences
# Long format: one row per (POI, type tag), obtained by splitting the
# comma-separated `places.types` string back into individual tags.
poi_types_long <- poi_clean %>%
  select(
    places.id, places.displayName.text, places.rating,
    places.userRatingCount, places.types
  ) %>%
  mutate(
    types_vec = str_split(places.types, ","),
    places.types = NULL
  ) %>%
  unnest(cols = types_vec)

# Per-tag summary restricted to the two POI types of interest. A POI tagged
# with both types contributes to both rows.
pm_summary <- poi_types_long %>%
  filter(types_vec == "park" | types_vec == "museum") %>%
  group_by(types_vec) %>%
  summarise(
    n_places    = length(unique(places.id)),
    avg_rating  = mean(places.rating, na.rm = TRUE),
    avg_reviews = mean(places.userRatingCount, na.rm = TRUE),
    .groups = "drop"
  )
pm_summary
## # A tibble: 2 × 4
##   types_vec n_places avg_rating avg_reviews
##   <chr>        <int>      <dbl>       <dbl>
## 1 museum           8       4.78       113. 
## 2 park            33       4.46        97.6
# Quick bar chart: number of places per type, legend hidden because the
# x axis already names each bar.
ggplot(
  data = pm_summary,
  mapping = aes(x = types_vec, y = n_places, fill = types_vec)
) +
  geom_col() +
  theme(legend.position = "none") +
  labs(title = "Parks vs Museums", x = NULL, y = "Count")

# 2) Average rating & relation to number of ratings
# Mean rating over all cleaned POIs, returned as a one-row table
avg_rating <- summarise(
  poi_clean,
  avg_rating = mean(places.rating, na.rm = TRUE)
)
avg_rating
##   avg_rating
## 1   4.519512
# Scatter of rating against review count; the x axis is log-scaled
ggplot(
  data = poi_clean,
  mapping = aes(x = places.userRatingCount, y = places.rating)
) +
  geom_point(alpha = 0.7) +
  scale_x_log10() +
  labs(
    title = "Ratings vs. reviews",
    x = "User rating count (log)",
    y = "Rating"
  )

#3) Price level vs rating (if available)
#Had trouble with this one but tried it as an extra, since lab only requires 4 correct attempts (this is my 5th overall).
# The price column may be named with or without the `places.` prefix; the
# unprefixed name is preferred when both exist (intersect() keeps the order
# of its first argument).
price_candidates <- intersect(
  c("priceLevel", "places.priceLevel"),
  names(poi_clean)
)

if (length(price_candidates) > 0) {
  price_col <- price_candidates[[1]]

  # Average rating per price level, ignoring POIs without a price level.
  # The rating column is `places.rating` in the cleaned table; referring to a
  # bare `rating` column here would fail because no such column exists.
  price_rating <- poi_clean %>%
    dplyr::filter(!is.na(.data[[price_col]])) %>%
    dplyr::group_by(priceLevel = .data[[price_col]]) %>%
    dplyr::summarise(
      n        = dplyr::n(),
      avg_rate = mean(places.rating, na.rm = TRUE),
      .groups  = "drop"
    ) %>%
    dplyr::arrange(priceLevel)

  print(price_rating)

  # Printed explicitly so the chart is drawn regardless of how the enclosing
  # `if` expression is evaluated.
  print(
    ggplot2::ggplot(price_rating,
                    ggplot2::aes(x = factor(priceLevel), y = avg_rate)) +
      ggplot2::geom_col(fill = "purple") +
      ggplot2::labs(x = "Price level", y = "Average rating",
                    title = "Average rating by price level")
  )
} else {
  cat("Price level is not available in this POI dataset, so no price–rating analysis was produced in this mini assignment.\n")
}
## Price level is not available in this POI dataset, so no price–rating analysis was produced in this mini assignment.
#4) Simple clustering signal by city (relative concentration)
# Number of cleaned POIs inside each city boundary.
# st_join() is a left join by default, so a city with no matching POI still
# produces one row whose POI columns are all NA. Counting rows would report
# that city as having 1 POI; weighting by "has a POI id" makes it 0 instead.
city_counts <- st_as_sf(ny_places) %>%
  select(NAME) %>%
  st_join(poi_sf_in) %>%
  st_drop_geometry() %>%
  count(NAME, wt = !is.na(places.id), name = "poi_n")
city_counts
##         NAME poi_n
## 1     Albany     1
## 2 Binghamton    41
# Recommended POI: the best-rated place, with ties on rating resolved in
# favour of the place that has more reviews.
top_pick <- poi_clean %>%
  select(
    places.displayName.text, places.formattedAddress,
    places.rating, places.userRatingCount
  ) %>%
  arrange(desc(places.rating), desc(places.userRatingCount)) %>%
  slice_head(n = 1)
top_pick
##   places.displayName.text   places.formattedAddress places.rating
## 1        The Story Garden Binghamton, NY 13903, USA             5
##   places.userRatingCount
## 1                     51
#5. ONE POI TO VISIT
# Rank rated POIs by rating, then by review count, and keep the top one.
poi_pick <- poi_clean %>%
  dplyr::select(
    places.displayName.text, places.formattedAddress,
    places.rating, places.userRatingCount
  ) %>%
  dplyr::filter(!is.na(places.rating)) %>%
  dplyr::arrange(
    dplyr::desc(places.rating),
    dplyr::desc(places.userRatingCount)
  ) %>%
  dplyr::slice_head(n = 1)

print(poi_pick)
##   places.displayName.text   places.formattedAddress places.rating
## 1        The Story Garden Binghamton, NY 13903, USA             5
##   places.userRatingCount
## 1                     51
# Closing message echoed at the end of the report.
print("This was a fun lab!")
## [1] "This was a fun lab!"