MA_Zhaoxin_Mini2

# Import the POI data saved by Mini 1 (one .rds file per city).
# City selection: Binghamton and Albany; two POI types: park, museum.
data_dir  <- "C:/Users/jenny/OneDrive - Georgia Institute of Technology/Desktop/CP8883/Mini 1"
path_bing <- file.path(data_dir, "google_poi_binghamton.rds")
path_alb  <- file.path(data_dir, "google_poi_albany.rds")

# Binghamton is mandatory: stop with a message that names the missing file.
if (!file.exists(path_bing)) {
  stop("Binghamton POI file not found: ", path_bing, call. = FALSE)
}
pois_bing <- readRDS(path_bing)

# Albany is optional (NULL when absent), but say so out loud: silently
# skipping it would make every later "two-city" result a one-city result.
pois_alb <- if (file.exists(path_alb)) {
  readRDS(path_alb)
} else {
  warning(
    "Albany POI file not found; continuing with Binghamton only: ", path_alb,
    call. = FALSE
  )
  NULL
}

# Reduce a raw POI table to the fields used downstream, renaming each one with
# the `places.` prefix in a single select() call.
#
# df: data frame that must contain id, displayName.text, formattedAddress,
#     types, location.latitude, location.longitude, rating and
#     userRatingCount (select() errors if any of them is missing).
# Returns: `df` restricted to those columns, in that order. `priceLevel` is
#     appended only when `df` has it, and it keeps its original, unprefixed
#     name.
normalize_cols <- function(df) {
  select(
    df,
    places.id                 = id,
    places.displayName.text   = displayName.text,
    places.formattedAddress   = formattedAddress,
    places.types              = types,
    places.location.latitude  = location.latitude,
    places.location.longitude = location.longitude,
    places.rating             = rating,
    places.userRatingCount    = userRatingCount,
    priceLevel                = any_of("priceLevel")
  )
}
# Stack the normalized city tables. Binghamton always contributes; Albany is
# appended only when it was loaded (pois_alb is NULL otherwise). The first
# bind_rows() call keeps the result going through bind_rows() in both cases.
poi_raw <- bind_rows(normalize_cols(pois_bing))
if (!is.null(pois_alb)) {
  poi_raw <- bind_rows(poi_raw, normalize_cols(pois_alb))
}

# Preview the first five rows as a compact (not full-width) table
kable_styling(
  kable(head(poi_raw, 5)),
  full_width = FALSE
)

## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")

places.id	places.displayName.text	places.formattedAddress	places.types	places.location.latitude	places.location.longitude	places.rating	places.userRatingCount
ChIJJwAb9RDv2okR2XbyH_7Riic	Recreation Park	Beethoven St & Seminary Ave, Binghamton, NY 13905, USA	park , tourist_attraction , swimming_pool , sports_activity_location, point_of_interest , establishment	42.09972	-75.93330	4.6	1652
ChIJAQAA9OLu2okRKGBanm19Ikc	West End Park	95 Margaret St, Binghamton, NY 13905, USA	park , point_of_interest, establishment	42.10192	-75.94838	4.5	78
ChIJ__nmo1_v2okRYwUnzIL7Zn8	Recreation Park Ice Skating Rink	60-100 Beethoven St, Binghamton, NY 13905, USA	park , point_of_interest, establishment	42.09885	-75.93467	5.0	1
ChIJVVWVPBDv2okR-wWtbyO0gB4	Wallenburg Park	Mendelssohn St, Binghamton, NY 13905, USA	park , point_of_interest, establishment	42.10122	-75.93299	NA	NA
ChIJAVRE8OLu2okRNl7DP786-C4	West End Park	95 Margaret St, Binghamton, NY 13905, USA	park , premise , point_of_interest, establishment , street_address	42.10167	-75.94814	NA	NA

# Tidy the data
# Step 1: remove duplicated places. Rows are duplicates when they share a
# places.id; the first occurrence is kept together with all of its columns.
n_before   <- nrow(poi_raw)
poi_unique <- distinct(poi_raw, places.id, .keep_all = TRUE)
n_after    <- nrow(poi_unique)
message(paste0("Rows before de-dup: ", n_before, " | after: ", n_after))

## Rows before de-dup: 408 | after: 131

# Flatten list-columns so the table stays strictly one row per POI.
# `places.types` is a list-column (one vector of type tags per place);
# collapse each element into a single comma-separated string.
poi_flat <- poi_unique %>%
  mutate(
    places.types = map_chr(places.types, function(tags) str_c(tags, collapse = ","))
  )

# Drop any list-columns that remain (rare with the current schema).
# `places.types` is already a character column here, so it needs no special
# case. List-style `[` indexing (no comma) always returns a data frame, even
# when only one column survives, whereas `df[, keep]` could collapse a base
# data.frame to a bare vector.
is_list_col <- vapply(poi_flat, is.list, logical(1))
poi_flat <- poi_flat[!is_list_col]

# Quick check
head(poi_flat$places.types, 3)

## [1] "park,tourist_attraction,swimming_pool,sports_activity_location,point_of_interest,establishment"
## [2] "park,point_of_interest,establishment"                                                          
## [3] "park,point_of_interest,establishment"

# Handle missing values
# Rows must have a rating and a rating count; a price level is also required
# when a `places.priceLevel` column is present.
# NOTE(review): normalize_cols() keeps the optional price column under the
# unprefixed name `priceLevel`, so this `places.priceLevel` check may never
# match -- confirm which name is intended.
important_cols <- c(
  "places.rating",
  "places.userRatingCount",
  intersect("places.priceLevel", names(poi_flat))
)

n_before   <- nrow(poi_flat)
poi_dropna <- drop_na(poi_flat, any_of(important_cols))
n_after    <- nrow(poi_dropna)

message(paste0("Rows before NA-drop (important cols): ", n_before, " | after: ", n_after))

## Rows before NA-drop (important cols): 131 | after: 74

# Filter by location (city boundary)
# Fetch New York place boundaries, keep the rows named Binghamton or Albany,
# and reproject to EPSG:4326 -- the same CRS assigned to the POI points below.
ny_places <- st_transform(
  filter(
    tigris::places(state = "NY", progress_bar = FALSE),
    NAME %in% c("Binghamton", "Albany")
  ),
  crs = 4326
)

## Retrieving data for the year 2022

# Turn the POI table into sf points in EPSG:4326. `remove = FALSE` keeps the
# longitude/latitude columns alongside the new geometry column.
poi_sf <- st_as_sf(
  poi_dropna,
  coords = c("places.location.longitude", "places.location.latitude"),
  crs    = 4326,
  remove = FALSE
)

# Spatial subset: keep the points that intersect any selected city polygon
# (st_intersects is the default predicate of `[` for sf objects).
n_before  <- nrow(poi_sf)
poi_sf_in <- poi_sf[ny_places, , op = st_intersects]
n_after   <- nrow(poi_sf_in)

message(paste0("Rows before city filter: ", n_before, " | after: ", n_after))

## Rows before city filter: 74 | after: 41

# Final cleaned dataset: same rows as poi_sf_in, geometry column removed so it
# can be used as a plain table.
poi_clean <- st_drop_geometry(poi_sf_in)

# Show the first ten cleaned POIs as a compact table
kable_styling(
  kable(head(poi_clean, 10)),
  full_width = FALSE
)

## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")

places.id	places.displayName.text	places.formattedAddress	places.types	places.location.latitude	places.location.longitude	places.rating	places.userRatingCount
ChIJJwAb9RDv2okR2XbyH_7Riic	Recreation Park	Beethoven St & Seminary Ave, Binghamton, NY 13905, USA	park,tourist_attraction,swimming_pool,sports_activity_location,point_of_interest,establishment	42.09972	-75.93330	4.6	1652
ChIJAQAA9OLu2okRKGBanm19Ikc	West End Park	95 Margaret St, Binghamton, NY 13905, USA	park,point_of_interest,establishment	42.10192	-75.94838	4.5	78
ChIJ__nmo1_v2okRYwUnzIL7Zn8	Recreation Park Ice Skating Rink	60-100 Beethoven St, Binghamton, NY 13905, USA	park,point_of_interest,establishment	42.09885	-75.93467	5.0	1
ChIJm7PolWvv2okRnbwE4r14MxU	Roberson Museum	30 Front St, Binghamton, NY 13905, USA	museum,planetarium,wedding_venue,tourist_attraction,event_venue,point_of_interest,establishment	42.09390	-75.91858	4.7	170
ChIJJ3brEmnv2okRDe1z0ZtJ9Rw	Confluence Park	N Shore Dr, Binghamton, NY 13901, USA	park,tourist_attraction,point_of_interest,establishment	42.09286	-75.91639	4.5	353
ChIJgWE_TUfv2okRr3Hk9WGkK0E	MacArthur Park	Vestal Ave, Binghamton, NY 13903, USA	park,tourist_attraction,point_of_interest,establishment	42.08618	-75.92157	4.5	217
ChIJ87Gt79Tv2okRjTF9rXGIxDs	Greater Binghamton Greenway - Trailhead at McArthur Park	Vestal, Denton Rd, Binghamton, NY 13903, USA	hiking_area,park,sports_activity_location,point_of_interest,establishment	42.08538	-75.92558	5.0	3
ChIJAZwM2GTv2okRKRS0wKzs4zk	Southside to Binghamton University Trailhead	3 Zane Rd, Binghamton, NY 13903, USA	hiking_area,park,sports_activity_location,point_of_interest,establishment	42.08905	-75.91679	5.0	1
ChIJNQtPQgDv2okR9PaGYEB3Kwo	Duck Duck Goose Island	6 Vermont Ave, Binghamton, NY 13905, USA	park,point_of_interest,establishment	42.08836	-75.92774	5.0	2
ChIJ3zUYHXjv2okRCAWqwN_Iqnw	Confluence Point Southside Park	8 Conklin Ave, Binghamton, NY 13903, USA	hiking_area,park,sports_activity_location,point_of_interest,establishment	42.09141	-75.91532	5.0	1

# Explore and report findings
# 1) Park vs museum differences
# Long format: one row per (place, type tag). The comma-separated type string
# is split back into a vector and unnested.
poi_types_long <- poi_clean %>%
  select(
    places.id, places.displayName.text, places.rating, places.userRatingCount,
    types_vec = places.types
  ) %>%
  mutate(types_vec = str_split(types_vec, ",")) %>%
  unnest(types_vec)

# Per-type summary. A place tagged as both park and museum contributes to
# both rows, because it has one long-format row per tag.
pm_summary <- poi_types_long %>%
  filter(types_vec == "park" | types_vec == "museum") %>%
  group_by(types_vec) %>%
  summarise(
    n_places    = n_distinct(places.id),
    avg_rating  = mean(places.rating, na.rm = TRUE),
    avg_reviews = mean(places.userRatingCount, na.rm = TRUE),
    .groups     = "drop"
  )
pm_summary

## # A tibble: 2 × 4
##   types_vec n_places avg_rating avg_reviews
##   <chr>        <int>      <dbl>       <dbl>
## 1 museum           8       4.78       113. 
## 2 park            33       4.46        97.6

# Quick bar chart: number of distinct places per type. The fill legend is
# hidden because it would only repeat the x-axis categories.
ggplot(pm_summary) +
  geom_col(aes(x = types_vec, y = n_places, fill = types_vec)) +
  labs(title = "Parks vs Museums", x = NULL, y = "Count") +
  theme(legend.position = "none")

# 2) Average rating, and its relation to the number of ratings
# Overall mean rating across the cleaned POIs (one-row, one-column table).
avg_rating <- summarise(poi_clean, avg_rating = mean(places.rating, na.rm = TRUE))
avg_rating

##   avg_rating
## 1   4.519512

# Scatter of rating against rating count; the x-axis is log10-scaled so the
# few heavily reviewed places do not squash the rest of the points.
ggplot(poi_clean) +
  geom_point(aes(x = places.userRatingCount, y = places.rating), alpha = 0.7) +
  scale_x_log10() +
  labs(title = "Ratings vs. reviews", x = "User rating count (log)", y = "Rating")

# 3) Price level vs rating (only when a price-level column is available)
# Optional extra analysis. The price column may be named `priceLevel` or
# `places.priceLevel`; `priceLevel` is preferred when both exist.
if (any(c("priceLevel", "places.priceLevel") %in% names(poi_clean))) {

  # intersect() keeps the order of its first argument, so [1] is the
  # preferred name among those actually present.
  price_col <- intersect(c("priceLevel", "places.priceLevel"), names(poi_clean))[1]

  # Count and mean rating per price level. The grouping column is named
  # `priceLevel` directly instead of being renamed by position afterwards.
  # The rating column is `places.rating`; there is no bare `rating` column
  # in poi_clean.
  price_rating <- poi_clean %>%
    dplyr::filter(!is.na(.data[[price_col]])) %>%
    dplyr::group_by(priceLevel = .data[[price_col]]) %>%
    dplyr::summarise(
      n        = dplyr::n(),
      avg_rate = mean(places.rating, na.rm = TRUE),
      .groups  = "drop"
    ) %>%
    dplyr::arrange(priceLevel)

  print(price_rating)

  # Print explicitly so the plot is drawn even when this code is not the
  # visible top-level value (e.g. when sourced or wrapped in a function).
  print(
    ggplot2::ggplot(price_rating,
                    ggplot2::aes(x = factor(priceLevel), y = avg_rate)) +
      ggplot2::geom_col(fill = "purple") +
      ggplot2::labs(x = "Price level", y = "Average rating",
                    title = "Average rating by price level")
  )
} else {
  cat("Price level is not available in this POI dataset, so no price–rating analysis was produced in this mini assignment.\n")
}

## Price level is not available in this POI dataset, so no price–rating analysis was produced in this mini assignment.

# 4) Simple clustering signal by city (relative concentration)
# Number of cleaned POIs that fall inside each city polygon.
# st_join() is a left join by default, so a city with no POIs still yields one
# row whose POI columns are all NA. Counting rows would report that city as
# having 1 POI; summing an "is a real match" weight reports 0 instead.
city_counts <- ny_places %>%
  select(NAME) %>%
  st_join(poi_sf_in) %>%
  st_drop_geometry() %>%
  count(NAME, wt = as.integer(!is.na(places.id)), name = "poi_n")
city_counts

##         NAME poi_n
## 1     Albany     1
## 2 Binghamton    41

# One POI to visit: the best-rated place, with ties on rating broken by the
# larger number of reviews. Only the four display columns are kept.
top_pick <- poi_clean %>%
  select(
    places.displayName.text, places.formattedAddress,
    places.rating, places.userRatingCount
  ) %>%
  arrange(desc(places.rating), desc(places.userRatingCount)) %>%
  slice(1)
top_pick

##   places.displayName.text   places.formattedAddress places.rating
## 1        The Story Garden Binghamton, NY 13903, USA             5
##   places.userRatingCount
## 1                     51

# 5) One POI to visit
# Same rule as `top_pick` (highest rating, ties broken by most reviews), but
# rows without a rating are removed explicitly before ranking.
poi_pick <- poi_clean %>%
  dplyr::select(
    places.displayName.text, places.formattedAddress,
    places.rating, places.userRatingCount
  ) %>%
  dplyr::filter(!is.na(places.rating)) %>%
  dplyr::arrange(
    dplyr::desc(places.rating),
    dplyr::desc(places.userRatingCount)
  ) %>%
  dplyr::slice(1)

print(poi_pick)

##   places.displayName.text   places.formattedAddress places.rating
## 1        The Story Garden Binghamton, NY 13903, USA             5
##   places.userRatingCount
## 1                     51

# Closing note printed at the end of the report.
print("This was a fun lab!")

## [1] "This was a fun lab!"

MA_Zhaoxin_Mini2_Attempt2

2025-10-02