# Import the POI data exported in Mini 1 (one .rds file per city).
# City selection: Binghamton and Albany; two POI types: park, museum.
path_bing <- "C:/Users/jenny/OneDrive - Georgia Institute of Technology/Desktop/CP8883/Mini 1/google_poi_binghamton.rds"
path_alb <- "C:/Users/jenny/OneDrive - Georgia Institute of Technology/Desktop/CP8883/Mini 1/google_poi_albany.rds"

# The Binghamton file is mandatory: stop immediately when it is missing.
stopifnot(file.exists(path_bing))
pois_bing <- readRDS(path_bing)

# The Albany file is optional; pois_alb is NULL when it is not on disk.
# Warn in that case so a two-city analysis does not silently become a
# one-city analysis.
if (file.exists(path_alb)) {
  pois_alb <- readRDS(path_alb)
} else {
  pois_alb <- NULL
  warning(
    "Albany POI file not found; continuing with Binghamton only: ", path_alb,
    call. = FALSE
  )
}
# Reduce a raw POI table to the fields used downstream and prefix them.
#
# df: a data frame with the columns id, displayName.text, formattedAddress,
#     types, location.latitude, location.longitude, rating and
#     userRatingCount; a priceLevel column is optional.
# Returns df restricted to those columns, in that order, with each of the
# eight required columns renamed to `places.<original name>`. priceLevel keeps
# its unprefixed name and is simply absent when df does not contain it.
normalize_cols <- function(df) {
  # select() renames while it selects, so one call does both jobs.
  select(
    df,
    places.id = id,
    places.displayName.text = displayName.text,
    places.formattedAddress = formattedAddress,
    places.types = types,
    places.location.latitude = location.latitude,
    places.location.longitude = location.longitude,
    places.rating = rating,
    places.userRatingCount = userRatingCount,
    priceLevel = any_of("priceLevel")
  )
}
# Stack the normalized city tables. When the Albany data was not loaded
# (pois_alb is NULL) it contributes nothing, because bind_rows() skips NULL.
alb_rows <- if (is.null(pois_alb)) NULL else normalize_cols(pois_alb)
poi_raw <- bind_rows(normalize_cols(pois_bing), alb_rows)

# Preview the first five rows as a compact table
kable_styling(kable(head(poi_raw, 5)), full_width = FALSE)
## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
places.id
|
places.displayName.text
|
places.formattedAddress
|
places.types
|
places.location.latitude
|
places.location.longitude
|
places.rating
|
places.userRatingCount
|
ChIJJwAb9RDv2okR2XbyH_7Riic
|
Recreation Park
|
Beethoven St & Seminary Ave, Binghamton, NY 13905, USA
|
park , tourist_attraction , swimming_pool , sports_activity_location,
point_of_interest , establishment
|
42.09972
|
-75.93330
|
4.6
|
1652
|
ChIJAQAA9OLu2okRKGBanm19Ikc
|
West End Park
|
95 Margaret St, Binghamton, NY 13905, USA
|
park , point_of_interest, establishment
|
42.10192
|
-75.94838
|
4.5
|
78
|
ChIJ__nmo1_v2okRYwUnzIL7Zn8
|
Recreation Park Ice Skating Rink
|
60-100 Beethoven St, Binghamton, NY 13905, USA
|
park , point_of_interest, establishment
|
42.09885
|
-75.93467
|
5.0
|
1
|
ChIJVVWVPBDv2okR-wWtbyO0gB4
|
Wallenburg Park
|
Mendelssohn St, Binghamton, NY 13905, USA
|
park , point_of_interest, establishment
|
42.10122
|
-75.93299
|
NA
|
NA
|
ChIJAVRE8OLu2okRNl7DP786-C4
|
West End Park
|
95 Margaret St, Binghamton, NY 13905, USA
|
park , premise , point_of_interest, establishment , street_address
|
42.10167
|
-75.94814
|
NA
|
NA
|
# Tidy your data
# Remove duplicated rows: keep the first row encountered for each places.id
poi_unique <- distinct(poi_raw, places.id, .keep_all = TRUE)

# Report how many rows the de-duplication removed
n_before <- nrow(poi_raw)
n_after <- nrow(poi_unique)
message("Rows before de-dup: ", n_before, " | after: ", n_after)
## Rows before de-dup: 408 | after: 131
# Flatten list-columns so the table stays one flat row per POI.
# Each element of the list-column `places.types` is joined into a single
# comma-separated string.
poi_flat <- mutate(
  poi_unique,
  places.types = map_chr(places.types, function(tp) str_c(tp, collapse = ","))
)

# Keep every column that is not a list, plus `places.types` in any case
# (other list-columns are rare with the current schema).
is_list_col <- vapply(poi_flat, is.list, logical(1))
keep_col <- !is_list_col | names(is_list_col) %in% c("places.types")
poi_flat <- poi_flat[, keep_col]

# Quick check of the collapsed type strings
head(poi_flat$places.types, 3)
## [1] "park,tourist_attraction,swimming_pool,sports_activity_location,point_of_interest,establishment"
## [2] "park,point_of_interest,establishment"
## [3] "park,point_of_interest,establishment"
### Handle missing values
# Rows must have a rating and a review count. A `places.priceLevel` column is
# required as well, but only when the table actually has one.
important_cols <- c("places.rating", "places.userRatingCount")
if ("places.priceLevel" %in% names(poi_flat)) {
  important_cols <- append(important_cols, "places.priceLevel")
}

poi_dropna <- drop_na(poi_flat, any_of(important_cols))

n_before <- nrow(poi_flat)
n_after <- nrow(poi_dropna)
message("Rows before NA-drop (important cols): ", n_before, " | after: ", n_after)
## Rows before NA-drop (important cols): 131 | after: 74
# Filter by location (city boundary)
# Place polygons for New York from tigris, reduced to the two study cities and
# transformed to EPSG:4326 to match the POI coordinates. No year is passed, so
# tigris picks its default vintage.
ny_places <- st_transform(
  filter(
    tigris::places("NY", progress_bar = FALSE),
    NAME %in% c("Binghamton", "Albany")
  ),
  4326
)
## Retrieving data for the year 2022
# Turn the POI table into sf points (EPSG:4326). remove = FALSE keeps the
# original longitude/latitude columns next to the geometry.
poi_sf <- st_as_sf(
  poi_dropna,
  coords = c("places.location.longitude", "places.location.latitude"),
  crs = 4326,
  remove = FALSE
)

# Spatial subset: keep only the points that intersect a selected city polygon
poi_sf_in <- poi_sf[ny_places, ]

n_before <- nrow(poi_sf)
n_after <- nrow(poi_sf_in)
message("Rows before city filter: ", n_before, " | after: ", n_after)
## Rows before city filter: 74 | after: 41
# Final cleaned dataset: same rows, geometry column removed for tabular work
poi_clean <- st_drop_geometry(poi_sf_in)

# Show cleaned POI data (first ten rows)
kable_styling(kable(head(poi_clean, 10)), full_width = FALSE)
## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
## Warning in attr(x, "align"): 'xfun::attr()' is deprecated.
## Use 'xfun::attr2()' instead.
## See help("Deprecated")
places.id
|
places.displayName.text
|
places.formattedAddress
|
places.types
|
places.location.latitude
|
places.location.longitude
|
places.rating
|
places.userRatingCount
|
ChIJJwAb9RDv2okR2XbyH_7Riic
|
Recreation Park
|
Beethoven St & Seminary Ave, Binghamton, NY 13905, USA
|
park,tourist_attraction,swimming_pool,sports_activity_location,point_of_interest,establishment
|
42.09972
|
-75.93330
|
4.6
|
1652
|
ChIJAQAA9OLu2okRKGBanm19Ikc
|
West End Park
|
95 Margaret St, Binghamton, NY 13905, USA
|
park,point_of_interest,establishment
|
42.10192
|
-75.94838
|
4.5
|
78
|
ChIJ__nmo1_v2okRYwUnzIL7Zn8
|
Recreation Park Ice Skating Rink
|
60-100 Beethoven St, Binghamton, NY 13905, USA
|
park,point_of_interest,establishment
|
42.09885
|
-75.93467
|
5.0
|
1
|
ChIJm7PolWvv2okRnbwE4r14MxU
|
Roberson Museum
|
30 Front St, Binghamton, NY 13905, USA
|
museum,planetarium,wedding_venue,tourist_attraction,event_venue,point_of_interest,establishment
|
42.09390
|
-75.91858
|
4.7
|
170
|
ChIJJ3brEmnv2okRDe1z0ZtJ9Rw
|
Confluence Park
|
N Shore Dr, Binghamton, NY 13901, USA
|
park,tourist_attraction,point_of_interest,establishment
|
42.09286
|
-75.91639
|
4.5
|
353
|
ChIJgWE_TUfv2okRr3Hk9WGkK0E
|
MacArthur Park
|
Vestal Ave, Binghamton, NY 13903, USA
|
park,tourist_attraction,point_of_interest,establishment
|
42.08618
|
-75.92157
|
4.5
|
217
|
ChIJ87Gt79Tv2okRjTF9rXGIxDs
|
Greater Binghamton Greenway - Trailhead at McArthur Park
|
Vestal, Denton Rd, Binghamton, NY 13903, USA
|
hiking_area,park,sports_activity_location,point_of_interest,establishment
|
42.08538
|
-75.92558
|
5.0
|
3
|
ChIJAZwM2GTv2okRKRS0wKzs4zk
|
Southside to Binghamton University Trailhead
|
3 Zane Rd, Binghamton, NY 13903, USA
|
hiking_area,park,sports_activity_location,point_of_interest,establishment
|
42.08905
|
-75.91679
|
5.0
|
1
|
ChIJNQtPQgDv2okR9PaGYEB3Kwo
|
Duck Duck Goose Island
|
6 Vermont Ave, Binghamton, NY 13905, USA
|
park,point_of_interest,establishment
|
42.08836
|
-75.92774
|
5.0
|
2
|
ChIJ3zUYHXjv2okRCAWqwN_Iqnw
|
Confluence Point Southside Park
|
8 Conklin Ave, Binghamton, NY 13903, USA
|
hiking_area,park,sports_activity_location,point_of_interest,establishment
|
42.09141
|
-75.91532
|
5.0
|
1
|
# Explore and report findings
# 1) Park vs Museum differences
# Long table with one row per (POI, type) pair: the comma-separated type
# string is split into a list-column and then unnested.
poi_types_long <- poi_clean %>%
  select(
    places.id, places.displayName.text, places.rating,
    places.userRatingCount, types_vec = places.types
  ) %>%
  mutate(types_vec = str_split(types_vec, ",")) %>%
  unnest(types_vec)

# Per-type summary for the two POI types of interest. A POI tagged with both
# types contributes to both groups.
types_of_interest <- c("park", "museum")
pm_summary <- poi_types_long %>%
  filter(types_vec %in% types_of_interest) %>%
  group_by(types_vec) %>%
  summarise(
    n_places = n_distinct(places.id),
    avg_rating = mean(places.rating, na.rm = TRUE),
    avg_reviews = mean(places.userRatingCount, na.rm = TRUE),
    .groups = "drop"
  )
pm_summary
## # A tibble: 2 × 4
## types_vec n_places avg_rating avg_reviews
## <chr> <int> <dbl> <dbl>
## 1 museum 8 4.78 113.
## 2 park 33 4.46 97.6
# Quick bar chart (minimal): number of places per POI type
ggplot(pm_summary) +
  geom_col(aes(x = types_vec, y = n_places, fill = types_vec)) +
  labs(title = "Parks vs Museums", x = NULL, y = "Count") +
  theme(legend.position = "none")

# 2) Average rating & relation to number of ratings
# Mean rating over all cleaned POIs, as a one-row table
avg_rating <- summarise(
  poi_clean,
  avg_rating = mean(places.rating, na.rm = TRUE)
)
avg_rating
## avg_rating
## 1 4.519512
# Rating against review count; the x axis is on a log10 scale
ggplot(poi_clean) +
  geom_point(
    aes(x = places.userRatingCount, y = places.rating),
    alpha = 0.7
  ) +
  scale_x_log10() +
  labs(title = "Ratings vs. reviews", x = "User rating count (log)", y = "Rating")

# 3) Price level vs rating (if available)
# Extra (fifth) analysis; the lab only requires four.
# normalize_cols() keeps the price column unprefixed (`priceLevel`); the
# `places.priceLevel` spelling is accepted too. `priceLevel` wins when both
# are present.
price_col <- intersect(c("priceLevel", "places.priceLevel"), names(poi_clean))

if (length(price_col) > 0) {
  price_col <- price_col[1]

  price_rating <- poi_clean %>%
    dplyr::filter(!is.na(.data[[price_col]])) %>%
    # Name the grouping column directly instead of renaming by position later.
    dplyr::group_by(priceLevel = .data[[price_col]]) %>%
    dplyr::summarise(
      n = dplyr::n(),
      # The rating column in poi_clean is `places.rating`; a bare `rating`
      # column does not exist and would raise "object 'rating' not found".
      avg_rate = mean(places.rating, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    dplyr::arrange(priceLevel)
  print(price_rating)

  # Print explicitly so the plot is drawn even when the script is source()d.
  print(
    ggplot2::ggplot(
      price_rating,
      ggplot2::aes(x = factor(priceLevel), y = avg_rate)
    ) +
      ggplot2::geom_col(fill = "purple") +
      ggplot2::labs(
        x = "Price level", y = "Average rating",
        title = "Average rating by price level"
      )
  )
} else {
  cat("Price level is not available in this POI dataset, so no price–rating analysis was produced in this mini assignment.\n")
}
## Price level is not available in this POI dataset, so no price–rating analysis was produced in this mini assignment.
# 4) Simple clustering signal by city (relative concentration)
# Number of cleaned POIs inside each city polygon.
# st_join() is a left join: a city without any POI still produces one row,
# with every POI field NA. Counting rows would report 1 for such a city, so
# only rows that carry a matched POI (non-NA places.id) are counted.
city_counts <- ny_places %>%
  select(NAME) %>%
  st_join(poi_sf_in) %>%
  st_drop_geometry() %>%
  count(NAME, wt = !is.na(places.id), name = "poi_n")
city_counts
## NAME poi_n
## 1 Albany 1
## 2 Binghamton 41
# One POI to visit: best rating first, most reviews as the tie-breaker.
# Only the four columns needed for the report are carried through.
top_pick <- poi_clean %>%
  select(
    places.displayName.text, places.formattedAddress,
    places.rating, places.userRatingCount
  ) %>%
  arrange(desc(places.rating), desc(places.userRatingCount)) %>%
  slice(1)
top_pick
## places.displayName.text places.formattedAddress places.rating
## 1 The Story Garden Binghamton, NY 13903, USA 5
## places.userRatingCount
## 1 51
# 5. ONE POI TO VISIT
# Selection rule: highest rating, ties broken by the larger review count.
# Rows without a rating are excluded before ranking.
poi_pick <- poi_clean %>%
  dplyr::select(
    places.displayName.text, places.formattedAddress,
    places.rating, places.userRatingCount
  ) %>%
  dplyr::filter(!is.na(places.rating)) %>%
  dplyr::arrange(
    dplyr::desc(places.rating),
    dplyr::desc(places.userRatingCount)
  ) %>%
  dplyr::slice(1)
print(poi_pick)
## places.displayName.text places.formattedAddress places.rating
## 1 The Story Garden Binghamton, NY 13903, USA 5
## places.userRatingCount
## 1 51
# Closing note printed at the end of the report
print("This was a fun lab!")
## [1] "This was a fun lab!"