# API credentials ----
# Both keys are read from environment variables (e.g. set in ~/.Renviron) so
# that no secret is hard-coded in the script.
census_key <- Sys.getenv("CENSUS_API_KEY")
google_api_key <- Sys.getenv("GOOGLE_API_KEY")

# Fail fast if either key is missing: Sys.getenv() returns "" for an unset
# variable, which would otherwise only surface later as an API error.
stopifnot(nzchar(census_key), nzchar(google_api_key))

# Register the Census key for this session (used by tidycensus::get_acs()).
tidycensus::census_api_key(census_key)
# Load the Google POIs saved in Mini Assignment 1 (readRDS pattern from Lab 2).
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this file exists.
poi_rds_path <- "C:/Users/jenny/OneDrive - Georgia Institute of Technology/Desktop/CP8883/Mini 1/google_poi_binghamton.rds"
if (!file.exists(poi_rds_path)) {
  stop("POI file not found: ", poi_rds_path, call. = FALSE)
}
pois <- readRDS(poi_rds_path)

# Column names from Mini 1, printed so they can be referred back to below.
names(pois)
# Purpose: keep unique places, select the columns used later, then convert the
# table to an sf point layer.
poi_clean <- pois %>%
  dplyr::distinct(id, .keep_all = TRUE) %>%
  dplyr::select(
    id, displayName.text, formattedAddress, types,
    location.latitude, location.longitude, rating, userRatingCount
  )

# POINTS layer in WGS84 (EPSG:4326). Coordinates are given as (x = longitude,
# y = latitude). Rows with missing coordinates are dropped first because
# sf::st_as_sf() stops with an error on NA coordinates by default.
poi_points <- sf::st_as_sf(
  dplyr::filter(
    poi_clean,
    !is.na(location.latitude),
    !is.na(location.longitude)
  ),
  coords = c("location.longitude", "location.latitude"),
  crs = 4326
)
# Census rent data ----
# The exact table/variable IDs were looked up on the Census Bureau website.

# From lab: cache downloaded shapefiles so repeated runs do not re-download.
options(tigris_use_cache = TRUE)

# `state_abb` and `counties` are not defined in this part of the script; stop
# with a clear message instead of failing inside the API call.
if (!exists("state_abb") || !exists("counties")) {
  stop(
    "Define `state_abb` and `counties` before requesting ACS data.",
    call. = FALSE
  )
}

# The variable code is given without the E/M suffix. With output = "wide",
# tidycensus names the estimate column `<name>E` and the margin of error
# `<name>M`, i.e. `median_rent_2brE` / `median_rent_2brM`.
acs_vars <- c(median_rent_2br = "B25031_004")

tab_2015 <- tidycensus::get_acs(
  geography = "tract",
  variables = acs_vars,
  year = 2015,
  survey = "acs5",
  state = state_abb,
  county = counties,
  geometry = FALSE,
  output = "wide"
)

tab_2023 <- tidycensus::get_acs(
  geography = "tract",
  variables = acs_vars,
  year = 2023,
  survey = "acs5",
  state = state_abb,
  county = counties,
  geometry = FALSE,
  output = "wide"
)

# Tract geometries with tigris.
tract_geom <- tigris::tracts(
  state = state_abb,
  county = counties,
  year = 2023
)

# Join the tables to the shapes (same mutate/select pattern as the EDA
# exercise) and compute the 2015 -> 2023 change in nominal dollars.
# NOTE(review): the 2015 ACS is published on 2010 tract boundaries while the
# 2023 geometries use 2020 boundaries, so tracts that were split or merged
# will not match on GEOID and get NA for 2015 - confirm this is acceptable.
tracts <- tract_geom %>%
  dplyr::left_join(
    tab_2015 %>%
      dplyr::select(GEOID, median_rent_2br_2015 = median_rent_2brE),
    by = "GEOID"
  ) %>%
  dplyr::left_join(
    tab_2023 %>%
      dplyr::select(GEOID, median_rent_2br_2023 = median_rent_2brE),
    by = "GEOID"
  ) %>%
  dplyr::mutate(
    median_rent_2br_change = median_rent_2br_2023 - median_rent_2br_2015
  )
# POI cleaning with before/after row counts ----
cat("Rows (raw):", nrow(pois), "\n")

# Remove duplicate places (same Google place id), keep only the columns used
# in the analysis, and drop rows with missing coordinates (EDA cleaning).
poi_clean <- pois %>%
  dplyr::distinct(id, .keep_all = TRUE) %>%
  dplyr::select(
    id, displayName.text, formattedAddress, types,
    location.latitude, location.longitude, rating, userRatingCount
  ) %>%
  dplyr::filter(
    !is.na(location.latitude),
    !is.na(location.longitude)
  )

cat("Rows (dedup + non-missing coords):", nrow(poi_clean), "\n")

# Point layer in WGS84 (EPSG:4326); coords are given as (longitude, latitude).
poi_points <- sf::st_as_sf(
  poi_clean,
  coords = c("location.longitude", "location.latitude"),
  crs = 4326
)
# Top POI types ----
# `types` is a list-column; unnest_longer() turns each type into its own row,
# so a place with several types is counted once per type.
poi_types <- tidyr::unnest_longer(poi_clean, types)

# The ten most frequent types.
top_types <- poi_types %>%
  dplyr::count(types, sort = TRUE) %>%
  dplyr::slice_head(n = 10)

# Horizontal bar chart, bars ordered by count.
ggplot2::ggplot(top_types, ggplot2::aes(x = reorder(types, n), y = n)) +
  ggplot2::geom_col() +
  ggplot2::coord_flip() +
  ggplot2::labs(x = "Type", y = "Count", title = "Top POI types")

# Same chart with a fill colour.
ggplot2::ggplot(top_types, ggplot2::aes(x = reorder(types, n), y = n)) +
  ggplot2::geom_col(fill = "blue") +
  ggplot2::coord_flip() +
  ggplot2::labs(x = "Type", y = "Count", title = "Top POI types")
# Histogram of ratings in 0.2-wide bins aligned to 0.
ggplot2::ggplot(poi_clean, ggplot2::aes(rating)) +
  ggplot2::geom_histogram(
    binwidth = 0.2,
    boundary = 0,
    fill = "green",
    color = "blue"
  ) +
  ggplot2::labs(
    x = "Rating",
    y = "POI count",
    title = "Distribution of POI ratings"
  )

# Rating against number of reviews, red points. The x axis is on a log10
# scale because review counts are spread over several orders of magnitude.
ggplot2::ggplot(poi_clean, ggplot2::aes(userRatingCount, rating)) +
  ggplot2::geom_point(alpha = 0.8, color = "red") +
  ggplot2::scale_x_log10() +
  ggplot2::labs(
    x = "User rating count (log scale)",
    y = "Rating",
    title = "Ratings vs. number of reviews"
  )
# Maps ----
# Learned from previous GIS experience: choropleth of the change in median
# 2-bedroom rent per tract. The leading "-" reverses the RdYlBu palette.
tmap::tm_shape(tracts) +
  tmap::tm_fill("median_rent_2br_change", palette = "-RdYlBu") +
  tmap::tm_borders()

# Colour POI dots by rating (palette used in the class lab examples) and size
# them by number of reviews.
tmap::tm_shape(poi_points) +
  tmap::tm_dots(col = "rating", size = "userRatingCount", palette = "magma")
# RESULTS from Mini 2: what the lab asks for ----
# 1. COMPARE 2 POI TYPES (PARKS VS MUSEUMS) ----
# Print a summary table and a chart.
# Seen in EDA: unnest the list-column, then group_by()/summarise().
poi_types_long <- tidyr::unnest_longer(poi_clean, types)

# One row per type: number of distinct places, mean rating, mean review count.
type_summary <- poi_types_long %>%
  dplyr::filter(types %in% c("park", "museum")) %>%
  dplyr::group_by(types) %>%
  dplyr::summarise(
    n_places = dplyr::n_distinct(id),
    avg_rating = mean(rating, na.rm = TRUE),
    avg_reviews = mean(userRatingCount, na.rm = TRUE)
  ) %>%
  dplyr::arrange(desc(n_places))

print(type_summary)

# Bar chart of place counts; the legend is hidden because the x axis already
# labels each type.
ggplot2::ggplot(
  type_summary,
  ggplot2::aes(x = types, y = n_places, fill = types)
) +
  ggplot2::geom_col() +
  ggplot2::labs(x = "Type", y = "Count", title = "Parks vs. Museums") +
  ggplot2::scale_fill_manual(values = c(park = "blue", museum = "green")) +
  ggplot2::theme(legend.position = "none")
# 2. AVERAGE RATING ----
# Mean rating over all cleaned POIs, ignoring places without a rating.
avg_rating <- poi_clean %>%
  dplyr::summarise(avg_rating = mean(rating, na.rm = TRUE))

print(avg_rating)

# Rating against number of reviews (log10 x axis).
ggplot2::ggplot(poi_clean, ggplot2::aes(userRatingCount, rating)) +
  ggplot2::geom_point(alpha = 0.8, color = "purple") +
  ggplot2::scale_x_log10() +
  ggplot2::labs(
    x = "User rating count (log scale)",
    y = "Rating",
    title = "Ratings vs. number of reviews"
  )

# Pearson correlation on the raw (not log-transformed) review counts, using
# only rows where both values are present.
corr_rr <- with(
  poi_clean,
  cor(userRatingCount, rating, use = "complete.obs")
)
# sep = "" is used, so the label carries its own trailing space.
cat(
  "Correlation between rating and # of reviews (log not applied): ",
  round(corr_rr, 3),
  "\n",
  sep = ""
)
# 3. ASSOCIATION BETWEEN PRICE LEVEL AND RATING SCORE ----
# (Author note: unsure about this one.)
# The price column is looked up in the raw `pois` table: `poi_clean` keeps
# only a fixed set of columns that does not include a price level, so checking
# `poi_clean` could never find it.
price_col <- intersect(c("priceLevel", "places.priceLevel"), names(pois))

if (length(price_col) > 0) {
  # Prefer "priceLevel" when both candidate names are present.
  price_col <- price_col[[1]]

  # One row per price level: number of places and their mean rating.
  price_rating <- pois %>%
    dplyr::distinct(id, .keep_all = TRUE) %>%
    dplyr::filter(!is.na(.data[[price_col]])) %>%
    dplyr::group_by(priceLevel = .data[[price_col]]) %>%
    dplyr::summarise(
      n = dplyr::n(),
      avg_rate = mean(rating, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    dplyr::arrange(priceLevel)

  print(price_rating)

  # print() is explicit so the chart is drawn even when the script is run via
  # source(), where values inside braces are not auto-printed.
  print(
    ggplot2::ggplot(
      price_rating,
      ggplot2::aes(x = factor(priceLevel), y = avg_rate)
    ) +
      ggplot2::geom_col(fill = "purple") +
      ggplot2::labs(
        x = "Price level",
        y = "Average rating",
        title = "Average rating by price level"
      )
  )
} else {
  cat("Price level is not available in this POI dataset, so no price–rating analysis was produced in this mini assignment.\n")
}
# 4. DO POIS CLUSTER OR SPREAD EVENLY? ----
# Seen in the EDA exercise: spatial join of tracts and POIs. Both layers are
# transformed to EPSG:3857 so they share one projected CRS for the join.
tracts_joined <- sf::st_join(
  sf::st_transform(tracts, 3857),
  sf::st_transform(poi_points, 3857)
)

# st_join() is a left join: a tract with no POI still yields one row, with NA
# in the POI columns. Counting rows would report 1 for such tracts, so count
# only rows that actually carry a POI id.
poi_per_tract <- tracts_joined %>%
  sf::st_drop_geometry() %>%
  dplyr::group_by(GEOID) %>%
  dplyr::summarise(poi_n = sum(!is.na(id)), .groups = "drop")

# Attach the counts to the tract polygons; any tract missing from the count
# table gets 0.
tracts_counts <- tracts %>%
  dplyr::left_join(poi_per_tract, by = "GEOID") %>%
  dplyr::mutate(poi_n = tidyr::replace_na(poi_n, 0L))

# Interactive choropleth of POIs per tract.
tmap::tmap_mode("view")
tmap::tm_shape(tracts_counts) +
  tmap::tm_fill("poi_n", palette = "YlOrRd") +
  tmap::tm_borders() +
  tmap::tm_layout(title = "POIs per tract")
# 5. ONE POI TO VISIT ----
# Choose the highest-rated place, breaking ties by the most reviews. Places
# without a rating are excluded.
poi_pick <- poi_clean %>%
  dplyr::filter(!is.na(rating)) %>%
  dplyr::arrange(dplyr::desc(rating), dplyr::desc(userRatingCount)) %>%
  dplyr::slice(1) %>%
  dplyr::select(displayName.text, formattedAddress, rating, userRatingCount)

print(poi_pick)

print("This was a fun lab!")