# Place-id metadata: every file under PLACE_ID_PATH is read as feather and
# row-bound, `location` is renamed to `city`, and the result is reduced to the
# unique (city, lat, lon) combinations.
PLACE_ID_PATH <- "../data/place_ids/"
place_id_files <- list.files(PLACE_ID_PATH, full.names = TRUE)
place_id_meta <- map_df(place_id_files, read_feather) %>%
  rename(city = location) %>%
  distinct(city, lat, lon)

# Tidied reviews with coordinates attached, restricted to Subway.
REVIEW_PATH <- "../data/tidy_reviews.csv"
tidy_review_df <- read_csv(REVIEW_PATH) %>%
  select(city, name, review_num, review_tidy) %>%
  # `city` is the only column the two tables share at this point; naming the
  # key keeps the join from silently changing if columns are added later.
  left_join(place_id_meta, by = "city") %>%
  filter(!is.na(city),
         name == "SUBWAY®Restaurants") %>%
  # Drop repeated review text within a city. The same text can still appear
  # under different cities because de-duplication is done per group.
  group_by(city) %>%
  distinct(review_tidy, .keep_all = TRUE) %>%
  ungroup() %>%
  # Review length in characters of the tidied text.
  mutate(review_length = nchar(review_tidy))
The dataset includes 681 Google reviews from 156 cities in Iowa (city list taken from Wikipedia). The reviews were collected in the summer of 2017 using the Google Maps APIs (through the googleway R package). We identified all place_ids (businesses) within 5000 m (~3 mi) of each town, and then collected 200 pages of reviews (20/page) for each place_id. This analysis includes Subway reviews only.
Here’s a sample of the reviews:
# Print the tidied text of the first 20 reviews as a character vector.
tidy_review_df %>%
  pull(review_tidy) %>%
  head(20)
## [1] "food is good and service is usually quick"
## [2] "i love subways food however the staff seemed indifferent"
## [3] "friendly people"
## [4] "never had a sandwich messed up good with handling multiple orders clean and friendly staff"
## [5] "very good sandwiches friendly staff"
## [6] "subway is a great place to eat but unfortunately the employees seem like your the enemy and are very snappy and ill never eat there ever again the music is so d loud you cant hear your self think we ask about party platters and she was very rude to us and we did not get our information and well be eating other places"
## [7] "good place for a sandwich"
## [8] "older lady was very rude and was basically screaming at me and my husband because she couldnt hear us"
## [9] "always good food and service from a trained staff that can get you your food quickly and correctly poor location and access is not always the easiest off a busy road tight parking and awkward exit"
## [10] "always good food and service from a trained staff that can get you your food quickly and correctly poor location and access is not always the easiest off a busy road tight parking and awkward exit"
## [11] "good place for a quick sandwich to go lots of choices"
## [12] "good place for a quick sandwich to go lots of choices"
## [13] "really not sure what the deal with this location is lately the last three times we have been there they have run out of something pizza crust and meat balls twice now i understand things like this happen but this many times is a bit ridiculous"
## [14] "really not sure what the deal with this location is lately the last three times we have been there they have run out of something pizza crust and meat balls twice now i understand things like this happen but this many times is a bit ridiculous"
## [15] "new owner and new manager they are making positive changes and has gotten alot better im pretty happy"
## [16] "they good with service its just they dont give much meat"
## [17] "two employees were arguing over who turned off the oven then argued over something about my sandwichbread was dried outnot going back to this location"
## [18] "service was poor staff was more interested in talking to each other than taking my order"
## [19] "super nice people awesome food they have better bread than most"
## [20] "great positive service excellent store"
# Number of reviews per city (coordinates carried along as grouping columns),
# together with the natural log of that count.
n_reviews_by_city <- tidy_review_df %>%
  count(city, lat, lon) %>%
  mutate(log_n_reviews = log(n))

# Distribution of the per-city review counts, in bins one review wide.
n_reviews_by_city %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 1)

# The ten cities with the most reviews, rendered as a table.
n_reviews_by_city %>%
  arrange(desc(n)) %>%
  head(10) %>%
  kable()
| city | lat | lon | n | log_n_reviews |
|---|---|---|---|---|
| Ankeny | 41.73179 | -93.60013 | 10 | 2.302585 |
| Burlington | 40.80765 | -91.11289 | 10 | 2.302585 |
| Cedar Falls | 42.53490 | -92.44532 | 10 | 2.302585 |
| Clinton | 41.84447 | -90.18874 | 10 | 2.302585 |
| Lambs Grove | 41.70087 | -93.07891 | 10 | 2.302585 |
| Newton | 41.70332 | -93.05735 | 10 | 2.302585 |
| Ottumwa | 41.01603 | -92.40830 | 10 | 2.302585 |
| Sergeant Bluff | 42.40298 | -96.35884 | 10 | 2.302585 |
| Altoona | 41.64355 | -93.47508 | 9 | 2.197225 |
| Spencer | 43.14507 | -95.14432 | 8 | 2.079442 |
# Bounding box for Iowa, ordered (west lon, south lat, east lon, north lat).
# In the code shown here it is only used by the commented-out Stamen map.
iowa_bounding_box <- c(-96.6397171020508,
                       40.3755989074707, # southwest coordinates
                       -90.1400604248047,
                       43.5011367797852) # northeast coordinates
#iowa_map <- get_stamenmap(iowa_bounding_box,
# zoom = 5,
# maptype = "toner-lite")
#ggmap(iowa_map) +
# geom_point(aes(x = lon, y = lat, size = log(n)),
# data = n_reviews_by_city, alpha = .1, color = "red") +
# theme_bw()

# NOTE: despite the name, the values are ordered (longitude, latitude), which
# is the order a numeric `center` is given in for get_googlemap(). The name is
# kept so any other code referring to it keeps working.
DESMOINES_LAT_LON <- c(-93.6091064, 41.6005448)
iowa_map2 <- get_googlemap(center = DESMOINES_LAT_LON,
                           zoom = 7,
                           maptype = "roadmap")

# Cities drawn on the base map, colored by the log of their review count.
ggmap(iowa_map2, extent = "device") +
  geom_point(aes(x = lon, y = lat, color = log(n)),
             data = n_reviews_by_city, alpha = .5, size = 4) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()
# Histogram of review lengths (characters of tidied text) in 10-character bins.
ggplot(tidy_review_df, aes(x = review_length)) +
  geom_histogram(binwidth = 10) +
  xlab("Review length (characters)")
# Log of the mean review length for each city; lat/lon are part of the
# grouping so they are carried through to the summary.
review_length_by_city <- tidy_review_df %>%
  group_by(city, lat, lon) %>%
  summarise(log_mean_review_length = log(mean(review_length))) %>%
  ungroup()

# Cities on the base map, colored by log mean review length.
ggmap(iowa_map2, extent = "device") +
  geom_point(
    data = review_length_by_city,
    mapping = aes(x = lon, y = lat, color = log_mean_review_length),
    size = 4,
    alpha = .5
  ) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()
Entropy is estimated using the NSB measure (in bits). Note that only about a third of the reviews have a calculated entropy value (why?).
# Per-review entropy. Column names are supplied explicitly, so the file's
# first line is read as data. The values are joined onto the review table and
# reviews without an entropy value are dropped.
ENTROPY_PATH <- "../data/review_nsb_entropy.csv"
review_entropy <- read_csv(ENTROPY_PATH,
                           col_names = c("review_num",
                                         "entropy_bits")) %>%
  # `review_num` is the only column shared with tidy_review_df; naming it
  # makes the join key explicit.
  right_join(tidy_review_df, by = "review_num") %>%
  filter(!is.na(entropy_bits))

# Distribution of per-review entropy. 30 is the bin count geom_histogram()
# falls back to when none is given; stating it avoids the default-bins message.
review_entropy %>%
  ggplot(aes(x = entropy_bits)) +
  xlab("Word Entropy (bits)") +
  geom_histogram(bins = 30)
# Entropy per city corpus (one value per city); rows with a missing value
# are removed. Column names are supplied, so no header row is assumed.
ENTROPY_PATH_CORPUS <- "../data/review_nsb_entropy_subway_city_corpus.csv"
review_entropy_corpus <- filter(
  read_csv(ENTROPY_PATH_CORPUS,
           col_names = c("city", "entropy_bits_corpus")),
  !is.na(entropy_bits_corpus)
)

# Mean of the per-review entropy values within each city.
entropy_by_city <- review_entropy %>%
  group_by(city, lat, lon) %>%
  summarise(mean_entropy_bits = mean(entropy_bits)) %>%
  ungroup()

# Cities on the base map, colored by mean per-review entropy.
ggmap(iowa_map2, extent = "device") +
  geom_point(
    data = entropy_by_city,
    mapping = aes(x = lon, y = lat, color = mean_entropy_bits),
    size = 4,
    alpha = .5
  ) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()
Using the AFINN sentiment lexicon.
# Per-review sentiment scores, read with the CSV's own header.
SENTIMENT_PATH <- "../data/review_sentiment.csv"
review_sentiment <- read_csv(SENTIMENT_PATH)

# Distribution of review sentiment. 30 is the bin count geom_histogram()
# falls back to when none is given; stating it avoids the default-bins message.
review_sentiment %>%
  ggplot(aes(x = sentiment_score_afinn)) +
  xlab("Review Sentiment") +
  geom_histogram(bins = 30)

# Mean sentiment per city. The right join keeps every review in
# tidy_review_df, so reviews without a score contribute NA and are ignored by
# `na.rm = TRUE`.
# NOTE(review): no `by` is given, so the join uses every column name the two
# tables share -- presumably just `review_num`; confirm against the CSV header
# before making the key explicit.
sentiment_by_city <- review_sentiment %>%
  right_join(tidy_review_df %>% select(review_num, city, lat, lon)) %>%
  group_by(city, lat, lon) %>%
  summarize(mean_sentiment_score_afinn = mean(sentiment_score_afinn,
                                              na.rm = TRUE)) %>%
  ungroup()

# Cities on the base map, colored by mean sentiment.
ggmap(iowa_map2, extent = "device") +
  geom_point(aes(x = lon, y = lat, color = mean_sentiment_score_afinn),
             data = sentiment_by_city, alpha = .5, size = 4) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()
At the city level.
# City-level measures to combine, one table per measure. `census_data` and
# `exit_distance` are not defined in this part of the file.
all_dfs <- list(
  n_reviews_by_city %>% select(city, log_n_reviews),
  census_data %>% select(-lat, -lon, -log_prop_pop_hisp),
  exit_distance %>% select(city, log_min_distance_to_exit_meters),
  sentiment_by_city %>% select(city, mean_sentiment_score_afinn),
  review_length_by_city %>% select(city, log_mean_review_length),
  entropy_by_city %>% select(city, mean_entropy_bits),
  review_entropy_corpus %>% select(city, entropy_bits_corpus)
)

# Fold the tables together left to right; each join uses whatever columns the
# two sides share (presumably just `city` -- verify against census_data).
by_city_df <- all_dfs %>%
  reduce(left_join)

# Correlation plot over every column except the first one.
by_city_df %>%
  select(-1) %>%
  make_corr_plot()
# Linear model of mean per-review entropy on the city-level predictors.
summary(
  lm(
    mean_entropy_bits ~ log_mean_review_length + log_n_reviews + log_pop +
      prop_pop_white + median_income + log_min_distance_to_exit_meters,
    data = by_city_df
  )
)
##
## Call:
## lm(formula = mean_entropy_bits ~ log_mean_review_length + log_n_reviews +
## log_pop + prop_pop_white + median_income + log_min_distance_to_exit_meters,
## data = by_city_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.21488 -0.56451 -0.01633 0.48505 2.40316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.192e+00 2.510e+00 -1.272 0.2058
## log_mean_review_length 6.646e-01 1.114e-01 5.968 2.31e-08 ***
## log_n_reviews -1.967e-02 2.072e-01 -0.095 0.9245
## log_pop 4.760e-02 7.157e-02 0.665 0.5072
## prop_pop_white 2.448e+00 2.340e+00 1.046 0.2975
## median_income -1.157e-05 6.448e-06 -1.795 0.0751 .
## log_min_distance_to_exit_meters 7.863e-02 6.439e-02 1.221 0.2243
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8768 on 125 degrees of freedom
## (24 observations deleted due to missingness)
## Multiple R-squared: 0.2489, Adjusted R-squared: 0.2128
## F-statistic: 6.904 on 6 and 125 DF, p-value: 2.375e-06
# Linear model of city-corpus entropy on the city-level predictors.
summary(
  lm(
    entropy_bits_corpus ~ log_mean_review_length + log_n_reviews + log_pop +
      prop_pop_white + median_income + log_min_distance_to_exit_meters,
    data = by_city_df
  )
)
##
## Call:
## lm(formula = entropy_bits_corpus ~ log_mean_review_length + log_n_reviews +
## log_pop + prop_pop_white + median_income + log_min_distance_to_exit_meters,
## data = by_city_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.36425 -0.11032 0.02644 0.07503 0.42846
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.108e+00 1.157e+00 -0.957 0.351
## log_mean_review_length 1.191e+00 4.453e-02 26.741 < 2e-16 ***
## log_n_reviews 1.491e+00 1.020e-01 14.616 8.67e-12 ***
## log_pop -1.848e-02 4.333e-02 -0.426 0.675
## prop_pop_white -4.192e-01 9.811e-01 -0.427 0.674
## median_income -3.850e-07 3.111e-06 -0.124 0.903
## log_min_distance_to_exit_meters -2.016e-02 3.236e-02 -0.623 0.541
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1959 on 19 degrees of freedom
## (130 observations deleted due to missingness)
## Multiple R-squared: 0.9915, Adjusted R-squared: 0.9888
## F-statistic: 369.7 on 6 and 19 DF, p-value: < 2.2e-16
# Linear model of mean review sentiment on the city-level predictors.
summary(
  lm(
    mean_sentiment_score_afinn ~ log_mean_review_length + log_n_reviews +
      log_pop + prop_pop_white + median_income +
      log_min_distance_to_exit_meters,
    data = by_city_df
  )
)
##
## Call:
## lm(formula = mean_sentiment_score_afinn ~ log_mean_review_length +
## log_n_reviews + log_pop + prop_pop_white + median_income +
## log_min_distance_to_exit_meters, data = by_city_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1936 -0.3533 0.0788 0.5725 2.2861
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.419e-02 2.060e+00 -0.046 0.96358
## log_mean_review_length -5.639e-01 9.576e-02 -5.889 2.54e-08 ***
## log_n_reviews 4.552e-01 1.596e-01 2.852 0.00497 **
## log_pop 2.111e-02 6.637e-02 0.318 0.75086
## prop_pop_white 2.769e+00 1.764e+00 1.570 0.11862
## median_income 5.334e-06 5.929e-06 0.900 0.36975
## log_min_distance_to_exit_meters 5.440e-02 5.789e-02 0.940 0.34891
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8751 on 147 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.2676, Adjusted R-squared: 0.2377
## F-statistic: 8.952 on 6 and 147 DF, p-value: 2.48e-08
Social Variables
Population demographics
Using census data (`acm` package).
Population demographics - geographic
Exit distance
Exits identified using the OpenStreetMap API (`osmdata` package; “motorway_junction”). I then calculated the minimum distance from each city to an exit as the crow flies (“Haversine distance”). Driving time would be preferable here.
Exit distance - geographic distribution