# Directory of feather files holding place-ID metadata.
PLACE_ID_PATH <- "../data/place_ids/"
place_id_files <- list.files(PLACE_ID_PATH, full.names = TRUE)

# Stack every place-ID file and keep one row per distinct (city, lat, lon).
# The source column `location` is renamed to `city`, the name used downstream.
place_id_meta <- map_df(place_id_files, read_feather) %>%
  rename(city = location) %>%
  distinct(city, lat, lon)

# Tidy review text restricted to Subway restaurants, annotated with the
# coordinates stored for each review's city and with the review length.
REVIEW_PATH <- "../data/tidy_reviews.csv"
tidy_review_df <- read_csv(REVIEW_PATH) %>%
  select(city, name, review_num, review_tidy) %>%
  # After the select() above, `city` is the only column shared with
  # place_id_meta; naming the key avoids the "Joining, by" message and guards
  # against the key silently changing if either table gains columns.
  left_join(place_id_meta, by = "city") %>%
  filter(!is.na(city),
         name == "SUBWAY®Restaurants") %>%
  # Drop repeated review text within a city; identical text can still appear
  # under different cities.
  group_by(city) %>%
  distinct(review_tidy, .keep_all = TRUE) %>%
  ungroup() %>%
  # Review length in characters.
  mutate(review_length = nchar(review_tidy))

Reviews

The dataset includes 681 Google reviews from 156 cities in Iowa (city list taken from Wikipedia). The reviews were collected in the summer of 2017 using the Google Maps APIs (through the googleway R package). We identified all place_ids (businesses) within 5000 m (~3 mi) of each town, and then collected 200 pages of reviews (20 per page) for each place_id. Only Subway reviews are included here.

Here’s a sample of the reviews:

# Print the text of the first 20 reviews.
head(tidy_review_df$review_tidy, 20)
##  [1] "food is good and service is usually quick"                                                                                                                                                                                                                                                                                     
##  [2] "i love subways food however the staff seemed indifferent"                                                                                                                                                                                                                                                                      
##  [3] "friendly people"                                                                                                                                                                                                                                                                                                               
##  [4] "never had a sandwich messed up good with handling multiple orders clean and friendly staff"                                                                                                                                                                                                                                    
##  [5] "very good sandwiches friendly staff"                                                                                                                                                                                                                                                                                           
##  [6] "subway is a great place to eat but unfortunately the employees seem like your the enemy and are very snappy and ill never eat there ever again the music is so d loud you cant hear your self think we ask about party platters and she was very rude to us and we did not get our information and well be eating other places"
##  [7] "good place for a sandwich"                                                                                                                                                                                                                                                                                                     
##  [8] "older lady was very rude and was basically screaming at me and my husband because she couldnt hear us"                                                                                                                                                                                                                         
##  [9] "always good food and service from a trained staff that can get you your food quickly and correctly poor location and access is not always the easiest off a busy road tight parking and awkward exit"                                                                                                                          
## [10] "always good food and service from a trained staff that can get you your food quickly and correctly poor location and access is not always the easiest off a busy road tight parking and awkward exit"                                                                                                                          
## [11] "good place for a quick sandwich to go lots of choices"                                                                                                                                                                                                                                                                         
## [12] "good place for a quick sandwich to go lots of choices"                                                                                                                                                                                                                                                                         
## [13] "really not sure what the deal with this location is lately the last three times we have been there they have run out of something pizza crust and meat balls twice now i understand things like this happen but this many times is a bit ridiculous"                                                                           
## [14] "really not sure what the deal with this location is lately the last three times we have been there they have run out of something pizza crust and meat balls twice now i understand things like this happen but this many times is a bit ridiculous"                                                                           
## [15] "new owner and new manager they are making positive changes and has gotten alot better im pretty happy"                                                                                                                                                                                                                         
## [16] "they good with service its just they dont give much meat"                                                                                                                                                                                                                                                                      
## [17] "two employees were arguing over who turned off the oven then argued over something about my sandwichbread was dried outnot going back to this location"                                                                                                                                                                        
## [18] "service was poor staff was more interested in talking to each other than taking my order"                                                                                                                                                                                                                                      
## [19] "super nice people awesome food they have better bread than most"                                                                                                                                                                                                                                                               
## [20] "great positive service excellent store"

Number of reviews

By city

# Number of reviews per (city, lat, lon) combination, plus its natural log.
n_reviews_by_city <- mutate(
  count(tidy_review_df, city, lat, lon),
  log_n_reviews = log(n)
)

# Distribution of per-city review counts, one bar per unit of count.
n_reviews_by_city %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 1)

# Table of the ten cities with the most reviews.
n_reviews_by_city %>%
  arrange(desc(n)) %>%
  head(10) %>%
  kable()
city lat lon n log_n_reviews
Ankeny 41.73179 -93.60013 10 2.302585
Burlington 40.80765 -91.11289 10 2.302585
Cedar Falls 42.53490 -92.44532 10 2.302585
Clinton 41.84447 -90.18874 10 2.302585
Lambs Grove 41.70087 -93.07891 10 2.302585
Newton 41.70332 -93.05735 10 2.302585
Ottumwa 41.01603 -92.40830 10 2.302585
Sergeant Bluff 42.40298 -96.35884 10 2.302585
Altoona 41.64355 -93.47508 9 2.197225
Spencer 43.14507 -95.14432 8 2.079442

By city - geographical distribution

# Bounding box for Iowa as c(west lon, south lat, east lon, north lat).
# In this section it is only referenced by the disabled Stamen map code below.
iowa_bounding_box <- c(
  -96.6397171020508, 40.3755989074707, # southwest corner (lon, lat)
  -90.1400604248047, 43.5011367797852  # northeast corner (lon, lat)
)

# Alternative Stamen base map, kept for reference (currently disabled):
# iowa_map <- get_stamenmap(iowa_bounding_box,
#                           zoom = 5,
#                           maptype = "toner-lite")
#
# ggmap(iowa_map) +
#   geom_point(aes(x = lon, y = lat, size = log(n)),
#              data = n_reviews_by_city, alpha = .1, color = "red") +
#   theme_bw()


# Map centre (Des Moines). NOTE: despite the variable name, the order is
# (longitude, latitude), which is the order get_googlemap() takes for
# `center`; the elements are named so the order is unambiguous.
DESMOINES_LAT_LON <- c(lon = -93.6091064, lat = 41.6005448)

# Google road map used as the base layer for the maps in this document.
# NOTE(review): recent ggmap versions require a registered Google API key
# before this call succeeds — confirm for the installed version.
iowa_map2 <- get_googlemap(center = DESMOINES_LAT_LON,
                           zoom = 7,
                           maptype = "roadmap")

# Map of review counts: one point per city, coloured by log(n).
ggmap(iowa_map2, extent = "device") +
  geom_point(
    data = n_reviews_by_city,
    mapping = aes(x = lon, y = lat, color = log(n)),
    size = 4,
    alpha = .5
  ) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()

Review language measures

Length

# Distribution of review length in characters, using 10-character bins.
ggplot(tidy_review_df, aes(x = review_length)) +
  geom_histogram(binwidth = 10) +
  xlab("Review length (characters)")

Length - geographic distribution by city

# Log of the mean review length for each (city, lat, lon) combination.
review_length_by_city <- ungroup(
  summarize(
    group_by(tidy_review_df, city, lat, lon),
    log_mean_review_length = log(mean(review_length))
  )
)

# Map of review length: one point per city, coloured by log mean length.
ggmap(iowa_map2, extent = "device") +
  geom_point(
    data = review_length_by_city,
    mapping = aes(x = lon, y = lat, color = log_mean_review_length),
    size = 4,
    alpha = .5
  ) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()

Entropy

Entropy is estimated using the NSB measure (in bits). Note that only a third of the reviews have a calculated entropy measure (why?).

  • Should zeros for words not in reviews be included?
  • Should we treat each city as a corpus?
# Per-review entropy estimates (bits). Column names are supplied explicitly,
# so the first line of the CSV is read as data rather than as a header.
ENTROPY_PATH <- "../data/review_nsb_entropy.csv"

review_entropy <- read_csv(ENTROPY_PATH,
                           col_names = c("review_num",
                                         "entropy_bits")) %>%
  # `review_num` is the only column shared with tidy_review_df; name the key
  # explicitly instead of relying on a natural join.
  right_join(tidy_review_df, by = "review_num") %>%
  # Keep only the reviews that have an entropy value.
  filter(!is.na(entropy_bits))

# Distribution of per-review word entropy (default binning).
ggplot(review_entropy, aes(x = entropy_bits)) +
  geom_histogram() +
  xlab("Word Entropy (bits)")

# Entropy computed with each city treated as one corpus; rows with a missing
# entropy value are dropped. Column names are supplied explicitly.
ENTROPY_PATH_CORPUS <- "../data/review_nsb_entropy_subway_city_corpus.csv"

review_entropy_corpus <- filter(
  read_csv(ENTROPY_PATH_CORPUS,
           col_names = c("city", "entropy_bits_corpus")),
  !is.na(entropy_bits_corpus)
)

Entropy - geographic distribution by city

# Mean per-review entropy (bits) for each (city, lat, lon) combination.
entropy_by_city <- ungroup(
  summarize(
    group_by(review_entropy, city, lat, lon),
    mean_entropy_bits = mean(entropy_bits)
  )
)

# Map of entropy: one point per city, coloured by mean entropy in bits.
ggmap(iowa_map2, extent = "device") +
  geom_point(
    data = entropy_by_city,
    mapping = aes(x = lon, y = lat, color = mean_entropy_bits),
    size = 4,
    alpha = .5
  ) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()

Sentiment

Using the AFINN dataset.

# Sentiment scores read from CSV (header row supplies the column names).
SENTIMENT_PATH <- "../data/review_sentiment.csv"

review_sentiment <- read_csv(file = SENTIMENT_PATH)

# Distribution of AFINN sentiment scores (default binning).
ggplot(review_sentiment, aes(x = sentiment_score_afinn)) +
  geom_histogram() +
  xlab("Review Sentiment")

Sentiment - geographic distribution by city

# Mean AFINN sentiment score for each (city, lat, lon) combination.
sentiment_by_city <- review_sentiment %>%
  # right_join keeps every row of the Subway review table; reviews without a
  # sentiment row get NA, which mean(..., na.rm = TRUE) ignores below.
  # NOTE(review): the join key is inferred from shared column names
  # (presumably `review_num`) — confirm and pass `by =` explicitly.
  right_join(tidy_review_df %>% select(review_num, city, lat, lon)) %>%
  group_by(city, lat, lon) %>%
  summarize(
    mean_sentiment_score_afinn = mean(sentiment_score_afinn, na.rm = TRUE)
  ) %>%
  ungroup()

# Map of sentiment: one point per city, coloured by mean AFINN score.
ggmap(iowa_map2, extent = "device") +
  geom_point(
    data = sentiment_by_city,
    mapping = aes(x = lon, y = lat, color = mean_sentiment_score_afinn),
    size = 4,
    alpha = .5
  ) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()

Social Variables

Population demographics

Using census data (acm package).

# City-level census measures, with derived population variables and the
# coordinates from place_id_meta attached.
CENSUS_DATA_PATH <- "../data/census_data.csv"
census_data <- read_csv(CENSUS_DATA_PATH) %>%
  mutate(
    # NOTE(review): log() yields -Inf wherever pophisp is 0 — confirm whether
    # such cities exist and how they should be treated.
    log_prop_pop_hisp = log(pophisp / pop),
    prop_pop_white = popwhite / pop,
    log_pop = log(pop)
  ) %>%
  # The raw counts are no longer needed once the derived measures exist.
  select(-c(pophisp, pop, popwhite)) %>%
  rename(median_income = medianincome) %>%
  left_join(place_id_meta)

# One histogram per census measure, each panel with its own axis scales.
census_data %>%
  select(-c(lat, lon)) %>%
  gather(key = measure, value = value, -city) %>%
  ggplot(aes(x = value)) +
  geom_histogram() +
  facet_wrap(~measure, scales = "free")

Population demographics - geographic

# Map of median income: one point per city.
ggmap(iowa_map2, extent = "device") +
  geom_point(
    data = census_data,
    mapping = aes(x = lon, y = lat, color = median_income),
    size = 4,
    alpha = .5
  ) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()

# Map of log population: one point per city.
ggmap(iowa_map2, extent = "device") +
  geom_point(
    data = census_data,
    mapping = aes(x = lon, y = lat, color = log_pop),
    size = 4,
    alpha = .5
  ) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()

# Map of the log proportion of Hispanic population: one point per city.
ggmap(iowa_map2, extent = "device") +
  geom_point(
    data = census_data,
    mapping = aes(x = lon, y = lat, color = log_prop_pop_hisp),
    size = 4,
    alpha = .5
  ) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()

Exit distance

Exits identified using OpenStreetMap API (osmdata package; “motorway_junction”). I then calculated minimum distance from each city to an exit as the crow flies (“Haversine distance”). Driving time would be preferable here.

# Minimum distance from each city to an exit (meters), its natural log, and
# the coordinates from place_id_meta.
DISTANCE_TO_EXIT <- "../data/distance_to_exit.csv"
exit_distance <- left_join(
  mutate(
    read_csv(DISTANCE_TO_EXIT),
    log_min_distance_to_exit_meters = log(min_distance_to_exit_meters)
  ),
  place_id_meta
)

# Distribution of raw exit distance (default binning).
ggplot(exit_distance, aes(x = min_distance_to_exit_meters)) +
  geom_histogram() +
  xlab("Exit distance (meters)")

# Distribution of log exit distance (default binning).
ggplot(exit_distance, aes(x = log_min_distance_to_exit_meters)) +
  geom_histogram() +
  xlab("Log exit distance (meters)")

Exit distance - geographic distribution

# Map of exit distance: one point per city, coloured by log distance.
ggmap(iowa_map2, extent = "device") +
  geom_point(
    data = exit_distance,
    mapping = aes(x = lon, y = lat, color = log_min_distance_to_exit_meters),
    size = 4,
    alpha = .5
  ) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()

Pairwise correlations

At the city level.

# City-level tables to combine; each is trimmed to `city` plus the measures
# it contributes, so `city` is the intended join key throughout.
all_dfs <- list(select(n_reviews_by_city, city, log_n_reviews),
                select(census_data, -lat, -lon, -log_prop_pop_hisp),
                select(exit_distance, city, log_min_distance_to_exit_meters),
                select(sentiment_by_city, city, mean_sentiment_score_afinn),
                select(review_length_by_city, city, log_mean_review_length),
                select(entropy_by_city, city, mean_entropy_bits),
                select(review_entropy_corpus, city, entropy_bits_corpus))

# Successively left-join every table onto the review-count table. The key is
# passed explicitly so no join depends on whichever column names happen to be
# shared, and so the "Joining, by" messages are not emitted.
by_city_df <- reduce(all_dfs, left_join, by = "city")

# Correlation plot of all city-level measures; the first column (`city`) is
# an identifier, not a measure, so it is dropped.
by_city_df %>%
  select(-1) %>%
  make_corr_plot()

Models

# Linear model: mean per-review entropy predicted from the city-level measures.
summary(
  lm(
    mean_entropy_bits ~ log_mean_review_length + log_n_reviews + log_pop +
      prop_pop_white + median_income + log_min_distance_to_exit_meters,
    data = by_city_df
  )
)
## 
## Call:
## lm(formula = mean_entropy_bits ~ log_mean_review_length + log_n_reviews + 
##     log_pop + prop_pop_white + median_income + log_min_distance_to_exit_meters, 
##     data = by_city_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.21488 -0.56451 -0.01633  0.48505  2.40316 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -3.192e+00  2.510e+00  -1.272   0.2058    
## log_mean_review_length           6.646e-01  1.114e-01   5.968 2.31e-08 ***
## log_n_reviews                   -1.967e-02  2.072e-01  -0.095   0.9245    
## log_pop                          4.760e-02  7.157e-02   0.665   0.5072    
## prop_pop_white                   2.448e+00  2.340e+00   1.046   0.2975    
## median_income                   -1.157e-05  6.448e-06  -1.795   0.0751 .  
## log_min_distance_to_exit_meters  7.863e-02  6.439e-02   1.221   0.2243    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8768 on 125 degrees of freedom
##   (24 observations deleted due to missingness)
## Multiple R-squared:  0.2489, Adjusted R-squared:  0.2128 
## F-statistic: 6.904 on 6 and 125 DF,  p-value: 2.375e-06
# Linear model: city-corpus entropy predicted from the city-level measures.
summary(
  lm(
    entropy_bits_corpus ~ log_mean_review_length + log_n_reviews + log_pop +
      prop_pop_white + median_income + log_min_distance_to_exit_meters,
    data = by_city_df
  )
)
## 
## Call:
## lm(formula = entropy_bits_corpus ~ log_mean_review_length + log_n_reviews + 
##     log_pop + prop_pop_white + median_income + log_min_distance_to_exit_meters, 
##     data = by_city_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.36425 -0.11032  0.02644  0.07503  0.42846 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -1.108e+00  1.157e+00  -0.957    0.351    
## log_mean_review_length           1.191e+00  4.453e-02  26.741  < 2e-16 ***
## log_n_reviews                    1.491e+00  1.020e-01  14.616 8.67e-12 ***
## log_pop                         -1.848e-02  4.333e-02  -0.426    0.675    
## prop_pop_white                  -4.192e-01  9.811e-01  -0.427    0.674    
## median_income                   -3.850e-07  3.111e-06  -0.124    0.903    
## log_min_distance_to_exit_meters -2.016e-02  3.236e-02  -0.623    0.541    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1959 on 19 degrees of freedom
##   (130 observations deleted due to missingness)
## Multiple R-squared:  0.9915, Adjusted R-squared:  0.9888 
## F-statistic: 369.7 on 6 and 19 DF,  p-value: < 2.2e-16
# Linear model: mean AFINN sentiment predicted from the city-level measures.
summary(
  lm(
    mean_sentiment_score_afinn ~ log_mean_review_length + log_n_reviews +
      log_pop + prop_pop_white + median_income +
      log_min_distance_to_exit_meters,
    data = by_city_df
  )
)
## 
## Call:
## lm(formula = mean_sentiment_score_afinn ~ log_mean_review_length + 
##     log_n_reviews + log_pop + prop_pop_white + median_income + 
##     log_min_distance_to_exit_meters, data = by_city_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1936 -0.3533  0.0788  0.5725  2.2861 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -9.419e-02  2.060e+00  -0.046  0.96358    
## log_mean_review_length          -5.639e-01  9.576e-02  -5.889 2.54e-08 ***
## log_n_reviews                    4.552e-01  1.596e-01   2.852  0.00497 ** 
## log_pop                          2.111e-02  6.637e-02   0.318  0.75086    
## prop_pop_white                   2.769e+00  1.764e+00   1.570  0.11862    
## median_income                    5.334e-06  5.929e-06   0.900  0.36975    
## log_min_distance_to_exit_meters  5.440e-02  5.789e-02   0.940  0.34891    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8751 on 147 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.2676, Adjusted R-squared:  0.2377 
## F-statistic: 8.952 on 6 and 147 DF,  p-value: 2.48e-08