# Build a lookup of city coordinates from the place-id files.
PLACE_ID_PATH <- "../data/place_ids/"

# Full paths (not bare file names) so the files can be read from any
# working directory.
place_id_files <- list.files(PLACE_ID_PATH, full.names = TRUE)

# Read every file with read_feather(), row-bind the results, expose the
# `location` column as `city`, and keep one row per distinct
# (city, lat, lon) combination.
place_id_meta <- map_df(place_id_files, read_feather) %>%
  rename(city = location) %>%
  distinct(city, lat, lon)


# Load the tidied reviews, drop repeated review texts within an
# establishment, and attach city coordinates.
REVIEW_PATH <- "../data/tidy_reviews.csv"

tidy_review_df <- read_csv(REVIEW_PATH) %>%
  select(city, name, review_num, review_tidy) %>%
  # Keep the first occurrence of each review text per (city, name).
  distinct(city, name, review_tidy, .keep_all = TRUE) %>%
  # `city` is the only column shared with place_id_meta; naming the key
  # makes the join explicit and silences the "Joining, by" message.
  left_join(place_id_meta, by = "city") %>%
  filter(!is.na(city)) %>%
  # Review length is measured in characters of the tidied text.
  mutate(review_length = nchar(review_tidy))

Reviews

The dataset includes 24440 Google reviews from 890 cities in Iowa (city list taken from Wikipedia). The reviews were collected in the summer of 2017 using the Google Maps APIs (through the googleway R package). We identified all place_ids (businesses) within 5000 m (~3 mi) of each town, and then collected 200 pages of reviews (20/page) for each place_id.

Here’s a sample of the reviews:

# Show the tidied text of the first 20 reviews as a character vector.
tidy_review_df %>%
  pull(review_tidy) %>%
  head(20)
##  [1] "barry nice review it is a nice par 3 golf course love it"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          
##  [2] "have not played it yet but when i get off the road it will be all the time see ya in a few months bear"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
##  [3] "penny is very accommodating and has a beautiful selection of plants"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
##  [4] "they know what there doing for a decent price"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     
##  [5] "bad experience bought a truck and the transmission went out a week later they acted like i was trash and my business wasnt important to them i would recommend to buy a vehicle from a auto salvage lot before spending any money on anything on there lot it would be more reliable"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
##  [6] "great place"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
##  [7] "great local campground although i still think of it as the boy scout camp"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         
##  [8] "beautiful setting great camping and fishing"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
##  [9] "great family camping"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
## [10] "a nice recreation park with lots of trails fishing and primitive modern campsites"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
## [11] "the absolute worst pizza ranch in existence we came in at 630pm on a friday night and the buffet was nearly completely empty the little food that remained consisted of really old gummy instant mashed potatoes 2 pieces of cactus bread and 3 pieces of possibly weekold chicken the salad bar was equally repulsive and i was sincerely concerned that the food might not even be safe the bathroom was tiny and didnt have a changing table the highchair they gave is was broken and very filthy the restaurant was filled with hungry angry people but the staff appeared to be completely incompetent in the hour that we stayed they only brought out about 7 pizzas and some chicken this was not nearly enough to feed everyone we finally just gave up waiting for more food to appear and left ill never go back again"
## [12] "no ice cream here there is at other pizza ranch locations great pizza and chicken though friendly staff menu and napkin container werent moved so the inside of the table was left dirty after last time it was wiped all available highchairs needed wiped off and there were no working straps to secure my daughter not a deal breakers though will definitely be back at some point dont live in ackley"                                                                                                                                                                                                                                                                                                                                                                                                                       
## [13] "that kid with the mole who makes pizza needs to shave his beard real ugly and tasted a few hairs in my pizza but was still pretty good"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [14] "if you want a lot of pizza this is the place if you want gourmet or high end pizza this isnt it hearty meals fried chicekn salad bar all at a good price not for the health minded but great comfort food"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         
## [15] "hardly no food 7 tables full of people and barely seen the waiter"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
## [16] "love the dr s here"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
## [17] "i work here its amazing ppl right here"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [18] "great people great product great prices"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
## [19] "great meat sticks"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
## [20] "owner is very friendly and knowledgeable"

Number of reviews

By establishment name

(e.g., “Subway”)

# Number of reviews per establishment name, most-reviewed first.
total_reviews <- tidy_review_df %>%
  count(name) %>%
  arrange(desc(n))

# Distribution of per-name review counts (bins of width 5).
ggplot(total_reviews, aes(x = n)) +
  geom_histogram(binwidth = 5)

# Table of the ten most-reviewed establishment names.
total_reviews %>%
  head(10) %>%
  kable()
name n
SUBWAY®Restaurants 681
Casey’s General Store 454
Dollar General 392
Walmart Supercenter 175
Hy-Vee 169
Walgreens 149
McDonald’s 135
US Post Office 120
Pizza Ranch 118
Pizza Hut 110

By city

# Review count per city (coordinates carried along), plus its natural log.
city_counts <- count(tidy_review_df, city, lat, lon)
n_reviews_by_city <- city_counts %>%
  mutate(log_n_reviews = log(n)) %>%
  ungroup()

# Distribution of per-city review counts (bins of width 10).
n_reviews_by_city %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 10)

# Table of the ten cities with the most reviews.
n_reviews_by_city %>%
  arrange(desc(n)) %>%
  head(10) %>%
  kable()
city lat lon n log_n_reviews
Ames 42.03078 -93.63191 237 5.468060
Marion 42.03328 -91.59690 230 5.438079
Urbandale 41.62666 -93.71217 229 5.433722
Coralville 41.68228 -91.59606 223 5.407172
Ankeny 41.73179 -93.60013 213 5.361292
Bettendorf 41.56085 -90.48344 213 5.361292
Dubuque 42.50056 -90.66457 211 5.351858
Carter Lake 41.29055 -95.91807 200 5.298317
Altoona 41.64355 -93.47508 192 5.257495
Clive 41.60609 -93.77236 181 5.198497

By city - geographical distribution

# Bounding box for Iowa, ordered (west lon, south lat, east lon, north lat).
# Within this section it is only referenced by the commented-out Stamen
# map code below.
iowa_bounding_box <- c(-96.6397171020508,
                       40.3755989074707, # southwest coordinates
                       -90.1400604248047,
                       43.5011367797852) # northeast coordinates

# Alternative Stamen basemap, kept for reference:
#iowa_map <- get_stamenmap(iowa_bounding_box,
#                     zoom = 5,
#                     maptype = "toner-lite")

#ggmap(iowa_map) +
#  geom_point(aes(x = lon, y = lat, size = log(n)),
#             data = n_reviews_by_city, alpha = .1, color = "red") +
#  theme_bw()

# Map centre passed to get_googlemap(). Despite the name, the values are
# stored as c(longitude, latitude): the first element is the (negative)
# longitude, the second the latitude.
DESMOINES_LAT_LON <- c(-93.6091064, 41.6005448)

# Google roadmap basemap reused by the city-level maps in this report.
iowa_map2 <- get_googlemap(center = DESMOINES_LAT_LON,
                           zoom = 7,
                           maptype = "roadmap")

# One point per city, coloured by the log of its review count.
ggmap(iowa_map2, extent = "device") +
  geom_point(aes(x = lon, y = lat, color = log(n)),
             data = n_reviews_by_city, alpha = .5, size = 4) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()

Review language measures

Length

# Distribution of review length in characters (bins of width 10).
ggplot(tidy_review_df, aes(x = review_length)) +
  geom_histogram(binwidth = 10) +
  xlab("Review length (characters)")

Length - geographic distribution by city

# Per-city log of the mean review length (mean taken first, then logged).
reviews_grouped_by_city <- group_by(tidy_review_df, city, lat, lon)
review_length_by_city <- reviews_grouped_by_city %>%
  summarize(log_mean_review_length = log(mean(review_length))) %>%
  ungroup()

# One point per city, coloured by log mean review length.
ggmap(iowa_map2, extent = "device") +
  geom_point(
    data = review_length_by_city,
    mapping = aes(x = lon, y = lat, color = log_mean_review_length),
    size = 4,
    alpha = .5
  ) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()

Entropy

Using the NSB entropy measure (in bits). Note that only about a third of the reviews have a calculated entropy value (why?).

  • Should zeros for words not in reviews be included?
  • Should we treat each city as a corpus?
# Per-review NSB entropy estimates, joined onto the review table.
ENTROPY_PATH <- "../data/review_nsb_entropy.csv"

# NOTE(review): supplying `col_names` makes read_csv() treat the first
# line of the file as data, so this assumes the CSV has no header row.
# Confirm against the file.
review_entropy <- read_csv(ENTROPY_PATH,
                           col_names = c("review_num",
                                         "entropy_bits")) %>%
  # `review_num` is the only column the two tables share; naming it keeps
  # the join key explicit instead of relying on the natural-join default.
  right_join(tidy_review_df, by = "review_num") %>%
  # Drop reviews that have no entropy estimate.
  filter(!is.na(entropy_bits))

# Distribution of per-review word entropy.
review_entropy %>%
  ggplot(aes(x = entropy_bits)) +
  xlab("Word Entropy (bits)") +
  geom_histogram()

Entropy - geographic distribution by city

# Mean per-review entropy for each city (coordinates carried along).
entropy_grouped_by_city <- group_by(review_entropy, city, lat, lon)
entropy_by_city <- entropy_grouped_by_city %>%
  summarize(mean_entropy_bits = mean(entropy_bits)) %>%
  ungroup()

# One point per city, coloured by mean entropy.
ggmap(iowa_map2, extent = "device") +
  geom_point(
    data = entropy_by_city,
    mapping = aes(x = lon, y = lat, color = mean_entropy_bits),
    size = 4,
    alpha = .5
  ) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()

Sentiment

Using the AFINN lexicon.

# Per-review sentiment scores, joined onto the review table. The right
# join keeps every row of tidy_review_df, so reviews without a score
# remain with missing sentiment columns.
SENTIMENT_PATH <- "../data/review_sentiment.csv"

sentiment_scores <- read_csv(SENTIMENT_PATH)
# Join keys are whatever columns the two tables have in common.
review_sentiment <- right_join(sentiment_scores, tidy_review_df)

# Distribution of per-review AFINN sentiment.
ggplot(review_sentiment, aes(x = sentiment_score_afinn)) +
  geom_histogram() +
  xlab("Review Sentiment")

Sentiment - geographic distribution by city

# Mean AFINN sentiment per city, ignoring reviews without a score.
#
# review_sentiment was built with a right join onto tidy_review_df, so it
# already holds one row per review together with city/lat/lon. Joining it
# back onto tidy_review_df again adds no information, and would duplicate
# rows (re-weighting the city means) whenever the join keys are not
# unique, so the aggregation works on review_sentiment directly.
sentiment_by_city <- review_sentiment %>%
  group_by(city, lat, lon) %>%
  summarize(
    mean_sentiment_score_afinn = mean(sentiment_score_afinn, na.rm = TRUE)
  ) %>%
  ungroup()

# One point per city, coloured by mean sentiment.
ggmap(iowa_map2, extent = "device") +
  geom_point(aes(x = lon, y = lat, color = mean_sentiment_score_afinn),
             data = sentiment_by_city, alpha = .5, size = 4) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()

Social Variables

Population demographics

Using census data (acm package).

# City-level census measures, with derived proportions/logs and coordinates.
CENSUS_DATA_PATH <- "../data/census_data.csv"

census_data <- read_csv(CENSUS_DATA_PATH) %>%
  rename(median_income = medianincome) %>%
  mutate(
    # NOTE(review): log() of a zero proportion is -Inf; presumably some
    # cities have pophisp == 0 -- confirm how those rows should be handled.
    log_prop_pop_hisp = log(pophisp / pop),
    prop_pop_white = popwhite / pop,
    log_pop = log(pop)
  ) %>%
  # The raw counts are no longer needed once the derived measures exist.
  select(-pophisp, -pop, -popwhite) %>%
  # Join keys are whatever columns the two tables have in common.
  left_join(place_id_meta)

# Long format: one row per (city, measure), coordinates excluded.
census_long <- census_data %>%
  select(-lat, -lon) %>%
  gather(measure, value, -city)

# One histogram panel per measure, each with its own axis scales.
ggplot(census_long, aes(x = value)) +
  facet_wrap(~measure, scales = "free") +
  geom_histogram()

Population demographics - geographic

# Shared basemap layer for the three census maps below.
census_base_map <- ggmap(iowa_map2, extent = "device")

# Median income by city.
census_base_map +
  geom_point(
    data = census_data,
    mapping = aes(x = lon, y = lat, color = median_income),
    size = 4,
    alpha = .5
  ) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()

# Log population by city.
census_base_map +
  geom_point(
    data = census_data,
    mapping = aes(x = lon, y = lat, color = log_pop),
    size = 4,
    alpha = .5
  ) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()

# Log proportion of Hispanic population by city.
census_base_map +
  geom_point(
    data = census_data,
    mapping = aes(x = lon, y = lat, color = log_prop_pop_hisp),
    size = 4,
    alpha = .5
  ) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()

Exit distance

Exits were identified using the OpenStreetMap API (osmdata package; “motorway_junction”). I then calculated the minimum distance from each city to an exit as the crow flies (“Haversine distance”). Driving time would be preferable here.

# Minimum distance from each city to an exit, with its log and coordinates.
DISTANCE_TO_EXIT <- "../data/distance_to_exit.csv"

raw_exit_distance <- read_csv(DISTANCE_TO_EXIT)
exit_distance <- raw_exit_distance %>%
  mutate(
    log_min_distance_to_exit_meters = log(min_distance_to_exit_meters)
  ) %>%
  # Join keys are whatever columns the two tables have in common.
  left_join(place_id_meta)

# Distribution of the raw distances.
ggplot(exit_distance, aes(x = min_distance_to_exit_meters)) +
  geom_histogram() +
  xlab("Exit distance (meters)")

# Distribution of the log distances.
ggplot(exit_distance, aes(x = log_min_distance_to_exit_meters)) +
  geom_histogram() +
  xlab("Log exit distance (meters)")

Exit distance - geographic distribution

# One point per city, coloured by log minimum distance to an exit.
ggmap(iowa_map2, extent = "device") +
  geom_point(
    data = exit_distance,
    mapping = aes(x = lon, y = lat,
                  color = log_min_distance_to_exit_meters),
    size = 4,
    alpha = .5
  ) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()

Pairwise correlations

At the city level.

# City-level tables to combine; each one contributes `city` plus its
# measure column(s). The first table determines which cities are kept.
all_dfs <- list(
  n_reviews_by_city %>% select(city, log_n_reviews),
  census_data %>% select(-lat, -lon, -log_prop_pop_hisp),
  exit_distance %>% select(city, log_min_distance_to_exit_meters),
  sentiment_by_city %>% select(city, mean_sentiment_score_afinn),
  review_length_by_city %>% select(city, log_mean_review_length),
  entropy_by_city %>% select(city, mean_entropy_bits)
)

# Successively left-join the tables on their shared columns.
by_city_df <- reduce(all_dfs, left_join)

# Correlation plot over every column except the first one (`city`).
make_corr_plot(by_city_df[, -1])

Models

# City-level linear model of mean review entropy on review length, review
# count, demographics, and distance to the nearest exit.
entropy_fit <- lm(
  mean_entropy_bits ~ log_mean_review_length + log_n_reviews + log_pop +
    prop_pop_white + median_income + log_min_distance_to_exit_meters,
  data = by_city_df
)
summary(entropy_fit)
## 
## Call:
## lm(formula = mean_entropy_bits ~ log_mean_review_length + log_n_reviews + 
##     log_pop + prop_pop_white + median_income + log_min_distance_to_exit_meters, 
##     data = by_city_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9769 -0.3714 -0.0049  0.3258  3.5512 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      9.448e-02  6.543e-01   0.144 0.885217    
## log_mean_review_length           8.033e-01  4.140e-02  19.404  < 2e-16 ***
## log_n_reviews                   -1.215e-01  3.200e-02  -3.795 0.000159 ***
## log_pop                          8.060e-02  2.331e-02   3.458 0.000573 ***
## prop_pop_white                  -1.275e+00  5.692e-01  -2.240 0.025348 *  
## median_income                    3.018e-06  1.936e-06   1.559 0.119309    
## log_min_distance_to_exit_meters -1.587e-02  2.192e-02  -0.724 0.469279    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6523 on 777 degrees of freedom
##   (106 observations deleted due to missingness)
## Multiple R-squared:  0.4306, Adjusted R-squared:  0.4262 
## F-statistic: 97.93 on 6 and 777 DF,  p-value: < 2.2e-16
# City-level linear model of mean AFINN sentiment on the same predictors
# as the entropy model.
sentiment_fit <- lm(
  mean_sentiment_score_afinn ~ log_mean_review_length + log_n_reviews +
    log_pop + prop_pop_white + median_income +
    log_min_distance_to_exit_meters,
  data = by_city_df
)
summary(sentiment_fit)
## 
## Call:
## lm(formula = mean_sentiment_score_afinn ~ log_mean_review_length + 
##     log_n_reviews + log_pop + prop_pop_white + median_income + 
##     log_min_distance_to_exit_meters, data = by_city_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3070 -0.2138  0.0660  0.3570  1.8547 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      3.243e+00  6.728e-01   4.820 1.71e-06 ***
## log_mean_review_length          -4.322e-01  3.883e-02 -11.131  < 2e-16 ***
## log_n_reviews                    1.365e-01  2.903e-02   4.702 3.02e-06 ***
## log_pop                         -8.269e-02  2.347e-02  -3.523  0.00045 ***
## prop_pop_white                   6.202e-01  5.856e-01   1.059  0.28988    
## median_income                    1.666e-06  1.974e-06   0.844  0.39899    
## log_min_distance_to_exit_meters  3.588e-02  2.248e-02   1.596  0.11078    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6813 on 833 degrees of freedom
##   (50 observations deleted due to missingness)
## Multiple R-squared:  0.165,  Adjusted R-squared:  0.1589 
## F-statistic: 27.43 on 6 and 833 DF,  p-value: < 2.2e-16