# Load the place-id metadata: read every feather file found under
# PLACE_ID_PATH, stack the results into one data frame, and keep one row per
# distinct (city, lat, lon) combination.
PLACE_ID_PATH <- "../data/place_ids/"
place_id_files <- list.files(PLACE_ID_PATH, full.names = TRUE)
place_id_meta <- map_df(place_id_files, read_feather) %>%
  rename(city = location) %>%  # the rest of the script keys on `city`
  distinct(city, lat, lon)
# Load the tidied reviews, drop repeated review text within each
# (city, name) pair, attach the city coordinates from place_id_meta, and
# record the length of each review.
REVIEW_PATH <- "../data/tidy_reviews.csv"
tidy_review_df <- read_csv(REVIEW_PATH) %>%
  select(city, name, review_num, review_tidy) %>%
  group_by(city, name) %>%
  distinct(review_tidy, .keep_all = TRUE) %>%
  ungroup() %>%
  # `city` is the only column shared with place_id_meta, so name the key
  # explicitly rather than relying on the natural-join default.
  # NOTE(review): place_id_meta is distinct on (city, lat, lon), not on city
  # alone; a city with several coordinates would duplicate its reviews here.
  left_join(place_id_meta, by = "city") %>%
  filter(!is.na(city)) %>%
  mutate(review_length = nchar(review_tidy))  # length in characters
The dataset includes 24,440 Google reviews from 890 cities in Iowa (city list taken from Wikipedia). The reviews were collected in the summer of 2017 using the Google Maps APIs (through the googleway R package). We identified all place_ids (businesses) within 5000 m (~3 mi) of each town, and then collected 200 pages of reviews (20/page) for each place_id.
Here’s a sample of the reviews:
# Print the text of the first 20 reviews.
tidy_review_df %>%
  pull(review_tidy) %>%
  head(20)
## [1] "barry nice review it is a nice par 3 golf course love it"
## [2] "have not played it yet but when i get off the road it will be all the time see ya in a few months bear"
## [3] "penny is very accommodating and has a beautiful selection of plants"
## [4] "they know what there doing for a decent price"
## [5] "bad experience bought a truck and the transmission went out a week later they acted like i was trash and my business wasnt important to them i would recommend to buy a vehicle from a auto salvage lot before spending any money on anything on there lot it would be more reliable"
## [6] "great place"
## [7] "great local campground although i still think of it as the boy scout camp"
## [8] "beautiful setting great camping and fishing"
## [9] "great family camping"
## [10] "a nice recreation park with lots of trails fishing and primitive modern campsites"
## [11] "the absolute worst pizza ranch in existence we came in at 630pm on a friday night and the buffet was nearly completely empty the little food that remained consisted of really old gummy instant mashed potatoes 2 pieces of cactus bread and 3 pieces of possibly weekold chicken the salad bar was equally repulsive and i was sincerely concerned that the food might not even be safe the bathroom was tiny and didnt have a changing table the highchair they gave is was broken and very filthy the restaurant was filled with hungry angry people but the staff appeared to be completely incompetent in the hour that we stayed they only brought out about 7 pizzas and some chicken this was not nearly enough to feed everyone we finally just gave up waiting for more food to appear and left ill never go back again"
## [12] "no ice cream here there is at other pizza ranch locations great pizza and chicken though friendly staff menu and napkin container werent moved so the inside of the table was left dirty after last time it was wiped all available highchairs needed wiped off and there were no working straps to secure my daughter not a deal breakers though will definitely be back at some point dont live in ackley"
## [13] "that kid with the mole who makes pizza needs to shave his beard real ugly and tasted a few hairs in my pizza but was still pretty good"
## [14] "if you want a lot of pizza this is the place if you want gourmet or high end pizza this isnt it hearty meals fried chicekn salad bar all at a good price not for the health minded but great comfort food"
## [15] "hardly no food 7 tables full of people and barely seen the waiter"
## [16] "love the dr s here"
## [17] "i work here its amazing ppl right here"
## [18] "great people great product great prices"
## [19] "great meat sticks"
## [20] "owner is very friendly and knowledgeable"
Number of reviews per business name (e.g., “Subway”):
# Number of reviews per business name, most-reviewed first.
total_reviews <- tidy_review_df %>%
  count(name) %>%
  arrange(desc(n))

# Distribution of the per-name review counts (bins are 5 reviews wide).
ggplot(total_reviews, aes(x = n)) +
  geom_histogram(binwidth = 5)

# Table of the ten most-reviewed names.
kable(head(total_reviews, 10))
| name | n |
|---|---|
| SUBWAY®Restaurants | 681 |
| Casey’s General Store | 454 |
| Dollar General | 392 |
| Walmart Supercenter | 175 |
| Hy-Vee | 169 |
| Walgreens | 149 |
| McDonald’s | 135 |
| US Post Office | 120 |
| Pizza Ranch | 118 |
| Pizza Hut | 110 |
# Review count per city (carrying the city's coordinates along) together
# with its natural log.
n_reviews_by_city <- count(tidy_review_df, city, lat, lon) %>%
  mutate(log_n_reviews = log(n)) %>%
  ungroup()

# Distribution of the per-city review counts (bins are 10 reviews wide).
n_reviews_by_city %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 10)

# Table of the ten cities with the most reviews.
n_reviews_by_city %>%
  arrange(desc(n)) %>%
  head(10) %>%
  kable()
| city | lat | lon | n | log_n_reviews |
|---|---|---|---|---|
| Ames | 42.03078 | -93.63191 | 237 | 5.468060 |
| Marion | 42.03328 | -91.59690 | 230 | 5.438079 |
| Urbandale | 41.62666 | -93.71217 | 229 | 5.433722 |
| Coralville | 41.68228 | -91.59606 | 223 | 5.407172 |
| Ankeny | 41.73179 | -93.60013 | 213 | 5.361292 |
| Bettendorf | 41.56085 | -90.48344 | 213 | 5.361292 |
| Dubuque | 42.50056 | -90.66457 | 211 | 5.351858 |
| Carter Lake | 41.29055 | -95.91807 | 200 | 5.298317 |
| Altoona | 41.64355 | -93.47508 | 192 | 5.257495 |
| Clive | 41.60609 | -93.77236 | 181 | 5.198497 |
# Bounding box around Iowa, given as two (lon, lat) corners. It is only used
# by the commented-out Stamen map below.
iowa_bounding_box <- c(-96.6397171020508,
                       40.3755989074707,  # southwest coordinates
                       -90.1400604248047,
                       43.5011367797852)  # northeast coordinates
# Earlier Stamen-tile version of the map, kept for reference:
#iowa_map <- get_stamenmap(iowa_bounding_box,
# zoom = 5,
# maptype = "toner-lite")
#ggmap(iowa_map) +
# geom_point(aes(x = lon, y = lat, size = log(n)),
# data = n_reviews_by_city, alpha = .1, color = "red") +
# theme_bw()

# NOTE(review): despite the name, the values are stored in (lon, lat) order,
# which is the order the `center` argument of get_googlemap() takes.
DESMOINES_LAT_LON <- c(-93.6091064, 41.6005448)

# Google base map centred on Des Moines; reused by every map in the report.
iowa_map2 <- get_googlemap(center = DESMOINES_LAT_LON,
                           zoom = 7,
                           maptype = "roadmap")

# One point per city, coloured by the log of its review count.
ggmap(iowa_map2, extent = "device") +
  geom_point(aes(x = lon, y = lat, color = log(n)),
             data = n_reviews_by_city, alpha = .5, size = 4) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()
# Distribution of review lengths (bins are 10 characters wide).
ggplot(tidy_review_df, aes(x = review_length)) +
  geom_histogram(binwidth = 10) +
  xlab("Review length (characters)")

# Log of the mean review length for each city.
review_length_by_city <- tidy_review_df %>%
  group_by(city, lat, lon) %>%
  summarise(log_mean_review_length = log(mean(review_length))) %>%
  ungroup()

# One point per city, coloured by log mean review length.
ggmap(iowa_map2, extent = "device") +
  geom_point(
    aes(x = lon, y = lat, color = log_mean_review_length),
    data = review_length_by_city,
    alpha = .5,
    size = 4
  ) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()
Word entropy is estimated with the NSB (Nemenman–Shafee–Bialek) estimator, in bits. Note that an entropy estimate was calculated for only about a third of the reviews (why?).
# Attach the per-review entropy estimates to the reviews and keep only the
# reviews that have an estimate.
ENTROPY_PATH <- "../data/review_nsb_entropy.csv"
# NOTE(review): supplying `col_names` makes read_csv() treat the first line
# of the file as data, so this assumes the CSV has no header row -- confirm.
review_entropy <- read_csv(ENTROPY_PATH,
                           col_names = c("review_num",
                                         "entropy_bits")) %>%
  # `review_num` is the only column the two tables share; name the key
  # explicitly rather than relying on the natural-join default.
  right_join(tidy_review_df, by = "review_num") %>%
  filter(!is.na(entropy_bits))

# Distribution of per-review entropy.
review_entropy %>%
  ggplot(aes(x = entropy_bits)) +
  xlab("Word Entropy (bits)") +
  geom_histogram()

# Mean entropy of the reviews in each city.
entropy_by_city <- review_entropy %>%
  group_by(city, lat, lon) %>%
  summarize(mean_entropy_bits = mean(entropy_bits)) %>%
  ungroup()

# One point per city, coloured by mean entropy.
ggmap(iowa_map2, extent = "device") +
  geom_point(aes(x = lon, y = lat, color = mean_entropy_bits),
             data = entropy_by_city, alpha = .5, size = 4) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()
Using the AFINN sentiment lexicon.
# Attach the per-review sentiment scores to the reviews. The right join
# keeps every review, so reviews without a score get NA.
SENTIMENT_PATH <- "../data/review_sentiment.csv"
# NOTE(review): the join key is left implicit because the columns of the
# sentiment CSV are not visible here -- presumably `review_num`; confirm and
# pass `by =` explicitly.
review_sentiment <- read_csv(SENTIMENT_PATH) %>%
  right_join(tidy_review_df)

# Distribution of per-review sentiment scores.
review_sentiment %>%
  ggplot(aes(x = sentiment_score_afinn)) +
  xlab("Review Sentiment") +
  geom_histogram()

# Mean sentiment of the reviews in each city. review_sentiment already
# carries city, lat and lon from the join above, so it is summarised
# directly; joining it back onto tidy_review_df would add no columns and
# could only duplicate rows wherever a key is repeated.
sentiment_by_city <- review_sentiment %>%
  group_by(city, lat, lon) %>%
  summarize(
    mean_sentiment_score_afinn = mean(sentiment_score_afinn, na.rm = TRUE)
  ) %>%
  ungroup()

# One point per city, coloured by mean sentiment.
ggmap(iowa_map2, extent = "device") +
  geom_point(aes(x = lon, y = lat, color = mean_sentiment_score_afinn),
             data = sentiment_by_city, alpha = .5, size = 4) +
  viridis::scale_color_viridis(direction = -1) +
  theme_bw()
At the city level.
# City-level tables to combine; each is trimmed to `city` plus the
# variables wanted in the combined data frame.
all_dfs <- list(
  n_reviews_by_city %>% select(city, log_n_reviews),
  census_data %>% select(-lat, -lon, -log_prop_pop_hisp),
  exit_distance %>% select(city, log_min_distance_to_exit_meters),
  sentiment_by_city %>% select(city, mean_sentiment_score_afinn),
  review_length_by_city %>% select(city, log_mean_review_length),
  entropy_by_city %>% select(city, mean_entropy_bits)
)

# Left-join the tables one after another, starting from the review counts.
by_city_df <- reduce(all_dfs, left_join)

# Correlation plot over every column except the first (`city`).
make_corr_plot(by_city_df[, -1])

# Regress mean word entropy on review length, review count and the
# city-level social variables.
summary(
  lm(
    mean_entropy_bits ~ log_mean_review_length + log_n_reviews + log_pop +
      prop_pop_white + median_income + log_min_distance_to_exit_meters,
    data = by_city_df
  )
)
##
## Call:
## lm(formula = mean_entropy_bits ~ log_mean_review_length + log_n_reviews +
## log_pop + prop_pop_white + median_income + log_min_distance_to_exit_meters,
## data = by_city_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9769 -0.3714 -0.0049 0.3258 3.5512
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.448e-02 6.543e-01 0.144 0.885217
## log_mean_review_length 8.033e-01 4.140e-02 19.404 < 2e-16 ***
## log_n_reviews -1.215e-01 3.200e-02 -3.795 0.000159 ***
## log_pop 8.060e-02 2.331e-02 3.458 0.000573 ***
## prop_pop_white -1.275e+00 5.692e-01 -2.240 0.025348 *
## median_income 3.018e-06 1.936e-06 1.559 0.119309
## log_min_distance_to_exit_meters -1.587e-02 2.192e-02 -0.724 0.469279
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6523 on 777 degrees of freedom
## (106 observations deleted due to missingness)
## Multiple R-squared: 0.4306, Adjusted R-squared: 0.4262
## F-statistic: 97.93 on 6 and 777 DF, p-value: < 2.2e-16
# Regress mean sentiment on the same set of city-level predictors.
summary(
  lm(
    mean_sentiment_score_afinn ~ log_mean_review_length + log_n_reviews +
      log_pop + prop_pop_white + median_income +
      log_min_distance_to_exit_meters,
    data = by_city_df
  )
)
##
## Call:
## lm(formula = mean_sentiment_score_afinn ~ log_mean_review_length +
## log_n_reviews + log_pop + prop_pop_white + median_income +
## log_min_distance_to_exit_meters, data = by_city_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3070 -0.2138 0.0660 0.3570 1.8547
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.243e+00 6.728e-01 4.820 1.71e-06 ***
## log_mean_review_length -4.322e-01 3.883e-02 -11.131 < 2e-16 ***
## log_n_reviews 1.365e-01 2.903e-02 4.702 3.02e-06 ***
## log_pop -8.269e-02 2.347e-02 -3.523 0.00045 ***
## prop_pop_white 6.202e-01 5.856e-01 1.059 0.28988
## median_income 1.666e-06 1.974e-06 0.844 0.39899
## log_min_distance_to_exit_meters 3.588e-02 2.248e-02 1.596 0.11078
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6813 on 833 degrees of freedom
## (50 observations deleted due to missingness)
## Multiple R-squared: 0.165, Adjusted R-squared: 0.1589
## F-statistic: 27.43 on 6 and 833 DF, p-value: < 2.2e-16
Social Variables
Population demographics
Using census data (`acm` package).
Population demographics - geographic
Exit distance
Exits were identified using the OpenStreetMap API (`osmdata` package; “motorway_junction”). I then calculated the minimum distance from each city to an exit as the crow flies (“Haversine distance”). Driving time would be preferable here.
Exit distance - geographic distribution