Basic Descriptive Statistics
Summary Statistics & Frequency Distributions
summary(df_business$stars)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 3.500 3.592 4.500 5.000
ggplot(df_business, aes(x = stars)) +
geom_bar(fill = "goldenrod") +
theme_minimal() +
labs(title = "Distribution of Average Star Ratings per Business", x = "Stars", y = "Count of Average Rating")

summary(df_business$review_count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 8.00 15.00 41.63 36.00 2126.00
summary(df_review$stars)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 4.000 3.694 5.000 5.000
ggplot(df_review, aes(x = stars)) +
geom_bar(fill = "gold") +
theme_minimal() +
labs(title = "Distribution of All Star Ratings", x = "Stars", y = "Count of Reviews")

summary(df_review$useful)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 1.271 2.000 236.000
summary(df_review$funny)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3123 0.0000 72.0000
summary(df_review$cool)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.4148 0.0000 130.0000
summary(df_tip$compliment_count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.01575 0.00000 3.00000
# Price Range
freq_price <- df_business |>
group_by(attributes$RestaurantsPriceRange2) |>
summarize(count = n()) |>
slice(1:4)
ggplot(freq_price, aes(x = `attributes$RestaurantsPriceRange2`, y = count)) +
geom_col(fill = "forestgreen") +
labs(title = "Distribution of Price Range", x = "Price Range",
y = "Count") +
theme(plot.title = element_text(hjust = 0.5, size = 17),
axis.text.x =
element_text(angle = 0, vjust = 0.55, size = 10),
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 15))

Bivariate Analysis - Star Reviews & Review Count
ggplot(df_business, aes(x = stars, y = review_count)) +
geom_jitter(alpha = 0.3, color = "red") +
labs(title = "Star Ratings vs. Review Count", x = "Star Rating", y = "Review Count")

Subpopulation Analysis - Business Categories
# Group businesses by category groups
df_business <- df_business %>%
mutate(category_group = case_when(
grepl("Grocery", categories, ignore.case = TRUE) ~ "Grocery",
grepl("Restaurants|Food|Cafe|Bakery|Bar", categories, ignore.case = TRUE) ~ "Food & Drink",
grepl("Health|Dentist|Doctor|Chiropractor|Medical", categories, ignore.case = TRUE) ~ "Health & Medical",
grepl("Beauty|Salon|Spa|Hair|Nail", categories, ignore.case = TRUE) ~ "Beauty",
grepl("Auto|Car|Vehicle|Gas|Automotive|Oil Change", categories, ignore.case = TRUE) ~ "Automotive",
grepl("Hotel|Motel|Resort|Lodging", categories, ignore.case = TRUE) ~ "Hospitality",
grepl("Pet|Animal", categories, ignore.case = TRUE) ~ "Pets",
grepl("Education|School|Tutor|University", categories, ignore.case = TRUE) ~ "Education",
grepl("Retail|Shop|Store|Shopping|Clothing", categories, ignore.case = TRUE) ~ "Retail",
grepl("Real Estate|Property|Mortgage", categories, ignore.case = TRUE) ~ "Real Estate",
grepl("Services", categories, ignore.case = TRUE) ~ "Services",
grepl("Active|Fitness", categories, ignore.case = TRUE) ~ "Active Life",
TRUE ~ "Other"
))
df_business %>%
count(category_group, sort = TRUE)
## category_group n
## 1 Food & Drink 3411
## 2 Retail 1212
## 3 Services 1157
## 4 Automotive 1002
## 5 Health & Medical 988
## 6 Beauty 697
## 7 Real Estate 293
## 8 Pets 278
## 9 Active Life 242
## 10 Hospitality 218
## 11 Grocery 198
## 12 Education 112
## 13 Other 104
# Statistics by Category Group
category_summary <- df_business %>%
group_by(category_group) %>%
summarise(
n_businesses = n(),
avg_stars = mean(stars, na.rm = TRUE),
median_stars = median(stars, na.rm = TRUE),
sd_stars = sd(stars, na.rm = TRUE),
avg_reviews = mean(review_count, na.rm = TRUE),
median_reviews = median(review_count, na.rm = TRUE),
sd_reviews = sd(review_count, na.rm = TRUE)
) %>%
arrange(desc(n_businesses))
category_summary
## # A tibble: 13 × 8
## category_group n_businesses avg_stars median_stars sd_stars avg_reviews
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Food & Drink 3411 3.52 3.5 0.837 80.7
## 2 Retail 1212 3.67 4 0.941 18.3
## 3 Services 1157 3.63 4 1.09 15.2
## 4 Automotive 1002 3.71 4 1.02 21.8
## 5 Health & Medical 988 3.60 4 1.09 17.4
## 6 Beauty 697 3.80 4 0.878 32.3
## 7 Real Estate 293 2.63 2.5 1.12 12.9
## 8 Pets 278 3.95 4 0.731 22.5
## 9 Active Life 242 4.06 4 0.778 26.4
## 10 Hospitality 218 3.25 3 1.26 31.1
## 11 Grocery 198 3.38 3.5 0.846 41.3
## 12 Education 112 3.63 4 1.21 16.6
## 13 Other 104 3.73 4 1.05 26.1
## # ℹ 2 more variables: median_reviews <dbl>, sd_reviews <dbl>
Date Ranges of Reviews
df_review <- df_review %>%
mutate(date = as.Date(date))
range(df_review$date)
## [1] "2005-03-02" "2022-01-19"
Business Data Analysis
Business Distribution by Geography
leaflet(df_business) %>%
addTiles() %>%
addCircleMarkers(
lng = ~longitude,
lat = ~latitude,
radius = 3,
popup = ~paste0("<b>", name, "</b><br>Rating: ", stars),
color = ~ifelse(stars >= 4, "green", "red"),
fillOpacity = 0.6
)
Star Reviews & Review Count (repeated graphic)
ggplot(df_business, aes(x = stars, y = review_count)) +
geom_jitter(alpha = 0.3, color = "red") +
labs(title = "Star Ratings vs. Review Count", x = "Star Rating", y = "Review Count")

# Businesses with extremely high review counts appear to be outliers/unusual
outliers <- df_business %>%
filter((stars = 5 & review_count > 500) | (stars <= 2 & review_count > 1000))
dim(outliers)
## [1] 65 15
Rating Distributions Across Categories
ggplot(df_business, aes(x = category_group, y = stars)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Star Ratings Across Categories")

Average Rating by Geography
# Step 1: Bin and aggregate
heatmap_data <- df_business %>%
mutate(
lat_bin = round(latitude, 1),
lon_bin = round(longitude, 1)
) %>%
group_by(lat_bin, lon_bin) %>%
summarise(avg_rating = mean(stars, na.rm = TRUE), .groups = "drop")
# Step 2: Define color palette
pal <- colorNumeric(palette = "viridis", domain = heatmap_data$avg_rating)
# Step 3: Leaflet map with rectangles for each grid cell
leaflet(heatmap_data) %>%
addTiles() %>%
addRectangles(
lng1 = ~lon_bin - 0.05, lat1 = ~lat_bin - 0.05, # bottom-left corner
lng2 = ~lon_bin + 0.05, lat2 = ~lat_bin + 0.05, # top-right corner
fillColor = ~pal(avg_rating),
fillOpacity = 0.6,
color = "#444",
weight = 0.5,
popup = ~paste0("Avg Rating: ", round(avg_rating, 2),
"<br>Bin: (", lat_bin, ", ", lon_bin, ")")
) %>%
addLegend(pal = pal, values = ~avg_rating, title = "Avg Rating")
Outlier Detection and Analysis
Dataframe Enhancement
# Stats grouped by business name
business_name_summary <- df_business %>%
group_by(name) %>%
summarise(
num_cities = n_distinct(city),
num_states = n_distinct(state),
total_locations = n(), # number of rows with that name
total_stars = sum(stars, na.rm = TRUE),
avg_stars = mean(stars, na.rm = TRUE)
)
tips_summary <- df_tip %>%
inner_join(df_business %>% select(business_id, name), by = "business_id") %>%
group_by(name) %>%
summarise(
num_tips = n(),
total_compliments = sum(compliment_count, na.rm = TRUE)
)
review_summary <- df_review %>%
inner_join(df_business %>% select(business_id, name), by = "business_id") %>%
group_by(name) %>%
summarise(
total_reviews = n()
)
business_stats_by_name <- business_name_summary %>%
left_join(tips_summary, by = "name") %>%
left_join(review_summary, by = "name")
df_business <- df_business %>%
left_join(business_stats_by_name, by = "name")
# Number of attributes per business ID
df_business <- df_business %>%
mutate(
num_attributes = select(., starts_with("attributes")) %>%
apply(1, function(x) sum(!is.na(x)))
)
# Number of categories per business ID
df_business <- df_business %>%
mutate(
num_categories = ifelse(
is.na(categories),
0,
sapply(strsplit(categories, ","), function(x) length(trimws(x)))
)
)
Statistics
summary(select(df_business, 16:25))
## num_cities num_states total_locations total_stars avg_stars
## Min. :1.000 Min. :1 Min. : 1.000 Min. : 1.00 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1 1st Qu.: 1.000 1st Qu.: 3.50 1st Qu.:3.000
## Median :1.000 Median :1 Median : 1.000 Median : 4.50 Median :3.500
## Mean :1.261 Mean :1 Mean : 3.606 Mean : 11.24 Mean :3.592
## 3rd Qu.:1.000 3rd Qu.:1 3rd Qu.: 2.000 3rd Qu.: 6.00 3rd Qu.:4.500
## Max. :7.000 Max. :1 Max. :63.000 Max. :204.50 Max. :5.000
##
## num_tips total_compliments total_reviews num_attributes
## Min. : 1.00 Min. : 0.0000 Min. : 5.0 Min. : 0.000
## 1st Qu.: 2.00 1st Qu.: 0.0000 1st Qu.: 10.0 1st Qu.: 2.000
## Median : 5.00 Median : 0.0000 Median : 24.0 Median : 4.000
## Mean : 28.91 Mean : 0.3708 Mean : 114.7 Mean : 6.933
## 3rd Qu.: 25.00 3rd Qu.: 0.0000 3rd Qu.: 98.0 3rd Qu.:10.000
## Max. :658.00 Max. :10.0000 Max. :2627.0 Max. :33.000
## NA's :2693 NA's :2693
## num_categories
## Min. : 0.000
## 1st Qu.: 3.000
## Median : 4.000
## Mean : 4.355
## 3rd Qu.: 6.000
## Max. :25.000
##
Variable Analysis - Review Ratios
df_review <- df_review %>%
mutate(
total_votes = useful + funny + cool,
useful_ratio = ifelse(total_votes > 0, useful / total_votes, NA),
funny_ratio = ifelse(total_votes > 0, funny / total_votes, NA),
cool_ratio = ifelse(total_votes > 0, cool / total_votes, NA)
)
df_review %>%
summarise(
avg_total_votes = mean(total_votes, na.rm = TRUE),
avg_useful_ratio = mean(useful_ratio, na.rm = TRUE),
avg_funny_ratio = mean(funny_ratio, na.rm = TRUE),
avg_cool_ratio = mean(cool_ratio, na.rm = TRUE)
)
## avg_total_votes avg_useful_ratio avg_funny_ratio avg_cool_ratio
## 1 1.998339 0.6906884 0.1259866 0.183325
ggplot(df_review, aes(x = useful_ratio)) +
geom_histogram(bins = 30, fill = "steelblue") +
labs(title = "Distribution of Useful Vote Ratios", x = "Useful Vote Ratio", y = "Count")
## Warning: Removed 192047 rows containing non-finite outside the scale range
## (`stat_bin()`).

Variable Analysis - Extreme Lat/Long Coordinates
# Geographic coordinate statistics
coord_stats <- df_business %>%
summarise(
lat_mean = round(mean(latitude, na.rm = TRUE), 4),
lat_median = round(median(latitude, na.rm = TRUE), 4),
lat_min = min(latitude, na.rm = TRUE),
lat_max = max(latitude, na.rm = TRUE),
lat_range = round(max(latitude, na.rm = TRUE) - min(latitude, na.rm = TRUE), 4),
lon_mean = round(mean(longitude, na.rm = TRUE), 4),
lon_median = round(median(longitude, na.rm = TRUE), 4),
lon_min = min(longitude, na.rm = TRUE),
lon_max = max(longitude, na.rm = TRUE),
lon_range = round(max(longitude, na.rm = TRUE) - min(longitude, na.rm = TRUE), 4)
)
cat("Coordinate Statistics:\n")
## Coordinate Statistics:
print(coord_stats)
## lat_mean lat_median lat_min lat_max lat_range lon_mean lon_median
## 1 32.2485 32.2412 31.90027 32.54964 0.6494 -110.9341 -110.9444
## lon_min lon_max lon_range
## 1 -111.2849 -110.6336 0.6513
# Identify extreme coordinates
extreme_coords <- df_business %>%
mutate(
lat_extreme = latitude < quantile(latitude, 0.01, na.rm = TRUE) |
latitude > quantile(latitude, 0.99, na.rm = TRUE),
lon_extreme = longitude < quantile(longitude, 0.01, na.rm = TRUE) |
longitude > quantile(longitude, 0.99, na.rm = TRUE),
coord_extreme = lat_extreme | lon_extreme
)
extreme_coord_summary <- extreme_coords %>%
summarise(
total_businesses = n(),
extreme_lat = sum(lat_extreme, na.rm = TRUE),
extreme_lon = sum(lon_extreme, na.rm = TRUE),
extreme_either = sum(coord_extreme, na.rm = TRUE),
pct_extreme = round(sum(coord_extreme, na.rm = TRUE) / n() * 100, 2)
)
cat("Extreme Coordinates Summary (1st and 99th percentiles):\n")
## Extreme Coordinates Summary (1st and 99th percentiles):
print(extreme_coord_summary)
## total_businesses extreme_lat extreme_lon extreme_either pct_extreme
## 1 9912 200 199 399 4.03
# extreme coordinates
extreme_coordinators <- extreme_coords %>%
filter(coord_extreme == TRUE) %>%
select(name, city, state, latitude, longitude, stars, review_count) %>%
arrange(desc(abs(latitude - coord_stats$lat_mean) + abs(longitude - coord_stats$lon_mean))) %>%
head(10)
cat("\nBusinesses with Extreme Coordinates:\n")
##
## Businesses with Extreme Coordinates:
print(extreme_coordinators)
## name city state latitude longitude stars
## 1 Southern Arizona Plumbing Tucson AZ 32.01492 -110.6646 4.5
## 2 Portraits By Denise and Jim Tucson AZ 32.01492 -110.6646 5.0
## 3 Molecular Munchies Tucson AZ 32.01492 -110.6646 5.0
## 4 The Cookie Lady Co Tucson AZ 32.01492 -110.6646 1.0
## 5 Fantastic Five Entertainment Tucson AZ 32.01492 -110.6646 4.5
## 6 Substance Diner Tucson AZ 32.01492 -110.6646 4.5
## 7 J B Painting Tucson AZ 32.01492 -110.6646 4.0
## 8 Rockin' Z Doggie Ranch Vail AZ 31.97326 -110.7156 3.5
## 9 Vail Flowers Vail AZ 31.97606 -110.7174 5.0
## 10 Colossal Cave Mountain Park Vail AZ 32.06207 -110.6336 4.5
## review_count
## 1 16
## 2 5
## 3 21
## 4 9
## 5 5
## 6 17
## 7 7
## 8 5
## 9 21
## 10 258
# Visualization: Geographic Distribution
map_df <- extreme_coords %>%
filter(!is.na(latitude) & !is.na(longitude))
# Interactive leaflet map
leaflet(data = map_df) %>%
addProviderTiles("CartoDB.Positron") %>%
addCircleMarkers(
lng = ~longitude,
lat = ~latitude,
color = ~ifelse(coord_extreme, "red", "blue"),
radius = 3,
stroke = FALSE,
fillOpacity = 0.6,
popup = ~paste0("<b>", name, "</b><br>City: ", city, "<br>Stars: ", stars, "<br>Reviews: ", review_count)
) %>%
addLegend(position = "bottomright",
colors = c("red", "blue"),
labels = c("Extreme Coordinates", "Normal Coordinates"),
title = "Legend")
Multivariate Outlier Detection
# Define the new dataset and features
X <- df_business[, c("review_count", "stars", "total_stars")]
# Remove rows with any NA values for outlier detection
X_clean <- na.omit(X)
clean_indices <- complete.cases(df_business[, c("review_count", "stars", "total_stars")])
# Robust function
robust_cov <- function(x, contamination = 0.1) {
# Use classical estimates as starting point
center <- apply(x, 2, median, na.rm = TRUE) # Use median as robust center
# Calculate distances from center
dists <- apply(x, 1, function(row) sqrt(sum((row - center)^2)))
# Remove top contamination% of points for robust estimation
threshold <- quantile(dists, 1 - contamination, na.rm = TRUE)
good_points <- dists <= threshold
# Recalculate center and covariance on clean subset
robust_center <- apply(x[good_points, ], 2, mean, na.rm = TRUE)
robust_covariance <- cov(x[good_points, ], use = "complete.obs")
return(list(center = robust_center, cov = robust_covariance))
}
# Robust covariance estimation
robust_result <- robust_cov(X_clean, contamination = 0.1)
# Mahalanobis distance
mahalanobis_dist <- mahalanobis(X_clean, robust_result$center, robust_result$cov)
# Chi-square cutoff (90% confidence level for 2 dimensions)
cutoff <- qchisq(0.90, df = ncol(X_clean))
# Mark outliers: -1 for outlier, 1 for inlier
outliers_elliptic <- ifelse(mahalanobis_dist > cutoff, -1, 1)
if (requireNamespace("dbscan", quietly = TRUE)) {
lof_scores <- dbscan::lof(X_clean, minPts = 20)
lof_threshold <- quantile(lof_scores, 0.9)
outliers_knn <- ifelse(lof_scores > lof_threshold, -1, 1)
} else {
cat("dbscan package not available\n")
}
df_business$Outlier_Elliptic <- NA
df_business$Outlier_KNN <- NA
df_business$Outlier_Elliptic[clean_indices] <- outliers_elliptic
df_business$Outlier_KNN[clean_indices] <- outliers_knn
df_business$Final_Outlier <- df_business$Outlier_Elliptic + df_business$Outlier_KNN
df_business$Final_Outlier <- as.factor(df_business$Final_Outlier)
outlier_businesses <- df_business %>%
filter(Final_Outlier == -2) %>%
select(name, review_count, stars, total_stars, Final_Outlier)
head(outlier_businesses, 25)
## name review_count stars
## 1 Octopus Car Wash 90 2.0
## 2 Creative Beginnings Pre-School & Kindergarten 23 1.0
## 3 Freddy's Frozen Custard & Steakburgers 112 3.5
## 4 La Placita Village Parking Garage 6 1.0
## 5 Prep & Pastry 2126 4.5
## 6 Frost Gelato 217 4.5
## 7 Hooters 109 3.5
## 8 Barro's Pizza 116 3.5
## 9 La Parrilla Suiza 125 3.0
## 10 Old Pueblo Traders 11 1.0
## 11 Bing's Boba Tea 182 4.0
## 12 McDonald's 83 1.5
## 13 eegee's 57 3.0
## 14 Southern Arizona Veterinary Specialty & Emergency Center 93 3.0
## 15 Brawley's Restaurant 90 3.5
## 16 Street- Taco and Beer Co. 805 4.5
## 17 The Screamery HandCrafted Ice Cream 173 4.5
## 18 Blake's Lotaburger 69 2.0
## 19 eegee's 25 3.0
## 20 Starbucks 67 3.5
## 21 Frost Gelato 243 4.5
## 22 Golden Corral Buffet & Grill 84 2.5
## 23 Octopus Car Wash 102 2.0
## 24 Republic Services of Tucson, AZ 96 1.5
## 25 Maloney's 86 2.0
## total_stars Final_Outlier
## 1 9.5 -2
## 2 1.0 -2
## 3 13.5 -2
## 4 1.0 -2
## 5 4.5 -2
## 6 13.5 -2
## 7 8.5 -2
## 8 7.5 -2
## 9 9.0 -2
## 10 1.0 -2
## 11 8.0 -2
## 12 78.0 -2
## 13 86.5 -2
## 14 7.0 -2
## 15 7.0 -2
## 16 8.5 -2
## 17 9.0 -2
## 18 7.5 -2
## 19 86.5 -2
## 20 204.5 -2
## 21 13.5 -2
## 22 7.5 -2
## 23 9.5 -2
## 24 1.5 -2
## 25 2.0 -2
fig <- plot_ly(
data = df_business,
x = ~review_count,
y = ~stars,
z = ~total_stars,
color = ~Final_Outlier, # must be a factor or categorical
colors = c("blue", "orange", "red"),
type = "scatter3d",
mode = "markers",
marker = list(size = 4, opacity = 0.7)
) %>%
layout(
title = "3D Scatter of Reviews vs. Stars vs. Total Stars",
scene = list(
xaxis = list(title = "Review Count"),
yaxis = list(title = "Star Rating"),
zaxis = list(title = "Total Stars")
)
)
fig