Module 2 EDA

Data Quality Assessment

Check for missing values

colSums(is.na(df_business))

##                business_id                       name 
##                          0                          0 
##                    address                       city 
##                          0                          0 
##                      state                postal_code 
##                          0                          0 
##                   latitude                  longitude 
##                          0                          0 
##                      stars               review_count 
##                          0                          0 
##                    is_open          ByAppointmentOnly 
##                          0                       6552 
## BusinessAcceptsCreditCards                BikeParking 
##                       1763                       5601 
##     RestaurantsPriceRange2                  CoatCheck 
##                       4909                       9619 
##         RestaurantsTakeOut        RestaurantsDelivery 
##                       6787                       6892 
##                     Caters                       WiFi 
##                       7752                       6694 
##            BusinessParking       WheelchairAccessible 
##                       4598                       8186 
##                  HappyHour             OutdoorSeating 
##                       9052                       7374 
##                      HasTV    RestaurantsReservations 
##                       7625                       7603 
##                DogsAllowed                    Alcohol 
##                       8887                       7691 
##                GoodForKids          RestaurantsAttire 
##                       6966                       7852 
##                   Ambience    RestaurantsTableService 
##                       7658                       8957 
##   RestaurantsGoodForGroups                  DriveThru 
##                       7629                       9350 
##                 NoiseLevel                GoodForMeal 
##                       7921                       8386 
##     BusinessAcceptsBitcoin                    Smoking 
##                       8793                       9692 
##                      Music             GoodForDancing 
##                       9562                       9691 
##           AcceptsInsurance                 BestNights 
##                       9409                       9638 
##                       BYOB                    Corkage 
##                       9683                       9756 
##                BYOBCorkage          HairSpecializesIn 
##                       9846                       9848 
##                Open24Hours  RestaurantsCounterService 
##                       9911                       9911 
##                AgesAllowed        DietaryRestrictions 
##                       9895                       9911 
##                 categories                     Monday 
##                          6                       2114 
##                    Tuesday                  Wednesday 
##                       1729                       1605 
##                   Thursday                     Friday 
##                       1531                       1577 
##                   Saturday                     Sunday 
##                       2949                       4980

colSums(is.na(df_review))

##   review_id     user_id business_id       stars      useful       funny 
##           0           0           0           0           0           0 
##        cool        date 
##           0           0

colSums(is.na(df_tip))

##          user_id      business_id             date compliment_count 
##                0                0                0                0

Completeness percentages

key_fields <- c("name","stars", "review_count", "is_open", "categories")

df_business %>%
  summarise(across(all_of(key_fields), ~ mean(!is.na(.)) * 100))

##   name stars review_count is_open categories
## 1  100   100          100     100   99.93947

Identify Duplicates

duplicated_rows <- df_business[duplicated(df_business), ]
dim(duplicated_rows)

## [1]  0 14

Basic Descriptive Statistics

Summary Statistics & Frequency Distributions

summary(df_business$stars)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.000   3.500   3.592   4.500   5.000

ggplot(df_business, aes(x = stars)) +
  geom_bar(fill = "goldenrod") +
  theme_minimal() +
  labs(title = "Distribution of Average Star Ratings per Business", x = "Stars", y = "Count of Average Rating")

summary(df_business$review_count)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00    8.00   15.00   41.63   36.00 2126.00

summary(df_review$stars)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   4.000   3.694   5.000   5.000

ggplot(df_review, aes(x = stars)) +
  geom_bar(fill = "gold") +
  theme_minimal() +
  labs(title = "Distribution of All Star Ratings", x = "Stars", y = "Count of Reviews")

summary(df_review$useful)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   1.271   2.000 236.000

summary(df_review$funny)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3123  0.0000 72.0000

summary(df_review$cool)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.0000   0.0000   0.0000   0.4148   0.0000 130.0000

summary(df_tip$compliment_count)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.01575 0.00000 3.00000

# Price Range
freq_price <- df_business |> 
  group_by(attributes$RestaurantsPriceRange2) |> 
  summarize(count = n()) |> 
  slice(1:4)
 
ggplot(freq_price, aes(x = `attributes$RestaurantsPriceRange2`, y = count)) +
  geom_col(fill = "forestgreen") +
  labs(title = "Distribution of Price Range", x = "Price Range", 
       y = "Count") +
  theme(plot.title = element_text(hjust = 0.5, size = 17), 
        axis.text.x = 
          element_text(angle = 0, vjust = 0.55, size = 10), 
        axis.text.y = element_text(size = 12), 
        axis.title = element_text(size = 15))

Bivariate Analysis - Star Reviews & Review Count

ggplot(df_business, aes(x = stars, y = review_count)) +
  geom_jitter(alpha = 0.3, color = "red") +
  labs(title = "Star Ratings vs. Review Count", x = "Star Rating", y = "Review Count")

Subpopulation Analysis - Business Categories

# Group businesses by category groups
df_business <- df_business %>%
  mutate(category_group = case_when(
    grepl("Grocery", categories, ignore.case = TRUE) ~ "Grocery",
    grepl("Restaurants|Food|Cafe|Bakery|Bar", categories, ignore.case = TRUE) ~ "Food & Drink",
    grepl("Health|Dentist|Doctor|Chiropractor|Medical", categories, ignore.case = TRUE) ~ "Health & Medical",
    grepl("Beauty|Salon|Spa|Hair|Nail", categories, ignore.case = TRUE) ~ "Beauty",
    grepl("Auto|Car|Vehicle|Gas|Automotive|Oil Change", categories, ignore.case = TRUE) ~ "Automotive",
    grepl("Hotel|Motel|Resort|Lodging", categories, ignore.case = TRUE) ~ "Hospitality",
    grepl("Pet|Animal", categories, ignore.case = TRUE) ~ "Pets",
    grepl("Education|School|Tutor|University", categories, ignore.case = TRUE) ~ "Education",
    grepl("Retail|Shop|Store|Shopping|Clothing", categories, ignore.case = TRUE) ~ "Retail",
    grepl("Real Estate|Property|Mortgage", categories, ignore.case = TRUE) ~ "Real Estate",
    grepl("Services", categories, ignore.case = TRUE) ~ "Services",
    grepl("Active|Fitness", categories, ignore.case = TRUE) ~ "Active Life",
    TRUE ~ "Other"
  ))

df_business %>%
  count(category_group, sort = TRUE)

##      category_group    n
## 1      Food & Drink 3411
## 2            Retail 1212
## 3          Services 1157
## 4        Automotive 1002
## 5  Health & Medical  988
## 6            Beauty  697
## 7       Real Estate  293
## 8              Pets  278
## 9       Active Life  242
## 10      Hospitality  218
## 11          Grocery  198
## 12        Education  112
## 13            Other  104

# Statistics by Category Group
category_summary <- df_business %>%
  group_by(category_group) %>%
  summarise(
    n_businesses = n(),
    avg_stars = mean(stars, na.rm = TRUE),
    median_stars = median(stars, na.rm = TRUE),
    sd_stars = sd(stars, na.rm = TRUE),
    avg_reviews = mean(review_count, na.rm = TRUE),
    median_reviews = median(review_count, na.rm = TRUE),
    sd_reviews = sd(review_count, na.rm = TRUE)
  ) %>%
  arrange(desc(n_businesses))

category_summary

## # A tibble: 13 × 8
##    category_group   n_businesses avg_stars median_stars sd_stars avg_reviews
##    <chr>                   <int>     <dbl>        <dbl>    <dbl>       <dbl>
##  1 Food & Drink             3411      3.52          3.5    0.837        80.7
##  2 Retail                   1212      3.67          4      0.941        18.3
##  3 Services                 1157      3.63          4      1.09         15.2
##  4 Automotive               1002      3.71          4      1.02         21.8
##  5 Health & Medical          988      3.60          4      1.09         17.4
##  6 Beauty                    697      3.80          4      0.878        32.3
##  7 Real Estate               293      2.63          2.5    1.12         12.9
##  8 Pets                      278      3.95          4      0.731        22.5
##  9 Active Life               242      4.06          4      0.778        26.4
## 10 Hospitality               218      3.25          3      1.26         31.1
## 11 Grocery                   198      3.38          3.5    0.846        41.3
## 12 Education                 112      3.63          4      1.21         16.6
## 13 Other                     104      3.73          4      1.05         26.1
## # ℹ 2 more variables: median_reviews <dbl>, sd_reviews <dbl>

Date Ranges of Reviews

df_review <- df_review %>%
  mutate(date = as.Date(date))

range(df_review$date)

## [1] "2005-03-02" "2022-01-19"

Business Data Analysis

Business Distribution by Geography

leaflet(df_business) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    radius = 3,
    popup = ~paste0("<b>", name, "</b><br>Rating: ", stars),
    color = ~ifelse(stars >= 4, "green", "red"),
    fillOpacity = 0.6
  )

Star Reviews & Review Count (repeated graphic)

ggplot(df_business, aes(x = stars, y = review_count)) +
  geom_jitter(alpha = 0.3, color = "red") +
  labs(title = "Star Ratings vs. Review Count", x = "Star Rating", y = "Review Count")

# Businesses with extremely high review counts appear to be outliers/unusual
outliers <- df_business %>%
  filter((stars = 5 & review_count > 500) | (stars <= 2 & review_count > 1000))

dim(outliers)

## [1] 65 15

Rating Distributions Across Categories

ggplot(df_business, aes(x = category_group, y = stars)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Star Ratings Across Categories")

Average Rating by Geography

# Step 1: Bin and aggregate
heatmap_data <- df_business %>%
  mutate(
    lat_bin = round(latitude, 1),
    lon_bin = round(longitude, 1)
  ) %>%
  group_by(lat_bin, lon_bin) %>%
  summarise(avg_rating = mean(stars, na.rm = TRUE), .groups = "drop")

# Step 2: Define color palette
pal <- colorNumeric(palette = "viridis", domain = heatmap_data$avg_rating)

# Step 3: Leaflet map with rectangles for each grid cell
leaflet(heatmap_data) %>%
  addTiles() %>%
  addRectangles(
    lng1 = ~lon_bin - 0.05, lat1 = ~lat_bin - 0.05,   # bottom-left corner
    lng2 = ~lon_bin + 0.05, lat2 = ~lat_bin + 0.05,   # top-right corner
    fillColor = ~pal(avg_rating),
    fillOpacity = 0.6,
    color = "#444",
    weight = 0.5,
    popup = ~paste0("Avg Rating: ", round(avg_rating, 2),
                    "<br>Bin: (", lat_bin, ", ", lon_bin, ")")
  ) %>%
  addLegend(pal = pal, values = ~avg_rating, title = "Avg Rating")

Outlier Detection and Analysis

Dataframe Enhancement

# Stats grouped by business name
business_name_summary <- df_business %>%
  group_by(name) %>%
  summarise(
    num_cities = n_distinct(city),
    num_states = n_distinct(state),
    total_locations = n(),  # number of rows with that name
    total_stars = sum(stars, na.rm = TRUE),
    avg_stars = mean(stars, na.rm = TRUE)
  )

tips_summary <- df_tip %>%
  inner_join(df_business %>% select(business_id, name), by = "business_id") %>%
  group_by(name) %>%
  summarise(
    num_tips = n(),
    total_compliments = sum(compliment_count, na.rm = TRUE)
  )

review_summary <- df_review %>%
  inner_join(df_business %>% select(business_id, name), by = "business_id") %>%
  group_by(name) %>%
  summarise(
    total_reviews = n()
  )

business_stats_by_name <- business_name_summary %>%
  left_join(tips_summary, by = "name") %>%
  left_join(review_summary, by = "name")

df_business <- df_business %>%
  left_join(business_stats_by_name, by = "name")

# Number of attributes per business ID
df_business <- df_business %>%
  mutate(
    num_attributes = select(., starts_with("attributes")) %>%
      apply(1, function(x) sum(!is.na(x)))
  )

# Number of categories per business ID
df_business <- df_business %>%
  mutate(
    num_categories = ifelse(
      is.na(categories),
      0,
      sapply(strsplit(categories, ","), function(x) length(trimws(x)))
    )
  )

Statistics

summary(select(df_business, 16:25))

##    num_cities      num_states total_locations   total_stars       avg_stars    
##  Min.   :1.000   Min.   :1    Min.   : 1.000   Min.   :  1.00   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1    1st Qu.: 1.000   1st Qu.:  3.50   1st Qu.:3.000  
##  Median :1.000   Median :1    Median : 1.000   Median :  4.50   Median :3.500  
##  Mean   :1.261   Mean   :1    Mean   : 3.606   Mean   : 11.24   Mean   :3.592  
##  3rd Qu.:1.000   3rd Qu.:1    3rd Qu.: 2.000   3rd Qu.:  6.00   3rd Qu.:4.500  
##  Max.   :7.000   Max.   :1    Max.   :63.000   Max.   :204.50   Max.   :5.000  
##                                                                                
##     num_tips      total_compliments total_reviews    num_attributes  
##  Min.   :  1.00   Min.   : 0.0000   Min.   :   5.0   Min.   : 0.000  
##  1st Qu.:  2.00   1st Qu.: 0.0000   1st Qu.:  10.0   1st Qu.: 2.000  
##  Median :  5.00   Median : 0.0000   Median :  24.0   Median : 4.000  
##  Mean   : 28.91   Mean   : 0.3708   Mean   : 114.7   Mean   : 6.933  
##  3rd Qu.: 25.00   3rd Qu.: 0.0000   3rd Qu.:  98.0   3rd Qu.:10.000  
##  Max.   :658.00   Max.   :10.0000   Max.   :2627.0   Max.   :33.000  
##  NA's   :2693     NA's   :2693                                       
##  num_categories  
##  Min.   : 0.000  
##  1st Qu.: 3.000  
##  Median : 4.000  
##  Mean   : 4.355  
##  3rd Qu.: 6.000  
##  Max.   :25.000  
##

Variable Analysis - Review Ratios

df_review <- df_review %>%
mutate(
  total_votes = useful + funny + cool,
  useful_ratio = ifelse(total_votes > 0, useful / total_votes, NA),
  funny_ratio  = ifelse(total_votes > 0, funny / total_votes, NA),
  cool_ratio   = ifelse(total_votes > 0, cool / total_votes, NA)
)

df_review %>%
  summarise(
    avg_total_votes = mean(total_votes, na.rm = TRUE),
    avg_useful_ratio = mean(useful_ratio, na.rm = TRUE),
    avg_funny_ratio  = mean(funny_ratio, na.rm = TRUE),
    avg_cool_ratio   = mean(cool_ratio, na.rm = TRUE)
  )

##   avg_total_votes avg_useful_ratio avg_funny_ratio avg_cool_ratio
## 1        1.998339        0.6906884       0.1259866       0.183325

ggplot(df_review, aes(x = useful_ratio)) +
  geom_histogram(bins = 30, fill = "steelblue") +
  labs(title = "Distribution of Useful Vote Ratios", x = "Useful Vote Ratio", y = "Count")

## Warning: Removed 192047 rows containing non-finite outside the scale range
## (`stat_bin()`).

Variable Analysis - Extreme Lat/Long Coordinates

# Geographic coordinate statistics
coord_stats <- df_business %>%
summarise(
lat_mean = round(mean(latitude, na.rm = TRUE), 4),
lat_median = round(median(latitude, na.rm = TRUE), 4),
lat_min = min(latitude, na.rm = TRUE),
lat_max = max(latitude, na.rm = TRUE),
lat_range = round(max(latitude, na.rm = TRUE) - min(latitude, na.rm = TRUE), 4),
lon_mean = round(mean(longitude, na.rm = TRUE), 4),
lon_median = round(median(longitude, na.rm = TRUE), 4),
lon_min = min(longitude, na.rm = TRUE),
lon_max = max(longitude, na.rm = TRUE),
lon_range = round(max(longitude, na.rm = TRUE) - min(longitude, na.rm = TRUE), 4)
)

cat("Coordinate Statistics:\n")

## Coordinate Statistics:

print(coord_stats)

##   lat_mean lat_median  lat_min  lat_max lat_range  lon_mean lon_median
## 1  32.2485    32.2412 31.90027 32.54964    0.6494 -110.9341  -110.9444
##     lon_min   lon_max lon_range
## 1 -111.2849 -110.6336    0.6513

# Identify extreme coordinates
extreme_coords <- df_business %>%
mutate(
lat_extreme = latitude < quantile(latitude, 0.01, na.rm = TRUE) |
latitude > quantile(latitude, 0.99, na.rm = TRUE),
lon_extreme = longitude < quantile(longitude, 0.01, na.rm = TRUE) |
longitude > quantile(longitude, 0.99, na.rm = TRUE),
coord_extreme = lat_extreme | lon_extreme
)


extreme_coord_summary <- extreme_coords %>%
summarise(
total_businesses = n(),
extreme_lat = sum(lat_extreme, na.rm = TRUE),
extreme_lon = sum(lon_extreme, na.rm = TRUE),
extreme_either = sum(coord_extreme, na.rm = TRUE),
pct_extreme = round(sum(coord_extreme, na.rm = TRUE) / n() * 100, 2)
)


cat("Extreme Coordinates Summary (1st and 99th percentiles):\n")

## Extreme Coordinates Summary (1st and 99th percentiles):

print(extreme_coord_summary)

##   total_businesses extreme_lat extreme_lon extreme_either pct_extreme
## 1             9912         200         199            399        4.03

# extreme coordinates
extreme_coordinators <- extreme_coords %>%
filter(coord_extreme == TRUE) %>%
select(name, city, state, latitude, longitude, stars, review_count) %>%
arrange(desc(abs(latitude - coord_stats$lat_mean) + abs(longitude - coord_stats$lon_mean))) %>%
head(10)

cat("\nBusinesses with Extreme Coordinates:\n")

## 
## Businesses with Extreme Coordinates:

print(extreme_coordinators)

##                            name   city state latitude longitude stars
## 1     Southern Arizona Plumbing Tucson    AZ 32.01492 -110.6646   4.5
## 2   Portraits By Denise and Jim Tucson    AZ 32.01492 -110.6646   5.0
## 3            Molecular Munchies Tucson    AZ 32.01492 -110.6646   5.0
## 4            The Cookie Lady Co Tucson    AZ 32.01492 -110.6646   1.0
## 5  Fantastic Five Entertainment Tucson    AZ 32.01492 -110.6646   4.5
## 6               Substance Diner Tucson    AZ 32.01492 -110.6646   4.5
## 7                  J B Painting Tucson    AZ 32.01492 -110.6646   4.0
## 8        Rockin' Z Doggie Ranch   Vail    AZ 31.97326 -110.7156   3.5
## 9                  Vail Flowers   Vail    AZ 31.97606 -110.7174   5.0
## 10  Colossal Cave Mountain Park   Vail    AZ 32.06207 -110.6336   4.5
##    review_count
## 1            16
## 2             5
## 3            21
## 4             9
## 5             5
## 6            17
## 7             7
## 8             5
## 9            21
## 10          258

# Visualization: Geographic Distribution

map_df <- extreme_coords %>%
filter(!is.na(latitude) & !is.na(longitude))

# Interactive leaflet map

leaflet(data = map_df) %>%
addProviderTiles("CartoDB.Positron") %>%
addCircleMarkers(
lng = ~longitude,
lat = ~latitude,
color = ~ifelse(coord_extreme, "red", "blue"),
radius = 3,
stroke = FALSE,
fillOpacity = 0.6,
popup = ~paste0("<b>", name, "</b><br>City: ", city, "<br>Stars: ", stars, "<br>Reviews: ", review_count)
) %>%

addLegend(position = "bottomright",
colors = c("red", "blue"),
labels = c("Extreme Coordinates", "Normal Coordinates"),
title = "Legend")

Multivariate Outlier Detection

# Define the new dataset and features
X <- df_business[, c("review_count", "stars", "total_stars")]

# Remove rows with any NA values for outlier detection
X_clean <- na.omit(X)
clean_indices <- complete.cases(df_business[, c("review_count", "stars", "total_stars")])

# Robust function
robust_cov <- function(x, contamination = 0.1) {
  # Use classical estimates as starting point
  center <- apply(x, 2, median, na.rm = TRUE)  # Use median as robust center
  
  # Calculate distances from center
  dists <- apply(x, 1, function(row) sqrt(sum((row - center)^2)))
  
  # Remove top contamination% of points for robust estimation
  threshold <- quantile(dists, 1 - contamination, na.rm = TRUE)
  good_points <- dists <= threshold
  
  # Recalculate center and covariance on clean subset
  robust_center <- apply(x[good_points, ], 2, mean, na.rm = TRUE)
  robust_covariance <- cov(x[good_points, ], use = "complete.obs")
  
  return(list(center = robust_center, cov = robust_covariance))
}

# Robust covariance estimation
robust_result <- robust_cov(X_clean, contamination = 0.1)

# Mahalanobis distance
mahalanobis_dist <- mahalanobis(X_clean, robust_result$center, robust_result$cov)

# Chi-square cutoff (90% confidence level for 2 dimensions)
cutoff <- qchisq(0.90, df = ncol(X_clean))

# Mark outliers: -1 for outlier, 1 for inlier
outliers_elliptic <- ifelse(mahalanobis_dist > cutoff, -1, 1)

if (requireNamespace("dbscan", quietly = TRUE)) {
  lof_scores <- dbscan::lof(X_clean, minPts = 20)
  lof_threshold <- quantile(lof_scores, 0.9)
  outliers_knn <- ifelse(lof_scores > lof_threshold, -1, 1)
} else {
  cat("dbscan package not available\n")
}

df_business$Outlier_Elliptic <- NA
df_business$Outlier_KNN <- NA

df_business$Outlier_Elliptic[clean_indices] <- outliers_elliptic
df_business$Outlier_KNN[clean_indices] <- outliers_knn

df_business$Final_Outlier <- df_business$Outlier_Elliptic + df_business$Outlier_KNN
df_business$Final_Outlier <- as.factor(df_business$Final_Outlier)

outlier_businesses <- df_business %>%
  filter(Final_Outlier == -2) %>%
  select(name, review_count, stars, total_stars, Final_Outlier)

head(outlier_businesses, 25)

##                                                        name review_count stars
## 1                                          Octopus Car Wash           90   2.0
## 2             Creative Beginnings Pre-School & Kindergarten           23   1.0
## 3                    Freddy's Frozen Custard & Steakburgers          112   3.5
## 4                         La Placita Village Parking Garage            6   1.0
## 5                                             Prep & Pastry         2126   4.5
## 6                                              Frost Gelato          217   4.5
## 7                                                   Hooters          109   3.5
## 8                                             Barro's Pizza          116   3.5
## 9                                         La Parrilla Suiza          125   3.0
## 10                                       Old Pueblo Traders           11   1.0
## 11                                          Bing's Boba Tea          182   4.0
## 12                                               McDonald's           83   1.5
## 13                                                  eegee's           57   3.0
## 14 Southern Arizona Veterinary Specialty & Emergency Center           93   3.0
## 15                                     Brawley's Restaurant           90   3.5
## 16                                Street- Taco and Beer Co.          805   4.5
## 17                      The Screamery HandCrafted Ice Cream          173   4.5
## 18                                       Blake's Lotaburger           69   2.0
## 19                                                  eegee's           25   3.0
## 20                                                Starbucks           67   3.5
## 21                                             Frost Gelato          243   4.5
## 22                             Golden Corral Buffet & Grill           84   2.5
## 23                                         Octopus Car Wash          102   2.0
## 24                          Republic Services of Tucson, AZ           96   1.5
## 25                                                Maloney's           86   2.0
##    total_stars Final_Outlier
## 1          9.5            -2
## 2          1.0            -2
## 3         13.5            -2
## 4          1.0            -2
## 5          4.5            -2
## 6         13.5            -2
## 7          8.5            -2
## 8          7.5            -2
## 9          9.0            -2
## 10         1.0            -2
## 11         8.0            -2
## 12        78.0            -2
## 13        86.5            -2
## 14         7.0            -2
## 15         7.0            -2
## 16         8.5            -2
## 17         9.0            -2
## 18         7.5            -2
## 19        86.5            -2
## 20       204.5            -2
## 21        13.5            -2
## 22         7.5            -2
## 23         9.5            -2
## 24         1.5            -2
## 25         2.0            -2

fig <- plot_ly(
  data = df_business,
  x = ~review_count,
  y = ~stars,
  z = ~total_stars,       
  color = ~Final_Outlier,     # must be a factor or categorical
  colors = c("blue", "orange", "red"), 
  type = "scatter3d",
  mode = "markers",
  marker = list(size = 4, opacity = 0.7)
) %>%
  layout(
    title = "3D Scatter of Reviews vs. Stars vs. Total Stars",
    scene = list(
      xaxis = list(title = "Review Count"),
      yaxis = list(title = "Star Rating"),
      zaxis = list(title = "Total Stars")
    )
  )

fig

Module 2 EDA

Group 4: Lillian Lema, Kalisi Logan, Alex Morrow, Albert Darko

2025-06-05

Data Quality Assessment

Check for missing values

Completeness percentages

Identify Duplicates

Basic Descriptive Statistics

Summary Statistics & Frequency Distributions

Bivariate Analysis - Star Reviews & Review Count

Subpopulation Analysis - Business Categories

Date Ranges of Reviews

Business Data Analysis

Business Distribution by Geography

Star Reviews & Review Count (repeated graphic)

Rating Distributions Across Categories

Average Rating by Geography

Outlier Detection and Analysis

Dataframe Enhancement

Statistics

Variable Analysis - Review Ratios

Variable Analysis - Extreme Lat/Long Coordinates

Multivariate Outlier Detection