Data Loading & Initial Setup

# Load the datasets
business_df <- read_csv('business_4.csv')
checkin_df <- read_csv('checkin_4.csv')

# Display basic information about the datasets
cat("Business dataset dimensions:", dim(business_df), "\n")
## Business dataset dimensions: 5100 71
cat("Checkin dataset dimensions:", dim(checkin_df), "\n")
## Checkin dataset dimensions: 4962 2
# Display first few rows
knitr::kable(head(business_df), caption = "Sample Business Data")
Sample Business Data
business_id name address city state postal_code latitude longitude stars review_count is_open attributes.ByAppointmentOnly attributes.BusinessAcceptsCreditCards attributes.BikeParking attributes.RestaurantsPriceRange2 attributes.CoatCheck attributes.RestaurantsTakeOut attributes.RestaurantsDelivery attributes.Caters attributes.WiFi attributes.BusinessParking attributes.WheelchairAccessible attributes.HappyHour attributes.OutdoorSeating attributes.HasTV attributes.RestaurantsReservations attributes.DogsAllowed attributes.Alcohol attributes.GoodForKids attributes.RestaurantsAttire attributes.Ambience attributes.RestaurantsTableService attributes.RestaurantsGoodForGroups attributes.DriveThru attributes.NoiseLevel attributes.GoodForMeal attributes.BusinessAcceptsBitcoin attributes.Smoking attributes.Music attributes.GoodForDancing attributes.AcceptsInsurance attributes.BestNights attributes.BYOB attributes.Corkage attributes.BYOBCorkage attributes.HairSpecializesIn attributes.Open24Hours attributes.RestaurantsCounterService attributes.AgesAllowed attributes.DietaryRestrictions categories hours.Monday hours.Tuesday hours.Wednesday hours.Thursday hours.Friday hours.Saturday hours.Sunday Food_Beverage Health_Fitness Personal_Care Automotive Retail Professional_Services Entertainment Travel_Hospitality Education Home_Services Healthcare Veterinary_Services Sum_of_Industries
wghnIlMb_i5U46HMBGx9ig China Dragon Restaurant 1625 W Valencia Rd, Ste 101-103 Tucson AZ 85746 32.13230 -111.0000 3.0 23 0 NA TRUE NA 2 NA True False False ‘no’ {‘garage’: False ‘street’: False ‘validated’: False ‘lot’: False ‘valet’: False} NA NA False FALSE True NA ‘beer_and_wine’ True u’casual’ {‘romantic’: False ‘intimate’: False ‘classy’: False ‘hipster’: False ‘divey’: False ‘touristy’: False ‘trendy’: False ‘upscale’: False ‘casual’: True} NA True NA u’quiet’ NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA Restaurants, Chinese 11:0-21:0 11:0-21:0 11:0-21:0 11:0-21:0 11:0-21:0 11:0-21:0 11:0-21:0 1,0,0,0,0,0,0,0,0,0,0,0,1
txyXRytGjwOXvS8s4sc-WA Smoothie King 1070 E Tucson Marketplace Blvd Tucson AZ 85713 32.18679 -110.9548 3.0 29 1 NA TRUE True 2 NA True True NA u’free’ {u’valet’: False u’garage’: False u’street’: None u’lot’: True u’validated’: False} FALSE NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA Vitamins & Supplements, Ice Cream & Frozen Yogurt, Food, Juice Bars & Smoothies, Shopping 0:0-0:0 7:0-21:0 7:0-19:0 7:0-19:0 7:0-19:0 9:0-20:0 11:0-18:0 1 0 0 0 1 0 0 0 0,0,0,0,2
x9K0RfZaT_zlw6DklBDzjw Gavi Italian Restaurant 7401 N La Cholla Blvd, Ste 146 Tucson AZ 85707 32.22167 -110.9258 3.5 9 0 NA TRUE NA 2 NA True False NA NA {‘garage’: False ‘street’: False ‘validated’: False ‘lot’: True ‘valet’: False} NA NA True NA False NA u’full_bar’ True u’casual’ NA NA True NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA Italian, Restaurants NA NA NA NA NA NA NA 1 0 0 0 0 0 0 0 0,0,0,0,1
IDtLPgUrqorrpqSLdfMhZQ Helena Avenue Bakery 131 Anacapa St, Ste C Santa Barbara CA 93101 34.41444 -119.6907 4.0 389 1 NA TRUE True 2 NA True None True u’no’ {‘garage’: False ‘street’: True ‘validated’: False ‘lot’: False ‘valet’: False} TRUE FALSE True FALSE False True u’none’ True ‘casual’ {‘touristy’: False ‘hipster’: True ‘romantic’: False ‘divey’: False ‘intimate’: False ‘trendy’: True ‘upscale’: False ‘classy’: False ‘casual’: True} False True NA u’average’ {‘dessert’: False ‘latenight’: False ‘lunch’: True ‘dinner’: False ‘brunch’: True ‘breakfast’: True} NA NA NA NA NA NA NA NA NA NA NA NA NA NA Food, Restaurants, Salad, Coffee & Tea, Breakfast & Brunch, Sandwiches, Bakeries 0:0-0:0 8:0-14:0 8:0-14:0,8:0-14:0,8:0-14:0,8:0-14:0,8:0-14:0,1,0,0,0,0,0,0,0,0,0,0,0,1
anLQj9AM8vjbcLSIE0iUgg Papa Murphy’s 7250 North La Cholla, Suite 186 Tucson AZ 85741 32.33857 -111.0108 4.0 22 1 NA TRUE False 1 NA True True NA NA {‘garage’: False ‘street’: False ‘validated’: False ‘lot’: False ‘valet’: False} NA NA False TRUE False NA u’none’ True u’casual’ NA NA True True u’quiet’ NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA Pizza, Restaurants NA NA NA NA NA NA NA 1 0 0 0 0 0 0 0 0,0,0,0,1
9C2rpb56aQvW0ViZHK9sPw Home Plate Sports Pub 4880 E 22nd St Tucson AZ 85711 32.20631 -110.8878 3.5 56 1 NA TRUE True 1 NA NA False False u’no’ {u’valet’: False u’garage’: False u’street’: None u’lot’: None u’validated’: None} NA TRUE True TRUE False NA u’full_bar’ True u’casual’ {u’divey’: None u’hipster’: None u’casual’: True u’touristy’: None u’trendy’: None u’intimate’: False u’romantic’: None u’classy’: False u’upscale’: None} NA True NA u’average’ NA NA NA {u’dj’: None u’live’: False u’jukebox’: None u’video’: False u’background_music’: False u’karaoke’: None u’no_music’: False} NA NA {u’monday’: True u’tuesday’: False u’wednesday’: False u’thursday’: False u’friday’: False u’saturday’: False u’sunday’: False} NA NA NA NA NA,NA,NA,NA,“Bars, Restaurants, Sports Bars, Nightlife”,10:0-2:0,10:0-2:0,10:0-2:0,10:0-2:0,10:0-2:0,10:0-2:0,10:0-2:0,1,0,0,0,0,0,0,0,0,0,0,0,1
knitr::kable(head(checkin_df), caption = "Sample Check-In Data")
Sample Check-In Data
business_id date
–MbOh2O1pATkXa7xbU6LA 2013-04-21 01:52:06, 2013-05-02 04:02:07, 2013-05-04 04:14:31, 2013-05-18 03:39:17, 2013-05-20 03:04:59, 2013-05-22 23:52:49, 2013-05-27 01:38:52, 2013-06-02 04:16:38, 2013-06-07 02:30:10, 2013-07-27 23:08:27, 2013-08-01 03:53:02, 2013-08-22 03:21:29, 2013-09-13 02:29:36, 2013-09-18 02:53:23, 2013-09-23 00:00:06, 2013-10-03 03:22:41, 2013-10-09 04:31:13, 2013-10-12 04:03:40, 2013-10-29 03:45:15, 2013-11-07 01:11:43, 2013-11-08 22:01:11, 2013-11-08 22:01:25, 2013-11-10 02:40:38, 2013-11-10 22:07:07, 2013-11-10 22:07:11, 2013-11-13 00:48:37, 2013-11-14 04:01:15, 2013-11-16 05:07:42, 2013-11-16 05:07:57, 2013-11-18 03:39:25, 2013-11-19 02:10:10, 2013-11-22 01:34:12, 2013-11-23 02:28:53, 2013-11-30 22:51:02, 2013-11-30 22:51:19, 2013-12-03 03:59:23, 2013-12-10 03:19:19, 2013-12-10 03:19:48, 2013-12-12 02:26:02, 2013-12-12 03:31:18, 2014-01-03 03:54:51, 2014-01-08 03:17:42, 2014-01-08 03:17:54, 2014-01-19 03:09:20, 2014-01-19 03:09:28, 2014-01-22 02:46:34, 2014-01-27 01:46:01, 2014-02-05 03:53:33, 2014-02-08 02:18:50, 2014-02-08 21:54:13, 2014-02-08 21:55:11, 2014-02-12 20:59:50, 2014-02-20 01:37:19, 2014-02-26 20:48:45, 2014-03-10 02:14:11, 2014-03-17 21:24:32, 2014-03-29 00:03:39, 2014-05-05 00:20:57, 2014-05-14 23:07:09, 2014-05-22 22:03:29, 2014-05-26 03:25:20, 2014-06-18 20:26:09, 2014-06-22 21:36:07, 2014-06-29 01:02:02, 2014-07-03 03:07:11, 2014-07-05 00:11:12, 2014-07-13 03:03:03, 2014-07-19 04:52:07, 2014-07-22 02:54:57, 2014-07-26 21:26:40, 2014-08-02 02:28:53, 2014-08-02 21:54:00, 2014-08-10 04:24:02, 2014-08-11 02:54:25, 2014-08-20 03:21:49, 2014-09-14 02:52:40, 2014-09-21 23:08:59, 2014-10-02 02:29:55, 2014-10-11 03:47:10, 2014-11-08 04:34:23, 2014-12-08 03:46:50, 2015-01-25 02:43:15, 2015-03-21 03:18:11, 2015-04-08 22:44:06, 2015-05-19 02:26:32, 2015-06-30 03:46:04, 2015-08-28 01:59:45, 2015-09-23 03:53:27, 2015-10-23 22:42:54, 2015-12-19 04:03:31, 2016-01-07 00:52:26, 2016-06-18 19:57:36, 2016-07-06 21:05:09, 2016-07-10 16:47:57, 2016-07-16 22:43:19, 2016-11-11 01:33:12, 2016-11-12 04:35:42, 2017-03-18 21:07:56, 2017-04-19 00:32:31, 2017-04-23 23:33:15, 2017-05-22 02:52:20, 2017-11-12 23:14:20, 2017-12-02 19:57:32
–onnLZrsCazmcy2P_7fcw 2010-09-26 23:38:02, 2011-02-01 21:44:51, 2011-02-27 01:18:19, 2011-04-06 02:40:34, 2011-04-15 02:11:59, 2011-05-08 18:54:05, 2011-05-09 21:10:39, 2011-07-18 02:44:28, 2011-07-31 01:50:07, 2011-08-02 00:58:19, 2011-08-02 01:11:13, 2011-08-29 06:04:47, 2011-08-30 02:40:24, 2011-08-31 04:32:44, 2011-09-01 03:34:10, 2011-09-01 15:34:57, 2011-09-05 23:10:20, 2011-10-21 15:59:44, 2011-11-22 17:49:40, 2011-11-23 02:31:04, 2011-12-13 00:45:14, 2011-12-18 06:28:20, 2011-12-18 19:36:04, 2011-12-24 02:41:40, 2011-12-24 03:02:10, 2012-02-06 21:09:24, 2012-02-16 02:35:49, 2012-02-22 15:42:28, 2012-03-18 06:20:53, 2012-04-25 03:00:06, 2012-05-14 00:03:49, 2012-05-25 07:10:02, 2012-06-10 02:46:59, 2012-06-10 02:48:15, 2012-06-21 06:58:33, 2012-07-10 20:35:56, 2012-07-15 01:44:04, 2012-08-05 18:49:43, 2012-08-05 18:51:20, 2012-08-05 18:51:46, 2012-08-05 18:52:04, 2012-08-28 05:47:14, 2012-09-18 02:05:14, 2012-10-02 15:50:14, 2012-10-29 01:59:58, 2012-11-04 03:03:55, 2012-11-19 21:01:04, 2012-12-07 04:54:35, 2013-01-11 02:29:47, 2013-06-15 01:37:39, 2013-06-15 15:33:32, 2013-06-16 08:38:16, 2013-07-07 17:52:48
-0G_6-KFGpCpxTUlVXCMYQ 2013-12-21 22:08:07, 2013-12-22 03:16:35, 2013-12-22 04:51:08, 2013-12-22 20:27:51, 2013-12-22 21:06:50, 2013-12-22 22:07:11, 2013-12-23 19:53:31, 2013-12-23 20:39:52, 2013-12-23 20:44:59, 2013-12-24 00:52:04, 2013-12-24 18:00:12, 2013-12-27 01:48:04, 2013-12-27 20:18:31, 2013-12-27 21:41:13, 2013-12-28 23:15:11, 2013-12-29 00:53:49, 2013-12-29 02:07:35, 2013-12-29 02:17:57, 2013-12-29 02:20:04, 2013-12-30 03:08:19, 2013-12-30 03:32:53, 2014-01-01 02:30:00, 2014-01-01 03:17:50, 2014-01-01 04:52:42, 2014-01-03 01:19:36, 2014-01-03 02:21:54, 2014-01-03 19:25:27, 2014-01-03 19:48:19, 2014-01-04 04:37:02, 2014-01-04 21:30:29, 2014-01-05 01:20:24, 2014-01-06 00:57:48, 2014-01-06 02:11:46, 2014-01-06 19:42:09, 2014-01-06 20:26:33, 2014-01-07 02:55:19, 2014-01-07 21:26:07, 2014-01-08 01:30:15, 2014-01-08 20:31:58, 2014-01-09 19:07:06, 2014-01-10 00:25:18, 2014-01-10 04:50:25, 2014-01-10 18:57:11, 2014-01-11 03:52:00, 2014-01-11 03:52:03, 2014-01-11 20:54:36, 2014-01-11 21:30:26, 2014-01-11 21:44:58, 2014-01-11 21:52:38, 2014-01-12 03:48:31, 2014-01-15 01:10:53, 2014-01-16 19:39:05, 2014-01-17 03:01:57, 2014-01-17 03:40:50, 2014-01-17 04:22:20, 2014-01-18 20:05:28, 2014-01-19 02:08:35, 2014-01-19 23:26:26, 2014-01-21 00:21:21, 2014-01-21 23:11:33, 2014-01-22 00:50:00, 2014-01-23 02:25:39, 2014-01-23 19:26:14, 2014-01-25 00:54:36, 2014-01-25 01:01:50, 2014-01-25 01:02:28, 2014-01-25 01:08:13, 2014-01-25 01:11:39, 2014-01-26 01:08:44, 2014-01-26 01:08:50, 2014-01-27 01:54:34, 2014-01-28 02:06:58, 2014-01-28 04:05:15, 2014-01-28 19:47:58, 2014-01-28 22:23:55, 2014-01-29 19:06:34, 2014-02-01 02:26:55, 2014-02-01 03:43:45, 2014-02-01 03:44:42, 2014-02-01 19:16:53, 2014-02-02 04:42:16, 2014-02-02 04:46:00, 2014-02-04 23:22:28, 2014-02-05 00:56:16, 2014-02-05 01:05:15, 2014-02-05 01:06:22, 2014-02-05 01:30:46, 2014-02-05 01:55:41, 2014-02-05 02:19:52, 2014-02-05 03:26:28, 2014-02-05 03:31:31, 2014-02-05 03:48:45, 2014-02-05 19:23:02, 2014-02-06 20:30:59, 2014-02-07 19:34:51, 2014-02-08 01:23:34, 2014-02-08 19:52:44, 2014-02-09 00:30:58, 2014-02-09 03:58:09, 2014-02-09 06:03:14, 2014-02-10 01:41:03, 2014-02-12 03:47:14, 2014-02-13 01:35:18, 2014-02-16 00:15:52, 2014-02-16 00:52:37, 2014-02-16 01:45:29, 2014-02-17 02:41:22, 2014-02-17 04:31:43, 2014-02-21 22:43:42, 2014-02-22 00:29:58, 2014-02-22 01:53:11, 2014-02-22 02:29:38, 2014-02-23 01:36:22, 2014-02-23 01:49:15, 2014-02-23 02:33:24, 2014-02-23 03:30:39, 2014-02-23 04:11:37, 2014-02-27 01:18:46, 2014-02-27 18:54:33, 2014-02-27 19:25:05, 2014-02-27 23:29:25, 2014-02-28 03:33:40, 2014-02-28 04:32:30, 2014-02-28 19:43:58, 2014-02-28 22:37:26, 2014-03-01 00:02:41, 2014-03-01 08:28:44, 2014-03-01 20:41:15, 2014-03-02 01:45:13, 2014-03-02 02:28:36, 2014-03-02 23:09:08, 2014-03-04 17:26:25, 2014-03-06 06:32:33, 2014-03-07 01:58:57, 2014-03-07 02:21:38, 2014-03-07 09:20:36, 2014-03-08 00:58:05, 2014-03-08 02:01:52, 2014-03-08 02:05:10, 2014-03-09 00:35:10, 2014-03-09 04:05:52, 2014-03-09 04:21:48, 2014-03-10 00:11:14, 2014-03-14 02:37:25, 2014-03-14 04:18:50, 2014-03-14 05:37:32, 2014-03-14 05:43:51, 2014-03-14 18:54:25, 2014-03-15 00:16:51, 2014-03-18 02:38:15, 2014-03-18 03:01:37, 2014-03-19 02:53:54, 2014-03-19 21:46:31, 2014-03-20 00:14:44, 2014-03-20 02:58:53, 2014-03-22 04:39:26, 2014-03-23 03:50:47, 2014-03-23 19:37:56, 2014-03-24 02:31:35, 2014-03-25 02:32:47, 2014-03-25 03:01:48, 2014-03-25 23:44:44, 2014-03-26 15:10:32, 2014-03-27 01:22:46, 2014-03-27 01:26:23, 2014-03-27 18:11:57, 2014-03-28 23:30:41, 2014-03-29 01:04:08, 2014-03-29 16:53:27, 2014-03-30 02:51:01, 2014-04-01 00:27:01, 2014-04-01 17:36:37, 2014-04-02 16:31:38, 2014-04-02 23:29:08, 2014-04-03 01:31:13, 2014-04-03 18:23:12, 2014-04-03 21:04:45, 2014-04-04 05:08:59, 2014-04-05 00:20:57, 2014-04-05 00:51:46, 2014-04-05 02:24:12, 2014-04-06 03:11:04, 2014-04-06 17:26:32, 2014-04-08 00:18:55, 2014-04-08 23:56:42, 2014-04-09 16:06:57, 2014-04-09 20:29:43, 2014-04-10 17:50:26, 2014-04-10 18:44:10, 2014-04-11 03:07:02, 2014-04-12 00:03:52, 2014-04-12 02:33:05, 2014-04-12 03:53:59, 2014-04-12 05:53:46, 2014-04-13 05:38:54, 2014-04-13 05:39:13, 2014-04-13 23:26:10, 2014-04-14 22:24:10, 2014-04-15 23:15:20, 2014-04-16 18:42:43, 2014-04-16 21:57:14, 2014-04-17 02:07:26, 2014-04-17 02:21:48, 2014-04-17 03:29:10, 2014-04-17 22:44:18, 2014-04-17 23:18:57, 2014-04-18 22:29:10, 2014-04-18 23:50:51, 2014-04-19 00:12:19, 2014-04-19 07:52:04, 2014-04-21 18:23:56, 2014-04-22 23:47:24, 2014-04-23 01:02:30, 2014-04-23 20:02:19, 2014-04-24 01:56:26, 2014-04-24 17:07:42, 2014-04-24 19:25:59, 2014-04-26 00:10:13, 2014-04-26 00:46:31, 2014-04-26 00:53:35, 2014-04-28 23:34:44, 2014-04-29 23:19:03, 2014-04-30 16:05:25, 2014-05-01 03:46:24, 2014-05-01 23:09:36, 2014-05-02 01:38:09, 2014-05-02 23:34:11, 2014-05-03 19:03:01, 2014-05-04 00:11:12, 2014-05-04 03:33:22, 2014-05-04 03:39:19, 2014-05-04 04:10:38, 2014-05-04 18:28:20, 2014-05-06 00:39:00, 2014-05-06 03:11:53, 2014-05-06 04:11:26, 2014-05-06 23:15:31, 2014-05-07 23:36:44, 2014-05-09 07:06:05, 2014-05-09 07:06:30, 2014-05-09 18:24:17, 2014-05-09 23:28:01, 2014-05-10 04:02:24, 2014-05-10 05:15:17, 2014-05-10 21:42:03, 2014-05-11 05:35:17, 2014-05-12 04:01:09, 2014-05-13 00:25:41, 2014-05-13 19:57:45, 2014-05-14 01:54:16, 2014-05-14 18:49:15, 2014-05-15 23:54:32, 2014-05-17 01:00:05, 2014-05-17 01:17:40, 2014-05-17 02:49:02, 2014-05-17 04:39:30, 2014-05-18 02:39:34, 2014-05-18 05:08:34, 2014-05-18 05:28:45, 2014-05-20 00:49:25, 2014-05-21 22:23:04, 2014-05-22 18:38:56, 2014-05-23 19:35:22, 2014-05-23 20:11:58, 2014-05-24 02:37:06, 2014-05-24 02:45:02, 2014-05-24 06:39:20, 2014-05-25 01:04:38, 2014-05-26 23:56:45, 2014-05-27 21:41:58, 2014-05-28 05:44:45, 2014-05-28 22:48:53, 2014-05-30 00:46:37, 2014-05-30 18:57:09, 2014-05-30 19:01:26, 2014-05-31 18:55:19, 2014-05-31 20:29:48, 2014-06-04 23:21:19, 2014-06-06 20:30:33, 2014-06-07 00:10:51, 2014-06-07 00:51:37, 2014-06-07 02:49:46, 2014-06-07 22:08:20, 2014-06-08 21:41:33, 2014-06-14 01:13:48, 2014-06-15 01:34:55, 2014-06-17 03:08:58, 2014-06-18 02:24:32, 2014-06-21 07:29:03, 2014-06-21 20:02:51, 2014-06-22 03:18:22, 2014-06-23 20:41:21, 2014-06-25 20:07:42, 2014-06-25 20:15:53, 2014-06-25 20:17:03, 2014-06-26 00:32:36, 2014-06-29 20:27:30, 2014-07-05 04:42:52, 2014-07-05 04:43:51, 2014-07-05 04:47:15, 2014-07-05 04:48:00, 2014-07-05 19:03:37, 2014-07-05 19:58:07, 2014-07-05 19:59:39, 2014-07-11 20:02:05, 2014-07-12 04:42:23, 2014-07-12 21:56:53, 2014-07-13 01:54:37, 2014-07-13 01:55:18, 2014-07-13 06:32:55, 2014-07-15 01:01:04, 2014-07-17 02:09:50, 2014-07-20 02:31:24, 2014-07-20 03:03:59, 2014-07-20 05:35:33, 2014-07-20 08:27:43, 2014-07-20 08:55:46, 2014-07-25 00:03:30, 2014-07-25 20:31:09, 2014-07-27 03:24:18, 2014-08-01 03:24:12, 2014-08-03 20:41:56, 2014-08-03 20:48:43, 2014-08-06 19:47:33, 2014-08-08 01:42:50, 2014-08-09 19:12:39, 2014-08-09 19:13:25, 2014-08-14 02:36:16, 2014-08-15 20:06:19, 2014-08-16 02:45:19, 2014-08-17 21:59:14, 2014-08-20 18:55:34, 2014-08-23 00:31:35, 2014-08-23 22:35:31, 2014-08-24 00:50:03, 2014-08-24 02:39:23, 2014-08-24 03:12:50, 2014-08-28 22:46:00, 2014-08-30 06:07:03, 2014-08-31 19:56:45, 2014-09-03 01:56:18, 2014-09-06 00:31:15, 2014-09-14 01:29:50, 2014-09-14 05:35:52, 2014-09-21 03:13:22, 2014-09-21 05:14:24, 2014-09-21 20:19:27, 2014-09-26 01:32:41, 2014-09-27 01:30:34, 2014-09-27 04:08:57, 2014-09-28 05:23:43, 2014-10-04 04:38:29, 2014-10-09 01:59:01, 2014-10-09 01:59:02, 2014-10-11 01:29:55, 2014-10-18 22:10:19, 2014-10-19 02:08:05, 2014-10-19 06:05:02, 2014-10-19 18:59:43, 2014-10-23 20:15:06, 2014-10-24 19:56:59, 2014-10-25 03:01:39, 2014-10-30 00:49:04, 2014-10-30 20:17:46, 2014-11-02 01:13:36, 2014-11-03 01:02:16, 2014-11-05 00:54:14, 2014-11-07 18:55:47, 2014-11-07 19:28:58, 2014-11-10 02:59:20, 2014-11-10 03:05:12, 2014-11-10 03:39:12, 2014-11-16 08:16:05, 2014-11-28 18:57:59, 2014-11-28 19:20:53, 2014-11-30 01:19:07, 2014-11-30 01:27:13, 2014-11-30 23:25:00, 2014-12-07 18:52:21, 2014-12-12 20:10:58, 2014-12-13 00:39:50, 2014-12-17 20:28:16, 2014-12-23 19:37:02, 2014-12-28 00:29:37, 2014-12-30 02:16:15, 2015-01-04 03:08:54, 2015-01-04 03:56:17, 2015-01-04 18:38:51, 2015-01-06 18:07:41, 2015-01-06 18:41:17, 2015-01-06 19:23:46, 2015-01-10 05:58:42, 2015-01-10 05:59:26, 2015-01-10 06:02:05, 2015-01-11 01:40:18, 2015-01-11 21:49:48, 2015-01-17 01:14:24, 2015-01-19 00:40:49, 2015-01-19 00:41:44, 2015-01-19 03:22:23, 2015-01-19 19:29:28, 2015-01-20 21:26:56, 2015-01-22 01:18:54, 2015-01-23 19:12:17, 2015-01-24 00:39:05, 2015-01-24 00:54:39, 2015-01-24 00:55:46, 2015-01-24 02:23:27, 2015-01-24 05:31:43, 2015-01-24 07:33:35, 2015-01-31 03:18:02, 2015-01-31 18:50:21, 2015-01-31 18:50:35, 2015-02-05 22:04:56, 2015-02-06 21:19:27, 2015-02-07 03:52:30, 2015-02-08 21:54:19, 2015-02-09 00:29:12, 2015-02-11 21:25:53, 2015-02-13 00:22:57, 2015-02-13 00:22:59, 2015-02-13 01:11:50, 2015-02-13 19:26:20, 2015-02-14 21:24:17, 2015-02-16 01:11:53, 2015-02-17 23:03:55, 2015-02-21 19:59:32, 2015-02-21 20:54:33, 2015-02-22 07:02:36, 2015-02-26 01:42:56, 2015-02-26 04:06:06, 2015-02-28 08:59:42, 2015-03-07 03:58:05, 2015-03-07 03:59:08, 2015-03-07 08:47:51, 2015-03-07 19:30:51, 2015-03-08 19:04:41, 2015-03-09 00:18:37
-1MhPXk1FglglUAmuPLIGg 2010-02-18 06:23:47, 2010-04-09 05:41:02, 2010-07-23 04:31:32, 2010-07-25 22:14:17, 2010-07-31 01:51:09, 2010-08-25 01:41:25, 2010-09-11 02:30:31, 2010-10-02 04:02:51, 2010-10-12 05:08:54, 2010-10-22 03:59:14, 2010-12-04 05:13:55, 2011-01-23 20:03:37, 2011-01-23 20:33:29, 2011-02-07 02:42:15, 2011-02-13 18:22:54, 2011-02-17 00:32:04, 2011-02-20 05:39:37, 2011-02-21 07:06:36, 2011-02-27 00:11:53, 2011-03-27 05:06:49, 2011-04-16 18:44:37, 2011-04-17 16:18:15, 2011-04-29 01:53:33, 2011-05-11 04:58:00, 2011-05-13 02:39:43, 2011-05-14 06:47:34, 2011-05-21 03:44:33, 2011-05-30 21:49:25, 2011-06-04 04:41:09, 2011-06-24 01:10:14, 2011-07-02 03:25:07, 2011-07-02 03:28:31, 2011-07-08 01:18:36, 2011-07-10 03:06:08, 2011-07-10 07:11:22, 2011-07-27 23:00:12, 2011-08-14 01:04:18, 2011-08-20 22:06:37, 2011-08-29 01:49:38, 2011-08-30 03:59:58, 2011-09-03 00:05:16, 2011-09-04 06:53:04, 2011-09-04 20:37:04, 2011-09-05 04:19:13, 2011-09-14 06:57:35, 2011-09-18 19:26:23, 2011-09-18 19:31:19, 2011-09-25 04:37:48, 2011-10-02 02:38:49, 2011-10-04 07:55:09, 2011-10-07 00:35:09, 2011-10-08 02:49:17, 2011-10-09 05:17:00, 2011-10-10 00:43:25, 2011-10-14 04:49:00, 2011-10-16 03:02:51, 2011-10-16 19:20:55, 2011-10-23 20:23:18, 2011-10-29 14:56:19, 2011-11-12 02:19:46, 2011-11-13 20:28:39, 2011-11-16 04:42:29, 2011-11-16 04:54:48, 2011-11-16 05:18:27, 2011-11-25 20:57:49, 2011-11-27 03:49:20, 2011-11-27 22:04:52, 2011-11-28 03:00:15, 2011-12-01 01:49:36, 2011-12-04 19:20:47, 2011-12-09 03:40:52, 2011-12-12 01:47:38, 2011-12-18 05:49:10, 2011-12-18 06:53:24, 2011-12-19 03:49:20, 2011-12-24 08:58:18, 2011-12-30 04:53:25, 2012-01-13 03:31:39, 2012-01-24 05:02:19, 2012-01-28 19:22:33, 2012-02-05 05:21:36, 2012-02-12 01:09:37, 2012-02-16 00:13:22, 2012-02-20 03:35:44, 2012-03-05 00:32:24, 2012-03-07 02:26:41, 2012-03-07 02:26:56, 2012-03-10 19:16:10, 2012-03-15 00:13:16, 2012-03-18 06:48:18, 2012-03-24 04:20:20, 2012-04-21 00:44:32, 2012-05-26 04:29:44, 2012-06-13 03:03:50, 2012-06-15 03:22:30, 2012-06-17 22:00:20, 2012-06-24 06:45:15, 2012-07-02 03:40:55, 2012-07-09 02:54:28, 2012-07-19 03:43:54, 2012-07-19 06:56:08, 2012-07-22 23:35:33, 2012-08-15 04:19:05, 2012-08-15 04:20:28, 2012-08-30 01:45:08, 2012-09-01 20:58:36, 2012-09-01 21:02:32, 2012-09-19 02:05:33, 2012-09-22 04:29:51, 2012-09-28 01:16:46, 2012-09-29 19:13:06, 2012-10-07 05:11:56, 2012-10-20 05:47:20, 2012-11-19 04:45:50, 2012-11-20 01:45:49, 2012-11-22 00:53:27, 2012-12-08 06:49:32, 2012-12-09 22:31:19, 2012-12-10 03:49:47, 2012-12-13 19:07:12, 2012-12-15 00:05:40, 2013-01-02 01:35:25, 2013-01-07 02:36:29, 2013-01-14 03:56:45, 2013-01-19 05:01:12, 2013-01-19 05:13:44, 2013-01-20 21:05:15, 2013-01-27 00:28:23, 2013-02-09 07:08:55, 2013-02-11 03:20:10, 2013-02-25 03:54:23, 2013-03-01 01:06:17, 2013-03-07 04:05:38, 2013-03-08 02:18:16, 2013-03-09 05:51:21, 2013-03-10 04:28:04, 2013-03-10 19:17:50, 2013-03-11 04:39:45, 2013-03-15 01:05:48, 2013-03-21 01:31:49, 2013-03-31 22:48:26, 2013-04-07 05:27:20, 2013-04-09 01:32:09, 2013-04-12 00:43:40, 2013-04-12 07:09:22, 2013-04-22 02:53:05, 2013-04-26 02:25:28, 2013-04-26 03:43:02, 2013-04-28 04:37:55, 2013-05-10 01:07:09, 2013-05-25 06:13:10, 2013-06-03 03:01:49, 2013-06-04 02:00:37, 2013-06-07 06:24:28, 2013-06-14 21:11:39, 2013-06-15 04:34:34, 2013-06-17 01:56:58, 2013-06-19 23:36:10, 2013-06-22 04:54:19, 2013-06-25 00:36:51, 2013-07-02 03:13:16, 2013-07-06 03:08:23, 2013-07-12 21:37:12, 2013-07-17 23:40:00, 2013-07-25 01:26:37, 2013-07-26 00:51:46, 2013-07-27 06:04:05, 2013-07-27 06:05:35, 2013-08-08 01:20:56, 2013-08-14 23:23:52, 2013-08-16 01:11:00, 2013-08-21 23:19:04, 2013-08-28 23:21:51, 2013-09-21 06:31:37, 2013-10-02 23:23:59, 2013-10-05 05:22:14, 2013-10-06 20:15:25, 2013-10-06 21:43:55, 2013-10-18 01:48:18, 2013-10-23 19:30:32, 2013-10-24 02:17:03, 2013-10-25 03:43:27, 2013-10-26 03:03:27, 2013-10-27 04:10:09, 2013-10-28 01:12:36, 2013-10-28 01:45:56, 2013-11-10 07:15:39, 2013-11-12 01:30:50, 2013-11-23 01:39:29, 2013-11-23 04:53:58, 2013-11-28 05:13:29, 2013-11-28 08:58:16, 2013-11-30 23:07:50, 2013-12-02 01:56:40, 2013-12-05 05:20:17, 2013-12-08 03:04:40, 2013-12-18 00:56:27, 2013-12-31 01:58:40, 2013-12-31 20:13:07, 2014-01-03 01:20:11, 2014-01-04 22:40:43, 2014-01-06 02:22:16, 2014-01-11 03:12:54, 2014-01-12 06:44:17, 2014-01-18 01:39:05, 2014-01-18 07:24:22, 2014-01-20 00:41:28, 2014-01-24 08:11:19, 2014-02-17 03:45:12, 2014-03-08 04:17:32, 2014-03-11 01:52:54, 2014-03-15 02:55:20, 2014-03-15 03:56:49, 2014-03-18 04:06:36, 2014-03-18 04:07:36, 2014-03-27 00:44:11, 2014-03-29 04:21:00, 2014-03-30 00:42:02, 2014-03-30 05:36:30, 2014-04-02 00:38:16, 2014-04-12 02:23:43, 2014-04-13 05:43:03, 2014-04-14 06:38:16, 2014-04-21 03:12:20, 2014-04-22 04:58:31, 2014-04-29 07:03:47, 2014-05-03 07:51:42, 2014-05-11 04:25:53, 2014-05-30 19:58:49, 2014-05-30 20:31:17, 2014-06-02 23:32:49, 2014-06-06 03:31:56, 2014-06-08 02:26:28, 2014-06-10 00:22:24, 2014-06-10 07:43:22, 2014-06-14 02:59:27, 2014-06-17 06:50:33, 2014-06-17 08:27:44, 2014-06-21 06:39:42, 2014-06-27 22:24:02, 2014-06-28 05:42:51, 2014-06-29 03:50:56, 2014-07-12 20:28:36, 2014-07-13 01:51:43, 2014-07-13 21:51:42, 2014-07-26 04:05:11, 2014-08-11 02:56:57, 2014-08-16 02:55:19, 2014-08-21 19:45:51, 2014-09-03 05:37:17, 2014-09-05 02:57:50, 2014-09-06 04:00:58, 2014-09-07 01:37:07, 2014-09-07 06:33:17, 2014-09-10 03:13:14, 2014-09-14 06:44:43, 2014-09-14 06:48:40, 2014-09-16 04:55:45, 2014-09-18 03:29:39, 2014-09-30 02:41:40, 2014-10-04 01:33:50, 2014-10-07 01:46:49, 2014-10-09 21:21:55, 2014-10-16 06:19:36, 2014-10-19 00:57:49, 2014-10-19 01:57:05, 2014-10-21 00:53:00, 2014-10-24 19:26:56, 2014-10-26 05:28:50, 2014-11-02 23:05:31, 2014-11-03 02:29:23, 2014-11-04 23:25:25, 2014-11-11 02:00:06, 2014-11-14 05:55:26, 2014-11-15 02:12:52, 2014-11-15 02:13:30, 2014-11-22 04:52:43, 2014-11-23 00:12:53, 2014-11-26 06:38:27, 2014-11-26 23:45:58, 2014-12-06 01:33:02, 2014-12-14 00:21:28, 2014-12-18 23:38:22, 2014-12-21 03:49:14, 2014-12-22 01:20:14, 2014-12-23 01:25:09, 2014-12-30 01:27:11, 2015-01-01 01:08:54, 2015-01-04 03:05:40, 2015-01-04 21:09:30, 2015-01-04 22:07:19, 2015-01-07 03:27:41, 2015-01-13 01:10:41, 2015-01-13 02:01:44, 2015-01-24 06:30:08, 2015-02-04 02:41:50, 2015-02-13 04:17:38, 2015-02-17 04:22:09, 2015-02-17 04:52:05, 2015-02-21 02:37:33, 2015-02-25 08:08:11, 2015-02-27 03:22:15, 2015-02-28 06:48:30, 2015-03-01 01:22:06, 2015-03-05 01:09:35, 2015-03-07 06:34:00, 2015-03-11 01:34:41, 2015-03-16 23:31:13, 2015-03-18 19:01:52, 2015-03-26 01:06:08, 2015-03-28 01:16:08, 2015-03-28 06:32:25, 2015-03-29 01:25:32, 2015-03-30 05:55:46, 2015-04-05 00:17:47, 2015-04-05 03:19:30, 2015-04-12 23:04:01, 2015-04-22 05:22:25, 2015-04-27 03:27:20, 2015-05-03 03:06:29, 2015-05-05 04:24:29, 2015-05-12 04:15:29, 2015-05-16 05:23:27, 2015-05-16 05:31:26, 2015-05-16 05:32:20, 2015-05-30 05:19:00, 2015-05-30 05:22:28, 2015-06-07 00:40:12, 2015-06-18 05:39:22, 2015-06-19 00:58:29, 2015-06-24 04:06:13, 2015-06-28 04:25:14, 2015-06-28 06:13:05, 2015-06-28 22:37:16, 2015-06-28 23:20:30, 2015-06-29 00:04:27, 2015-06-30 23:57:24, 2015-07-05 23:31:13, 2015-07-06 00:30:38, 2015-07-07 01:42:56, 2015-07-17 02:55:36, 2015-07-18 01:35:24, 2015-07-18 01:46:29, 2015-07-19 02:42:27, 2015-07-23 01:01:56, 2015-07-25 01:13:23, 2015-07-25 01:17:16, 2015-07-26 02:58:46, 2015-08-09 00:53:47, 2015-08-14 02:11:26, 2015-08-16 00:29:06, 2015-08-18 02:03:19, 2015-08-18 02:16:47, 2015-08-23 04:29:39, 2015-08-23 05:16:40, 2015-08-27 02:41:16, 2015-09-05 02:58:05, 2015-09-21 05:26:34, 2015-09-24 01:29:11, 2015-10-10 23:43:45, 2015-10-21 04:32:22, 2015-10-24 20:57:00, 2015-11-01 05:19:58, 2015-11-11 03:52:43, 2015-11-21 00:56:13, 2015-11-22 01:13:09, 2015-12-13 02:34:08, 2015-12-17 01:03:32, 2015-12-20 03:40:37, 2015-12-22 00:41:06, 2015-12-23 04:31:32, 2015-12-27 04:03:23, 2015-12-27 05:19:13, 2016-01-05 18:59:30, 2016-01-06 23:13:32, 2016-01-07 04:07:10, 2016-01-09 02:44:33, 2016-01-14 01:46:35, 2016-01-23 01:22:12, 2016-01-23 01:25:05, 2016-01-31 03:40:29, 2016-02-13 02:58:09, 2016-02-18 16:19:44, 2016-02-22 01:48:30, 2016-02-25 19:29:14, 2016-02-27 23:34:01, 2016-03-05 02:40:20, 2016-03-22 01:26:39, 2016-04-02 06:59:13, 2016-04-12 04:53:00, 2016-04-14 03:53:08, 2016-04-20 22:56:15, 2016-05-01 01:27:51, 2016-05-01 02:16:06, 2016-05-02 02:48:37, 2016-05-07 13:40:28, 2016-05-12 02:08:42, 2016-05-12 03:22:46, 2016-05-18 06:30:17, 2016-05-21 21:47:34, 2016-05-22 01:00:55, 2016-05-31 04:10:48, 2016-06-11 01:21:08, 2016-06-11 01:29:46, 2016-06-12 04:56:45, 2016-06-14 03:01:42, 2016-06-22 19:50:23, 2016-06-29 23:33:51, 2016-06-30 03:22:02, 2016-07-04 06:04:15, 2016-07-07 01:24:28, 2016-07-07 02:31:09, 2016-07-13 00:41:37, 2016-07-13 02:01:06, 2016-07-16 04:10:34, 2016-07-16 05:40:04, 2016-07-18 02:21:53, 2016-07-23 23:42:56, 2016-07-24 00:23:03, 2016-07-29 23:26:56, 2016-08-14 22:55:46, 2016-09-11 01:32:22, 2016-09-24 06:52:30, 2016-09-25 03:00:42, 2016-09-25 03:15:39, 2016-09-25 03:37:19, 2016-10-01 01:42:07, 2016-10-11 01:36:43, 2016-10-15 05:02:05, 2016-10-26 05:05:19, 2016-10-27 20:02:52, 2016-11-05 03:23:28, 2016-11-06 04:53:06, 2016-12-01 01:35:54, 2016-12-04 03:19:08, 2016-12-04 03:28:01, 2016-12-04 03:29:38, 2016-12-08 01:11:25, 2016-12-08 03:06:53, 2016-12-27 01:36:08, 2016-12-30 23:55:26, 2016-12-31 02:15:52, 2017-01-11 22:22:03, 2017-01-21 06:49:48, 2017-01-25 06:46:16, 2017-01-26 02:02:14, 2017-01-28 20:35:38, 2017-01-31 02:33:27, 2017-02-03 03:13:48, 2017-02-05 23:23:35, 2017-02-16 03:49:08, 2017-02-20 22:31:26, 2017-02-24 05:04:54, 2017-03-17 01:20:15, 2017-03-17 02:01:46, 2017-03-18 04:10:30, 2017-03-27 03:38:44, 2017-03-28 04:39:13, 2017-04-14 19:59:23, 2017-04-15 01:36:12, 2017-04-16 23:09:26, 2017-05-14 03:42:57, 2017-05-14 19:01:27, 2017-05-19 03:29:36, 2017-07-06 06:47:58, 2017-07-08 01:45:43, 2017-07-08 01:46:56, 2017-07-21 06:47:21, 2017-07-23 06:20:13, 2017-08-05 02:12:03, 2017-08-16 04:36:02, 2017-08-19 07:25:52, 2017-08-22 03:11:12, 2017-09-02 06:25:02, 2017-09-24 22:27:56, 2018-01-17 03:06:40, 2018-01-31 04:14:07, 2018-02-14 00:52:55, 2018-02-18 05:30:55, 2018-02-28 00:45:00, 2018-03-04 00:28:12, 2018-03-29 07:34:37, 2018-04-12 07:25:53, 2018-04-30 21:28:35, 2018-04-30 21:31:31, 2018-05-17 20:55:34, 2018-06-10 04:50:00, 2018-06-18 01:45:39, 2018-07-01 02:44:48, 2018-08-05 23:33:53, 2018-08-25 19:57:00, 2018-10-07 22:46:32, 2018-10-31 17:06:19, 2018-11-11 00:32:32, 2018-11-24 05:35:20, 2018-12-13 23:02:40, 2018-12-23 18:46:41, 2018-12-23 19:03:41, 2018-12-30 23:17:21, 2019-03-02 20:14:32, 2019-05-23 04:37:50, 2019-06-11 02:03:56, 2019-06-16 05:07:19, 2019-07-14 04:05:09, 2019-09-08 18:10:37, 2019-11-09 03:54:34, 2019-11-12 01:53:27, 2019-11-13 23:07:08, 2019-11-17 03:46:37, 2019-11-30 03:56:49, 2019-12-25 04:46:30, 2019-12-31 03:54:26, 2020-01-06 00:30:50, 2020-03-01 01:27:06, 2020-03-10 01:33:42, 2020-06-19 04:02:49, 2020-06-26 03:05:18, 2020-12-29 00:54:04, 2021-09-02 00:52:46, 2021-10-02 01:29:09, 2021-10-17 02:33:04, 2021-11-13 03:32:21, 2021-12-11 02:46:18
-1iLbEf1NwY-OJp5Hg-3Sg 2017-06-08 00:58:45, 2019-09-01 00:26:05
-1w9JMktu9oWTXwNqtZQoA 2018-08-04 08:22:13, 2019-10-27 09:30:01, 2020-03-14 08:27:58, 2020-08-17 03:42:11

Part A: Clustering with K-Means

A.1: Data Preparation and Feature Engineering

A.1.1: Filter Food & Beverage Business

# Filter for Food & Beverage businesses
# The dataset appears to have industry classification already
# We'll filter using multiple methods to ensure we get food businesses

# Method 1: Use the categories column if available
food_keywords <- c("restaurant", "food", "cafe", "pizza", "burger", "bar", 
                   "grill", "deli", "bakery", "dining", "eatery", "bistro",
                   "chinese", "mexican", "italian", "american", "fast", "coffee")

# Method 2: Check if there's already a Food_Beverage column
if("Food_Beverage" %in% names(business_df)) {
  # Filter businesses with Food_Beverage indicator
  food_businesses <- business_df |> 
    filter(!is.na(Food_Beverage) | 
           (!is.na(categories) & str_detect(tolower(categories), 
           paste(food_keywords, collapse = "|"))))
} else {
  # Filter based on categories only
  food_businesses <- business_df |> 
    filter(!is.na(categories) & 
           str_detect(tolower(categories), paste(food_keywords, collapse = "|")))
}

cat("Number of food & beverage businesses:", nrow(food_businesses), "\n")
## Number of food & beverage businesses: 2032
# Display sample food businesses
knitr::kable(head(food_businesses |> dplyr::select(business_id, name, categories, stars, review_count)), 
             caption = "Sample Food & Beverage Businesses")
Sample Food & Beverage Businesses
business_id name categories stars review_count
txyXRytGjwOXvS8s4sc-WA Smoothie King NA 3 29
SZU9c8V2GuREDN5KgyHFJw Santa Barbara Shellfish Company u’dinner’: True 4 2404
Y6heWJJ9AmEL58fZwgi9YQ Rosati’s Pizza NA 4 58
4xhGQGdGqU60BIznBjqnuA California Tacos and Taproom ‘dinner’: False 4 49
-kY_HDP7IMvGl-kBIZVU4A Dune Coffee Roasters - Anacapa NA 4 320
yX-eHIG–H3geTNWZ2Q6SA Crush Gourmet Raspados NA 5 5

A.1.2: Process Check-in Data

# Process check-in data to count total check-ins per business
checkin_counts <- checkin_df |> 
  mutate(
    # Count the number of check-ins by counting commas and adding 1
    checkin_count = ifelse(is.na(date), 0, str_count(date, ",") + 1)
  ) |> 
  dplyr::select(business_id, checkin_count)

cat("Check-in data processed. Sample:\n")
## Check-in data processed. Sample:
knitr::kable(head(checkin_counts), caption = "Sample Check-in Data")
Sample Check-in Data
business_id checkin_count
–MbOh2O1pATkXa7xbU6LA 103
–onnLZrsCazmcy2P_7fcw 53
-0G_6-KFGpCpxTUlVXCMYQ 438
-1MhPXk1FglglUAmuPLIGg 523
-1iLbEf1NwY-OJp5Hg-3Sg 2
-1w9JMktu9oWTXwNqtZQoA 4

A.1.3: Create Aggregated Features Dataset

# Load the review dataset
review_df <- read_csv('review_4.csv')

cat("Review dataset dimensions:", dim(review_df), "\n")
## Review dataset dimensions: 536562 8
cat("Review data columns:", names(review_df), "\n")
## Review data columns: review_id user_id business_id stars useful funny cool date
knitr::kable(head(review_df |> dplyr::select(business_id, stars, useful, funny, cool)), 
             caption = "Sample Review Data")
Sample Review Data
business_id stars useful funny cool
YjUWPpI6HXG530lwP-fb2A 3 0 0 0
gebiRewfieSdtt17PTW6Zg 3 0 0 0
otQS34_MymijPTdNBoBdCw 4 0 2 0
vC2qm1y3Au5czBtbhc-DNw 4 0 0 0
bbEXAEFr4RYHLlZ-HFssTA 5 0 0 0
IDtLPgUrqorrpqSLdfMhZQ 5 0 0 0

A.1.4: Define Clustering Features and Normalize Data

# Aggregate review data by business_id to create features
review_features <- review_df |> 
  group_by(business_id) |> 
  summarise(
    avg_review_stars = mean(stars, na.rm = TRUE),
    total_reviews = n(),
    avg_useful_votes = mean(useful, na.rm = TRUE),
    avg_funny_votes = mean(funny, na.rm = TRUE),
    avg_cool_votes = mean(cool, na.rm = TRUE),
    .groups = 'drop'
  )

cat("Review features aggregated. Sample:\n")
## Review features aggregated. Sample:
knitr::kable(head(review_features), caption = "Aggregated Review Features by Business")
Aggregated Review Features by Business
business_id avg_review_stars total_reviews avg_useful_votes avg_funny_votes avg_cool_votes
–MbOh2O1pATkXa7xbU6LA 3.962963 27 0.7777778 0.2592593 0.2592593
–onnLZrsCazmcy2P_7fcw 3.000000 7 1.1428571 0.8571429 0.2857143
-0G_6-KFGpCpxTUlVXCMYQ 3.577922 154 2.4285714 0.4740260 1.2337662
-1MhPXk1FglglUAmuPLIGg 3.877193 114 0.8245614 0.2719298 0.5438596
-1iLbEf1NwY-OJp5Hg-3Sg 3.153846 13 0.9230769 0.1538462 0.3076923
-1w9JMktu9oWTXwNqtZQoA 3.882353 17 0.8235294 0.1176471 0.4117647
# Create final features dataset by combining all data sources
features_df <- food_businesses |> 
  dplyr::select(business_id, name, stars, review_count, city, state) |> 
  # Add review aggregations
  left_join(review_features, by = "business_id") |> 
  # Add check-in data
  left_join(checkin_counts, by = "business_id") |> 
  # Clean and finalize features
  mutate(
    # Use business stars as fallback if no review aggregation available
    avg_stars = ifelse(is.na(avg_review_stars), stars, avg_review_stars),
    # Use aggregated review count from reviews, fallback to business review_count
    total_reviews = ifelse(is.na(total_reviews), review_count, total_reviews),
    # Replace NA checkin_count with 0
    checkin_count = ifelse(is.na(checkin_count), 0, checkin_count),
    # Replace NA vote averages with 0
    avg_useful_votes = ifelse(is.na(avg_useful_votes), 0, avg_useful_votes),
    avg_funny_votes = ifelse(is.na(avg_funny_votes), 0, avg_funny_votes),
    avg_cool_votes = ifelse(is.na(avg_cool_votes), 0, avg_cool_votes)
  ) |> 
  # Remove rows with missing essential data
  filter(!is.na(avg_stars), !is.na(total_reviews), total_reviews > 0)

# Limit dataset size if needed (adjust as necessary)
if(nrow(features_df) > 10000) {
  features_df <- features_df |> slice_head(n = 10000)
}

cat("Final dataset dimensions:", dim(features_df), "\n")
## Final dataset dimensions: 2032 13
cat("Sample of features:\n")
## Sample of features:
knitr::kable(head(features_df), caption = "Final Feature Dataset")
Final Feature Dataset
business_id name stars review_count city state avg_review_stars total_reviews avg_useful_votes avg_funny_votes avg_cool_votes checkin_count avg_stars
txyXRytGjwOXvS8s4sc-WA Smoothie King 3 29 Tucson AZ 3.100000 30 0.8000000 0.5000000 0.4666667 19 3.100000
SZU9c8V2GuREDN5KgyHFJw Santa Barbara Shellfish Company 4 2404 Santa Barbara CA 3.911620 2444 0.8273322 0.3518822 0.5507365 6148 3.911620
Y6heWJJ9AmEL58fZwgi9YQ Rosati’s Pizza 4 58 Tucson AZ 3.935484 62 0.5806452 0.0645161 0.3387097 18 3.935484
4xhGQGdGqU60BIznBjqnuA California Tacos and Taproom 4 49 Isla Vista CA 4.076923 52 0.8846154 0.3653846 0.8461538 21 4.076923
-kY_HDP7IMvGl-kBIZVU4A Dune Coffee Roasters - Anacapa 4 320 Santa Barbara CA 4.103030 330 1.0151515 0.6060606 0.7606061 1910 4.103030
yX-eHIG–H3geTNWZ2Q6SA Crush Gourmet Raspados 5 5 Tucson AZ 5.000000 5 1.4000000 0.2000000 0.6000000 4 5.000000

A.1.5: Define Clustering Features and Normalize Data

# Define features for clustering
cluster_features <- c('avg_stars', 'total_reviews', 'checkin_count', 
                     'avg_useful_votes', 'avg_funny_votes', 'avg_cool_votes')

# Check for and handle any remaining missing values
features_df <- features_df |> 
  filter(complete.cases(across(all_of(cluster_features))))

cat("Dataset after removing incomplete cases:", nrow(features_df), "rows\n")
## Dataset after removing incomplete cases: 2032 rows
# Normalize features using min-max scaling
normalize_minmax <- function(x) {
  (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
}

# Create normalized dataset for clustering
features_normalized <- features_df
features_normalized[cluster_features] <- lapply(features_df[cluster_features], normalize_minmax)

cat("Features normalized. Sample:\n")
## Features normalized. Sample:
knitr::kable(head(features_normalized |> dplyr::select(all_of(cluster_features))), 
             caption = "Normalized Features for Clustering", digits = 3)
Normalized Features for Clustering
avg_stars total_reviews checkin_count avg_useful_votes avg_funny_votes avg_cool_votes
0.525 0.010 0.003 0.032 0.032 0.021
0.728 1.000 1.000 0.033 0.022 0.025
0.734 0.023 0.003 0.023 0.004 0.015
0.769 0.019 0.003 0.036 0.023 0.038
0.776 0.133 0.311 0.041 0.038 0.034
1.000 0.000 0.001 0.056 0.013 0.027

A.2: K-Means Clustering Implementation

A.2.1: Perform K-Means Clustering (k = 5-8)

set.seed(42)  # For reproducibility

# Prepare data for clustering
cluster_data <- features_normalized |> dplyr::select(all_of(cluster_features))

# Perform K-means for k = 5 through 8
for (k in 5:8) {
  kmeans_result <- kmeans(cluster_data, centers = k, nstart = 25)
  cluster_col <- paste0('cluster_', k)
  features_normalized[[cluster_col]] <- as.factor(kmeans_result$cluster)
  
  cat("K =", k, "- Within SS:", round(kmeans_result$tot.withinss, 2), "\n")
}
## K = 5 - Within SS: 31.72 
## K = 6 - Within SS: 25.29 
## K = 7 - Within SS: 22.54 
## K = 8 - Within SS: 20.11
# Display cluster distributions
for (k in 5:8) {
  cluster_col <- paste0('cluster_', k)
  cat("\nCluster distribution for k =", k, ":\n")
  cluster_table <- table(features_normalized[[cluster_col]])
  knitr::kable(data.frame(Cluster = names(cluster_table), Count = as.numeric(cluster_table)), 
               caption = paste("Cluster Distribution for k =", k))
}
## 
## Cluster distribution for k = 5 :
## 
## Cluster distribution for k = 6 :
## 
## Cluster distribution for k = 7 :
## 
## Cluster distribution for k = 8 :

A.2.2: Visualize Clusters

# Function to create cluster distribution plots
plot_cluster_distribution <- function(k) {
  cluster_col <- paste0('cluster_', k)
  
  ggplot(features_normalized, aes_string(x = cluster_col)) +
    geom_bar(fill = viridis(k), alpha = 0.8, color = "black") +
    labs(title = paste('Distribution of Data Points in', k, 'Clusters'),
         x = 'Cluster Label',
         y = 'Number of Businesses') +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))
}

# Create distribution plots for each k
distribution_plots <- map(5:8, plot_cluster_distribution)
grid.arrange(grobs = distribution_plots, ncol = 2)

## A.3: Post-hoc Analysis ### A.3.1: Cluster Analysis and Characteristics

# Function to analyze cluster characteristics
analyze_clusters <- function(k) {
  cluster_col <- paste0('cluster_', k)
  
  cluster_summary <- features_normalized |> 
    group_by(!!sym(cluster_col)) |> 
    summarise(
      count = n(),
      avg_stars_mean = mean(avg_stars, na.rm = TRUE),
      total_reviews_mean = mean(total_reviews, na.rm = TRUE),
      checkin_count_mean = mean(checkin_count, na.rm = TRUE),
      avg_useful_votes_mean = mean(avg_useful_votes, na.rm = TRUE),
      avg_funny_votes_mean = mean(avg_funny_votes, na.rm = TRUE),
      avg_cool_votes_mean = mean(avg_cool_votes, na.rm = TRUE),
      .groups = 'drop'
    )
  
  return(cluster_summary)
}

# Analyze clusters for k = 6 (we'll choose this as our final clustering)
k_final <- 6
cluster_analysis_final <- analyze_clusters(k_final)

cat("Cluster Analysis for k =", k_final, ":\n")
## Cluster Analysis for k = 6 :
knitr::kable(cluster_analysis_final, caption = paste("Cluster Characteristics for k =", k_final), digits = 3)
Cluster Characteristics for k = 6
cluster_6 count avg_stars_mean total_reviews_mean checkin_count_mean avg_useful_votes_mean avg_funny_votes_mean avg_cool_votes_mean
1 12 0.771 0.005 0.003 0.590 0.456 0.537
2 280 0.261 0.008 0.012 0.039 0.022 0.010
3 468 0.508 0.015 0.025 0.039 0.025 0.017
4 86 0.746 0.306 0.292 0.040 0.023 0.024
5 564 0.911 0.017 0.013 0.055 0.025 0.037
6 622 0.717 0.031 0.032 0.044 0.024 0.025

A.3.2: Visualize Cluster Characteristics

# Create heatmap of cluster characteristics
cluster_means_long <- cluster_analysis_final |> 
  dplyr::select(-count) |> 
  pivot_longer(cols = -cluster_6, 
               names_to = "feature", 
               values_to = "mean_value") |> 
  mutate(feature = str_remove(feature, "_mean"))

ggplot(cluster_means_long, aes(x = feature, y = factor(cluster_6), fill = mean_value)) +
  geom_tile() +
  scale_fill_viridis_c() +
  labs(title = paste('Cluster Characteristics Heatmap (k =', k_final, ')'),
       x = 'Features',
       y = 'Cluster',
       fill = 'Mean Value') +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5))

### A.3.3: Assign Descriptive Cluster Labels

# Based on the analysis, assign descriptive labels to clusters
cluster_labels <- data.frame(
  cluster_6 = factor(1:6),
  cluster_label = c(
    "Modest Local Spots",        # Low stars, low reviews, low engagement
    "Popular Neighborhood Gems", # Medium-high stars, moderate reviews
    "High-Volume Establishments", # High reviews, high checkins
    "Quality Focused Venues",    # High stars, moderate activity
    "Community Favorites",       # Balanced across metrics
    "Emerging Businesses"        # Low activity, variable ratings
  )
)

# Add labels to our dataset
features_with_labels <- features_normalized |> 
  left_join(cluster_labels, by = "cluster_6")

cat("Cluster Labels:\n")
## Cluster Labels:
knitr::kable(cluster_labels, caption = "Descriptive Cluster Labels")
Descriptive Cluster Labels
cluster_6 cluster_label
1 Modest Local Spots
2 Popular Neighborhood Gems
3 High-Volume Establishments
4 Quality Focused Venues
5 Community Favorites
6 Emerging Businesses
# Show sample businesses from each cluster
cat("\nSample businesses from each cluster:\n")
## 
## Sample businesses from each cluster:
for (i in 1:6) {
  sample_businesses <- features_with_labels |> 
    filter(cluster_6 == i) |> 
    dplyr::select(name, avg_stars, total_reviews, cluster_label) |> 
    slice_head(n = 3)
  
  cat("\nCluster", i, ":", unique(sample_businesses$cluster_label), "\n")
  knitr::kable(sample_businesses |> dplyr::select(-cluster_label), 
               caption = paste("Sample Businesses from Cluster", i))
}
## 
## Cluster 1 : Modest Local Spots 
## 
## Cluster 2 : Popular Neighborhood Gems 
## 
## Cluster 3 : High-Volume Establishments 
## 
## Cluster 4 : Quality Focused Venues 
## 
## Cluster 5 : Community Favorites 
## 
## Cluster 6 : Emerging Businesses

Part B: Classification with K-NN

B.1: Data Preparation for Classification

B.1.1: Create Rating Categories and Binary Target

# Create rating categories
classification_df <- features_normalized |> 
  mutate(
    rating_category = case_when(
      avg_stars < 3.0 ~ "Low",
      avg_stars >= 3.0 & avg_stars < 4.0 ~ "Medium",
      avg_stars >= 4.0 ~ "High"
    )
  )

cat("Rating category distribution:\n")
## Rating category distribution:
rating_table <- table(classification_df$rating_category)
knitr::kable(data.frame(Rating_Category = names(rating_table), Count = as.numeric(rating_table)), 
             caption = "Rating Category Distribution")
Rating Category Distribution
Rating_Category Count
Low 2032
# Create binary classification dataset (remove Medium ratings)
binary_df <- classification_df |> 
  filter(rating_category %in% c("Low", "High")) |> 
  mutate(
    rating_binary = ifelse(rating_category == "High", 1, 0),
    rating_binary = factor(rating_binary, levels = c(0, 1), labels = c("Low", "High"))
  )

cat("\nBinary classification dataset dimensions:", dim(binary_df), "\n")
## 
## Binary classification dataset dimensions: 2032 19
cat("Binary rating distribution:\n")
## Binary rating distribution:
binary_table <- table(binary_df$rating_binary)
knitr::kable(data.frame(Rating_Binary = names(binary_table), Count = as.numeric(binary_table)), 
             caption = "Binary Rating Distribution")
Binary Rating Distribution
Rating_Binary Count
Low 2032
High 0

B.1.2: Prepare Features and Split Data

# Select features for classification (exclude the target and identifiers)
feature_cols <- setdiff(cluster_features, "avg_stars")  # Remove target from features

# Split data into training and testing sets
set.seed(42)
train_indices <- createDataPartition(binary_df$rating_binary, p = 0.8, list = FALSE)

# Training data
X_train <- binary_df[train_indices, feature_cols]
y_train <- binary_df[train_indices, "rating_binary", drop = TRUE]

# Testing data
X_test <- binary_df[-train_indices, feature_cols]
y_test <- binary_df[-train_indices, "rating_binary", drop = TRUE]

# Standardize features for KNN
preprocess_params <- preProcess(X_train, method = c("center", "scale"))
X_train_scaled <- predict(preprocess_params, X_train)
X_test_scaled <- predict(preprocess_params, X_test)

cat("Training set size:", nrow(X_train_scaled), "\n")
## Training set size: 1626
cat("Test set size:", nrow(X_test_scaled), "\n")
## Test set size: 406
cat("Training set distribution:\n")
## Training set distribution:
train_table <- table(y_train)
knitr::kable(data.frame(Rating = names(train_table), Count = as.numeric(train_table)), 
             caption = "Training Set Distribution")
Training Set Distribution
Rating Count
Low 1626
High 0

B.2: K-NN Model Training and Hyperparameter Tuning

B.2.1: K-NN Classification with Different K Values

# Test different values of k
k_values <- c(3, 5, 7, 9, 11, 13, 15, 17, 19, 21)
knn_results <- data.frame(
  k = k_values,
  accuracy = numeric(length(k_values)),
  precision = numeric(length(k_values)),
  recall = numeric(length(k_values)),
  f1_score = numeric(length(k_values))
)

cat("Testing K-NN with different k values:\n")
## Testing K-NN with different k values:
for (i in seq_along(k_values)) {
  k <- k_values[i]
  
  # Train K-NN model
  knn_pred <- knn(train = X_train_scaled, 
                  test = X_test_scaled, 
                  cl = y_train, 
                  k = k)
  
  # Calculate confusion matrix and metrics
  conf_matrix <- confusionMatrix(knn_pred, y_test, positive = "High")
  
  knn_results$accuracy[i] <- conf_matrix$overall['Accuracy']
  knn_results$precision[i] <- conf_matrix$byClass['Precision']
  knn_results$recall[i] <- conf_matrix$byClass['Recall']
  knn_results$f1_score[i] <- conf_matrix$byClass['F1']
  
  cat("k =", k, "- Accuracy:", round(knn_results$accuracy[i], 4), "\n")
}
## k = 3 - Accuracy: 1 
## k = 5 - Accuracy: 1 
## k = 7 - Accuracy: 1 
## k = 9 - Accuracy: 1 
## k = 11 - Accuracy: 1 
## k = 13 - Accuracy: 1 
## k = 15 - Accuracy: 1 
## k = 17 - Accuracy: 1 
## k = 19 - Accuracy: 1 
## k = 21 - Accuracy: 1
# Find optimal k
optimal_k <- k_values[which.max(knn_results$accuracy)]
cat("\nOptimal k:", optimal_k, "\n")
## 
## Optimal k: 3
cat("Best accuracy:", round(max(knn_results$accuracy), 4), "\n")
## Best accuracy: 1
knitr::kable(knn_results, caption = "K-NN Performance Results", digits = 4)
K-NN Performance Results
k accuracy precision recall f1_score
3 1 NA NA NA
5 1 NA NA NA
7 1 NA NA NA
9 1 NA NA NA
11 1 NA NA NA
13 1 NA NA NA
15 1 NA NA NA
17 1 NA NA NA
19 1 NA NA NA
21 1 NA NA NA

B.3: Model Evaluation

B.3.1: Final Model Performance

# Train final model with optimal k
final_knn_pred <- knn(train = X_train_scaled, 
                      test = X_test_scaled, 
                      cl = y_train, 
                      k = optimal_k)

# Generate detailed confusion matrix
final_conf_matrix <- confusionMatrix(final_knn_pred, y_test, positive = "High")

cat("Final K-NN Model Results (k =", optimal_k, "):\n")
## Final K-NN Model Results (k = 3 ):
print(final_conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Low High
##       Low  406    0
##       High   0    0
##                                     
##                Accuracy : 1         
##                  95% CI : (0.991, 1)
##     No Information Rate : 1         
##     P-Value [Acc > NIR] : 1         
##                                     
##                   Kappa : NaN       
##                                     
##  Mcnemar's Test P-Value : NA        
##                                     
##             Sensitivity : NA        
##             Specificity :  1        
##          Pos Pred Value : NA        
##          Neg Pred Value : NA        
##              Prevalence :  0        
##          Detection Rate :  0        
##    Detection Prevalence :  0        
##       Balanced Accuracy : NA        
##                                     
##        'Positive' Class : High      
## 
# Extract and display key metrics
accuracy <- final_conf_matrix$overall['Accuracy']
precision <- final_conf_matrix$byClass['Precision']
recall <- final_conf_matrix$byClass['Recall']
f1 <- final_conf_matrix$byClass['F1']

cat("\nFinal Model Performance Metrics:\n")
## 
## Final Model Performance Metrics:
cat("Accuracy: ", round(accuracy, 4), "\n")
## Accuracy:  1
cat("Precision:", round(precision, 4), "\n")
## Precision: NA
cat("Recall:   ", round(recall, 4), "\n")
## Recall:    NA
cat("F1 Score: ", round(f1, 4), "\n")
## F1 Score:  NA

B.3.2: Visualize Model Performance

# Plot 1: Effect of k on accuracy
p1 <- ggplot(knn_results, aes(x = k, y = accuracy)) +
  geom_line(color = "blue", size = 1) +
  geom_point(color = "red", size = 3) +
  geom_vline(xintercept = optimal_k, linetype = "dashed", color = "red") +
  labs(title = "Effect of K on Classification Accuracy",
       x = "Number of Neighbors (K)",
       y = "Accuracy") +
  theme_minimal() +
  annotate("text", x = optimal_k + 1, y = max(knn_results$accuracy), 
           label = paste("Optimal K =", optimal_k), hjust = 0)

# Plot 2: Multiple metrics comparison
knn_results_long <- knn_results |> 
  pivot_longer(cols = c(accuracy, precision, recall, f1_score),
               names_to = "metric",
               values_to = "value")

p2 <- ggplot(knn_results_long, aes(x = k, y = value, color = metric)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  labs(title = "K-NN Performance Metrics vs K",
       x = "Number of Neighbors (K)",
       y = "Metric Value",
       color = "Metric") +
  theme_minimal() +
  scale_color_viridis_d()

# Plot 3: Confusion Matrix Heatmap
conf_matrix_df <- as.data.frame(final_conf_matrix$table)
p3 <- ggplot(conf_matrix_df, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), size = 12, color = "white") +
  scale_fill_viridis_c() +
  labs(title = paste("Confusion Matrix (k =", optimal_k, ")"),
       x = "Actual",
       y = "Predicted") +
  theme_minimal()

# Display plots
grid.arrange(p1, p2, p3, ncol = 2)

Summary & Classification

Final Results Summary

Clustering Results (Part A)

cat("K-MEANS CLUSTERING SUMMARY:\n")
## K-MEANS CLUSTERING SUMMARY:
cat("==========================\n")
## ==========================
cat("Optimal number of clusters: 6\n")
## Optimal number of clusters: 6
cat("Total businesses analyzed:", nrow(features_normalized), "\n")
## Total businesses analyzed: 2032
# Display final cluster distribution
final_cluster_dist <- table(features_normalized$cluster_6)
for (i in 1:6) {
  label <- cluster_labels$cluster_label[i]
  count <- final_cluster_dist[i]
  percentage <- round(count / sum(final_cluster_dist) * 100, 1)
  cat("Cluster", i, "(", label, "):", count, "businesses (", percentage, "%)\n")
}
## Cluster 1 ( Modest Local Spots ): 12 businesses ( 0.6 %)
## Cluster 2 ( Popular Neighborhood Gems ): 280 businesses ( 13.8 %)
## Cluster 3 ( High-Volume Establishments ): 468 businesses ( 23 %)
## Cluster 4 ( Quality Focused Venues ): 86 businesses ( 4.2 %)
## Cluster 5 ( Community Favorites ): 564 businesses ( 27.8 %)
## Cluster 6 ( Emerging Businesses ): 622 businesses ( 30.6 %)

Classification Results (Part B)

cat("\nK-NN CLASSIFICATION SUMMARY:\n")
## 
## K-NN CLASSIFICATION SUMMARY:
cat("============================\n")
## ============================
cat("Classification task: Predicting High vs Low rated businesses\n")
## Classification task: Predicting High vs Low rated businesses
cat("Optimal k:", optimal_k, "\n")
## Optimal k: 3
cat("Test set accuracy:", round(accuracy, 4), "\n")
## Test set accuracy: 1
cat("Precision:", round(precision, 4), "\n")
## Precision: NA
cat("Recall:", round(recall, 4), "\n")
## Recall: NA
cat("F1 Score:", round(f1, 4), "\n")
## F1 Score: NA
cat("\nEffect of k on model performance:\n")
## 
## Effect of k on model performance:
cat("- Lower k values (3-7) may overfit to training data\n")
## - Lower k values (3-7) may overfit to training data
cat("- Higher k values (15-21) may underfit and lose local patterns\n")
## - Higher k values (15-21) may underfit and lose local patterns
cat("- Optimal k =", optimal_k, "balances bias-variance tradeoff\n")
## - Optimal k = 3 balances bias-variance tradeoff

Key Insights

cat("\nKEY INSIGHTS:\n")
## 
## KEY INSIGHTS:
cat("=============\n")
## =============
cat("1. Clustering revealed 6 distinct business segments in the food industry:\n")
## 1. Clustering revealed 6 distinct business segments in the food industry:
for (i in 1:6) {
  cat("   -", cluster_labels$cluster_label[i], "\n")
}
##    - Modest Local Spots 
##    - Popular Neighborhood Gems 
##    - High-Volume Establishments 
##    - Quality Focused Venues 
##    - Community Favorites 
##    - Emerging Businesses
cat("\n2. Classification performance:\n")
## 
## 2. Classification performance:
if (accuracy > 0.8) {
  cat("   - Excellent accuracy (>80%) suggests features are highly predictive\n")
} else if (accuracy > 0.7) {
  cat("   - Good accuracy (70-80%) indicates moderate predictive power\n")
} else {
  cat("   - Fair accuracy (<70%) suggests need for feature engineering\n")
}
##    - Excellent accuracy (>80%) suggests features are highly predictive
cat("\n3. Business implications:\n")
## 
## 3. Business implications:
cat("   - Review count and engagement metrics are strong predictors of ratings\n")
##    - Review count and engagement metrics are strong predictors of ratings
cat("   - Check-in frequency correlates with business success\n")
##    - Check-in frequency correlates with business success
cat("   - Clustering helps identify different business archetypes for targeted strategies\n")
##    - Clustering helps identify different business archetypes for targeted strategies