# Load the datasets
business_df <- read_csv('business_4.csv')
checkin_df <- read_csv('checkin_4.csv')
# Display basic information about the datasets
cat("Business dataset dimensions:", dim(business_df), "\n")
## Business dataset dimensions: 5100 71
cat("Checkin dataset dimensions:", dim(checkin_df), "\n")
## Checkin dataset dimensions: 4962 2
# Display first few rows
knitr::kable(head(business_df), caption = "Sample Business Data")
| business_id | name | address | city | state | postal_code | latitude | longitude | stars | review_count | is_open | attributes.ByAppointmentOnly | attributes.BusinessAcceptsCreditCards | attributes.BikeParking | attributes.RestaurantsPriceRange2 | attributes.CoatCheck | attributes.RestaurantsTakeOut | attributes.RestaurantsDelivery | attributes.Caters | attributes.WiFi | attributes.BusinessParking | attributes.WheelchairAccessible | attributes.HappyHour | attributes.OutdoorSeating | attributes.HasTV | attributes.RestaurantsReservations | attributes.DogsAllowed | attributes.Alcohol | attributes.GoodForKids | attributes.RestaurantsAttire | attributes.Ambience | attributes.RestaurantsTableService | attributes.RestaurantsGoodForGroups | attributes.DriveThru | attributes.NoiseLevel | attributes.GoodForMeal | attributes.BusinessAcceptsBitcoin | attributes.Smoking | attributes.Music | attributes.GoodForDancing | attributes.AcceptsInsurance | attributes.BestNights | attributes.BYOB | attributes.Corkage | attributes.BYOBCorkage | attributes.HairSpecializesIn | attributes.Open24Hours | attributes.RestaurantsCounterService | attributes.AgesAllowed | attributes.DietaryRestrictions | categories | hours.Monday | hours.Tuesday | hours.Wednesday | hours.Thursday | hours.Friday | hours.Saturday | hours.Sunday | Food_Beverage | Health_Fitness | Personal_Care | Automotive | Retail | Professional_Services | Entertainment | Travel_Hospitality | Education | Home_Services | Healthcare | Veterinary_Services | Sum_of_Industries |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| wghnIlMb_i5U46HMBGx9ig | China Dragon Restaurant | 1625 W Valencia Rd, Ste 101-103 | Tucson | AZ | 85746 | 32.13230 | -111.0000 | 3.0 | 23 | 0 | NA | TRUE | NA | 2 | NA | True | False | False | ‘no’ | {‘garage’: False | ‘street’: False | ‘validated’: False | ‘lot’: False | ‘valet’: False} | NA | NA | False | FALSE | True | NA | ‘beer_and_wine’ | True | u’casual’ | {‘romantic’: False | ‘intimate’: False | ‘classy’: False | ‘hipster’: False | ‘divey’: False | ‘touristy’: False | ‘trendy’: False | ‘upscale’: False | ‘casual’: True} | NA | True | NA | u’quiet’ | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Restaurants, Chinese | 11:0-21:0 | 11:0-21:0 | 11:0-21:0 | 11:0-21:0 | 11:0-21:0 | 11:0-21:0 | 11:0-21:0 | 1,0,0,0,0,0,0,0,0,0,0,0,1 |
| txyXRytGjwOXvS8s4sc-WA | Smoothie King | 1070 E Tucson Marketplace Blvd | Tucson | AZ | 85713 | 32.18679 | -110.9548 | 3.0 | 29 | 1 | NA | TRUE | True | 2 | NA | True | True | NA | u’free’ | {u’valet’: False | u’garage’: False | u’street’: None | u’lot’: True | u’validated’: False} | FALSE | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Vitamins & Supplements, Ice Cream & Frozen Yogurt, Food, Juice Bars & Smoothies, Shopping | 0:0-0:0 | 7:0-21:0 | 7:0-19:0 | 7:0-19:0 | 7:0-19:0 | 9:0-20:0 | 11:0-18:0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0,0,0,0,2 |
| x9K0RfZaT_zlw6DklBDzjw | Gavi Italian Restaurant | 7401 N La Cholla Blvd, Ste 146 | Tucson | AZ | 85707 | 32.22167 | -110.9258 | 3.5 | 9 | 0 | NA | TRUE | NA | 2 | NA | True | False | NA | NA | {‘garage’: False | ‘street’: False | ‘validated’: False | ‘lot’: True | ‘valet’: False} | NA | NA | True | NA | False | NA | u’full_bar’ | True | u’casual’ | NA | NA | True | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Italian, Restaurants | NA | NA | NA | NA | NA | NA | NA | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,0,0,0,1 |
| IDtLPgUrqorrpqSLdfMhZQ | Helena Avenue Bakery | 131 Anacapa St, Ste C | Santa Barbara | CA | 93101 | 34.41444 | -119.6907 | 4.0 | 389 | 1 | NA | TRUE | True | 2 | NA | True | None | True | u’no’ | {‘garage’: False | ‘street’: True | ‘validated’: False | ‘lot’: False | ‘valet’: False} | TRUE | FALSE | True | FALSE | False | True | u’none’ | True | ‘casual’ | {‘touristy’: False | ‘hipster’: True | ‘romantic’: False | ‘divey’: False | ‘intimate’: False | ‘trendy’: True | ‘upscale’: False | ‘classy’: False | ‘casual’: True} | False | True | NA | u’average’ | {‘dessert’: False | ‘latenight’: False | ‘lunch’: True | ‘dinner’: False | ‘brunch’: True | ‘breakfast’: True} | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Food, Restaurants, Salad, Coffee & Tea, Breakfast & Brunch, Sandwiches, Bakeries | 0:0-0:0 | 8:0-14:0 | 8:0-14:0,8:0-14:0,8:0-14:0,8:0-14:0,8:0-14:0,1,0,0,0,0,0,0,0,0,0,0,0,1 |
| anLQj9AM8vjbcLSIE0iUgg | Papa Murphy’s | 7250 North La Cholla, Suite 186 | Tucson | AZ | 85741 | 32.33857 | -111.0108 | 4.0 | 22 | 1 | NA | TRUE | False | 1 | NA | True | True | NA | NA | {‘garage’: False | ‘street’: False | ‘validated’: False | ‘lot’: False | ‘valet’: False} | NA | NA | False | TRUE | False | NA | u’none’ | True | u’casual’ | NA | NA | True | True | u’quiet’ | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Pizza, Restaurants | NA | NA | NA | NA | NA | NA | NA | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,0,0,0,1 |
| 9C2rpb56aQvW0ViZHK9sPw | Home Plate Sports Pub | 4880 E 22nd St | Tucson | AZ | 85711 | 32.20631 | -110.8878 | 3.5 | 56 | 1 | NA | TRUE | True | 1 | NA | NA | False | False | u’no’ | {u’valet’: False | u’garage’: False | u’street’: None | u’lot’: None | u’validated’: None} | NA | TRUE | True | TRUE | False | NA | u’full_bar’ | True | u’casual’ | {u’divey’: None | u’hipster’: None | u’casual’: True | u’touristy’: None | u’trendy’: None | u’intimate’: False | u’romantic’: None | u’classy’: False | u’upscale’: None} | NA | True | NA | u’average’ | NA | NA | NA | {u’dj’: None | u’live’: False | u’jukebox’: None | u’video’: False | u’background_music’: False | u’karaoke’: None | u’no_music’: False} | NA | NA | {u’monday’: True | u’tuesday’: False | u’wednesday’: False | u’thursday’: False | u’friday’: False | u’saturday’: False | u’sunday’: False} | NA | NA | NA | NA | NA,NA,NA,NA,“Bars, Restaurants, Sports Bars, Nightlife”,10:0-2:0,10:0-2:0,10:0-2:0,10:0-2:0,10:0-2:0,10:0-2:0,10:0-2:0,1,0,0,0,0,0,0,0,0,0,0,0,1 |
knitr::kable(head(checkin_df), caption = "Sample Check-In Data")
| business_id | date |
|---|---|
| –MbOh2O1pATkXa7xbU6LA | 2013-04-21 01:52:06, 2013-05-02 04:02:07, 2013-05-04 04:14:31, 2013-05-18 03:39:17, 2013-05-20 03:04:59, 2013-05-22 23:52:49, 2013-05-27 01:38:52, 2013-06-02 04:16:38, 2013-06-07 02:30:10, 2013-07-27 23:08:27, 2013-08-01 03:53:02, 2013-08-22 03:21:29, 2013-09-13 02:29:36, 2013-09-18 02:53:23, 2013-09-23 00:00:06, 2013-10-03 03:22:41, 2013-10-09 04:31:13, 2013-10-12 04:03:40, 2013-10-29 03:45:15, 2013-11-07 01:11:43, 2013-11-08 22:01:11, 2013-11-08 22:01:25, 2013-11-10 02:40:38, 2013-11-10 22:07:07, 2013-11-10 22:07:11, 2013-11-13 00:48:37, 2013-11-14 04:01:15, 2013-11-16 05:07:42, 2013-11-16 05:07:57, 2013-11-18 03:39:25, 2013-11-19 02:10:10, 2013-11-22 01:34:12, 2013-11-23 02:28:53, 2013-11-30 22:51:02, 2013-11-30 22:51:19, 2013-12-03 03:59:23, 2013-12-10 03:19:19, 2013-12-10 03:19:48, 2013-12-12 02:26:02, 2013-12-12 03:31:18, 2014-01-03 03:54:51, 2014-01-08 03:17:42, 2014-01-08 03:17:54, 2014-01-19 03:09:20, 2014-01-19 03:09:28, 2014-01-22 02:46:34, 2014-01-27 01:46:01, 2014-02-05 03:53:33, 2014-02-08 02:18:50, 2014-02-08 21:54:13, 2014-02-08 21:55:11, 2014-02-12 20:59:50, 2014-02-20 01:37:19, 2014-02-26 20:48:45, 2014-03-10 02:14:11, 2014-03-17 21:24:32, 2014-03-29 00:03:39, 2014-05-05 00:20:57, 2014-05-14 23:07:09, 2014-05-22 22:03:29, 2014-05-26 03:25:20, 2014-06-18 20:26:09, 2014-06-22 21:36:07, 2014-06-29 01:02:02, 2014-07-03 03:07:11, 2014-07-05 00:11:12, 2014-07-13 03:03:03, 2014-07-19 04:52:07, 2014-07-22 02:54:57, 2014-07-26 21:26:40, 2014-08-02 02:28:53, 2014-08-02 21:54:00, 2014-08-10 04:24:02, 2014-08-11 02:54:25, 2014-08-20 03:21:49, 2014-09-14 02:52:40, 2014-09-21 23:08:59, 2014-10-02 02:29:55, 2014-10-11 03:47:10, 2014-11-08 04:34:23, 2014-12-08 03:46:50, 2015-01-25 02:43:15, 2015-03-21 03:18:11, 2015-04-08 22:44:06, 2015-05-19 02:26:32, 2015-06-30 03:46:04, 2015-08-28 01:59:45, 2015-09-23 03:53:27, 2015-10-23 22:42:54, 2015-12-19 04:03:31, 2016-01-07 00:52:26, 2016-06-18 19:57:36, 2016-07-06 21:05:09, 2016-07-10 16:47:57, 2016-07-16 22:43:19, 2016-11-11 01:33:12, 2016-11-12 04:35:42, 2017-03-18 21:07:56, 2017-04-19 00:32:31, 2017-04-23 23:33:15, 2017-05-22 02:52:20, 2017-11-12 23:14:20, 2017-12-02 19:57:32 |
| –onnLZrsCazmcy2P_7fcw | 2010-09-26 23:38:02, 2011-02-01 21:44:51, 2011-02-27 01:18:19, 2011-04-06 02:40:34, 2011-04-15 02:11:59, 2011-05-08 18:54:05, 2011-05-09 21:10:39, 2011-07-18 02:44:28, 2011-07-31 01:50:07, 2011-08-02 00:58:19, 2011-08-02 01:11:13, 2011-08-29 06:04:47, 2011-08-30 02:40:24, 2011-08-31 04:32:44, 2011-09-01 03:34:10, 2011-09-01 15:34:57, 2011-09-05 23:10:20, 2011-10-21 15:59:44, 2011-11-22 17:49:40, 2011-11-23 02:31:04, 2011-12-13 00:45:14, 2011-12-18 06:28:20, 2011-12-18 19:36:04, 2011-12-24 02:41:40, 2011-12-24 03:02:10, 2012-02-06 21:09:24, 2012-02-16 02:35:49, 2012-02-22 15:42:28, 2012-03-18 06:20:53, 2012-04-25 03:00:06, 2012-05-14 00:03:49, 2012-05-25 07:10:02, 2012-06-10 02:46:59, 2012-06-10 02:48:15, 2012-06-21 06:58:33, 2012-07-10 20:35:56, 2012-07-15 01:44:04, 2012-08-05 18:49:43, 2012-08-05 18:51:20, 2012-08-05 18:51:46, 2012-08-05 18:52:04, 2012-08-28 05:47:14, 2012-09-18 02:05:14, 2012-10-02 15:50:14, 2012-10-29 01:59:58, 2012-11-04 03:03:55, 2012-11-19 21:01:04, 2012-12-07 04:54:35, 2013-01-11 02:29:47, 2013-06-15 01:37:39, 2013-06-15 15:33:32, 2013-06-16 08:38:16, 2013-07-07 17:52:48 |
| -0G_6-KFGpCpxTUlVXCMYQ | 2013-12-21 22:08:07, 2013-12-22 03:16:35, 2013-12-22 04:51:08, 2013-12-22 20:27:51, 2013-12-22 21:06:50, 2013-12-22 22:07:11, 2013-12-23 19:53:31, 2013-12-23 20:39:52, 2013-12-23 20:44:59, 2013-12-24 00:52:04, 2013-12-24 18:00:12, 2013-12-27 01:48:04, 2013-12-27 20:18:31, 2013-12-27 21:41:13, 2013-12-28 23:15:11, 2013-12-29 00:53:49, 2013-12-29 02:07:35, 2013-12-29 02:17:57, 2013-12-29 02:20:04, 2013-12-30 03:08:19, 2013-12-30 03:32:53, 2014-01-01 02:30:00, 2014-01-01 03:17:50, 2014-01-01 04:52:42, 2014-01-03 01:19:36, 2014-01-03 02:21:54, 2014-01-03 19:25:27, 2014-01-03 19:48:19, 2014-01-04 04:37:02, 2014-01-04 21:30:29, 2014-01-05 01:20:24, 2014-01-06 00:57:48, 2014-01-06 02:11:46, 2014-01-06 19:42:09, 2014-01-06 20:26:33, 2014-01-07 02:55:19, 2014-01-07 21:26:07, 2014-01-08 01:30:15, 2014-01-08 20:31:58, 2014-01-09 19:07:06, 2014-01-10 00:25:18, 2014-01-10 04:50:25, 2014-01-10 18:57:11, 2014-01-11 03:52:00, 2014-01-11 03:52:03, 2014-01-11 20:54:36, 2014-01-11 21:30:26, 2014-01-11 21:44:58, 2014-01-11 21:52:38, 2014-01-12 03:48:31, 2014-01-15 01:10:53, 2014-01-16 19:39:05, 2014-01-17 03:01:57, 2014-01-17 03:40:50, 2014-01-17 04:22:20, 2014-01-18 20:05:28, 2014-01-19 02:08:35, 2014-01-19 23:26:26, 2014-01-21 00:21:21, 2014-01-21 23:11:33, 2014-01-22 00:50:00, 2014-01-23 02:25:39, 2014-01-23 19:26:14, 2014-01-25 00:54:36, 2014-01-25 01:01:50, 2014-01-25 01:02:28, 2014-01-25 01:08:13, 2014-01-25 01:11:39, 2014-01-26 01:08:44, 2014-01-26 01:08:50, 2014-01-27 01:54:34, 2014-01-28 02:06:58, 2014-01-28 04:05:15, 2014-01-28 19:47:58, 2014-01-28 22:23:55, 2014-01-29 19:06:34, 2014-02-01 02:26:55, 2014-02-01 03:43:45, 2014-02-01 03:44:42, 2014-02-01 19:16:53, 2014-02-02 04:42:16, 2014-02-02 04:46:00, 2014-02-04 23:22:28, 2014-02-05 00:56:16, 2014-02-05 01:05:15, 2014-02-05 01:06:22, 2014-02-05 01:30:46, 2014-02-05 01:55:41, 2014-02-05 02:19:52, 2014-02-05 03:26:28, 2014-02-05 03:31:31, 2014-02-05 03:48:45, 2014-02-05 19:23:02, 2014-02-06 20:30:59, 2014-02-07 19:34:51, 2014-02-08 01:23:34, 2014-02-08 19:52:44, 2014-02-09 00:30:58, 2014-02-09 03:58:09, 2014-02-09 06:03:14, 2014-02-10 01:41:03, 2014-02-12 03:47:14, 2014-02-13 01:35:18, 2014-02-16 00:15:52, 2014-02-16 00:52:37, 2014-02-16 01:45:29, 2014-02-17 02:41:22, 2014-02-17 04:31:43, 2014-02-21 22:43:42, 2014-02-22 00:29:58, 2014-02-22 01:53:11, 2014-02-22 02:29:38, 2014-02-23 01:36:22, 2014-02-23 01:49:15, 2014-02-23 02:33:24, 2014-02-23 03:30:39, 2014-02-23 04:11:37, 2014-02-27 01:18:46, 2014-02-27 18:54:33, 2014-02-27 19:25:05, 2014-02-27 23:29:25, 2014-02-28 03:33:40, 2014-02-28 04:32:30, 2014-02-28 19:43:58, 2014-02-28 22:37:26, 2014-03-01 00:02:41, 2014-03-01 08:28:44, 2014-03-01 20:41:15, 2014-03-02 01:45:13, 2014-03-02 02:28:36, 2014-03-02 23:09:08, 2014-03-04 17:26:25, 2014-03-06 06:32:33, 2014-03-07 01:58:57, 2014-03-07 02:21:38, 2014-03-07 09:20:36, 2014-03-08 00:58:05, 2014-03-08 02:01:52, 2014-03-08 02:05:10, 2014-03-09 00:35:10, 2014-03-09 04:05:52, 2014-03-09 04:21:48, 2014-03-10 00:11:14, 2014-03-14 02:37:25, 2014-03-14 04:18:50, 2014-03-14 05:37:32, 2014-03-14 05:43:51, 2014-03-14 18:54:25, 2014-03-15 00:16:51, 2014-03-18 02:38:15, 2014-03-18 03:01:37, 2014-03-19 02:53:54, 2014-03-19 21:46:31, 2014-03-20 00:14:44, 2014-03-20 02:58:53, 2014-03-22 04:39:26, 2014-03-23 03:50:47, 2014-03-23 19:37:56, 2014-03-24 02:31:35, 2014-03-25 02:32:47, 2014-03-25 03:01:48, 2014-03-25 23:44:44, 2014-03-26 15:10:32, 2014-03-27 01:22:46, 2014-03-27 01:26:23, 2014-03-27 18:11:57, 2014-03-28 23:30:41, 2014-03-29 01:04:08, 2014-03-29 16:53:27, 2014-03-30 02:51:01, 2014-04-01 00:27:01, 2014-04-01 17:36:37, 2014-04-02 16:31:38, 2014-04-02 23:29:08, 2014-04-03 01:31:13, 2014-04-03 18:23:12, 2014-04-03 21:04:45, 2014-04-04 05:08:59, 2014-04-05 00:20:57, 2014-04-05 00:51:46, 2014-04-05 02:24:12, 2014-04-06 03:11:04, 2014-04-06 17:26:32, 2014-04-08 00:18:55, 2014-04-08 23:56:42, 2014-04-09 16:06:57, 2014-04-09 20:29:43, 2014-04-10 17:50:26, 2014-04-10 18:44:10, 2014-04-11 03:07:02, 2014-04-12 00:03:52, 2014-04-12 02:33:05, 2014-04-12 03:53:59, 2014-04-12 05:53:46, 2014-04-13 05:38:54, 2014-04-13 05:39:13, 2014-04-13 23:26:10, 2014-04-14 22:24:10, 2014-04-15 23:15:20, 2014-04-16 18:42:43, 2014-04-16 21:57:14, 2014-04-17 02:07:26, 2014-04-17 02:21:48, 2014-04-17 03:29:10, 2014-04-17 22:44:18, 2014-04-17 23:18:57, 2014-04-18 22:29:10, 2014-04-18 23:50:51, 2014-04-19 00:12:19, 2014-04-19 07:52:04, 2014-04-21 18:23:56, 2014-04-22 23:47:24, 2014-04-23 01:02:30, 2014-04-23 20:02:19, 2014-04-24 01:56:26, 2014-04-24 17:07:42, 2014-04-24 19:25:59, 2014-04-26 00:10:13, 2014-04-26 00:46:31, 2014-04-26 00:53:35, 2014-04-28 23:34:44, 2014-04-29 23:19:03, 2014-04-30 16:05:25, 2014-05-01 03:46:24, 2014-05-01 23:09:36, 2014-05-02 01:38:09, 2014-05-02 23:34:11, 2014-05-03 19:03:01, 2014-05-04 00:11:12, 2014-05-04 03:33:22, 2014-05-04 03:39:19, 2014-05-04 04:10:38, 2014-05-04 18:28:20, 2014-05-06 00:39:00, 2014-05-06 03:11:53, 2014-05-06 04:11:26, 2014-05-06 23:15:31, 2014-05-07 23:36:44, 2014-05-09 07:06:05, 2014-05-09 07:06:30, 2014-05-09 18:24:17, 2014-05-09 23:28:01, 2014-05-10 04:02:24, 2014-05-10 05:15:17, 2014-05-10 21:42:03, 2014-05-11 05:35:17, 2014-05-12 04:01:09, 2014-05-13 00:25:41, 2014-05-13 19:57:45, 2014-05-14 01:54:16, 2014-05-14 18:49:15, 2014-05-15 23:54:32, 2014-05-17 01:00:05, 2014-05-17 01:17:40, 2014-05-17 02:49:02, 2014-05-17 04:39:30, 2014-05-18 02:39:34, 2014-05-18 05:08:34, 2014-05-18 05:28:45, 2014-05-20 00:49:25, 2014-05-21 22:23:04, 2014-05-22 18:38:56, 2014-05-23 19:35:22, 2014-05-23 20:11:58, 2014-05-24 02:37:06, 2014-05-24 02:45:02, 2014-05-24 06:39:20, 2014-05-25 01:04:38, 2014-05-26 23:56:45, 2014-05-27 21:41:58, 2014-05-28 05:44:45, 2014-05-28 22:48:53, 2014-05-30 00:46:37, 2014-05-30 18:57:09, 2014-05-30 19:01:26, 2014-05-31 18:55:19, 2014-05-31 20:29:48, 2014-06-04 23:21:19, 2014-06-06 20:30:33, 2014-06-07 00:10:51, 2014-06-07 00:51:37, 2014-06-07 02:49:46, 2014-06-07 22:08:20, 2014-06-08 21:41:33, 2014-06-14 01:13:48, 2014-06-15 01:34:55, 2014-06-17 03:08:58, 2014-06-18 02:24:32, 2014-06-21 07:29:03, 2014-06-21 20:02:51, 2014-06-22 03:18:22, 2014-06-23 20:41:21, 2014-06-25 20:07:42, 2014-06-25 20:15:53, 2014-06-25 20:17:03, 2014-06-26 00:32:36, 2014-06-29 20:27:30, 2014-07-05 04:42:52, 2014-07-05 04:43:51, 2014-07-05 04:47:15, 2014-07-05 04:48:00, 2014-07-05 19:03:37, 2014-07-05 19:58:07, 2014-07-05 19:59:39, 2014-07-11 20:02:05, 2014-07-12 04:42:23, 2014-07-12 21:56:53, 2014-07-13 01:54:37, 2014-07-13 01:55:18, 2014-07-13 06:32:55, 2014-07-15 01:01:04, 2014-07-17 02:09:50, 2014-07-20 02:31:24, 2014-07-20 03:03:59, 2014-07-20 05:35:33, 2014-07-20 08:27:43, 2014-07-20 08:55:46, 2014-07-25 00:03:30, 2014-07-25 20:31:09, 2014-07-27 03:24:18, 2014-08-01 03:24:12, 2014-08-03 20:41:56, 2014-08-03 20:48:43, 2014-08-06 19:47:33, 2014-08-08 01:42:50, 2014-08-09 19:12:39, 2014-08-09 19:13:25, 2014-08-14 02:36:16, 2014-08-15 20:06:19, 2014-08-16 02:45:19, 2014-08-17 21:59:14, 2014-08-20 18:55:34, 2014-08-23 00:31:35, 2014-08-23 22:35:31, 2014-08-24 00:50:03, 2014-08-24 02:39:23, 2014-08-24 03:12:50, 2014-08-28 22:46:00, 2014-08-30 06:07:03, 2014-08-31 19:56:45, 2014-09-03 01:56:18, 2014-09-06 00:31:15, 2014-09-14 01:29:50, 2014-09-14 05:35:52, 2014-09-21 03:13:22, 2014-09-21 05:14:24, 2014-09-21 20:19:27, 2014-09-26 01:32:41, 2014-09-27 01:30:34, 2014-09-27 04:08:57, 2014-09-28 05:23:43, 2014-10-04 04:38:29, 2014-10-09 01:59:01, 2014-10-09 01:59:02, 2014-10-11 01:29:55, 2014-10-18 22:10:19, 2014-10-19 02:08:05, 2014-10-19 06:05:02, 2014-10-19 18:59:43, 2014-10-23 20:15:06, 2014-10-24 19:56:59, 2014-10-25 03:01:39, 2014-10-30 00:49:04, 2014-10-30 20:17:46, 2014-11-02 01:13:36, 2014-11-03 01:02:16, 2014-11-05 00:54:14, 2014-11-07 18:55:47, 2014-11-07 19:28:58, 2014-11-10 02:59:20, 2014-11-10 03:05:12, 2014-11-10 03:39:12, 2014-11-16 08:16:05, 2014-11-28 18:57:59, 2014-11-28 19:20:53, 2014-11-30 01:19:07, 2014-11-30 01:27:13, 2014-11-30 23:25:00, 2014-12-07 18:52:21, 2014-12-12 20:10:58, 2014-12-13 00:39:50, 2014-12-17 20:28:16, 2014-12-23 19:37:02, 2014-12-28 00:29:37, 2014-12-30 02:16:15, 2015-01-04 03:08:54, 2015-01-04 03:56:17, 2015-01-04 18:38:51, 2015-01-06 18:07:41, 2015-01-06 18:41:17, 2015-01-06 19:23:46, 2015-01-10 05:58:42, 2015-01-10 05:59:26, 2015-01-10 06:02:05, 2015-01-11 01:40:18, 2015-01-11 21:49:48, 2015-01-17 01:14:24, 2015-01-19 00:40:49, 2015-01-19 00:41:44, 2015-01-19 03:22:23, 2015-01-19 19:29:28, 2015-01-20 21:26:56, 2015-01-22 01:18:54, 2015-01-23 19:12:17, 2015-01-24 00:39:05, 2015-01-24 00:54:39, 2015-01-24 00:55:46, 2015-01-24 02:23:27, 2015-01-24 05:31:43, 2015-01-24 07:33:35, 2015-01-31 03:18:02, 2015-01-31 18:50:21, 2015-01-31 18:50:35, 2015-02-05 22:04:56, 2015-02-06 21:19:27, 2015-02-07 03:52:30, 2015-02-08 21:54:19, 2015-02-09 00:29:12, 2015-02-11 21:25:53, 2015-02-13 00:22:57, 2015-02-13 00:22:59, 2015-02-13 01:11:50, 2015-02-13 19:26:20, 2015-02-14 21:24:17, 2015-02-16 01:11:53, 2015-02-17 23:03:55, 2015-02-21 19:59:32, 2015-02-21 20:54:33, 2015-02-22 07:02:36, 2015-02-26 01:42:56, 2015-02-26 04:06:06, 2015-02-28 08:59:42, 2015-03-07 03:58:05, 2015-03-07 03:59:08, 2015-03-07 08:47:51, 2015-03-07 19:30:51, 2015-03-08 19:04:41, 2015-03-09 00:18:37 |
| -1MhPXk1FglglUAmuPLIGg | 2010-02-18 06:23:47, 2010-04-09 05:41:02, 2010-07-23 04:31:32, 2010-07-25 22:14:17, 2010-07-31 01:51:09, 2010-08-25 01:41:25, 2010-09-11 02:30:31, 2010-10-02 04:02:51, 2010-10-12 05:08:54, 2010-10-22 03:59:14, 2010-12-04 05:13:55, 2011-01-23 20:03:37, 2011-01-23 20:33:29, 2011-02-07 02:42:15, 2011-02-13 18:22:54, 2011-02-17 00:32:04, 2011-02-20 05:39:37, 2011-02-21 07:06:36, 2011-02-27 00:11:53, 2011-03-27 05:06:49, 2011-04-16 18:44:37, 2011-04-17 16:18:15, 2011-04-29 01:53:33, 2011-05-11 04:58:00, 2011-05-13 02:39:43, 2011-05-14 06:47:34, 2011-05-21 03:44:33, 2011-05-30 21:49:25, 2011-06-04 04:41:09, 2011-06-24 01:10:14, 2011-07-02 03:25:07, 2011-07-02 03:28:31, 2011-07-08 01:18:36, 2011-07-10 03:06:08, 2011-07-10 07:11:22, 2011-07-27 23:00:12, 2011-08-14 01:04:18, 2011-08-20 22:06:37, 2011-08-29 01:49:38, 2011-08-30 03:59:58, 2011-09-03 00:05:16, 2011-09-04 06:53:04, 2011-09-04 20:37:04, 2011-09-05 04:19:13, 2011-09-14 06:57:35, 2011-09-18 19:26:23, 2011-09-18 19:31:19, 2011-09-25 04:37:48, 2011-10-02 02:38:49, 2011-10-04 07:55:09, 2011-10-07 00:35:09, 2011-10-08 02:49:17, 2011-10-09 05:17:00, 2011-10-10 00:43:25, 2011-10-14 04:49:00, 2011-10-16 03:02:51, 2011-10-16 19:20:55, 2011-10-23 20:23:18, 2011-10-29 14:56:19, 2011-11-12 02:19:46, 2011-11-13 20:28:39, 2011-11-16 04:42:29, 2011-11-16 04:54:48, 2011-11-16 05:18:27, 2011-11-25 20:57:49, 2011-11-27 03:49:20, 2011-11-27 22:04:52, 2011-11-28 03:00:15, 2011-12-01 01:49:36, 2011-12-04 19:20:47, 2011-12-09 03:40:52, 2011-12-12 01:47:38, 2011-12-18 05:49:10, 2011-12-18 06:53:24, 2011-12-19 03:49:20, 2011-12-24 08:58:18, 2011-12-30 04:53:25, 2012-01-13 03:31:39, 2012-01-24 05:02:19, 2012-01-28 19:22:33, 2012-02-05 05:21:36, 2012-02-12 01:09:37, 2012-02-16 00:13:22, 2012-02-20 03:35:44, 2012-03-05 00:32:24, 2012-03-07 02:26:41, 2012-03-07 02:26:56, 2012-03-10 19:16:10, 2012-03-15 00:13:16, 2012-03-18 06:48:18, 2012-03-24 04:20:20, 2012-04-21 00:44:32, 2012-05-26 04:29:44, 2012-06-13 03:03:50, 2012-06-15 03:22:30, 2012-06-17 22:00:20, 2012-06-24 06:45:15, 2012-07-02 03:40:55, 2012-07-09 02:54:28, 2012-07-19 03:43:54, 2012-07-19 06:56:08, 2012-07-22 23:35:33, 2012-08-15 04:19:05, 2012-08-15 04:20:28, 2012-08-30 01:45:08, 2012-09-01 20:58:36, 2012-09-01 21:02:32, 2012-09-19 02:05:33, 2012-09-22 04:29:51, 2012-09-28 01:16:46, 2012-09-29 19:13:06, 2012-10-07 05:11:56, 2012-10-20 05:47:20, 2012-11-19 04:45:50, 2012-11-20 01:45:49, 2012-11-22 00:53:27, 2012-12-08 06:49:32, 2012-12-09 22:31:19, 2012-12-10 03:49:47, 2012-12-13 19:07:12, 2012-12-15 00:05:40, 2013-01-02 01:35:25, 2013-01-07 02:36:29, 2013-01-14 03:56:45, 2013-01-19 05:01:12, 2013-01-19 05:13:44, 2013-01-20 21:05:15, 2013-01-27 00:28:23, 2013-02-09 07:08:55, 2013-02-11 03:20:10, 2013-02-25 03:54:23, 2013-03-01 01:06:17, 2013-03-07 04:05:38, 2013-03-08 02:18:16, 2013-03-09 05:51:21, 2013-03-10 04:28:04, 2013-03-10 19:17:50, 2013-03-11 04:39:45, 2013-03-15 01:05:48, 2013-03-21 01:31:49, 2013-03-31 22:48:26, 2013-04-07 05:27:20, 2013-04-09 01:32:09, 2013-04-12 00:43:40, 2013-04-12 07:09:22, 2013-04-22 02:53:05, 2013-04-26 02:25:28, 2013-04-26 03:43:02, 2013-04-28 04:37:55, 2013-05-10 01:07:09, 2013-05-25 06:13:10, 2013-06-03 03:01:49, 2013-06-04 02:00:37, 2013-06-07 06:24:28, 2013-06-14 21:11:39, 2013-06-15 04:34:34, 2013-06-17 01:56:58, 2013-06-19 23:36:10, 2013-06-22 04:54:19, 2013-06-25 00:36:51, 2013-07-02 03:13:16, 2013-07-06 03:08:23, 2013-07-12 21:37:12, 2013-07-17 23:40:00, 2013-07-25 01:26:37, 2013-07-26 00:51:46, 2013-07-27 06:04:05, 2013-07-27 06:05:35, 2013-08-08 01:20:56, 2013-08-14 23:23:52, 2013-08-16 01:11:00, 2013-08-21 23:19:04, 2013-08-28 23:21:51, 2013-09-21 06:31:37, 2013-10-02 23:23:59, 2013-10-05 05:22:14, 2013-10-06 20:15:25, 2013-10-06 21:43:55, 2013-10-18 01:48:18, 2013-10-23 19:30:32, 2013-10-24 02:17:03, 2013-10-25 03:43:27, 2013-10-26 03:03:27, 2013-10-27 04:10:09, 2013-10-28 01:12:36, 2013-10-28 01:45:56, 2013-11-10 07:15:39, 2013-11-12 01:30:50, 2013-11-23 01:39:29, 2013-11-23 04:53:58, 2013-11-28 05:13:29, 2013-11-28 08:58:16, 2013-11-30 23:07:50, 2013-12-02 01:56:40, 2013-12-05 05:20:17, 2013-12-08 03:04:40, 2013-12-18 00:56:27, 2013-12-31 01:58:40, 2013-12-31 20:13:07, 2014-01-03 01:20:11, 2014-01-04 22:40:43, 2014-01-06 02:22:16, 2014-01-11 03:12:54, 2014-01-12 06:44:17, 2014-01-18 01:39:05, 2014-01-18 07:24:22, 2014-01-20 00:41:28, 2014-01-24 08:11:19, 2014-02-17 03:45:12, 2014-03-08 04:17:32, 2014-03-11 01:52:54, 2014-03-15 02:55:20, 2014-03-15 03:56:49, 2014-03-18 04:06:36, 2014-03-18 04:07:36, 2014-03-27 00:44:11, 2014-03-29 04:21:00, 2014-03-30 00:42:02, 2014-03-30 05:36:30, 2014-04-02 00:38:16, 2014-04-12 02:23:43, 2014-04-13 05:43:03, 2014-04-14 06:38:16, 2014-04-21 03:12:20, 2014-04-22 04:58:31, 2014-04-29 07:03:47, 2014-05-03 07:51:42, 2014-05-11 04:25:53, 2014-05-30 19:58:49, 2014-05-30 20:31:17, 2014-06-02 23:32:49, 2014-06-06 03:31:56, 2014-06-08 02:26:28, 2014-06-10 00:22:24, 2014-06-10 07:43:22, 2014-06-14 02:59:27, 2014-06-17 06:50:33, 2014-06-17 08:27:44, 2014-06-21 06:39:42, 2014-06-27 22:24:02, 2014-06-28 05:42:51, 2014-06-29 03:50:56, 2014-07-12 20:28:36, 2014-07-13 01:51:43, 2014-07-13 21:51:42, 2014-07-26 04:05:11, 2014-08-11 02:56:57, 2014-08-16 02:55:19, 2014-08-21 19:45:51, 2014-09-03 05:37:17, 2014-09-05 02:57:50, 2014-09-06 04:00:58, 2014-09-07 01:37:07, 2014-09-07 06:33:17, 2014-09-10 03:13:14, 2014-09-14 06:44:43, 2014-09-14 06:48:40, 2014-09-16 04:55:45, 2014-09-18 03:29:39, 2014-09-30 02:41:40, 2014-10-04 01:33:50, 2014-10-07 01:46:49, 2014-10-09 21:21:55, 2014-10-16 06:19:36, 2014-10-19 00:57:49, 2014-10-19 01:57:05, 2014-10-21 00:53:00, 2014-10-24 19:26:56, 2014-10-26 05:28:50, 2014-11-02 23:05:31, 2014-11-03 02:29:23, 2014-11-04 23:25:25, 2014-11-11 02:00:06, 2014-11-14 05:55:26, 2014-11-15 02:12:52, 2014-11-15 02:13:30, 2014-11-22 04:52:43, 2014-11-23 00:12:53, 2014-11-26 06:38:27, 2014-11-26 23:45:58, 2014-12-06 01:33:02, 2014-12-14 00:21:28, 2014-12-18 23:38:22, 2014-12-21 03:49:14, 2014-12-22 01:20:14, 2014-12-23 01:25:09, 2014-12-30 01:27:11, 2015-01-01 01:08:54, 2015-01-04 03:05:40, 2015-01-04 21:09:30, 2015-01-04 22:07:19, 2015-01-07 03:27:41, 2015-01-13 01:10:41, 2015-01-13 02:01:44, 2015-01-24 06:30:08, 2015-02-04 02:41:50, 2015-02-13 04:17:38, 2015-02-17 04:22:09, 2015-02-17 04:52:05, 2015-02-21 02:37:33, 2015-02-25 08:08:11, 2015-02-27 03:22:15, 2015-02-28 06:48:30, 2015-03-01 01:22:06, 2015-03-05 01:09:35, 2015-03-07 06:34:00, 2015-03-11 01:34:41, 2015-03-16 23:31:13, 2015-03-18 19:01:52, 2015-03-26 01:06:08, 2015-03-28 01:16:08, 2015-03-28 06:32:25, 2015-03-29 01:25:32, 2015-03-30 05:55:46, 2015-04-05 00:17:47, 2015-04-05 03:19:30, 2015-04-12 23:04:01, 2015-04-22 05:22:25, 2015-04-27 03:27:20, 2015-05-03 03:06:29, 2015-05-05 04:24:29, 2015-05-12 04:15:29, 2015-05-16 05:23:27, 2015-05-16 05:31:26, 2015-05-16 05:32:20, 2015-05-30 05:19:00, 2015-05-30 05:22:28, 2015-06-07 00:40:12, 2015-06-18 05:39:22, 2015-06-19 00:58:29, 2015-06-24 04:06:13, 2015-06-28 04:25:14, 2015-06-28 06:13:05, 2015-06-28 22:37:16, 2015-06-28 23:20:30, 2015-06-29 00:04:27, 2015-06-30 23:57:24, 2015-07-05 23:31:13, 2015-07-06 00:30:38, 2015-07-07 01:42:56, 2015-07-17 02:55:36, 2015-07-18 01:35:24, 2015-07-18 01:46:29, 2015-07-19 02:42:27, 2015-07-23 01:01:56, 2015-07-25 01:13:23, 2015-07-25 01:17:16, 2015-07-26 02:58:46, 2015-08-09 00:53:47, 2015-08-14 02:11:26, 2015-08-16 00:29:06, 2015-08-18 02:03:19, 2015-08-18 02:16:47, 2015-08-23 04:29:39, 2015-08-23 05:16:40, 2015-08-27 02:41:16, 2015-09-05 02:58:05, 2015-09-21 05:26:34, 2015-09-24 01:29:11, 2015-10-10 23:43:45, 2015-10-21 04:32:22, 2015-10-24 20:57:00, 2015-11-01 05:19:58, 2015-11-11 03:52:43, 2015-11-21 00:56:13, 2015-11-22 01:13:09, 2015-12-13 02:34:08, 2015-12-17 01:03:32, 2015-12-20 03:40:37, 2015-12-22 00:41:06, 2015-12-23 04:31:32, 2015-12-27 04:03:23, 2015-12-27 05:19:13, 2016-01-05 18:59:30, 2016-01-06 23:13:32, 2016-01-07 04:07:10, 2016-01-09 02:44:33, 2016-01-14 01:46:35, 2016-01-23 01:22:12, 2016-01-23 01:25:05, 2016-01-31 03:40:29, 2016-02-13 02:58:09, 2016-02-18 16:19:44, 2016-02-22 01:48:30, 2016-02-25 19:29:14, 2016-02-27 23:34:01, 2016-03-05 02:40:20, 2016-03-22 01:26:39, 2016-04-02 06:59:13, 2016-04-12 04:53:00, 2016-04-14 03:53:08, 2016-04-20 22:56:15, 2016-05-01 01:27:51, 2016-05-01 02:16:06, 2016-05-02 02:48:37, 2016-05-07 13:40:28, 2016-05-12 02:08:42, 2016-05-12 03:22:46, 2016-05-18 06:30:17, 2016-05-21 21:47:34, 2016-05-22 01:00:55, 2016-05-31 04:10:48, 2016-06-11 01:21:08, 2016-06-11 01:29:46, 2016-06-12 04:56:45, 2016-06-14 03:01:42, 2016-06-22 19:50:23, 2016-06-29 23:33:51, 2016-06-30 03:22:02, 2016-07-04 06:04:15, 2016-07-07 01:24:28, 2016-07-07 02:31:09, 2016-07-13 00:41:37, 2016-07-13 02:01:06, 2016-07-16 04:10:34, 2016-07-16 05:40:04, 2016-07-18 02:21:53, 2016-07-23 23:42:56, 2016-07-24 00:23:03, 2016-07-29 23:26:56, 2016-08-14 22:55:46, 2016-09-11 01:32:22, 2016-09-24 06:52:30, 2016-09-25 03:00:42, 2016-09-25 03:15:39, 2016-09-25 03:37:19, 2016-10-01 01:42:07, 2016-10-11 01:36:43, 2016-10-15 05:02:05, 2016-10-26 05:05:19, 2016-10-27 20:02:52, 2016-11-05 03:23:28, 2016-11-06 04:53:06, 2016-12-01 01:35:54, 2016-12-04 03:19:08, 2016-12-04 03:28:01, 2016-12-04 03:29:38, 2016-12-08 01:11:25, 2016-12-08 03:06:53, 2016-12-27 01:36:08, 2016-12-30 23:55:26, 2016-12-31 02:15:52, 2017-01-11 22:22:03, 2017-01-21 06:49:48, 2017-01-25 06:46:16, 2017-01-26 02:02:14, 2017-01-28 20:35:38, 2017-01-31 02:33:27, 2017-02-03 03:13:48, 2017-02-05 23:23:35, 2017-02-16 03:49:08, 2017-02-20 22:31:26, 2017-02-24 05:04:54, 2017-03-17 01:20:15, 2017-03-17 02:01:46, 2017-03-18 04:10:30, 2017-03-27 03:38:44, 2017-03-28 04:39:13, 2017-04-14 19:59:23, 2017-04-15 01:36:12, 2017-04-16 23:09:26, 2017-05-14 03:42:57, 2017-05-14 19:01:27, 2017-05-19 03:29:36, 2017-07-06 06:47:58, 2017-07-08 01:45:43, 2017-07-08 01:46:56, 2017-07-21 06:47:21, 2017-07-23 06:20:13, 2017-08-05 02:12:03, 2017-08-16 04:36:02, 2017-08-19 07:25:52, 2017-08-22 03:11:12, 2017-09-02 06:25:02, 2017-09-24 22:27:56, 2018-01-17 03:06:40, 2018-01-31 04:14:07, 2018-02-14 00:52:55, 2018-02-18 05:30:55, 2018-02-28 00:45:00, 2018-03-04 00:28:12, 2018-03-29 07:34:37, 2018-04-12 07:25:53, 2018-04-30 21:28:35, 2018-04-30 21:31:31, 2018-05-17 20:55:34, 2018-06-10 04:50:00, 2018-06-18 01:45:39, 2018-07-01 02:44:48, 2018-08-05 23:33:53, 2018-08-25 19:57:00, 2018-10-07 22:46:32, 2018-10-31 17:06:19, 2018-11-11 00:32:32, 2018-11-24 05:35:20, 2018-12-13 23:02:40, 2018-12-23 18:46:41, 2018-12-23 19:03:41, 2018-12-30 23:17:21, 2019-03-02 20:14:32, 2019-05-23 04:37:50, 2019-06-11 02:03:56, 2019-06-16 05:07:19, 2019-07-14 04:05:09, 2019-09-08 18:10:37, 2019-11-09 03:54:34, 2019-11-12 01:53:27, 2019-11-13 23:07:08, 2019-11-17 03:46:37, 2019-11-30 03:56:49, 2019-12-25 04:46:30, 2019-12-31 03:54:26, 2020-01-06 00:30:50, 2020-03-01 01:27:06, 2020-03-10 01:33:42, 2020-06-19 04:02:49, 2020-06-26 03:05:18, 2020-12-29 00:54:04, 2021-09-02 00:52:46, 2021-10-02 01:29:09, 2021-10-17 02:33:04, 2021-11-13 03:32:21, 2021-12-11 02:46:18 |
| -1iLbEf1NwY-OJp5Hg-3Sg | 2017-06-08 00:58:45, 2019-09-01 00:26:05 |
| -1w9JMktu9oWTXwNqtZQoA | 2018-08-04 08:22:13, 2019-10-27 09:30:01, 2020-03-14 08:27:58, 2020-08-17 03:42:11 |
# Filter for Food & Beverage businesses
# The dataset appears to have industry classification already
# We'll filter using multiple methods to ensure we get food businesses
# Method 1: Use the categories column if available
food_keywords <- c("restaurant", "food", "cafe", "pizza", "burger", "bar",
"grill", "deli", "bakery", "dining", "eatery", "bistro",
"chinese", "mexican", "italian", "american", "fast", "coffee")
# Method 2: Check if there's already a Food_Beverage column
if("Food_Beverage" %in% names(business_df)) {
# Filter businesses with Food_Beverage indicator
food_businesses <- business_df |>
filter(!is.na(Food_Beverage) |
(!is.na(categories) & str_detect(tolower(categories),
paste(food_keywords, collapse = "|"))))
} else {
# Filter based on categories only
food_businesses <- business_df |>
filter(!is.na(categories) &
str_detect(tolower(categories), paste(food_keywords, collapse = "|")))
}
cat("Number of food & beverage businesses:", nrow(food_businesses), "\n")
## Number of food & beverage businesses: 2032
# Display sample food businesses
knitr::kable(head(food_businesses |> dplyr::select(business_id, name, categories, stars, review_count)),
caption = "Sample Food & Beverage Businesses")
| business_id | name | categories | stars | review_count |
|---|---|---|---|---|
| txyXRytGjwOXvS8s4sc-WA | Smoothie King | NA | 3 | 29 |
| SZU9c8V2GuREDN5KgyHFJw | Santa Barbara Shellfish Company | u’dinner’: True | 4 | 2404 |
| Y6heWJJ9AmEL58fZwgi9YQ | Rosati’s Pizza | NA | 4 | 58 |
| 4xhGQGdGqU60BIznBjqnuA | California Tacos and Taproom | ‘dinner’: False | 4 | 49 |
| -kY_HDP7IMvGl-kBIZVU4A | Dune Coffee Roasters - Anacapa | NA | 4 | 320 |
| yX-eHIG–H3geTNWZ2Q6SA | Crush Gourmet Raspados | NA | 5 | 5 |
# Process check-in data to count total check-ins per business
checkin_counts <- checkin_df |>
mutate(
# Count the number of check-ins by counting commas and adding 1
checkin_count = ifelse(is.na(date), 0, str_count(date, ",") + 1)
) |>
dplyr::select(business_id, checkin_count)
cat("Check-in data processed. Sample:\n")
## Check-in data processed. Sample:
knitr::kable(head(checkin_counts), caption = "Sample Check-in Data")
| business_id | checkin_count |
|---|---|
| –MbOh2O1pATkXa7xbU6LA | 103 |
| –onnLZrsCazmcy2P_7fcw | 53 |
| -0G_6-KFGpCpxTUlVXCMYQ | 438 |
| -1MhPXk1FglglUAmuPLIGg | 523 |
| -1iLbEf1NwY-OJp5Hg-3Sg | 2 |
| -1w9JMktu9oWTXwNqtZQoA | 4 |
# Load the review dataset
review_df <- read_csv('review_4.csv')
cat("Review dataset dimensions:", dim(review_df), "\n")
## Review dataset dimensions: 536562 8
cat("Review data columns:", names(review_df), "\n")
## Review data columns: review_id user_id business_id stars useful funny cool date
knitr::kable(head(review_df |> dplyr::select(business_id, stars, useful, funny, cool)),
caption = "Sample Review Data")
| business_id | stars | useful | funny | cool |
|---|---|---|---|---|
| YjUWPpI6HXG530lwP-fb2A | 3 | 0 | 0 | 0 |
| gebiRewfieSdtt17PTW6Zg | 3 | 0 | 0 | 0 |
| otQS34_MymijPTdNBoBdCw | 4 | 0 | 2 | 0 |
| vC2qm1y3Au5czBtbhc-DNw | 4 | 0 | 0 | 0 |
| bbEXAEFr4RYHLlZ-HFssTA | 5 | 0 | 0 | 0 |
| IDtLPgUrqorrpqSLdfMhZQ | 5 | 0 | 0 | 0 |
# Aggregate review data by business_id to create features
review_features <- review_df |>
group_by(business_id) |>
summarise(
avg_review_stars = mean(stars, na.rm = TRUE),
total_reviews = n(),
avg_useful_votes = mean(useful, na.rm = TRUE),
avg_funny_votes = mean(funny, na.rm = TRUE),
avg_cool_votes = mean(cool, na.rm = TRUE),
.groups = 'drop'
)
cat("Review features aggregated. Sample:\n")
## Review features aggregated. Sample:
knitr::kable(head(review_features), caption = "Aggregated Review Features by Business")
| business_id | avg_review_stars | total_reviews | avg_useful_votes | avg_funny_votes | avg_cool_votes |
|---|---|---|---|---|---|
| –MbOh2O1pATkXa7xbU6LA | 3.962963 | 27 | 0.7777778 | 0.2592593 | 0.2592593 |
| –onnLZrsCazmcy2P_7fcw | 3.000000 | 7 | 1.1428571 | 0.8571429 | 0.2857143 |
| -0G_6-KFGpCpxTUlVXCMYQ | 3.577922 | 154 | 2.4285714 | 0.4740260 | 1.2337662 |
| -1MhPXk1FglglUAmuPLIGg | 3.877193 | 114 | 0.8245614 | 0.2719298 | 0.5438596 |
| -1iLbEf1NwY-OJp5Hg-3Sg | 3.153846 | 13 | 0.9230769 | 0.1538462 | 0.3076923 |
| -1w9JMktu9oWTXwNqtZQoA | 3.882353 | 17 | 0.8235294 | 0.1176471 | 0.4117647 |
# Create final features dataset by combining all data sources
features_df <- food_businesses |>
dplyr::select(business_id, name, stars, review_count, city, state) |>
# Add review aggregations
left_join(review_features, by = "business_id") |>
# Add check-in data
left_join(checkin_counts, by = "business_id") |>
# Clean and finalize features
mutate(
# Use business stars as fallback if no review aggregation available
avg_stars = ifelse(is.na(avg_review_stars), stars, avg_review_stars),
# Use aggregated review count from reviews, fallback to business review_count
total_reviews = ifelse(is.na(total_reviews), review_count, total_reviews),
# Replace NA checkin_count with 0
checkin_count = ifelse(is.na(checkin_count), 0, checkin_count),
# Replace NA vote averages with 0
avg_useful_votes = ifelse(is.na(avg_useful_votes), 0, avg_useful_votes),
avg_funny_votes = ifelse(is.na(avg_funny_votes), 0, avg_funny_votes),
avg_cool_votes = ifelse(is.na(avg_cool_votes), 0, avg_cool_votes)
) |>
# Remove rows with missing essential data
filter(!is.na(avg_stars), !is.na(total_reviews), total_reviews > 0)
# Limit dataset size if needed (adjust as necessary)
if(nrow(features_df) > 10000) {
features_df <- features_df |> slice_head(n = 10000)
}
cat("Final dataset dimensions:", dim(features_df), "\n")
## Final dataset dimensions: 2032 13
cat("Sample of features:\n")
## Sample of features:
knitr::kable(head(features_df), caption = "Final Feature Dataset")
| business_id | name | stars | review_count | city | state | avg_review_stars | total_reviews | avg_useful_votes | avg_funny_votes | avg_cool_votes | checkin_count | avg_stars |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| txyXRytGjwOXvS8s4sc-WA | Smoothie King | 3 | 29 | Tucson | AZ | 3.100000 | 30 | 0.8000000 | 0.5000000 | 0.4666667 | 19 | 3.100000 |
| SZU9c8V2GuREDN5KgyHFJw | Santa Barbara Shellfish Company | 4 | 2404 | Santa Barbara | CA | 3.911620 | 2444 | 0.8273322 | 0.3518822 | 0.5507365 | 6148 | 3.911620 |
| Y6heWJJ9AmEL58fZwgi9YQ | Rosati’s Pizza | 4 | 58 | Tucson | AZ | 3.935484 | 62 | 0.5806452 | 0.0645161 | 0.3387097 | 18 | 3.935484 |
| 4xhGQGdGqU60BIznBjqnuA | California Tacos and Taproom | 4 | 49 | Isla Vista | CA | 4.076923 | 52 | 0.8846154 | 0.3653846 | 0.8461538 | 21 | 4.076923 |
| -kY_HDP7IMvGl-kBIZVU4A | Dune Coffee Roasters - Anacapa | 4 | 320 | Santa Barbara | CA | 4.103030 | 330 | 1.0151515 | 0.6060606 | 0.7606061 | 1910 | 4.103030 |
| yX-eHIG–H3geTNWZ2Q6SA | Crush Gourmet Raspados | 5 | 5 | Tucson | AZ | 5.000000 | 5 | 1.4000000 | 0.2000000 | 0.6000000 | 4 | 5.000000 |
# Define features for clustering
cluster_features <- c('avg_stars', 'total_reviews', 'checkin_count',
'avg_useful_votes', 'avg_funny_votes', 'avg_cool_votes')
# Check for and handle any remaining missing values
features_df <- features_df |>
filter(complete.cases(across(all_of(cluster_features))))
cat("Dataset after removing incomplete cases:", nrow(features_df), "rows\n")
## Dataset after removing incomplete cases: 2032 rows
# Normalize features using min-max scaling
normalize_minmax <- function(x) {
(x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
}
# Create normalized dataset for clustering
features_normalized <- features_df
features_normalized[cluster_features] <- lapply(features_df[cluster_features], normalize_minmax)
cat("Features normalized. Sample:\n")
## Features normalized. Sample:
knitr::kable(head(features_normalized |> dplyr::select(all_of(cluster_features))),
caption = "Normalized Features for Clustering", digits = 3)
| avg_stars | total_reviews | checkin_count | avg_useful_votes | avg_funny_votes | avg_cool_votes |
|---|---|---|---|---|---|
| 0.525 | 0.010 | 0.003 | 0.032 | 0.032 | 0.021 |
| 0.728 | 1.000 | 1.000 | 0.033 | 0.022 | 0.025 |
| 0.734 | 0.023 | 0.003 | 0.023 | 0.004 | 0.015 |
| 0.769 | 0.019 | 0.003 | 0.036 | 0.023 | 0.038 |
| 0.776 | 0.133 | 0.311 | 0.041 | 0.038 | 0.034 |
| 1.000 | 0.000 | 0.001 | 0.056 | 0.013 | 0.027 |
set.seed(42) # For reproducibility
# Prepare data for clustering
cluster_data <- features_normalized |> dplyr::select(all_of(cluster_features))
# Perform K-means for k = 5 through 8
for (k in 5:8) {
kmeans_result <- kmeans(cluster_data, centers = k, nstart = 25)
cluster_col <- paste0('cluster_', k)
features_normalized[[cluster_col]] <- as.factor(kmeans_result$cluster)
cat("K =", k, "- Within SS:", round(kmeans_result$tot.withinss, 2), "\n")
}
## K = 5 - Within SS: 31.72
## K = 6 - Within SS: 25.29
## K = 7 - Within SS: 22.54
## K = 8 - Within SS: 20.11
# Display cluster distributions
for (k in 5:8) {
cluster_col <- paste0('cluster_', k)
cat("\nCluster distribution for k =", k, ":\n")
cluster_table <- table(features_normalized[[cluster_col]])
knitr::kable(data.frame(Cluster = names(cluster_table), Count = as.numeric(cluster_table)),
caption = paste("Cluster Distribution for k =", k))
}
##
## Cluster distribution for k = 5 :
##
## Cluster distribution for k = 6 :
##
## Cluster distribution for k = 7 :
##
## Cluster distribution for k = 8 :
# Function to create cluster distribution plots
plot_cluster_distribution <- function(k) {
cluster_col <- paste0('cluster_', k)
ggplot(features_normalized, aes_string(x = cluster_col)) +
geom_bar(fill = viridis(k), alpha = 0.8, color = "black") +
labs(title = paste('Distribution of Data Points in', k, 'Clusters'),
x = 'Cluster Label',
y = 'Number of Businesses') +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
}
# Create distribution plots for each k
distribution_plots <- map(5:8, plot_cluster_distribution)
grid.arrange(grobs = distribution_plots, ncol = 2)
## A.3: Post-hoc Analysis ### A.3.1: Cluster Analysis and
Characteristics
# Function to analyze cluster characteristics
analyze_clusters <- function(k) {
cluster_col <- paste0('cluster_', k)
cluster_summary <- features_normalized |>
group_by(!!sym(cluster_col)) |>
summarise(
count = n(),
avg_stars_mean = mean(avg_stars, na.rm = TRUE),
total_reviews_mean = mean(total_reviews, na.rm = TRUE),
checkin_count_mean = mean(checkin_count, na.rm = TRUE),
avg_useful_votes_mean = mean(avg_useful_votes, na.rm = TRUE),
avg_funny_votes_mean = mean(avg_funny_votes, na.rm = TRUE),
avg_cool_votes_mean = mean(avg_cool_votes, na.rm = TRUE),
.groups = 'drop'
)
return(cluster_summary)
}
# Analyze clusters for k = 6 (we'll choose this as our final clustering)
k_final <- 6
cluster_analysis_final <- analyze_clusters(k_final)
cat("Cluster Analysis for k =", k_final, ":\n")
## Cluster Analysis for k = 6 :
knitr::kable(cluster_analysis_final, caption = paste("Cluster Characteristics for k =", k_final), digits = 3)
| cluster_6 | count | avg_stars_mean | total_reviews_mean | checkin_count_mean | avg_useful_votes_mean | avg_funny_votes_mean | avg_cool_votes_mean |
|---|---|---|---|---|---|---|---|
| 1 | 12 | 0.771 | 0.005 | 0.003 | 0.590 | 0.456 | 0.537 |
| 2 | 280 | 0.261 | 0.008 | 0.012 | 0.039 | 0.022 | 0.010 |
| 3 | 468 | 0.508 | 0.015 | 0.025 | 0.039 | 0.025 | 0.017 |
| 4 | 86 | 0.746 | 0.306 | 0.292 | 0.040 | 0.023 | 0.024 |
| 5 | 564 | 0.911 | 0.017 | 0.013 | 0.055 | 0.025 | 0.037 |
| 6 | 622 | 0.717 | 0.031 | 0.032 | 0.044 | 0.024 | 0.025 |
# Create heatmap of cluster characteristics
cluster_means_long <- cluster_analysis_final |>
dplyr::select(-count) |>
pivot_longer(cols = -cluster_6,
names_to = "feature",
values_to = "mean_value") |>
mutate(feature = str_remove(feature, "_mean"))
ggplot(cluster_means_long, aes(x = feature, y = factor(cluster_6), fill = mean_value)) +
geom_tile() +
scale_fill_viridis_c() +
labs(title = paste('Cluster Characteristics Heatmap (k =', k_final, ')'),
x = 'Features',
y = 'Cluster',
fill = 'Mean Value') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5))
### A.3.3: Assign Descriptive Cluster Labels
# Based on the analysis, assign descriptive labels to clusters
cluster_labels <- data.frame(
cluster_6 = factor(1:6),
cluster_label = c(
"Modest Local Spots", # Low stars, low reviews, low engagement
"Popular Neighborhood Gems", # Medium-high stars, moderate reviews
"High-Volume Establishments", # High reviews, high checkins
"Quality Focused Venues", # High stars, moderate activity
"Community Favorites", # Balanced across metrics
"Emerging Businesses" # Low activity, variable ratings
)
)
# Add labels to our dataset
features_with_labels <- features_normalized |>
left_join(cluster_labels, by = "cluster_6")
cat("Cluster Labels:\n")
## Cluster Labels:
knitr::kable(cluster_labels, caption = "Descriptive Cluster Labels")
| cluster_6 | cluster_label |
|---|---|
| 1 | Modest Local Spots |
| 2 | Popular Neighborhood Gems |
| 3 | High-Volume Establishments |
| 4 | Quality Focused Venues |
| 5 | Community Favorites |
| 6 | Emerging Businesses |
# Show sample businesses from each cluster
cat("\nSample businesses from each cluster:\n")
##
## Sample businesses from each cluster:
for (i in 1:6) {
sample_businesses <- features_with_labels |>
filter(cluster_6 == i) |>
dplyr::select(name, avg_stars, total_reviews, cluster_label) |>
slice_head(n = 3)
cat("\nCluster", i, ":", unique(sample_businesses$cluster_label), "\n")
knitr::kable(sample_businesses |> dplyr::select(-cluster_label),
caption = paste("Sample Businesses from Cluster", i))
}
##
## Cluster 1 : Modest Local Spots
##
## Cluster 2 : Popular Neighborhood Gems
##
## Cluster 3 : High-Volume Establishments
##
## Cluster 4 : Quality Focused Venues
##
## Cluster 5 : Community Favorites
##
## Cluster 6 : Emerging Businesses
# Create rating categories
classification_df <- features_normalized |>
mutate(
rating_category = case_when(
avg_stars < 3.0 ~ "Low",
avg_stars >= 3.0 & avg_stars < 4.0 ~ "Medium",
avg_stars >= 4.0 ~ "High"
)
)
cat("Rating category distribution:\n")
## Rating category distribution:
rating_table <- table(classification_df$rating_category)
knitr::kable(data.frame(Rating_Category = names(rating_table), Count = as.numeric(rating_table)),
caption = "Rating Category Distribution")
| Rating_Category | Count |
|---|---|
| Low | 2032 |
# Create binary classification dataset (remove Medium ratings)
binary_df <- classification_df |>
filter(rating_category %in% c("Low", "High")) |>
mutate(
rating_binary = ifelse(rating_category == "High", 1, 0),
rating_binary = factor(rating_binary, levels = c(0, 1), labels = c("Low", "High"))
)
cat("\nBinary classification dataset dimensions:", dim(binary_df), "\n")
##
## Binary classification dataset dimensions: 2032 19
cat("Binary rating distribution:\n")
## Binary rating distribution:
binary_table <- table(binary_df$rating_binary)
knitr::kable(data.frame(Rating_Binary = names(binary_table), Count = as.numeric(binary_table)),
caption = "Binary Rating Distribution")
| Rating_Binary | Count |
|---|---|
| Low | 2032 |
| High | 0 |
# Select features for classification (exclude the target and identifiers)
feature_cols <- setdiff(cluster_features, "avg_stars") # Remove target from features
# Split data into training and testing sets
set.seed(42)
train_indices <- createDataPartition(binary_df$rating_binary, p = 0.8, list = FALSE)
# Training data
X_train <- binary_df[train_indices, feature_cols]
y_train <- binary_df[train_indices, "rating_binary", drop = TRUE]
# Testing data
X_test <- binary_df[-train_indices, feature_cols]
y_test <- binary_df[-train_indices, "rating_binary", drop = TRUE]
# Standardize features for KNN
preprocess_params <- preProcess(X_train, method = c("center", "scale"))
X_train_scaled <- predict(preprocess_params, X_train)
X_test_scaled <- predict(preprocess_params, X_test)
cat("Training set size:", nrow(X_train_scaled), "\n")
## Training set size: 1626
cat("Test set size:", nrow(X_test_scaled), "\n")
## Test set size: 406
cat("Training set distribution:\n")
## Training set distribution:
train_table <- table(y_train)
knitr::kable(data.frame(Rating = names(train_table), Count = as.numeric(train_table)),
caption = "Training Set Distribution")
| Rating | Count |
|---|---|
| Low | 1626 |
| High | 0 |
# Test different values of k
k_values <- c(3, 5, 7, 9, 11, 13, 15, 17, 19, 21)
knn_results <- data.frame(
k = k_values,
accuracy = numeric(length(k_values)),
precision = numeric(length(k_values)),
recall = numeric(length(k_values)),
f1_score = numeric(length(k_values))
)
cat("Testing K-NN with different k values:\n")
## Testing K-NN with different k values:
for (i in seq_along(k_values)) {
k <- k_values[i]
# Train K-NN model
knn_pred <- knn(train = X_train_scaled,
test = X_test_scaled,
cl = y_train,
k = k)
# Calculate confusion matrix and metrics
conf_matrix <- confusionMatrix(knn_pred, y_test, positive = "High")
knn_results$accuracy[i] <- conf_matrix$overall['Accuracy']
knn_results$precision[i] <- conf_matrix$byClass['Precision']
knn_results$recall[i] <- conf_matrix$byClass['Recall']
knn_results$f1_score[i] <- conf_matrix$byClass['F1']
cat("k =", k, "- Accuracy:", round(knn_results$accuracy[i], 4), "\n")
}
## k = 3 - Accuracy: 1
## k = 5 - Accuracy: 1
## k = 7 - Accuracy: 1
## k = 9 - Accuracy: 1
## k = 11 - Accuracy: 1
## k = 13 - Accuracy: 1
## k = 15 - Accuracy: 1
## k = 17 - Accuracy: 1
## k = 19 - Accuracy: 1
## k = 21 - Accuracy: 1
# Find optimal k
optimal_k <- k_values[which.max(knn_results$accuracy)]
cat("\nOptimal k:", optimal_k, "\n")
##
## Optimal k: 3
cat("Best accuracy:", round(max(knn_results$accuracy), 4), "\n")
## Best accuracy: 1
knitr::kable(knn_results, caption = "K-NN Performance Results", digits = 4)
| k | accuracy | precision | recall | f1_score |
|---|---|---|---|---|
| 3 | 1 | NA | NA | NA |
| 5 | 1 | NA | NA | NA |
| 7 | 1 | NA | NA | NA |
| 9 | 1 | NA | NA | NA |
| 11 | 1 | NA | NA | NA |
| 13 | 1 | NA | NA | NA |
| 15 | 1 | NA | NA | NA |
| 17 | 1 | NA | NA | NA |
| 19 | 1 | NA | NA | NA |
| 21 | 1 | NA | NA | NA |
# Train final model with optimal k
final_knn_pred <- knn(train = X_train_scaled,
test = X_test_scaled,
cl = y_train,
k = optimal_k)
# Generate detailed confusion matrix
final_conf_matrix <- confusionMatrix(final_knn_pred, y_test, positive = "High")
cat("Final K-NN Model Results (k =", optimal_k, "):\n")
## Final K-NN Model Results (k = 3 ):
print(final_conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low High
## Low 406 0
## High 0 0
##
## Accuracy : 1
## 95% CI : (0.991, 1)
## No Information Rate : 1
## P-Value [Acc > NIR] : 1
##
## Kappa : NaN
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : NA
## Specificity : 1
## Pos Pred Value : NA
## Neg Pred Value : NA
## Prevalence : 0
## Detection Rate : 0
## Detection Prevalence : 0
## Balanced Accuracy : NA
##
## 'Positive' Class : High
##
# Extract and display key metrics
accuracy <- final_conf_matrix$overall['Accuracy']
precision <- final_conf_matrix$byClass['Precision']
recall <- final_conf_matrix$byClass['Recall']
f1 <- final_conf_matrix$byClass['F1']
cat("\nFinal Model Performance Metrics:\n")
##
## Final Model Performance Metrics:
cat("Accuracy: ", round(accuracy, 4), "\n")
## Accuracy: 1
cat("Precision:", round(precision, 4), "\n")
## Precision: NA
cat("Recall: ", round(recall, 4), "\n")
## Recall: NA
cat("F1 Score: ", round(f1, 4), "\n")
## F1 Score: NA
# Plot 1: Effect of k on accuracy
p1 <- ggplot(knn_results, aes(x = k, y = accuracy)) +
geom_line(color = "blue", size = 1) +
geom_point(color = "red", size = 3) +
geom_vline(xintercept = optimal_k, linetype = "dashed", color = "red") +
labs(title = "Effect of K on Classification Accuracy",
x = "Number of Neighbors (K)",
y = "Accuracy") +
theme_minimal() +
annotate("text", x = optimal_k + 1, y = max(knn_results$accuracy),
label = paste("Optimal K =", optimal_k), hjust = 0)
# Plot 2: Multiple metrics comparison
knn_results_long <- knn_results |>
pivot_longer(cols = c(accuracy, precision, recall, f1_score),
names_to = "metric",
values_to = "value")
p2 <- ggplot(knn_results_long, aes(x = k, y = value, color = metric)) +
geom_line(size = 1) +
geom_point(size = 2) +
labs(title = "K-NN Performance Metrics vs K",
x = "Number of Neighbors (K)",
y = "Metric Value",
color = "Metric") +
theme_minimal() +
scale_color_viridis_d()
# Plot 3: Confusion Matrix Heatmap
conf_matrix_df <- as.data.frame(final_conf_matrix$table)
p3 <- ggplot(conf_matrix_df, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), size = 12, color = "white") +
scale_fill_viridis_c() +
labs(title = paste("Confusion Matrix (k =", optimal_k, ")"),
x = "Actual",
y = "Predicted") +
theme_minimal()
# Display plots
grid.arrange(p1, p2, p3, ncol = 2)
cat("K-MEANS CLUSTERING SUMMARY:\n")
## K-MEANS CLUSTERING SUMMARY:
cat("==========================\n")
## ==========================
cat("Optimal number of clusters: 6\n")
## Optimal number of clusters: 6
cat("Total businesses analyzed:", nrow(features_normalized), "\n")
## Total businesses analyzed: 2032
# Display final cluster distribution
final_cluster_dist <- table(features_normalized$cluster_6)
for (i in 1:6) {
label <- cluster_labels$cluster_label[i]
count <- final_cluster_dist[i]
percentage <- round(count / sum(final_cluster_dist) * 100, 1)
cat("Cluster", i, "(", label, "):", count, "businesses (", percentage, "%)\n")
}
## Cluster 1 ( Modest Local Spots ): 12 businesses ( 0.6 %)
## Cluster 2 ( Popular Neighborhood Gems ): 280 businesses ( 13.8 %)
## Cluster 3 ( High-Volume Establishments ): 468 businesses ( 23 %)
## Cluster 4 ( Quality Focused Venues ): 86 businesses ( 4.2 %)
## Cluster 5 ( Community Favorites ): 564 businesses ( 27.8 %)
## Cluster 6 ( Emerging Businesses ): 622 businesses ( 30.6 %)
cat("\nK-NN CLASSIFICATION SUMMARY:\n")
##
## K-NN CLASSIFICATION SUMMARY:
cat("============================\n")
## ============================
cat("Classification task: Predicting High vs Low rated businesses\n")
## Classification task: Predicting High vs Low rated businesses
cat("Optimal k:", optimal_k, "\n")
## Optimal k: 3
cat("Test set accuracy:", round(accuracy, 4), "\n")
## Test set accuracy: 1
cat("Precision:", round(precision, 4), "\n")
## Precision: NA
cat("Recall:", round(recall, 4), "\n")
## Recall: NA
cat("F1 Score:", round(f1, 4), "\n")
## F1 Score: NA
cat("\nEffect of k on model performance:\n")
##
## Effect of k on model performance:
cat("- Lower k values (3-7) may overfit to training data\n")
## - Lower k values (3-7) may overfit to training data
cat("- Higher k values (15-21) may underfit and lose local patterns\n")
## - Higher k values (15-21) may underfit and lose local patterns
cat("- Optimal k =", optimal_k, "balances bias-variance tradeoff\n")
## - Optimal k = 3 balances bias-variance tradeoff
cat("\nKEY INSIGHTS:\n")
##
## KEY INSIGHTS:
cat("=============\n")
## =============
cat("1. Clustering revealed 6 distinct business segments in the food industry:\n")
## 1. Clustering revealed 6 distinct business segments in the food industry:
for (i in 1:6) {
cat(" -", cluster_labels$cluster_label[i], "\n")
}
## - Modest Local Spots
## - Popular Neighborhood Gems
## - High-Volume Establishments
## - Quality Focused Venues
## - Community Favorites
## - Emerging Businesses
cat("\n2. Classification performance:\n")
##
## 2. Classification performance:
if (accuracy > 0.8) {
cat(" - Excellent accuracy (>80%) suggests features are highly predictive\n")
} else if (accuracy > 0.7) {
cat(" - Good accuracy (70-80%) indicates moderate predictive power\n")
} else {
cat(" - Fair accuracy (<70%) suggests need for feature engineering\n")
}
## - Excellent accuracy (>80%) suggests features are highly predictive
cat("\n3. Business implications:\n")
##
## 3. Business implications:
cat(" - Review count and engagement metrics are strong predictors of ratings\n")
## - Review count and engagement metrics are strong predictors of ratings
cat(" - Check-in frequency correlates with business success\n")
## - Check-in frequency correlates with business success
cat(" - Clustering helps identify different business archetypes for targeted strategies\n")
## - Clustering helps identify different business archetypes for targeted strategies