library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(rlist)
library(tree)
## Registered S3 method overwritten by 'tree':
## method from
## print.tree cli
library(ISLR)
library(e1071)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(permute)
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(mlbench)
library(rattle)
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
# Fix the RNG seed so the random train/test split and the resampling folds
# drawn further down are reproducible from run to run.
set.seed(1)

# Business-level table: one row per business_id with location, star rating,
# review count and open/closed flag.
businessHL <- read_csv(
  file = "/Users/michael/Dropbox/MSDA/Fall 2021/Applications/Individual project/yelp_business.csv"
)
## Rows: 174567 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): business_id, name, neighborhood, address, city, state, postal_code,...
## dbl (5): latitude, longitude, stars, review_count, is_open
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Attribute-level table: business_id plus the per-business attribute columns,
# all read in as character.
businessLL <- read_csv(
  file = "/Users/michael/Dropbox/MSDA/Fall 2021/Applications/Individual project/Data/yelp_business_attributes.csv"
)
## Rows: 152041 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (28): business_id, ByAppointmentOnly, BusinessAcceptsCreditCards, Busine...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# read_csv() returns tibbles; convert both tables to plain data frames.
bHL <- as.data.frame(businessHL)
bLL <- as.data.frame(businessLL)

# Left join on business_id: keep every business, attaching attribute columns
# where they exist (businesses without an attribute row get NA).
merged <- merge(
  x = bHL,
  y = bLL,
  by = "business_id",
  all.x = TRUE
)

# Quick look at the distribution of the star ratings.
hist(bHL$stars)

# Binary response: businesses with 3.5 stars or more are "Good", the rest are
# "Bad". A missing star value stays NA. The label is stored as a factor so the
# classifiers below treat it as a class label.
merged <- merged %>%
  mutate(
    rating = factor(if_else(stars >= 3.5, "Good", "Bad"))
  )
# 75% / 25% random train/test split on row indices.
smp_size <- floor(nrow(merged) * 0.75)

# Draw the training row numbers without replacement.
train_ind <- sample.int(nrow(merged), size = smp_size)

# Rows drawn above form the training set; everything else is held out.
train <- merged[train_ind, ]
test <- merged[-train_ind, ]
# Normalise column names: clean_names() replaces special characters and
# whitespace with underscores, so the columns can be used directly in model
# formulas.
train <- clean_names(train)
test <- clean_names(test)

# Drop columns that have only a single distinct value in the training set;
# they carry no information for the models.
train <- train[vapply(train, function(x) length(unique(x)) > 1, logical(1L))]

# Keep exactly the same columns in the test set. The previous code recomputed
# the mask from `train` *after* it had been filtered, so the mask no longer
# described which columns were dropped and `test` could end up with a
# different set of columns than `train`. Selecting by name keeps both in sync.
test <- test[names(train)]
# Linear discriminant analysis of rating on the business attribute columns,
# evaluated with caret's default cross-validation (10 folds).
# Rows with missing values in the model variables are excluded.
lda.fit <- train(
  rating ~ smoking + dogs_allowed + drive_thru + outdoor_seating +
    restaurants_delivery + wi_fi + caters + restaurants_reservations +
    restaurants_take_out + happy_hour + alcohol,
  data = train,
  method = "lda",
  trControl = trainControl(method = "cv"),
  na.action = na.exclude
)

# Print the resampling summary (accuracy / kappa).
lda.fit
## Linear Discriminant Analysis
##
## 130925 samples
## 11 predictor
## 2 classes: 'Bad', 'Good'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 101339, 101337, 101338, 101339, 101339, 101337, ...
## Resampling results:
##
## Accuracy Kappa
## 0.6960247 0.06236938
# Quadratic discriminant analysis on the same predictors as the LDA model,
# again with caret's default cross-validation (10 folds).
# Rows with missing values in the model variables are excluded.
qda.fit <- train(
  rating ~ smoking + dogs_allowed + drive_thru + outdoor_seating +
    restaurants_delivery + wi_fi + caters + restaurants_reservations +
    restaurants_take_out + happy_hour + alcohol,
  data = train,
  method = "qda",
  trControl = trainControl(method = "cv"),
  na.action = na.exclude
)

# Print the resampling summary (accuracy / kappa).
qda.fit
## Quadratic Discriminant Analysis
##
## 130925 samples
## 11 predictor
## 2 classes: 'Bad', 'Good'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 101338, 101338, 101339, 101338, 101337, 101339, ...
## Resampling results:
##
## Accuracy Kappa
## 0.6815307 0.1078945
# Classification tree (rpart). Unlike the other models this one also uses
# review_count as a predictor. Tuned over 10 candidate parameter values using
# 3-fold cross-validation repeated 3 times.
tree.fit <- train(
  rating ~ review_count + smoking + dogs_allowed + drive_thru +
    outdoor_seating + restaurants_delivery + wi_fi + caters +
    restaurants_reservations + restaurants_take_out + happy_hour + alcohol,
  data = train,
  method = "rpart",
  trControl = trainControl(
    method = "repeatedcv",
    number = 3,
    repeats = 3
  ),
  na.action = na.exclude,
  tuneLength = 10
)

# Draw the tree selected by the tuning procedure.
fancyRpartPlot(tree.fit$finalModel)

# Logistic regression (binomial GLM) on the same attribute predictors as the
# LDA/QDA models, with caret's default cross-validation (10 folds).
# Rows with missing values in the model variables are excluded.
glm.fit <- train(
  rating ~ smoking + dogs_allowed + drive_thru + outdoor_seating +
    restaurants_delivery + wi_fi + caters + restaurants_reservations +
    restaurants_take_out + happy_hour + alcohol,
  data = train,
  method = "glm",
  family = "binomial",
  trControl = trainControl(method = "cv"),
  na.action = na.exclude
)

# Print the resampling summary (accuracy / kappa).
glm.fit
## Generalized Linear Model
##
## 130925 samples
## 11 predictor
## 2 classes: 'Bad', 'Good'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 101339, 101338, 101337, 101338, 101338, 101338, ...
## Resampling results:
##
## Accuracy Kappa
## 0.6961758 0.06246015