library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.0     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(rlist)
library(tree)
## Registered S3 method overwritten by 'tree':
##   method     from
##   print.tree cli
library(ISLR)
library(e1071)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(permute)
library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(mlbench)
library(rattle)
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
# Fix the RNG seed so the random train/test split further down is repeatable.
set.seed(1)

# Business-level table: ids, location fields, stars, review_count, is_open
# (see the column specification echoed below).
businessHL <- read_csv(
  "/Users/michael/Dropbox/MSDA/Fall 2021/Applications/Individual project/yelp_business.csv"
)
## Rows: 174567 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): business_id, name, neighborhood, address, city, state, postal_code,...
## dbl (5): latitude, longitude, stars, review_count, is_open
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Attribute-level table: one character column per business attribute,
# keyed by business_id (see the column specification echoed below).
businessLL <- read_csv(
  "/Users/michael/Dropbox/MSDA/Fall 2021/Applications/Individual project/Data/yelp_business_attributes.csv"
)
## Rows: 152041 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (28): business_id, ByAppointmentOnly, BusinessAcceptsCreditCards, Busine...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Work with plain data frames from here on.
bHL <- as.data.frame(businessHL)
bLL <- as.data.frame(businessLL)

# Left join on business_id: every business row is kept, and attribute columns
# are NA where no attribute record matches.
merged <- merge(x = bHL, y = bLL, by = "business_id", all.x = TRUE)

# Quick look at the distribution of star ratings.
hist(bHL$stars)

# Binary response: "Good" when stars >= 3.5, "Bad" when below; missing stars
# stay NA. Stored directly as a factor so the classifiers treat it as a class
# label.
merged <- merged %>%
  mutate(rating = factor(if_else(stars >= 3.5, "Good", "Bad")))

# Random 75% / 25% split of the merged rows.
smp_size <- floor(nrow(merged) * 0.75)

# Row positions drawn without replacement for the training set.
train_ind <- sample.int(nrow(merged), size = smp_size)

# Training rows are the sampled positions; test rows are everything else.
train <- merged[train_ind, ]
test <- merged[-train_ind, ]


# Normalise column names (special characters and whitespace become
# underscores) so they can be used directly in model formulas.
train <- clean_names(train)
test <- clean_names(test)

# Drop variables that take only a single distinct value in the training data.
# The mask is computed once, from the unfiltered training frame, and the same
# set of columns is then kept in both frames. Previously the mask for `test`
# was recomputed from `train` after `train` had already been filtered, so it
# no longer identified the dropped columns and `test` was left out of sync.
keep_cols <- vapply(train, function(x) length(unique(x)) > 1, logical(1L))
kept_names <- names(train)[keep_cols]

train <- train[kept_names]
# Select by name so train and test end up with the same columns.
test <- test[kept_names]
# Linear discriminant analysis of rating on the eleven business-attribute
# predictors, fitted through caret with "cv" resampling. Rows with missing
# values are handled by na.exclude.
lda.fit <- train(
  rating ~
    smoking +
    dogs_allowed +
    drive_thru +
    outdoor_seating +
    restaurants_delivery +
    wi_fi +
    caters +
    restaurants_reservations +
    restaurants_take_out +
    happy_hour +
    alcohol,
  data = train,
  method = "lda",
  na.action = na.exclude,
  trControl = trainControl(method = "cv")
)

# Show the resampled accuracy / kappa summary.
print(lda.fit)
## Linear Discriminant Analysis 
## 
## 130925 samples
##     11 predictor
##      2 classes: 'Bad', 'Good' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 101339, 101337, 101338, 101339, 101339, 101337, ... 
## Resampling results:
## 
##   Accuracy   Kappa     
##   0.6960247  0.06236938
# Quadratic discriminant analysis of rating on the same eleven
# business-attribute predictors, fitted through caret with "cv" resampling.
# Rows with missing values are handled by na.exclude.
qda.fit <- train(
  rating ~
    smoking +
    dogs_allowed +
    drive_thru +
    outdoor_seating +
    restaurants_delivery +
    wi_fi +
    caters +
    restaurants_reservations +
    restaurants_take_out +
    happy_hour +
    alcohol,
  data = train,
  method = "qda",
  na.action = na.exclude,
  trControl = trainControl(method = "cv")
)

# Show the resampled accuracy / kappa summary.
print(qda.fit)
## Quadratic Discriminant Analysis 
## 
## 130925 samples
##     11 predictor
##      2 classes: 'Bad', 'Good' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 101338, 101338, 101339, 101338, 101337, 101339, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.6815307  0.1078945
# Classification tree (rpart) for rating. Unlike the other models in this
# script, this formula also includes review_count. Tuned over 10 candidate
# values (tuneLength) using 3-fold cross-validation repeated 3 times.
tree.fit <- train(
  rating ~
    review_count +
    smoking +
    dogs_allowed +
    drive_thru +
    outdoor_seating +
    restaurants_delivery +
    wi_fi +
    caters +
    restaurants_reservations +
    restaurants_take_out +
    happy_hour +
    alcohol,
  data = train,
  method = "rpart",
  tuneLength = 10,
  na.action = na.exclude,
  trControl = trainControl(method = "repeatedcv", number = 3, repeats = 3)
)

# Draw the tree selected by the tuning step.
fancyRpartPlot(tree.fit$finalModel)

# Logistic regression (binomial GLM) of rating on the eleven
# business-attribute predictors, fitted through caret with "cv" resampling.
# Rows with missing values are handled by na.exclude.
glm.fit <- train(
  rating ~
    smoking +
    dogs_allowed +
    drive_thru +
    outdoor_seating +
    restaurants_delivery +
    wi_fi +
    caters +
    restaurants_reservations +
    restaurants_take_out +
    happy_hour +
    alcohol,
  data = train,
  method = "glm",
  family = "binomial",
  na.action = na.exclude,
  trControl = trainControl(method = "cv")
)

# Show the resampled accuracy / kappa summary.
print(glm.fit)
## Generalized Linear Model 
## 
## 130925 samples
##     11 predictor
##      2 classes: 'Bad', 'Good' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 101339, 101338, 101337, 101338, 101338, 101338, ... 
## Resampling results:
## 
##   Accuracy   Kappa     
##   0.6961758  0.06246015