Master Analysis

For our Chopped 2 data set we were given a data set that included information about customers and their rating on restaurants that they ate at.

Our group started with running a simple regression just to see what the results would be.

library(car)
library(tree)
library(textir)

## Loading required package: distrom

## Loading required package: Matrix

## Loading required package: gamlr

## Loading required package: parallel

library(MASS)
library(class)
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

RRD <- read.csv("~/DataMining/Data/RestaurantRatersComplete.csv")
RRDE <- read.csv("~/DataMining/Data/RRD Edited2.csv")
RRDE2 <- read.csv("~/DataMining/Data/RRD Edited2.csv")
RRDE3 <- read.csv("~/DataMining/Data/RRD Edited2.csv")

m1=lm(rating~.,data=RRDE)
summary(m1)

## 
## Call:
## lm(formula = rating ~ ., data = RRDE)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.91982 -0.07269 -0.01253  0.06254  1.90696 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    5.8934985  1.6133807   3.653 0.000263 ***
## smokerTRUE                    -0.0343015  0.0259226  -1.323 0.185837    
## drink_levelcasual drinker     -0.0680991  0.0254250  -2.678 0.007428 ** 
## drink_levelsocial drinker     -0.1078001  0.0253675  -4.250 2.19e-05 ***
## dress_preferenceformal        -0.0806879  0.0672568  -1.200 0.230329    
## dress_preferenceinformal      -0.2004827  0.0695671  -2.882 0.003975 ** 
## dress_preferenceno preference -0.2006370  0.0712205  -2.817 0.004870 ** 
## ambiencefriends                0.0512738  0.0222279   2.307 0.021122 *  
## ambiencesolitary              -0.0339017  0.0267831  -1.266 0.205665    
## transporton foot              -0.0810585  0.0323891  -2.503 0.012367 *  
## transportpublic               -0.1638244  0.0256980  -6.375 2.04e-10 ***
## marital_statussingle           0.1276733  0.0509882   2.504 0.012321 *  
## marital_statuswidow            0.5316636  0.1003315   5.299 1.23e-07 ***
## hijosindependent              -0.0918057  0.0833669  -1.101 0.270866    
## hijoskids                     -0.3145672  0.0840026  -3.745 0.000183 ***
## birth_year                    -0.0027530  0.0008356  -3.295 0.000994 ***
## religionChristian             -0.1012856  0.0531072  -1.907 0.056569 .  
## religionJewish                -0.4772020  0.0978924  -4.875 1.13e-06 ***
## religionMormon                -0.3960469  0.1122700  -3.528 0.000424 ***
## religionnone                  -0.1064383  0.0285816  -3.724 0.000199 ***
## activitystudent                0.0260464  0.0314700   0.828 0.407915    
## activityunemployed            -0.4237667  0.1426878  -2.970 0.002997 ** 
## activityworking-class          0.2436041  0.1380733   1.764 0.077758 .  
## budgetlow                      0.1024578  0.0643328   1.593 0.111326    
## budgetmedium                   0.1296879  0.0622091   2.085 0.037160 *  
## food_rating                    0.4019010  0.0128045  31.387  < 2e-16 ***
## service_rating                 0.4430751  0.0120789  36.682  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3553 on 3916 degrees of freedom
## Multiple R-squared:  0.8291, Adjusted R-squared:  0.828 
## F-statistic: 730.7 on 26 and 3916 DF,  p-value: < 2.2e-16

Next we decided to also run a logistic regression. Which required a little bit of recoding in order for the regression to run smooth.

RRDE3$smoker=recode(RRDE3$smoker,"'FALSE'=0; 'TRUE'=1")
RRDE3$drink_level=recode(RRDE3$drink_level,"'abstemious'=0; 'casual drinker'=1; 'social drinker'=2")
RRDE3$dress_preference=recode(RRDE3$dress_preference,"'no preference'=0; 'informal'=1; 'formal'=2; 'elegant'=3")
RRDE3$ambience=recode(RRDE3$ambience,"'solitary'=0; 'friends'=1; 'family'=2")
RRDE3$transport=recode(RRDE3$transport,"'on foot'=0; 'public'=1; 'car owner'=2")
RRDE3$marital_status=recode(RRDE3$marital_status,"'single'=0; 'widowed'=1; 'married'=2")
RRDE3$hijos=recode(RRDE3$hijos,"'dependent'=0; 'independent'=1; 'kids'=2")
RRDE3$activity=recode(RRDE3$activity,"'unemployed'=0; 'student'=1; 'working-class'=2; 'professional'=3")
RRDE3$budget=recode(RRDE3$budget,"'low'=0; 'medium'=1; 'high'=2")
RRDE3$religion=recode(RRDE3$religion,"'none'=0; 'Catholic'=1; 'Christian'=1; 'Jewish'=1; 'Mormon'=1")
RRDE3$rating=recode(RRDE3$rating,"'0'=0; '1'=0; '2'=1")
##Logistic Regression
m2=glm(rating~., family=binomial,data=RRDE3)
m2

## 
## Call:  glm(formula = rating ~ ., family = binomial, data = RRDE3)
## 
## Coefficients:
##         (Intercept)               smoker         drink_level1  
##            69.87837             -0.58323              0.22203  
##        drink_level2    dress_preference1    dress_preference2  
##             0.01060              0.17308              0.25062  
##   dress_preference3            ambience1            ambience2  
##             0.51823              0.91948              0.83211  
##          transport1           transport2      marital_status2  
##            -1.25954             -0.28712              0.04129  
## marital_statuswidow               hijos1               hijos2  
##             2.75872             -0.95435             -2.33720  
##          birth_year            religion1            activity1  
##            -0.04271              0.87638             10.14821  
##           activity2            activity3              budget1  
##            12.13832              9.37795              0.19138  
##             budget2          food_rating       service_rating  
##            -2.09213              1.51763              2.14975  
## 
## Degrees of Freedom: 3942 Total (i.e. Null);  3919 Residual
## Null Deviance:       4849 
## Residual Deviance: 2025  AIC: 2073

summary(m2)

## 
## Call:
## glm(formula = rating ~ ., family = binomial, data = RRDE3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1271  -0.2062  -0.1098   0.3141   3.4999  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          69.87837  266.57180   0.262 0.793216    
## smoker               -0.58323    0.21708  -2.687 0.007216 ** 
## drink_level1          0.22203    0.21634   1.026 0.304752    
## drink_level2          0.01060    0.22241   0.048 0.962004    
## dress_preference1     0.17308    0.23504   0.736 0.461516    
## dress_preference2     0.25062    0.21875   1.146 0.251937    
## dress_preference3     0.51823    0.45785   1.132 0.257691    
## ambience1             0.91948    0.27926   3.293 0.000993 ***
## ambience2             0.83211    0.21899   3.800 0.000145 ***
## transport1           -1.25954    0.28488  -4.421 9.81e-06 ***
## transport2           -0.28712    0.30154  -0.952 0.341002    
## marital_status2       0.04129    0.45160   0.091 0.927156    
## marital_statuswidow   2.75872    0.68822   4.008 6.11e-05 ***
## hijos1               -0.95435    1.07941  -0.884 0.376620    
## hijos2               -2.33719    1.11829  -2.090 0.036620 *  
## birth_year           -0.04271    0.00749  -5.702 1.18e-08 ***
## religion1             0.87638    0.24572   3.567 0.000362 ***
## activity1            10.14821  266.15825   0.038 0.969585    
## activity2            12.13832  266.17031   0.046 0.963626    
## activity3             9.37795  266.15823   0.035 0.971893    
## budget1               0.19138    0.17623   1.086 0.277480    
## budget2              -2.09213    0.48374  -4.325 1.53e-05 ***
## food_rating           1.51763    0.10348  14.666  < 2e-16 ***
## service_rating        2.14975    0.10568  20.342  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4849.2  on 3942  degrees of freedom
## Residual deviance: 2024.7  on 3919  degrees of freedom
## AIC: 2072.7
## 
## Number of Fisher Scoring iterations: 13

Next we recoded some more and got into our main analysis which was CART Analysis.

RRDE2$smoker=recode(RRDE2$smoker,"'FALSE'=0; 'TRUE'=1")
RRDE2$drink_level=recode(RRDE2$drink_level,"'abstemious'=0; 'casual drinker'=1; 'social drinker'=2")
RRDE2$dress_preference=recode(RRDE2$dress_preference,"'no preference'=0; 'informal'=1; 'formal'=2; 'elegant'=3")
RRDE2$ambience=recode(RRDE2$ambience,"'solitary'=0; 'friends'=1; 'family'=2")
RRDE2$transport=recode(RRDE2$transport,"'on foot'=0; 'public'=1; 'car owner'=2")
RRDE2$marital_status=recode(RRDE2$marital_status,"'single'=0; 'widowed'=1; 'married'=2")
RRDE2$hijos=recode(RRDE2$hijos,"'dependent'=0; 'independent'=1; 'kids'=2")
RRDE2$activity=recode(RRDE2$activity,"'unemployed'=0; 'student'=1; 'working-class'=2; 'professional'=3")
RRDE2$budget=recode(RRDE2$budget,"'low'=0; 'medium'=1; 'high'=2")
RRDE2$religion=recode(RRDE2$religion,"'none'=0; 'Catholic'=1; 'Christian'=1; 'Jewish'=1; 'Mormon'=1")

length(RRDE2$rating)

## [1] 3943

## Construct the tree
rtree <- tree(rating ~.-food_rating-service_rating, data=RRDE2, mindev=0.1, mincut=1)
rtree <- tree(rating ~.-food_rating-service_rating, data=RRDE2, mincut=1)
rtree

## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 3943 2892.00 0.8552  
##    2) hijos: 2 1633  400.90 0.1506  
##      4) budget: 0 1442    0.00 0.0000 *
##      5) budget: 1 191  121.20 1.2880 *
##    3) hijos: 0,1 2310 1108.00 1.3530  
##      6) transport: 1,2 2037  971.30 1.3040  
##       12) birth_year < 1965.5 222   71.32 1.7340 *
##       13) birth_year > 1965.5 1815  853.90 1.2520 *
##      7) transport: 0 273   95.28 1.7180 *

plot(rtree, col=8)
text(rtree, pretty=1)

frtree <- tree(food_rating ~.-rating-service_rating, data=RRDE2, mindev=0.1, mincut=1)
frtree <- tree(food_rating ~.-rating-service_rating, data=RRDE2, mincut=1)
frtree

## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 3943 3085.0 0.8844  
##   2) budget: 0 1980  708.3 0.2768  
##     4) hijos: 1 538  301.8 1.0190  
##       8) birth_year < 1988.5 226  129.4 0.6770 *
##       9) birth_year > 1988.5 312  126.9 1.2660 *
##     5) hijos: 2 1442    0.0 0.0000 *
##   3) budget: 1,2 1963  908.7 1.4970 *

plot(frtree, col=8)
text(frtree, pretty=1)

srtree <- tree(service_rating ~.-rating-food_rating, data=RRDE2, mindev=0.1, mincut=1)
srtree <- tree(service_rating ~.-rating-food_rating, data=RRDE2, mincut=1)
srtree

## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 3943 2678.00 0.7616  
##    2) hijos: 2 1633  398.80 0.1488  
##      4) budget: 0 1442    0.00 0.0000 *
##      5) budget: 1 191  125.80 1.2720 *
##    3) hijos: 0,1 2310 1232.00 1.1950  
##      6) birth_year < 1965.5 232   91.52 1.6030 *
##      7) birth_year > 1965.5 2078 1098.00 1.1490  
##       14) transport: 1,2 1815  896.90 1.1000 *
##       15) transport: 0 263  165.70 1.4900 *

plot(srtree, col=8)
text(srtree, pretty=1)

fullrtree <- tree(rating ~., data=RRDE2, mindev=0.1, mincut=1)
fullrtree <- tree(rating ~., data=RRDE2, mincut=1)
fullrtree

## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 3943 2892.00 0.85520  
##    2) food_rating < 0.5 1797  155.00 0.06233  
##      4) service_rating < 0.5 1713   27.72 0.01284 *
##      5) service_rating > 0.5 84   37.57 1.07100 *
##    3) food_rating > 0.5 2146  661.70 1.51900  
##      6) service_rating < 1.5 1196  329.10 1.23800  
##       12) food_rating < 1.5 653  137.30 1.09300 *
##       13) food_rating > 1.5 543  161.60 1.41300 *
##      7) service_rating > 1.5 950  119.60 1.87300 *

plot(fullrtree, col=8)
text(fullrtree, pretty=1)

We realized how correlated the different types of ratings might be, so we also decided to build trees without them being included (as shown above) to get a better understanding.

Based upon all of our analysis we found that a low food rating would negatively affect the overall rating. Customers cared more about how good the food was over the actual service they were receiving.

For the master analysis I decided to run a KNN analysis on our original data set.

n4=length(RRD$rating)
nt2=1000
set.seed(1) ## to make the calculations reproducible in repeated runs
train <- sample(1:n4,nt2)
b<- RRD
c<- data.frame(b)
d <-data.matrix(c, rownames.force = NA)
e <- scale(d)
e[1:3,]

##         userID     smoker drink_level dress_preference   ambience
## [1,] -1.985821 -0.2828268   -1.336302       0.05609234 -0.7302105
## [2,] -1.985821 -0.2828268   -1.336302       0.05609234 -0.7302105
## [3,] -1.985821 -0.2828268   -1.336302       0.05609234 -0.7302105
##       transport marital_status      hijos birth_year  interest personality
## [1,] -0.3174001      0.1322515 -0.6229571  0.3471495 0.7104335    1.134082
## [2,] -0.3174001      0.1322515 -0.6229571  0.3471495 0.7104335    1.134082
## [3,] -0.3174001      0.1322515 -0.6229571  0.3471495 0.7104335    1.134082
##      religion  activity     color    weight    budget   height  latitude
## [1,] 3.005739 0.2826399 -1.643762 0.2602649 0.9578806 1.054828 0.2115132
## [2,] 3.005739 0.2826399 -1.643762 0.2602649 0.9578806 1.054828 0.2115132
## [3,] 3.005739 0.2826399 -1.643762 0.2602649 0.9578806 1.054828 0.2115132
##       longitude   placeID    rating food_rating service_rating  Rcuisine
## [1,] -0.4551655 0.6349475 0.1572043   0.1232101      0.2797459 -1.820375
## [2,] -0.4551655 0.6232868 0.1572043   0.1232101      0.2797459 -1.820375
## [3,] -0.4551655 0.6155130 1.3262323   1.2556357      1.4930657 -1.820375
##        Upayment
## [1,] -0.2151928
## [2,] -0.2151928
## [3,] -0.2151928

mean(e)

## [1] 2.928559e-16

sd(e)

## [1] 0.9998826

nearest1 <- knn(train=e[train,],test=e[-train,],cl=RRD$rating[train],k=1)
nearest5 <- knn(train=e[train,],test=e[-train,],cl=RRD$rating[train],k=5)
head(data.frame(RRD$rating[-train],nearest1,nearest5))

##   RRD.rating..train. nearest1 nearest5
## 1                  1        1        1
## 2                  1        1        2
## 3                  2        1        2
## 4                  1        1        2
## 5                  1        1        2
## 6                  1        2        1

Now I decided tocalculate the proportion of correct classifications on this one training set.

pcorrn1=100*sum(RRD$rating[-train]==nearest1)/(n4-nt2)
pcorrn5=100*sum(RRD$rating[-train]==nearest5)/(n4-nt2)
pcorrn1

## [1] 89.38511

pcorrn5

## [1] 85.66343

next we will look at our Press Q and Chi-squared

numCorrn1=(pcorrn1/100)*n4
PressQ1=((n4-(numCorrn1*6))^2)/(n4*5)
PressQ1

## [1] 15572.02

qchisq(.95,5) ##critical value for chi-square with alpha = .05, k-1 df where k=6

## [1] 11.0705

numCorrn5=(pcorrn5/100)*n4
PressQ5=((n4-(numCorrn5*6))^2)/(n4*5)
PressQ5

## [1] 14018.88

And for our cross-validation, leaving one out.

pcorr=dim(10)
for (k in 1:10) {
  pred=knn.cv(e,RRD$rating,k)
  pcorr[k]=100*sum(RRD$rating==pred)/n4
}
pcorr

##  [1] 95.42787 92.98289 92.44499 91.54034 91.44254 90.85575 90.48900
##  [8] 90.46455 90.70905 89.80440

Based upon the results of this, k=1 is our best number of neighbors so that is the one we will want to use. Lastly we will run a confusion matrix to test the accuracy of our prediction.

closest <- data.frame(truetype=RRD$rating[-train],predtype=nearest1)
confusionMatrix(data=nearest1,reference=RRD$rating[-train])

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1    2
##          0 1293   30    1
##          1   53  609   87
##          2   12  145  860
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8939          
##                  95% CI : (0.8825, 0.9045)
##     No Information Rate : 0.4395          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8365          
##  Mcnemar's Test P-Value : 1.264e-06       
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2
## Sensitivity            0.9521   0.7768   0.9072
## Specificity            0.9821   0.9393   0.9267
## Pos Pred Value         0.9766   0.8131   0.8456
## Neg Pred Value         0.9632   0.9252   0.9575
## Prevalence             0.4395   0.2537   0.3068
## Detection Rate         0.4184   0.1971   0.2783
## Detection Prevalence   0.4285   0.2424   0.3291
## Balanced Accuracy      0.9671   0.8580   0.9169

With an accuracy of 89%, we know that this is a very accurate model!

Master Analysis

Jones

December 13, 2017