Analyzing Recipe Blogs

Load library

library(dplyr)
library(tidyr)
library(stringr)
library(dplyr)
library(lubridate)
library(tidytext)
library(tokenizers)
library(RColorBrewer)
library(wordcloud)
library(XML)
library(tm)
library(factoextra)
library(ggplot2)
library(rms)
library(caret)
library(car)
library(corrplot)
library(RColorBrewer)
library(SDMTools)
library(boot)
library(e1071)
library(qgraph)
library(igraph)
library(networkD3)

Load the data frame containing all recipes

load(file="all_recipes.RData")

Data Wrangling

Check for missing value

# NAs
colSums(is.na(all_recipes_df))
##              name      rating_value      rating_count     recipe_author 
##                 0               188               188               123 
##    recipe_cuisine recipe_categories         prep_time         cook_time 
##               129               126               125               112 
##          tot_time      recipe_yield       ingredients      instructions 
##               108                37                22                22
# Assing and ID number to each recipe
all_recipes_df <- all_recipes_df %>% mutate(ID = 1:nrow(all_recipes_df))

# Construct a data frame using words appearing in ingredients
ingrdt <- all_recipes_df %>% 
  dplyr::select(ID,ingredients) %>%
  mutate(ingredients = str_replace(ingredients, "\n", " ") %>% str_replace("<.*?>", " ")) %>%
  unnest_tokens(word, ingredients)
  • common words in the ingredients
ingrdt %>% count(word, sort = TRUE) %>% slice(1:10)
## # A tibble: 10 x 2
##    word            n
##    <chr>       <int>
##  1 1            2239
##  2 or           1382
##  3 cup          1200
##  4 2            1172
##  5 teaspoon     1019
##  6 and           634
##  7 cups          620
##  8 tablespoons   566
##  9 chopped       564
## 10 a             540

The ingredients dataframe contains many stop words and number which are uninformative.Let’s remove such words

# remove the following (which is not included in stopwords package)
word_remove = c("cup", "cups", "teaspoon", "teaspoons", "tablespoon", "tablespoons", 
                "ounce", "ounces", "lb", "lbs", "tbs", "tsp", "oz", "handful", "handfull",
                "inch", "can","chopped", "cut","pound","cubes", "ground", "optional","small","powder","_blank","taste",
                "large","cooked","http","diced","drained","href", "black","red","target","white","fresh",
                "freshly","www.thefullhelping.com", "green", "em","sced","g","minced","vegan","strong",   "rinsed","peeled","brown","choice","yellow","dried","finely","medium","maple","crushed","substitute","extra","pieces",
                "raw")

ingrdt <- ingrdt %>% 
  filter(!(word %in% stopwords::stopwords())) %>%
  filter(!(word %in% word_remove)) %>%
  filter(!(str_detect(word, "[0-9]")))  # Remove numbers as well

Fix some missing letter in the words

ingrdt$word <-  gsub("garc","garlic",ingrdt$word)
ingrdt$word <-  ifelse( ingrdt$word == "cloves" |ingrdt$word == "clove" ,"cloves",gsub("ove","olive",ingrdt$word))

Check the most common words again

top25_ingrdt <- ingrdt %>% count(word, sort = TRUE) %>% slice(1:25)
ingrdt_top_words <- ingrdt  %>% filter(word %in% top25_ingrdt$word) %>% distinct()

make a word cloud

# fix some wording 
pal<- brewer.pal(8,"Dark2")

wordcloud(words = top25_ingrdt$word, freq =top25_ingrdt$n, color = pal)

ingrdt_2 <-ingrdt %>%
filter(word %in% top25_ingrdt$word)

Construct features (columns) from the word counts for each recipe using the spread function:

# Spread the word counts to columns
ingrdt_2<- ingrdt_2 %>% 
  group_by(ID) %>%
  count(word) %>%
  spread(key = word, value = n, fill = 0) %>%
  ungroup()


head(ingrdt_2)
## # A tibble: 6 x 26
##      ID apple beans cashew chickpeas cloves coconut flour garlic ginger
##   <int> <dbl> <dbl>  <dbl>     <dbl>  <dbl>   <dbl> <dbl>  <dbl>  <dbl>
## 1     1    1.    0.     0.        0.     0.      0.    0.     0.     0.
## 2     2    0.    1.     0.        0.     1.      0.    0.     1.     0.
## 3     3    1.    0.     0.        0.     0.      0.    2.     0.     1.
## 4     4    0.    0.     2.        0.     1.      0.    0.     1.     0.
## 5     5    0.    2.     0.        0.     0.      0.    0.     0.     0.
## 6     6    0.    0.     0.        0.     0.      1.    1.     0.     0.
## # ... with 16 more variables: juice <dbl>, leaves <dbl>, lemon <dbl>,
## #   milk <dbl>, oil <dbl>, olive <dbl>, onion <dbl>, pepper <dbl>,
## #   rice <dbl>, salt <dbl>, seeds <dbl>, sugar <dbl>, syrup <dbl>,
## #   vegetable <dbl>, vinegar <dbl>, water <dbl>

The amount is not our primary concern; thus, we replace the number with 1 to represent the appearence of the ingredient.

vars <- setdiff(names(ingrdt_2), "ID")
  ingrdt_2 <- ingrdt_2 %>%
   mutate_at(vars, function(x) ifelse(x > 0, 1, 0))
  
    ingrdt_2
## # A tibble: 393 x 26
##       ID apple beans cashew chickpeas cloves coconut flour garlic ginger
##    <int> <dbl> <dbl>  <dbl>     <dbl>  <dbl>   <dbl> <dbl>  <dbl>  <dbl>
##  1     1    1.    0.     0.        0.     0.      0.    0.     0.     0.
##  2     2    0.    1.     0.        0.     1.      0.    0.     1.     0.
##  3     3    1.    0.     0.        0.     0.      0.    1.     0.     1.
##  4     4    0.    0.     1.        0.     1.      0.    0.     1.     0.
##  5     5    0.    1.     0.        0.     0.      0.    0.     0.     0.
##  6     6    0.    0.     0.        0.     0.      1.    1.     0.     0.
##  7     7    1.    0.     0.        0.     0.      1.    0.     0.     0.
##  8     8    1.    0.     0.        1.     1.      0.    0.     1.     1.
##  9     9    1.    0.     0.        0.     0.      0.    0.     0.     0.
## 10    10    0.    1.     0.        0.     0.      1.    0.     0.     0.
## # ... with 383 more rows, and 16 more variables: juice <dbl>,
## #   leaves <dbl>, lemon <dbl>, milk <dbl>, oil <dbl>, olive <dbl>,
## #   onion <dbl>, pepper <dbl>, rice <dbl>, salt <dbl>, seeds <dbl>,
## #   sugar <dbl>, syrup <dbl>, vegetable <dbl>, vinegar <dbl>, water <dbl>

Principal components

## Principal components for ingredients
data <- ingrdt_2 %>% dplyr::select(-ID)
pc  <- prcomp(data, scale = TRUE)

# Plot the first two principal components
biplot(pc, scale = FALSE, cex = c(0.2, 0.8) )

eig.val <- get_eigenvalue(pc)
eig.val
##        eigenvalue variance.percent cumulative.variance.percent
## Dim.1   4.4425432       17.7701728                    17.77017
## Dim.2   2.2744773        9.0979091                    26.86808
## Dim.3   1.9405640        7.7622558                    34.63034
## Dim.4   1.8158233        7.2632931                    41.89363
## Dim.5   1.5937500        6.3750002                    48.26863
## Dim.6   1.1446240        4.5784962                    52.84713
## Dim.7   1.0919549        4.3678196                    57.21495
## Dim.8   1.0558077        4.2232307                    61.43818
## Dim.9   0.9714770        3.8859079                    65.32409
## Dim.10  0.9100279        3.6401116                    68.96420
## Dim.11  0.8667402        3.4669607                    72.43116
## Dim.12  0.8318936        3.3275746                    75.75873
## Dim.13  0.7389551        2.9558203                    78.71455
## Dim.14  0.7191923        2.8767694                    81.59132
## Dim.15  0.6799342        2.7197369                    84.31106
## Dim.16  0.6599244        2.6396977                    86.95076
## Dim.17  0.6023077        2.4092309                    89.35999
## Dim.18  0.5496512        2.1986046                    91.55859
## Dim.19  0.4907603        1.9630414                    93.52163
## Dim.20  0.3833307        1.5333229                    95.05496
## Dim.21  0.3447263        1.3789051                    96.43386
## Dim.22  0.2999104        1.1996418                    97.63350
## Dim.23  0.2684088        1.0736352                    98.70714
## Dim.24  0.2126633        0.8506530                    99.55779
## Dim.25  0.1105522        0.4422088                   100.00000
#transpose data & get pc again. 
# first remember the names
n <- ingrdt_2$ID

# transpose all but the first column (name)
ingrdt_2_t <- as.data.frame(t(ingrdt_2[,-1]))

PC_V2 <- prcomp(ingrdt_2_t )

fviz_pca_ind(PC_V2 ,
             col.ind = "cos2", # Color by the quality of representation
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE     # Avoid text overlapping
             )

  • we can see several clusters in the plot, baking ingredients (milk, flour, sugar) are close to each other; and of course, the cooking essentials, oil and salt are close.

  • The distance between variables and the origin measures the quality of the variables on the factor map. Variables that are away from the origin are well represented on the factor map.

  • the higher cos2 value : the higher quality of representation.

network plot (testing)

ingrdt_3 <- ingrdt_2

cormatrix <- cor_auto(ingrdt_3[,-1])
## Variables detected as ordinal: apple; beans; cashew; chickpeas; cloves; coconut; flour; garlic; ginger; juice; leaves; lemon; milk; oil; olive; onion; pepper; rice; salt; seeds; sugar; syrup; vegetable; vinegar; water
## Warning in cor_auto(ingrdt_3[, -1]): Correlation matrix is not positive
## definite. Finding nearest positive definite matrix
graph1<-qgraph(cormatrix, graph="glasso", layout="spring", sampleSize = nrow(data),
               vsize=7, cut=0, maximum=.45, border.width=1.5)
## Warning in EBICglassoCore(S = S, n = n, gamma = gamma, penalize.diagonal =
## penalize.diagonal, : Network with lowest lambda selected as best network.
## Try setting 'lambda.min.ratio' lower.
## Warning in EBICglassoCore(S = S, n = n, gamma = gamma, penalize.diagonal
## = penalize.diagonal, : A dense regularized network was selected (lambda <
## 0.1 * lambda.max). Recent work indicates a possible drop in specificity.
## Interpret the presence of the smallest edges with care. Setting threshold =
## TRUE will enforce high specificity, at the cost of sensitivity.

g = as.igraph(graph1, attributes=TRUE)

ratings

#distribution of avg. rating
p1 <- ggplot(all_recipes_df, aes(rating_value)) + 
  geom_histogram(fill = "skyblue") + 
  geom_vline(xintercept = median(all_recipes_df$rating_value, binwidth = 0.1, na.rm = T), na.rm = TRUE, size = 0.6) + 
  ylab("Counts") + ggtitle("Distribution of Average Ratings")

p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 188 rows containing non-finite values (stat_bin).

appearently most of the rating are 5, so let’s separent the groups by “Rating = 5” and “Rating =lower than 5”.

#number of recipe that has rating lower than 5
low  <- all_recipes_df %>% filter(all_recipes_df$rating_value < 5) %>% count()
high <-  all_recipes_df %>% filter(all_recipes_df$rating_value == 5) %>% count()

d <- data.frame(low = low, high = high)
colnames(d) <- c("low","high")
d
##   low high
## 1  75  154

see which auther has higest rating

author <- all_recipes_df %>% dplyr::select(recipe_author,rating_value)
author$recipe_author <- as.factor(author$recipe_author)

author <- author %>% group_by(recipe_author) %>% summarise(rating = mean(rating_value, na.rm = T)) %>% na.omit()
#oder from the highest

author[order(author$rating,decreasing = T),]
## # A tibble: 16 x 2
##    recipe_author                    rating
##    <fct>                             <dbl>
##  1 Adapted from Molly Wizenberg       5.00
##  2 Alexandra Stafford                 5.00
##  3 America's Test Kitchen             5.00
##  4 Angela Liddon                      5.00
##  5 Anya Kassoff                       5.00
##  6 Brandi Doming                      5.00
##  7 Emilie Raffa                       5.00
##  8 Ethan Ciment and Michael Suchman   5.00
##  9 Hannah Kaminsky                    5.00
## 10 Kathryne Taylor                    5.00
## 11 Kim-Julie Hansen                   5.00
## 12 Richa Hingle                       5.00
## 13 Gena Hamshaw                       4.86
## 14 Celine Steen                       4.80
## 15 Ali Maffucci                       4.50
## 16 Jackie Sobon                       4.00
ggplot(author, aes(reorder(recipe_author,rating),rating))+geom_col( fill = "skyblue")+coord_flip()

Modeling - recipe category

  • wrangling

  • convert total time to minutes

#delete redundent letters
time <- str_sub(all_recipes_df$tot_tim, 3)
#convert 1H to 60 min
time  <- ifelse(time  == "1H" ,"1H0M",time  ) 
time  <- str_sub(time,0,-2)
time  <- ifelse(nchar(time )>2, gsub("H",":",time ), paste("0:",time ))
time  <- lubridate::hm(time )
## Warning in .parse_hms(..., order = "HM", quiet = quiet): Some strings
## failed to parse, or all strings are NAs
time  <-  hour(time )*60 + minute(time )

all_recipes_df$tot_time <- time 
category  <- all_recipes_df %>% 
  select(ID, recipe_categories) %>%
  unnest_tokens(words ,recipe_categories ,token = 'regex', pattern=",") %>% na.omit()

#check for duplicated words 
category_check <- category %>%select(words) %>%  group_by(words) %>% count()

# remove redundant characters 
category <- category %>% mutate (words = gsub("&amp;","and", category$words))
category <- category %>% mutate (words = gsub("dips","dip", category$words))
category <- category %>% mutate (words = gsub("optional","", category$words))
category <- category %>% mutate (words = gsub("option","", category$words))
category <- category %>% mutate (words = gsub("nut free","nut-free", category$words))
category <- category %>% mutate (words = gsub("sauce","sauces", category$words))
category <- category %>% mutate (words = gsub("and sauces","sauces", category$words))
category <- category %>% mutate (words = gsub("side dishes","side dish", category$words))
category <- category %>% mutate (words = gsub("small plates","small plate", category$words))


#spread the table 
category <-category %>%
 group_by(ID) %>%
 count(words) %>%
 spread(key = words, value = n, fill = 0) %>%
 ungroup()

category_df <- data.frame(ID = all_recipes_df$ID ,rating = all_recipes_df$rating_value, 
                          total_time = all_recipes_df$tot_time, all_recipes_df$recipe_author)

category_df <- right_join(category_df, category)
## Joining, by = "ID"
# remove NA in ratings 
category_df <- na.omit(category_df)

head(category_df)
##   ID rating total_time all_recipes_df.recipe_author  appetizer  baked good
## 3  3      5         45                 Gena Hamshaw          0           0
## 4  4      5         30                 Gena Hamshaw          0           0
## 5  6      5         32                 Gena Hamshaw          0           0
## 6  7      5         45                 Gena Hamshaw          0           0
## 7  8      5        190 Adapted from Molly Wizenberg          0           0
## 8  9      5         45                 Gena Hamshaw          0           0
##    baking  bowls  breakfast  brunch  cake  cookie  cookies  dessert  dip
## 3       0      0          0       0     0       0        0        0    0
## 4       0      0          0       0     0       0        0        0    0
## 5       0      0          0       0     0       0        0        0    0
## 6       0      0          1       0     0       0        0        0    0
## 7       0      0          0       0     0       0        0        0    0
## 8       0      0          0       0     0       0        0        0    0
##    dressing  entree  entree slow cooker  gluten free  gluten free 
## 3         0       0                   0            0             0
## 4         0       0                   0            0             0
## 5         0       0                   0            0             0
## 6         0       0                   0            0             0
## 7         0       0                   0            0             0
## 8         0       0                   0            0             0
##    holidays  light bites  main  main dish  no oil  nut-free
## 3         0            0     0          0       0         0
## 4         0            0     0          0       0         0
## 5         0            0     0          0       0         0
## 6         0            0     0          0       0         0
## 7         0            0     0          0       0         0
## 8         0            0     0          0       0         0
##    quick and easy  quickbread  salad  sauces  saucess  side  side dish
## 3               0           0      0       0        0     0          0
## 4               0           0      0       0        0     0          0
## 5               0           0      0       0        0     0          0
## 6               0           0      0       0        0     0          0
## 7               0           0      0       0        0     0          1
## 8               0           0      0       0        0     0          0
##    small plate  snack  soup  soy free  soy-free   spread  spreads  starter
## 3            0      0     0         0          0       0        0        0
## 4            0      0     0         0          0       0        0        0
## 5            0      0     0         0          0       0        0        0
## 6            0      0     0         0          0       0        0        0
## 7            0      0     0         0          0       0        0        0
## 8            0      1     0         0          0       0        0        0
##    stew  topping  tree nut-free appetizer baked goods baking bread
## 3     0        0              0         0           0      0     0
## 4     0        0              0         0           0      0     0
## 5     0        0              0         0           0      0     0
## 6     0        0              0         0           0      0     0
## 7     0        0              0         0           0      0     0
## 8     0        0              0         0           0      0     0
##   breakfast cookie dessert dip dressing entree gluten-free grain main
## 3         0      0       1   0        0      0           0     0    0
## 4         0      0       0   0        0      0           0     0    0
## 5         0      0       1   0        0      0           0     0    0
## 6         0      0       0   0        0      0           0     0    0
## 7         0      0       0   0        0      0           0     0    0
## 8         1      0       0   0        0      0           0     0    0
##   main dish pasta salad salads sauces side side dish slow cooker
## 3         0     0     0      0      0    0         0           0
## 4         1     0     0      0      0    0         0           0
## 5         0     0     0      0      0    0         0           0
## 6         1     0     0      0      0    0         0           0
## 7         1     0     0      0      0    0         0           0
## 8         0     0     0      0      0    0         0           0
##   small plate snack soup sweets toast vegan
## 3           0     0    0      0     0     0
## 4           0     0    0      0     0     0
## 5           0     0    0      0     0     0
## 6           0     0    0      0     0     0
## 7           0     0    0      0     0     0
## 8           0     0    0      0     0     0
#make the dependent variable "rating" into binary outcome
category_df$rating <- ifelse(category_df$rating == 5 , 1, 0)

#check the propotion

d <- sum(category_df$rating) # number of rating = 5
d
## [1] 140

looks fine for logistic regression

#set sample 
sample_size <- floor(0.75*nrow(category_df) )
set.seed(12343)
train_int <-  sample(seq_len(nrow(category_df)), size = sample_size)
train <- category_df[train_int,]
test<- category_df[-train_int,]
  • Build modle
logisticModel <- glm(rating ~.   , family = "binomial", train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logisticModel)
## 
## Call:
## glm(formula = rating ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
##  -8.49    0.00    0.00    0.00    8.49  
## 
## Coefficients: (19 not defined because of singularities)
##                                                                Estimate
## (Intercept)                                                   1.058e+16
## ID                                                            1.570e+12
## total_time                                                   -8.600e+12
## all_recipes_df.recipe_authorAlexandra Stafford               -6.223e+15
## all_recipes_df.recipe_authorAli Maffucci                     -1.288e+16
## all_recipes_df.recipe_authorAmerica's Test Kitchen           -4.221e+15
## all_recipes_df.recipe_authorAngela Liddon                    -9.915e+15
## all_recipes_df.recipe_authorAnya Kassoff                     -7.511e+15
## all_recipes_df.recipe_authorCeline Steen                     -3.816e+15
## all_recipes_df.recipe_authorEmilie Raffa                     -6.106e+15
## all_recipes_df.recipe_authorEthan Ciment and Michael Suchman -8.354e+15
## all_recipes_df.recipe_authorGena Hamshaw                     -6.966e+15
## all_recipes_df.recipe_authorHannah Kaminsky                  -5.287e+15
## all_recipes_df.recipe_authorJackie Sobon                     -1.501e+16
## all_recipes_df.recipe_authorKathryne Taylor                  -8.332e+15
## all_recipes_df.recipe_authorKim-Julie Hansen                 -5.658e+15
## ` appetizer`                                                 -5.815e+14
## ` baked good`                                                -7.654e+14
## ` baking`                                                    -4.793e+15
## ` bowls`                                                      1.220e+15
## ` breakfast`                                                  4.508e+15
## ` brunch`                                                    -2.572e+15
## ` cake`                                                      -4.904e+15
## ` cookie`                                                            NA
## ` cookies`                                                           NA
## ` dessert`                                                    7.202e+14
## ` dip`                                                        4.565e+15
## ` dressing`                                                          NA
## ` entree`                                                     2.316e+15
## ` entree slow cooker`                                         7.194e+15
## ` gluten free`                                                9.773e+14
## ` gluten free `                                                      NA
## ` holidays`                                                   3.048e+14
## ` light bites`                                               -4.296e+15
## ` main`                                                      -1.919e+15
## ` main dish`                                                 -2.122e+15
## ` no oil`                                                            NA
## ` nut-free`                                                          NA
## ` quick and easy`                                             8.825e+14
## ` quickbread`                                                -4.070e+15
## ` salad`                                                     -1.465e+14
## ` sauces`                                                     6.178e+15
## ` saucess`                                                   -3.567e+15
## ` side`                                                       1.082e+15
## ` side dish`                                                 -8.454e+13
## ` small plate`                                                2.005e+15
## ` snack`                                                     -6.955e+14
## ` soup`                                                      -6.588e+15
## ` soy free`                                                          NA
## ` soy-free `                                                         NA
## ` spread`                                                    -4.504e+15
## ` spreads`                                                           NA
## ` starter`                                                           NA
## ` stew`                                                       1.166e+15
## ` topping`                                                   -7.869e+15
## ` tree nut-free`                                                     NA
## appetizer                                                     5.480e+14
## `baked goods`                                                -1.424e+15
## baking                                                       -1.172e+16
## bread                                                                NA
## breakfast                                                    -3.339e+15
## cookie                                                        2.412e+14
## dessert                                                      -2.093e+15
## dip                                                                  NA
## dressing                                                             NA
## entree                                                        2.713e+15
## `gluten-free`                                                        NA
## grain                                                        -3.910e+15
## main                                                                 NA
## `main dish`                                                  -4.374e+15
## pasta                                                        -1.023e+16
## salad                                                        -1.863e+15
## salads                                                       -8.412e+15
## sauces                                                       -2.417e+15
## side                                                                 NA
## `side dish`                                                  -2.299e+15
## `slow cooker`                                                 5.920e+15
## `small plate`                                                -3.819e+15
## snack                                                         2.233e+15
## soup                                                         -3.126e+14
## sweets                                                       -3.448e+15
## toast                                                                NA
## vegan                                                                NA
##                                                              Std. Error
## (Intercept)                                                   1.420e+08
## ID                                                            8.737e+04
## total_time                                                    9.914e+04
## all_recipes_df.recipe_authorAlexandra Stafford                1.688e+08
## all_recipes_df.recipe_authorAli Maffucci                      1.193e+08
## all_recipes_df.recipe_authorAmerica's Test Kitchen            8.831e+07
## all_recipes_df.recipe_authorAngela Liddon                     1.231e+08
## all_recipes_df.recipe_authorAnya Kassoff                      1.046e+08
## all_recipes_df.recipe_authorCeline Steen                      1.227e+08
## all_recipes_df.recipe_authorEmilie Raffa                      1.561e+08
## all_recipes_df.recipe_authorEthan Ciment and Michael Suchman  1.014e+08
## all_recipes_df.recipe_authorGena Hamshaw                      7.135e+07
## all_recipes_df.recipe_authorHannah Kaminsky                   1.234e+08
## all_recipes_df.recipe_authorJackie Sobon                      1.549e+08
## all_recipes_df.recipe_authorKathryne Taylor                   1.464e+08
## all_recipes_df.recipe_authorKim-Julie Hansen                  1.013e+08
## ` appetizer`                                                  8.032e+07
## ` baked good`                                                 1.022e+08
## ` baking`                                                     7.127e+07
## ` bowls`                                                      1.413e+08
## ` breakfast`                                                  4.523e+07
## ` brunch`                                                     7.528e+07
## ` cake`                                                       7.750e+07
## ` cookie`                                                            NA
## ` cookies`                                                           NA
## ` dessert`                                                    7.898e+07
## ` dip`                                                        1.094e+08
## ` dressing`                                                          NA
## ` entree`                                                     2.215e+07
## ` entree slow cooker`                                         7.181e+07
## ` gluten free`                                                1.374e+08
## ` gluten free `                                                      NA
## ` holidays`                                                   5.219e+07
## ` light bites`                                                1.234e+08
## ` main`                                                       7.612e+07
## ` main dish`                                                  3.304e+07
## ` no oil`                                                            NA
## ` nut-free`                                                          NA
## ` quick and easy`                                             2.536e+07
## ` quickbread`                                                 7.823e+07
## ` salad`                                                      5.009e+07
## ` sauces`                                                     1.663e+08
## ` saucess`                                                    1.405e+08
## ` side`                                                       6.866e+07
## ` side dish`                                                  1.987e+07
## ` small plate`                                                5.904e+07
## ` snack`                                                      3.679e+07
## ` soup`                                                       7.656e+07
## ` soy free`                                                          NA
## ` soy-free `                                                         NA
## ` spread`                                                     9.491e+07
## ` spreads`                                                           NA
## ` starter`                                                           NA
## ` stew`                                                       7.045e+07
## ` topping`                                                    1.414e+08
## ` tree nut-free`                                                     NA
## appetizer                                                     1.175e+08
## `baked goods`                                                 1.301e+08
## baking                                                        1.468e+08
## bread                                                                NA
## breakfast                                                     1.214e+08
## cookie                                                        1.612e+08
## dessert                                                       1.225e+08
## dip                                                                  NA
## dressing                                                             NA
## entree                                                        1.231e+08
## `gluten-free`                                                        NA
## grain                                                         1.405e+08
## main                                                                 NA
## `main dish`                                                   1.207e+08
## pasta                                                         1.420e+08
## salad                                                         1.214e+08
## salads                                                        1.370e+08
## sauces                                                        1.271e+08
## side                                                                 NA
## `side dish`                                                   1.130e+08
## `slow cooker`                                                 1.598e+08
## `small plate`                                                 1.539e+08
## snack                                                         1.565e+08
## soup                                                          1.213e+08
## sweets                                                        1.483e+08
## toast                                                                NA
## vegan                                                                NA
##                                                                 z value
## (Intercept)                                                    74552894
## ID                                                             17973305
## total_time                                                    -86751886
## all_recipes_df.recipe_authorAlexandra Stafford                -36868896
## all_recipes_df.recipe_authorAli Maffucci                     -107928090
## all_recipes_df.recipe_authorAmerica's Test Kitchen            -47793864
## all_recipes_df.recipe_authorAngela Liddon                     -80554360
## all_recipes_df.recipe_authorAnya Kassoff                      -71781947
## all_recipes_df.recipe_authorCeline Steen                      -31111165
## all_recipes_df.recipe_authorEmilie Raffa                      -39125378
## all_recipes_df.recipe_authorEthan Ciment and Michael Suchman  -82395131
## all_recipes_df.recipe_authorGena Hamshaw                      -97628718
## all_recipes_df.recipe_authorHannah Kaminsky                   -42828663
## all_recipes_df.recipe_authorJackie Sobon                      -96884997
## all_recipes_df.recipe_authorKathryne Taylor                   -56897478
## all_recipes_df.recipe_authorKim-Julie Hansen                  -55867744
## ` appetizer`                                                   -7239899
## ` baked good`                                                  -7490911
## ` baking`                                                     -67256334
## ` bowls`                                                        8632417
## ` breakfast`                                                   99657482
## ` brunch`                                                     -34166007
## ` cake`                                                       -63275829
## ` cookie`                                                            NA
## ` cookies`                                                           NA
## ` dessert`                                                      9118320
## ` dip`                                                         41711697
## ` dressing`                                                          NA
## ` entree`                                                     104570399
## ` entree slow cooker`                                         100180531
## ` gluten free`                                                  7113731
## ` gluten free `                                                      NA
## ` holidays`                                                     5839522
## ` light bites`                                                -34818372
## ` main`                                                       -25207501
## ` main dish`                                                  -64231120
## ` no oil`                                                            NA
## ` nut-free`                                                          NA
## ` quick and easy`                                              34791421
## ` quickbread`                                                 -52032043
## ` salad`                                                       -2925202
## ` sauces`                                                      37145126
## ` saucess`                                                    -25386306
## ` side`                                                        15761989
## ` side dish`                                                   -4255481
## ` small plate`                                                 33955089
## ` snack`                                                      -18906247
## ` soup`                                                       -86051893
## ` soy free`                                                          NA
## ` soy-free `                                                         NA
## ` spread`                                                     -47459899
## ` spreads`                                                           NA
## ` starter`                                                           NA
## ` stew`                                                        16548427
## ` topping`                                                    -55665992
## ` tree nut-free`                                                     NA
## appetizer                                                       4662263
## `baked goods`                                                 -10941113
## baking                                                        -79838437
## bread                                                                NA
## breakfast                                                     -27510658
## cookie                                                          1496497
## dessert                                                       -17078676
## dip                                                                  NA
## dressing                                                             NA
## entree                                                         22042874
## `gluten-free`                                                        NA
## grain                                                         -27829989
## main                                                                 NA
## `main dish`                                                   -36245252
## pasta                                                         -72018356
## salad                                                         -15346701
## salads                                                        -61397130
## sauces                                                        -19014918
## side                                                                 NA
## `side dish`                                                   -20339639
## `slow cooker`                                                  37037920
## `small plate`                                                 -24817224
## snack                                                          14269246
## soup                                                           -2577011
## sweets                                                        -23254799
## toast                                                                NA
## vegan                                                                NA
##                                                              Pr(>|z|)    
## (Intercept)                                                    <2e-16 ***
## ID                                                             <2e-16 ***
## total_time                                                     <2e-16 ***
## all_recipes_df.recipe_authorAlexandra Stafford                 <2e-16 ***
## all_recipes_df.recipe_authorAli Maffucci                       <2e-16 ***
## all_recipes_df.recipe_authorAmerica's Test Kitchen             <2e-16 ***
## all_recipes_df.recipe_authorAngela Liddon                      <2e-16 ***
## all_recipes_df.recipe_authorAnya Kassoff                       <2e-16 ***
## all_recipes_df.recipe_authorCeline Steen                       <2e-16 ***
## all_recipes_df.recipe_authorEmilie Raffa                       <2e-16 ***
## all_recipes_df.recipe_authorEthan Ciment and Michael Suchman   <2e-16 ***
## all_recipes_df.recipe_authorGena Hamshaw                       <2e-16 ***
## all_recipes_df.recipe_authorHannah Kaminsky                    <2e-16 ***
## all_recipes_df.recipe_authorJackie Sobon                       <2e-16 ***
## all_recipes_df.recipe_authorKathryne Taylor                    <2e-16 ***
## all_recipes_df.recipe_authorKim-Julie Hansen                   <2e-16 ***
## ` appetizer`                                                   <2e-16 ***
## ` baked good`                                                  <2e-16 ***
## ` baking`                                                      <2e-16 ***
## ` bowls`                                                       <2e-16 ***
## ` breakfast`                                                   <2e-16 ***
## ` brunch`                                                      <2e-16 ***
## ` cake`                                                        <2e-16 ***
## ` cookie`                                                          NA    
## ` cookies`                                                         NA    
## ` dessert`                                                     <2e-16 ***
## ` dip`                                                         <2e-16 ***
## ` dressing`                                                        NA    
## ` entree`                                                      <2e-16 ***
## ` entree slow cooker`                                          <2e-16 ***
## ` gluten free`                                                 <2e-16 ***
## ` gluten free `                                                    NA    
## ` holidays`                                                    <2e-16 ***
## ` light bites`                                                 <2e-16 ***
## ` main`                                                        <2e-16 ***
## ` main dish`                                                   <2e-16 ***
## ` no oil`                                                          NA    
## ` nut-free`                                                        NA    
## ` quick and easy`                                              <2e-16 ***
## ` quickbread`                                                  <2e-16 ***
## ` salad`                                                       <2e-16 ***
## ` sauces`                                                      <2e-16 ***
## ` saucess`                                                     <2e-16 ***
## ` side`                                                        <2e-16 ***
## ` side dish`                                                   <2e-16 ***
## ` small plate`                                                 <2e-16 ***
## ` snack`                                                       <2e-16 ***
## ` soup`                                                        <2e-16 ***
## ` soy free`                                                        NA    
## ` soy-free `                                                       NA    
## ` spread`                                                      <2e-16 ***
## ` spreads`                                                         NA    
## ` starter`                                                         NA    
## ` stew`                                                        <2e-16 ***
## ` topping`                                                     <2e-16 ***
## ` tree nut-free`                                                   NA    
## appetizer                                                      <2e-16 ***
## `baked goods`                                                  <2e-16 ***
## baking                                                         <2e-16 ***
## bread                                                              NA    
## breakfast                                                      <2e-16 ***
## cookie                                                         <2e-16 ***
## dessert                                                        <2e-16 ***
## dip                                                                NA    
## dressing                                                           NA    
## entree                                                         <2e-16 ***
## `gluten-free`                                                      NA    
## grain                                                          <2e-16 ***
## main                                                               NA    
## `main dish`                                                    <2e-16 ***
## pasta                                                          <2e-16 ***
## salad                                                          <2e-16 ***
## salads                                                         <2e-16 ***
## sauces                                                         <2e-16 ***
## side                                                               NA    
## `side dish`                                                    <2e-16 ***
## `slow cooker`                                                  <2e-16 ***
## `small plate`                                                  <2e-16 ***
## snack                                                          <2e-16 ***
## soup                                                           <2e-16 ***
## sweets                                                         <2e-16 ***
## toast                                                              NA    
## vegan                                                              NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  202.49  on 155  degrees of freedom
## Residual deviance: 2811.40  on  92  degrees of freedom
## AIC: 2939.4
## 
## Number of Fisher Scoring iterations: 23
  • prediction
#prediciton 
test$predictNew <- predict(logisticModel, type = "response" , newdata = test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

confusion matrix

# calculating confusion matrix
confMatrix <- confusion.matrix(test$rating, test$predictNew ,threshold = 0.5) # tested different thresholds where 0.5 got highest accuracy 
confMatrix
##     obs
## pred  0  1
##    0  4 17
##    1 10 22
## attr(,"class")
## [1] "confusion.matrix"
  • accuracy
accurracyNew <- sum(diag(confMatrix))/sum(confMatrix)

accurracyNew
## [1] 0.490566
This is a pretty disappointing accuarcy, but at least we know category, author and cooking time are not good predictors for rating in the dataset.