library(dplyr)
library(tidyr)
library(stringr)
library(dplyr)
library(lubridate)
library(tidytext)
library(tokenizers)
library(RColorBrewer)
library(wordcloud)
library(XML)
library(tm)
library(factoextra)
library(ggplot2)
library(rms)
library(caret)
library(car)
library(corrplot)
library(RColorBrewer)
library(SDMTools)
library(boot)
library(e1071)
library(qgraph)
library(igraph)
library(networkD3)
load(file="all_recipes.RData")
Check for missing value
# NAs
colSums(is.na(all_recipes_df))
## name rating_value rating_count recipe_author
## 0 188 188 123
## recipe_cuisine recipe_categories prep_time cook_time
## 129 126 125 112
## tot_time recipe_yield ingredients instructions
## 108 37 22 22
# Assing and ID number to each recipe
all_recipes_df <- all_recipes_df %>% mutate(ID = 1:nrow(all_recipes_df))
# Construct a data frame using words appearing in ingredients
ingrdt <- all_recipes_df %>%
dplyr::select(ID,ingredients) %>%
mutate(ingredients = str_replace(ingredients, "\n", " ") %>% str_replace("<.*?>", " ")) %>%
unnest_tokens(word, ingredients)
ingrdt %>% count(word, sort = TRUE) %>% slice(1:10)
## # A tibble: 10 x 2
## word n
## <chr> <int>
## 1 1 2239
## 2 or 1382
## 3 cup 1200
## 4 2 1172
## 5 teaspoon 1019
## 6 and 634
## 7 cups 620
## 8 tablespoons 566
## 9 chopped 564
## 10 a 540
The ingredients dataframe contains many stop words and number which are uninformative.Let’s remove such words
# remove the following (which is not included in stopwords package)
word_remove = c("cup", "cups", "teaspoon", "teaspoons", "tablespoon", "tablespoons",
"ounce", "ounces", "lb", "lbs", "tbs", "tsp", "oz", "handful", "handfull",
"inch", "can","chopped", "cut","pound","cubes", "ground", "optional","small","powder","_blank","taste",
"large","cooked","http","diced","drained","href", "black","red","target","white","fresh",
"freshly","www.thefullhelping.com", "green", "em","sced","g","minced","vegan","strong", "rinsed","peeled","brown","choice","yellow","dried","finely","medium","maple","crushed","substitute","extra","pieces",
"raw")
ingrdt <- ingrdt %>%
filter(!(word %in% stopwords::stopwords())) %>%
filter(!(word %in% word_remove)) %>%
filter(!(str_detect(word, "[0-9]"))) # Remove numbers as well
Fix some missing letter in the words
ingrdt$word <- gsub("garc","garlic",ingrdt$word)
ingrdt$word <- ifelse( ingrdt$word == "cloves" |ingrdt$word == "clove" ,"cloves",gsub("ove","olive",ingrdt$word))
Check the most common words again
top25_ingrdt <- ingrdt %>% count(word, sort = TRUE) %>% slice(1:25)
ingrdt_top_words <- ingrdt %>% filter(word %in% top25_ingrdt$word) %>% distinct()
# fix some wording
pal<- brewer.pal(8,"Dark2")
wordcloud(words = top25_ingrdt$word, freq =top25_ingrdt$n, color = pal)
ingrdt_2 <-ingrdt %>%
filter(word %in% top25_ingrdt$word)
Construct features (columns) from the word counts for each recipe using the spread function:
# Spread the word counts to columns
ingrdt_2<- ingrdt_2 %>%
group_by(ID) %>%
count(word) %>%
spread(key = word, value = n, fill = 0) %>%
ungroup()
head(ingrdt_2)
## # A tibble: 6 x 26
## ID apple beans cashew chickpeas cloves coconut flour garlic ginger
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1. 0. 0. 0. 0. 0. 0. 0. 0.
## 2 2 0. 1. 0. 0. 1. 0. 0. 1. 0.
## 3 3 1. 0. 0. 0. 0. 0. 2. 0. 1.
## 4 4 0. 0. 2. 0. 1. 0. 0. 1. 0.
## 5 5 0. 2. 0. 0. 0. 0. 0. 0. 0.
## 6 6 0. 0. 0. 0. 0. 1. 1. 0. 0.
## # ... with 16 more variables: juice <dbl>, leaves <dbl>, lemon <dbl>,
## # milk <dbl>, oil <dbl>, olive <dbl>, onion <dbl>, pepper <dbl>,
## # rice <dbl>, salt <dbl>, seeds <dbl>, sugar <dbl>, syrup <dbl>,
## # vegetable <dbl>, vinegar <dbl>, water <dbl>
The amount is not our primary concern; thus, we replace the number with 1 to represent the appearence of the ingredient.
vars <- setdiff(names(ingrdt_2), "ID")
ingrdt_2 <- ingrdt_2 %>%
mutate_at(vars, function(x) ifelse(x > 0, 1, 0))
ingrdt_2
## # A tibble: 393 x 26
## ID apple beans cashew chickpeas cloves coconut flour garlic ginger
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1. 0. 0. 0. 0. 0. 0. 0. 0.
## 2 2 0. 1. 0. 0. 1. 0. 0. 1. 0.
## 3 3 1. 0. 0. 0. 0. 0. 1. 0. 1.
## 4 4 0. 0. 1. 0. 1. 0. 0. 1. 0.
## 5 5 0. 1. 0. 0. 0. 0. 0. 0. 0.
## 6 6 0. 0. 0. 0. 0. 1. 1. 0. 0.
## 7 7 1. 0. 0. 0. 0. 1. 0. 0. 0.
## 8 8 1. 0. 0. 1. 1. 0. 0. 1. 1.
## 9 9 1. 0. 0. 0. 0. 0. 0. 0. 0.
## 10 10 0. 1. 0. 0. 0. 1. 0. 0. 0.
## # ... with 383 more rows, and 16 more variables: juice <dbl>,
## # leaves <dbl>, lemon <dbl>, milk <dbl>, oil <dbl>, olive <dbl>,
## # onion <dbl>, pepper <dbl>, rice <dbl>, salt <dbl>, seeds <dbl>,
## # sugar <dbl>, syrup <dbl>, vegetable <dbl>, vinegar <dbl>, water <dbl>
## Principal components for ingredients
data <- ingrdt_2 %>% dplyr::select(-ID)
pc <- prcomp(data, scale = TRUE)
# Plot the first two principal components
biplot(pc, scale = FALSE, cex = c(0.2, 0.8) )
eig.val <- get_eigenvalue(pc)
eig.val
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 4.4425432 17.7701728 17.77017
## Dim.2 2.2744773 9.0979091 26.86808
## Dim.3 1.9405640 7.7622558 34.63034
## Dim.4 1.8158233 7.2632931 41.89363
## Dim.5 1.5937500 6.3750002 48.26863
## Dim.6 1.1446240 4.5784962 52.84713
## Dim.7 1.0919549 4.3678196 57.21495
## Dim.8 1.0558077 4.2232307 61.43818
## Dim.9 0.9714770 3.8859079 65.32409
## Dim.10 0.9100279 3.6401116 68.96420
## Dim.11 0.8667402 3.4669607 72.43116
## Dim.12 0.8318936 3.3275746 75.75873
## Dim.13 0.7389551 2.9558203 78.71455
## Dim.14 0.7191923 2.8767694 81.59132
## Dim.15 0.6799342 2.7197369 84.31106
## Dim.16 0.6599244 2.6396977 86.95076
## Dim.17 0.6023077 2.4092309 89.35999
## Dim.18 0.5496512 2.1986046 91.55859
## Dim.19 0.4907603 1.9630414 93.52163
## Dim.20 0.3833307 1.5333229 95.05496
## Dim.21 0.3447263 1.3789051 96.43386
## Dim.22 0.2999104 1.1996418 97.63350
## Dim.23 0.2684088 1.0736352 98.70714
## Dim.24 0.2126633 0.8506530 99.55779
## Dim.25 0.1105522 0.4422088 100.00000
#transpose data & get pc again.
# first remember the names
n <- ingrdt_2$ID
# transpose all but the first column (name)
ingrdt_2_t <- as.data.frame(t(ingrdt_2[,-1]))
PC_V2 <- prcomp(ingrdt_2_t )
fviz_pca_ind(PC_V2 ,
col.ind = "cos2", # Color by the quality of representation
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
we can see several clusters in the plot, baking ingredients (milk, flour, sugar) are close to each other; and of course, the cooking essentials, oil and salt are close.
The distance between variables and the origin measures the quality of the variables on the factor map. Variables that are away from the origin are well represented on the factor map.
the higher cos2 value : the higher quality of representation.
ingrdt_3 <- ingrdt_2
cormatrix <- cor_auto(ingrdt_3[,-1])
## Variables detected as ordinal: apple; beans; cashew; chickpeas; cloves; coconut; flour; garlic; ginger; juice; leaves; lemon; milk; oil; olive; onion; pepper; rice; salt; seeds; sugar; syrup; vegetable; vinegar; water
## Warning in cor_auto(ingrdt_3[, -1]): Correlation matrix is not positive
## definite. Finding nearest positive definite matrix
graph1<-qgraph(cormatrix, graph="glasso", layout="spring", sampleSize = nrow(data),
vsize=7, cut=0, maximum=.45, border.width=1.5)
## Warning in EBICglassoCore(S = S, n = n, gamma = gamma, penalize.diagonal =
## penalize.diagonal, : Network with lowest lambda selected as best network.
## Try setting 'lambda.min.ratio' lower.
## Warning in EBICglassoCore(S = S, n = n, gamma = gamma, penalize.diagonal
## = penalize.diagonal, : A dense regularized network was selected (lambda <
## 0.1 * lambda.max). Recent work indicates a possible drop in specificity.
## Interpret the presence of the smallest edges with care. Setting threshold =
## TRUE will enforce high specificity, at the cost of sensitivity.
g = as.igraph(graph1, attributes=TRUE)
#distribution of avg. rating
p1 <- ggplot(all_recipes_df, aes(rating_value)) +
geom_histogram(fill = "skyblue") +
geom_vline(xintercept = median(all_recipes_df$rating_value, binwidth = 0.1, na.rm = T), na.rm = TRUE, size = 0.6) +
ylab("Counts") + ggtitle("Distribution of Average Ratings")
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 188 rows containing non-finite values (stat_bin).
appearently most of the rating are 5, so let’s separent the groups by “Rating = 5” and “Rating =lower than 5”.
#number of recipe that has rating lower than 5
low <- all_recipes_df %>% filter(all_recipes_df$rating_value < 5) %>% count()
high <- all_recipes_df %>% filter(all_recipes_df$rating_value == 5) %>% count()
d <- data.frame(low = low, high = high)
colnames(d) <- c("low","high")
d
## low high
## 1 75 154
author <- all_recipes_df %>% dplyr::select(recipe_author,rating_value)
author$recipe_author <- as.factor(author$recipe_author)
author <- author %>% group_by(recipe_author) %>% summarise(rating = mean(rating_value, na.rm = T)) %>% na.omit()
#oder from the highest
author[order(author$rating,decreasing = T),]
## # A tibble: 16 x 2
## recipe_author rating
## <fct> <dbl>
## 1 Adapted from Molly Wizenberg 5.00
## 2 Alexandra Stafford 5.00
## 3 America's Test Kitchen 5.00
## 4 Angela Liddon 5.00
## 5 Anya Kassoff 5.00
## 6 Brandi Doming 5.00
## 7 Emilie Raffa 5.00
## 8 Ethan Ciment and Michael Suchman 5.00
## 9 Hannah Kaminsky 5.00
## 10 Kathryne Taylor 5.00
## 11 Kim-Julie Hansen 5.00
## 12 Richa Hingle 5.00
## 13 Gena Hamshaw 4.86
## 14 Celine Steen 4.80
## 15 Ali Maffucci 4.50
## 16 Jackie Sobon 4.00
ggplot(author, aes(reorder(recipe_author,rating),rating))+geom_col( fill = "skyblue")+coord_flip()
wrangling
convert total time to minutes
#delete redundent letters
time <- str_sub(all_recipes_df$tot_tim, 3)
#convert 1H to 60 min
time <- ifelse(time == "1H" ,"1H0M",time )
time <- str_sub(time,0,-2)
time <- ifelse(nchar(time )>2, gsub("H",":",time ), paste("0:",time ))
time <- lubridate::hm(time )
## Warning in .parse_hms(..., order = "HM", quiet = quiet): Some strings
## failed to parse, or all strings are NAs
time <- hour(time )*60 + minute(time )
all_recipes_df$tot_time <- time
category <- all_recipes_df %>%
select(ID, recipe_categories) %>%
unnest_tokens(words ,recipe_categories ,token = 'regex', pattern=",") %>% na.omit()
#check for duplicated words
category_check <- category %>%select(words) %>% group_by(words) %>% count()
# remove redundant characters
category <- category %>% mutate (words = gsub("&","and", category$words))
category <- category %>% mutate (words = gsub("dips","dip", category$words))
category <- category %>% mutate (words = gsub("optional","", category$words))
category <- category %>% mutate (words = gsub("option","", category$words))
category <- category %>% mutate (words = gsub("nut free","nut-free", category$words))
category <- category %>% mutate (words = gsub("sauce","sauces", category$words))
category <- category %>% mutate (words = gsub("and sauces","sauces", category$words))
category <- category %>% mutate (words = gsub("side dishes","side dish", category$words))
category <- category %>% mutate (words = gsub("small plates","small plate", category$words))
#spread the table
category <-category %>%
group_by(ID) %>%
count(words) %>%
spread(key = words, value = n, fill = 0) %>%
ungroup()
category_df <- data.frame(ID = all_recipes_df$ID ,rating = all_recipes_df$rating_value,
total_time = all_recipes_df$tot_time, all_recipes_df$recipe_author)
category_df <- right_join(category_df, category)
## Joining, by = "ID"
# remove NA in ratings
category_df <- na.omit(category_df)
head(category_df)
## ID rating total_time all_recipes_df.recipe_author appetizer baked good
## 3 3 5 45 Gena Hamshaw 0 0
## 4 4 5 30 Gena Hamshaw 0 0
## 5 6 5 32 Gena Hamshaw 0 0
## 6 7 5 45 Gena Hamshaw 0 0
## 7 8 5 190 Adapted from Molly Wizenberg 0 0
## 8 9 5 45 Gena Hamshaw 0 0
## baking bowls breakfast brunch cake cookie cookies dessert dip
## 3 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0
## 6 0 0 1 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0
## dressing entree entree slow cooker gluten free gluten free
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
## 7 0 0 0 0 0
## 8 0 0 0 0 0
## holidays light bites main main dish no oil nut-free
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## 7 0 0 0 0 0 0
## 8 0 0 0 0 0 0
## quick and easy quickbread salad sauces saucess side side dish
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 1
## 8 0 0 0 0 0 0 0
## small plate snack soup soy free soy-free spread spreads starter
## 3 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0
## 8 0 1 0 0 0 0 0 0
## stew topping tree nut-free appetizer baked goods baking bread
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0
## breakfast cookie dessert dip dressing entree gluten-free grain main
## 3 0 0 1 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 1 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0 0
## 8 1 0 0 0 0 0 0 0 0
## main dish pasta salad salads sauces side side dish slow cooker
## 3 0 0 0 0 0 0 0 0
## 4 1 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0
## 6 1 0 0 0 0 0 0 0
## 7 1 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0
## small plate snack soup sweets toast vegan
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## 7 0 0 0 0 0 0
## 8 0 0 0 0 0 0
#make the dependent variable "rating" into binary outcome
category_df$rating <- ifelse(category_df$rating == 5 , 1, 0)
#check the propotion
d <- sum(category_df$rating) # number of rating = 5
d
## [1] 140
looks fine for logistic regression
#set sample
sample_size <- floor(0.75*nrow(category_df) )
set.seed(12343)
train_int <- sample(seq_len(nrow(category_df)), size = sample_size)
train <- category_df[train_int,]
test<- category_df[-train_int,]
logisticModel <- glm(rating ~. , family = "binomial", train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logisticModel)
##
## Call:
## glm(formula = rating ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.49 0.00 0.00 0.00 8.49
##
## Coefficients: (19 not defined because of singularities)
## Estimate
## (Intercept) 1.058e+16
## ID 1.570e+12
## total_time -8.600e+12
## all_recipes_df.recipe_authorAlexandra Stafford -6.223e+15
## all_recipes_df.recipe_authorAli Maffucci -1.288e+16
## all_recipes_df.recipe_authorAmerica's Test Kitchen -4.221e+15
## all_recipes_df.recipe_authorAngela Liddon -9.915e+15
## all_recipes_df.recipe_authorAnya Kassoff -7.511e+15
## all_recipes_df.recipe_authorCeline Steen -3.816e+15
## all_recipes_df.recipe_authorEmilie Raffa -6.106e+15
## all_recipes_df.recipe_authorEthan Ciment and Michael Suchman -8.354e+15
## all_recipes_df.recipe_authorGena Hamshaw -6.966e+15
## all_recipes_df.recipe_authorHannah Kaminsky -5.287e+15
## all_recipes_df.recipe_authorJackie Sobon -1.501e+16
## all_recipes_df.recipe_authorKathryne Taylor -8.332e+15
## all_recipes_df.recipe_authorKim-Julie Hansen -5.658e+15
## ` appetizer` -5.815e+14
## ` baked good` -7.654e+14
## ` baking` -4.793e+15
## ` bowls` 1.220e+15
## ` breakfast` 4.508e+15
## ` brunch` -2.572e+15
## ` cake` -4.904e+15
## ` cookie` NA
## ` cookies` NA
## ` dessert` 7.202e+14
## ` dip` 4.565e+15
## ` dressing` NA
## ` entree` 2.316e+15
## ` entree slow cooker` 7.194e+15
## ` gluten free` 9.773e+14
## ` gluten free ` NA
## ` holidays` 3.048e+14
## ` light bites` -4.296e+15
## ` main` -1.919e+15
## ` main dish` -2.122e+15
## ` no oil` NA
## ` nut-free` NA
## ` quick and easy` 8.825e+14
## ` quickbread` -4.070e+15
## ` salad` -1.465e+14
## ` sauces` 6.178e+15
## ` saucess` -3.567e+15
## ` side` 1.082e+15
## ` side dish` -8.454e+13
## ` small plate` 2.005e+15
## ` snack` -6.955e+14
## ` soup` -6.588e+15
## ` soy free` NA
## ` soy-free ` NA
## ` spread` -4.504e+15
## ` spreads` NA
## ` starter` NA
## ` stew` 1.166e+15
## ` topping` -7.869e+15
## ` tree nut-free` NA
## appetizer 5.480e+14
## `baked goods` -1.424e+15
## baking -1.172e+16
## bread NA
## breakfast -3.339e+15
## cookie 2.412e+14
## dessert -2.093e+15
## dip NA
## dressing NA
## entree 2.713e+15
## `gluten-free` NA
## grain -3.910e+15
## main NA
## `main dish` -4.374e+15
## pasta -1.023e+16
## salad -1.863e+15
## salads -8.412e+15
## sauces -2.417e+15
## side NA
## `side dish` -2.299e+15
## `slow cooker` 5.920e+15
## `small plate` -3.819e+15
## snack 2.233e+15
## soup -3.126e+14
## sweets -3.448e+15
## toast NA
## vegan NA
## Std. Error
## (Intercept) 1.420e+08
## ID 8.737e+04
## total_time 9.914e+04
## all_recipes_df.recipe_authorAlexandra Stafford 1.688e+08
## all_recipes_df.recipe_authorAli Maffucci 1.193e+08
## all_recipes_df.recipe_authorAmerica's Test Kitchen 8.831e+07
## all_recipes_df.recipe_authorAngela Liddon 1.231e+08
## all_recipes_df.recipe_authorAnya Kassoff 1.046e+08
## all_recipes_df.recipe_authorCeline Steen 1.227e+08
## all_recipes_df.recipe_authorEmilie Raffa 1.561e+08
## all_recipes_df.recipe_authorEthan Ciment and Michael Suchman 1.014e+08
## all_recipes_df.recipe_authorGena Hamshaw 7.135e+07
## all_recipes_df.recipe_authorHannah Kaminsky 1.234e+08
## all_recipes_df.recipe_authorJackie Sobon 1.549e+08
## all_recipes_df.recipe_authorKathryne Taylor 1.464e+08
## all_recipes_df.recipe_authorKim-Julie Hansen 1.013e+08
## ` appetizer` 8.032e+07
## ` baked good` 1.022e+08
## ` baking` 7.127e+07
## ` bowls` 1.413e+08
## ` breakfast` 4.523e+07
## ` brunch` 7.528e+07
## ` cake` 7.750e+07
## ` cookie` NA
## ` cookies` NA
## ` dessert` 7.898e+07
## ` dip` 1.094e+08
## ` dressing` NA
## ` entree` 2.215e+07
## ` entree slow cooker` 7.181e+07
## ` gluten free` 1.374e+08
## ` gluten free ` NA
## ` holidays` 5.219e+07
## ` light bites` 1.234e+08
## ` main` 7.612e+07
## ` main dish` 3.304e+07
## ` no oil` NA
## ` nut-free` NA
## ` quick and easy` 2.536e+07
## ` quickbread` 7.823e+07
## ` salad` 5.009e+07
## ` sauces` 1.663e+08
## ` saucess` 1.405e+08
## ` side` 6.866e+07
## ` side dish` 1.987e+07
## ` small plate` 5.904e+07
## ` snack` 3.679e+07
## ` soup` 7.656e+07
## ` soy free` NA
## ` soy-free ` NA
## ` spread` 9.491e+07
## ` spreads` NA
## ` starter` NA
## ` stew` 7.045e+07
## ` topping` 1.414e+08
## ` tree nut-free` NA
## appetizer 1.175e+08
## `baked goods` 1.301e+08
## baking 1.468e+08
## bread NA
## breakfast 1.214e+08
## cookie 1.612e+08
## dessert 1.225e+08
## dip NA
## dressing NA
## entree 1.231e+08
## `gluten-free` NA
## grain 1.405e+08
## main NA
## `main dish` 1.207e+08
## pasta 1.420e+08
## salad 1.214e+08
## salads 1.370e+08
## sauces 1.271e+08
## side NA
## `side dish` 1.130e+08
## `slow cooker` 1.598e+08
## `small plate` 1.539e+08
## snack 1.565e+08
## soup 1.213e+08
## sweets 1.483e+08
## toast NA
## vegan NA
## z value
## (Intercept) 74552894
## ID 17973305
## total_time -86751886
## all_recipes_df.recipe_authorAlexandra Stafford -36868896
## all_recipes_df.recipe_authorAli Maffucci -107928090
## all_recipes_df.recipe_authorAmerica's Test Kitchen -47793864
## all_recipes_df.recipe_authorAngela Liddon -80554360
## all_recipes_df.recipe_authorAnya Kassoff -71781947
## all_recipes_df.recipe_authorCeline Steen -31111165
## all_recipes_df.recipe_authorEmilie Raffa -39125378
## all_recipes_df.recipe_authorEthan Ciment and Michael Suchman -82395131
## all_recipes_df.recipe_authorGena Hamshaw -97628718
## all_recipes_df.recipe_authorHannah Kaminsky -42828663
## all_recipes_df.recipe_authorJackie Sobon -96884997
## all_recipes_df.recipe_authorKathryne Taylor -56897478
## all_recipes_df.recipe_authorKim-Julie Hansen -55867744
## ` appetizer` -7239899
## ` baked good` -7490911
## ` baking` -67256334
## ` bowls` 8632417
## ` breakfast` 99657482
## ` brunch` -34166007
## ` cake` -63275829
## ` cookie` NA
## ` cookies` NA
## ` dessert` 9118320
## ` dip` 41711697
## ` dressing` NA
## ` entree` 104570399
## ` entree slow cooker` 100180531
## ` gluten free` 7113731
## ` gluten free ` NA
## ` holidays` 5839522
## ` light bites` -34818372
## ` main` -25207501
## ` main dish` -64231120
## ` no oil` NA
## ` nut-free` NA
## ` quick and easy` 34791421
## ` quickbread` -52032043
## ` salad` -2925202
## ` sauces` 37145126
## ` saucess` -25386306
## ` side` 15761989
## ` side dish` -4255481
## ` small plate` 33955089
## ` snack` -18906247
## ` soup` -86051893
## ` soy free` NA
## ` soy-free ` NA
## ` spread` -47459899
## ` spreads` NA
## ` starter` NA
## ` stew` 16548427
## ` topping` -55665992
## ` tree nut-free` NA
## appetizer 4662263
## `baked goods` -10941113
## baking -79838437
## bread NA
## breakfast -27510658
## cookie 1496497
## dessert -17078676
## dip NA
## dressing NA
## entree 22042874
## `gluten-free` NA
## grain -27829989
## main NA
## `main dish` -36245252
## pasta -72018356
## salad -15346701
## salads -61397130
## sauces -19014918
## side NA
## `side dish` -20339639
## `slow cooker` 37037920
## `small plate` -24817224
## snack 14269246
## soup -2577011
## sweets -23254799
## toast NA
## vegan NA
## Pr(>|z|)
## (Intercept) <2e-16 ***
## ID <2e-16 ***
## total_time <2e-16 ***
## all_recipes_df.recipe_authorAlexandra Stafford <2e-16 ***
## all_recipes_df.recipe_authorAli Maffucci <2e-16 ***
## all_recipes_df.recipe_authorAmerica's Test Kitchen <2e-16 ***
## all_recipes_df.recipe_authorAngela Liddon <2e-16 ***
## all_recipes_df.recipe_authorAnya Kassoff <2e-16 ***
## all_recipes_df.recipe_authorCeline Steen <2e-16 ***
## all_recipes_df.recipe_authorEmilie Raffa <2e-16 ***
## all_recipes_df.recipe_authorEthan Ciment and Michael Suchman <2e-16 ***
## all_recipes_df.recipe_authorGena Hamshaw <2e-16 ***
## all_recipes_df.recipe_authorHannah Kaminsky <2e-16 ***
## all_recipes_df.recipe_authorJackie Sobon <2e-16 ***
## all_recipes_df.recipe_authorKathryne Taylor <2e-16 ***
## all_recipes_df.recipe_authorKim-Julie Hansen <2e-16 ***
## ` appetizer` <2e-16 ***
## ` baked good` <2e-16 ***
## ` baking` <2e-16 ***
## ` bowls` <2e-16 ***
## ` breakfast` <2e-16 ***
## ` brunch` <2e-16 ***
## ` cake` <2e-16 ***
## ` cookie` NA
## ` cookies` NA
## ` dessert` <2e-16 ***
## ` dip` <2e-16 ***
## ` dressing` NA
## ` entree` <2e-16 ***
## ` entree slow cooker` <2e-16 ***
## ` gluten free` <2e-16 ***
## ` gluten free ` NA
## ` holidays` <2e-16 ***
## ` light bites` <2e-16 ***
## ` main` <2e-16 ***
## ` main dish` <2e-16 ***
## ` no oil` NA
## ` nut-free` NA
## ` quick and easy` <2e-16 ***
## ` quickbread` <2e-16 ***
## ` salad` <2e-16 ***
## ` sauces` <2e-16 ***
## ` saucess` <2e-16 ***
## ` side` <2e-16 ***
## ` side dish` <2e-16 ***
## ` small plate` <2e-16 ***
## ` snack` <2e-16 ***
## ` soup` <2e-16 ***
## ` soy free` NA
## ` soy-free ` NA
## ` spread` <2e-16 ***
## ` spreads` NA
## ` starter` NA
## ` stew` <2e-16 ***
## ` topping` <2e-16 ***
## ` tree nut-free` NA
## appetizer <2e-16 ***
## `baked goods` <2e-16 ***
## baking <2e-16 ***
## bread NA
## breakfast <2e-16 ***
## cookie <2e-16 ***
## dessert <2e-16 ***
## dip NA
## dressing NA
## entree <2e-16 ***
## `gluten-free` NA
## grain <2e-16 ***
## main NA
## `main dish` <2e-16 ***
## pasta <2e-16 ***
## salad <2e-16 ***
## salads <2e-16 ***
## sauces <2e-16 ***
## side NA
## `side dish` <2e-16 ***
## `slow cooker` <2e-16 ***
## `small plate` <2e-16 ***
## snack <2e-16 ***
## soup <2e-16 ***
## sweets <2e-16 ***
## toast NA
## vegan NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 202.49 on 155 degrees of freedom
## Residual deviance: 2811.40 on 92 degrees of freedom
## AIC: 2939.4
##
## Number of Fisher Scoring iterations: 23
#prediciton
test$predictNew <- predict(logisticModel, type = "response" , newdata = test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
confusion matrix
# calculating confusion matrix
confMatrix <- confusion.matrix(test$rating, test$predictNew ,threshold = 0.5) # tested different thresholds where 0.5 got highest accuracy
confMatrix
## obs
## pred 0 1
## 0 4 17
## 1 10 22
## attr(,"class")
## [1] "confusion.matrix"
accurracyNew <- sum(diag(confMatrix))/sum(confMatrix)
accurracyNew
## [1] 0.490566