library(dplyr)
library(tidyr)
library(ggplot2)
library(lme4)
library(lmerTest)
source("~/Desktop/Spring2016/CS224U/rateBeerLingRel/analysis/analysis_helpers.R")
# Load every per-model results CSV found in `dir_path` and stack them into a
# single data frame, `ml.data`. Each path is printed as it is read.
dir_path <- "~/Desktop/Spring2016/CS224U/rateBeerLingRel/analysis/analysis_data/"
# Only pick up CSV files so stray files in the directory are not parsed.
files <- list.files(dir_path, pattern = "\\.csv$", ignore.case = TRUE)
# `dir_path` already ends in "/", so plain concatenation yields the full path.
result_frames <- lapply(paste0(dir_path, files), function(path) {
  print(path)
  read.csv(path)
})
# Bind once at the end instead of growing the data frame inside a loop;
# fall back to an empty data frame when the directory holds no results.
ml.data <- if (length(result_frames) > 0) {
  do.call(rbind, result_frames)
} else {
  data.frame()
}
## [1] "~/Desktop/Spring2016/CS224U/rateBeerLingRel/analysis/analysis_data/baseline.csv"
## [1] "~/Desktop/Spring2016/CS224U/rateBeerLingRel/analysis/analysis_data/naive_bayes.csv"
## [1] "~/Desktop/Spring2016/CS224U/rateBeerLingRel/analysis/analysis_data/random_forest.csv"
## [1] "~/Desktop/Spring2016/CS224U/rateBeerLingRel/analysis/analysis_data/trigram_stupid_backoff.csv"
## [1] "~/Desktop/Spring2016/CS224U/rateBeerLingRel/analysis/analysis_data/unigram_laplace.csv"
# Inspect the column types and first values of the combined results.
str(ml.data)
## 'data.frame': 50 obs. of 4 variables:
## $ model : Factor w/ 5 levels "baseline","naive_bayes",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ note : Factor w/ 5 levels "baseline","unigram+bigram+trigram",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ sample : Factor w/ 1 level "sample_size=30000": 1 1 1 1 1 1 1 1 1 1 ...
## $ accuracy: num 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 ...
# Per-model summary of classification accuracy: number of runs, mean, standard
# deviation and a normal-approximation 95% confidence interval.
#
# The raw `model` values follow the results file names (the str() output shows
# "baseline", "naive_bayes", ...), so the unigram and trigram models are
# matched by prefix. Comparing against the bare strings "unigram" / "trigram"
# lets e.g. "unigram_laplace" fall through to the catch-all "Random Forest"
# label and silently pools several models into one bar.
# NOTE(review): qnorm() is used for the interval; with few runs per model a
# t quantile (qt(0.975, n - 1)) would give a wider interval - confirm intent.
plot_data <- ml.data %>%
  mutate(
    model_key = as.character(model),
    model = as.factor(case_when(
      is.na(model_key) ~ NA_character_,
      model_key == "baseline" ~ "Baseline",
      startsWith(model_key, "unigram") ~ "Unigram LM",
      startsWith(model_key, "trigram") ~ "Trigram Stupid Backoff",
      model_key == "naive_bayes" ~ "Naive Bayes",
      TRUE ~ "Random Forest"
    ))
  ) %>%
  group_by(model) %>%
  summarise(
    n = n(),
    avg = mean(accuracy),
    s = sd(accuracy),
    # Half-width of the 95% interval around the mean.
    error = qnorm(0.975) * s / sqrt(n),
    lower = avg - error,
    upper = avg + error
  )
# Bar chart of mean accuracy per model (ordered by accuracy) with the
# confidence interval from `plot_data` as error bars. The dashed red line
# marks 0.25 accuracy.
ggplot(plot_data, aes(x = reorder(model, avg), y = avg)) +
  geom_bar(stat = "identity") +
  geom_errorbar(aes(ymin = lower, ymax = upper)) +
  # A constant reference line is a fixed parameter, not a data mapping.
  geom_hline(yintercept = 0.25, linetype = 2, colour = "red") +
  xlab("Model") +
  ylab("Accuracy - four class classification") +
  ggtitle("Expertise classification")
# analysis.R with analysis level features
# Load the cleaned review-level data set; strings stay as character vectors.
d <- read.csv(
  "~/Desktop/Spring2016/CS224U/rateBeerLingRel/data/clean_data_full_final.csv",
  stringsAsFactors = FALSE
)
# List the available columns.
names(d)
## [1] "X"
## [2] "user_num_places_rated"
## [3] "user_num_following"
## [4] "user_url"
## [5] "beer_num_ratings"
## [6] "user_num_friends"
## [7] "user_num_breweries_rated"
## [8] "review_palate_score"
## [9] "review_taste_score"
## [10] "user_num_ratings"
## [11] "review_ratings_blob"
## [12] "review_aroma_score"
## [13] "user_num_countries_rated"
## [14] "user_id"
## [15] "review_avg_score"
## [16] "beer_global_style_score"
## [17] "beer_weighted_avg_score"
## [18] "beer_brewer_name"
## [19] "review_overall_score"
## [20] "user_location"
## [21] "review_appearance_score"
## [22] "beer_num_calories"
## [23] "beer_style"
## [24] "beer_url"
## [25] "review_blob"
## [26] "beer_global_score"
## [27] "user_name"
## [28] "beer_location"
## [29] "beer_ABV"
## [30] "beer_name"
## [31] "beer_ABV_num"
## [32] "user_experience"
## [33] "review_blob_lower"
## [34] "user_experience_quartile"
## [35] "user_experience1000"
## [36] "user_experience500"
## [37] "num_tokens"
## [38] "num_types"
## [39] "type_token_ratio"
## [40] "corrected_ttr"
## [41] "num_syllables"
## [42] "readability_score"
## [43] "normalized_beer_global_score"
## [44] "normalized_beer_global_style_score"
## [45] "num_first_person_singular_pnouns"
## [46] "num_swear_words"
## [47] "num_negation_words"
## [48] "num_mispelled_words"
# Histogram of review length in tokens, in bins of width 2. The x axis is
# limited to 0-400, so rows outside that range are dropped from the plot
# (ggplot2 reports them in a warning).
ggplot(d, aes(num_tokens)) +
  xlim(0, 400) +
  xlab("Number of tokens") +
  ylab("Number of reviews") +
  geom_histogram(binwidth = 2)
## Warning: Removed 7 rows containing non-finite values (stat_bin).
# Central tendency of review length in tokens: median first, then mean.
median(d$num_tokens)
## [1] 35
mean(d$num_tokens)
## [1] 43.4828
# How many users share each `user_num_ratings` value, on log-log axes.
# One row is kept per user (the first occurrence of each `user_id`), users are
# counted per `user_num_ratings`, and both axes are limited to [0, 10].
d[!duplicated(d$user_id), ] %>%
  count(user_num_ratings, name = "count") %>%
  ggplot(aes(x = log(count), y = log(user_num_ratings))) +
  labs(
    x = "Log count number of users",
    y = "Log count number of reviews made"
  ) +
  xlim(0, 10) +
  ylim(0, 10) +
  # Alternatives left disabled by the author:
  # scale_x_reverse() +
  # geom_bar(stat = "identity")
  geom_point(alpha = 0.1, size = 5, colour = "blue")
# Build `d.user_info`: one row per `user_name` with the mean and variance of
# the review-level scores, beer attributes and linguistic features.
# Only rows whose `user_num_ratings` is at least `min_reviews` are kept.
# The surrounding proc.time() calls time the aggregation.
min_reviews <- 20
ptm <- proc.time()
d.user_info <- d %>%
filter(user_num_ratings >= min_reviews) %>%
group_by(user_name) %>%
summarise(
## Reviews
## -------
# Scores the user gave. No na.rm here, so a missing score makes that user's
# summary NA; var() is also NA for a user with a single row.
avg_overall_score = mean(review_overall_score),
var_overall_score = var(review_overall_score),
# NOTE(review): dividing by 4 presumably puts the overall score on the same
# scale as `normalized_beer_global_score` - TODO confirm the scaling.
normalized_avg_overall_score = mean(review_overall_score/4),
normalized_var_overall_score = var(review_overall_score/4),
avg_taste_score = mean(review_taste_score),
var_taste_score = var(review_taste_score),
avg_aroma_score = mean(review_aroma_score),
var_aroma_score = var(review_aroma_score),
avg_appearance_score = mean(review_appearance_score),
var_appearance_score = var(review_appearance_score),
avg_palate_score = mean(review_palate_score),
var_palate_score = var(review_palate_score),
## Beer attributes
## ---------------
# Attributes of the beers the user reviewed; missing values are ignored.
avg_beer_global_score = mean(beer_global_score, na.rm = TRUE),
var_beer_global_score = var(beer_global_score, na.rm = TRUE),
normalized_avg_global_score = mean(normalized_beer_global_score, na.rm = TRUE),
# Uses the two per-user summaries computed above in this same summarise() call.
diff_overall_score = normalized_avg_overall_score - normalized_avg_global_score,
avg_beer_global_style_score = mean(beer_global_style_score, na.rm = TRUE),
var_beer_global_style_score = var(beer_global_style_score, na.rm = TRUE),
# avg_beer_abv = mean(beer_ABV, na.rm = TRUE),
avg_beer_num_calories = mean(beer_num_calories, na.rm = TRUE),
var_beer_num_calories = var(beer_num_calories, na.rm = TRUE),
avg_beer_num_ratings = mean(beer_num_ratings, na.rm = TRUE),
var_beer_num_ratings = var(beer_num_ratings, na.rm = TRUE),
## User
## ----
# Collapses the per-row value to one number per user and replaces the column
# of the same name for the rest of this call.
# NOTE(review): presumably constant within a user - verify.
user_num_ratings = mean(user_num_ratings),
# `review_similarities` is not defined in this chunk; presumably it comes from
# the sourced analysis_helpers.R and returns one value per user - TODO confirm.
review_sims = review_similarities(review_blob_lower),
# Linguistic features of the user's review texts (no na.rm).
avg_num_tokens = mean(num_tokens),
var_num_tokens = var(num_tokens),
avg_num_types = mean(num_types),
var_num_types = var(num_types),
avg_lexdiv_type_token = mean(type_token_ratio),
var_lexdiv_type_token = var(type_token_ratio),
avg_num_syllables = mean(num_syllables),
var_num_syllables = var(num_syllables),
avg_cttr = mean(corrected_ttr),
var_cttr = var(corrected_ttr),
avg_readability = mean(readability_score),
var_readability = var(readability_score),
avg_fpspns = mean(num_first_person_singular_pnouns),
var_fpspns = var(num_first_person_singular_pnouns),
# Number of distinct beer styles among the user's rows.
num_styles = length(unique(beer_style)),
avg_num_swear_words = mean(num_swear_words),
var_num_swear_words = var(num_swear_words),
avg_num_negation = mean(num_negation_words),
var_num_negation = var(num_negation_words),
avg_num_mispelled_words = mean(num_mispelled_words),
var_num_mispelled_words = var(num_mispelled_words))
# Print the time elapsed since `ptm` was taken.
proc.time() - ptm
# Per-user mean number of first person singular pronouns against log
# `user_num_ratings`. Point size encodes the per-user variance and colour the
# user; a single linear trend (group = 1) is fitted across all users.
ggplot(
  d.user_info,
  aes(
    x = log(user_num_ratings), y = avg_fpspns,
    size = var_fpspns, col = user_name
  )
) +
  geom_point(alpha = 0.7) +
  ylab("Mean number of first person singular pronouns") +
  xlab("Log user num ratings") +
  # `guide = FALSE` is deprecated in ggplot2; "none" is the supported spelling.
  scale_colour_discrete(guide = "none") +
  geom_smooth(method = "lm", aes(group = 1)) +
  theme(legend.position = "none")
# Mixed-effects regression of the per-review count of first person singular
# pronouns on log `user_num_ratings`, friends, the review scores, the beer's
# global score and beer style, with a random intercept per `user_name`.
# The formula is assembled as a string in `f` and converted for lmer().
f <- paste(
  "num_first_person_singular_pnouns ~",
  paste(
    c(
      "log(user_num_ratings)",
      "user_num_friends",
      "review_overall_score",
      "review_taste_score",
      "review_aroma_score",
      "review_palate_score",
      "review_appearance_score",
      "beer_global_score",
      "beer_style",
      "(1 | user_name)"
    ),
    collapse = " + "
  )
)
summary(lmer(as.formula(f), data = d))
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
## to degrees of freedom [lmerMod]
## Formula:
## num_first_person_singular_pnouns ~ log(user_num_ratings) + user_num_friends +
## review_overall_score + review_taste_score + review_aroma_score +
## review_palate_score + review_appearance_score + beer_global_score +
## beer_style + (1 | user_name)
## Data: d
##
## REML criterion at convergence: 157152
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.2951 -0.4655 -0.1648 0.2175 27.1907
##
## Random effects:
## Groups Name Variance Std.Dev.
## user_name (Intercept) 0.7851 0.8861
## Residual 1.3722 1.1714
## Number of obs: 47791, groups: user_name, 4679
##
## Fixed effects:
## Estimate Std. Error df
## (Intercept) 1.265e+00 7.006e-02 4.569e+04
## log(user_num_ratings) -8.391e-02 8.747e-03 2.766e+03
## user_num_friends 3.464e-03 4.194e-03 1.894e+03
## review_overall_score -1.052e-02 3.252e-03 4.732e+04
## review_taste_score -2.459e-02 6.483e-03 4.712e+04
## review_aroma_score 8.467e-03 5.054e-03 4.713e+04
## review_palate_score -2.263e-02 9.675e-03 4.669e+04
## review_appearance_score 2.682e-02 8.629e-03 4.674e+04
## beer_global_score 3.628e-04 3.276e-04 4.722e+04
## beer_styleAbbey Tripel -2.480e-02 6.673e-02 4.508e+04
## beer_styleAbt/Quadrupel 2.593e-02 7.606e-02 4.511e+04
## beer_styleAltbier -2.373e-02 1.145e-01 4.522e+04
## beer_styleAmber Ale -2.845e-02 6.631e-02 4.520e+04
## beer_styleAmber Lager/Vienna -3.758e-02 7.165e-02 4.575e+04
## beer_styleAmerican Pale Ale -3.766e-02 5.935e-02 4.531e+04
## beer_styleAmerican Strong Ale 1.679e-01 6.605e-02 4.508e+04
## beer_styleBaltic Porter 1.333e-02 9.131e-02 4.581e+04
## beer_styleBarley Wine 1.443e-01 6.483e-02 4.495e+04
## beer_styleBelgian Ale -3.584e-02 6.556e-02 4.497e+04
## beer_styleBelgian Strong Ale 9.408e-02 5.863e-02 4.506e+04
## beer_styleBerliner Weisse 1.638e-01 1.041e-01 4.447e+04
## beer_styleBire de Garde 1.629e-01 1.210e-01 4.501e+04
## beer_styleBitter -3.698e-02 7.896e-02 4.624e+04
## beer_styleBlack IPA 1.148e-01 7.283e-02 4.519e+04
## beer_styleBrown Ale -1.551e-02 6.627e-02 4.519e+04
## beer_styleCalifornia Common -9.936e-02 1.263e-01 4.526e+04
## beer_styleCider 1.381e-01 9.214e-02 4.740e+04
## beer_styleCream Ale -4.251e-02 1.233e-01 4.539e+04
## beer_styleCzech Pilsner (Svtl) 1.300e-02 8.847e-02 4.568e+04
## beer_styleDoppelbock 4.135e-03 7.601e-02 4.500e+04
## beer_styleDortmunder/Helles -1.448e-01 8.868e-02 4.531e+04
## beer_styleDry Stout 2.288e-01 8.304e-02 4.545e+04
## beer_styleDunkel/Tmav -1.363e-01 8.640e-02 4.551e+04
## beer_styleDunkelweizen 7.886e-03 1.016e-01 4.499e+04
## beer_styleDunkler Bock -1.602e-01 1.066e-01 4.446e+04
## beer_styleEisbock -4.487e-02 1.825e-01 4.517e+04
## beer_styleEnglish Pale Ale -2.849e-02 9.531e-02 4.490e+04
## beer_styleEnglish Strong Ale 1.095e-01 8.042e-02 4.504e+04
## beer_styleForeign Stout 1.190e-01 9.728e-02 4.524e+04
## beer_styleFruit Beer 1.656e-01 6.616e-02 4.534e+04
## beer_styleGerman Hefeweizen -6.226e-03 6.914e-02 4.547e+04
## beer_styleGerman Kristallweizen -1.505e-01 1.893e-01 4.572e+04
## beer_styleGolden Ale/Blond Ale -9.729e-02 7.045e-02 4.579e+04
## beer_styleGrodziskie/Gose/Lichtenhainer 5.266e-02 1.091e-01 4.401e+04
## beer_styleHeller Bock 3.917e-03 9.527e-02 4.509e+04
## beer_styleIce Cider/Ice Perry 6.707e-02 3.700e-01 4.570e+04
## beer_styleImperial IPA 1.055e-01 5.757e-02 4.540e+04
## beer_styleImperial Pils/Strong Pale Lager 8.935e-02 9.133e-02 4.548e+04
## beer_styleImperial Porter 6.759e-02 8.539e-02 4.460e+04
## beer_styleImperial Stout 1.367e-01 5.746e-02 4.526e+04
## beer_styleIndia Pale Ale (IPA) -3.842e-03 5.528e-02 4.536e+04
## beer_styleIrish Ale -4.115e-02 9.476e-02 4.567e+04
## beer_styleKlsch -7.740e-02 1.072e-01 4.522e+04
## beer_styleLambic Style - Faro 2.958e-01 2.795e-01 4.381e+04
## beer_styleLambic Style - Fruit 1.638e-01 8.450e-02 4.482e+04
## beer_styleLambic Style - Gueuze 3.288e-01 1.050e-01 4.527e+04
## beer_styleLambic Style - Unblended 4.240e-01 1.792e-01 4.371e+04
## beer_styleLow Alcohol 1.062e-01 1.202e-01 4.476e+04
## beer_styleMalt Liquor 1.637e-01 1.051e-01 4.747e+04
## beer_styleMead 1.079e-01 1.298e-01 4.712e+04
## beer_styleMild Ale -1.893e-01 1.493e-01 4.549e+04
## beer_styleOktoberfest/Mrzen -9.991e-02 8.125e-02 4.507e+04
## beer_styleOld Ale -7.052e-02 9.203e-02 4.491e+04
## beer_stylePale Lager -1.693e-02 6.254e-02 4.589e+04
## beer_stylePerry -1.164e-01 2.690e-01 4.454e+04
## beer_stylePilsener -9.609e-02 6.783e-02 4.550e+04
## beer_stylePolotmav -2.117e-01 2.855e-01 4.612e+04
## beer_stylePorter -4.988e-02 6.197e-02 4.529e+04
## beer_stylePremium Bitter/ESB -3.099e-02 7.108e-02 4.528e+04
## beer_stylePremium Lager -9.523e-02 7.912e-02 4.564e+04
## beer_styleRadler/Shandy 4.267e-03 1.346e-01 4.678e+04
## beer_styleSahti/Gotlandsdricke/Kodulu 1.036e-02 3.499e-01 4.382e+04
## beer_styleSaison -6.249e-03 6.541e-02 4.503e+04
## beer_styleSak - Futsu-shu 1.244e+00 9.233e-01 3.876e+04
## beer_styleSak - Ginjo -1.382e+00 7.160e-01 4.740e+04
## beer_styleSak - Infused 9.411e-01 1.189e+00 4.271e+04
## beer_styleSak - Junmai 5.096e-01 8.401e-01 4.267e+04
## beer_styleSak - Nigori -3.322e-01 6.218e-01 4.766e+04
## beer_styleSchwarzbier -1.583e-01 9.560e-02 4.543e+04
## beer_styleScotch Ale 2.103e-02 8.334e-02 4.529e+04
## beer_styleScottish Ale 9.114e-02 1.289e-01 4.582e+04
## beer_styleSession IPA -5.002e-02 8.191e-02 4.489e+04
## beer_styleSmoked 2.382e-01 8.382e-02 4.474e+04
## beer_styleSour Red/Brown 1.916e-01 8.212e-02 4.466e+04
## beer_styleSour/Wild Ale 1.869e-01 7.114e-02 4.490e+04
## beer_styleSpecialty Grain 1.331e-01 9.287e-02 4.490e+04
## beer_styleSpice/Herb/Vegetable 2.206e-01 6.371e-02 4.541e+04
## beer_styleStout -9.123e-02 6.479e-02 4.512e+04
## beer_styleSweet Stout 1.936e-02 7.051e-02 4.525e+04
## beer_styleTraditional Ale 3.531e-01 9.977e-02 4.489e+04
## beer_styleWeizen Bock 3.492e-02 9.885e-02 4.444e+04
## beer_styleWheat Ale 9.962e-02 7.206e-02 4.550e+04
## beer_styleWitbier -3.040e-02 6.864e-02 4.537e+04
## beer_styleZwickel/Keller/Landbier 5.908e-02 1.321e-01 4.683e+04
## t value Pr(>|t|)
## (Intercept) 18.053 < 2e-16 ***
## log(user_num_ratings) -9.594 < 2e-16 ***
## user_num_friends 0.826 0.408920
## review_overall_score -3.235 0.001217 **
## review_taste_score -3.793 0.000149 ***
## review_aroma_score 1.675 0.093923 .
## review_palate_score -2.339 0.019335 *
## review_appearance_score 3.108 0.001883 **
## beer_global_score 1.108 0.268036
## beer_styleAbbey Tripel -0.372 0.710196
## beer_styleAbt/Quadrupel 0.341 0.733214
## beer_styleAltbier -0.207 0.835847
## beer_styleAmber Ale -0.429 0.667883
## beer_styleAmber Lager/Vienna -0.525 0.599908
## beer_styleAmerican Pale Ale -0.635 0.525682
## beer_styleAmerican Strong Ale 2.542 0.011026 *
## beer_styleBaltic Porter 0.146 0.883925
## beer_styleBarley Wine 2.226 0.026011 *
## beer_styleBelgian Ale -0.547 0.584623
## beer_styleBelgian Strong Ale 1.605 0.108551
## beer_styleBerliner Weisse 1.573 0.115681
## beer_styleBire de Garde 1.346 0.178321
## beer_styleBitter -0.468 0.639551
## beer_styleBlack IPA 1.577 0.114850
## beer_styleBrown Ale -0.234 0.815011
## beer_styleCalifornia Common -0.786 0.431624
## beer_styleCider 1.498 0.134030
## beer_styleCream Ale -0.345 0.730385
## beer_styleCzech Pilsner (Svtl) 0.147 0.883166
## beer_styleDoppelbock 0.054 0.956614
## beer_styleDortmunder/Helles -1.632 0.102616
## beer_styleDry Stout 2.756 0.005858 **
## beer_styleDunkel/Tmav -1.578 0.114571
## beer_styleDunkelweizen 0.078 0.938115
## beer_styleDunkler Bock -1.503 0.132888
## beer_styleEisbock -0.246 0.805747
## beer_styleEnglish Pale Ale -0.299 0.765019
## beer_styleEnglish Strong Ale 1.362 0.173313
## beer_styleForeign Stout 1.223 0.221274
## beer_styleFruit Beer 2.504 0.012286 *
## beer_styleGerman Hefeweizen -0.090 0.928248
## beer_styleGerman Kristallweizen -0.795 0.426593
## beer_styleGolden Ale/Blond Ale -1.381 0.167288
## beer_styleGrodziskie/Gose/Lichtenhainer 0.482 0.629481
## beer_styleHeller Bock 0.041 0.967205
## beer_styleIce Cider/Ice Perry 0.181 0.856156
## beer_styleImperial IPA 1.832 0.066990 .
## beer_styleImperial Pils/Strong Pale Lager 0.978 0.327906
## beer_styleImperial Porter 0.792 0.428635
## beer_styleImperial Stout 2.379 0.017360 *
## beer_styleIndia Pale Ale (IPA) -0.070 0.944586
## beer_styleIrish Ale -0.434 0.664086
## beer_styleKlsch -0.722 0.470231
## beer_styleLambic Style - Faro 1.059 0.289800
## beer_styleLambic Style - Fruit 1.939 0.052536 .
## beer_styleLambic Style - Gueuze 3.133 0.001731 **
## beer_styleLambic Style - Unblended 2.366 0.018004 *
## beer_styleLow Alcohol 0.883 0.376994
## beer_styleMalt Liquor 1.558 0.119166
## beer_styleMead 0.831 0.405944
## beer_styleMild Ale -1.268 0.204881
## beer_styleOktoberfest/Mrzen -1.230 0.218852
## beer_styleOld Ale -0.766 0.443485
## beer_stylePale Lager -0.271 0.786655
## beer_stylePerry -0.433 0.665351
## beer_stylePilsener -1.417 0.156606
## beer_stylePolotmav -0.742 0.458385
## beer_stylePorter -0.805 0.420895
## beer_stylePremium Bitter/ESB -0.436 0.662844
## beer_stylePremium Lager -1.204 0.228779
## beer_styleRadler/Shandy 0.032 0.974710
## beer_styleSahti/Gotlandsdricke/Kodulu 0.030 0.976382
## beer_styleSaison -0.096 0.923888
## beer_styleSak - Futsu-shu 1.347 0.178045
## beer_styleSak - Ginjo -1.931 0.053534 .
## beer_styleSak - Infused 0.791 0.428658
## beer_styleSak - Junmai 0.607 0.544107
## beer_styleSak - Nigori -0.534 0.593232
## beer_styleSchwarzbier -1.656 0.097777 .
## beer_styleScotch Ale 0.252 0.800762
## beer_styleScottish Ale 0.707 0.479513
## beer_styleSession IPA -0.611 0.541459
## beer_styleSmoked 2.841 0.004492 **
## beer_styleSour Red/Brown 2.333 0.019652 *
## beer_styleSour/Wild Ale 2.627 0.008611 **
## beer_styleSpecialty Grain 1.434 0.151678
## beer_styleSpice/Herb/Vegetable 3.463 0.000534 ***
## beer_styleStout -1.408 0.159142
## beer_styleSweet Stout 0.275 0.783623
## beer_styleTraditional Ale 3.539 0.000402 ***
## beer_styleWeizen Bock 0.353 0.723861
## beer_styleWheat Ale 1.382 0.166859
## beer_styleWitbier -0.443 0.657856
## beer_styleZwickel/Keller/Landbier 0.447 0.654813
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation matrix not shown by default, as p = 94 > 20.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
This is interesting and in line with findings from Jurafsky: more first-person singular pronouns predict worse experience.
# Per-user mean review length (in tokens) against log `user_num_ratings`.
# Point size encodes the per-user variance and colour the user; a single
# linear trend (group = 1) is fitted across all users.
ggplot(
  d.user_info,
  aes(
    x = log(user_num_ratings), y = avg_num_tokens,
    size = var_num_tokens, col = user_name
  )
) +
  geom_point(alpha = 0.7) +
  ylab("Mean review length") +
  xlab("Log user num ratings") +
  # `guide = FALSE` is deprecated in ggplot2; "none" is the supported spelling.
  scale_colour_discrete(guide = "none") +
  geom_smooth(method = "lm", aes(group = 1))
# Mixed-effects regression of review length (`num_tokens`) on log
# `user_num_ratings`, friends, the review scores and the beer's global score,
# with a random intercept per `user_name`.
# The formula is assembled as a string in `f` and converted for lmer().
f <- paste(
  "num_tokens ~",
  paste(
    c(
      "log(user_num_ratings)",
      "user_num_friends",
      "review_overall_score",
      "review_taste_score",
      "review_aroma_score",
      "review_palate_score",
      "review_appearance_score",
      "beer_global_score",
      "(1 | user_name)"
    ),
    collapse = " + "
  )
)
summary(lmer(as.formula(f), data = d))
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
## to degrees of freedom [lmerMod]
## Formula:
## num_tokens ~ log(user_num_ratings) + user_num_friends + review_overall_score +
## review_taste_score + review_aroma_score + review_palate_score +
## review_appearance_score + beer_global_score + (1 | user_name)
## Data: d
##
## REML criterion at convergence: 438282.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -9.652 -0.418 -0.097 0.309 33.271
##
## Random effects:
## Groups Name Variance Std.Dev.
## user_name (Intercept) 571.3 23.90
## Residual 470.4 21.69
## Number of obs: 47791, groups: user_name, 4679
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 3.080e+01 8.291e-01 1.957e+04 37.152 < 2e-16
## log(user_num_ratings) 1.220e+00 2.187e-01 3.946e+03 5.581 2.55e-08
## user_num_friends 2.623e-01 1.102e-01 3.085e+03 2.380 0.01738
## review_overall_score 1.140e-01 6.164e-02 4.769e+04 1.850 0.06432
## review_taste_score -1.249e-01 1.216e-01 4.648e+04 -1.027 0.30433
## review_aroma_score 4.856e-01 9.410e-02 4.646e+04 5.161 2.47e-07
## review_palate_score -2.659e-01 1.809e-01 4.602e+04 -1.470 0.14152
## review_appearance_score 4.551e-01 1.600e-01 4.604e+04 2.845 0.00444
## beer_global_score 2.459e-02 4.794e-03 4.764e+04 5.129 2.93e-07
##
## (Intercept) ***
## log(user_num_ratings) ***
## user_num_friends *
## review_overall_score .
## review_taste_score
## review_aroma_score ***
## review_palate_score
## review_appearance_score **
## beer_global_score ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) lg(__) usr_n_ rvw_v_ rvw_t_ rvw_r_ rvw_pl_ rvw_pp_
## lg(sr_nm_r) -0.528
## usr_nm_frnd 0.128 -0.414
## rvw_vrll_sc -0.013 -0.022 0.005
## rvw_tst_scr -0.047 0.029 -0.006 -0.557
## revw_rm_scr -0.089 0.008 -0.008 -0.212 -0.251
## rvw_plt_scr -0.170 0.023 0.000 -0.209 -0.232 -0.041
## rvw_pprnc_s -0.362 0.048 -0.001 -0.096 -0.044 -0.112 -0.170
## br_glbl_scr 0.113 -0.067 0.003 -0.127 -0.028 -0.214 -0.055 -0.118
Here we also see that more experienced users appear to write longer reviews…
# names(d.user_info)
# Per-user mean number of spelling mistakes against log `user_num_ratings`,
# restricted to users whose mean misspelling count is below a quarter of their
# mean token count. Point size encodes the per-user variance and colour the
# user; a single linear trend (group = 1) is fitted across all users.
d.user_info %>%
  filter(avg_num_mispelled_words / avg_num_tokens < 0.25) %>%
  ggplot(aes(
    x = log(user_num_ratings), y = avg_num_mispelled_words,
    size = var_num_mispelled_words, col = user_name
  )) +
  geom_point(alpha = 0.7) +
  ylab("Mean number of spelling mistakes") +
  xlab("Log user num ratings") +
  # `guide = FALSE` is deprecated in ggplot2; "none" is the supported spelling.
  scale_colour_discrete(guide = "none") +
  geom_smooth(method = "lm", aes(group = 1))
# Keep only reviews in which fewer than a quarter of the tokens are flagged as
# misspelled, then regress the per-review misspelling count on log
# `user_num_ratings`, friends, the review scores and the beer's global score,
# with a random intercept per `user_name`.
d.filter_non_english <- filter(d, num_mispelled_words / num_tokens < 0.25)
f <- paste(
  "num_mispelled_words ~",
  paste(
    c(
      "log(user_num_ratings)",
      "user_num_friends",
      "review_overall_score",
      "review_taste_score",
      "review_aroma_score",
      "review_palate_score",
      "review_appearance_score",
      "beer_global_score",
      "(1 | user_name)"
    ),
    collapse = " + "
  )
)
summary(lmer(as.formula(f), data = d.filter_non_english))
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
## to degrees of freedom [lmerMod]
## Formula: num_mispelled_words ~ log(user_num_ratings) + user_num_friends +
## review_overall_score + review_taste_score + review_aroma_score +
## review_palate_score + review_appearance_score + beer_global_score +
## (1 | user_name)
## Data: d.filter_non_english
##
## REML criterion at convergence: 170742.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.3450 -0.6031 -0.1628 0.4385 18.2976
##
## Random effects:
## Groups Name Variance Std.Dev.
## user_name (Intercept) 1.208 1.099
## Residual 2.592 1.610
## Number of obs: 43672, groups: user_name, 4359
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 1.512e+00 5.623e-02 2.428e+04 26.896 < 2e-16
## log(user_num_ratings) 1.913e-02 1.152e-02 3.195e+03 1.660 0.09692
## user_num_friends 1.629e-02 5.294e-03 2.124e+03 3.077 0.00212
## review_overall_score 2.258e-03 4.664e-03 4.322e+04 0.484 0.62829
## review_taste_score 4.474e-04 9.296e-03 4.340e+04 0.048 0.96161
## review_aroma_score 1.364e-02 7.158e-03 4.341e+04 1.906 0.05664
## review_palate_score 2.479e-03 1.385e-02 4.313e+04 0.179 0.85793
## review_appearance_score -1.075e-02 1.225e-02 4.312e+04 -0.877 0.38038
## beer_global_score 1.657e-03 3.594e-04 4.327e+04 4.611 4.03e-06
##
## (Intercept) ***
## log(user_num_ratings) .
## user_num_friends **
## review_overall_score
## review_taste_score
## review_aroma_score .
## review_palate_score
## review_appearance_score
## beer_global_score ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) lg(__) usr_n_ rvw_v_ rvw_t_ rvw_r_ rvw_pl_ rvw_pp_
## lg(sr_nm_r) -0.513
## usr_nm_frnd 0.132 -0.425
## rvw_vrll_sc -0.021 -0.023 0.005
## rvw_tst_scr -0.048 0.035 -0.008 -0.559
## revw_rm_scr -0.090 0.007 -0.012 -0.210 -0.252
## rvw_plt_scr -0.184 0.030 -0.001 -0.205 -0.238 -0.041
## rvw_pprnc_s -0.403 0.068 -0.001 -0.090 -0.047 -0.119 -0.177
## br_glbl_scr 0.122 -0.093 0.004 -0.124 -0.023 -0.214 -0.056 -0.114
There may be some problems with non-English reviews, but if we filter so that the reviewers are misspelling fewer than one in four words, we aren’t seeing evidence that this is an effect of expertise…
# names(d.user_info)
# Per-user mean number of swear words against log `user_num_ratings`, for
# users with a `user_num_ratings` of at least 50. Point size encodes the
# per-user variance and colour the user; all legends are hidden.
d.user_info %>%
  filter(user_num_ratings >= 50) %>%
  ggplot(aes(
    x = log(user_num_ratings), y = avg_num_swear_words,
    size = var_num_swear_words, col = user_name
  )) +
  geom_point(alpha = 0.7) +
  ylab("Mean number of swear words used") +
  xlab("Log user num ratings") +
  # `guide = FALSE` is deprecated in ggplot2; "none" is the supported spelling.
  scale_colour_discrete(guide = "none") +
  theme(legend.position = "none")
# d.filter_non_english <- d %>%
# filter(num_mispelled_words / num_tokens < 0.25)
# List the columns of `d` available as model predictors.
names(d)
## [1] "X"
## [2] "user_num_places_rated"
## [3] "user_num_following"
## [4] "user_url"
## [5] "beer_num_ratings"
## [6] "user_num_friends"
## [7] "user_num_breweries_rated"
## [8] "review_palate_score"
## [9] "review_taste_score"
## [10] "user_num_ratings"
## [11] "review_ratings_blob"
## [12] "review_aroma_score"
## [13] "user_num_countries_rated"
## [14] "user_id"
## [15] "review_avg_score"
## [16] "beer_global_style_score"
## [17] "beer_weighted_avg_score"
## [18] "beer_brewer_name"
## [19] "review_overall_score"
## [20] "user_location"
## [21] "review_appearance_score"
## [22] "beer_num_calories"
## [23] "beer_style"
## [24] "beer_url"
## [25] "review_blob"
## [26] "beer_global_score"
## [27] "user_name"
## [28] "beer_location"
## [29] "beer_ABV"
## [30] "beer_name"
## [31] "beer_ABV_num"
## [32] "user_experience"
## [33] "review_blob_lower"
## [34] "user_experience_quartile"
## [35] "user_experience1000"
## [36] "user_experience500"
## [37] "num_tokens"
## [38] "num_types"
## [39] "type_token_ratio"
## [40] "corrected_ttr"
## [41] "num_syllables"
## [42] "readability_score"
## [43] "normalized_beer_global_score"
## [44] "normalized_beer_global_style_score"
## [45] "num_first_person_singular_pnouns"
## [46] "num_swear_words"
## [47] "num_negation_words"
## [48] "num_mispelled_words"
# Mixed-effects regression of the per-review swear word count on log
# `user_num_ratings`, friends, the review scores, the beer's global score and
# beer style, with a random intercept per `user_name`.
# The formula is assembled as a string in `f` and converted for lmer().
f <- paste(
  "num_swear_words ~",
  paste(
    c(
      "log(user_num_ratings)",
      "user_num_friends",
      "review_overall_score",
      "review_taste_score",
      "review_aroma_score",
      "review_palate_score",
      "review_appearance_score",
      "beer_global_score",
      "beer_style",
      "(1 | user_name)"
    ),
    collapse = " + "
  )
)
summary(lmer(as.formula(f), data = d))
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
## to degrees of freedom [lmerMod]
## Formula: num_swear_words ~ log(user_num_ratings) + user_num_friends +
## review_overall_score + review_taste_score + review_aroma_score +
## review_palate_score + review_appearance_score + beer_global_score +
## beer_style + (1 | user_name)
## Data: d
##
## REML criterion at convergence: -52755.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.531 -0.144 -0.068 -0.026 34.314
##
## Random effects:
## Groups Name Variance Std.Dev.
## user_name (Intercept) 0.001051 0.03242
## Residual 0.018498 0.13601
## Number of obs: 47791, groups: user_name, 4679
##
## Fixed effects:
## Estimate Std. Error df
## (Intercept) 1.864e-03 7.541e-03 4.542e+04
## log(user_num_ratings) 6.326e-04 5.093e-04 2.846e+03
## user_num_friends 5.874e-04 1.977e-04 1.401e+03
## review_overall_score -3.791e-04 3.437e-04 3.605e+04
## review_taste_score 5.053e-04 7.151e-04 4.560e+04
## review_aroma_score -7.935e-04 5.579e-04 4.603e+04
## review_palate_score 2.130e-03 1.071e-03 4.552e+04
## review_appearance_score 3.617e-05 9.553e-04 4.567e+04
## beer_global_score 3.126e-05 3.619e-05 4.650e+04
## beer_styleAbbey Tripel 5.955e-03 7.595e-03 4.725e+04
## beer_styleAbt/Quadrupel 5.602e-03 8.655e-03 4.724e+04
## beer_styleAltbier 3.805e-05 1.302e-02 4.733e+04
## beer_styleAmber Ale 7.756e-03 7.527e-03 4.749e+04
## beer_styleAmber Lager/Vienna 9.102e-03 8.100e-03 4.760e+04
## beer_styleAmerican Pale Ale 7.365e-03 6.727e-03 4.758e+04
## beer_styleAmerican Strong Ale 1.405e-02 7.507e-03 4.743e+04
## beer_styleBaltic Porter -2.704e-03 1.032e-02 4.755e+04
## beer_styleBarley Wine 7.229e-03 7.370e-03 4.751e+04
## beer_styleBelgian Ale 5.885e-03 7.464e-03 4.728e+04
## beer_styleBelgian Strong Ale 5.946e-03 6.669e-03 4.733e+04
## beer_styleBerliner Weisse -3.923e-03 1.188e-02 4.740e+04
## beer_styleBire de Garde 1.543e-02 1.378e-02 4.724e+04
## beer_styleBitter 1.403e-02 8.848e-03 4.759e+04
## beer_styleBlack IPA 3.148e-03 8.270e-03 4.749e+04
## beer_styleBrown Ale -3.933e-03 7.524e-03 4.748e+04
## beer_styleCalifornia Common -5.448e-03 1.437e-02 4.721e+04
## beer_styleCider -1.785e-03 1.017e-02 4.673e+04
## beer_styleCream Ale -9.153e-03 1.399e-02 4.744e+04
## beer_styleCzech Pilsner (Svtl) 1.036e-02 9.974e-03 4.769e+04
## beer_styleDoppelbock -4.342e-04 8.648e-03 4.735e+04
## beer_styleDortmunder/Helles 1.037e-02 1.006e-02 4.754e+04
## beer_styleDry Stout 1.011e-02 9.401e-03 4.758e+04
## beer_styleDunkel/Tmav 8.435e-04 9.794e-03 4.747e+04
## beer_styleDunkelweizen 3.249e-04 1.156e-02 4.737e+04
## beer_styleDunkler Bock -6.954e-03 1.217e-02 4.717e+04
## beer_styleEisbock 9.353e-02 2.076e-02 4.723e+04
## beer_styleEnglish Pale Ale 1.076e-02 1.086e-02 4.726e+04
## beer_styleEnglish Strong Ale 2.471e-03 9.144e-03 4.739e+04
## beer_styleForeign Stout 1.453e-02 1.106e-02 4.732e+04
## beer_styleFruit Beer 4.584e-03 7.500e-03 4.755e+04
## beer_styleGerman Hefeweizen 7.259e-03 7.832e-03 4.758e+04
## beer_styleGerman Kristallweizen -4.872e-03 2.145e-02 4.742e+04
## beer_styleGolden Ale/Blond Ale 2.934e-03 7.955e-03 4.764e+04
## beer_styleGrodziskie/Gose/Lichtenhainer 8.824e-03 1.249e-02 4.728e+04
## beer_styleHeller Bock 9.699e-03 1.084e-02 4.734e+04
## beer_styleIce Cider/Ice Perry -2.351e-02 4.210e-02 4.716e+04
## beer_styleImperial IPA 1.739e-02 6.515e-03 4.764e+04
## beer_styleImperial Pils/Strong Pale Lager -1.941e-03 1.037e-02 4.736e+04
## beer_styleImperial Porter 8.776e-03 9.742e-03 4.726e+04
## beer_styleImperial Stout 1.035e-02 6.514e-03 4.760e+04
## beer_styleIndia Pale Ale (IPA) 9.735e-03 6.259e-03 4.763e+04
## beer_styleIrish Ale -3.755e-03 1.075e-02 4.736e+04
## beer_styleKlsch -6.267e-03 1.218e-02 4.733e+04
## beer_styleLambic Style - Faro 4.484e-02 3.210e-02 4.673e+04
## beer_styleLambic Style - Fruit 9.426e-03 9.585e-03 4.769e+04
## beer_styleLambic Style - Gueuze 4.916e-03 1.190e-02 4.760e+04
## beer_styleLambic Style - Unblended -1.497e-02 2.048e-02 4.762e+04
## beer_styleLow Alcohol 1.997e-02 1.365e-02 4.766e+04
## beer_styleMalt Liquor 8.071e-02 1.165e-02 4.751e+04
## beer_styleMead -1.050e-03 1.425e-02 4.502e+04
## beer_styleMild Ale -1.546e-02 1.691e-02 4.759e+04
## beer_styleOktoberfest/Mrzen -2.189e-03 9.229e-03 4.750e+04
## beer_styleOld Ale -8.269e-03 1.048e-02 4.726e+04
## beer_stylePale Lager 2.169e-02 7.044e-03 4.770e+04
## beer_stylePerry -1.079e-02 3.069e-02 4.716e+04
## beer_stylePilsener 2.073e-04 7.678e-03 4.761e+04
## beer_stylePolotmav 1.185e-03 3.228e-02 4.742e+04
## beer_stylePorter 3.012e-03 7.028e-03 4.754e+04
## beer_stylePremium Bitter/ESB -1.847e-03 8.064e-03 4.752e+04
## beer_stylePremium Lager 1.063e-02 8.959e-03 4.751e+04
## beer_styleRadler/Shandy 1.468e-02 1.511e-02 4.764e+04
## beer_styleSahti/Gotlandsdricke/Kodulu 2.229e-03 4.013e-02 4.685e+04
## beer_styleSaison 7.063e-03 7.430e-03 4.751e+04
## beer_styleSak - Futsu-shu -7.245e-04 9.809e-02 4.759e+04
## beer_styleSak - Ginjo -4.450e-03 7.986e-02 4.757e+04
## beer_styleSak - Infused 2.172e-03 1.375e-01 4.635e+04
## beer_styleSak - Junmai 1.409e-03 9.721e-02 4.617e+04
## beer_styleSak - Nigori 1.156e-04 6.913e-02 4.744e+04
## beer_styleSchwarzbier 7.240e-04 1.086e-02 4.734e+04
## beer_styleScotch Ale -4.043e-03 9.465e-03 4.738e+04
## beer_styleScottish Ale 2.035e-02 1.460e-02 4.737e+04
## beer_styleSession IPA 1.059e-02 9.319e-03 4.748e+04
## beer_styleSmoked 5.351e-03 9.551e-03 4.733e+04
## beer_styleSour Red/Brown 1.328e-03 9.356e-03 4.738e+04
## beer_styleSour/Wild Ale 9.328e-03 8.071e-03 4.767e+04
## beer_styleSpecialty Grain 8.029e-03 1.057e-02 4.741e+04
## beer_styleSpice/Herb/Vegetable -3.340e-04 7.217e-03 4.758e+04
## beer_styleStout 5.730e-03 7.363e-03 4.744e+04
## beer_styleSweet Stout 2.715e-03 7.998e-03 4.754e+04
## beer_styleTraditional Ale -1.109e-02 1.137e-02 4.726e+04
## beer_styleWeizen Bock -9.125e-03 1.129e-02 4.714e+04
## beer_styleWheat Ale 1.370e-02 8.168e-03 4.752e+04
## beer_styleWitbier 2.006e-02 7.788e-03 4.746e+04
## beer_styleZwickel/Keller/Landbier -6.226e-03 1.485e-02 4.758e+04
## t value Pr(>|t|)
## (Intercept) 0.247 0.80472
## log(user_num_ratings) 1.242 0.21424
## user_num_friends 2.971 0.00301 **
## review_overall_score -1.103 0.27007
## review_taste_score 0.707 0.47978
## review_aroma_score -1.422 0.15492
## review_palate_score 1.989 0.04674 *
## review_appearance_score 0.038 0.96980
## beer_global_score 0.864 0.38771
## beer_styleAbbey Tripel 0.784 0.43300
## beer_styleAbt/Quadrupel 0.647 0.51750
## beer_styleAltbier 0.003 0.99767
## beer_styleAmber Ale 1.030 0.30282
## beer_styleAmber Lager/Vienna 1.124 0.26114
## beer_styleAmerican Pale Ale 1.095 0.27363
## beer_styleAmerican Strong Ale 1.871 0.06134 .
## beer_styleBaltic Porter -0.262 0.79344
## beer_styleBarley Wine 0.981 0.32671
## beer_styleBelgian Ale 0.788 0.43044
## beer_styleBelgian Strong Ale 0.892 0.37264
## beer_styleBerliner Weisse -0.330 0.74119
## beer_styleBire de Garde 1.119 0.26294
## beer_styleBitter 1.586 0.11272
## beer_styleBlack IPA 0.381 0.70344
## beer_styleBrown Ale -0.523 0.60119
## beer_styleCalifornia Common -0.379 0.70460
## beer_styleCider -0.176 0.86063
## beer_styleCream Ale -0.654 0.51297
## beer_styleCzech Pilsner (Svtl) 1.039 0.29902
## beer_styleDoppelbock -0.050 0.95996
## beer_styleDortmunder/Helles 1.031 0.30262
## beer_styleDry Stout 1.076 0.28214
## beer_styleDunkel/Tmav 0.086 0.93136
## beer_styleDunkelweizen 0.028 0.97758
## beer_styleDunkler Bock -0.571 0.56787
## beer_styleEisbock 4.506 6.62e-06 ***
## beer_styleEnglish Pale Ale 0.991 0.32158
## beer_styleEnglish Strong Ale 0.270 0.78700
## beer_styleForeign Stout 1.314 0.18890
## beer_styleFruit Beer 0.611 0.54112
## beer_styleGerman Hefeweizen 0.927 0.35400
## beer_styleGerman Kristallweizen -0.227 0.82031
## beer_styleGolden Ale/Blond Ale 0.369 0.71229
## beer_styleGrodziskie/Gose/Lichtenhainer 0.706 0.47992
## beer_styleHeller Bock 0.895 0.37076
## beer_styleIce Cider/Ice Perry -0.558 0.57655
## beer_styleImperial IPA 2.669 0.00760 **
## beer_styleImperial Pils/Strong Pale Lager -0.187 0.85152
## beer_styleImperial Porter 0.901 0.36769
## beer_styleImperial Stout 1.589 0.11200
## beer_styleIndia Pale Ale (IPA) 1.555 0.11989
## beer_styleIrish Ale -0.349 0.72687
## beer_styleKlsch -0.514 0.60698
## beer_styleLambic Style - Faro 1.397 0.16244
## beer_styleLambic Style - Fruit 0.983 0.32540
## beer_styleLambic Style - Gueuze 0.413 0.67949
## beer_styleLambic Style - Unblended -0.731 0.46499
## beer_styleLow Alcohol 1.463 0.14353
## beer_styleMalt Liquor 6.928 4.32e-12 ***
## beer_styleMead -0.074 0.94125
## beer_styleMild Ale -0.914 0.36059
## beer_styleOktoberfest/Mrzen -0.237 0.81251
## beer_styleOld Ale -0.789 0.43030
## beer_stylePale Lager 3.079 0.00208 **
## beer_stylePerry -0.351 0.72528
## beer_stylePilsener 0.027 0.97845
## beer_stylePolotmav 0.037 0.97072
## beer_stylePorter 0.429 0.66824
## beer_stylePremium Bitter/ESB -0.229 0.81881
## beer_stylePremium Lager 1.186 0.23551
## beer_styleRadler/Shandy 0.971 0.33133
## beer_styleSahti/Gotlandsdricke/Kodulu 0.056 0.95570
## beer_styleSaison 0.951 0.34184
## beer_styleSak - Futsu-shu -0.007 0.99411
## beer_styleSak - Ginjo -0.056 0.95557
## beer_styleSak - Infused 0.016 0.98739
## beer_styleSak - Junmai 0.014 0.98844
## beer_styleSak - Nigori 0.002 0.99867
## beer_styleSchwarzbier 0.067 0.94683
## beer_styleScotch Ale -0.427 0.66929
## beer_styleScottish Ale 1.394 0.16324
## beer_styleSession IPA 1.136 0.25588
## beer_styleSmoked 0.560 0.57529
## beer_styleSour Red/Brown 0.142 0.88714
## beer_styleSour/Wild Ale 1.156 0.24780
## beer_styleSpecialty Grain 0.760 0.44740
## beer_styleSpice/Herb/Vegetable -0.046 0.96309
## beer_styleStout 0.778 0.43644
## beer_styleSweet Stout 0.340 0.73422
## beer_styleTraditional Ale -0.976 0.32918
## beer_styleWeizen Bock -0.808 0.41911
## beer_styleWheat Ale 1.678 0.09341 .
## beer_styleWitbier 2.576 0.01000 *
## beer_styleZwickel/Keller/Landbier -0.419 0.67496
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation matrix not shown by default, as p = 94 > 20.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
# Per-user scatter of the average number of negations against
# log(user_num_ratings). Point size is mapped to var_num_negation and colour
# to user_name; a single pooled linear fit is drawn over all points
# (group = 1 overrides the per-user colour grouping).
ggplot(
  d.user_info,
  aes(
    x = log(user_num_ratings),
    y = avg_num_negation,
    size = var_num_negation,
    col = user_name
  )
) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", aes(group = 1)) +
  # `guide = FALSE` is deprecated in recent ggplot2; "none" is the
  # equivalent spelling that does not emit a deprecation warning.
  scale_colour_discrete(guide = "none") +
  labs(
    x = "Log user num ratings",
    y = "Average number of negations used"
  ) +
  theme(legend.position = "none")
# Disabled pre-filter, kept for reference: it would keep only reviews whose
# misspelled-token ratio is below 0.25.
# d.filter_non_english <- d %>%
#   filter(num_mispelled_words / num_tokens < 0.25)

# Review-level mixed model for num_negation_words: fixed effects are joined
# with " + " into the formula string, with a random intercept per user_name.
f <- paste(
  "num_negation_words ~",
  paste(
    c(
      "log(user_num_ratings)",
      "user_num_friends",
      "review_overall_score",
      "review_taste_score",
      "review_aroma_score",
      "review_palate_score",
      "review_appearance_score",
      "beer_global_score",
      "beer_style",
      "(1 | user_name)"
    ),
    collapse = " + "
  )
)
summary(
  lmer(formula = as.formula(f), data = d)
)
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
## to degrees of freedom [lmerMod]
## Formula: num_negation_words ~ log(user_num_ratings) + user_num_friends +
## review_overall_score + review_taste_score + review_aroma_score +
## review_palate_score + review_appearance_score + beer_global_score +
## beer_style + (1 | user_name)
## Data: d
##
## REML criterion at convergence: 113686.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.5235 -0.5823 -0.2496 0.4508 17.8673
##
## Random effects:
## Groups Name Variance Std.Dev.
## user_name (Intercept) 0.1364 0.3693
## Residual 0.5779 0.7602
## Number of obs: 47791, groups: user_name, 4679
##
## Fixed effects:
## Estimate Std. Error df
## (Intercept) 1.434e+00 4.387e-02 4.670e+04
## log(user_num_ratings) -1.390e-02 4.165e-03 3.262e+03
## user_num_friends 1.132e-03 1.850e-03 1.946e+03
## review_overall_score -1.161e-02 2.037e-03 4.531e+04
## review_taste_score -3.577e-02 4.128e-03 4.770e+04
## review_aroma_score -2.159e-02 3.218e-03 4.770e+04
## review_palate_score -2.095e-02 6.174e-03 4.766e+04
## review_appearance_score -2.239e-02 5.506e-03 4.766e+04
## beer_global_score -2.061e-03 2.085e-04 4.770e+04
## beer_styleAbbey Tripel -3.439e-02 4.294e-02 4.646e+04
## beer_styleAbt/Quadrupel 6.962e-02 4.894e-02 4.647e+04
## beer_styleAltbier 5.201e-02 7.365e-02 4.655e+04
## beer_styleAmber Ale 1.570e-02 4.264e-02 4.663e+04
## beer_styleAmber Lager/Vienna -8.556e-02 4.599e-02 4.696e+04
## beer_styleAmerican Pale Ale -7.092e-02 3.815e-02 4.673e+04
## beer_styleAmerican Strong Ale 3.963e-02 4.250e-02 4.654e+04
## beer_styleBaltic Porter 4.260e-02 5.860e-02 4.695e+04
## beer_styleBarley Wine 8.338e-02 4.173e-02 4.648e+04
## beer_styleBelgian Ale -9.112e-02 4.221e-02 4.642e+04
## beer_styleBelgian Strong Ale 9.299e-03 3.773e-02 4.649e+04
## beer_styleBerliner Weisse 1.392e-01 6.713e-02 4.614e+04
## beer_styleBire de Garde -5.503e-02 7.789e-02 4.640e+04
## beer_styleBitter -4.901e-02 5.054e-02 4.739e+04
## beer_styleBlack IPA 1.947e-02 4.685e-02 4.660e+04
## beer_styleBrown Ale -4.504e-02 4.262e-02 4.663e+04
## beer_styleCalifornia Common -3.603e-02 8.127e-02 4.654e+04
## beer_styleCider 1.643e-01 5.857e-02 4.769e+04
## beer_styleCream Ale 4.110e-02 7.927e-02 4.674e+04
## beer_styleCzech Pilsner (Svtl) -1.639e-01 5.677e-02 4.706e+04
## beer_styleDoppelbock -2.237e-02 4.892e-02 4.645e+04
## beer_styleDortmunder/Helles -1.273e-01 5.702e-02 4.668e+04
## beer_styleDry Stout -3.366e-02 5.335e-02 4.685e+04
## beer_styleDunkel/Tmav -5.805e-02 5.551e-02 4.677e+04
## beer_styleDunkelweizen -1.178e-01 6.539e-02 4.643e+04
## beer_styleDunkler Bock 3.346e-02 6.874e-02 4.616e+04
## beer_styleEisbock -1.754e-01 1.174e-01 4.652e+04
## beer_styleEnglish Pale Ale -9.925e-02 6.138e-02 4.634e+04
## beer_styleEnglish Strong Ale -6.829e-02 5.175e-02 4.651e+04
## beer_styleForeign Stout -7.789e-03 6.257e-02 4.655e+04
## beer_styleFruit Beer 7.420e-02 4.252e-02 4.673e+04
## beer_styleGerman Hefeweizen -1.014e-01 4.442e-02 4.680e+04
## beer_styleGerman Kristallweizen -5.681e-02 1.216e-01 4.684e+04
## beer_styleGolden Ale/Blond Ale -8.517e-02 4.520e-02 4.702e+04
## beer_styleGrodziskie/Gose/Lichtenhainer 1.197e-01 7.047e-02 4.581e+04
## beer_styleHeller Bock 3.366e-02 6.131e-02 4.650e+04
## beer_styleIce Cider/Ice Perry 3.782e-01 2.379e-01 4.646e+04
## beer_styleImperial IPA 4.507e-02 3.699e-02 4.684e+04
## beer_styleImperial Pils/Strong Pale Lager -5.539e-04 5.870e-02 4.668e+04
## beer_styleImperial Porter 6.522e-03 5.503e-02 4.619e+04
## beer_styleImperial Stout 7.699e-02 3.694e-02 4.672e+04
## beer_styleIndia Pale Ale (IPA) 4.707e-04 3.552e-02 4.681e+04
## beer_styleIrish Ale -1.080e-01 6.087e-02 4.672e+04
## beer_styleKlsch 7.382e-02 6.894e-02 4.656e+04
## beer_styleLambic Style - Faro 1.492e-01 1.806e-01 4.553e+04
## beer_styleLambic Style - Fruit 9.708e-02 5.439e-02 4.658e+04
## beer_styleLambic Style - Gueuze 6.889e-02 6.748e-02 4.671e+04
## beer_styleLambic Style - Unblended 9.028e-03 1.158e-01 4.581e+04
## beer_styleLow Alcohol -3.256e-01 7.740e-02 4.650e+04
## beer_styleMalt Liquor -1.351e-01 6.681e-02 4.769e+04
## beer_styleMead 2.085e-01 8.256e-02 4.766e+04
## beer_styleMild Ale -1.076e-01 9.594e-02 4.680e+04
## beer_styleOktoberfest/Mrzen -5.254e-03 5.228e-02 4.658e+04
## beer_styleOld Ale 3.767e-02 5.926e-02 4.635e+04
## beer_stylePale Lager -2.381e-01 4.010e-02 4.714e+04
## beer_stylePerry -1.993e-01 1.734e-01 4.625e+04
## beer_stylePilsener -8.217e-02 4.357e-02 4.684e+04
## beer_stylePolotmav -3.670e-01 1.830e-01 4.700e+04
## beer_stylePorter -4.486e-02 3.984e-02 4.670e+04
## beer_stylePremium Bitter/ESB -5.968e-02 4.570e-02 4.669e+04
## beer_stylePremium Lager -7.027e-02 5.081e-02 4.685e+04
## beer_styleRadler/Shandy -1.601e-01 8.602e-02 4.740e+04
## beer_styleSahti/Gotlandsdricke/Kodulu 1.824e-01 2.260e-01 4.569e+04
## beer_styleSaison -4.810e-02 4.209e-02 4.656e+04
## beer_styleSak - Futsu-shu 7.006e-02 5.694e-01 4.464e+04
## beer_styleSak - Ginjo -7.137e-01 4.556e-01 4.766e+04
## beer_styleSak - Infused -5.293e-01 7.709e-01 4.469e+04
## beer_styleSak - Junmai 2.983e-01 5.448e-01 4.461e+04
## beer_styleSak - Nigori -4.754e-01 3.940e-01 4.769e+04
## beer_styleSchwarzbier -5.571e-02 6.145e-02 4.665e+04
## beer_styleScotch Ale -1.837e-03 5.359e-02 4.664e+04
## beer_styleScottish Ale -4.386e-02 8.273e-02 4.688e+04
## beer_styleSession IPA -8.507e-02 5.274e-02 4.643e+04
## beer_styleSmoked 7.304e-02 5.399e-02 4.631e+04
## beer_styleSour Red/Brown 9.436e-03 5.291e-02 4.631e+04
## beer_styleSour/Wild Ale 4.693e-03 4.579e-02 4.658e+04
## beer_styleSpecialty Grain -4.605e-02 5.980e-02 4.642e+04
## beer_styleSpice/Herb/Vegetable 1.088e-01 4.094e-02 4.678e+04
## beer_styleStout -2.074e-02 4.169e-02 4.655e+04
## beer_styleSweet Stout 1.910e-02 4.533e-02 4.669e+04
## beer_styleTraditional Ale 1.736e-01 6.424e-02 4.636e+04
## beer_styleWeizen Bock -3.822e-02 6.375e-02 4.608e+04
## beer_styleWheat Ale 3.298e-02 4.630e-02 4.676e+04
## beer_styleWitbier -2.866e-02 4.412e-02 4.670e+04
## beer_styleZwickel/Keller/Landbier -2.002e-02 8.445e-02 4.737e+04
## t value Pr(>|t|)
## (Intercept) 32.700 < 2e-16 ***
## log(user_num_ratings) -3.337 0.000857 ***
## user_num_friends 0.612 0.540831
## review_overall_score -5.699 1.21e-08 ***
## review_taste_score -8.667 < 2e-16 ***
## review_aroma_score -6.711 1.96e-11 ***
## review_palate_score -3.394 0.000690 ***
## review_appearance_score -4.067 4.78e-05 ***
## beer_global_score -9.886 < 2e-16 ***
## beer_styleAbbey Tripel -0.801 0.423311
## beer_styleAbt/Quadrupel 1.422 0.154890
## beer_styleAltbier 0.706 0.480121
## beer_styleAmber Ale 0.368 0.712800
## beer_styleAmber Lager/Vienna -1.860 0.062822 .
## beer_styleAmerican Pale Ale -1.859 0.063048 .
## beer_styleAmerican Strong Ale 0.932 0.351109
## beer_styleBaltic Porter 0.727 0.467230
## beer_styleBarley Wine 1.998 0.045730 *
## beer_styleBelgian Ale -2.159 0.030858 *
## beer_styleBelgian Strong Ale 0.246 0.805309
## beer_styleBerliner Weisse 2.073 0.038133 *
## beer_styleBire de Garde -0.706 0.479881
## beer_styleBitter -0.970 0.332207
## beer_styleBlack IPA 0.416 0.677745
## beer_styleBrown Ale -1.057 0.290652
## beer_styleCalifornia Common -0.443 0.657526
## beer_styleCider 2.804 0.005042 **
## beer_styleCream Ale 0.519 0.604101
## beer_styleCzech Pilsner (Svtl) -2.886 0.003900 **
## beer_styleDoppelbock -0.457 0.647541
## beer_styleDortmunder/Helles -2.233 0.025564 *
## beer_styleDry Stout -0.631 0.528013
## beer_styleDunkel/Tmav -1.046 0.295666
## beer_styleDunkelweizen -1.802 0.071572 .
## beer_styleDunkler Bock 0.487 0.626500
## beer_styleEisbock -1.495 0.135050
## beer_styleEnglish Pale Ale -1.617 0.105884
## beer_styleEnglish Strong Ale -1.320 0.186958
## beer_styleForeign Stout -0.124 0.900938
## beer_styleFruit Beer 1.745 0.081025 .
## beer_styleGerman Hefeweizen -2.282 0.022497 *
## beer_styleGerman Kristallweizen -0.467 0.640304
## beer_styleGolden Ale/Blond Ale -1.884 0.059566 .
## beer_styleGrodziskie/Gose/Lichtenhainer 1.698 0.089517 .
## beer_styleHeller Bock 0.549 0.583010
## beer_styleIce Cider/Ice Perry 1.590 0.111853
## beer_styleImperial IPA 1.218 0.223098
## beer_styleImperial Pils/Strong Pale Lager -0.009 0.992471
## beer_styleImperial Porter 0.119 0.905669
## beer_styleImperial Stout 2.084 0.037163 *
## beer_styleIndia Pale Ale (IPA) 0.013 0.989428
## beer_styleIrish Ale -1.775 0.075966 .
## beer_styleKlsch 1.071 0.284293
## beer_styleLambic Style - Faro 0.826 0.408643
## beer_styleLambic Style - Fruit 1.785 0.074295 .
## beer_styleLambic Style - Gueuze 1.021 0.307292
## beer_styleLambic Style - Unblended 0.078 0.937852
## beer_styleLow Alcohol -4.207 2.59e-05 ***
## beer_styleMalt Liquor -2.021 0.043236 *
## beer_styleMead 2.525 0.011563 *
## beer_styleMild Ale -1.122 0.261891
## beer_styleOktoberfest/Mrzen -0.101 0.919940
## beer_styleOld Ale 0.636 0.524937
## beer_stylePale Lager -5.937 2.93e-09 ***
## beer_stylePerry -1.149 0.250481
## beer_stylePilsener -1.886 0.059336 .
## beer_stylePolotmav -2.005 0.044971 *
## beer_stylePorter -1.126 0.260148
## beer_stylePremium Bitter/ESB -1.306 0.191632
## beer_stylePremium Lager -1.383 0.166663
## beer_styleRadler/Shandy -1.861 0.062787 .
## beer_styleSahti/Gotlandsdricke/Kodulu 0.807 0.419567
## beer_styleSaison -1.143 0.253118
## beer_styleSak - Futsu-shu 0.123 0.902078
## beer_styleSak - Ginjo -1.567 0.117208
## beer_styleSak - Infused -0.687 0.492353
## beer_styleSak - Junmai 0.548 0.583968
## beer_styleSak - Nigori -1.207 0.227550
## beer_styleSchwarzbier -0.907 0.364668
## beer_styleScotch Ale -0.034 0.972658
## beer_styleScottish Ale -0.530 0.595961
## beer_styleSession IPA -1.613 0.106766
## beer_styleSmoked 1.353 0.176136
## beer_styleSour Red/Brown 0.178 0.858450
## beer_styleSour/Wild Ale 0.103 0.918357
## beer_styleSpecialty Grain -0.770 0.441213
## beer_styleSpice/Herb/Vegetable 2.657 0.007888 **
## beer_styleStout -0.497 0.618856
## beer_styleSweet Stout 0.421 0.673505
## beer_styleTraditional Ale 2.702 0.006892 **
## beer_styleWeizen Bock -0.600 0.548753
## beer_styleWheat Ale 0.712 0.476256
## beer_styleWitbier -0.650 0.515964
## beer_styleZwickel/Keller/Landbier -0.237 0.812569
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation matrix not shown by default, as p = 94 > 20.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
Looks like experts tend to use negation less…
# Per-user scatter of the average corrected type-token ratio against
# log(user_num_ratings). Point size is mapped to var_cttr and colour to
# user_name; one pooled linear fit is drawn over all points (group = 1).
ggplot(
  d.user_info,
  aes(
    x = log(user_num_ratings),
    y = avg_cttr,
    size = var_cttr,
    col = user_name
  )
) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", aes(group = 1)) +
  # `guide = FALSE` is deprecated in recent ggplot2; "none" is the
  # equivalent spelling that does not emit a deprecation warning.
  scale_colour_discrete(guide = "none") +
  labs(
    x = "Log user num ratings",
    y = "Average corrected type token ratio"
  ) +
  theme(legend.position = "none")
# Disabled pre-filter, kept for reference: it would keep only reviews whose
# misspelled-token ratio is below 0.25.
# d.filter_non_english <- d %>%
#   filter(num_mispelled_words / num_tokens < 0.25)

# Review-level mixed model for corrected_ttr without beer_style: fixed
# effects are joined with " + ", with a random intercept per user_name.
f <- paste(
  "corrected_ttr ~",
  paste(
    c(
      "log(user_num_ratings)",
      "user_num_friends",
      "review_overall_score",
      "review_taste_score",
      "review_aroma_score",
      "review_palate_score",
      "review_appearance_score",
      "beer_global_score",
      "(1 | user_name)"
    ),
    collapse = " + "
  )
)
summary(
  lmer(formula = as.formula(f), data = d)
)
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
## to degrees of freedom [lmerMod]
## Formula:
## corrected_ttr ~ log(user_num_ratings) + user_num_friends + review_overall_score +
## review_taste_score + review_aroma_score + review_palate_score +
## review_appearance_score + beer_global_score + (1 | user_name)
## Data: d
##
## REML criterion at convergence: 109681.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.9153 -0.4656 0.0018 0.5183 7.9317
##
## Random effects:
## Groups Name Variance Std.Dev.
## user_name (Intercept) 0.4256 0.6524
## Residual 0.4971 0.7050
## Number of obs: 47791, groups: user_name, 4679
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 3.312e+00 2.548e-02 2.264e+04 129.988 < 2e-16
## log(user_num_ratings) 5.051e-02 6.155e-03 3.807e+03 8.206 4.44e-16
## user_num_friends 5.242e-03 3.038e-03 2.814e+03 1.725 0.0846
## review_overall_score 4.102e-03 1.981e-03 4.778e+04 2.071 0.0384
## review_taste_score -2.537e-03 3.928e-03 4.693e+04 -0.646 0.5184
## review_aroma_score 5.659e-03 3.040e-03 4.691e+04 1.861 0.0627
## review_palate_score -1.298e-02 5.850e-03 4.651e+04 -2.220 0.0265
## review_appearance_score 8.640e-03 5.173e-03 4.652e+04 1.670 0.0949
## beer_global_score 1.066e-03 1.541e-04 4.778e+04 6.917 4.66e-12
##
## (Intercept) ***
## log(user_num_ratings) ***
## user_num_friends .
## review_overall_score *
## review_taste_score
## review_aroma_score .
## review_palate_score *
## review_appearance_score .
## beer_global_score ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) lg(__) usr_n_ rvw_v_ rvw_t_ rvw_r_ rvw_pl_ rvw_pp_
## lg(sr_nm_r) -0.519
## usr_nm_frnd 0.127 -0.415
## rvw_vrll_sc -0.014 -0.025 0.005
## rvw_tst_scr -0.048 0.032 -0.007 -0.556
## revw_rm_scr -0.090 0.008 -0.009 -0.211 -0.254
## rvw_plt_scr -0.178 0.025 0.000 -0.206 -0.234 -0.041
## rvw_pprnc_s -0.380 0.055 -0.001 -0.094 -0.044 -0.113 -0.174
## br_glbl_scr 0.115 -0.075 0.003 -0.126 -0.027 -0.213 -0.054 -0.116
# Per-user scatter of the average readability score against
# log(user_num_ratings). Point size is mapped to var_readability and colour
# to user_name; one pooled linear fit is drawn over all points (group = 1).
# Only the colour legend is suppressed here; no theme() call hides the
# size legend.
ggplot(
  d.user_info,
  aes(
    x = log(user_num_ratings),
    y = avg_readability,
    size = var_readability,
    col = user_name
  )
) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", aes(group = 1)) +
  # `guide = FALSE` is deprecated in recent ggplot2; "none" is the
  # equivalent spelling that does not emit a deprecation warning.
  scale_colour_discrete(guide = "none") +
  labs(
    x = "Log user num ratings",
    y = "Average readability score"
  )
# Disabled pre-filter, kept for reference: it would keep only reviews whose
# misspelled-token ratio is below 0.25.
# d.filter_non_english <- d %>%
#   filter(num_mispelled_words / num_tokens < 0.25)

# Review-level mixed model for corrected_ttr, this time including beer_style
# as a fixed effect, with a random intercept per user_name.
# NOTE(review): the plot just before this model shows readability, but the
# response here is corrected_ttr -- confirm which response was intended.
f <- paste(
  "corrected_ttr ~",
  paste(
    c(
      "log(user_num_ratings)",
      "user_num_friends",
      "review_overall_score",
      "review_taste_score",
      "review_aroma_score",
      "review_palate_score",
      "review_appearance_score",
      "beer_global_score",
      "beer_style",
      "(1 | user_name)"
    ),
    collapse = " + "
  )
)
summary(
  lmer(formula = as.formula(f), data = d)
)
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
## to degrees of freedom [lmerMod]
## Formula:
## corrected_ttr ~ log(user_num_ratings) + user_num_friends + review_overall_score +
## review_taste_score + review_aroma_score + review_palate_score +
## review_appearance_score + beer_global_score + beer_style +
## (1 | user_name)
## Data: d
##
## REML criterion at convergence: 109628.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.8190 -0.4652 0.0039 0.5184 7.9171
##
## Random effects:
## Groups Name Variance Std.Dev.
## user_name (Intercept) 0.4222 0.6498
## Residual 0.4938 0.7027
## Number of obs: 47791, groups: user_name, 4679
##
## Fixed effects:
## Estimate Std. Error df
## (Intercept) 3.365e+00 4.294e-02 4.525e+04
## log(user_num_ratings) 5.022e-02 6.137e-03 3.819e+03
## user_num_friends 4.891e-03 3.027e-03 2.817e+03
## review_overall_score 4.925e-03 1.979e-03 4.769e+04
## review_taste_score -1.718e-03 3.921e-03 4.685e+04
## review_aroma_score 2.610e-03 3.057e-03 4.686e+04
## review_palate_score -1.482e-02 5.843e-03 4.644e+04
## review_appearance_score 8.783e-03 5.212e-03 4.650e+04
## beer_global_score 1.106e-04 1.982e-04 4.698e+04
## beer_styleAbbey Tripel 2.723e-02 4.018e-02 4.534e+04
## beer_styleAbt/Quadrupel 1.487e-01 4.580e-02 4.536e+04
## beer_styleAltbier 2.254e-02 6.896e-02 4.544e+04
## beer_styleAmber Ale -1.897e-03 3.993e-02 4.539e+04
## beer_styleAmber Lager/Vienna -4.672e-02 4.319e-02 4.581e+04
## beer_styleAmerican Pale Ale -3.511e-02 3.575e-02 4.546e+04
## beer_styleAmerican Strong Ale 1.404e-01 3.977e-02 4.531e+04
## beer_styleBaltic Porter 3.301e-02 5.505e-02 4.587e+04
## beer_styleBarley Wine 1.574e-01 3.902e-02 4.520e+04
## beer_styleBelgian Ale -1.664e-02 3.947e-02 4.524e+04
## beer_styleBelgian Strong Ale 6.558e-02 3.530e-02 4.530e+04
## beer_styleBerliner Weisse 7.004e-02 6.263e-02 4.486e+04
## beer_styleBire de Garde 4.627e-02 7.285e-02 4.528e+04
## beer_styleBitter -2.151e-02 4.764e-02 4.613e+04
## beer_styleBlack IPA 1.078e-01 4.386e-02 4.540e+04
## beer_styleBrown Ale -2.990e-02 3.991e-02 4.538e+04
## beer_styleCalifornia Common -1.444e-01 7.610e-02 4.547e+04
## beer_styleCider 6.082e-02 5.579e-02 4.718e+04
## beer_styleCream Ale 1.056e-02 7.430e-02 4.553e+04
## beer_styleCzech Pilsner (Svtl) -1.337e-01 5.332e-02 4.572e+04
## beer_styleDoppelbock 1.352e-02 4.576e-02 4.526e+04
## beer_styleDortmunder/Helles -1.510e-01 5.342e-02 4.549e+04
## beer_styleDry Stout -4.824e-02 5.003e-02 4.555e+04
## beer_styleDunkel/Tmav -7.092e-02 5.206e-02 4.564e+04
## beer_styleDunkelweizen -5.576e-02 6.115e-02 4.526e+04
## beer_styleDunkler Bock -1.178e-01 6.413e-02 4.481e+04
## beer_styleEisbock 1.249e-01 1.099e-01 4.537e+04
## beer_styleEnglish Pale Ale -2.897e-02 5.737e-02 4.519e+04
## beer_styleEnglish Strong Ale -7.631e-03 4.842e-02 4.528e+04
## beer_styleForeign Stout 1.320e-01 5.859e-02 4.546e+04
## beer_styleFruit Beer 8.378e-02 3.985e-02 4.550e+04
## beer_styleGerman Hefeweizen -7.029e-02 4.166e-02 4.561e+04
## beer_styleGerman Kristallweizen -8.558e-02 1.141e-01 4.582e+04
## beer_styleGolden Ale/Blond Ale -6.681e-02 4.247e-02 4.583e+04
## beer_styleGrodziskie/Gose/Lichtenhainer 3.849e-02 6.560e-02 4.453e+04
## beer_styleHeller Bock -1.502e-01 5.737e-02 4.533e+04
## beer_styleIce Cider/Ice Perry -2.112e-01 2.231e-01 4.599e+04
## beer_styleImperial IPA 1.370e-01 3.468e-02 4.552e+04
## beer_styleImperial Pils/Strong Pale Lager 1.243e-01 5.503e-02 4.565e+04
## beer_styleImperial Porter 1.087e-01 5.137e-02 4.496e+04
## beer_styleImperial Stout 1.378e-01 3.461e-02 4.542e+04
## beer_styleIndia Pale Ale (IPA) 1.618e-02 3.330e-02 4.548e+04
## beer_styleIrish Ale -7.408e-02 5.712e-02 4.585e+04
## beer_styleKlsch -1.101e-01 6.455e-02 4.543e+04
## beer_styleLambic Style - Faro 1.066e-01 1.679e-01 4.439e+04
## beer_styleLambic Style - Fruit 4.649e-02 5.085e-02 4.506e+04
## beer_styleLambic Style - Gueuze 6.289e-02 6.321e-02 4.544e+04
## beer_styleLambic Style - Unblended 2.780e-01 1.077e-01 4.425e+04
## beer_styleLow Alcohol 1.085e-02 7.233e-02 4.503e+04
## beer_styleMalt Liquor 2.458e-01 6.365e-02 4.732e+04
## beer_styleMead -8.328e-03 7.850e-02 4.673e+04
## beer_styleMild Ale -1.376e-01 8.997e-02 4.562e+04
## beer_styleOktoberfest/Mrzen -1.050e-01 4.892e-02 4.528e+04
## beer_styleOld Ale 9.998e-02 5.539e-02 4.521e+04
## beer_stylePale Lager -5.762e-02 3.771e-02 4.589e+04
## beer_stylePerry -1.673e-01 1.618e-01 4.483e+04
## beer_stylePilsener -1.068e-01 4.087e-02 4.561e+04
## beer_stylePolotmav 3.687e-02 1.722e-01 4.616e+04
## beer_stylePorter -1.680e-02 3.733e-02 4.545e+04
## beer_stylePremium Bitter/ESB -1.236e-02 4.281e-02 4.544e+04
## beer_stylePremium Lager -9.976e-02 4.769e-02 4.574e+04
## beer_styleRadler/Shandy -4.164e-02 8.133e-02 4.669e+04
## beer_styleSahti/Gotlandsdricke/Kodulu -1.363e-01 2.102e-01 4.433e+04
## beer_styleSaison 3.922e-02 3.938e-02 4.524e+04
## beer_styleSak - Futsu-shu 6.147e-02 5.713e-01 3.989e+04
## beer_styleSak - Ginjo -8.437e-01 4.335e-01 4.717e+04
## beer_styleSak - Infused -1.468e-01 7.134e-01 4.362e+04
## beer_styleSak - Junmai 7.067e-04 5.041e-01 4.360e+04
## beer_styleSak - Nigori -2.394e-01 3.782e-01 4.768e+04
## beer_styleSchwarzbier -1.013e-01 5.760e-02 4.561e+04
## beer_styleScotch Ale 3.699e-02 5.020e-02 4.546e+04
## beer_styleScottish Ale -5.654e-03 7.771e-02 4.590e+04
## beer_styleSession IPA 4.595e-02 4.930e-02 4.516e+04
## beer_styleSmoked 8.120e-02 5.044e-02 4.505e+04
## beer_styleSour Red/Brown 2.026e-02 4.941e-02 4.497e+04
## beer_styleSour/Wild Ale 9.137e-02 4.282e-02 4.513e+04
## beer_styleSpecialty Grain 8.717e-02 5.590e-02 4.518e+04
## beer_styleSpice/Herb/Vegetable 9.030e-02 3.838e-02 4.554e+04
## beer_styleStout -3.376e-02 3.901e-02 4.534e+04
## beer_styleSweet Stout -1.004e-02 4.246e-02 4.542e+04
## beer_styleTraditional Ale 1.449e-01 6.005e-02 4.518e+04
## beer_styleWeizen Bock 8.951e-02 5.946e-02 4.484e+04
## beer_styleWheat Ale 4.346e-03 4.342e-02 4.564e+04
## beer_styleWitbier 2.591e-02 4.135e-02 4.553e+04
## beer_styleZwickel/Keller/Landbier 1.284e-02 7.986e-02 4.676e+04
## t value Pr(>|t|)
## (Intercept) 78.382 < 2e-16 ***
## log(user_num_ratings) 8.183 4.44e-16 ***
## user_num_friends 1.616 0.106291
## review_overall_score 2.488 0.012845 *
## review_taste_score -0.438 0.661324
## review_aroma_score 0.854 0.393196
## review_palate_score -2.536 0.011207 *
## review_appearance_score 1.685 0.091972 .
## beer_global_score 0.558 0.576846
## beer_styleAbbey Tripel 0.678 0.497986
## beer_styleAbt/Quadrupel 3.246 0.001170 **
## beer_styleAltbier 0.327 0.743820
## beer_styleAmber Ale -0.048 0.962106
## beer_styleAmber Lager/Vienna -1.082 0.279413
## beer_styleAmerican Pale Ale -0.982 0.325947
## beer_styleAmerican Strong Ale 3.530 0.000416 ***
## beer_styleBaltic Porter 0.600 0.548781
## beer_styleBarley Wine 4.035 5.48e-05 ***
## beer_styleBelgian Ale -0.422 0.673356
## beer_styleBelgian Strong Ale 1.858 0.063199 .
## beer_styleBerliner Weisse 1.118 0.263410
## beer_styleBire de Garde 0.635 0.525310
## beer_styleBitter -0.451 0.651710
## beer_styleBlack IPA 2.457 0.014027 *
## beer_styleBrown Ale -0.749 0.453780
## beer_styleCalifornia Common -1.898 0.057700 .
## beer_styleCider 1.090 0.275631
## beer_styleCream Ale 0.142 0.886951
## beer_styleCzech Pilsner (Svtl) -2.508 0.012159 *
## beer_styleDoppelbock 0.296 0.767591
## beer_styleDortmunder/Helles -2.827 0.004698 **
## beer_styleDry Stout -0.964 0.334950
## beer_styleDunkel/Tmav -1.362 0.173134
## beer_styleDunkelweizen -0.912 0.361831
## beer_styleDunkler Bock -1.837 0.066286 .
## beer_styleEisbock 1.136 0.255822
## beer_styleEnglish Pale Ale -0.505 0.613527
## beer_styleEnglish Strong Ale -0.158 0.874768
## beer_styleForeign Stout 2.254 0.024226 *
## beer_styleFruit Beer 2.102 0.035522 *
## beer_styleGerman Hefeweizen -1.687 0.091558 .
## beer_styleGerman Kristallweizen -0.750 0.453358
## beer_styleGolden Ale/Blond Ale -1.573 0.115695
## beer_styleGrodziskie/Gose/Lichtenhainer 0.587 0.557415
## beer_styleHeller Bock -2.619 0.008833 **
## beer_styleIce Cider/Ice Perry -0.947 0.343688
## beer_styleImperial IPA 3.950 7.83e-05 ***
## beer_styleImperial Pils/Strong Pale Lager 2.259 0.023878 *
## beer_styleImperial Porter 2.116 0.034361 *
## beer_styleImperial Stout 3.981 6.89e-05 ***
## beer_styleIndia Pale Ale (IPA) 0.486 0.626996
## beer_styleIrish Ale -1.297 0.194688
## beer_styleKlsch -1.706 0.087973 .
## beer_styleLambic Style - Faro 0.635 0.525453
## beer_styleLambic Style - Fruit 0.914 0.360634
## beer_styleLambic Style - Gueuze 0.995 0.319768
## beer_styleLambic Style - Unblended 2.582 0.009826 **
## beer_styleLow Alcohol 0.150 0.880812
## beer_styleMalt Liquor 3.861 0.000113 ***
## beer_styleMead -0.106 0.915507
## beer_styleMild Ale -1.530 0.126140
## beer_styleOktoberfest/Mrzen -2.147 0.031771 *
## beer_styleOld Ale 1.805 0.071105 .
## beer_stylePale Lager -1.528 0.126492
## beer_stylePerry -1.034 0.301247
## beer_stylePilsener -2.612 0.008992 **
## beer_stylePolotmav 0.214 0.830487
## beer_stylePorter -0.450 0.652563
## beer_stylePremium Bitter/ESB -0.289 0.772828
## beer_stylePremium Lager -2.092 0.036446 *
## beer_styleRadler/Shandy -0.512 0.608687
## beer_styleSahti/Gotlandsdricke/Kodulu -0.648 0.516753
## beer_styleSaison 0.996 0.319245
## beer_styleSak - Futsu-shu 0.108 0.914318
## beer_styleSak - Ginjo -1.946 0.051619 .
## beer_styleSak - Infused -0.206 0.836921
## beer_styleSak - Junmai 0.001 0.998881
## beer_styleSak - Nigori -0.633 0.526764
## beer_styleSchwarzbier -1.758 0.078677 .
## beer_styleScotch Ale 0.737 0.461183
## beer_styleScottish Ale -0.073 0.942002
## beer_styleSession IPA 0.932 0.351302
## beer_styleSmoked 1.610 0.107432
## beer_styleSour Red/Brown 0.410 0.681780
## beer_styleSour/Wild Ale 2.134 0.032870 *
## beer_styleSpecialty Grain 1.559 0.118944
## beer_styleSpice/Herb/Vegetable 2.353 0.018631 *
## beer_styleStout -0.865 0.386875
## beer_styleSweet Stout -0.237 0.813012
## beer_styleTraditional Ale 2.414 0.015794 *
## beer_styleWeizen Bock 1.505 0.132214
## beer_styleWheat Ale 0.100 0.920283
## beer_styleWitbier 0.627 0.530924
## beer_styleZwickel/Keller/Landbier 0.161 0.872228
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation matrix not shown by default, as p = 94 > 20.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
Readability appears to increase…
# Per-user scatter of review_sims against log(user_num_ratings), coloured by
# user_name, with one pooled linear fit over all points (group = 1).
ggplot(
  d.user_info,
  aes(x = log(user_num_ratings), y = review_sims, col = user_name)
) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", aes(group = 1)) +
  # `guide = FALSE` is deprecated in recent ggplot2; "none" is equivalent.
  scale_colour_discrete(guide = "none") +
  labs(
    x = "Log user num ratings",
    # The label used to read "Average readability score", copied from the
    # readability plot although the y variable is review_sims.
    # NOTE(review): presumably review_sims is a per-user average review
    # similarity -- confirm the wording against where it is computed.
    y = "Average review similarity"
  ) +
  # legend.position is a theme() setting; guides() only accepts aesthetic
  # names, so passing legend.position to it never hid anything.
  theme(legend.position = "none")
# User-level simple linear regression of review_sims on
# log(user_num_ratings); the formula is passed to lm() as a string.
f <- paste("review_sims", "~", "log(user_num_ratings)")
summary(
  lm(f, data = d.user_info)
)
##
## Call:
## lm(formula = f, data = d.user_info)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28967 -0.08149 -0.01002 0.06626 0.65491
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.195836 0.012917 15.16 <2e-16 ***
## log(user_num_ratings) 0.026263 0.002491 10.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1152 on 894 degrees of freedom
## Multiple R-squared: 0.1106, Adjusted R-squared: 0.1096
## F-statistic: 111.2 on 1 and 894 DF, p-value: < 2.2e-16
Clearly there’s an effect here. We can’t run lmer, because this is an average score. We can regress with multiple regression.
# Stack the five per-user mean aspect scores into long form (type / value)
# and plot each against log(user_num_ratings), with one linear fit per
# aspect type (colour = type).
# NOTE(review): the colour legend is suppressed although colour is the only
# cue for which aspect a point or line belongs to -- confirm this is wanted.
d.user_info %>%
  gather(
    type, value,
    c(
      avg_overall_score,
      avg_taste_score,
      avg_aroma_score,
      avg_palate_score,
      avg_appearance_score
    )
  ) %>%
  ggplot(aes(x = log(user_num_ratings), y = value, col = type)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm") +
  # `guide = FALSE` is deprecated in recent ggplot2; "none" is equivalent.
  scale_colour_discrete(guide = "none") +
  labs(x = "Log user num ratings", y = "Mean aspect score")
# For users with at least 50 ratings, stack the per-user rating variances
# into long form and plot each against log(user_num_ratings), one facet and
# one linear fit per aspect.
# The fourth column used to be avg_appearance_score, which put a mean into a
# plot labelled "Rating variance"; it now uses the variance column instead.
# NOTE(review): assumes d.user_info has var_appearance_score alongside the
# other var_* columns -- confirm against where d.user_info is built.
d.user_info %>%
  filter(user_num_ratings >= 50) %>%
  gather(
    type, value,
    c(
      var_taste_score,
      var_aroma_score,
      var_palate_score,
      var_appearance_score
    )
  ) %>%
  ggplot(aes(x = log(user_num_ratings), y = value, col = type)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm") +
  labs(x = "Log user num ratings", y = "Rating variance") +
  facet_wrap(~type, nrow = 4)
# User-level simple linear regression of var_taste_score on
# log(user_num_ratings); the formula is passed to lm() as a string.
f <- paste("var_taste_score", "~", "log(user_num_ratings)")
summary(
  lm(f, data = d.user_info)
)
##
## Call:
## lm(formula = f, data = d.user_info)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7750 -1.0678 -0.3616 0.6887 9.1457
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.84179 0.18370 26.36 <2e-16 ***
## log(user_num_ratings) -0.45852 0.03542 -12.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.638 on 894 degrees of freedom
## Multiple R-squared: 0.1579, Adjusted R-squared: 0.1569
## F-statistic: 167.6 on 1 and 894 DF, p-value: < 2.2e-16
Clearly there’s an effect here. We can’t run lmer, because this is an average score. We can regress with multiple regression.