library(dplyr)
library(tidyr)
library(ggplot2)
library(lme4)
library(lmerTest)
source("~/Desktop/Spring2016/CS224U/rateBeerLingRel/analysis/analysis_helpers.R")

Classification results

# Load the per-model classification results: each CSV file in `dir_path` is
# read with read.csv() and all of them are stacked row-wise into `ml.data`.
dir_path <- "~/Desktop/Spring2016/CS224U/rateBeerLingRel/analysis/analysis_data/"
# Only pick up CSV files so that any other file placed in the directory is not
# handed to read.csv().
files <- list.files(dir_path, pattern = "\\.csv$", ignore.case = TRUE)
# Read every file into its own slot and bind once at the end, instead of
# re-copying a growing data frame with rbind() on each iteration.
results <- vector("list", length(files))
for (i in seq_along(files)) {
  path <- paste0(dir_path, files[i])
  print(path)
  results[[i]] <- read.csv(path)
}
# An empty directory yields an empty data frame rather than NULL.
ml.data <- if (length(results) > 0) do.call(rbind, results) else data.frame()
## [1] "~/Desktop/Spring2016/CS224U/rateBeerLingRel/analysis/analysis_data/baseline.csv"
## [1] "~/Desktop/Spring2016/CS224U/rateBeerLingRel/analysis/analysis_data/naive_bayes.csv"
## [1] "~/Desktop/Spring2016/CS224U/rateBeerLingRel/analysis/analysis_data/random_forest.csv"
## [1] "~/Desktop/Spring2016/CS224U/rateBeerLingRel/analysis/analysis_data/trigram_stupid_backoff.csv"
## [1] "~/Desktop/Spring2016/CS224U/rateBeerLingRel/analysis/analysis_data/unigram_laplace.csv"

Prepare plot data

str(ml.data)
## 'data.frame':    50 obs. of  4 variables:
##  $ model   : Factor w/ 5 levels "baseline","naive_bayes",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ note    : Factor w/ 5 levels "baseline","unigram+bigram+trigram",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ sample  : Factor w/ 1 level "sample_size=30000": 1 1 1 1 1 1 1 1 1 1 ...
##  $ accuracy: num  0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 ...

# Display names keyed by raw model id. Any id that is not listed here is shown
# as "Random Forest".
# NOTE(review): only "baseline" and "naive_bayes" are visible in the str()
# output above -- confirm that "unigram" and "trigram" really are the ids used
# in the `model` column, otherwise those rows are labelled "Random Forest".
model_labels <- c(
  baseline = "Baseline",
  unigram = "Unigram LM",
  trigram = "Trigram Stupid Backoff",
  naive_bayes = "Naive Bayes"
)

# Translate raw model ids into a factor of display names (missing ids stay NA).
pretty_model <- function(raw) {
  raw <- as.character(raw)
  shown <- unname(model_labels[raw])
  shown[is.na(shown) & !is.na(raw)] <- "Random Forest"
  as.factor(shown)
}

# Two-sided 95% quantile of the standard normal, used for the error bars.
# NOTE(review): this is a normal-approximation interval; with few runs per
# model a t quantile may be more appropriate -- confirm the intent.
z_975 <- qnorm(0.975)

# Per-model run count, mean and sd of accuracy, and the interval bounds.
plot_data <- ml.data %>%
  mutate(model = pretty_model(model)) %>%
  group_by(model) %>%
  summarise(
    n = n(),
    avg = mean(accuracy),
    s = sd(accuracy),
    error = z_975 * s / sqrt(n),
    lower = avg - error,
    upper = avg + error
  )

Plot performance

# Bar chart of the mean accuracy per model (bars ordered by mean accuracy),
# with `lower`/`upper` from plot_data drawn as error bars.
ggplot(plot_data, aes(x = reorder(model, avg), y = avg)) +
  geom_bar(stat = "identity") +
  geom_errorbar(aes(ymin = lower, ymax = upper)) +
  # Dashed red reference line at 0.25. The intercept is a constant, so it is
  # passed directly instead of through aes(), which would evaluate it against
  # the rows of plot_data.
  geom_hline(yintercept = 0.25, linetype = 2, colour = "red") +
  xlab("Model") +
  ylab("Accuracy - four class classification") +
  ggtitle("Expertise classification")

Data from analysis.R with analysis level features

# Review-level data set used by every analysis below. Strings are kept as
# character vectors instead of being converted to factors.
clean_data_path <- "~/Desktop/Spring2016/CS224U/rateBeerLingRel/data/clean_data_full_final.csv"
d <- read.csv(file = clean_data_path, stringsAsFactors = FALSE)
# Show which columns are available.
colnames(d)
##  [1] "X"                                 
##  [2] "user_num_places_rated"             
##  [3] "user_num_following"                
##  [4] "user_url"                          
##  [5] "beer_num_ratings"                  
##  [6] "user_num_friends"                  
##  [7] "user_num_breweries_rated"          
##  [8] "review_palate_score"               
##  [9] "review_taste_score"                
## [10] "user_num_ratings"                  
## [11] "review_ratings_blob"               
## [12] "review_aroma_score"                
## [13] "user_num_countries_rated"          
## [14] "user_id"                           
## [15] "review_avg_score"                  
## [16] "beer_global_style_score"           
## [17] "beer_weighted_avg_score"           
## [18] "beer_brewer_name"                  
## [19] "review_overall_score"              
## [20] "user_location"                     
## [21] "review_appearance_score"           
## [22] "beer_num_calories"                 
## [23] "beer_style"                        
## [24] "beer_url"                          
## [25] "review_blob"                       
## [26] "beer_global_score"                 
## [27] "user_name"                         
## [28] "beer_location"                     
## [29] "beer_ABV"                          
## [30] "beer_name"                         
## [31] "beer_ABV_num"                      
## [32] "user_experience"                   
## [33] "review_blob_lower"                 
## [34] "user_experience_quartile"          
## [35] "user_experience1000"               
## [36] "user_experience500"                
## [37] "num_tokens"                        
## [38] "num_types"                         
## [39] "type_token_ratio"                  
## [40] "corrected_ttr"                     
## [41] "num_syllables"                     
## [42] "readability_score"                 
## [43] "normalized_beer_global_score"      
## [44] "normalized_beer_global_style_score"
## [45] "num_first_person_singular_pnouns"  
## [46] "num_swear_words"                   
## [47] "num_negation_words"                
## [48] "num_mispelled_words"

Quick look at num reviews break-down

Average review length

# Histogram of review length in tokens, using bins of width 2. xlim() limits
# the x axis to 0-400 tokens; values outside the limits are removed by ggplot2
# with a warning.
ggplot(d, aes(num_tokens)) +
  xlim(0, 400) +
  xlab("Number of tokens") +
  ylab("Number of reviews") +
  geom_histogram(binwidth = 2)
## Warning: Removed 7 rows containing non-finite values (stat_bin).

# Typical review length in tokens: median first, then mean.
with(d, median(num_tokens))
## [1] 35
with(d, mean(num_tokens))
## [1] 43.4828

Number of ratings per user

# Keep one row per user (the first occurrence of each user_id), count how many
# users share each value of user_num_ratings, and plot both on log scales.
first_row_per_user <- d[!duplicated(d$user_id), ]
users_per_rating_count <- first_row_per_user %>%
  select(user_id, user_num_ratings) %>%
  group_by(user_num_ratings) %>%
  summarise(count = n())
ggplot(users_per_rating_count,
       aes(x = log(count), y = log(user_num_ratings))) +
  geom_point(alpha = 0.1, size = 5, colour = "blue") +
  xlim(0, 10) +
  ylim(0, 10) +
  # scale_x_reverse() +
  xlab("Log count number of users") +
  ylab("Log count number of reviews made")

      # geom_bar(stat = "identity")

User level averages and variance

# Build `d.user_info`: one row per user_name with the mean and variance of the
# review scores, beer attributes and linguistic features of that user's
# reviews. The elapsed time of the aggregation is printed at the end.

# Threshold applied to the user_num_ratings column in the filter below.
min_reviews <- 20

# Start of the timing window.
ptm <- proc.time()
d.user_info <- d %>%
  # Keep only reviews whose user_num_ratings is at least min_reviews.
  filter(user_num_ratings >= min_reviews) %>%
  group_by(user_name) %>%
  summarise(
            ## Reviews
            ## -------
            avg_overall_score = mean(review_overall_score),
            var_overall_score = var(review_overall_score),
            # NOTE(review): the divisor 4 presumably rescales the overall score
            # onto the range of normalized_beer_global_score -- confirm.
            normalized_avg_overall_score = mean(review_overall_score/4),
            normalized_var_overall_score = var(review_overall_score/4),
            avg_taste_score = mean(review_taste_score),
            var_taste_score = var(review_taste_score),
            avg_aroma_score = mean(review_aroma_score),
            var_aroma_score = var(review_aroma_score),
            avg_appearance_score = mean(review_appearance_score),
            var_appearance_score = var(review_appearance_score),
            avg_palate_score = mean(review_palate_score),
            var_palate_score = var(review_palate_score),
            ## Beer attributes
            ## ---------------
            # Beer-level columns are aggregated with na.rm = TRUE, i.e. missing
            # values are ignored.
            avg_beer_global_score = mean(beer_global_score, na.rm = TRUE),
            var_beer_global_score = var(beer_global_score, na.rm = TRUE),
            normalized_avg_global_score = mean(normalized_beer_global_score, na.rm = TRUE),
            # Uses the two normalized means computed earlier in this same
            # summarise() call, so it must stay after both of them.
            diff_overall_score = normalized_avg_overall_score - normalized_avg_global_score,
            avg_beer_global_style_score = mean(beer_global_style_score, na.rm = TRUE),
            var_beer_global_style_score = var(beer_global_style_score, na.rm = TRUE),
            # avg_beer_abv = mean(beer_ABV, na.rm = TRUE),
            avg_beer_num_calories = mean(beer_num_calories, na.rm = TRUE),
            var_beer_num_calories = var(beer_num_calories, na.rm = TRUE),
            avg_beer_num_ratings = mean(beer_num_ratings, na.rm = TRUE),
            var_beer_num_ratings = var(beer_num_ratings, na.rm = TRUE),
            ## User
            ## ----
            # Collapses the per-review copies of user_num_ratings to one value
            # per user; the output column reuses the input column's name.
            user_num_ratings = mean(user_num_ratings),
            # review_similarities() is not defined in this file -- presumably it
            # comes from the sourced analysis_helpers.R; verify what it returns.
            review_sims = review_similarities(review_blob_lower),
            avg_num_tokens = mean(num_tokens),
            var_num_tokens = var(num_tokens),
            avg_num_types = mean(num_types),
            var_num_types = var(num_types),
            avg_lexdiv_type_token = mean(type_token_ratio),
            var_lexdiv_type_token = var(type_token_ratio),
            avg_num_syllables = mean(num_syllables),
            var_num_syllables = var(num_syllables),
            avg_cttr = mean(corrected_ttr),
            var_cttr = var(corrected_ttr),
            avg_readability = mean(readability_score),
            var_readability = var(readability_score),
            avg_fpspns = mean(num_first_person_singular_pnouns),
            var_fpspns = var(num_first_person_singular_pnouns),
            # Number of distinct beer styles among the user's reviews.
            num_styles = length(unique(beer_style)),
            avg_num_swear_words = mean(num_swear_words),
            var_num_swear_words = var(num_swear_words),
            avg_num_negation = mean(num_negation_words),
            var_num_negation = var(num_negation_words),
            avg_num_mispelled_words = mean(num_mispelled_words),
            var_num_mispelled_words = var(num_mispelled_words))
# Time elapsed since `ptm` was recorded.
proc.time() - ptm

Hypothesis 1: First person singular pronouns

Plot

# Per-user scatter plot: mean number of first person singular pronouns against
# log(user_num_ratings). Point size is the per-user variance and each user gets
# its own colour; a single linear fit is drawn across all users (group = 1).
ggplot(
  d.user_info,
  aes(
    x = log(user_num_ratings),
    y = avg_fpspns,
    size = var_fpspns,
    col = user_name
  )
) +
  geom_point(alpha = 0.7) +
  ylab("Mean number of first person singular pronouns") +
  xlab("Log user num ratings") +
  # guide = "none" replaces the deprecated guide = FALSE.
  scale_colour_discrete(guide = "none") +
  geom_smooth(method = "lm", aes(group = 1)) +
  theme(legend.position = "none")

lmer test

# Mixed model for the number of first person singular pronouns per review:
# fixed effects for user, review-score and beer covariates (including beer
# style) plus a random intercept per user_name.
fpspns_terms <- c(
  "log(user_num_ratings)",
  "user_num_friends",
  "review_overall_score",
  "review_taste_score",
  "review_aroma_score",
  "review_palate_score",
  "review_appearance_score",
  "beer_global_score",
  "beer_style",
  "(1 | user_name)"
)
# The formula is kept as a string in `f` and converted when the model is fit.
f <- paste(
  "num_first_person_singular_pnouns ~",
  paste(fpspns_terms, collapse = " + ")
)
fpspns_fit <- lmer(as.formula(f), data = d)
summary(fpspns_fit)
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
##   to degrees of freedom [lmerMod]
## Formula: 
## num_first_person_singular_pnouns ~ log(user_num_ratings) + user_num_friends +  
##     review_overall_score + review_taste_score + review_aroma_score +  
##     review_palate_score + review_appearance_score + beer_global_score +  
##     beer_style + (1 | user_name)
##    Data: d
## 
## REML criterion at convergence: 157152
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.2951 -0.4655 -0.1648  0.2175 27.1907 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  user_name (Intercept) 0.7851   0.8861  
##  Residual              1.3722   1.1714  
## Number of obs: 47791, groups:  user_name, 4679
## 
## Fixed effects:
##                                             Estimate Std. Error         df
## (Intercept)                                1.265e+00  7.006e-02  4.569e+04
## log(user_num_ratings)                     -8.391e-02  8.747e-03  2.766e+03
## user_num_friends                           3.464e-03  4.194e-03  1.894e+03
## review_overall_score                      -1.052e-02  3.252e-03  4.732e+04
## review_taste_score                        -2.459e-02  6.483e-03  4.712e+04
## review_aroma_score                         8.467e-03  5.054e-03  4.713e+04
## review_palate_score                       -2.263e-02  9.675e-03  4.669e+04
## review_appearance_score                    2.682e-02  8.629e-03  4.674e+04
## beer_global_score                          3.628e-04  3.276e-04  4.722e+04
## beer_styleAbbey Tripel                    -2.480e-02  6.673e-02  4.508e+04
## beer_styleAbt/Quadrupel                    2.593e-02  7.606e-02  4.511e+04
## beer_styleAltbier                         -2.373e-02  1.145e-01  4.522e+04
## beer_styleAmber Ale                       -2.845e-02  6.631e-02  4.520e+04
## beer_styleAmber Lager/Vienna              -3.758e-02  7.165e-02  4.575e+04
## beer_styleAmerican Pale Ale               -3.766e-02  5.935e-02  4.531e+04
## beer_styleAmerican Strong Ale              1.679e-01  6.605e-02  4.508e+04
## beer_styleBaltic Porter                    1.333e-02  9.131e-02  4.581e+04
## beer_styleBarley Wine                      1.443e-01  6.483e-02  4.495e+04
## beer_styleBelgian Ale                     -3.584e-02  6.556e-02  4.497e+04
## beer_styleBelgian Strong Ale               9.408e-02  5.863e-02  4.506e+04
## beer_styleBerliner Weisse                  1.638e-01  1.041e-01  4.447e+04
## beer_styleBire de Garde                    1.629e-01  1.210e-01  4.501e+04
## beer_styleBitter                          -3.698e-02  7.896e-02  4.624e+04
## beer_styleBlack IPA                        1.148e-01  7.283e-02  4.519e+04
## beer_styleBrown Ale                       -1.551e-02  6.627e-02  4.519e+04
## beer_styleCalifornia Common               -9.936e-02  1.263e-01  4.526e+04
## beer_styleCider                            1.381e-01  9.214e-02  4.740e+04
## beer_styleCream Ale                       -4.251e-02  1.233e-01  4.539e+04
## beer_styleCzech Pilsner (Svtl)             1.300e-02  8.847e-02  4.568e+04
## beer_styleDoppelbock                       4.135e-03  7.601e-02  4.500e+04
## beer_styleDortmunder/Helles               -1.448e-01  8.868e-02  4.531e+04
## beer_styleDry Stout                        2.288e-01  8.304e-02  4.545e+04
## beer_styleDunkel/Tmav                     -1.363e-01  8.640e-02  4.551e+04
## beer_styleDunkelweizen                     7.886e-03  1.016e-01  4.499e+04
## beer_styleDunkler Bock                    -1.602e-01  1.066e-01  4.446e+04
## beer_styleEisbock                         -4.487e-02  1.825e-01  4.517e+04
## beer_styleEnglish Pale Ale                -2.849e-02  9.531e-02  4.490e+04
## beer_styleEnglish Strong Ale               1.095e-01  8.042e-02  4.504e+04
## beer_styleForeign Stout                    1.190e-01  9.728e-02  4.524e+04
## beer_styleFruit Beer                       1.656e-01  6.616e-02  4.534e+04
## beer_styleGerman Hefeweizen               -6.226e-03  6.914e-02  4.547e+04
## beer_styleGerman Kristallweizen           -1.505e-01  1.893e-01  4.572e+04
## beer_styleGolden Ale/Blond Ale            -9.729e-02  7.045e-02  4.579e+04
## beer_styleGrodziskie/Gose/Lichtenhainer    5.266e-02  1.091e-01  4.401e+04
## beer_styleHeller Bock                      3.917e-03  9.527e-02  4.509e+04
## beer_styleIce Cider/Ice Perry              6.707e-02  3.700e-01  4.570e+04
## beer_styleImperial IPA                     1.055e-01  5.757e-02  4.540e+04
## beer_styleImperial Pils/Strong Pale Lager  8.935e-02  9.133e-02  4.548e+04
## beer_styleImperial Porter                  6.759e-02  8.539e-02  4.460e+04
## beer_styleImperial Stout                   1.367e-01  5.746e-02  4.526e+04
## beer_styleIndia Pale Ale (IPA)            -3.842e-03  5.528e-02  4.536e+04
## beer_styleIrish Ale                       -4.115e-02  9.476e-02  4.567e+04
## beer_styleKlsch                           -7.740e-02  1.072e-01  4.522e+04
## beer_styleLambic Style - Faro              2.958e-01  2.795e-01  4.381e+04
## beer_styleLambic Style - Fruit             1.638e-01  8.450e-02  4.482e+04
## beer_styleLambic Style - Gueuze            3.288e-01  1.050e-01  4.527e+04
## beer_styleLambic Style - Unblended         4.240e-01  1.792e-01  4.371e+04
## beer_styleLow Alcohol                      1.062e-01  1.202e-01  4.476e+04
## beer_styleMalt Liquor                      1.637e-01  1.051e-01  4.747e+04
## beer_styleMead                             1.079e-01  1.298e-01  4.712e+04
## beer_styleMild Ale                        -1.893e-01  1.493e-01  4.549e+04
## beer_styleOktoberfest/Mrzen               -9.991e-02  8.125e-02  4.507e+04
## beer_styleOld Ale                         -7.052e-02  9.203e-02  4.491e+04
## beer_stylePale Lager                      -1.693e-02  6.254e-02  4.589e+04
## beer_stylePerry                           -1.164e-01  2.690e-01  4.454e+04
## beer_stylePilsener                        -9.609e-02  6.783e-02  4.550e+04
## beer_stylePolotmav                        -2.117e-01  2.855e-01  4.612e+04
## beer_stylePorter                          -4.988e-02  6.197e-02  4.529e+04
## beer_stylePremium Bitter/ESB              -3.099e-02  7.108e-02  4.528e+04
## beer_stylePremium Lager                   -9.523e-02  7.912e-02  4.564e+04
## beer_styleRadler/Shandy                    4.267e-03  1.346e-01  4.678e+04
## beer_styleSahti/Gotlandsdricke/Kodulu      1.036e-02  3.499e-01  4.382e+04
## beer_styleSaison                          -6.249e-03  6.541e-02  4.503e+04
## beer_styleSak - Futsu-shu                  1.244e+00  9.233e-01  3.876e+04
## beer_styleSak - Ginjo                     -1.382e+00  7.160e-01  4.740e+04
## beer_styleSak - Infused                    9.411e-01  1.189e+00  4.271e+04
## beer_styleSak - Junmai                     5.096e-01  8.401e-01  4.267e+04
## beer_styleSak - Nigori                    -3.322e-01  6.218e-01  4.766e+04
## beer_styleSchwarzbier                     -1.583e-01  9.560e-02  4.543e+04
## beer_styleScotch Ale                       2.103e-02  8.334e-02  4.529e+04
## beer_styleScottish Ale                     9.114e-02  1.289e-01  4.582e+04
## beer_styleSession IPA                     -5.002e-02  8.191e-02  4.489e+04
## beer_styleSmoked                           2.382e-01  8.382e-02  4.474e+04
## beer_styleSour Red/Brown                   1.916e-01  8.212e-02  4.466e+04
## beer_styleSour/Wild Ale                    1.869e-01  7.114e-02  4.490e+04
## beer_styleSpecialty Grain                  1.331e-01  9.287e-02  4.490e+04
## beer_styleSpice/Herb/Vegetable             2.206e-01  6.371e-02  4.541e+04
## beer_styleStout                           -9.123e-02  6.479e-02  4.512e+04
## beer_styleSweet Stout                      1.936e-02  7.051e-02  4.525e+04
## beer_styleTraditional Ale                  3.531e-01  9.977e-02  4.489e+04
## beer_styleWeizen Bock                      3.492e-02  9.885e-02  4.444e+04
## beer_styleWheat Ale                        9.962e-02  7.206e-02  4.550e+04
## beer_styleWitbier                         -3.040e-02  6.864e-02  4.537e+04
## beer_styleZwickel/Keller/Landbier          5.908e-02  1.321e-01  4.683e+04
##                                           t value Pr(>|t|)    
## (Intercept)                                18.053  < 2e-16 ***
## log(user_num_ratings)                      -9.594  < 2e-16 ***
## user_num_friends                            0.826 0.408920    
## review_overall_score                       -3.235 0.001217 ** 
## review_taste_score                         -3.793 0.000149 ***
## review_aroma_score                          1.675 0.093923 .  
## review_palate_score                        -2.339 0.019335 *  
## review_appearance_score                     3.108 0.001883 ** 
## beer_global_score                           1.108 0.268036    
## beer_styleAbbey Tripel                     -0.372 0.710196    
## beer_styleAbt/Quadrupel                     0.341 0.733214    
## beer_styleAltbier                          -0.207 0.835847    
## beer_styleAmber Ale                        -0.429 0.667883    
## beer_styleAmber Lager/Vienna               -0.525 0.599908    
## beer_styleAmerican Pale Ale                -0.635 0.525682    
## beer_styleAmerican Strong Ale               2.542 0.011026 *  
## beer_styleBaltic Porter                     0.146 0.883925    
## beer_styleBarley Wine                       2.226 0.026011 *  
## beer_styleBelgian Ale                      -0.547 0.584623    
## beer_styleBelgian Strong Ale                1.605 0.108551    
## beer_styleBerliner Weisse                   1.573 0.115681    
## beer_styleBire de Garde                     1.346 0.178321    
## beer_styleBitter                           -0.468 0.639551    
## beer_styleBlack IPA                         1.577 0.114850    
## beer_styleBrown Ale                        -0.234 0.815011    
## beer_styleCalifornia Common                -0.786 0.431624    
## beer_styleCider                             1.498 0.134030    
## beer_styleCream Ale                        -0.345 0.730385    
## beer_styleCzech Pilsner (Svtl)              0.147 0.883166    
## beer_styleDoppelbock                        0.054 0.956614    
## beer_styleDortmunder/Helles                -1.632 0.102616    
## beer_styleDry Stout                         2.756 0.005858 ** 
## beer_styleDunkel/Tmav                      -1.578 0.114571    
## beer_styleDunkelweizen                      0.078 0.938115    
## beer_styleDunkler Bock                     -1.503 0.132888    
## beer_styleEisbock                          -0.246 0.805747    
## beer_styleEnglish Pale Ale                 -0.299 0.765019    
## beer_styleEnglish Strong Ale                1.362 0.173313    
## beer_styleForeign Stout                     1.223 0.221274    
## beer_styleFruit Beer                        2.504 0.012286 *  
## beer_styleGerman Hefeweizen                -0.090 0.928248    
## beer_styleGerman Kristallweizen            -0.795 0.426593    
## beer_styleGolden Ale/Blond Ale             -1.381 0.167288    
## beer_styleGrodziskie/Gose/Lichtenhainer     0.482 0.629481    
## beer_styleHeller Bock                       0.041 0.967205    
## beer_styleIce Cider/Ice Perry               0.181 0.856156    
## beer_styleImperial IPA                      1.832 0.066990 .  
## beer_styleImperial Pils/Strong Pale Lager   0.978 0.327906    
## beer_styleImperial Porter                   0.792 0.428635    
## beer_styleImperial Stout                    2.379 0.017360 *  
## beer_styleIndia Pale Ale (IPA)             -0.070 0.944586    
## beer_styleIrish Ale                        -0.434 0.664086    
## beer_styleKlsch                            -0.722 0.470231    
## beer_styleLambic Style - Faro               1.059 0.289800    
## beer_styleLambic Style - Fruit              1.939 0.052536 .  
## beer_styleLambic Style - Gueuze             3.133 0.001731 ** 
## beer_styleLambic Style - Unblended          2.366 0.018004 *  
## beer_styleLow Alcohol                       0.883 0.376994    
## beer_styleMalt Liquor                       1.558 0.119166    
## beer_styleMead                              0.831 0.405944    
## beer_styleMild Ale                         -1.268 0.204881    
## beer_styleOktoberfest/Mrzen                -1.230 0.218852    
## beer_styleOld Ale                          -0.766 0.443485    
## beer_stylePale Lager                       -0.271 0.786655    
## beer_stylePerry                            -0.433 0.665351    
## beer_stylePilsener                         -1.417 0.156606    
## beer_stylePolotmav                         -0.742 0.458385    
## beer_stylePorter                           -0.805 0.420895    
## beer_stylePremium Bitter/ESB               -0.436 0.662844    
## beer_stylePremium Lager                    -1.204 0.228779    
## beer_styleRadler/Shandy                     0.032 0.974710    
## beer_styleSahti/Gotlandsdricke/Kodulu       0.030 0.976382    
## beer_styleSaison                           -0.096 0.923888    
## beer_styleSak - Futsu-shu                   1.347 0.178045    
## beer_styleSak - Ginjo                      -1.931 0.053534 .  
## beer_styleSak - Infused                     0.791 0.428658    
## beer_styleSak - Junmai                      0.607 0.544107    
## beer_styleSak - Nigori                     -0.534 0.593232    
## beer_styleSchwarzbier                      -1.656 0.097777 .  
## beer_styleScotch Ale                        0.252 0.800762    
## beer_styleScottish Ale                      0.707 0.479513    
## beer_styleSession IPA                      -0.611 0.541459    
## beer_styleSmoked                            2.841 0.004492 ** 
## beer_styleSour Red/Brown                    2.333 0.019652 *  
## beer_styleSour/Wild Ale                     2.627 0.008611 ** 
## beer_styleSpecialty Grain                   1.434 0.151678    
## beer_styleSpice/Herb/Vegetable              3.463 0.000534 ***
## beer_styleStout                            -1.408 0.159142    
## beer_styleSweet Stout                       0.275 0.783623    
## beer_styleTraditional Ale                   3.539 0.000402 ***
## beer_styleWeizen Bock                       0.353 0.723861    
## beer_styleWheat Ale                         1.382 0.166859    
## beer_styleWitbier                          -0.443 0.657856    
## beer_styleZwickel/Keller/Landbier           0.447 0.654813    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 94 > 20.
## Use print(x, correlation=TRUE)  or
##   vcov(x)     if you need it

This is interesting and in line with findings from Jurafsky: more first person singular pronouns predict worse experience.

Hypothesis 2: Review length

Plot

# Per-user scatter plot: mean review length (tokens) against
# log(user_num_ratings). Point size is the per-user variance and each user gets
# its own colour; a single linear fit is drawn across all users (group = 1).
ggplot(
  d.user_info,
  aes(
    x = log(user_num_ratings),
    y = avg_num_tokens,
    size = var_num_tokens,
    col = user_name
  )
) +
  geom_point(alpha = 0.7) +
  ylab("Mean review length") +
  xlab("Log user num ratings") +
  # guide = "none" replaces the deprecated guide = FALSE.
  scale_colour_discrete(guide = "none") +
  geom_smooth(method = "lm", aes(group = 1))

lmer test

# Mixed model for review length in tokens: fixed effects for user,
# review-score and beer covariates plus a random intercept per user_name.
num_tokens_terms <- c(
  "log(user_num_ratings)",
  "user_num_friends",
  "review_overall_score",
  "review_taste_score",
  "review_aroma_score",
  "review_palate_score",
  "review_appearance_score",
  "beer_global_score",
  "(1 | user_name)"
)
# The formula is kept as a string in `f` and converted when the model is fit.
f <- paste("num_tokens ~", paste(num_tokens_terms, collapse = " + "))
num_tokens_fit <- lmer(as.formula(f), data = d)
summary(num_tokens_fit)
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
##   to degrees of freedom [lmerMod]
## Formula: 
## num_tokens ~ log(user_num_ratings) + user_num_friends + review_overall_score +  
##     review_taste_score + review_aroma_score + review_palate_score +  
##     review_appearance_score + beer_global_score + (1 | user_name)
##    Data: d
## 
## REML criterion at convergence: 438282.1
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -9.652 -0.418 -0.097  0.309 33.271 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  user_name (Intercept) 571.3    23.90   
##  Residual              470.4    21.69   
## Number of obs: 47791, groups:  user_name, 4679
## 
## Fixed effects:
##                           Estimate Std. Error         df t value Pr(>|t|)
## (Intercept)              3.080e+01  8.291e-01  1.957e+04  37.152  < 2e-16
## log(user_num_ratings)    1.220e+00  2.187e-01  3.946e+03   5.581 2.55e-08
## user_num_friends         2.623e-01  1.102e-01  3.085e+03   2.380  0.01738
## review_overall_score     1.140e-01  6.164e-02  4.769e+04   1.850  0.06432
## review_taste_score      -1.249e-01  1.216e-01  4.648e+04  -1.027  0.30433
## review_aroma_score       4.856e-01  9.410e-02  4.646e+04   5.161 2.47e-07
## review_palate_score     -2.659e-01  1.809e-01  4.602e+04  -1.470  0.14152
## review_appearance_score  4.551e-01  1.600e-01  4.604e+04   2.845  0.00444
## beer_global_score        2.459e-02  4.794e-03  4.764e+04   5.129 2.93e-07
##                            
## (Intercept)             ***
## log(user_num_ratings)   ***
## user_num_friends        *  
## review_overall_score    .  
## review_taste_score         
## review_aroma_score      ***
## review_palate_score        
## review_appearance_score ** 
## beer_global_score       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) lg(__) usr_n_ rvw_v_ rvw_t_ rvw_r_ rvw_pl_ rvw_pp_
## lg(sr_nm_r) -0.528                                                   
## usr_nm_frnd  0.128 -0.414                                            
## rvw_vrll_sc -0.013 -0.022  0.005                                     
## rvw_tst_scr -0.047  0.029 -0.006 -0.557                              
## revw_rm_scr -0.089  0.008 -0.008 -0.212 -0.251                       
## rvw_plt_scr -0.170  0.023  0.000 -0.209 -0.232 -0.041                
## rvw_pprnc_s -0.362  0.048 -0.001 -0.096 -0.044 -0.112 -0.170         
## br_glbl_scr  0.113 -0.067  0.003 -0.127 -0.028 -0.214 -0.055  -0.118

Here we also see that it looks like more experienced users are writing longer reviews…

Hypothesis 3: spelling mistakes

Plot

# names(d.user_info)
# Per-user scatter plot: mean number of misspelled words against
# log(user_num_ratings), restricted to users whose mean misspelled count is
# below a quarter of their mean token count. Point size is the per-user
# variance; a single linear fit is drawn across all users (group = 1).
d.user_info %>%
  filter(avg_num_mispelled_words / avg_num_tokens < 0.25) %>%
  ggplot(
    aes(
      x = log(user_num_ratings),
      y = avg_num_mispelled_words,
      size = var_num_mispelled_words,
      col = user_name
    )
  ) +
  geom_point(alpha = 0.7) +
  ylab("Mean number of spelling mistakes") +
  xlab("Log user num ratings") +
  # guide = "none" replaces the deprecated guide = FALSE.
  scale_colour_discrete(guide = "none") +
  geom_smooth(method = "lm", aes(group = 1))

lmer test

# Keep only reviews in which fewer than a quarter of the tokens are flagged as
# misspelled.
max_misspelled_share <- 0.25
d.filter_non_english <- filter(
  d,
  num_mispelled_words / num_tokens < max_misspelled_share
)

# Mixed model for the number of misspelled words per review: fixed effects for
# user, review-score and beer covariates plus a random intercept per user_name.
mispelled_terms <- c(
  "log(user_num_ratings)",
  "user_num_friends",
  "review_overall_score",
  "review_taste_score",
  "review_aroma_score",
  "review_palate_score",
  "review_appearance_score",
  "beer_global_score",
  "(1 | user_name)"
)
# The formula is kept as a string in `f` and converted when the model is fit.
f <- paste0("num_mispelled_words  ~ ", paste(mispelled_terms, collapse = " + "))
mispelled_fit <- lmer(as.formula(f), data = d.filter_non_english)
summary(mispelled_fit)
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
##   to degrees of freedom [lmerMod]
## Formula: num_mispelled_words ~ log(user_num_ratings) + user_num_friends +  
##     review_overall_score + review_taste_score + review_aroma_score +  
##     review_palate_score + review_appearance_score + beer_global_score +  
##     (1 | user_name)
##    Data: d.filter_non_english
## 
## REML criterion at convergence: 170742.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.3450 -0.6031 -0.1628  0.4385 18.2976 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  user_name (Intercept) 1.208    1.099   
##  Residual              2.592    1.610   
## Number of obs: 43672, groups:  user_name, 4359
## 
## Fixed effects:
##                           Estimate Std. Error         df t value Pr(>|t|)
## (Intercept)              1.512e+00  5.623e-02  2.428e+04  26.896  < 2e-16
## log(user_num_ratings)    1.913e-02  1.152e-02  3.195e+03   1.660  0.09692
## user_num_friends         1.629e-02  5.294e-03  2.124e+03   3.077  0.00212
## review_overall_score     2.258e-03  4.664e-03  4.322e+04   0.484  0.62829
## review_taste_score       4.474e-04  9.296e-03  4.340e+04   0.048  0.96161
## review_aroma_score       1.364e-02  7.158e-03  4.341e+04   1.906  0.05664
## review_palate_score      2.479e-03  1.385e-02  4.313e+04   0.179  0.85793
## review_appearance_score -1.075e-02  1.225e-02  4.312e+04  -0.877  0.38038
## beer_global_score        1.657e-03  3.594e-04  4.327e+04   4.611 4.03e-06
##                            
## (Intercept)             ***
## log(user_num_ratings)   .  
## user_num_friends        ** 
## review_overall_score       
## review_taste_score         
## review_aroma_score      .  
## review_palate_score        
## review_appearance_score    
## beer_global_score       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) lg(__) usr_n_ rvw_v_ rvw_t_ rvw_r_ rvw_pl_ rvw_pp_
## lg(sr_nm_r) -0.513                                                   
## usr_nm_frnd  0.132 -0.425                                            
## rvw_vrll_sc -0.021 -0.023  0.005                                     
## rvw_tst_scr -0.048  0.035 -0.008 -0.559                              
## revw_rm_scr -0.090  0.007 -0.012 -0.210 -0.252                       
## rvw_plt_scr -0.184  0.030 -0.001 -0.205 -0.238 -0.041                
## rvw_pprnc_s -0.403  0.068 -0.001 -0.090 -0.047 -0.119 -0.177         
## br_glbl_scr  0.122 -0.093  0.004 -0.124 -0.023 -0.214 -0.056  -0.114

There may be some problems with non-English reviews, but if we filter so that the reviewers are misspelling fewer than one in four words, we aren’t seeing evidence that this is an effect of expertise…

Hypothesis 4: Profanity

Plot

# names(d.user_info)
# Per-user scatter plot: mean number of swear words against
# log(user_num_ratings), restricted to users with user_num_ratings of at least
# 50. Point size is the per-user variance and each user gets its own colour.
d.user_info %>%
  filter(user_num_ratings >= 50) %>%
  ggplot(
    aes(
      x = log(user_num_ratings),
      y = avg_num_swear_words,
      size = var_num_swear_words,
      col = user_name
    )
  ) +
  geom_point(alpha = 0.7) +
  ylab("Mean number of swear words used") +
  xlab("Log user num ratings") +
  # guide = "none" replaces the deprecated guide = FALSE.
  scale_colour_discrete(guide = "none") +
  theme(legend.position = "none")

lmer test

# d.filter_non_english <- d %>%
#   filter(num_mispelled_words / num_tokens < 0.25)
# Show the columns of `d` that are available as model terms.
colnames(d)
##  [1] "X"                                 
##  [2] "user_num_places_rated"             
##  [3] "user_num_following"                
##  [4] "user_url"                          
##  [5] "beer_num_ratings"                  
##  [6] "user_num_friends"                  
##  [7] "user_num_breweries_rated"          
##  [8] "review_palate_score"               
##  [9] "review_taste_score"                
## [10] "user_num_ratings"                  
## [11] "review_ratings_blob"               
## [12] "review_aroma_score"                
## [13] "user_num_countries_rated"          
## [14] "user_id"                           
## [15] "review_avg_score"                  
## [16] "beer_global_style_score"           
## [17] "beer_weighted_avg_score"           
## [18] "beer_brewer_name"                  
## [19] "review_overall_score"              
## [20] "user_location"                     
## [21] "review_appearance_score"           
## [22] "beer_num_calories"                 
## [23] "beer_style"                        
## [24] "beer_url"                          
## [25] "review_blob"                       
## [26] "beer_global_score"                 
## [27] "user_name"                         
## [28] "beer_location"                     
## [29] "beer_ABV"                          
## [30] "beer_name"                         
## [31] "beer_ABV_num"                      
## [32] "user_experience"                   
## [33] "review_blob_lower"                 
## [34] "user_experience_quartile"          
## [35] "user_experience1000"               
## [36] "user_experience500"                
## [37] "num_tokens"                        
## [38] "num_types"                         
## [39] "type_token_ratio"                  
## [40] "corrected_ttr"                     
## [41] "num_syllables"                     
## [42] "readability_score"                 
## [43] "normalized_beer_global_score"      
## [44] "normalized_beer_global_style_score"
## [45] "num_first_person_singular_pnouns"  
## [46] "num_swear_words"                   
## [47] "num_negation_words"                
## [48] "num_mispelled_words"
# Hypothesis 4 model: num_swear_words as a function of log(user_num_ratings),
# user_num_friends, the review score columns, beer_global_score and
# beer_style, with a random intercept for each user_name.
# The formula text is assembled inside local() so that only `f` is assigned
# in the calling environment.
f <- local({
  rhs_terms <- c(
    "log(user_num_ratings)",
    "user_num_friends",
    "review_overall_score",
    "review_taste_score",
    "review_aroma_score",
    "review_palate_score",
    "review_appearance_score",
    "beer_global_score",
    "beer_style",
    "(1 | user_name)"
  )
  paste("num_swear_words ~", paste(rhs_terms, collapse = " + "))
})
summary(lmer(as.formula(f), data = d))
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
##   to degrees of freedom [lmerMod]
## Formula: num_swear_words ~ log(user_num_ratings) + user_num_friends +  
##     review_overall_score + review_taste_score + review_aroma_score +  
##     review_palate_score + review_appearance_score + beer_global_score +  
##     beer_style + (1 | user_name)
##    Data: d
## 
## REML criterion at convergence: -52755.6
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -2.531 -0.144 -0.068 -0.026 34.314 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  user_name (Intercept) 0.001051 0.03242 
##  Residual              0.018498 0.13601 
## Number of obs: 47791, groups:  user_name, 4679
## 
## Fixed effects:
##                                             Estimate Std. Error         df
## (Intercept)                                1.864e-03  7.541e-03  4.542e+04
## log(user_num_ratings)                      6.326e-04  5.093e-04  2.846e+03
## user_num_friends                           5.874e-04  1.977e-04  1.401e+03
## review_overall_score                      -3.791e-04  3.437e-04  3.605e+04
## review_taste_score                         5.053e-04  7.151e-04  4.560e+04
## review_aroma_score                        -7.935e-04  5.579e-04  4.603e+04
## review_palate_score                        2.130e-03  1.071e-03  4.552e+04
## review_appearance_score                    3.617e-05  9.553e-04  4.567e+04
## beer_global_score                          3.126e-05  3.619e-05  4.650e+04
## beer_styleAbbey Tripel                     5.955e-03  7.595e-03  4.725e+04
## beer_styleAbt/Quadrupel                    5.602e-03  8.655e-03  4.724e+04
## beer_styleAltbier                          3.805e-05  1.302e-02  4.733e+04
## beer_styleAmber Ale                        7.756e-03  7.527e-03  4.749e+04
## beer_styleAmber Lager/Vienna               9.102e-03  8.100e-03  4.760e+04
## beer_styleAmerican Pale Ale                7.365e-03  6.727e-03  4.758e+04
## beer_styleAmerican Strong Ale              1.405e-02  7.507e-03  4.743e+04
## beer_styleBaltic Porter                   -2.704e-03  1.032e-02  4.755e+04
## beer_styleBarley Wine                      7.229e-03  7.370e-03  4.751e+04
## beer_styleBelgian Ale                      5.885e-03  7.464e-03  4.728e+04
## beer_styleBelgian Strong Ale               5.946e-03  6.669e-03  4.733e+04
## beer_styleBerliner Weisse                 -3.923e-03  1.188e-02  4.740e+04
## beer_styleBire de Garde                    1.543e-02  1.378e-02  4.724e+04
## beer_styleBitter                           1.403e-02  8.848e-03  4.759e+04
## beer_styleBlack IPA                        3.148e-03  8.270e-03  4.749e+04
## beer_styleBrown Ale                       -3.933e-03  7.524e-03  4.748e+04
## beer_styleCalifornia Common               -5.448e-03  1.437e-02  4.721e+04
## beer_styleCider                           -1.785e-03  1.017e-02  4.673e+04
## beer_styleCream Ale                       -9.153e-03  1.399e-02  4.744e+04
## beer_styleCzech Pilsner (Svtl)             1.036e-02  9.974e-03  4.769e+04
## beer_styleDoppelbock                      -4.342e-04  8.648e-03  4.735e+04
## beer_styleDortmunder/Helles                1.037e-02  1.006e-02  4.754e+04
## beer_styleDry Stout                        1.011e-02  9.401e-03  4.758e+04
## beer_styleDunkel/Tmav                      8.435e-04  9.794e-03  4.747e+04
## beer_styleDunkelweizen                     3.249e-04  1.156e-02  4.737e+04
## beer_styleDunkler Bock                    -6.954e-03  1.217e-02  4.717e+04
## beer_styleEisbock                          9.353e-02  2.076e-02  4.723e+04
## beer_styleEnglish Pale Ale                 1.076e-02  1.086e-02  4.726e+04
## beer_styleEnglish Strong Ale               2.471e-03  9.144e-03  4.739e+04
## beer_styleForeign Stout                    1.453e-02  1.106e-02  4.732e+04
## beer_styleFruit Beer                       4.584e-03  7.500e-03  4.755e+04
## beer_styleGerman Hefeweizen                7.259e-03  7.832e-03  4.758e+04
## beer_styleGerman Kristallweizen           -4.872e-03  2.145e-02  4.742e+04
## beer_styleGolden Ale/Blond Ale             2.934e-03  7.955e-03  4.764e+04
## beer_styleGrodziskie/Gose/Lichtenhainer    8.824e-03  1.249e-02  4.728e+04
## beer_styleHeller Bock                      9.699e-03  1.084e-02  4.734e+04
## beer_styleIce Cider/Ice Perry             -2.351e-02  4.210e-02  4.716e+04
## beer_styleImperial IPA                     1.739e-02  6.515e-03  4.764e+04
## beer_styleImperial Pils/Strong Pale Lager -1.941e-03  1.037e-02  4.736e+04
## beer_styleImperial Porter                  8.776e-03  9.742e-03  4.726e+04
## beer_styleImperial Stout                   1.035e-02  6.514e-03  4.760e+04
## beer_styleIndia Pale Ale (IPA)             9.735e-03  6.259e-03  4.763e+04
## beer_styleIrish Ale                       -3.755e-03  1.075e-02  4.736e+04
## beer_styleKlsch                           -6.267e-03  1.218e-02  4.733e+04
## beer_styleLambic Style - Faro              4.484e-02  3.210e-02  4.673e+04
## beer_styleLambic Style - Fruit             9.426e-03  9.585e-03  4.769e+04
## beer_styleLambic Style - Gueuze            4.916e-03  1.190e-02  4.760e+04
## beer_styleLambic Style - Unblended        -1.497e-02  2.048e-02  4.762e+04
## beer_styleLow Alcohol                      1.997e-02  1.365e-02  4.766e+04
## beer_styleMalt Liquor                      8.071e-02  1.165e-02  4.751e+04
## beer_styleMead                            -1.050e-03  1.425e-02  4.502e+04
## beer_styleMild Ale                        -1.546e-02  1.691e-02  4.759e+04
## beer_styleOktoberfest/Mrzen               -2.189e-03  9.229e-03  4.750e+04
## beer_styleOld Ale                         -8.269e-03  1.048e-02  4.726e+04
## beer_stylePale Lager                       2.169e-02  7.044e-03  4.770e+04
## beer_stylePerry                           -1.079e-02  3.069e-02  4.716e+04
## beer_stylePilsener                         2.073e-04  7.678e-03  4.761e+04
## beer_stylePolotmav                         1.185e-03  3.228e-02  4.742e+04
## beer_stylePorter                           3.012e-03  7.028e-03  4.754e+04
## beer_stylePremium Bitter/ESB              -1.847e-03  8.064e-03  4.752e+04
## beer_stylePremium Lager                    1.063e-02  8.959e-03  4.751e+04
## beer_styleRadler/Shandy                    1.468e-02  1.511e-02  4.764e+04
## beer_styleSahti/Gotlandsdricke/Kodulu      2.229e-03  4.013e-02  4.685e+04
## beer_styleSaison                           7.063e-03  7.430e-03  4.751e+04
## beer_styleSak - Futsu-shu                 -7.245e-04  9.809e-02  4.759e+04
## beer_styleSak - Ginjo                     -4.450e-03  7.986e-02  4.757e+04
## beer_styleSak - Infused                    2.172e-03  1.375e-01  4.635e+04
## beer_styleSak - Junmai                     1.409e-03  9.721e-02  4.617e+04
## beer_styleSak - Nigori                     1.156e-04  6.913e-02  4.744e+04
## beer_styleSchwarzbier                      7.240e-04  1.086e-02  4.734e+04
## beer_styleScotch Ale                      -4.043e-03  9.465e-03  4.738e+04
## beer_styleScottish Ale                     2.035e-02  1.460e-02  4.737e+04
## beer_styleSession IPA                      1.059e-02  9.319e-03  4.748e+04
## beer_styleSmoked                           5.351e-03  9.551e-03  4.733e+04
## beer_styleSour Red/Brown                   1.328e-03  9.356e-03  4.738e+04
## beer_styleSour/Wild Ale                    9.328e-03  8.071e-03  4.767e+04
## beer_styleSpecialty Grain                  8.029e-03  1.057e-02  4.741e+04
## beer_styleSpice/Herb/Vegetable            -3.340e-04  7.217e-03  4.758e+04
## beer_styleStout                            5.730e-03  7.363e-03  4.744e+04
## beer_styleSweet Stout                      2.715e-03  7.998e-03  4.754e+04
## beer_styleTraditional Ale                 -1.109e-02  1.137e-02  4.726e+04
## beer_styleWeizen Bock                     -9.125e-03  1.129e-02  4.714e+04
## beer_styleWheat Ale                        1.370e-02  8.168e-03  4.752e+04
## beer_styleWitbier                          2.006e-02  7.788e-03  4.746e+04
## beer_styleZwickel/Keller/Landbier         -6.226e-03  1.485e-02  4.758e+04
##                                           t value Pr(>|t|)    
## (Intercept)                                 0.247  0.80472    
## log(user_num_ratings)                       1.242  0.21424    
## user_num_friends                            2.971  0.00301 ** 
## review_overall_score                       -1.103  0.27007    
## review_taste_score                          0.707  0.47978    
## review_aroma_score                         -1.422  0.15492    
## review_palate_score                         1.989  0.04674 *  
## review_appearance_score                     0.038  0.96980    
## beer_global_score                           0.864  0.38771    
## beer_styleAbbey Tripel                      0.784  0.43300    
## beer_styleAbt/Quadrupel                     0.647  0.51750    
## beer_styleAltbier                           0.003  0.99767    
## beer_styleAmber Ale                         1.030  0.30282    
## beer_styleAmber Lager/Vienna                1.124  0.26114    
## beer_styleAmerican Pale Ale                 1.095  0.27363    
## beer_styleAmerican Strong Ale               1.871  0.06134 .  
## beer_styleBaltic Porter                    -0.262  0.79344    
## beer_styleBarley Wine                       0.981  0.32671    
## beer_styleBelgian Ale                       0.788  0.43044    
## beer_styleBelgian Strong Ale                0.892  0.37264    
## beer_styleBerliner Weisse                  -0.330  0.74119    
## beer_styleBire de Garde                     1.119  0.26294    
## beer_styleBitter                            1.586  0.11272    
## beer_styleBlack IPA                         0.381  0.70344    
## beer_styleBrown Ale                        -0.523  0.60119    
## beer_styleCalifornia Common                -0.379  0.70460    
## beer_styleCider                            -0.176  0.86063    
## beer_styleCream Ale                        -0.654  0.51297    
## beer_styleCzech Pilsner (Svtl)              1.039  0.29902    
## beer_styleDoppelbock                       -0.050  0.95996    
## beer_styleDortmunder/Helles                 1.031  0.30262    
## beer_styleDry Stout                         1.076  0.28214    
## beer_styleDunkel/Tmav                       0.086  0.93136    
## beer_styleDunkelweizen                      0.028  0.97758    
## beer_styleDunkler Bock                     -0.571  0.56787    
## beer_styleEisbock                           4.506 6.62e-06 ***
## beer_styleEnglish Pale Ale                  0.991  0.32158    
## beer_styleEnglish Strong Ale                0.270  0.78700    
## beer_styleForeign Stout                     1.314  0.18890    
## beer_styleFruit Beer                        0.611  0.54112    
## beer_styleGerman Hefeweizen                 0.927  0.35400    
## beer_styleGerman Kristallweizen            -0.227  0.82031    
## beer_styleGolden Ale/Blond Ale              0.369  0.71229    
## beer_styleGrodziskie/Gose/Lichtenhainer     0.706  0.47992    
## beer_styleHeller Bock                       0.895  0.37076    
## beer_styleIce Cider/Ice Perry              -0.558  0.57655    
## beer_styleImperial IPA                      2.669  0.00760 ** 
## beer_styleImperial Pils/Strong Pale Lager  -0.187  0.85152    
## beer_styleImperial Porter                   0.901  0.36769    
## beer_styleImperial Stout                    1.589  0.11200    
## beer_styleIndia Pale Ale (IPA)              1.555  0.11989    
## beer_styleIrish Ale                        -0.349  0.72687    
## beer_styleKlsch                            -0.514  0.60698    
## beer_styleLambic Style - Faro               1.397  0.16244    
## beer_styleLambic Style - Fruit              0.983  0.32540    
## beer_styleLambic Style - Gueuze             0.413  0.67949    
## beer_styleLambic Style - Unblended         -0.731  0.46499    
## beer_styleLow Alcohol                       1.463  0.14353    
## beer_styleMalt Liquor                       6.928 4.32e-12 ***
## beer_styleMead                             -0.074  0.94125    
## beer_styleMild Ale                         -0.914  0.36059    
## beer_styleOktoberfest/Mrzen                -0.237  0.81251    
## beer_styleOld Ale                          -0.789  0.43030    
## beer_stylePale Lager                        3.079  0.00208 ** 
## beer_stylePerry                            -0.351  0.72528    
## beer_stylePilsener                          0.027  0.97845    
## beer_stylePolotmav                          0.037  0.97072    
## beer_stylePorter                            0.429  0.66824    
## beer_stylePremium Bitter/ESB               -0.229  0.81881    
## beer_stylePremium Lager                     1.186  0.23551    
## beer_styleRadler/Shandy                     0.971  0.33133    
## beer_styleSahti/Gotlandsdricke/Kodulu       0.056  0.95570    
## beer_styleSaison                            0.951  0.34184    
## beer_styleSak - Futsu-shu                  -0.007  0.99411    
## beer_styleSak - Ginjo                      -0.056  0.95557    
## beer_styleSak - Infused                     0.016  0.98739    
## beer_styleSak - Junmai                      0.014  0.98844    
## beer_styleSak - Nigori                      0.002  0.99867    
## beer_styleSchwarzbier                       0.067  0.94683    
## beer_styleScotch Ale                       -0.427  0.66929    
## beer_styleScottish Ale                      1.394  0.16324    
## beer_styleSession IPA                       1.136  0.25588    
## beer_styleSmoked                            0.560  0.57529    
## beer_styleSour Red/Brown                    0.142  0.88714    
## beer_styleSour/Wild Ale                     1.156  0.24780    
## beer_styleSpecialty Grain                   0.760  0.44740    
## beer_styleSpice/Herb/Vegetable             -0.046  0.96309    
## beer_styleStout                             0.778  0.43644    
## beer_styleSweet Stout                       0.340  0.73422    
## beer_styleTraditional Ale                  -0.976  0.32918    
## beer_styleWeizen Bock                      -0.808  0.41911    
## beer_styleWheat Ale                         1.678  0.09341 .  
## beer_styleWitbier                           2.576  0.01000 *  
## beer_styleZwickel/Keller/Landbier          -0.419  0.67496    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 94 > 20.
## Use print(x, correlation=TRUE)  or
##   vcov(x)     if you need it

Hypothesis 5: Negation

Plot

# Scatter plot for Hypothesis 5 (negation), drawn from d.user_info.
# x is log(user_num_ratings), y is avg_num_negation and point size is
# var_num_negation. Colour is mapped to user_name, so all legends are
# suppressed.
# names(d.user_info)
ggplot(d.user_info, aes(x = log(user_num_ratings), y = avg_num_negation,
                        size = var_num_negation, col = user_name)) +
  geom_point(alpha = 0.7) +
  ylab("Average number of negations used") +
  xlab("Log user num ratings") +
  # guide = "none" replaces guide = FALSE, which ggplot2 has deprecated.
  scale_colour_discrete(guide = "none") +
  # group = 1 overrides the per-user grouping implied by col = user_name so
  # that a single linear trend is fitted across all points.
  geom_smooth(method = "lm", aes(group = 1)) +
  theme(legend.position = "none")

lmer test

# Disabled filter on the share of misspelled tokens (kept for reference):
# d.filter_non_english <- d %>%
#   filter(num_mispelled_words / num_tokens < 0.25)
# names(d)
# Hypothesis 5 model: num_negation_words as a function of
# log(user_num_ratings), user_num_friends, the review score columns,
# beer_global_score and beer_style, with a random intercept per user_name.
# The formula text is assembled inside local() so that only `f` is assigned
# in the calling environment.
f <- local({
  rhs_terms <- c(
    "log(user_num_ratings)",
    "user_num_friends",
    "review_overall_score",
    "review_taste_score",
    "review_aroma_score",
    "review_palate_score",
    "review_appearance_score",
    "beer_global_score",
    "beer_style",
    "(1 | user_name)"
  )
  paste("num_negation_words ~", paste(rhs_terms, collapse = " + "))
})
summary(lmer(as.formula(f), data = d))
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
##   to degrees of freedom [lmerMod]
## Formula: num_negation_words ~ log(user_num_ratings) + user_num_friends +  
##     review_overall_score + review_taste_score + review_aroma_score +  
##     review_palate_score + review_appearance_score + beer_global_score +  
##     beer_style + (1 | user_name)
##    Data: d
## 
## REML criterion at convergence: 113686.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.5235 -0.5823 -0.2496  0.4508 17.8673 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  user_name (Intercept) 0.1364   0.3693  
##  Residual              0.5779   0.7602  
## Number of obs: 47791, groups:  user_name, 4679
## 
## Fixed effects:
##                                             Estimate Std. Error         df
## (Intercept)                                1.434e+00  4.387e-02  4.670e+04
## log(user_num_ratings)                     -1.390e-02  4.165e-03  3.262e+03
## user_num_friends                           1.132e-03  1.850e-03  1.946e+03
## review_overall_score                      -1.161e-02  2.037e-03  4.531e+04
## review_taste_score                        -3.577e-02  4.128e-03  4.770e+04
## review_aroma_score                        -2.159e-02  3.218e-03  4.770e+04
## review_palate_score                       -2.095e-02  6.174e-03  4.766e+04
## review_appearance_score                   -2.239e-02  5.506e-03  4.766e+04
## beer_global_score                         -2.061e-03  2.085e-04  4.770e+04
## beer_styleAbbey Tripel                    -3.439e-02  4.294e-02  4.646e+04
## beer_styleAbt/Quadrupel                    6.962e-02  4.894e-02  4.647e+04
## beer_styleAltbier                          5.201e-02  7.365e-02  4.655e+04
## beer_styleAmber Ale                        1.570e-02  4.264e-02  4.663e+04
## beer_styleAmber Lager/Vienna              -8.556e-02  4.599e-02  4.696e+04
## beer_styleAmerican Pale Ale               -7.092e-02  3.815e-02  4.673e+04
## beer_styleAmerican Strong Ale              3.963e-02  4.250e-02  4.654e+04
## beer_styleBaltic Porter                    4.260e-02  5.860e-02  4.695e+04
## beer_styleBarley Wine                      8.338e-02  4.173e-02  4.648e+04
## beer_styleBelgian Ale                     -9.112e-02  4.221e-02  4.642e+04
## beer_styleBelgian Strong Ale               9.299e-03  3.773e-02  4.649e+04
## beer_styleBerliner Weisse                  1.392e-01  6.713e-02  4.614e+04
## beer_styleBire de Garde                   -5.503e-02  7.789e-02  4.640e+04
## beer_styleBitter                          -4.901e-02  5.054e-02  4.739e+04
## beer_styleBlack IPA                        1.947e-02  4.685e-02  4.660e+04
## beer_styleBrown Ale                       -4.504e-02  4.262e-02  4.663e+04
## beer_styleCalifornia Common               -3.603e-02  8.127e-02  4.654e+04
## beer_styleCider                            1.643e-01  5.857e-02  4.769e+04
## beer_styleCream Ale                        4.110e-02  7.927e-02  4.674e+04
## beer_styleCzech Pilsner (Svtl)            -1.639e-01  5.677e-02  4.706e+04
## beer_styleDoppelbock                      -2.237e-02  4.892e-02  4.645e+04
## beer_styleDortmunder/Helles               -1.273e-01  5.702e-02  4.668e+04
## beer_styleDry Stout                       -3.366e-02  5.335e-02  4.685e+04
## beer_styleDunkel/Tmav                     -5.805e-02  5.551e-02  4.677e+04
## beer_styleDunkelweizen                    -1.178e-01  6.539e-02  4.643e+04
## beer_styleDunkler Bock                     3.346e-02  6.874e-02  4.616e+04
## beer_styleEisbock                         -1.754e-01  1.174e-01  4.652e+04
## beer_styleEnglish Pale Ale                -9.925e-02  6.138e-02  4.634e+04
## beer_styleEnglish Strong Ale              -6.829e-02  5.175e-02  4.651e+04
## beer_styleForeign Stout                   -7.789e-03  6.257e-02  4.655e+04
## beer_styleFruit Beer                       7.420e-02  4.252e-02  4.673e+04
## beer_styleGerman Hefeweizen               -1.014e-01  4.442e-02  4.680e+04
## beer_styleGerman Kristallweizen           -5.681e-02  1.216e-01  4.684e+04
## beer_styleGolden Ale/Blond Ale            -8.517e-02  4.520e-02  4.702e+04
## beer_styleGrodziskie/Gose/Lichtenhainer    1.197e-01  7.047e-02  4.581e+04
## beer_styleHeller Bock                      3.366e-02  6.131e-02  4.650e+04
## beer_styleIce Cider/Ice Perry              3.782e-01  2.379e-01  4.646e+04
## beer_styleImperial IPA                     4.507e-02  3.699e-02  4.684e+04
## beer_styleImperial Pils/Strong Pale Lager -5.539e-04  5.870e-02  4.668e+04
## beer_styleImperial Porter                  6.522e-03  5.503e-02  4.619e+04
## beer_styleImperial Stout                   7.699e-02  3.694e-02  4.672e+04
## beer_styleIndia Pale Ale (IPA)             4.707e-04  3.552e-02  4.681e+04
## beer_styleIrish Ale                       -1.080e-01  6.087e-02  4.672e+04
## beer_styleKlsch                            7.382e-02  6.894e-02  4.656e+04
## beer_styleLambic Style - Faro              1.492e-01  1.806e-01  4.553e+04
## beer_styleLambic Style - Fruit             9.708e-02  5.439e-02  4.658e+04
## beer_styleLambic Style - Gueuze            6.889e-02  6.748e-02  4.671e+04
## beer_styleLambic Style - Unblended         9.028e-03  1.158e-01  4.581e+04
## beer_styleLow Alcohol                     -3.256e-01  7.740e-02  4.650e+04
## beer_styleMalt Liquor                     -1.351e-01  6.681e-02  4.769e+04
## beer_styleMead                             2.085e-01  8.256e-02  4.766e+04
## beer_styleMild Ale                        -1.076e-01  9.594e-02  4.680e+04
## beer_styleOktoberfest/Mrzen               -5.254e-03  5.228e-02  4.658e+04
## beer_styleOld Ale                          3.767e-02  5.926e-02  4.635e+04
## beer_stylePale Lager                      -2.381e-01  4.010e-02  4.714e+04
## beer_stylePerry                           -1.993e-01  1.734e-01  4.625e+04
## beer_stylePilsener                        -8.217e-02  4.357e-02  4.684e+04
## beer_stylePolotmav                        -3.670e-01  1.830e-01  4.700e+04
## beer_stylePorter                          -4.486e-02  3.984e-02  4.670e+04
## beer_stylePremium Bitter/ESB              -5.968e-02  4.570e-02  4.669e+04
## beer_stylePremium Lager                   -7.027e-02  5.081e-02  4.685e+04
## beer_styleRadler/Shandy                   -1.601e-01  8.602e-02  4.740e+04
## beer_styleSahti/Gotlandsdricke/Kodulu      1.824e-01  2.260e-01  4.569e+04
## beer_styleSaison                          -4.810e-02  4.209e-02  4.656e+04
## beer_styleSak - Futsu-shu                  7.006e-02  5.694e-01  4.464e+04
## beer_styleSak - Ginjo                     -7.137e-01  4.556e-01  4.766e+04
## beer_styleSak - Infused                   -5.293e-01  7.709e-01  4.469e+04
## beer_styleSak - Junmai                     2.983e-01  5.448e-01  4.461e+04
## beer_styleSak - Nigori                    -4.754e-01  3.940e-01  4.769e+04
## beer_styleSchwarzbier                     -5.571e-02  6.145e-02  4.665e+04
## beer_styleScotch Ale                      -1.837e-03  5.359e-02  4.664e+04
## beer_styleScottish Ale                    -4.386e-02  8.273e-02  4.688e+04
## beer_styleSession IPA                     -8.507e-02  5.274e-02  4.643e+04
## beer_styleSmoked                           7.304e-02  5.399e-02  4.631e+04
## beer_styleSour Red/Brown                   9.436e-03  5.291e-02  4.631e+04
## beer_styleSour/Wild Ale                    4.693e-03  4.579e-02  4.658e+04
## beer_styleSpecialty Grain                 -4.605e-02  5.980e-02  4.642e+04
## beer_styleSpice/Herb/Vegetable             1.088e-01  4.094e-02  4.678e+04
## beer_styleStout                           -2.074e-02  4.169e-02  4.655e+04
## beer_styleSweet Stout                      1.910e-02  4.533e-02  4.669e+04
## beer_styleTraditional Ale                  1.736e-01  6.424e-02  4.636e+04
## beer_styleWeizen Bock                     -3.822e-02  6.375e-02  4.608e+04
## beer_styleWheat Ale                        3.298e-02  4.630e-02  4.676e+04
## beer_styleWitbier                         -2.866e-02  4.412e-02  4.670e+04
## beer_styleZwickel/Keller/Landbier         -2.002e-02  8.445e-02  4.737e+04
##                                           t value Pr(>|t|)    
## (Intercept)                                32.700  < 2e-16 ***
## log(user_num_ratings)                      -3.337 0.000857 ***
## user_num_friends                            0.612 0.540831    
## review_overall_score                       -5.699 1.21e-08 ***
## review_taste_score                         -8.667  < 2e-16 ***
## review_aroma_score                         -6.711 1.96e-11 ***
## review_palate_score                        -3.394 0.000690 ***
## review_appearance_score                    -4.067 4.78e-05 ***
## beer_global_score                          -9.886  < 2e-16 ***
## beer_styleAbbey Tripel                     -0.801 0.423311    
## beer_styleAbt/Quadrupel                     1.422 0.154890    
## beer_styleAltbier                           0.706 0.480121    
## beer_styleAmber Ale                         0.368 0.712800    
## beer_styleAmber Lager/Vienna               -1.860 0.062822 .  
## beer_styleAmerican Pale Ale                -1.859 0.063048 .  
## beer_styleAmerican Strong Ale               0.932 0.351109    
## beer_styleBaltic Porter                     0.727 0.467230    
## beer_styleBarley Wine                       1.998 0.045730 *  
## beer_styleBelgian Ale                      -2.159 0.030858 *  
## beer_styleBelgian Strong Ale                0.246 0.805309    
## beer_styleBerliner Weisse                   2.073 0.038133 *  
## beer_styleBire de Garde                    -0.706 0.479881    
## beer_styleBitter                           -0.970 0.332207    
## beer_styleBlack IPA                         0.416 0.677745    
## beer_styleBrown Ale                        -1.057 0.290652    
## beer_styleCalifornia Common                -0.443 0.657526    
## beer_styleCider                             2.804 0.005042 ** 
## beer_styleCream Ale                         0.519 0.604101    
## beer_styleCzech Pilsner (Svtl)             -2.886 0.003900 ** 
## beer_styleDoppelbock                       -0.457 0.647541    
## beer_styleDortmunder/Helles                -2.233 0.025564 *  
## beer_styleDry Stout                        -0.631 0.528013    
## beer_styleDunkel/Tmav                      -1.046 0.295666    
## beer_styleDunkelweizen                     -1.802 0.071572 .  
## beer_styleDunkler Bock                      0.487 0.626500    
## beer_styleEisbock                          -1.495 0.135050    
## beer_styleEnglish Pale Ale                 -1.617 0.105884    
## beer_styleEnglish Strong Ale               -1.320 0.186958    
## beer_styleForeign Stout                    -0.124 0.900938    
## beer_styleFruit Beer                        1.745 0.081025 .  
## beer_styleGerman Hefeweizen                -2.282 0.022497 *  
## beer_styleGerman Kristallweizen            -0.467 0.640304    
## beer_styleGolden Ale/Blond Ale             -1.884 0.059566 .  
## beer_styleGrodziskie/Gose/Lichtenhainer     1.698 0.089517 .  
## beer_styleHeller Bock                       0.549 0.583010    
## beer_styleIce Cider/Ice Perry               1.590 0.111853    
## beer_styleImperial IPA                      1.218 0.223098    
## beer_styleImperial Pils/Strong Pale Lager  -0.009 0.992471    
## beer_styleImperial Porter                   0.119 0.905669    
## beer_styleImperial Stout                    2.084 0.037163 *  
## beer_styleIndia Pale Ale (IPA)              0.013 0.989428    
## beer_styleIrish Ale                        -1.775 0.075966 .  
## beer_styleKlsch                             1.071 0.284293    
## beer_styleLambic Style - Faro               0.826 0.408643    
## beer_styleLambic Style - Fruit              1.785 0.074295 .  
## beer_styleLambic Style - Gueuze             1.021 0.307292    
## beer_styleLambic Style - Unblended          0.078 0.937852    
## beer_styleLow Alcohol                      -4.207 2.59e-05 ***
## beer_styleMalt Liquor                      -2.021 0.043236 *  
## beer_styleMead                              2.525 0.011563 *  
## beer_styleMild Ale                         -1.122 0.261891    
## beer_styleOktoberfest/Mrzen                -0.101 0.919940    
## beer_styleOld Ale                           0.636 0.524937    
## beer_stylePale Lager                       -5.937 2.93e-09 ***
## beer_stylePerry                            -1.149 0.250481    
## beer_stylePilsener                         -1.886 0.059336 .  
## beer_stylePolotmav                         -2.005 0.044971 *  
## beer_stylePorter                           -1.126 0.260148    
## beer_stylePremium Bitter/ESB               -1.306 0.191632    
## beer_stylePremium Lager                    -1.383 0.166663    
## beer_styleRadler/Shandy                    -1.861 0.062787 .  
## beer_styleSahti/Gotlandsdricke/Kodulu       0.807 0.419567    
## beer_styleSaison                           -1.143 0.253118    
## beer_styleSak - Futsu-shu                   0.123 0.902078    
## beer_styleSak - Ginjo                      -1.567 0.117208    
## beer_styleSak - Infused                    -0.687 0.492353    
## beer_styleSak - Junmai                      0.548 0.583968    
## beer_styleSak - Nigori                     -1.207 0.227550    
## beer_styleSchwarzbier                      -0.907 0.364668    
## beer_styleScotch Ale                       -0.034 0.972658    
## beer_styleScottish Ale                     -0.530 0.595961    
## beer_styleSession IPA                      -1.613 0.106766    
## beer_styleSmoked                            1.353 0.176136    
## beer_styleSour Red/Brown                    0.178 0.858450    
## beer_styleSour/Wild Ale                     0.103 0.918357    
## beer_styleSpecialty Grain                  -0.770 0.441213    
## beer_styleSpice/Herb/Vegetable              2.657 0.007888 ** 
## beer_styleStout                            -0.497 0.618856    
## beer_styleSweet Stout                       0.421 0.673505    
## beer_styleTraditional Ale                   2.702 0.006892 ** 
## beer_styleWeizen Bock                      -0.600 0.548753    
## beer_styleWheat Ale                         0.712 0.476256    
## beer_styleWitbier                          -0.650 0.515964    
## beer_styleZwickel/Keller/Landbier          -0.237 0.812569    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 94 > 20.
## Use print(x, correlation=TRUE)  or
##   vcov(x)     if you need it

Looks like experts tend to use negation less…

Hypothesis 6: Lexical Diversity via Corrected TTR

Plot

# Per-user average corrected TTR against log rating count. Point size is
# mapped to var_cttr and colour to user_name; all legends are hidden.
ggplot(
  data = d.user_info,
  mapping = aes(
    x = log(user_num_ratings),
    y = avg_cttr,
    size = var_cttr,
    colour = user_name
  )
) +
  geom_point(alpha = 0.7) +
  labs(
    x = "Log user num ratings",
    y = "Average corrected type token ratio"
  ) +
  scale_colour_discrete(guide = FALSE) +
  # group = 1 overrides the per-user colour grouping so a single linear
  # fit is drawn across all points.
  geom_smooth(method = "lm", aes(group = 1)) +
  theme(legend.position = "none")

lmer test

# Disabled pre-filter, kept for reference:
# d.filter_non_english <- d %>%
#   filter(num_mispelled_words / num_tokens < 0.25)

# Linear mixed model for corrected_ttr: the fixed effects listed below plus
# a random intercept for each user_name.
cttr_terms <- c(
  "log(user_num_ratings)",
  "user_num_friends",
  "review_overall_score",
  "review_taste_score",
  "review_aroma_score",
  "review_palate_score",
  "review_appearance_score",
  "beer_global_score",
  "(1 | user_name)"
)
f <- paste("corrected_ttr ~", paste(cttr_terms, collapse = " + "))
cttr_fit <- lmer(as.formula(f), data = d)
summary(cttr_fit)
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
##   to degrees of freedom [lmerMod]
## Formula: 
## corrected_ttr ~ log(user_num_ratings) + user_num_friends + review_overall_score +  
##     review_taste_score + review_aroma_score + review_palate_score +  
##     review_appearance_score + beer_global_score + (1 | user_name)
##    Data: d
## 
## REML criterion at convergence: 109681.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.9153 -0.4656  0.0018  0.5183  7.9317 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  user_name (Intercept) 0.4256   0.6524  
##  Residual              0.4971   0.7050  
## Number of obs: 47791, groups:  user_name, 4679
## 
## Fixed effects:
##                           Estimate Std. Error         df t value Pr(>|t|)
## (Intercept)              3.312e+00  2.548e-02  2.264e+04 129.988  < 2e-16
## log(user_num_ratings)    5.051e-02  6.155e-03  3.807e+03   8.206 4.44e-16
## user_num_friends         5.242e-03  3.038e-03  2.814e+03   1.725   0.0846
## review_overall_score     4.102e-03  1.981e-03  4.778e+04   2.071   0.0384
## review_taste_score      -2.537e-03  3.928e-03  4.693e+04  -0.646   0.5184
## review_aroma_score       5.659e-03  3.040e-03  4.691e+04   1.861   0.0627
## review_palate_score     -1.298e-02  5.850e-03  4.651e+04  -2.220   0.0265
## review_appearance_score  8.640e-03  5.173e-03  4.652e+04   1.670   0.0949
## beer_global_score        1.066e-03  1.541e-04  4.778e+04   6.917 4.66e-12
##                            
## (Intercept)             ***
## log(user_num_ratings)   ***
## user_num_friends        .  
## review_overall_score    *  
## review_taste_score         
## review_aroma_score      .  
## review_palate_score     *  
## review_appearance_score .  
## beer_global_score       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) lg(__) usr_n_ rvw_v_ rvw_t_ rvw_r_ rvw_pl_ rvw_pp_
## lg(sr_nm_r) -0.519                                                   
## usr_nm_frnd  0.127 -0.415                                            
## rvw_vrll_sc -0.014 -0.025  0.005                                     
## rvw_tst_scr -0.048  0.032 -0.007 -0.556                              
## revw_rm_scr -0.090  0.008 -0.009 -0.211 -0.254                       
## rvw_plt_scr -0.178  0.025  0.000 -0.206 -0.234 -0.041                
## rvw_pprnc_s -0.380  0.055 -0.001 -0.094 -0.044 -0.113 -0.174         
## br_glbl_scr  0.115 -0.075  0.003 -0.126 -0.027 -0.213 -0.054  -0.116

Hypothesis 7: Readability

Plot

# Per-user average readability against log rating count. Point size is
# mapped to var_readability and colour to user_name; only the colour legend
# is suppressed here.
ggplot(
  data = d.user_info,
  mapping = aes(
    x = log(user_num_ratings),
    y = avg_readability,
    size = var_readability,
    colour = user_name
  )
) +
  geom_point(alpha = 0.7) +
  labs(
    x = "Log user num ratings",
    y = "Average readability score"
  ) +
  scale_colour_discrete(guide = FALSE) +
  # group = 1 overrides the per-user colour grouping so a single linear
  # fit is drawn across all points.
  geom_smooth(method = "lm", aes(group = 1))

lmer test

# d.filter_non_english <- d %>%
#   filter(num_mispelled_words / num_tokens < 0.25)
# names(d)
# Fit a linear mixed model (random intercept per user_name, beer_style added
# as a fixed effect) and print its summary.
# NOTE(review): this chunk sits under the "Readability" hypothesis, but the
# response variable is corrected_ttr, i.e. the same response as the lexical
# diversity model. Presumably the per-review readability column of `d` was
# intended -- TODO confirm the column name and swap it in before interpreting
# this model as a readability result.
f <- "corrected_ttr ~ log(user_num_ratings) + user_num_friends + review_overall_score + review_taste_score + review_aroma_score + review_palate_score + review_appearance_score + beer_global_score + beer_style + (1 | user_name)"
summary(lmer(as.formula(f), data = d))
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
##   to degrees of freedom [lmerMod]
## Formula: 
## corrected_ttr ~ log(user_num_ratings) + user_num_friends + review_overall_score +  
##     review_taste_score + review_aroma_score + review_palate_score +  
##     review_appearance_score + beer_global_score + beer_style +  
##     (1 | user_name)
##    Data: d
## 
## REML criterion at convergence: 109628.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.8190 -0.4652  0.0039  0.5184  7.9171 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  user_name (Intercept) 0.4222   0.6498  
##  Residual              0.4938   0.7027  
## Number of obs: 47791, groups:  user_name, 4679
## 
## Fixed effects:
##                                             Estimate Std. Error         df
## (Intercept)                                3.365e+00  4.294e-02  4.525e+04
## log(user_num_ratings)                      5.022e-02  6.137e-03  3.819e+03
## user_num_friends                           4.891e-03  3.027e-03  2.817e+03
## review_overall_score                       4.925e-03  1.979e-03  4.769e+04
## review_taste_score                        -1.718e-03  3.921e-03  4.685e+04
## review_aroma_score                         2.610e-03  3.057e-03  4.686e+04
## review_palate_score                       -1.482e-02  5.843e-03  4.644e+04
## review_appearance_score                    8.783e-03  5.212e-03  4.650e+04
## beer_global_score                          1.106e-04  1.982e-04  4.698e+04
## beer_styleAbbey Tripel                     2.723e-02  4.018e-02  4.534e+04
## beer_styleAbt/Quadrupel                    1.487e-01  4.580e-02  4.536e+04
## beer_styleAltbier                          2.254e-02  6.896e-02  4.544e+04
## beer_styleAmber Ale                       -1.897e-03  3.993e-02  4.539e+04
## beer_styleAmber Lager/Vienna              -4.672e-02  4.319e-02  4.581e+04
## beer_styleAmerican Pale Ale               -3.511e-02  3.575e-02  4.546e+04
## beer_styleAmerican Strong Ale              1.404e-01  3.977e-02  4.531e+04
## beer_styleBaltic Porter                    3.301e-02  5.505e-02  4.587e+04
## beer_styleBarley Wine                      1.574e-01  3.902e-02  4.520e+04
## beer_styleBelgian Ale                     -1.664e-02  3.947e-02  4.524e+04
## beer_styleBelgian Strong Ale               6.558e-02  3.530e-02  4.530e+04
## beer_styleBerliner Weisse                  7.004e-02  6.263e-02  4.486e+04
## beer_styleBire de Garde                    4.627e-02  7.285e-02  4.528e+04
## beer_styleBitter                          -2.151e-02  4.764e-02  4.613e+04
## beer_styleBlack IPA                        1.078e-01  4.386e-02  4.540e+04
## beer_styleBrown Ale                       -2.990e-02  3.991e-02  4.538e+04
## beer_styleCalifornia Common               -1.444e-01  7.610e-02  4.547e+04
## beer_styleCider                            6.082e-02  5.579e-02  4.718e+04
## beer_styleCream Ale                        1.056e-02  7.430e-02  4.553e+04
## beer_styleCzech Pilsner (Svtl)            -1.337e-01  5.332e-02  4.572e+04
## beer_styleDoppelbock                       1.352e-02  4.576e-02  4.526e+04
## beer_styleDortmunder/Helles               -1.510e-01  5.342e-02  4.549e+04
## beer_styleDry Stout                       -4.824e-02  5.003e-02  4.555e+04
## beer_styleDunkel/Tmav                     -7.092e-02  5.206e-02  4.564e+04
## beer_styleDunkelweizen                    -5.576e-02  6.115e-02  4.526e+04
## beer_styleDunkler Bock                    -1.178e-01  6.413e-02  4.481e+04
## beer_styleEisbock                          1.249e-01  1.099e-01  4.537e+04
## beer_styleEnglish Pale Ale                -2.897e-02  5.737e-02  4.519e+04
## beer_styleEnglish Strong Ale              -7.631e-03  4.842e-02  4.528e+04
## beer_styleForeign Stout                    1.320e-01  5.859e-02  4.546e+04
## beer_styleFruit Beer                       8.378e-02  3.985e-02  4.550e+04
## beer_styleGerman Hefeweizen               -7.029e-02  4.166e-02  4.561e+04
## beer_styleGerman Kristallweizen           -8.558e-02  1.141e-01  4.582e+04
## beer_styleGolden Ale/Blond Ale            -6.681e-02  4.247e-02  4.583e+04
## beer_styleGrodziskie/Gose/Lichtenhainer    3.849e-02  6.560e-02  4.453e+04
## beer_styleHeller Bock                     -1.502e-01  5.737e-02  4.533e+04
## beer_styleIce Cider/Ice Perry             -2.112e-01  2.231e-01  4.599e+04
## beer_styleImperial IPA                     1.370e-01  3.468e-02  4.552e+04
## beer_styleImperial Pils/Strong Pale Lager  1.243e-01  5.503e-02  4.565e+04
## beer_styleImperial Porter                  1.087e-01  5.137e-02  4.496e+04
## beer_styleImperial Stout                   1.378e-01  3.461e-02  4.542e+04
## beer_styleIndia Pale Ale (IPA)             1.618e-02  3.330e-02  4.548e+04
## beer_styleIrish Ale                       -7.408e-02  5.712e-02  4.585e+04
## beer_styleKlsch                           -1.101e-01  6.455e-02  4.543e+04
## beer_styleLambic Style - Faro              1.066e-01  1.679e-01  4.439e+04
## beer_styleLambic Style - Fruit             4.649e-02  5.085e-02  4.506e+04
## beer_styleLambic Style - Gueuze            6.289e-02  6.321e-02  4.544e+04
## beer_styleLambic Style - Unblended         2.780e-01  1.077e-01  4.425e+04
## beer_styleLow Alcohol                      1.085e-02  7.233e-02  4.503e+04
## beer_styleMalt Liquor                      2.458e-01  6.365e-02  4.732e+04
## beer_styleMead                            -8.328e-03  7.850e-02  4.673e+04
## beer_styleMild Ale                        -1.376e-01  8.997e-02  4.562e+04
## beer_styleOktoberfest/Mrzen               -1.050e-01  4.892e-02  4.528e+04
## beer_styleOld Ale                          9.998e-02  5.539e-02  4.521e+04
## beer_stylePale Lager                      -5.762e-02  3.771e-02  4.589e+04
## beer_stylePerry                           -1.673e-01  1.618e-01  4.483e+04
## beer_stylePilsener                        -1.068e-01  4.087e-02  4.561e+04
## beer_stylePolotmav                         3.687e-02  1.722e-01  4.616e+04
## beer_stylePorter                          -1.680e-02  3.733e-02  4.545e+04
## beer_stylePremium Bitter/ESB              -1.236e-02  4.281e-02  4.544e+04
## beer_stylePremium Lager                   -9.976e-02  4.769e-02  4.574e+04
## beer_styleRadler/Shandy                   -4.164e-02  8.133e-02  4.669e+04
## beer_styleSahti/Gotlandsdricke/Kodulu     -1.363e-01  2.102e-01  4.433e+04
## beer_styleSaison                           3.922e-02  3.938e-02  4.524e+04
## beer_styleSak - Futsu-shu                  6.147e-02  5.713e-01  3.989e+04
## beer_styleSak - Ginjo                     -8.437e-01  4.335e-01  4.717e+04
## beer_styleSak - Infused                   -1.468e-01  7.134e-01  4.362e+04
## beer_styleSak - Junmai                     7.067e-04  5.041e-01  4.360e+04
## beer_styleSak - Nigori                    -2.394e-01  3.782e-01  4.768e+04
## beer_styleSchwarzbier                     -1.013e-01  5.760e-02  4.561e+04
## beer_styleScotch Ale                       3.699e-02  5.020e-02  4.546e+04
## beer_styleScottish Ale                    -5.654e-03  7.771e-02  4.590e+04
## beer_styleSession IPA                      4.595e-02  4.930e-02  4.516e+04
## beer_styleSmoked                           8.120e-02  5.044e-02  4.505e+04
## beer_styleSour Red/Brown                   2.026e-02  4.941e-02  4.497e+04
## beer_styleSour/Wild Ale                    9.137e-02  4.282e-02  4.513e+04
## beer_styleSpecialty Grain                  8.717e-02  5.590e-02  4.518e+04
## beer_styleSpice/Herb/Vegetable             9.030e-02  3.838e-02  4.554e+04
## beer_styleStout                           -3.376e-02  3.901e-02  4.534e+04
## beer_styleSweet Stout                     -1.004e-02  4.246e-02  4.542e+04
## beer_styleTraditional Ale                  1.449e-01  6.005e-02  4.518e+04
## beer_styleWeizen Bock                      8.951e-02  5.946e-02  4.484e+04
## beer_styleWheat Ale                        4.346e-03  4.342e-02  4.564e+04
## beer_styleWitbier                          2.591e-02  4.135e-02  4.553e+04
## beer_styleZwickel/Keller/Landbier          1.284e-02  7.986e-02  4.676e+04
##                                           t value Pr(>|t|)    
## (Intercept)                                78.382  < 2e-16 ***
## log(user_num_ratings)                       8.183 4.44e-16 ***
## user_num_friends                            1.616 0.106291    
## review_overall_score                        2.488 0.012845 *  
## review_taste_score                         -0.438 0.661324    
## review_aroma_score                          0.854 0.393196    
## review_palate_score                        -2.536 0.011207 *  
## review_appearance_score                     1.685 0.091972 .  
## beer_global_score                           0.558 0.576846    
## beer_styleAbbey Tripel                      0.678 0.497986    
## beer_styleAbt/Quadrupel                     3.246 0.001170 ** 
## beer_styleAltbier                           0.327 0.743820    
## beer_styleAmber Ale                        -0.048 0.962106    
## beer_styleAmber Lager/Vienna               -1.082 0.279413    
## beer_styleAmerican Pale Ale                -0.982 0.325947    
## beer_styleAmerican Strong Ale               3.530 0.000416 ***
## beer_styleBaltic Porter                     0.600 0.548781    
## beer_styleBarley Wine                       4.035 5.48e-05 ***
## beer_styleBelgian Ale                      -0.422 0.673356    
## beer_styleBelgian Strong Ale                1.858 0.063199 .  
## beer_styleBerliner Weisse                   1.118 0.263410    
## beer_styleBire de Garde                     0.635 0.525310    
## beer_styleBitter                           -0.451 0.651710    
## beer_styleBlack IPA                         2.457 0.014027 *  
## beer_styleBrown Ale                        -0.749 0.453780    
## beer_styleCalifornia Common                -1.898 0.057700 .  
## beer_styleCider                             1.090 0.275631    
## beer_styleCream Ale                         0.142 0.886951    
## beer_styleCzech Pilsner (Svtl)             -2.508 0.012159 *  
## beer_styleDoppelbock                        0.296 0.767591    
## beer_styleDortmunder/Helles                -2.827 0.004698 ** 
## beer_styleDry Stout                        -0.964 0.334950    
## beer_styleDunkel/Tmav                      -1.362 0.173134    
## beer_styleDunkelweizen                     -0.912 0.361831    
## beer_styleDunkler Bock                     -1.837 0.066286 .  
## beer_styleEisbock                           1.136 0.255822    
## beer_styleEnglish Pale Ale                 -0.505 0.613527    
## beer_styleEnglish Strong Ale               -0.158 0.874768    
## beer_styleForeign Stout                     2.254 0.024226 *  
## beer_styleFruit Beer                        2.102 0.035522 *  
## beer_styleGerman Hefeweizen                -1.687 0.091558 .  
## beer_styleGerman Kristallweizen            -0.750 0.453358    
## beer_styleGolden Ale/Blond Ale             -1.573 0.115695    
## beer_styleGrodziskie/Gose/Lichtenhainer     0.587 0.557415    
## beer_styleHeller Bock                      -2.619 0.008833 ** 
## beer_styleIce Cider/Ice Perry              -0.947 0.343688    
## beer_styleImperial IPA                      3.950 7.83e-05 ***
## beer_styleImperial Pils/Strong Pale Lager   2.259 0.023878 *  
## beer_styleImperial Porter                   2.116 0.034361 *  
## beer_styleImperial Stout                    3.981 6.89e-05 ***
## beer_styleIndia Pale Ale (IPA)              0.486 0.626996    
## beer_styleIrish Ale                        -1.297 0.194688    
## beer_styleKlsch                            -1.706 0.087973 .  
## beer_styleLambic Style - Faro               0.635 0.525453    
## beer_styleLambic Style - Fruit              0.914 0.360634    
## beer_styleLambic Style - Gueuze             0.995 0.319768    
## beer_styleLambic Style - Unblended          2.582 0.009826 ** 
## beer_styleLow Alcohol                       0.150 0.880812    
## beer_styleMalt Liquor                       3.861 0.000113 ***
## beer_styleMead                             -0.106 0.915507    
## beer_styleMild Ale                         -1.530 0.126140    
## beer_styleOktoberfest/Mrzen                -2.147 0.031771 *  
## beer_styleOld Ale                           1.805 0.071105 .  
## beer_stylePale Lager                       -1.528 0.126492    
## beer_stylePerry                            -1.034 0.301247    
## beer_stylePilsener                         -2.612 0.008992 ** 
## beer_stylePolotmav                          0.214 0.830487    
## beer_stylePorter                           -0.450 0.652563    
## beer_stylePremium Bitter/ESB               -0.289 0.772828    
## beer_stylePremium Lager                    -2.092 0.036446 *  
## beer_styleRadler/Shandy                    -0.512 0.608687    
## beer_styleSahti/Gotlandsdricke/Kodulu      -0.648 0.516753    
## beer_styleSaison                            0.996 0.319245    
## beer_styleSak - Futsu-shu                   0.108 0.914318    
## beer_styleSak - Ginjo                      -1.946 0.051619 .  
## beer_styleSak - Infused                    -0.206 0.836921    
## beer_styleSak - Junmai                      0.001 0.998881    
## beer_styleSak - Nigori                     -0.633 0.526764    
## beer_styleSchwarzbier                      -1.758 0.078677 .  
## beer_styleScotch Ale                        0.737 0.461183    
## beer_styleScottish Ale                     -0.073 0.942002    
## beer_styleSession IPA                       0.932 0.351302    
## beer_styleSmoked                            1.610 0.107432    
## beer_styleSour Red/Brown                    0.410 0.681780    
## beer_styleSour/Wild Ale                     2.134 0.032870 *  
## beer_styleSpecialty Grain                   1.559 0.118944    
## beer_styleSpice/Herb/Vegetable              2.353 0.018631 *  
## beer_styleStout                            -0.865 0.386875    
## beer_styleSweet Stout                      -0.237 0.813012    
## beer_styleTraditional Ale                   2.414 0.015794 *  
## beer_styleWeizen Bock                       1.505 0.132214    
## beer_styleWheat Ale                         0.100 0.920283    
## beer_styleWitbier                           0.627 0.530924    
## beer_styleZwickel/Keller/Landbier           0.161 0.872228    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 94 > 20.
## Use print(x, correlation=TRUE)  or
##   vcov(x)     if you need it

Readability appears to increase…

Hypothesis 8: Self-similarity

Plot

# Per-user review self-similarity (review_sims) against log rating count,
# coloured by user_name, with one pooled linear fit and no legend.
ggplot(
  d.user_info,
  aes(x = log(user_num_ratings), y = review_sims, col = user_name)
) +
  geom_point(alpha = 0.7) +
  # Label the quantity actually plotted; the previous label was a leftover
  # from the readability plot.
  ylab("Review self-similarity") +
  xlab("Log user num ratings") +
  scale_colour_discrete(guide = "none") +
  # group = 1 overrides the per-user colour grouping so a single linear
  # fit is drawn across all points.
  geom_smooth(method = "lm", aes(group = 1)) +
  # legend.position is a theme() setting; guides() only accepts aesthetic
  # names, so passing it there did not hide anything.
  theme(legend.position = "none")

Normal LM

# Simple linear regression of review_sims on log rating count, fitted on
# d.user_info; the coefficient table is printed via summary().
f <- "review_sims ~ log(user_num_ratings)"
sims_fit <- lm(f, data = d.user_info)
summary(sims_fit)
## 
## Call:
## lm(formula = f, data = d.user_info)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.28967 -0.08149 -0.01002  0.06626  0.65491 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.195836   0.012917   15.16   <2e-16 ***
## log(user_num_ratings) 0.026263   0.002491   10.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1152 on 894 degrees of freedom
## Multiple R-squared:  0.1106, Adjusted R-squared:  0.1096 
## F-statistic: 111.2 on 1 and 894 DF,  p-value: < 2.2e-16

Clearly there’s an effect here. We can’t run lmer, because this is an average score. We can regress with multiple regression.

Hypothesis 9: Ratings variance vs average overall scores

Plot average scores

# Stack the five avg_*_score columns of d.user_info into long form
# (type = source column name, value = score), then plot each against log
# rating count with one linear fit per type. The colour legend is hidden.
aspect_means <- gather(
  d.user_info, type, value,
  avg_overall_score, avg_taste_score, avg_aroma_score,
  avg_palate_score, avg_appearance_score
)
ggplot(
  data = aspect_means,
  mapping = aes(x = log(user_num_ratings), y = value, colour = type)
) +
  geom_point(alpha = 0.7) +
  labs(
    x = "Log user num ratings",
    y = "Mean aspect score"
  ) +
  scale_colour_discrete(guide = FALSE) +
  geom_smooth(method = "lm")

Plot variance scores

# Rating variance per aspect against log rating count, one facet per aspect.
# Rows with fewer than 50 ratings are dropped before plotting.
d.user_info %>%
  filter(user_num_ratings >= 50) %>%
  # Gather the variance columns only. The appearance entry previously pulled
  # avg_appearance_score, which put a mean score on a variance axis.
  # NOTE(review): assumes d.user_info has var_appearance_score, following the
  # naming of the other var_* columns -- confirm.
  gather(
    type, value,
    c(var_taste_score, var_aroma_score, var_palate_score,
      var_appearance_score)
  ) %>%
  ggplot(., aes(x = log(user_num_ratings), y = value, col = type)) +
  geom_point(alpha = 0.7) +
  ylab("Rating variance") +
  xlab("Log user num ratings") +
  geom_smooth(method = "lm") +
  facet_wrap(~type, nrow = 4)

Normal LM

# Simple linear regression of var_taste_score on log rating count, fitted on
# d.user_info; the coefficient table is printed via summary().
f <- "var_taste_score ~ log(user_num_ratings)"
taste_var_fit <- lm(f, data = d.user_info)
summary(taste_var_fit)
## 
## Call:
## lm(formula = f, data = d.user_info)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7750 -1.0678 -0.3616  0.6887  9.1457 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            4.84179    0.18370   26.36   <2e-16 ***
## log(user_num_ratings) -0.45852    0.03542  -12.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.638 on 894 degrees of freedom
## Multiple R-squared:  0.1579, Adjusted R-squared:  0.1569 
## F-statistic: 167.6 on 1 and 894 DF,  p-value: < 2.2e-16

Clearly there’s an effect here. We can’t run lmer, because this is a per-user variance rather than a per-review score. We can regress with multiple regression.