Load data

Distribution of all variables:

# Histogram of every variable in columns 3-33 of tidy_df, one facet per
# variable, each facet with its own x-axis range.
tidy_df %>%
  pivot_longer(cols = 3:33) %>%
  ggplot(aes(x = value)) +
  geom_histogram() +
  # `scales` is the actual argument name; `scale` only worked through R's
  # partial argument matching.
  facet_wrap(~name, scales = "free_x") +
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 102 rows containing non-finite values (stat_bin).

Distribution of all variables after log-transforming seven of them:

# Replace the listed columns with log(value + 0.001) and add the squared mean
# human rating as a new column.
log_tidy_df <- tidy_df %>%
  mutate_at(
    vars(
      euclidean_distance, imed,
      avg_hausdorff.x, avg_hausdorff.y, avg_hausdorff_f3, avg_hausdorff_l3,
      mahalanobis.y
    ),
    function(x) log(x + 0.001)
  ) %>%
  mutate(sq_human_rating_mean = human_rating_mean^2)

# Histogram of every variable in columns 3-33 of log_tidy_df, one facet per
# variable, each facet with its own x-axis range.
log_tidy_df %>%
  pivot_longer(cols = 3:33) %>%
  ggplot(aes(x = value)) +
  geom_histogram() +
  # `scales` is the actual argument name; `scale` only worked through R's
  # partial argument matching.
  facet_wrap(~name, scales = "free_x") +
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 102 rows containing non-finite values (stat_bin).

Pairwise correlation:

# Correlation plot over columns 3-34; -Inf values in numeric columns are
# turned into NA before plotting.
log_tidy_df %>%
  select(3:34) %>%
  mutate_if(is.numeric, function(col) na_if(col, -Inf)) %>%
  make_corr_plot()

# Correlation plot restricted to the mean human rating and a hand-picked
# subset of the measures.
log_tidy_df %>%
  select(
    human_rating_mean,
    imed, first_three, three_longest, manhattan, chessboard,
    avg_hausdorff.x, ss_index_pt.x, ss_index_bm.x,
    euclidean.y, euclidean, mahalanobis.y, avg_hausdorff_l3
  ) %>%
  make_corr_plot()

Additive models

# Single-predictor model: mean human rating regressed on mahalanobis.y.
summary(lm(human_rating_mean ~ mahalanobis.y, data = log_tidy_df))
## 
## Call:
## lm(formula = human_rating_mean ~ mahalanobis.y, data = log_tidy_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5610 -0.8922  0.0901  0.9282  2.5470 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -0.5756     0.7161  -0.804    0.422    
## mahalanobis.y   8.7511     1.1312   7.736 8.52e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.214 on 398 degrees of freedom
## Multiple R-squared:  0.1307, Adjusted R-squared:  0.1285 
## F-statistic: 59.84 on 1 and 398 DF,  p-value: 8.519e-14
# Two-predictor model: avg_hausdorff.x added alongside mahalanobis.y.
summary(
  lm(human_rating_mean ~ avg_hausdorff.x + mahalanobis.y, data = log_tidy_df)
)
## 
## Call:
## lm(formula = human_rating_mean ~ avg_hausdorff.x + mahalanobis.y, 
##     data = log_tidy_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.97500 -0.79438  0.03128  0.84820  2.81598 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.1310     0.6816   0.192    0.848    
## avg_hausdorff.x   0.7359     0.1024   7.187 3.31e-12 ***
## mahalanobis.y     7.2684     1.0853   6.697 7.27e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.144 on 397 degrees of freedom
## Multiple R-squared:  0.2308, Adjusted R-squared:  0.2269 
## F-statistic: 59.55 on 2 and 397 DF,  p-value: < 2.2e-16
# Three-predictor model (mahalanobis.y, avg_hausdorff.x, euclidean), stored
# as m2.
m2 <- lm(
  human_rating_mean ~ mahalanobis.y + avg_hausdorff.x + euclidean,
  data = log_tidy_df
)
summary(m2)
## 
## Call:
## lm(formula = human_rating_mean ~ mahalanobis.y + avg_hausdorff.x + 
##     euclidean, data = log_tidy_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.79754 -0.80321  0.02951  0.88274  2.66284 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.46915    0.86626   2.850   0.0046 ** 
## mahalanobis.y    7.13799    1.06327   6.713 6.62e-11 ***
## avg_hausdorff.x  1.07160    0.12783   8.383 9.13e-16 ***
## euclidean       -0.01863    0.00440  -4.235 2.85e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.12 on 396 degrees of freedom
## Multiple R-squared:  0.2641, Adjusted R-squared:  0.2585 
## F-statistic: 47.37 on 3 and 396 DF,  p-value: < 2.2e-16
# m2's predictors plus first_three, stored as m3.
m3 <- lm(
  human_rating_mean ~ mahalanobis.y + avg_hausdorff.x + euclidean + first_three,
  data = log_tidy_df
)
summary(m3)
## 
## Call:
## lm(formula = human_rating_mean ~ mahalanobis.y + avg_hausdorff.x + 
##     euclidean + first_three, data = log_tidy_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7255 -0.8031  0.0233  0.8688  2.6868 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.424287   0.864692   2.804   0.0053 ** 
## mahalanobis.y    6.960192   1.066108   6.529 2.04e-10 ***
## avg_hausdorff.x  0.977410   0.139327   7.015 1.00e-11 ***
## euclidean       -0.019468   0.004418  -4.406 1.36e-05 ***
## first_three      0.002817   0.001678   1.679   0.0939 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.118 on 395 degrees of freedom
## Multiple R-squared:  0.2693, Adjusted R-squared:  0.2619 
## F-statistic:  36.4 on 4 and 395 DF,  p-value: < 2.2e-16
# m2's predictors plus three_longest, stored as m4.
m4 <- lm(
  human_rating_mean ~ mahalanobis.y + avg_hausdorff.x + euclidean + three_longest,
  data = log_tidy_df
)
summary(m4)
## 
## Call:
## lm(formula = human_rating_mean ~ mahalanobis.y + avg_hausdorff.x + 
##     euclidean + three_longest, data = log_tidy_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.70471 -0.80269  0.03677  0.86377  2.72431 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.351767   0.865930   2.716   0.0069 ** 
## mahalanobis.y    6.764822   1.078969   6.270 9.49e-10 ***
## avg_hausdorff.x  0.969963   0.138736   6.991 1.17e-11 ***
## euclidean       -0.018107   0.004396  -4.119 4.64e-05 ***
## three_longest    0.002795   0.001508   1.853   0.0646 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.117 on 395 degrees of freedom
## Multiple R-squared:  0.2704, Adjusted R-squared:  0.2631 
## F-statistic: 36.61 on 4 and 395 DF,  p-value: < 2.2e-16
# Attach each model's predictions to the data as columns m2, m3 and m4.
# The m2 column has to come from model m2: it was previously filled from m3,
# which made the m2 and m3 correlations computed from this data identical.
# Any output recorded before this fix needs to be regenerated.
data_with_predictions <- log_tidy_df %>%
  add_predictions(m2, var = "m2") %>%
  add_predictions(m3, var = "m3") %>%
  add_predictions(m4, var = "m4")

# Pearson correlation between the m2 prediction column and the mean human
# rating.
# NOTE(review): the result recorded below is identical to the m3 result;
# confirm the m2 column was built from model m2 and re-run if not.
cor.test(data_with_predictions$m2, data_with_predictions$human_rating_mean)
## 
##  Pearson's product-moment correlation
## 
## data:  data_with_predictions$m2 and data_with_predictions$human_rating_mean
## t = 12.112, df = 398, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4434759 0.5871366
## sample estimates:
##       cor 
## 0.5189614
# Pearson correlation between the m3 prediction column and the mean human
# rating.
cor.test(data_with_predictions$m3, data_with_predictions$human_rating_mean)
## 
##  Pearson's product-moment correlation
## 
## data:  data_with_predictions$m3 and data_with_predictions$human_rating_mean
## t = 12.112, df = 398, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4434759 0.5871366
## sample estimates:
##       cor 
## 0.5189614
# Pearson correlation between the m4 prediction column and the mean human
# rating.
cor.test(data_with_predictions$m4, data_with_predictions$human_rating_mean)
## 
##  Pearson's product-moment correlation
## 
## data:  data_with_predictions$m4 and data_with_predictions$human_rating_mean
## t = 12.147, df = 398, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4446702 0.5881105
## sample estimates:
##       cor 
## 0.5200475
# Observed mean rating against m4's predictions. Colour is mapped only in the
# point layer, so the linear fit is drawn once over all points rather than
# per category.
data_with_predictions %>%
  ggplot(mapping = aes(x = m4, y = human_rating_mean)) +
  geom_point(mapping = aes(colour = category)) +
  geom_smooth(method = "lm") +
  theme_classic()