# The two input tables (`enshittified` and `ethical`) must already be read in
# before this point. Tag each one with a 0/1 class label.
enshittified <- mutate(enshittified, enshittified = 1)
ethical <- mutate(ethical, enshittified = 0)

# Stack both groups, run the column names through clean_names(), and keep
# every row that has a company or a variable name (rows missing both go).
raw_df <- bind_rows(enshittified, ethical) %>%
  clean_names() %>%
  filter(!is.na(company) | !is.na(variablename))

# Reshape the wide year columns (named like `x2010`) into long format: one row
# per input row and year, with the reported figure parsed to a number.
long_df <- raw_df %>%
  # `x20` does not match the four-digit year pattern, so it is not a year
  # column. any_of() drops it when present and is a no-op when it is absent.
  select(-any_of("x20")) %>%
  pivot_longer(
    cols = matches("^x\\d{4}$"),
    names_to = "year",
    # Strip the leading "x" and parse the remainder as an integer year.
    names_prefix = "x",
    names_transform = list(year = as.integer),
    values_to = "value",
    # Convert every year column to character before stacking so that columns
    # of different types (numeric vs. text) can be combined without an error.
    values_transform = list(value = as.character)
  ) %>%
  # Drop cells that were empty in the input.
  filter(!is.na(value)) %>%
  mutate(
    # Remove thousands separators and percent signs before parsing.
    value = str_remove_all(value, "[,%]"),
    value = na_if(value, ""),
    value = as.numeric(value)
  )

# Sanity check: list every company / year / variable combination that occurs
# more than once. An empty result means each combination is unique.
long_df %>%
  group_by(company, year, variablename) %>%
  summarise(n = n(), .groups = "drop") %>%
  filter(n > 1)

## # A tibble: 0 × 4
## # ℹ 4 variables: company <chr>, year <int>, variablename <chr>, n <int>

# Spread the long table back out (one row per company-year, one column per
# variable) and swap in short column names.
# NOTE(review): the new names are assigned by position, so they rely on the
# variables appearing in exactly this order in the input -- confirm.
panel_df <- long_df %>%
  pivot_wider(
    id_cols = c(company, year, enshittified),
    names_from = variablename,
    values_from = value
  ) %>%
  setNames(c(
    "company", "year", "enshit",
    "RnD_mil", "market_mil", "total_oper_mil",
    "share_repurch_mil", "cash_mil", "total_rev_mil",
    "total_inc", "goodwill_mil", "total_asset_mil",
    "RnD_pct_oper", "market_pct_oper", "share_repurch_pct_cash",
    "goodwill_pct_asset", "total_expense"
  ))

Quick Visualizations

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.5.3

# RnD_pct_oper by year: one faint line per company, coloured by the enshit
# flag, plus one smoothed trend per flag value.
ggplot(panel_df,
       aes(x = year, y = RnD_pct_oper,
           color = factor(enshit))) +
  # Group by company for the raw lines only. With the grouping in the global
  # aes(), geom_smooth() would fit a separate curve for every company.
  geom_line(aes(group = company), alpha = 0.4) +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# market_pct_oper by year: one faint line per company, coloured by the enshit
# flag, plus one smoothed trend per flag value.
ggplot(panel_df,
       aes(x = year, y = market_pct_oper,
           color = factor(enshit))) +
  # Group by company for the raw lines only. With the grouping in the global
  # aes(), geom_smooth() would fit a separate curve for every company.
  geom_line(aes(group = company), alpha = 0.4) +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# share_repurch_pct_cash by year: one faint line per company, coloured by the
# enshit flag, plus one smoothed trend per flag value.
ggplot(panel_df,
       aes(x = year, y = share_repurch_pct_cash,
           color = factor(enshit))) +
  # Group by company for the raw lines only. With the grouping in the global
  # aes(), geom_smooth() would fit a separate curve for every company.
  geom_line(aes(group = company), alpha = 0.4) +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# goodwill_pct_asset by year: one faint line per company, coloured by the
# enshit flag, plus one smoothed trend per flag value.
ggplot(panel_df,
       aes(x = year, y = goodwill_pct_asset,
           color = factor(enshit))) +
  # Group by company for the raw lines only. With the grouping in the global
  # aes(), geom_smooth() would fit a separate curve for every company.
  geom_line(aes(group = company), alpha = 0.4) +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# total_expense by year: one faint line per company, coloured by the enshit
# flag, plus one smoothed trend per flag value.
ggplot(panel_df,
       aes(x = year, y = total_expense,
           color = factor(enshit))) +
  # Group by company for the raw lines only. With the grouping in the global
  # aes(), geom_smooth() would fit a separate curve for every company.
  geom_line(aes(group = company), alpha = 0.4) +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# total_inc by year: one faint line per company, coloured by the enshit flag,
# plus one smoothed trend per flag value.
ggplot(panel_df,
       aes(x = year, y = total_inc,
           color = factor(enshit))) +
  # Group by company for the raw lines only. With the grouping in the global
  # aes(), geom_smooth() would fit a separate curve for every company.
  geom_line(aes(group = company), alpha = 0.4) +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Basic Regressions

We’re starting with individual regressions, then moving on to everything (yay).

“company”, “year”, “enshit”, “RnD_mil”,“market_mil”,“total_oper_mil”, “share_repurch_mil”,“cash_mil”,“total_rev_mil”,“total_inc”,“goodwill_mil”,“total_asset_mil”,“RnD_pct_oper”, “market_pct_oper”,“share_repurch_pct_cash”,“goodwill_pct_asset”,“total_expense”

# Single-predictor logistic regression of the enshit flag on RnD_pct_oper.
RnD_simple <- glm(enshit ~ RnD_pct_oper, family = binomial(), data = panel_df)
summary(RnD_simple)

## 
## Call:
## glm(formula = enshit ~ RnD_pct_oper, family = binomial(), data = panel_df)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)    0.7384     0.3166   2.332  0.01971 * 
## RnD_pct_oper  -2.8020     1.0699  -2.619  0.00882 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 277.26  on 199  degrees of freedom
## Residual deviance: 270.01  on 198  degrees of freedom
## AIC: 274.01
## 
## Number of Fisher Scoring iterations: 4

# Single-predictor logistic regression of the enshit flag on market_pct_oper.
market_simple <- glm(
  enshit ~ market_pct_oper, family = binomial(), data = panel_df
)
summary(market_simple)

## 
## Call:
## glm(formula = enshit ~ market_pct_oper, family = binomial(), 
##     data = panel_df)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       1.3018     0.3566   3.650 0.000262 ***
## market_pct_oper  -3.3458     0.8358  -4.003 6.25e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 277.26  on 199  degrees of freedom
## Residual deviance: 259.53  on 198  degrees of freedom
## AIC: 263.53
## 
## Number of Fisher Scoring iterations: 4

# Single-predictor logistic regression of the enshit flag on
# share_repurch_pct_cash.
share_simple <- glm(
  enshit ~ share_repurch_pct_cash, family = binomial(), data = panel_df
)
summary(share_simple)

## 
## Call:
## glm(formula = enshit ~ share_repurch_pct_cash, family = binomial(), 
##     data = panel_df)
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)
## (Intercept)            -0.03967    0.15961  -0.249    0.804
## share_repurch_pct_cash  0.05031    0.09511   0.529    0.597
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 277.26  on 199  degrees of freedom
## Residual deviance: 276.96  on 198  degrees of freedom
## AIC: 280.96
## 
## Number of Fisher Scoring iterations: 4

# Single-predictor logistic regression of the enshit flag on
# goodwill_pct_asset.
goodwill_simple <- glm(
  enshit ~ goodwill_pct_asset, family = binomial(), data = panel_df
)
summary(goodwill_simple)

## 
## Call:
## glm(formula = enshit ~ goodwill_pct_asset, family = binomial(), 
##     data = panel_df)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)
## (Intercept)         -0.1383     0.1959  -0.706     0.48
## goodwill_pct_asset   0.7492     0.7384   1.015     0.31
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 277.26  on 199  degrees of freedom
## Residual deviance: 276.19  on 198  degrees of freedom
## AIC: 280.19
## 
## Number of Fisher Scoring iterations: 4

# Single-predictor logistic regression of the enshit flag on total_inc.
# NOTE(review): the warning recorded below usually points to (quasi-)complete
# separation of the classes by this predictor -- worth confirming.
inc_simple <- glm(enshit ~ total_inc, family = binomial(), data = panel_df)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(inc_simple)

## 
## Call:
## glm(formula = enshit ~ total_inc, family = binomial(), data = panel_df)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.2586289  0.2274332  -5.534 3.13e-08 ***
## total_inc    0.0015246  0.0003549   4.296 1.74e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 277.26  on 199  degrees of freedom
## Residual deviance: 165.91  on 198  degrees of freedom
## AIC: 169.91
## 
## Number of Fisher Scoring iterations: 10

Combining it all

The above are not good regressions, but hey who cares. They look neat. Now is time for a regression with some actual purpose.

This model is what indicates if a firm is enshittified. The problem with this one is that we don’t include year, but this will be fixed in our next model.

# Pooled logistic regression of the enshit flag on all five predictors at
# once; year is not included in this specification.
base_model <- glm(
  enshit ~ total_inc + goodwill_pct_asset + share_repurch_pct_cash +
    market_pct_oper + RnD_pct_oper,
  family = binomial(),
  data = panel_df
)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(base_model)

## 
## Call:
## glm(formula = enshit ~ total_inc + goodwill_pct_asset + share_repurch_pct_cash + 
##     market_pct_oper + RnD_pct_oper, family = binomial(), data = panel_df)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             1.8525478  0.7929694   2.336  0.01948 *  
## total_inc               0.0016798  0.0004314   3.894 9.87e-05 ***
## goodwill_pct_asset     -2.4026330  1.9245978  -1.248  0.21189    
## share_repurch_pct_cash -0.3089679  0.3139155  -0.984  0.32500    
## market_pct_oper        -3.9634202  1.3423220  -2.953  0.00315 ** 
## RnD_pct_oper           -3.7693551  1.6768015  -2.248  0.02458 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 277.26  on 199  degrees of freedom
## Residual deviance: 143.53  on 194  degrees of freedom
## AIC: 155.53
## 
## Number of Fisher Scoring iterations: 11

As we might expect, income, marketing, and RnD are good indicators of whether a company is enshittified. The problem is that we haven’t really considered fixed effects (year and company type). We fix that with this next model.

library(fixest)

## Warning: package 'fixest' was built under R version 4.5.3

# Linear probability model (OLS on the 0/1 enshit flag) with year fixed
# effects: everything after `|` in the formula is absorbed as a fixed effect.
# NOTE(review): standard errors are left at the default (reported as IID)
# although the data are company-year observations -- consider clustering by
# company; confirm.
year_fe_model <- feols(
  enshit ~ total_inc + goodwill_pct_asset + share_repurch_pct_cash +
    market_pct_oper + RnD_pct_oper | year,
  data = panel_df
)
summary(year_fe_model)

## OLS estimation, Dep. Var.: enshit
## Observations: 200
## Fixed-effects: year: 17
## Standard-errors: IID 
##                         Estimate Std. Error  t value   Pr(>|t|)    
## total_inc               0.000012 0.00000208  5.60342 7.8737e-08 ***
## goodwill_pct_asset      0.302855 0.18845680  1.60703 1.0982e-01    
## share_repurch_pct_cash -0.037439 0.02302020 -1.62636 1.0564e-01    
## market_pct_oper        -0.559729 0.18633235 -3.00393 3.0493e-03 ** 
## RnD_pct_oper           -0.408171 0.27695304 -1.47379 1.4230e-01    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.432489     Adj. R2: 0.163542
##                  Within R2: 0.236729

Now when we consider year fixed effects it’s just income and marketing that are significant. This makes some sense, since bigger companies have more income and are more likely to be enshittified. What’s happening with marketing is likely that upstart companies are trying to get the word out for their project, while bigger companies have significantly larger operating budgets and thus a smaller marketing percentage.

Logistic Fixed Effect Model

Since our dependent variable is a dummy, doing a logistic regression can help uncover some hidden variation.

# Logistic regression of the enshit flag on the five predictors with year
# fixed effects (the term after `|`).
# NOTE(review): standard errors are left at the default (reported as IID)
# although the data are company-year observations -- consider clustering by
# company; confirm.
logit_model <- feglm(
  enshit ~ total_inc + goodwill_pct_asset + market_pct_oper +
    RnD_pct_oper + share_repurch_pct_cash | year,
  data = panel_df,
  family = binomial()
)

summary(logit_model)

## GLM estimation, family = binomial, Dep. Var.: enshit
## Observations: 200
## Fixed-effects: year: 17
## Standard-errors: IID 
##                         Estimate Std. Error   z value   Pr(>|z|)    
## total_inc               0.001783   0.000490  3.641346 0.00027122 ***
## goodwill_pct_asset     -1.346121   2.236498 -0.601888 0.54724864    
## market_pct_oper        -4.607807   1.581175 -2.914166 0.00356640 ** 
## RnD_pct_oper           -4.075340   1.888760 -2.157680 0.03095275 *  
## share_repurch_pct_cash -0.456208   0.399679 -1.141434 0.25368923    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -68.6   Adj. Pseudo R2: 0.35364 
##            BIC: 253.8     Squared Cor.: 0.555866

# Save the cleaned company-year panel to disk for reuse.
panel_df %>%
  write_csv("enshit_data.csv")

Same as earlier, but RnD is a bit back on the menu!

Multicollinearity Check

library(car)

## Warning: package 'car' was built under R version 4.5.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.5.3

## 
## Attaching package: 'car'

## The following object is masked from 'package:purrr':
## 
##     some

## The following object is masked from 'package:dplyr':
## 
##     recode

# Variance inflation factors for the five predictors. The linear fit is only
# a vehicle for vif(); its coefficients are not used.
lm(
  enshit ~ total_inc + goodwill_pct_asset + share_repurch_pct_cash +
    market_pct_oper + RnD_pct_oper,
  data = panel_df
) |>
  vif()

##              total_inc     goodwill_pct_asset share_repurch_pct_cash 
##               1.124555               1.309900               1.200085 
##        market_pct_oper           RnD_pct_oper 
##               1.126795               1.280706

Almost none. Cool! Don’t have to worry about stuff then.

We won’t be conditioning on companies, since our dependent variable is directly correlated with company. We’ve assigned each company as enshittified or not, so conditioning on company would be like planting some carrots with your tomatoes, deciding to harvest all of your tomatoes, and wondering why you only have carrots left.

PCA

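Next we will run a principal component analysis (PCA). This is a more advanced technique: it finds the principal components of a set of variables, i.e. the few combinations of them that capture most of their joint variation. Advanced stuff, but it could be helpful.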

# Keep only the five analysis variables and drop every row that has a missing
# value in any of them.
pca_vars <- na.omit(
  select(
    panel_df,
    RnD_pct_oper,
    market_pct_oper,
    share_repurch_pct_cash,
    goodwill_pct_asset,
    total_inc
  )
)

# Run the PCA with scale.unit = TRUE.
# NOTE(review): PCA() is presumably FactoMineR's, loaded earlier -- confirm.
pca <- pca_vars |> PCA(scale.unit = TRUE)

The first graph tells us there are a couple of outlier firms, but most behave roughly the same. No tremendous isolation. Our variable graph indicates that total income and share repurchase are strongly correlated, which makes sense. These are negatively correlated with marketing.

Here’s some k-means clustering. This finds clusters in the data and groups similar observations together. Basically, it tries to form groups around the data.

# Two-cluster k-means on the standardised analysis variables.
# k-means starts from randomly chosen centres, so fix the RNG seed to make the
# clustering reproducible, and use several random starts (nstart) so the
# reported solution is the best of many rather than a single draw.
set.seed(2026)
km_fit <- kmeans(scale(pca_vars), centers = 2, nstart = 25)
# Print the fit. The object is kept so the cluster labels can be reused.
km_fit

## K-means clustering with 2 clusters of sizes 53, 147
## 
## Cluster means:
##   RnD_pct_oper market_pct_oper share_repurch_pct_cash goodwill_pct_asset
## 1   -1.2480687      -1.1098681              0.4671277          0.4533630
## 2    0.4499839       0.4001565             -0.1684202         -0.1634574
##    total_inc
## 1  0.3318934
## 2 -0.1196622
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2
##  [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2
##  [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [149] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 413.7298 341.6249
##  (between_SS / total_SS =  24.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Now this is a whole hell of a lot more interesting. Our two clusters are super different. One is far smaller than the other, but that’s fine. Our bigger cluster has higher RnD and marketing percent spending, but lower share repurchases, goodwill, and income. The reverse is true for our smaller cluster. These clusters only explain 24% of the variation though, which tracks. Tons of other stuff goes into these variables. 24% is actually a fairly promising result.

Making an index

Using this data, we’ll construct an index (yay)

Our options are using the regression

# Build z-scored copies of the five variables (prefixed `z_`) and combine them
# into a single additive index.
panel_df <- panel_df %>%
  mutate(
    across(
      c(
        share_repurch_pct_cash,
        goodwill_pct_asset,
        total_inc,
        RnD_pct_oper,
        market_pct_oper
      ),
      # scale() returns a one-column matrix carrying centring/scaling
      # attributes. Convert it to a plain numeric vector so the new columns
      # (and the index built from them) are ordinary numeric columns.
      \(x) as.numeric(scale(x)),
      .names = "z_{.col}"
    ),
    # Share repurchases, goodwill and income enter positively; R&D and
    # marketing shares enter negatively.
    enshit_index =
      z_share_repurch_pct_cash +
      z_goodwill_pct_asset +
      z_total_inc -
      z_RnD_pct_oper -
      z_market_pct_oper
  )

To Do

- Figure out clustering by company and year (might’ve uncovered actual delineations for enshittified companies; would be nice to graph by company by year).
- Build an index from our logistic regression results (basically just ignore goodwill and share repurchase).
- Build an index from our clustering results (this I’m far more keen on, but both are good).

Danniel-Enshittified-Project

Trey Slaten

2026-05-21