#Read in the data prior to this
# Tag each source table with its group label before stacking them:
# 1 = enshittified, 0 = ethical.
enshittified <- mutate(enshittified, enshittified = 1)
ethical <- mutate(ethical, enshittified = 0)

# Stack both groups, normalise the column names with clean_names(), and keep
# only rows that have at least one of `company` / `variablename` filled in.
raw_df <- bind_rows(enshittified, ethical) %>%
  clean_names() %>%
  filter(!is.na(company) | !is.na(variablename))

# Reshape to one row per company / variable / year.
#   * year columns are the ones named "x" followed by exactly four digits;
#     the "x" prefix is stripped so `year` can be stored as an integer
#   * the leftover `x20` column is dropped
#   * rows with a missing value are removed
#   * thousands separators (",") and percent signs ("%") are stripped and
#     empty strings become NA before the value is converted to numeric
long_df <- raw_df %>%
  pivot_longer(
    cols = matches("^x\\d{4}$"),
    names_to = "year",
    values_to = "value"
  ) %>%
  mutate(year = as.integer(str_remove(year, "^x"))) %>%
  select(-x20) %>%
  filter(!is.na(value)) %>%
  mutate(
    value = str_remove_all(as.character(value), ","),
    value = str_remove_all(value, "%"),
    value = as.numeric(na_if(value, ""))
  )

# Sanity check: list every company / year / variable combination that occurs
# more than once. An empty result means each combination is unique.
filter(count(long_df, company, year, variablename), n > 1)
## # A tibble: 0 × 4
## # ℹ 4 variables: company <chr>, year <int>, variablename <chr>, n <int>
# Spread the long table back out to one row per company-year, with one column
# per distinct `variablename`.
panel_df <- pivot_wider(
  long_df,
  id_cols = c(company, year, enshittified),
  names_from = variablename,
  values_from = value
)

# Replace the column names with short analysis-friendly ones.
# NOTE(review): the names are assigned purely by position, so they are only
# correct while pivot_wider() produces the variables in exactly this order --
# confirm against the source data if the input files change.
names(panel_df) <- c(
  "company", "year", "enshit",
  "RnD_mil", "market_mil", "total_oper_mil",
  "share_repurch_mil", "cash_mil", "total_rev_mil",
  "total_inc", "goodwill_mil", "total_asset_mil",
  "RnD_pct_oper", "market_pct_oper", "share_repurch_pct_cash",
  "goodwill_pct_asset", "total_expense"
)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
# RnD_pct_oper over time: one faint line per company, coloured by `enshit`,
# plus one smoothed trend per `enshit` group.
# `group = company` is set on geom_line() only. Putting it in the global aes()
# would make geom_smooth() fit a separate curve for every company, which just
# retraces the individual lines instead of showing the group-level trend.
ggplot(panel_df, aes(x = year, y = RnD_pct_oper, color = factor(enshit))) +
  geom_line(aes(group = company), alpha = 0.4) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# market_pct_oper over time: one faint line per company, coloured by `enshit`,
# plus one smoothed trend per `enshit` group.
# `group = company` is set on geom_line() only. Putting it in the global aes()
# would make geom_smooth() fit a separate curve for every company, which just
# retraces the individual lines instead of showing the group-level trend.
ggplot(panel_df, aes(x = year, y = market_pct_oper, color = factor(enshit))) +
  geom_line(aes(group = company), alpha = 0.4) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# share_repurch_pct_cash over time: one faint line per company, coloured by
# `enshit`, plus one smoothed trend per `enshit` group.
# `group = company` is set on geom_line() only. Putting it in the global aes()
# would make geom_smooth() fit a separate curve for every company, which just
# retraces the individual lines instead of showing the group-level trend.
ggplot(
  panel_df,
  aes(x = year, y = share_repurch_pct_cash, color = factor(enshit))
) +
  geom_line(aes(group = company), alpha = 0.4) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# goodwill_pct_asset over time: one faint line per company, coloured by
# `enshit`, plus one smoothed trend per `enshit` group.
# `group = company` is set on geom_line() only. Putting it in the global aes()
# would make geom_smooth() fit a separate curve for every company, which just
# retraces the individual lines instead of showing the group-level trend.
ggplot(
  panel_df,
  aes(x = year, y = goodwill_pct_asset, color = factor(enshit))
) +
  geom_line(aes(group = company), alpha = 0.4) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# total_expense over time: one faint line per company, coloured by `enshit`,
# plus one smoothed trend per `enshit` group.
# `group = company` is set on geom_line() only. Putting it in the global aes()
# would make geom_smooth() fit a separate curve for every company, which just
# retraces the individual lines instead of showing the group-level trend.
ggplot(panel_df, aes(x = year, y = total_expense, color = factor(enshit))) +
  geom_line(aes(group = company), alpha = 0.4) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# total_inc over time: one faint line per company, coloured by `enshit`,
# plus one smoothed trend per `enshit` group.
# `group = company` is set on geom_line() only. Putting it in the global aes()
# would make geom_smooth() fit a separate curve for every company, which just
# retraces the individual lines instead of showing the group-level trend.
ggplot(panel_df, aes(x = year, y = total_inc, color = factor(enshit))) +
  geom_line(aes(group = company), alpha = 0.4) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
We’re starting with individual (single-predictor) regressions, then moving on to a model with everything (yay).
For reference, the columns of the panel are: “company”, “year”, “enshit”, “RnD_mil”, “market_mil”, “total_oper_mil”, “share_repurch_mil”, “cash_mil”, “total_rev_mil”, “total_inc”, “goodwill_mil”, “total_asset_mil”, “RnD_pct_oper”, “market_pct_oper”, “share_repurch_pct_cash”, “goodwill_pct_asset”, “total_expense”
# Single-predictor logit: does RnD_pct_oper on its own predict the `enshit`
# label?
# NOTE(review): panel_df is built with one row per company-year, and glm()
# treats every row as an independent observation -- confirm that is intended.
RnD_simple <- glm(enshit ~ RnD_pct_oper, family = binomial(), data = panel_df)
summary(RnD_simple)
##
## Call:
## glm(formula = enshit ~ RnD_pct_oper, family = binomial(), data = panel_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.7384 0.3166 2.332 0.01971 *
## RnD_pct_oper -2.8020 1.0699 -2.619 0.00882 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 277.26 on 199 degrees of freedom
## Residual deviance: 270.01 on 198 degrees of freedom
## AIC: 274.01
##
## Number of Fisher Scoring iterations: 4
# Single-predictor logit: does market_pct_oper on its own predict the
# `enshit` label?
# NOTE(review): panel_df is built with one row per company-year, and glm()
# treats every row as an independent observation -- confirm that is intended.
market_simple <- glm(
  enshit ~ market_pct_oper, family = binomial(), data = panel_df
)
summary(market_simple)
##
## Call:
## glm(formula = enshit ~ market_pct_oper, family = binomial(),
## data = panel_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.3018 0.3566 3.650 0.000262 ***
## market_pct_oper -3.3458 0.8358 -4.003 6.25e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 277.26 on 199 degrees of freedom
## Residual deviance: 259.53 on 198 degrees of freedom
## AIC: 263.53
##
## Number of Fisher Scoring iterations: 4
# Single-predictor logit: does share_repurch_pct_cash on its own predict the
# `enshit` label?
# NOTE(review): panel_df is built with one row per company-year, and glm()
# treats every row as an independent observation -- confirm that is intended.
share_simple <- glm(
  enshit ~ share_repurch_pct_cash, family = binomial(), data = panel_df
)
summary(share_simple)
##
## Call:
## glm(formula = enshit ~ share_repurch_pct_cash, family = binomial(),
## data = panel_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.03967 0.15961 -0.249 0.804
## share_repurch_pct_cash 0.05031 0.09511 0.529 0.597
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 277.26 on 199 degrees of freedom
## Residual deviance: 276.96 on 198 degrees of freedom
## AIC: 280.96
##
## Number of Fisher Scoring iterations: 4
# Single-predictor logit: does goodwill_pct_asset on its own predict the
# `enshit` label?
# NOTE(review): panel_df is built with one row per company-year, and glm()
# treats every row as an independent observation -- confirm that is intended.
goodwill_simple <- glm(
  enshit ~ goodwill_pct_asset, family = binomial(), data = panel_df
)
summary(goodwill_simple)
##
## Call:
## glm(formula = enshit ~ goodwill_pct_asset, family = binomial(),
## data = panel_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1383 0.1959 -0.706 0.48
## goodwill_pct_asset 0.7492 0.7384 1.015 0.31
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 277.26 on 199 degrees of freedom
## Residual deviance: 276.19 on 198 degrees of freedom
## AIC: 280.19
##
## Number of Fisher Scoring iterations: 4
# Single-predictor logit: does total_inc on its own predict the `enshit`
# label?
# The fit emits the warning recorded below (some fitted probabilities are
# numerically 0 or 1).
# NOTE(review): presumably a few very large total_inc values nearly separate
# the two groups -- verify before relying on this coefficient.
inc_simple <- glm(enshit ~ total_inc, family = binomial(), data = panel_df)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(inc_simple)
##
## Call:
## glm(formula = enshit ~ total_inc, family = binomial(), data = panel_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.2586289 0.2274332 -5.534 3.13e-08 ***
## total_inc 0.0015246 0.0003549 4.296 1.74e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 277.26 on 199 degrees of freedom
## Residual deviance: 165.91 on 198 degrees of freedom
## AIC: 169.91
##
## Number of Fisher Scoring iterations: 10
The above are not good regressions, but hey, who cares. They look neat. Now it is time for a regression with some actual purpose.
This model is what indicates whether a firm is enshittified. The problem with this one is that we don’t include year, but this will be fixed in our next model.
# Pooled logit of the `enshit` label on all five financial measures at once.
# Year is not included in this specification.
# The fit emits the warning recorded below (some fitted probabilities are
# numerically 0 or 1).
base_model <- glm(
  enshit ~ total_inc + goodwill_pct_asset + share_repurch_pct_cash +
    market_pct_oper + RnD_pct_oper,
  family = binomial(),
  data = panel_df
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(base_model)
##
## Call:
## glm(formula = enshit ~ total_inc + goodwill_pct_asset + share_repurch_pct_cash +
## market_pct_oper + RnD_pct_oper, family = binomial(), data = panel_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.8525478 0.7929694 2.336 0.01948 *
## total_inc 0.0016798 0.0004314 3.894 9.87e-05 ***
## goodwill_pct_asset -2.4026330 1.9245978 -1.248 0.21189
## share_repurch_pct_cash -0.3089679 0.3139155 -0.984 0.32500
## market_pct_oper -3.9634202 1.3423220 -2.953 0.00315 **
## RnD_pct_oper -3.7693551 1.6768015 -2.248 0.02458 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 277.26 on 199 degrees of freedom
## Residual deviance: 143.53 on 194 degrees of freedom
## AIC: 155.53
##
## Number of Fisher Scoring iterations: 11
As we might expect, income, marketing, and R&D are good indicators of whether a company is enshittified. The problem is that we haven’t really considered fixed effects (year and company type) yet. We fix that with the next model.
library(fixest)
## Warning: package 'fixest' was built under R version 4.5.3
# Linear probability model: OLS of the `enshit` dummy on the five financial
# measures, with year fixed effects (the term after `|`) absorbed by feols().
# NOTE(review): the summary reports IID standard errors, while panel_df holds
# repeated rows per company -- consider clustering (e.g. vcov = ~company).
year_fe_model <- feols(
  enshit ~ total_inc + goodwill_pct_asset + share_repurch_pct_cash +
    market_pct_oper + RnD_pct_oper | year,
  data = panel_df
)
summary(year_fe_model)
## OLS estimation, Dep. Var.: enshit
## Observations: 200
## Fixed-effects: year: 17
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## total_inc 0.000012 0.00000208 5.60342 7.8737e-08 ***
## goodwill_pct_asset 0.302855 0.18845680 1.60703 1.0982e-01
## share_repurch_pct_cash -0.037439 0.02302020 -1.62636 1.0564e-01
## market_pct_oper -0.559729 0.18633235 -3.00393 3.0493e-03 **
## RnD_pct_oper -0.408171 0.27695304 -1.47379 1.4230e-01
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.432489 Adj. R2: 0.163542
## Within R2: 0.236729
Now when we consider year fixed effects it’s just income and marketing that are significant. This makes some sense, since bigger companies have more income and are more likely to be enshittified. What’s happening with marketing is likely that upstart companies are trying to get the word out for their project, while bigger companies have significantly larger operating budgets and thus a smaller marketing percentage.
Since our dependent variable is a dummy, a logistic regression is the more natural fit than OLS and can help uncover some variation that the linear model hides.
# Logit counterpart of the year fixed-effects model: the same five financial
# measures, with year fixed effects (the term after `|`) absorbed by feglm().
# NOTE(review): the summary reports IID standard errors, while panel_df holds
# repeated rows per company -- consider clustering (e.g. vcov = ~company).
logit_model <- feglm(
  enshit ~ total_inc + goodwill_pct_asset + market_pct_oper +
    RnD_pct_oper + share_repurch_pct_cash | year,
  data = panel_df,
  family = binomial()
)
summary(logit_model)
## GLM estimation, family = binomial, Dep. Var.: enshit
## Observations: 200
## Fixed-effects: year: 17
## Standard-errors: IID
## Estimate Std. Error z value Pr(>|z|)
## total_inc 0.001783 0.000490 3.641346 0.00027122 ***
## goodwill_pct_asset -1.346121 2.236498 -0.601888 0.54724864
## market_pct_oper -4.607807 1.581175 -2.914166 0.00356640 **
## RnD_pct_oper -4.075340 1.888760 -2.157680 0.03095275 *
## share_repurch_pct_cash -0.456208 0.399679 -1.141434 0.25368923
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -68.6 Adj. Pseudo R2: 0.35364
## BIC: 253.8 Squared Cor.: 0.555866
# Save the wide company-year panel to "enshit_data.csv" (the path is relative
# to the current working directory).
panel_df |> write_csv("enshit_data.csv")
Same as earlier, but RnD is a bit back on the menu!
Multicollinearity Check
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.3
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
# Variance inflation factors for the five predictors. The lm() fit is only a
# vehicle for vif(): VIFs depend on how the predictors relate to each other,
# not on the response.
lm(
  enshit ~ total_inc + goodwill_pct_asset + share_repurch_pct_cash +
    market_pct_oper + RnD_pct_oper,
  data = panel_df
) |>
  vif()
## total_inc goodwill_pct_asset share_repurch_pct_cash
## 1.124555 1.309900 1.200085
## market_pct_oper RnD_pct_oper
## 1.126795 1.280706
Almost none. Cool! Don’t have to worry about stuff then.
We won’t be conditioning on company, since our dependent variable is determined by company: we assigned each company as enshittified or not. Conditioning on it would be like planting some carrots with your tomatoes, deciding to harvest all of your tomatoes, and wondering why you only have carrots left.
What we will be doing is PCA (principal component analysis). This is a more advanced technique: it finds the principal components of our variables, i.e. the few combinations of them that capture most of the variation. Advanced stuff, but it could be helpful.
# Inputs for the PCA: the five financial measures, keeping complete rows only
# (na.omit() drops any row with a missing value).
pca_vars <- na.omit(
  select(
    panel_df,
    RnD_pct_oper, market_pct_oper, share_repurch_pct_cash,
    goodwill_pct_asset, total_inc
  )
)

# Principal component analysis on unit-variance-scaled variables.
# NOTE(review): PCA() is presumably FactoMineR::PCA -- no library() call for
# it is visible in this section; confirm it is loaded earlier.
pca <- PCA(pca_vars, scale.unit = TRUE)
The first graph tells us there are a couple of outlier firms, but most
behave roughly the same. No tremendous isolation. Our variable graph
indicates that total income and share repurchase are strongly
correlated, which makes sense. These are negatively correlated with
marketing.
Here’s some k-means clustering. This finds clusters in the data, i.e. it tries to form groups of observations that are similar to each other.
# Partition the standardised PCA inputs into two clusters.
# k-means starts from randomly chosen centres, so the seed is fixed and
# several random starts are tried (`nstart`); this makes the printed result
# reproducible from run to run. The fit is stored so the cluster assignments
# can be reused later. Cluster labels (1 / 2) are arbitrary.
set.seed(42)
km_fit <- kmeans(scale(pca_vars), centers = 2, nstart = 25)
km_fit
## K-means clustering with 2 clusters of sizes 53, 147
##
## Cluster means:
## RnD_pct_oper market_pct_oper share_repurch_pct_cash goodwill_pct_asset
## 1 -1.2480687 -1.1098681 0.4671277 0.4533630
## 2 0.4499839 0.4001565 -0.1684202 -0.1634574
## total_inc
## 1 0.3318934
## 2 -0.1196622
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2
## [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2
## [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [149] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 413.7298 341.6249
## (between_SS / total_SS = 24.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Now this is a whole hell of a lot more interesting. Our two clusters are super different. One is far smaller than the other, but that’s fine. Our bigger cluster has higher RnD and marketing percent spending, but lower share repurchases, goodwill, and income. The reverse is true for our smaller cluster. These clusters only explain 24% of the variation though, which tracks. Tons of other stuff goes into these variables. 24% is actually a fairly promising result.
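Using this data, we’ll construct an index (yay).
Our options are to build it from the regression results or from the clustering results. As a first pass, we standardize each variable and add them up with equal weights.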
# Build an equal-weight index from z-scores of the five financial measures.
#
# Step 1: standardise each measure (mean 0, sd 1) into a new `z_<column>`
# column. scale() returns a one-column matrix, so the result is flattened with
# as.numeric(); otherwise the new columns (and the index built from them)
# would be matrix columns rather than plain numeric vectors.
#
# Step 2: sum the z-scores. Share repurchases, goodwill and income enter
# positively; R&D and marketing enter negatively. A missing value in any
# component makes the index NA for that row.
panel_df <- panel_df %>%
  mutate(
    across(
      c(
        share_repurch_pct_cash,
        goodwill_pct_asset,
        total_inc,
        RnD_pct_oper,
        market_pct_oper
      ),
      \(x) as.numeric(scale(x)),
      .names = "z_{.col}"
    )
  ) %>%
  mutate(
    enshit_index =
      z_share_repurch_pct_cash +
      z_goodwill_pct_asset +
      z_total_inc -
      z_RnD_pct_oper -
      z_market_pct_oper
  )
To do: (1) Figure out clustering of companies and years (this might uncover actual delineations for enshittified companies; it would be nice to graph by company by year). (2) Build an index from our logistic regression results (basically just ignore goodwill and share repurchase). (3) Build an index from our clustering results (this I’m far more keen on, but both are good).