I download the data files for the workshop:
# To avoid scientific notation:
options(scipen=999)
#Download the csv files and import them into your R environment:
download.file("https://www.apradie.com/datos/dataus2024.csv","dataus2024.csv")
trying URL 'https://www.apradie.com/datos/dataus2024.csv'
Content type 'text/csv' length 48648943 bytes (46.4 MB)
==================================================
downloaded 46.4 MB
uspanel = read.csv("dataus2024.csv")
download.file("https://www.apradie.com/datos/firmsus2024.csv","firmsus2024.csv")
trying URL 'https://www.apradie.com/datos/firmsus2024.csv'
Content type 'text/csv' length 747426 bytes (729 KB)
==================================================
downloaded 729 KB
usfirms <- read.csv("firmsus2024.csv")
install.packages("dplyr")
Error in install.packages : Updating loaded packages
library(dplyr)
#change the name of the column "empresa" to "firm"
#merge the industry from the firmus2024 dataset into the uspanel
usfirms = usfirms %>% rename(firm=empresa)
newdataset= merge(uspanel,usfirms,by="firm")
#Filter uspanel to get only 2023 Fiscal-END quearters
byyear=newdataset %>% filter(yearf==2023,fiscalmonth==12) %>%
group_by(naics1)%>% summarize(totalrevenue=sum(revenue, na.rm = TRUE))
uspanel_2023 <- uspanel %>% filter(year == 2023)
# Check column names in both data frames
colnames(uspanel)
[1] "firm" "q" "fiscalmonth"
[4] "revenue" "cogs" "sgae"
[7] "otherincome" "extraordinaryitems" "finexp"
[10] "incometax" "totalassets" "currentassets"
[13] "inventory" "totalliabilities" "currentliabilities"
[16] "longdebt" "adjprice" "originalprice"
[19] "sharesoutstanding" "year" "fixedassets"
[22] "yearf" "cto" "fiscalq"
colnames(usfirms)
[1] "firm" "Nombre" "status"
[4] "partind" "naics1" "naics2"
[7] "SectorEconomatica"
# Merge using the common column "firm"
merged_data <- merge(uspanel_2023, usfirms, by = "firm")
# Group by the appropriate industry column (e.g., "partind") and calculate the median
median_assets_by_industry <- merged_data %>%
group_by(partind) %>%
summarize(median_total_assets = median(totalassets, na.rm = TRUE))
# View the results
print(median_assets_by_industry)
NA
#CHALLENGE 1_NADIM
uspanel <- uspanel %>%
group_by(firm, yearf) %>%
arrange(firm, q) %>%
mutate(
revenue_quarterly = revenue - lag(revenue, default = 0),
EBIT_quarterly = revenue_quarterly - cogs - sgae)
install.packages("dplyr")
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-x86_64/contrib/4.4/dplyr_1.1.4.tgz'
Content type 'application/x-gzip' length 1604793 bytes (1.5 MB)
==================================================
downloaded 1.5 MB
The downloaded binary packages are in
/var/folders/vm/ylgq08j14tj3m4t5vkftm3300000gn/T//RtmpwDLeW4/downloaded_packages
No sabía si quiere el ebit así normal o quarterly así que hice los dos.
# Calculate EBIT and Market Value
uspanel <- uspanel %>%
mutate(ebit = revenue - cogs - sgae, # Calculate EBIT
market_value = adjprice * sharesoutstanding) # Calculate Market Value
# Display EBIT_quarterly and revenue_quarterly
uspanel %>%
select(firm, q, yearf, revenue_quarterly, EBIT_quarterly) %>%
head()
NA
CHALLENGE 2_NADIM
# Summarize the total market value, total revenue, and total net income by year
summary_table <- uspanel %>%
group_by(year) %>%
summarize(
total_market_value = sum(market_value, na.rm = TRUE),
total_revenue = sum(revenue, na.rm = TRUE),
total_net_income = sum(ebit - finexp - incometax, na.rm = TRUE) # Calculate Net Income
)
# Display the summary table
print(summary_table)
NA
NA
#Usaría gráfica de líneas para ver como se mueve las tres variables a través del tiempo y cuál es la tendencia a futuro. También usaría Gráfica de Barras si quisiera hacer una comparación de año con año de cada variable a través del tiempo para ver como se comporta el mercado y como cada segmento aumenta o disminuye cada año.
CHALLENGE 3
library("dplyr")
uspanel <- uspanel %>%
group_by(firm, yearf) %>%
arrange(firm, q) %>%
mutate(
revenue_quarterly = revenue - lag(revenue, default = 0),
EBIT_quarterly = revenue_quarterly - cogs - sgae
) %>%
ungroup()
# Crear oepsp usando el pipe %>%
uspanel <- uspanel %>%
mutate(oepsp = (EBIT_quarterly / sharesoutstanding) / adjprice)
library("statar")
uspanel <- uspanel %>%
mutate(oepspw = winsorize(oepsp, probs = c(0.01, 0.99)))
0.37 % observations replaced at the bottom
0.37 % observations replaced at the top
uspanel <- uspanel %>%
group_by(firm) %>%
mutate(
log_return = log(1 + revenue_quarterly), # Asumimos que revenue_quarterly se puede usar como retorno trimestral
annual_return = sum(log_return, na.rm = TRUE) # Sumamos los retornos logarítmicos para obtener el retorno anual
) %>%
ungroup()
Warning: There were 1134 warnings in `mutate()`.
The first warning was:
ℹ In argument: `log_return = log(1 + revenue_quarterly)`.
ℹ In group 5: `firm = "AAIC_old"`.
Caused by warning in `log()`:
! NaNs produced
ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 1133 remaining warnings.
# Crear la variable binaria
uspanel <- uspanel %>%
mutate(beats_median_return = ifelse(annual_return > median(annual_return, na.rm = TRUE), 1, 0))
# Ejecutar el modelo Logit
logit_model <- glm(beats_median_return ~ oepspw, data = uspanel, family = binomial)
# Resumen del modelo
summary(logit_model)
Call:
glm(formula = beats_median_return ~ oepspw, family = binomial,
data = uspanel)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.876814 0.005504 159.31 <0.0000000000000002 ***
oepspw -0.254470 0.006343 -40.12 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 242297 on 207312 degrees of freedom
Residual deviance: 240199 on 207311 degrees of freedom
(349868 observations deleted due to missingness)
AIC: 240203
Number of Fisher Scoring iterations: 4
#El modelo Logit que corrimos sirve para predecir la probabilidad de que una empresa tenga un rendimiento anual superior a la mediana de su industria en función de la variable oepspw (la versión winsorizada de las ganancias operacionales por acción sobre el precio).
#Intercepto: El valor de 0.8768 indica la probabilidad base de superar la mediana de la industria cuando oepspw es 0. Es estadísticamente significativo, lo que quiere decir que es relevante para el modelo.
#oepspw: El coeficiente de -0.2545 significa que, a medida que oepspw aumenta, la probabilidad de que una empresa supere el rendimiento de la mediana de la industria disminuye.
#Significancia: Ambos valores p son increíblemente bajos (mucho menores que 0.05), lo que muestra que estas relaciones no son aleatorias y tienen peso estadístico.
#Ajuste del modelo: La diferencia entre la deviance nula y la residual sugiere que nuestro modelo con oepspw es mejor que uno que no tiene ninguna variable explicativa. El AIC de 240203 nos da una idea del ajuste, aunque para eso es mejor compararlo con otros modelos.
#REFERENCIAS APA: #WORKSHOP1 y ayuda para corregir el código con Perplexity.