# Load the semicolon-delimited Forbes export into a data frame, reading text
# columns as factors, and print a per-column summary of the result.
forbes_dataset.5 <- read.csv(
  file = "/cloud/project/forbes_dataset 5.csv",
  sep = ";",
  stringsAsFactors = TRUE
)
summary(object = forbes_dataset.5)
## Rank Name
## Min. : 1.0 Abdulla Al Futtaim & family : 1
## 1st Qu.: 396.8 Abdulla bin Ahmad Al Ghurair & family: 1
## Median : 788.5 Abdullah Amer Al Nahdi : 1
## Mean : 788.1 Abdulsamad Rabiu : 1
## 3rd Qu.:1181.2 Abel Avellan : 1
## Max. :1575.0 Abhay Firodia : 1
## (Other) :1550
## Net.Worth Net.Worth.Billions Change Percentage.Change
## Min. : 2.400 Min. : 2.400 Min. : 0.00 Min. : -9.52
## 1st Qu.: 3.400 1st Qu.: 3.400 1st Qu.: 0.00 1st Qu.: -0.17
## Median : 4.900 Median : 4.900 Median : 5.00 Median : 0.00
## Mean : 9.542 Mean : 9.542 Mean : 40.83 Mean : 1478.10
## 3rd Qu.: 8.300 3rd Qu.: 8.300 3rd Qu.: 38.00 3rd Qu.: 0.05
## Max. :405.600 Max. :405.600 Max. :874.00 Max. :2200000.00
##
## Age Source Country.Territory X
## Min. : 20.00 Real estate : 72 United States:540 Mode:logical
## 1st Qu.: 58.00 Investments : 53 China :175 NA's:1556
## Median : 68.00 Diversified : 52 India :115
## Mean : 67.02 Pharmaceuticals: 52 Germany : 99
## 3rd Qu.: 77.00 Private equity : 41 Russia : 53
## Max. :101.00 Hedge funds : 30 Italy : 40
## (Other) :1256 (Other) :534
## X.1 X.2
## Mode:logical Mode:logical
## NA's:1556 NA's:1556
##
##
##
##
##
library(ggplot2)
# Box plot of net worth by country for the first 20 rows of the data frame.
# head() is used rather than `[1:20, ]` so that a table with fewer than 20
# rows is not padded with all-NA rows.
ggplot(
  head(forbes_dataset.5, 20),
  aes(x = Country.Territory, y = Net.Worth)
) +
  geom_boxplot(fill = "skyblue", color = "red") +
  labs(
    title = "Distribución de Net Worth por País (primeros 20)",
    x = "País",
    y = "Net Worth (Billones USD)"
  )

## Interpretación: pendiente de agregar la interpretación del diagrama de cajas.
# Reload the raw export and coerce the analysis columns to numeric.
forbes_dataset.5 <- read.csv(
  "/cloud/project/forbes_dataset 5.csv",
  sep = ";",
  stringsAsFactors = TRUE
)

# Remove a literal " B" suffix from Net.Worth before converting. gsub()
# operates on the character form of its input, so this works whether the
# column was read as numeric or as a factor; fixed = TRUE makes the pattern
# a plain string instead of a regular expression.
forbes_dataset.5$Net.Worth <- as.numeric(
  gsub(" B", "", forbes_dataset.5$Net.Worth, fixed = TRUE)
)

# Convert through as.character() first: the file is read with
# stringsAsFactors = TRUE, so any non-numeric Age entry would make the column
# a factor, and as.numeric() on a factor returns level codes, not the ages.
forbes_dataset.5$Age <- as.numeric(as.character(forbes_dataset.5$Age))

# Summarise only the three columns used in the analysis that follows.
summary(forbes_dataset.5[, c("Net.Worth", "Age", "Rank")])
## Net.Worth Age Rank
## Min. : 2.400 Min. : 20.00 Min. : 1.0
## 1st Qu.: 3.400 1st Qu.: 58.00 1st Qu.: 396.8
## Median : 4.900 Median : 68.00 Median : 788.5
## Mean : 9.542 Mean : 67.02 Mean : 788.1
## 3rd Qu.: 8.300 3rd Qu.: 77.00 3rd Qu.:1181.2
## Max. :405.600 Max. :101.00 Max. :1575.0
# Pearson product-moment correlation test between net worth and age.
cor.test(
  x = forbes_dataset.5$Net.Worth,
  y = forbes_dataset.5$Age,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: forbes_dataset.5$Net.Worth and forbes_dataset.5$Age
## t = 0.60778, df = 1554, p-value = 0.5434
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03430433 0.06506023
## sample estimates:
## cor
## 0.01541601
library(ggplot2)
# Scatter plot of net worth against age, overlaid with a fitted straight
# line (method = "lm") and its standard-error band.
ggplot(data = forbes_dataset.5, mapping = aes(x = Age, y = Net.Worth)) +
  geom_point(alpha = 0.6, color = "blue") +
  geom_smooth(se = TRUE, color = "red", method = "lm") +
  ggtitle("Relación entre Edad y Patrimonio Neto") +
  xlab("Edad (años)") +
  ylab("Net Worth (Billones USD)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Simple linear regression of net worth on age.
modelo <- lm(Net.Worth ~ Age, data = forbes_dataset.5)

# Report coefficients, standard errors and fit statistics so the fitted
# model is actually inspected rather than only stored.
summary(modelo)
library(tidyverse)
library(knitr)
# Read the same semicolon-delimited export as a tibble via readr, trimming
# whitespace around fields and suppressing the column-type message.
# NOTE(review): the file is decoded as Latin1; if it is actually UTF-8,
# accented text will arrive garbled -- confirm the file's real encoding.
df <- readr::read_delim(
  file = "/cloud/project/forbes_dataset 5.csv",
  delim = ";",
  locale = readr::locale(encoding = "Latin1"),
  trim_ws = TRUE,
  show_col_types = FALSE
)

# Strip leading/trailing whitespace from the column names, then list them.
names(df) <- str_trim(names(df), side = "both")
print(names(df))
## [1] "Rank" "Name" "Net Worth"
## [4] "Net.Worth.Billions" "Change" "Percentage Change"
## [7] "Age" "Source" "Country/Territory"
## [10] "...10" "...11" "...12"
# Give the country and source columns fixed, syntactic names. Columns are
# located by pattern, so the exact spelling in the file header does not
# need to be known in advance.
df <- rename(
  df,
  Country.Territory = matches("Country|country|Territory"),
  Source = matches("Source|source|Wealth|wealth")
)
# Build the 20-row working subset and derive two grouping variables:
#   Sector - coarse industry label inferred from keywords in `Source`
#   Region - continent-level label looked up from `Country.Territory`
# Rows matching no rule fall into "Otros". Finally the country, sector and
# source columns become factors ordered by descending frequency.
df20 <- df %>%
  slice_head(n = 20) %>%
  mutate(
    Sector = case_when(
      # `\\bMeta\\b` keeps the case-insensitive match from also hitting
      # unrelated words that merely contain "meta" (e.g. "metals").
      str_detect(
        Source,
        regex(
          "Microsoft|Oracle|Facebook|\\bMeta\\b|Amazon|Google|Dell|Semiconductors|Tech",
          ignore_case = TRUE
        )
      ) ~ "Tecnología",
      # NOTE(review): the L'Oreal pattern is deliberately loose about the
      # apostrophe and the accented letter, because accented characters may
      # arrive as multi-character sequences if the file's encoding differs
      # from the one assumed when reading it -- confirm against the data.
      str_detect(
        Source,
        regex(
          "Walmart|Zara|L.{0,3}Or.{1,2}al|LVMH|Retail|Luxury|Lujo",
          ignore_case = TRUE
        )
      ) ~ "Retail/Lujo",
      str_detect(
        Source,
        regex(
          "Berkshire|Bloomberg|Finance|brokerage|Bank|Capital|Invest",
          ignore_case = TRUE
        )
      ) ~ "Finanzas/Inversión",
      str_detect(
        Source,
        regex("Telecom|Telecommunications", ignore_case = TRUE)
      ) ~ "Telecom",
      TRUE ~ "Otros"
    ),
    # Region is computed while Country.Territory is still plain text; the
    # factor conversion of that column happens further down.
    Region = case_when(
      Country.Territory %in% c("United States", "Canada", "Mexico") ~
        "América del Norte",
      Country.Territory %in% c(
        "France", "Spain", "Germany", "Italy", "United Kingdom", "UK",
        "Sweden", "Netherlands", "Switzerland"
      ) ~ "Europa",
      Country.Territory %in% c(
        "China", "India", "Japan", "South Korea", "Taiwan", "Hong Kong",
        "Singapore"
      ) ~ "Asia",
      Country.Territory %in% c(
        "Brazil", "Argentina", "Chile", "Colombia", "Peru"
      ) ~ "América Latina",
      TRUE ~ "Otros"
    ),
    Country.Territory = forcats::fct_infreq(Country.Territory),
    Sector = forcats::fct_infreq(Sector),
    Source = forcats::fct_infreq(Source)
  )
# Contingency tables over the 20-row subset: country by wealth source, and
# sector by region.
tab_src_country <- table(df20$Country.Territory, df20$Source)
tab_sector_region <- table(df20$Sector, df20$Region)

# Render the country-by-source table; as.data.frame.matrix() keeps the
# two-way layout (one row per country, one column per source).
kable(
  as.data.frame.matrix(tab_src_country),
  caption = "Tabla cruzada (primeros 20): País × Source"
)
Tabla cruzada (primeros 20): País × Source
| United States |
3 |
2 |
2 |
1 |
1 |
1 |
1 |
0 |
1 |
0 |
0 |
1 |
1 |
0 |
1 |
0 |
| France |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
| India |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| Mexico |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
| Spain |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
# Render the sector-by-region contingency table in its two-way layout.
tab_sector_region %>%
  as.data.frame.matrix() %>%
  kable(caption = "Tabla cruzada (primeros 20): Sector × Región")
Tabla cruzada (primeros 20): Sector × Región

|                    | América del Norte | Asia | Europa |
|:-------------------|------------------:|-----:|-------:|
| Tecnología         |                 9 |    0 |      0 |
| Retail/Lujo        |                 3 |    0 |      2 |
| Otros              |                 1 |    1 |      1 |
| Finanzas/Inversión |                 2 |    0 |      0 |
| Telecom            |                 1 |    0 |      0 |
# 100%-stacked bar chart: for each country in the 20-row subset, the share
# of rows falling in each sector (position = "fill" rescales every bar to 1,
# and the y axis is labelled as percentages).
ggplot(df20, aes(x = Country.Territory, fill = Sector)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(
    title = "Composición (primeros 20): Sector por País",
    x = "País",
    y = "Proporción"
  ) +
  theme_minimal() +
  theme(
    # Slanted tick labels so country names do not overlap.
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom"
  )

# Tally rows per Sector/Region pair, then draw the counts as stacked columns
# with one bar per sector and one fill colour per region.
df_plot2 <- count(df20, Sector, Region)
ggplot(data = df_plot2, mapping = aes(x = Sector, y = n, fill = Region)) +
  geom_col(position = "stack") +
  ggtitle("Distribución (primeros 20): Sector por Región") +
  xlab("Sector") +
  ylab("Frecuencia") +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(hjust = 1, angle = 45)
  )
