# Load the semicolon-delimited Forbes export. Text columns are read as
# factors, so summary() lists per-level counts for them.
forbes_dataset.5 <- read.csv(
  file = "/cloud/project/forbes_dataset 5.csv",
  sep = ";",
  stringsAsFactors = TRUE
)

# Per-column overview of the freshly loaded data.
summary(object = forbes_dataset.5)
##       Rank                                           Name     
##  Min.   :   1.0   Abdulla Al Futtaim & family          :   1  
##  1st Qu.: 396.8   Abdulla bin Ahmad Al Ghurair & family:   1  
##  Median : 788.5   Abdullah Amer Al Nahdi               :   1  
##  Mean   : 788.1   Abdulsamad Rabiu                     :   1  
##  3rd Qu.:1181.2   Abel Avellan                         :   1  
##  Max.   :1575.0   Abhay Firodia                        :   1  
##                   (Other)                              :1550  
##    Net.Worth       Net.Worth.Billions     Change       Percentage.Change   
##  Min.   :  2.400   Min.   :  2.400    Min.   :  0.00   Min.   :     -9.52  
##  1st Qu.:  3.400   1st Qu.:  3.400    1st Qu.:  0.00   1st Qu.:     -0.17  
##  Median :  4.900   Median :  4.900    Median :  5.00   Median :      0.00  
##  Mean   :  9.542   Mean   :  9.542    Mean   : 40.83   Mean   :   1478.10  
##  3rd Qu.:  8.300   3rd Qu.:  8.300    3rd Qu.: 38.00   3rd Qu.:      0.05  
##  Max.   :405.600   Max.   :405.600    Max.   :874.00   Max.   :2200000.00  
##                                                                            
##       Age                     Source         Country.Territory    X          
##  Min.   : 20.00   Real estate    :  72   United States:540     Mode:logical  
##  1st Qu.: 58.00   Investments    :  53   China        :175     NA's:1556     
##  Median : 68.00   Diversified    :  52   India        :115                   
##  Mean   : 67.02   Pharmaceuticals:  52   Germany      : 99                   
##  3rd Qu.: 77.00   Private equity :  41   Russia       : 53                   
##  Max.   :101.00   Hedge funds    :  30   Italy        : 40                   
##                   (Other)        :1256   (Other)      :534                   
##    X.1            X.2         
##  Mode:logical   Mode:logical  
##  NA's:1556      NA's:1556     
##                               
##                               
##                               
##                               
## 
library(ggplot2)

# Box plot of net worth by country for the first 20 rows of the data set.
# head() is used instead of `[1:20, ]` so that a table with fewer than 20
# rows cannot introduce all-NA padding rows.
ggplot(
  head(forbes_dataset.5, 20),
  aes(x = Country.Territory, y = Net.Worth)
) +
  geom_boxplot(fill = "skyblue", color = "red") +
  labs(
    title = "Distribución de Net Worth por País(primeros 20)",
    x = "País",
    # The values are US billions (10^9). Spanish "billones" means 10^12, so
    # the unit is spelled out as "miles de millones".
    y = "Net Worth (miles de millones USD)"
  )

## Interpretación: pendiente — agregar la interpretación del diagrama de cajas.

# Re-read the raw CSV so the cleaning below starts from the unmodified file.
forbes_dataset.5 <- read.csv(
  "/cloud/project/forbes_dataset 5.csv",
  sep = ";",
  stringsAsFactors = TRUE
)

# Strip a literal " B" suffix (if present) and convert to numeric. gsub()
# returns character, so the conversion never sees factor level codes.
forbes_dataset.5$Net.Worth <- as.numeric(
  gsub(" B", "", forbes_dataset.5$Net.Worth, fixed = TRUE)
)

# Go through as.character() first: with stringsAsFactors = TRUE a column that
# contains any non-numeric text is read as a factor, and as.numeric() on a
# factor returns the internal level codes instead of the printed values.
forbes_dataset.5$Age <- as.numeric(as.character(forbes_dataset.5$Age))

# Summary of the Net.Worth, Age and Rank columns.
summary(forbes_dataset.5[, c("Net.Worth", "Age", "Rank")])
##    Net.Worth            Age              Rank       
##  Min.   :  2.400   Min.   : 20.00   Min.   :   1.0  
##  1st Qu.:  3.400   1st Qu.: 58.00   1st Qu.: 396.8  
##  Median :  4.900   Median : 68.00   Median : 788.5  
##  Mean   :  9.542   Mean   : 67.02   Mean   : 788.1  
##  3rd Qu.:  8.300   3rd Qu.: 77.00   3rd Qu.:1181.2  
##  Max.   :405.600   Max.   :101.00   Max.   :1575.0
# Pearson correlation test between net worth and age. The argument
# expressions are kept exactly as written because cor.test() uses them for
# the "data:" line of its printed result.
cor.test(
  x = forbes_dataset.5$Net.Worth,
  y = forbes_dataset.5$Age,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  forbes_dataset.5$Net.Worth and forbes_dataset.5$Age
## t = 0.60778, df = 1554, p-value = 0.5434
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03430433  0.06506023
## sample estimates:
##        cor 
## 0.01541601
library(ggplot2)

# Scatter plot of net worth against age with a fitted linear trend line and
# its confidence band.
ggplot(forbes_dataset.5, aes(x = Age, y = Net.Worth)) +
  geom_point(color = "blue", alpha = 0.6) +
  # The formula is given explicitly so geom_smooth() does not emit its
  # "using formula = 'y ~ x'" message on every render.
  geom_smooth(method = "lm", formula = y ~ x, color = "red", se = TRUE) +
  labs(
    title = "Relación entre Edad y Patrimonio Neto",
    x = "Edad (años)",
    # The values are US billions (10^9). Spanish "billones" means 10^12, so
    # the unit is spelled out as "miles de millones".
    y = "Net Worth (miles de millones USD)"
  ) +
  theme_minimal()

# Fit a simple linear regression of net worth on age and keep the fitted
# model in `modelo`.
modelo <- lm(Net.Worth ~ Age, data = forbes_dataset.5)
library(tidyverse)
library(knitr)

# Read the same semicolon-delimited file as a tibble via readr, decoding it
# as Latin-1 and trimming whitespace around each field.
# NOTE(review): the rendered "Source" values show garbled accented
# characters, so the file may not actually be Latin-1 — confirm the encoding.
df <- readr::read_delim(
  file = "/cloud/project/forbes_dataset 5.csv",
  delim = ";",
  locale = readr::locale(encoding = "Latin1"),
  trim_ws = TRUE,
  show_col_types = FALSE
)

# Remove stray whitespace from the column names and show the result.
colnames(df) <- str_trim(colnames(df))
print(colnames(df))
##  [1] "Rank"               "Name"               "Net Worth"         
##  [4] "Net.Worth.Billions" "Change"             "Percentage Change" 
##  [7] "Age"                "Source"             "Country/Territory" 
## [10] "...10"              "...11"              "...12"
# Give the country and source columns fixed, syntactic names, selecting them
# by pattern rather than by their exact header text.
df <- rename(
  df,
  Country.Territory = matches("Country|country|Territory"),
  Source = matches("Source|source|Wealth|wealth")
)

# Keep only the first 20 rows for the cross-tabulations and charts.
df20 <- slice_head(df, n = 20)

# Derive a coarse Sector (from the wealth source text) and Region (from the
# country) for each of the 20 rows, then order the categorical columns by
# frequency so tables and charts list the most common levels first.
df20 <- df20 %>%
  mutate(
    # case_when() takes the first matching branch, so the order of the
    # patterns below decides ties; anything unmatched becomes "Otros".
    Sector = case_when(
      str_detect(
        Source,
        regex("Microsoft|Oracle|Facebook|Meta|Amazon|Google|Dell|Semiconductors|Tech", ignore_case = TRUE)
      ) ~ "Tecnología",
      # The L'Oréal alternative tolerates whatever characters stand in for
      # the apostrophe and the accented "e": in the rendered output the
      # source text appears mis-decoded, and the stricter `L.?Or[eé]al` left
      # that row in "Otros".
      # NOTE(review): the root cause is probably the encoding used when the
      # file is read — confirm and fix there if possible.
      str_detect(
        Source,
        regex("Walmart|Zara|L\\S{0,3}Or\\S{1,6}al|LVMH|Retail|Luxury|Lujo", ignore_case = TRUE)
      ) ~ "Retail/Lujo",
      str_detect(
        Source,
        regex("Berkshire|Bloomberg|Finance|brokerage|Bank|Capital|Invest", ignore_case = TRUE)
      ) ~ "Finanzas/Inversión",
      str_detect(
        Source,
        regex("Telecom|Telecommunications", ignore_case = TRUE)
      ) ~ "Telecom",
      TRUE ~ "Otros"
    ),
    # Exact-name lookup; countries not listed fall through to "Otros".
    Region = case_when(
      Country.Territory %in% c("United States", "Canada", "Mexico") ~ "América del Norte",
      Country.Territory %in% c("France","Spain","Germany","Italy","United Kingdom","UK","Sweden","Netherlands","Switzerland") ~ "Europa",
      Country.Territory %in% c("China","India","Japan","South Korea","Taiwan","Hong Kong","Singapore") ~ "Asia",
      Country.Territory %in% c("Brazil","Argentina","Chile","Colombia","Peru") ~ "América Latina",
      TRUE ~ "Otros"
    ),
    # Convert to factors whose levels are sorted by descending frequency.
    Country.Territory = forcats::fct_infreq(Country.Territory),
    Sector = forcats::fct_infreq(Sector),
    Source = forcats::fct_infreq(Source)
  )

# Contingency tables over the 20-row subset: country by source, and sector
# by region.
tab_src_country   <- table(df20[["Country.Territory"]], df20[["Source"]])
tab_sector_region <- table(df20[["Sector"]], df20[["Region"]])

# Render the country-by-source table, one column per source.
tab_src_country %>%
  as.data.frame.matrix() %>%
  kable(caption = "Tabla cruzada (primeros 20): País × Source")
Tabla cruzada (primeros 20): País × Source
Walmart Google Microsoft Amazon Berkshire Hathaway Bloomberg LP Dell Technologies Diversified Facebook L’Or̃å©al LVMH Oracle Semiconductors Telecom Tesla, SpaceX Zara
United States 3 2 2 1 1 1 1 0 1 0 0 1 1 0 1 0
France 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0
India 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
Mexico 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
Spain 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
# Render the sector-by-region table, one column per region.
tab_sector_region %>%
  as.data.frame.matrix() %>%
  kable(caption = "Tabla cruzada (primeros 20): Sector × Región")
Tabla cruzada (primeros 20): Sector × Región
América del Norte Asia Europa
Tecnología 9 0 0
Retail/Lujo 3 0 2
Otros 1 1 1
Finanzas/Inversión 2 0 0
Telecom 1 0 0
# 100%-stacked bars: for each country, the share of its rows in each sector.
df20 %>%
  ggplot(aes(x = Country.Territory, fill = Sector)) +
  geom_bar(position = "fill") +
  labs(
    title = "Composición (primeros 20): Sector por País",
    x = "País",
    y = "Proporción"
  ) +
  # Show the 0-1 proportions on the y axis as percentages.
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal() +
  # Applied after theme_minimal() so these settings are not overwritten.
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Row counts for every Sector/Region combination present in the subset.
df_plot2 <- count(df20, Sector, Region)

# Stacked bars of those counts: one bar per sector, filled by region.
ggplot(data = df_plot2, mapping = aes(x = Sector, y = n, fill = Region)) +
  geom_col(position = "stack") +
  labs(
    title = "Distribución (primeros 20): Sector por Región",
    x = "Sector",
    y = "Frecuencia"
  ) +
  theme_minimal() +
  # Applied after theme_minimal() so these settings are not overwritten.
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )