Librerías necesarias

library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.1
library(readr) #Para leer archivos .csv
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.4.1

Dataset

# Load the medical-costs dataset from its CSV file into a tibble.
# NOTE(review): absolute, machine-specific Windows path -- confirm it exists
# on the machine where this is run.
medical_costs <- read_csv(
  file = "E:\\2023_Capacitaciones\\BIG DATA UAO\\medical_costs.csv"
)
## Rows: 10000 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Sex, Smoker, Region
## dbl (4): Age, BMI, Children, Medical Cost
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first five rows of the dataset.
medical_costs %>%
  head(n = 5)
## # A tibble: 5 × 7
##     Age Sex      BMI Children Smoker Region    `Medical Cost`
##   <dbl> <chr>  <dbl>    <dbl> <chr>  <chr>              <dbl>
## 1    58 male    15.6        2 yes    northwest         17908.
## 2    24 male    29.8        0 yes    northeast         16313.
## 3    50 male    29          5 no     northwest          6819.
## 4    35 male    34          1 no     southeast          5248.
## 5    31 female  17.6        3 yes    southeast         17525.

Costo promedio de gastos médicos por región

# Bar chart of the mean medical cost per region.
# Only rows with Sex == "male" enter the average.
# NOTE(review): confirm the male-only restriction is intended.
medical_costs %>%
  filter(Sex == "male") %>%
  group_by(Region) %>%
  summarise(Avg_Costo = mean(`Medical Cost`)) %>%
  ggplot(aes(x = Region, y = Avg_Costo, fill = Region)) +
  # One bar per region, bar height taken directly from Avg_Costo.
  geom_col() +
  # Print the mean (two decimals) inside each bar, at 80% of its height.
  geom_text(
    aes(label = sprintf("%.2f", Avg_Costo)),
    position = position_stack(vjust = 0.8)
  )

Dispersión de costos médicos por edad

# Scatter plot of medical cost against age for the whole dataset.
# Smoker status drives both the point colour and the point shape;
# both axes use a log10 scale.
p <- ggplot(
  data = medical_costs,
  mapping = aes(x = Age, y = `Medical Cost`, color = Smoker)
) +
  geom_point(mapping = aes(shape = Smoker)) +
  scale_x_log10() +
  scale_y_log10() +
  xlab("Edad") +
  ylab("Costos Medicos") +
  ggtitle("Costos medicos respecto a la edad")

# Render the plot with the dark Solarized theme and its colour palette.
p +
  theme_solarized(light = FALSE) +
  scale_colour_solarized()

# Cost-versus-age scatter plot restricted to smokers (Smoker == "yes"),
# one panel per region, with a GAM smoother drawn over the points.
smokers <- filter(medical_costs, Smoker == "yes")
p1 <- ggplot(
  data = smokers,
  mapping = aes(x = Age, y = `Medical Cost`, color = Smoker)
) +
  # Points are added before the smoother so the fitted curve stays on top.
  geom_point(mapping = aes(shape = Smoker)) +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
  facet_wrap(~ Region, nrow = 3) +
  scale_x_log10() +
  scale_y_log10() +
  xlab("Edad") +
  ylab("Costos Medicos") +
  ggtitle("Costos medicos respecto a la edad en personas que fuman")

# Render the plot with the Excel-style theme and matching colour palette.
p1 +
  theme_excel_new() +
  scale_colour_excel_new()

# Cost-versus-age scatter plot restricted to non-smokers (Smoker == "no"),
# one panel per region, with a GAM smoother drawn over the points.
non_smokers <- filter(medical_costs, Smoker == "no")
p2 <- ggplot(
  data = non_smokers,
  mapping = aes(x = Age, y = `Medical Cost`, color = Smoker)
) +
  # Points are added before the smoother so the fitted curve stays on top.
  geom_point(mapping = aes(shape = Smoker)) +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
  facet_wrap(~ Region, nrow = 3) +
  scale_x_log10() +
  scale_y_log10() +
  xlab("Edad") +
  ylab("Costos Medicos") +
  ggtitle("Costos medicos respecto a la edad en personas que no fuman")

# Render the plot with the "clean" theme from ggthemes.
p2 + theme_clean()

# Cost-versus-age scatter plot for the whole dataset, one panel per region.
# Colour encodes smoker status and point shape encodes region; both mappings
# are declared at plot level, so the smoother layer inherits them as well.
p3 <- ggplot(
  data = medical_costs,
  mapping = aes(
    x = Age,
    y = `Medical Cost`,
    color = Smoker,
    shape = Region
  )
) +
  # The point layer contributes no legend entries.
  geom_point(show.legend = FALSE) +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
  facet_wrap(~ Region, nrow = 2) +
  scale_x_log10() +
  scale_y_log10() +
  xlab("Edad") +
  ylab("Costos Medicos") +
  ggtitle("Costos medicos respecto a la edad")

# Render the plot with the dark Solarized theme and its colour palette.
p3 +
  theme_solarized(light = FALSE) +
  scale_colour_solarized()

# Box plot of medical cost per region for smokers (Smoker == "yes").
# The cost column is renamed to `Costo`, which also becomes the y-axis label.
# geom_boxplot() draws one box per level of the discrete x aesthetic, so no
# dplyr grouping is required before plotting.
medical_costs %>%
  filter(Smoker == "yes") %>%
  select(Region, Costo = "Medical Cost") %>%
  ggplot(aes(x = Region, y = Costo, fill = Region)) +
  geom_boxplot()

# Box plot of medical cost per region for non-smokers (Smoker == "no").
# The cost column is renamed to `Costo`, which also becomes the y-axis label.
# geom_boxplot() draws one box per level of the discrete x aesthetic, so no
# dplyr grouping is required before plotting.
medical_costs %>%
  filter(Smoker == "no") %>%
  select(Region, Costo = "Medical Cost") %>%
  ggplot(aes(x = Region, y = Costo, fill = Region)) +
  geom_boxplot()