Librerías necesarias
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.1
library(readr) #Para leer archivos .csv
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.4.1
Dataset
# Load the medical-costs dataset from its CSV file into a tibble.
# NOTE(review): the path is absolute and machine-specific, so this line only
# works where that drive and folder layout exists.
medical_costs <- readr::read_csv(
  file = "E:\\2023_Capacitaciones\\BIG DATA UAO\\medical_costs.csv"
)
## Rows: 10000 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Sex, Smoker, Region
## dbl (4): Age, BMI, Children, Medical Cost
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first five rows to check how the columns were parsed.
medical_costs %>% head(n = 5)
## # A tibble: 5 × 7
## Age Sex BMI Children Smoker Region `Medical Cost`
## <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 58 male 15.6 2 yes northwest 17908.
## 2 24 male 29.8 0 yes northeast 16313.
## 3 50 male 29 5 no northwest 6819.
## 4 35 male 34 1 no southeast 5248.
## 5 31 female 17.6 3 yes southeast 17525.
Costo promedio de gastos médicos por región (solo hombres)
# Mean medical cost per region, computed over male records only and drawn as
# one bar per region with the mean printed on the bar.
medical_costs %>%
  filter(Sex == "male") %>%
  group_by(Region) %>%
  summarise(Avg_Costo = mean(`Medical Cost`)) %>%
  ggplot(aes(x = Region, y = Avg_Costo, fill = Region)) +
  geom_col() +
  # Label each bar with its mean formatted to two decimals; vjust = 0.8 puts
  # the text at 80% of the bar height.
  geom_text(
    aes(label = sprintf("%.2f", Avg_Costo)),
    position = position_stack(vjust = 0.8)
  )

Dispersión de costos médicos por edad
# Scatter plot of medical cost against age for every record, on log10 axes,
# with smoker status encoded by both colour and point shape.
p <- ggplot(medical_costs, aes(x = Age, y = `Medical Cost`, color = Smoker)) +
  geom_point(aes(shape = Smoker)) +
  labs(
    x = "Edad",
    y = "Costos Medicos",
    title = "Costos medicos respecto a la edad"
  ) +
  # scale_*_log10() is used instead of scale_*_continuous(trans = "log10")
  # because the `trans` argument is deprecated since ggplot2 3.5.0.
  scale_x_log10() +
  scale_y_log10()

# Draw the plot with the dark Solarized theme and its colour palette.
p +
  theme_solarized(light = FALSE) +
  scale_colour_solarized()

# Cost-versus-age scatter restricted to smokers, one panel per region, with a
# GAM smooth (cubic regression spline with shrinkage) overlaid in each panel.
p1 <- medical_costs %>%
  filter(Smoker == "yes") %>%
  ggplot(aes(x = Age, y = `Medical Cost`, color = Smoker)) +
  geom_point(aes(shape = Smoker)) +
  labs(
    x = "Edad",
    y = "Costos Medicos",
    title = "Costos medicos respecto a la edad en personas que fuman"
  ) +
  # scale_*_log10() is used instead of scale_*_continuous(trans = "log10")
  # because the `trans` argument is deprecated since ggplot2 3.5.0.
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~ Region, nrow = 3) +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs"))

# Draw the plot with the Excel-like theme and colour palette from ggthemes.
p1 +
  theme_excel_new() +
  scale_colour_excel_new()

# Cost-versus-age scatter restricted to non-smokers, one panel per region,
# with a GAM smooth (cubic regression spline with shrinkage) in each panel.
p2 <- medical_costs %>%
  filter(Smoker == "no") %>%
  ggplot(aes(x = Age, y = `Medical Cost`, color = Smoker)) +
  geom_point(aes(shape = Smoker)) +
  labs(
    x = "Edad",
    y = "Costos Medicos",
    title = "Costos medicos respecto a la edad en personas que no fuman"
  ) +
  # scale_*_log10() is used instead of scale_*_continuous(trans = "log10")
  # because the `trans` argument is deprecated since ggplot2 3.5.0.
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~ Region, nrow = 3) +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs"))

# Draw the plot with the ggthemes "clean" theme.
p2 + theme_clean()

# Cost-versus-age scatter for all records in a 2-row grid of region panels.
# Colour encodes smoker status and point shape encodes region; the point layer
# is kept out of the legend, so the legend comes from the smooth layer only.
p3 <- ggplot(
  medical_costs,
  aes(x = Age, y = `Medical Cost`, color = Smoker, shape = Region)
) +
  geom_point(show.legend = FALSE) +
  labs(
    x = "Edad",
    y = "Costos Medicos",
    title = "Costos medicos respecto a la edad"
  ) +
  # scale_*_log10() is used instead of scale_*_continuous(trans = "log10")
  # because the `trans` argument is deprecated since ggplot2 3.5.0.
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~ Region, nrow = 2) +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs"))

# Draw the plot with the dark Solarized theme and its colour palette.
p3 +
  theme_solarized(light = FALSE) +
  scale_colour_solarized()

# Distribution of medical cost per region for smokers, as one box per region.
# No group_by() is needed: the boxplot is split by the discrete x aesthetic
# (Region), so grouping the data beforehand only left a grouped tibble behind.
medical_costs %>%
  filter(Smoker == "yes") %>%
  select(Smoker, Region, Costo = `Medical Cost`) %>%
  ggplot(aes(x = Region, y = Costo, fill = Region)) +
  geom_boxplot()

# Distribution of medical cost per region for non-smokers, one box per region.
# No group_by() is needed: the boxplot is split by the discrete x aesthetic
# (Region), so grouping the data beforehand only left a grouped tibble behind.
medical_costs %>%
  filter(Smoker == "no") %>%
  select(Smoker, Region, Costo = `Medical Cost`) %>%
  ggplot(aes(x = Region, y = Costo, fill = Region)) +
  geom_boxplot()
