# Load the practice data set. The tokens "&&", "null", " " and "$$" are read
# as missing values and text columns are converted to factors.
# NOTE(review): the str() output below shows the level "s\xed", which suggests
# the file is not UTF-8 (possibly Latin-1); consider passing `fileEncoding`
# once the real encoding is confirmed.
prac2 <- read.csv(
  "data_prac_2.csv",
  header = TRUE, # spelled out: `T` is an ordinary variable that can be reassigned
  sep = ",",
  dec = ".",
  na.strings = c("&&", "null", " ", "$$"),
  stringsAsFactors = TRUE
)
# Inspect the column types right after loading.
str(prac2)
## 'data.frame': 1338 obs. of 7 variables:
## $ edad : int 19 18 28 33 NA 31 46 37 37 60 ...
## $ sexo : Factor w/ 2 levels "femenino","masculino": 1 2 2 2 2 1 1 1 2 1 ...
## $ imc : num 27.9 33.8 33 22.7 28.9 ...
## $ hijos : int 0 1 3 0 0 0 1 3 2 0 ...
## $ fumador: Factor w/ 2 levels "no","s\xed": 2 1 1 1 1 1 1 1 1 1 ...
## $ region : Factor w/ 4 levels "noreste","noroeste",..: 4 3 3 2 2 3 3 2 1 2 ...
## $ clm : num 16885 1726 4449 21984 3867 ...
# Recode `fumador` to the plain labels "si"/"no": every value other than "no"
# becomes "si". The comparison is vectorised over the whole column, so the
# column stays an atomic vector instead of becoming a list (which is what an
# element-wise lapply() would assign before factor() flattens it again).
prac2$fumador <- factor(
  ifelse(prac2$fumador == "no", "no", "si"),
  levels = c("si", "no")
)
# Confirm the new factor levels.
str(prac2)
## 'data.frame': 1338 obs. of 7 variables:
## $ edad : int 19 18 28 33 NA 31 46 37 37 60 ...
## $ sexo : Factor w/ 2 levels "femenino","masculino": 1 2 2 2 2 1 1 1 2 1 ...
## $ imc : num 27.9 33.8 33 22.7 28.9 ...
## $ hijos : int 0 1 3 0 0 0 1 3 2 0 ...
## $ fumador: Factor w/ 2 levels "si","no": 1 2 2 2 2 2 2 2 2 2 ...
## $ region : Factor w/ 4 levels "noreste","noroeste",..: 4 3 3 2 2 3 3 2 1 2 ...
## $ clm : num 16885 1726 4449 21984 3867 ...
Respuesta: Sí, con el siguiente código:
# List every fully duplicated row (all columns are compared because no
# column names are given).
janitor::get_dupes(prac2)
## No variable names specified - using all columns.
## edad sexo imc hijos fumador region clm dupe_count
## 1 19 masculino 30.59 0 no noroeste 1639.563 2
## 2 19 masculino 30.59 0 no noroeste 1639.563 2
# Keep a single copy of each duplicated row.
prac2 <- distinct(prac2)
Respuesta: En estos casos lo ideal sería usar la interpolación, ya que es una técnica más estructurada, aunque la técnica a utilizar dependerá del tipo de datos que tengamos.
# Number of missing values in each column.
prac2 %>%
  is.na() %>%
  colSums()
## edad sexo imc hijos fumador region clm
## 72 0 39 0 0 0 41
# Work only with the numeric columns; each imputation strategy below builds
# its own copy so the results can be compared side by side.
prac2num <- prac2 %>%
  select(where(is.numeric))

# Replace NAs with the column mean.
prac2_mean <- data.frame(lapply(prac2num, function(x) {
  ifelse(is.na(x), mean(x, na.rm = TRUE), x)
}))

# Replace NAs with the 20 % trimmed mean.
prac2_meanc <- data.frame(lapply(prac2num, function(x) {
  ifelse(is.na(x), mean(x, na.rm = TRUE, trim = 0.2), x)
}))

# Replace NAs with the median.
prac2_median <- data.frame(lapply(prac2num, function(x) {
  ifelse(is.na(x), median(x, na.rm = TRUE), x)
}))

# Replace NAs with the mode. mfv() returns every tied mode, so only the first
# one is used; otherwise ifelse() would recycle the vector of modes and fill
# each NA with a different value depending on its row position.
prac2_mode <- data.frame(lapply(prac2num, function(x) {
  ifelse(is.na(x), mfv(x, na_rm = TRUE)[1], x)
}))

# Replace NAs by linear interpolation. `na.rm = FALSE` keeps the interpolated
# vector the same length as the column so ifelse() lines the values up row by
# row; `rule = 2` fills leading/trailing NAs with the nearest observed value.
prac2_inter <- data.frame(lapply(prac2num, function(x) {
  ifelse(is.na(x), na.approx(x, na.rm = FALSE, rule = 2), x)
}))
# Summary of the numeric columns before any imputation (baseline; NA counts
# are still reported here).
summary(prac2num)
## edad imc hijos clm
## Min. :18.00 Min. :15.96 Min. :0.000 Min. : 1122
## 1st Qu.:27.00 1st Qu.:26.22 1st Qu.:0.000 1st Qu.: 4750
## Median :39.00 Median :30.30 Median :1.000 Median : 9382
## Mean :39.24 Mean :30.62 Mean :1.096 Mean :13287
## 3rd Qu.:51.00 3rd Qu.:34.59 3rd Qu.:2.000 3rd Qu.:16781
## Max. :64.00 Max. :53.13 Max. :5.000 Max. :63770
## NA's :72 NA's :39 NA's :41
# Summary after filling NAs with the column mean.
summary(prac2_mean)
## edad imc hijos clm
## Min. :18.00 Min. :15.96 Min. :0.000 Min. : 1122
## 1st Qu.:27.00 1st Qu.:26.40 1st Qu.:0.000 1st Qu.: 4878
## Median :39.24 Median :30.59 Median :1.000 Median : 9705
## Mean :39.24 Mean :30.62 Mean :1.096 Mean :13287
## 3rd Qu.:51.00 3rd Qu.:34.40 3rd Qu.:2.000 3rd Qu.:16115
## Max. :64.00 Max. :53.13 Max. :5.000 Max. :63770
# Summary after filling NAs with the trimmed mean (trim = 0.2).
summary(prac2_meanc)
## edad imc hijos clm
## Min. :18.00 Min. :15.96 Min. :0.000 Min. : 1122
## 1st Qu.:27.00 1st Qu.:26.40 1st Qu.:0.000 1st Qu.: 4878
## Median :39.08 Median :30.41 Median :1.000 Median : 9705
## Mean :39.23 Mean :30.61 Mean :1.096 Mean :13183
## 3rd Qu.:51.00 3rd Qu.:34.40 3rd Qu.:2.000 3rd Qu.:16115
## Max. :64.00 Max. :53.13 Max. :5.000 Max. :63770
# Summary after filling NAs with the column median.
summary(prac2_median)
## edad imc hijos clm
## Min. :18.00 Min. :15.96 Min. :0.000 Min. : 1122
## 1st Qu.:27.00 1st Qu.:26.40 1st Qu.:0.000 1st Qu.: 4878
## Median :39.00 Median :30.30 Median :1.000 Median : 9382
## Mean :39.23 Mean :30.61 Mean :1.096 Mean :13167
## 3rd Qu.:51.00 3rd Qu.:34.40 3rd Qu.:2.000 3rd Qu.:16115
## Max. :64.00 Max. :53.13 Max. :5.000 Max. :63770
# Summary after filling NAs with the most frequent value (mfv).
summary(prac2_mode)
## edad imc hijos clm
## Min. :18.00 Min. :15.96 Min. :0.000 Min. : 1122
## 1st Qu.:25.00 1st Qu.:26.40 1st Qu.:0.000 1st Qu.: 4762
## Median :38.00 Median :30.59 Median :1.000 Median : 9305
## Mean :38.13 Mean :30.66 Mean :1.096 Mean :13254
## 3rd Qu.:51.00 3rd Qu.:34.40 3rd Qu.:2.000 3rd Qu.:16587
## Max. :64.00 Max. :53.13 Max. :5.000 Max. :63770
# Summary after filling NAs by interpolation (na.approx).
summary(prac2_inter)
## edad imc hijos clm
## Min. :18.00 Min. :15.96 Min. :0.000 Min. : 1122
## 1st Qu.:27.00 1st Qu.:26.22 1st Qu.:0.000 1st Qu.: 4796
## Median :39.00 Median :30.40 Median :1.000 Median : 9411
## Mean :39.29 Mean :30.62 Mean :1.096 Mean :13274
## 3rd Qu.:51.00 3rd Qu.:34.58 3rd Qu.:2.000 3rd Qu.:16819
## Max. :64.00 Max. :53.13 Max. :5.000 Max. :63770
# Rebuild `prac2_inter` from the full data set, interpolating missing values
# in the numeric columns only. Running ifelse() over the factor columns
# (sexo, fumador, region) would strip their labels and leave bare integer
# codes, so those columns are passed through untouched.
# `na.rm = FALSE` keeps the interpolated vector aligned with the rows and
# `rule = 2` fills leading/trailing NAs with the nearest observed value.
prac2_inter <- prac2 %>%
  mutate(across(
    where(is.numeric),
    function(x) ifelse(is.na(x), na.approx(x, na.rm = FALSE, rule = 2), x)
  ))
Respuesta: Para un primer acercamiento considero que sí es el “ideal”, pues un histograma nos permite ver el comportamiento que tienen los datos, aunque depende mucho de qué es lo que se quiere analizar.
# Histogram of age (edad) with a centred, bold serif title.
ggplot(data = prac2_inter, mapping = aes(x = edad)) +
  geom_histogram(colour = "lightpink", fill = "hotpink2") +
  labs(title = "Edad", x = "Años", y = "Frecuencia") +
  theme_minimal() +
  theme(
    plot.title = element_text(
      family = "serif", face = "bold", size = 16, hjust = 0.5
    )
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of body-mass index (imc) with a centred, bold serif title.
ggplot(data = prac2_inter, mapping = aes(x = imc)) +
  geom_histogram(colour = "lightpink", fill = "hotpink2") +
  labs(title = "Indice de Masa Corporal", x = "imc", y = "Frecuencia") +
  theme_minimal() +
  theme(
    plot.title = element_text(
      family = "serif", face = "bold", size = 16, hjust = 0.5
    )
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of the number of children (hijos) with a centred, bold serif
# title. The x-axis is labelled after the plotted variable; it previously
# read "Edad", which belongs to the age histogram.
ggplot(prac2_inter, aes(hijos)) +
  geom_histogram(fill = "hotpink2", color = "lightpink") +
  labs(
    title = "Número de hijos",
    x = "Hijos",
    y = "Frecuencia"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, family = "serif", size = 16, face = "bold")
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of the claim amount (clm) with a centred, bold serif title.
ggplot(data = prac2_inter, mapping = aes(x = clm)) +
  geom_histogram(colour = "lightpink", fill = "hotpink2") +
  labs(title = "Reclamaciones", x = "clm", y = "Frecuencia") +
  theme_minimal() +
  theme(
    plot.title = element_text(
      family = "serif", face = "bold", size = 16, hjust = 0.5
    )
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Taking gender into account (claims by gender and amount by gender):
# mean claim amount for each value of `sexo`.
rec_sex <- summarise(
  group_by(prac2_inter, sexo),
  recs_mean = mean(clm)
)
rec_sex
## # A tibble: 2 × 2
## sexo recs_mean
## <int> <dbl>
## 1 1 12517.
## 2 2 14017.
# Mean claim amount for every smoker/sex combination. `.groups` is left
# unset, so the result stays grouped by `fumador`.
rec_fs <- summarise(
  group_by(prac2_inter, fumador, sexo),
  recfs_mean = mean(clm)
)
## `summarise()` has grouped output by 'fumador'. You can override using the
## `.groups` argument.
# Show the mean claim amount per smoker/sex combination.
rec_fs
## # A tibble: 4 × 3
## # Groups: fumador [2]
## fumador sexo recfs_mean
## <int> <int> <dbl>
## 1 1 1 29556.
## 2 1 2 32874.
## 3 2 1 8935.
## 4 2 2 8206.
# Mean claim amount per region, sorted from the highest to the lowest mean.
regm <- summarise(
  group_by(prac2_inter, region),
  regm_prom = mean(clm)
)
regm <- arrange(regm, desc(regm_prom))
regm
## # A tibble: 4 × 2
## region regm_prom
## <int> <dbl>
## 1 3 14588.
## 2 1 13577.
## 3 2 12418.
## 4 4 12354.
Respuesta: La región con mayor monto promedio es sureste (3).
# Flag rows whose imc is above 30 as "obesidad"; every other row gets a
# single-space placeholder.
prac2_inter <- mutate(
  prac2_inter,
  clasificacion_imc = ifelse(imc > 30, "obesidad", " ")
)
# Pick the columns of interest, sort by imc in descending order and show the
# ten rows with the highest imc.
obs_top10 <- select(prac2_inter, imc, edad, sexo, hijos, clm, region)
obs_top10 <- arrange(obs_top10, desc(imc))
head(obs_top10, n = 10)
## imc edad sexo hijos clm region
## 1 53.13 18 2 0 1163.463 3
## 2 52.58 22 2 1 44501.398 3
## 3 50.38 23 2 1 2438.055 3
## 4 49.06 58 2 0 11381.325 3
## 5 47.74 52 2 1 9748.911 3
## 6 47.60 37 1 2 46113.511 4
## 7 47.52 47 2 1 8083.920 3
## 8 47.41 54 1 0 63770.428 3
## 9 46.75 52 1 5 12592.534 3
## 10 46.70 54 1 2 11538.421 4