# UNIVERSIDAD NACIONAL DEL ALTIPLANO PUNO
# INGENIERIA ESTADISTICA E INFORMATICA
# TECNICAS ESTADISTICAS MULTIVARIADAS
# IMPUTACION DE LOS DATOS
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.1.3
library(readxl)
## Warning: package 'readxl' was built under R version 4.1.3
library(base)
library(mice)
## Warning: package 'mice' was built under R version 4.1.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(visdat)
## Warning: package 'visdat' was built under R version 4.1.3
library(naniar)
## Warning: package 'naniar' was built under R version 4.1.3
library(VIM)
## Warning: package 'VIM' was built under R version 4.1.3
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.1.3
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
##
## group_rows
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the Excel workbook into `datosmundo` and print the resulting tibble.
datosmundo <- read_excel(
  path = "D:/TECNICAS ESTADISTICAS MULTIVARIADAS/TAREA 01/datosmundo.xlsx"
)
print(datosmundo)
## # A tibble: 109 x 25
## country poblacion densidad urban esp_vidaf esp_vidam alfabet inc_pobl
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 20500 25 18 44 45 29 2.8
## 2 Argentina 33900 12 86 75 68 95 1.3
## 3 Armenia 3700 126 68 75 68 98 1.4
## 4 Australia 17800 2.3 85 80 74 100 1.4
## 5 Austria 8000 94 58 79 73 99 0.2
## 6 Azerbaijan 7400 86 54 75 67 98 1.4
## 7 Bahrain 600 828 83 74 71 77 2.4
## 8 Bangladesh 125000 800 16 53 53 35 2.4
## 9 Barbados 256 605 45 78 73 99 0.2
## 10 Belarus 10300 50 65 76 66 99 0.3
## # ... with 99 more rows, and 17 more variables: mort_inf <dbl>, gdp_cap <dbl>,
## # region <dbl>, calories <dbl>, aids <dbl>, natalidad <dbl>,
## # mortalidad <dbl>, aids_rt <dbl>, Log_pib <dbl>, lg_aidsr <dbl>,
## # b_to_d <dbl>, fertilidad <dbl>, log_pop <dbl>, cropgrow <dbl>,
## # lit_male <dbl>, lit_fema <dbl>, climate <dbl>
# View(datosmundo)  # interactive viewer call, left disabled
# Render the first four rows and the first seven columns as a Markdown table.
kable(datosmundo[seq_len(4), seq_len(7)], format = "markdown")
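## |country     | poblacion| densidad| urban| esp_vidaf| esp_vidam| alfabet|
## |:-----------|---------:|--------:|-----:|---------:|---------:|-------:|
## |Afghanistan |     20500|     25.0|    18|        44|        45|      29|
## |Argentina   |     33900|     12.0|    86|        75|        68|      95|
## |Armenia     |      3700|    126.0|    68|        75|        68|      98|
## |Australia   |     17800|      2.3|    85|        80|        74|     100|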
# Missing values per variable: count (n_miss) and percentage (pct_miss)
naniar::miss_var_summary(datosmundo)
## # A tibble: 25 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 calories 34 31.2
## 2 lit_male 24 22.0
## 3 lit_fema 24 22.0
## 4 aids 3 2.75
## 5 aids_rt 3 2.75
## 6 lg_aidsr 3 2.75
## 7 cropgrow 3 2.75
## 8 alfabet 2 1.83
## 9 fertilidad 2 1.83
## 10 climate 2 1.83
## # ... with 15 more rows
# Logical mask of the missing cells, shared by the two summaries below
na_mask <- is.na(datosmundo)
# Total number of NAs in the whole data set
sum(na_mask)
## [1] 103
# Number of NAs in each column
colSums(na_mask)
## country poblacion densidad urban esp_vidaf esp_vidam alfabet
## 0 0 0 1 0 0 2
## inc_pobl mort_inf gdp_cap region calories aids natalidad
## 0 0 0 0 34 3 0
## mortalidad aids_rt Log_pib lg_aidsr b_to_d fertilidad log_pop
## 1 3 0 3 1 2 0
## cropgrow lit_male lit_fema climate
## 3 24 24 2
# Drop every row that contains at least one missing value, keeping only the
# complete cases in `base1`.
base1 <- na.omit(datosmundo)
# Plots
# Aggregated missingness plot. `TRUE` replaces the reassignable shorthand `T`,
# and `sortVars` is spelled out in full instead of relying on partial matching
# of `sortVar`.
aggr(datosmundo, numbers = TRUE, sortVars = TRUE)

##
## Variables sorted by number of missings:
## Variable Count
## calories 0.311926606
## lit_male 0.220183486
## lit_fema 0.220183486
## aids 0.027522936
## aids_rt 0.027522936
## lg_aidsr 0.027522936
## cropgrow 0.027522936
## alfabet 0.018348624
## fertilidad 0.018348624
## climate 0.018348624
## urban 0.009174312
## mortalidad 0.009174312
## b_to_d 0.009174312
## country 0.000000000
## poblacion 0.000000000
## densidad 0.000000000
## esp_vidaf 0.000000000
## esp_vidam 0.000000000
## inc_pobl 0.000000000
## mort_inf 0.000000000
## gdp_cap 0.000000000
## region 0.000000000
## natalidad 0.000000000
## Log_pib 0.000000000
## log_pop 0.000000000
# Visualise the missing values of the data set with visdat.
visdat::vis_miss(datosmundo)
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

# Plot the missingness of each variable with naniar.
naniar::gg_miss_var(datosmundo)
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

# Moda
#' Most frequent value (statistical mode) of a vector
#'
#' @param v An atomic vector.
#' @param na.rm Drop `NA`s before counting? Defaults to `TRUE` so that the
#'   result is usable as a fill value for missing entries; with `FALSE`, `NA`
#'   competes like any other value.
#' @return A length-one vector of the same type as `v` holding the most
#'   frequent value. Ties go to the value that appears first in `v`. When
#'   every element is `NA`, `NA` is returned.
getmode <- function(v, na.rm = TRUE) {
  # Keep the all-NA case untouched so it still yields NA instead of an
  # empty result.
  if (na.rm && !all(is.na(v))) {
    v <- v[!is.na(v)]
  }
  uniqv <- unique(v)
  # match() maps each element to its position in `uniqv`; tabulate() counts
  # those positions and which.max() picks the first maximum.
  uniqv[which.max(tabulate(match(v, uniqv)))]
}
# Mode imputation of `region`: each NA is replaced by the value returned by
# getmode(); non-missing entries are kept as they are.
Data_Impu_Moda <- ifelse(
  is.na(datosmundo$region),
  getmode(datosmundo$region),
  datosmundo$region
)
# Overlay the density of the original values (colour 2) and of the imputed
# values (colour 3). The previous graphics settings are saved and restored so
# the two-panel layout does not leak into later base-graphics plots.
old_par <- par(mfrow = c(1, 2))
plot(density(datosmundo$region, na.rm = TRUE), col = 2, main = "Region")
lines(density(Data_Impu_Moda), col = 3)
par(old_par)
# Mean imputation
# Missing-value count per column before imputing
colSums(is.na(datosmundo))
## country poblacion densidad urban esp_vidaf esp_vidam alfabet
## 0 0 0 1 0 0 2
## inc_pobl mort_inf gdp_cap region calories aids natalidad
## 0 0 0 0 34 3 0
## mortalidad aids_rt Log_pib lg_aidsr b_to_d fertilidad log_pop
## 1 3 0 3 1 2 0
## cropgrow lit_male lit_fema climate
## 3 24 24 2
# Run mice with the "mean" method: 5 imputed data sets, 2 iterations,
# fixed seed for reproducibility.
# NOTE(review): mean imputation looks deterministic, so the 5 data sets are
# presumably identical -- confirm whether m = 1 would be enough.
tempData <- mice(
  data = datosmundo,
  m = 5,
  maxit = 2,
  method = "mean",
  seed = 500
)
##
## iter imp variable
## 1 1 urban alfabet calories aids mortalidad aids_rt lg_aidsr b_to_d fertilidad cropgrow lit_male lit_fema climate
## 1 2 urban alfabet calories aids mortalidad aids_rt lg_aidsr b_to_d fertilidad cropgrow lit_male lit_fema climate
## 1 3 urban alfabet calories aids mortalidad aids_rt lg_aidsr b_to_d fertilidad cropgrow lit_male lit_fema climate
## 1 4 urban alfabet calories aids mortalidad aids_rt lg_aidsr b_to_d fertilidad cropgrow lit_male lit_fema climate
## 1 5 urban alfabet calories aids mortalidad aids_rt lg_aidsr b_to_d fertilidad cropgrow lit_male lit_fema climate
## 2 1 urban alfabet calories aids mortalidad aids_rt lg_aidsr b_to_d fertilidad cropgrow lit_male lit_fema climate
## 2 2 urban alfabet calories aids mortalidad aids_rt lg_aidsr b_to_d fertilidad cropgrow lit_male lit_fema climate
## 2 3 urban alfabet calories aids mortalidad aids_rt lg_aidsr b_to_d fertilidad cropgrow lit_male lit_fema climate
## 2 4 urban alfabet calories aids mortalidad aids_rt lg_aidsr b_to_d fertilidad cropgrow lit_male lit_fema climate
## 2 5 urban alfabet calories aids mortalidad aids_rt lg_aidsr b_to_d fertilidad cropgrow lit_male lit_fema climate
## Warning: Number of logged events: 1
# Extract a completed data set from the mean-imputation result. `complete` is
# namespace-qualified, as in the regression-imputation step of this script,
# so it cannot be masked by another attached package (e.g. tidyr::complete).
newdata <- mice::complete(tempData)
# Check that no missing values remain after imputation.
colSums(is.na(newdata))
## country poblacion densidad urban esp_vidaf esp_vidam alfabet
## 0 0 0 0 0 0 0
## inc_pobl mort_inf gdp_cap region calories aids natalidad
## 0 0 0 0 0 0 0
## mortalidad aids_rt Log_pib lg_aidsr b_to_d fertilidad log_pop
## 0 0 0 0 0 0 0
## cropgrow lit_male lit_fema climate
## 0 0 0 0
# Scatter plot of cropgrow against climate drawn from the imputation object.
xyplot(tempData, cropgrow ~ climate)
# Regression imputation
# Columns handed to mice(); currently every column of the data set.
columns <- names(datosmundo)
# `printFlag = FALSE` is written out in full: the original `print = F` relied
# on partial argument matching and on the reassignable shorthand `F`.
imputed_data <- mice(
  datosmundo[, columns],
  m = 5,
  maxit = 1,
  method = "norm.predict",
  seed = 2018,
  printFlag = FALSE
)
## Warning: Number of logged events: 1
# Extract a completed data set and inspect its structure.
complete.data <- mice::complete(imputed_data)
str(complete.data)
## 'data.frame': 109 obs. of 25 variables:
## $ country : chr "Afghanistan" "Argentina" "Armenia" "Australia" ...
## $ poblacion : num 20500 33900 3700 17800 8000 7400 600 125000 256 10300 ...
## $ densidad : num 25 12 126 2.3 94 86 828 800 605 50 ...
## $ urban : num 18 86 68 85 58 54 83 16 45 65 ...
## $ esp_vidaf : num 44 75 75 80 79 75 74 53 78 76 ...
## $ esp_vidam : num 45 68 68 74 73 67 71 53 73 66 ...
## $ alfabet : num 29 95 98 100 99 98 77 35 99 99 ...
## $ inc_pobl : num 2.8 1.3 1.4 1.4 0.2 1.4 2.4 2.4 0.2 0.3 ...
## $ mort_inf : num 168 25.6 27 7.3 6.7 35 25 106 20.3 19 ...
## $ gdp_cap : num 205 3408 5000 16848 18396 ...
## $ region : num 3 6 5 1 1 5 5 3 6 2 ...
## $ calories : num 2104 3113 2831 3216 3495 ...
## $ aids : num 0 3904 2 4727 1150 ...
## $ natalidad : num 53 20 23 15 12 23 29 35 16 13 ...
## $ mortalidad: num 22 9 6 8 11 7 4 11 8 11 ...
## $ aids_rt : num 0 11.52 0.05 26.56 14.38 ...
## $ Log_pib : num 2.31 3.53 3.7 4.23 4.26 3.48 3.9 2.31 3.84 3.81 ...
## $ lg_aidsr : num 0 1.63 0.56 1.93 1.7 ...
## $ b_to_d : num 2.41 2.22 3.83 1.88 1.09 3.29 7.25 3.18 1.9 1.18 ...
## $ fertilidad: num 6.9 2.8 3.2 1.9 1.5 2.8 4 4.7 1.8 1.9 ...
## $ log_pop : num 4.31 4.53 3.57 4.25 3.9 3.87 2.78 5.1 2.41 4.01 ...
## $ cropgrow : num 12 9 17 6 17 18 2 67 77 29 ...
## $ lit_male : num 44 96 100 100 40 ...
## $ lit_fema : num 14 95 100 100 61.6 ...
## $ climate : num 3 8 4.81 3 8 ...
# Count the remaining NAs column by column after regression imputation.
vapply(complete.data, function(column) sum(is.na(column)), numeric(1))
## country poblacion densidad urban esp_vidaf esp_vidam alfabet
## 0 0 0 0 0 0 0
## inc_pobl mort_inf gdp_cap region calories aids natalidad
## 0 0 0 0 0 0 0
## mortalidad aids_rt Log_pib lg_aidsr b_to_d fertilidad log_pop
## 0 0 0 0 0 0 0
## cropgrow lit_male lit_fema climate
## 0 0 0 0
