# IMPUTAR-DATOS.R

# UNIVERSIDAD NACIONAL DEL ALTIPLANO PUNO
# INGENIERIA ESTADISTICA E INFORMATICA
# TECNICAS ESTADISTICAS MULTIVARIADAS

# IMPUTACION DE LOS DATOS
library(kableExtra)

## Warning: package 'kableExtra' was built under R version 4.1.3

library(readxl)

## Warning: package 'readxl' was built under R version 4.1.3

library(base)
library(mice)

## Warning: package 'mice' was built under R version 4.1.3

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(visdat)

## Warning: package 'visdat' was built under R version 4.1.3

library(naniar)

## Warning: package 'naniar' was built under R version 4.1.3

library(VIM)

## Warning: package 'VIM' was built under R version 4.1.3

## Loading required package: colorspace

## Warning: package 'colorspace' was built under R version 4.1.3

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.1.3

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:kableExtra':
## 
##     group_rows

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Read the Excel workbook into `datosmundo` and print it.
# NOTE(review): the path is absolute and machine-specific; the workbook must
# exist at exactly this location for the script to run.
datosmundo <- read_excel(
  path = "D:/TECNICAS ESTADISTICAS MULTIVARIADAS/TAREA 01/datosmundo.xlsx"
)
datosmundo

## # A tibble: 109 x 25
##    country     poblacion densidad urban esp_vidaf esp_vidam alfabet inc_pobl
##    <chr>           <dbl>    <dbl> <dbl>     <dbl>     <dbl>   <dbl>    <dbl>
##  1 Afghanistan     20500     25      18        44        45      29      2.8
##  2 Argentina       33900     12      86        75        68      95      1.3
##  3 Armenia          3700    126      68        75        68      98      1.4
##  4 Australia       17800      2.3    85        80        74     100      1.4
##  5 Austria          8000     94      58        79        73      99      0.2
##  6 Azerbaijan       7400     86      54        75        67      98      1.4
##  7 Bahrain           600    828      83        74        71      77      2.4
##  8 Bangladesh     125000    800      16        53        53      35      2.4
##  9 Barbados          256    605      45        78        73      99      0.2
## 10 Belarus         10300     50      65        76        66      99      0.3
## # ... with 99 more rows, and 17 more variables: mort_inf <dbl>, gdp_cap <dbl>,
## #   region <dbl>, calories <dbl>, aids <dbl>, natalidad <dbl>,
## #   mortalidad <dbl>, aids_rt <dbl>, Log_pib <dbl>, lg_aidsr <dbl>,
## #   b_to_d <dbl>, fertilidad <dbl>, log_pop <dbl>, cropgrow <dbl>,
## #   lit_male <dbl>, lit_fema <dbl>, climate <dbl>

# View(datosmundo)
# Preview rows 1-4 and columns 1-7 as a markdown table.
kable(datosmundo[1:4, 1:7], format = "markdown")

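## |country     | poblacion| densidad| urban| esp_vidaf| esp_vidam| alfabet|
## |:-----------|---------:|--------:|-----:|---------:|---------:|-------:|
## |Afghanistan |     20500|     25.0|    18|        44|        45|      29|
## |Argentina   |     33900|     12.0|    86|        75|        68|      95|
## |Armenia     |      3700|    126.0|    68|        75|        68|      98|
## |Australia   |     17800|      2.3|    85|        80|        74|     100|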

# Missing values per variable: count (n_miss) and percentage (pct_miss)
miss_var_summary(datosmundo)

## # A tibble: 25 x 3
##    variable   n_miss pct_miss
##    <chr>       <int>    <dbl>
##  1 calories       34    31.2 
##  2 lit_male       24    22.0 
##  3 lit_fema       24    22.0 
##  4 aids            3     2.75
##  5 aids_rt         3     2.75
##  6 lg_aidsr        3     2.75
##  7 cropgrow        3     2.75
##  8 alfabet         2     1.83
##  9 fertilidad      2     1.83
## 10 climate         2     1.83
## # ... with 15 more rows

# Total number of missing values (NA) in the whole data set
sum(is.na(datosmundo))

## [1] 103

# Number of missing values (NA) in each column
colSums(is.na(datosmundo))

##    country  poblacion   densidad      urban  esp_vidaf  esp_vidam    alfabet 
##          0          0          0          1          0          0          2 
##   inc_pobl   mort_inf    gdp_cap     region   calories       aids  natalidad 
##          0          0          0          0         34          3          0 
## mortalidad    aids_rt    Log_pib   lg_aidsr     b_to_d fertilidad    log_pop 
##          1          3          0          3          1          2          0 
##   cropgrow   lit_male   lit_fema    climate 
##          3         24         24          2

# Complete-case data set: drop every row that has at least one missing value.
# NOTE(review): `base1` is not referenced again in the visible part of this
# script.
base1 <- na.omit(datosmundo)

# Plots
# Aggregated missingness plot; variables are sorted by their number of
# missing values and the proportions are printed alongside.
aggr(datosmundo, numbers = TRUE, sortVars = TRUE)

## 
##  Variables sorted by number of missings: 
##    Variable       Count
##    calories 0.311926606
##    lit_male 0.220183486
##    lit_fema 0.220183486
##        aids 0.027522936
##     aids_rt 0.027522936
##    lg_aidsr 0.027522936
##    cropgrow 0.027522936
##     alfabet 0.018348624
##  fertilidad 0.018348624
##     climate 0.018348624
##       urban 0.009174312
##  mortalidad 0.009174312
##      b_to_d 0.009174312
##     country 0.000000000
##   poblacion 0.000000000
##    densidad 0.000000000
##   esp_vidaf 0.000000000
##   esp_vidam 0.000000000
##    inc_pobl 0.000000000
##    mort_inf 0.000000000
##     gdp_cap 0.000000000
##      region 0.000000000
##   natalidad 0.000000000
##     Log_pib 0.000000000
##     log_pop 0.000000000

# Plot the missingness pattern of the whole data set (visdat)
vis_miss(datosmundo)

## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

# Plot the number of missing values per variable (naniar)
gg_miss_var(datosmundo)

## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

# Moda
#' Most frequent observed value of a vector
#'
#' Missing values are ignored, so the result can safely be used as the
#' replacement value in mode imputation. Ties are resolved in favour of the
#' value that appears first in `v`.
#'
#' @param v An atomic vector or factor, possibly containing `NA`.
#' @return A length-one vector of the same type as `v`: the most frequent
#'   non-missing value, or `NA` when `v` has no observed values (empty or
#'   all `NA`).
getmode <- function(v) {
  observed <- v[!is.na(v)]
  if (length(observed) == 0L) {
    # Indexing with NA yields a single NA of the same type/class as `v`.
    return(v[NA_integer_])
  }
  uniqv <- unique(observed)
  # match() maps each value to its position in `uniqv`; tabulate() counts them.
  uniqv[which.max(tabulate(match(observed, uniqv)))]
}

# Mode imputation of `region`: every NA is replaced by the most frequent value.
# NOTE(review): the NA counts printed earlier in this script report 0 missing
# values for `region`, so the imputed vector equals the original and the two
# density curves below coincide -- confirm `region` is the intended variable.
Data_Impu_Moda <- datosmundo$region
Data_Impu_Moda[is.na(Data_Impu_Moda)] <- getmode(datosmundo$region)

# Overlay the density before (colour 2) and after (colour 3) imputation.
# par() returns the previous settings; they are restored afterwards so the
# two-panel layout does not leak into later base-graphics plots.
old_par <- par(mfrow = c(1, 2))
plot(density(datosmundo$region, na.rm = TRUE), col = 2, main = "Region")
lines(density(Data_Impu_Moda), col = 3)
par(old_par)

# Mean imputation
# Missing values per column before imputing
colSums(is.na(datosmundo))

##    country  poblacion   densidad      urban  esp_vidaf  esp_vidam    alfabet 
##          0          0          0          1          0          0          2 
##   inc_pobl   mort_inf    gdp_cap     region   calories       aids  natalidad 
##          0          0          0          0         34          3          0 
## mortalidad    aids_rt    Log_pib   lg_aidsr     b_to_d fertilidad    log_pop 
##          1          3          0          3          1          2          0 
##   cropgrow   lit_male   lit_fema    climate 
##          3         24         24          2

# Mean imputation with mice: 5 imputed data sets, 2 iterations, fixed seed.
tempData <- mice(
  datosmundo,
  m = 5,
  maxit = 2,
  method = "mean",
  seed = 500
)

## 
##  iter imp variable
##   1   1  urban  alfabet  calories  aids  mortalidad  aids_rt  lg_aidsr  b_to_d  fertilidad  cropgrow  lit_male  lit_fema  climate
##   1   2  urban  alfabet  calories  aids  mortalidad  aids_rt  lg_aidsr  b_to_d  fertilidad  cropgrow  lit_male  lit_fema  climate
##   1   3  urban  alfabet  calories  aids  mortalidad  aids_rt  lg_aidsr  b_to_d  fertilidad  cropgrow  lit_male  lit_fema  climate
##   1   4  urban  alfabet  calories  aids  mortalidad  aids_rt  lg_aidsr  b_to_d  fertilidad  cropgrow  lit_male  lit_fema  climate
##   1   5  urban  alfabet  calories  aids  mortalidad  aids_rt  lg_aidsr  b_to_d  fertilidad  cropgrow  lit_male  lit_fema  climate
##   2   1  urban  alfabet  calories  aids  mortalidad  aids_rt  lg_aidsr  b_to_d  fertilidad  cropgrow  lit_male  lit_fema  climate
##   2   2  urban  alfabet  calories  aids  mortalidad  aids_rt  lg_aidsr  b_to_d  fertilidad  cropgrow  lit_male  lit_fema  climate
##   2   3  urban  alfabet  calories  aids  mortalidad  aids_rt  lg_aidsr  b_to_d  fertilidad  cropgrow  lit_male  lit_fema  climate
##   2   4  urban  alfabet  calories  aids  mortalidad  aids_rt  lg_aidsr  b_to_d  fertilidad  cropgrow  lit_male  lit_fema  climate
##   2   5  urban  alfabet  calories  aids  mortalidad  aids_rt  lg_aidsr  b_to_d  fertilidad  cropgrow  lit_male  lit_fema  climate

## Warning: Number of logged events: 1

# Extract a completed data set from the mean-imputed object (default action)
# and check that no column still contains NA.
newdata <- mice::complete(tempData)
colSums(is.na(newdata))

##    country  poblacion   densidad      urban  esp_vidaf  esp_vidam    alfabet 
##          0          0          0          0          0          0          0 
##   inc_pobl   mort_inf    gdp_cap     region   calories       aids  natalidad 
##          0          0          0          0          0          0          0 
## mortalidad    aids_rt    Log_pib   lg_aidsr     b_to_d fertilidad    log_pop 
##          0          0          0          0          0          0          0 
##   cropgrow   lit_male   lit_fema    climate 
##          0          0          0          0

# Scatter plot of cropgrow against climate for the mean-imputed mice object.
xyplot(tempData, cropgrow ~ climate)


# Regression imputation
# `columns` holds every column name, so the full data set is passed to mice.
# Settings: 5 imputed data sets, 1 iteration, method "norm.predict", fixed
# seed, iteration log suppressed.
columns <- names(datosmundo)
imputed_data <- mice(
  datosmundo[, columns],
  m = 5,
  maxit = 1,
  method = "norm.predict",
  seed = 2018,
  printFlag = FALSE
)

## Warning: Number of logged events: 1

# Extract a completed data set from the regression-imputed object and inspect
# its structure.
complete.data <- mice::complete(imputed_data)
str(complete.data)

## 'data.frame':    109 obs. of  25 variables:
##  $ country   : chr  "Afghanistan" "Argentina" "Armenia" "Australia" ...
##  $ poblacion : num  20500 33900 3700 17800 8000 7400 600 125000 256 10300 ...
##  $ densidad  : num  25 12 126 2.3 94 86 828 800 605 50 ...
##  $ urban     : num  18 86 68 85 58 54 83 16 45 65 ...
##  $ esp_vidaf : num  44 75 75 80 79 75 74 53 78 76 ...
##  $ esp_vidam : num  45 68 68 74 73 67 71 53 73 66 ...
##  $ alfabet   : num  29 95 98 100 99 98 77 35 99 99 ...
##  $ inc_pobl  : num  2.8 1.3 1.4 1.4 0.2 1.4 2.4 2.4 0.2 0.3 ...
##  $ mort_inf  : num  168 25.6 27 7.3 6.7 35 25 106 20.3 19 ...
##  $ gdp_cap   : num  205 3408 5000 16848 18396 ...
##  $ region    : num  3 6 5 1 1 5 5 3 6 2 ...
##  $ calories  : num  2104 3113 2831 3216 3495 ...
##  $ aids      : num  0 3904 2 4727 1150 ...
##  $ natalidad : num  53 20 23 15 12 23 29 35 16 13 ...
##  $ mortalidad: num  22 9 6 8 11 7 4 11 8 11 ...
##  $ aids_rt   : num  0 11.52 0.05 26.56 14.38 ...
##  $ Log_pib   : num  2.31 3.53 3.7 4.23 4.26 3.48 3.9 2.31 3.84 3.81 ...
##  $ lg_aidsr  : num  0 1.63 0.56 1.93 1.7 ...
##  $ b_to_d    : num  2.41 2.22 3.83 1.88 1.09 3.29 7.25 3.18 1.9 1.18 ...
##  $ fertilidad: num  6.9 2.8 3.2 1.9 1.5 2.8 4 4.7 1.8 1.9 ...
##  $ log_pop   : num  4.31 4.53 3.57 4.25 3.9 3.87 2.78 5.1 2.41 4.01 ...
##  $ cropgrow  : num  12 9 17 6 17 18 2 67 77 29 ...
##  $ lit_male  : num  44 96 100 100 40 ...
##  $ lit_fema  : num  14 95 100 100 61.6 ...
##  $ climate   : num  3 8 4.81 3 8 ...

# Check that no column still contains NA after regression imputation
colSums(is.na(complete.data))

##    country  poblacion   densidad      urban  esp_vidaf  esp_vidam    alfabet 
##          0          0          0          0          0          0          0 
##   inc_pobl   mort_inf    gdp_cap     region   calories       aids  natalidad 
##          0          0          0          0          0          0          0 
## mortalidad    aids_rt    Log_pib   lg_aidsr     b_to_d fertilidad    log_pop 
##          0          0          0          0          0          0          0 
##   cropgrow   lit_male   lit_fema    climate 
##          0          0          0          0

# IMPUTAR-DATOS.R

# LENOVO

# 2022-04-04