- Cargar la base de datos y mostrar la estructura de las variables
# Read the advertising data and keep only columns 2 to 5 (the first column of
# the CSV is dropped); then print the structure of the resulting data frame.
datos <- read.csv("Advertising.csv")[, 2:5]
str(datos)
## 'data.frame': 200 obs. of 4 variables:
## $ TV : num 230.1 44.5 17.2 151.5 180.8 ...
## $ Radio : num 37.8 39.3 45.9 41.3 10.8 48.9 32.8 19.6 2.1 2.6 ...
## $ Newspaper: num 69.2 45.1 69.3 58.5 58.4 75 23.5 11.6 1 21.2 ...
## $ Sales : num 22.1 10.4 9.3 18.5 12.9 7.2 11.8 13.2 4.8 10.6 ...
# Compact preview of datos: row/column counts, each column's type and its
# first few values.
dplyr::glimpse(datos)
## Rows: 200
## Columns: 4
## $ TV <dbl> 230.1, 44.5, 17.2, 151.5, 180.8, 8.7, 57.5, 120.2, 8.6, 199.~
## $ Radio <dbl> 37.8, 39.3, 45.9, 41.3, 10.8, 48.9, 32.8, 19.6, 2.1, 2.6, 5.~
## $ Newspaper <dbl> 69.2, 45.1, 69.3, 58.5, 58.4, 75.0, 23.5, 11.6, 1.0, 21.2, 2~
## $ Sales <dbl> 22.1, 10.4, 9.3, 18.5, 12.9, 7.2, 11.8, 13.2, 4.8, 10.6, 8.6~
# Per-column summary statistics of datos (min, quartiles, median, mean, max).
summary(datos)
## TV Radio Newspaper Sales
## Min. : 0.70 Min. : 0.000 Min. : 0.30 Min. : 1.60
## 1st Qu.: 74.38 1st Qu.: 9.975 1st Qu.: 12.75 1st Qu.:10.38
## Median :149.75 Median :22.900 Median : 25.75 Median :12.90
## Mean :147.04 Mean :23.264 Mean : 30.55 Mean :14.02
## 3rd Qu.:218.82 3rd Qu.:36.525 3rd Qu.: 45.10 3rd Qu.:17.40
## Max. :296.40 Max. :49.600 Max. :114.00 Max. :27.00
- Convertir como máximo el 5% de los valores de las variables Radio y TV a datos perdidos (NA). Guardar la nueva base de datos en un nuevo data frame con el nombre publicidad.
# Build `publicidad`: a copy of `datos` in which at most 5% of the TV values
# and at most 5% of the Radio values are replaced by NA.
# The NAs are written to the copy, so `datos` keeps the complete data; any
# code that needs the missing values must use `publicidad`.
# NOTE(review): no seed is set, so the affected rows differ on every run; call
# set.seed() beforehand if the report has to be reproducible.
n <- nrow(datos)
n_na <- floor(n * 0.05)  # round down so the share never exceeds 5%
publicidad <- datos

# The two draws are independent, so a given row may lose TV, Radio, or both.
# sample.int(n, k) draws k distinct row indices from 1..n.
indicesNA1 <- sample.int(n, n_na)
publicidad$TV[indicesNA1] <- NA
indicesNA2 <- sample.int(n, n_na)
publicidad$Radio[indicesNA2] <- NA

# Summary now also reports the number of NA's in TV and Radio.
summary(publicidad)
## TV Radio Newspaper Sales
## Min. : 0.7 Min. : 0.000 Min. : 0.30 Min. : 1.60
## 1st Qu.: 71.3 1st Qu.: 9.675 1st Qu.: 12.75 1st Qu.:10.38
## Median :148.5 Median :22.400 Median : 25.75 Median :12.90
## Mean :144.4 Mean :23.225 Mean : 30.55 Mean :14.02
## 3rd Qu.:217.5 3rd Qu.:36.575 3rd Qu.: 45.10 3rd Qu.:17.40
## Max. :296.4 Max. :49.600 Max. :114.00 Max. :27.00
## NA's :10 NA's :10
# Count the missing values in each column of publicidad.
# vapply() iterates over the columns directly, so the data frame is not
# coerced to a matrix (as apply() does), and the result is guaranteed to be
# exactly one integer per column.
vapply(publicidad, function(x) sum(is.na(x)), integer(1))
## TV Radio Newspaper Sales
## 10 10 0 0
- Mostrar la proporción de datos perdidos por variable y por registro. Interpretar estos valores.
# Tabulate the missing-data patterns of publicidad. In the printed matrix each
# row is one pattern (1 = observed, 0 = missing); the left margin is the number
# of rows with that pattern, the last column the number of variables missing in
# it, and the bottom row the number of missing values per variable.
# rotate.names = TRUE presumably rotates the variable labels in the
# accompanying plot — confirm against the mice documentation.
mice::md.pattern(publicidad, rotate.names=TRUE)

## Newspaper Sales TV Radio
## 180 1 1 1 1 0
## 10 1 1 1 0 1
## 10 1 1 0 1 1
## 0 0 10 10 20
# Pairwise observed/missing counts for every pair of variables. The result has
# four matrices: $rr (both observed), $rm (row variable observed, column
# variable missing), $mr (row missing, column observed) and $mm (both missing).
mice::md.pairs(publicidad) # r: observed (responded), m: missing
## $rr
## TV Radio Newspaper Sales
## TV 190 180 190 190
## Radio 180 190 190 190
## Newspaper 190 190 200 200
## Sales 190 190 200 200
##
## $rm
## TV Radio Newspaper Sales
## TV 0 10 0 0
## Radio 10 0 0 0
## Newspaper 10 10 0 0
## Sales 10 10 0 0
##
## $mr
## TV Radio Newspaper Sales
## TV 0 10 10 10
## Radio 10 0 10 10
## Newspaper 0 0 0 0
## Sales 0 0 0 0
##
## $mm
## TV Radio Newspaper Sales
## TV 10 0 0 0
## Radio 0 10 0 0
## Newspaper 0 0 0 0
## Sales 0 0 0 0
# Aggregate and plot the missingness of publicidad with VIM::aggr(): the
# printed table lists the proportion of missing values per variable, sorted by
# amount of missingness (sortVars = TRUE). The two ylab entries label the two
# panels of the plot. The result is stored in `sleep_aggr`.
# NOTE(review): the name `sleep_aggr` looks like a leftover from an example on
# a different data set; it is kept so that any later reference still works.
sleep_aggr <- VIM::aggr(
  publicidad,
  col = mice::mdc(1:2),
  numbers = TRUE,
  sortVars = TRUE,
  labels = colnames(publicidad),
  cex.axis = 0.7,
  gap = 3,
  ylab = c("Proporción de Pérdida", "Patrón de Pérdida")
)

##
## Variables sorted by number of missings:
## Variable Count
## TV 0.05
## Radio 0.05
## Newspaper 0.00
## Sales 0.00