Cargando los datos
# Load the raw survey data; empty fields are read as NA (na.strings = "").
# NOTE(review): the path is absolute and machine-specific -- consider a
# project-relative path so the script runs on other machines.
data <- read.csv(
  file = "C:/Users/LUIS 1/Desktop/MachineLearningR/data/t1/missing-data.csv",
  na.strings = ""
)
# Inspect the column types and the first values of each variable.
str(object = data)
## 'data.frame': 27 obs. of 3 variables:
## $ Income : int 89800 47500 45000 44700 59500 NA 63300 52900 78200 145100 ...
## $ Phone_type: chr "Android" "Android" "iPhone" NA ...
## $ Car_type : chr "Luxury" "Non-Luxury" "Luxury" "Luxury" ...
# Show the first 22 rows so the NA and zero entries are visible.
head(data, n = 22L)
## Income Phone_type Car_type
## 1 89800 Android Luxury
## 2 47500 Android Non-Luxury
## 3 45000 iPhone Luxury
## 4 44700 <NA> Luxury
## 5 59500 iPhone Luxury
## 6 NA Android Non-Luxury
## 7 63300 iPhone Non-Luxury
## 8 52900 Android Luxury
## 9 78200 Android Luxury
## 10 145100 iPhone Luxury
## 11 88600 iPhone Non-Luxury
## 12 65600 iPhone Luxury
## 13 NA Android Non-Luxury
## 14 94600 Android Luxury
## 15 59400 iPhone Luxury
## 16 47300 iPhone Non-Luxury
## 17 72100 <NA> Luxury
## 18 0 iPhone Non-Luxury
## 19 0 Android Luxury
## 20 83000 iPhone Luxury
## 21 64100 Android Non-Luxury
## 22 42100 iPhone Non-Luxury
# Drop every row that has an NA in any column.
data.cleaned <- na.omit(object = data)
# Preview the first six surviving rows (original row names are kept).
head(data.cleaned, n = 6L)
## Income Phone_type Car_type
## 1 89800 Android Luxury
## 2 47500 Android Non-Luxury
## 3 45000 iPhone Luxury
## 5 59500 iPhone Luxury
## 7 63300 iPhone Non-Luxury
## 8 52900 Android Luxury
# Row 4 has a missing phone type ...
is.na(data[4, "Phone_type"])
## [1] TRUE
# ... but its income is present.
is.na(data[4, "Income"])
## [1] FALSE
# Element-wise NA mask for the whole Income column.
is.na(data[["Income"]])
## [1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE
Eliminar las filas con NA únicamente en la variable Income
# Keep only the rows whose Income is present; NAs in the other columns
# (e.g. Phone_type in rows 4 and 17) are left in place.
data.income.cleaned <- data[!is.na(data[["Income"]]), , drop = FALSE]
head(data.income.cleaned, n = 22L)
## Income Phone_type Car_type
## 1 89800 Android Luxury
## 2 47500 Android Non-Luxury
## 3 45000 iPhone Luxury
## 4 44700 <NA> Luxury
## 5 59500 iPhone Luxury
## 7 63300 iPhone Non-Luxury
## 8 52900 Android Luxury
## 9 78200 Android Luxury
## 10 145100 iPhone Luxury
## 11 88600 iPhone Non-Luxury
## 12 65600 iPhone Luxury
## 14 94600 Android Luxury
## 15 59400 iPhone Luxury
## 16 47300 iPhone Non-Luxury
## 17 72100 <NA> Luxury
## 18 0 iPhone Non-Luxury
## 19 0 Android Luxury
## 20 83000 iPhone Luxury
## 21 64100 Android Non-Luxury
## 22 42100 iPhone Non-Luxury
## 23 0 iPhone Luxury
## 24 91500 iPhone Non-Luxury
Filas completas para un data frame
# Logical mask: TRUE for rows with no NA in any column.
complete.cases(data)
## [1] TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [13] FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [25] TRUE TRUE TRUE
# Use the mask to keep only the fully observed rows.
data.cleaned.2 <- data[complete.cases(data), , drop = FALSE]
head(data.cleaned.2, n = 6L)
## Income Phone_type Car_type
## 1 89800 Android Luxury
## 2 47500 Android Non-Luxury
## 3 45000 iPhone Luxury
## 5 59500 iPhone Luxury
## 7 63300 iPhone Non-Luxury
## 8 52900 Android Luxury
Convertir los ceros de ingresos en NA
# Recode zero incomes as missing values.
# Income already contains NA, so `data$Income == 0` would yield NA at those
# positions and put NA into the logical subscript. which() returns only the
# positions where the comparison is TRUE, so the index never contains NA and
# the assignment is a no-op when there are no zeros.
# The statement is not wrapped in parentheses: printing the value of the
# assignment only echoes the replacement value (NA), which is not informative.
data$Income[which(data$Income == 0)] <- NA
# Rows 18 and 19 (previously 0) now show NA alongside the original NAs.
head(data$Income, 22)
## [1] 89800 47500 45000 44700 59500 NA 63300 52900 78200 145100
## [11] 88600 65600 NA 94600 59400 47300 72100 NA NA 83000
## [21] 64100 42100
Medidas de centralización y dispersión
# Income contains NA, so the plain mean propagates NA ...
mean(data[["Income"]])
## [1] NA
# ... unless the missing values are removed first.
mean(data[["Income"]], na.rm = TRUE)
## [1] 65763.64
# The standard deviation behaves the same way with respect to NA.
sd(data[["Income"]])
## [1] NA
sd(data[["Income"]], na.rm = TRUE)
## [1] 26715.87