# Read the housing CSV into a data frame, keeping text columns as character
# vectors, then print per-column summary statistics (columns with gaps get an
# "NA's" row in the summary).
housing.data <- read.csv(
  file = "C:/Users/LUIS 1/Desktop/MachineLearningR/data/t1/housing-with-missing-value.csv",
  stringsAsFactors = FALSE,
  header = TRUE
)
summary(housing.data)
## X crim zn indus
## Min. : 1.0 Min. : 0.00632 Min. : 0.00 Min. : 0.46
## 1st Qu.:127.2 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19
## Median :253.5 Median : 0.25651 Median : 0.00 Median : 9.69
## Mean :253.5 Mean : 3.61352 Mean : 11.36 Mean :11.14
## 3rd Qu.:379.8 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10
## Max. :506.0 Max. :88.97620 Max. :100.00 Max. :27.74
##
## chas nox rm age
## Min. :0.00000 Min. :0.3850 Min. :3.561 Min. : 2.90
## 1st Qu.:0.00000 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02
## Median :0.00000 Median :0.5380 Median :6.208 Median : 77.50
## Mean :0.06917 Mean :0.5547 Mean :6.285 Mean : 68.57
## 3rd Qu.:0.00000 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08
## Max. :1.00000 Max. :0.8710 Max. :8.780 Max. :100.00
##
## dis rad tax ptratio
## Min. : 1.130 Min. : 1.000 Min. :187.0 Min. :12.60
## 1st Qu.: 2.100 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40
## Median : 3.207 Median : 5.000 Median :330.0 Median :19.10
## Mean : 3.795 Mean : 9.515 Mean :408.2 Mean :18.47
## 3rd Qu.: 5.188 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20
## Max. :12.127 Max. :24.000 Max. :711.0 Max. :22.00
## NA's :40 NA's :40
## b lstat medv
## Min. : 0.32 Min. : 1.73 Min. : 5.00
## 1st Qu.:375.38 1st Qu.: 6.95 1st Qu.:17.02
## Median :391.44 Median :11.36 Median :21.20
## Mean :356.67 Mean :12.65 Mean :22.53
## 3rd Qu.:396.23 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :396.90 Max. :37.97 Max. :50.00
##
# Compact structural overview: dimensions plus the type and first values of
# every column.
utils::str(housing.data)
## 'data.frame': 506 obs. of 15 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : int 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Peek at the first six rows of the raw data.
utils::head(housing.data, n = 6L)
## X crim zn indus chas nox rm age dis rad tax ptratio b lstat
## 1 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# Strategy 1 - listwise deletion: discard every row that has an NA in any
# column, then summarise what is left.
housing.data.1 <- stats::na.omit(housing.data)
summary(housing.data.1)
## X crim zn indus
## Min. : 1.0 Min. : 0.00632 Min. : 0.00 Min. : 0.46
## 1st Qu.:120.5 1st Qu.: 0.07373 1st Qu.: 0.00 1st Qu.: 5.13
## Median :252.0 Median : 0.25356 Median : 0.00 Median : 8.56
## Mean :251.4 Mean : 3.66428 Mean : 11.79 Mean :11.03
## 3rd Qu.:381.5 3rd Qu.: 3.69503 3rd Qu.: 15.00 3rd Qu.:18.10
## Max. :506.0 Max. :88.97620 Max. :100.00 Max. :27.74
## chas nox rm age
## Min. :0.00000 Min. :0.3850 Min. :3.561 Min. : 2.90
## 1st Qu.:0.00000 1st Qu.:0.4480 1st Qu.:5.886 1st Qu.: 45.25
## Median :0.00000 Median :0.5380 Median :6.195 Median : 76.70
## Mean :0.06729 Mean :0.5529 Mean :6.277 Mean : 68.51
## 3rd Qu.:0.00000 3rd Qu.:0.6240 3rd Qu.:6.630 3rd Qu.: 94.30
## Max. :1.00000 Max. :0.8710 Max. :8.780 Max. :100.00
## dis rad tax ptratio
## Min. : 1.130 Min. : 1.000 Min. :187.0 Min. :12.60
## 1st Qu.: 2.083 1st Qu.: 4.000 1st Qu.:278.0 1st Qu.:17.40
## Median : 3.360 Median : 5.000 Median :330.0 Median :19.10
## Mean : 3.861 Mean : 9.599 Mean :407.8 Mean :18.47
## 3rd Qu.: 5.287 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20
## Max. :12.127 Max. :24.000 Max. :711.0 Max. :22.00
## b lstat medv
## Min. : 0.32 Min. : 1.73 Min. : 5.00
## 1st Qu.:374.96 1st Qu.: 6.91 1st Qu.:16.60
## Median :391.45 Median :11.41 Median :21.10
## Mean :357.24 Mean :12.76 Mean :22.38
## 3rd Qu.:396.25 3rd Qu.:17.14 3rd Qu.:25.00
## Max. :396.90 Max. :37.97 Max. :50.00
# Strategy 2 - selective deletion: a row is kept when it is complete on every
# column EXCEPT those named in `drop_na`, so NAs in `rad` are tolerated and
# remain in the result.
drop_na <- c("rad")
housing.data.2 <- housing.data[
  complete.cases(housing.data[, setdiff(names(housing.data), drop_na)]),
]
summary(housing.data.2)
## X crim zn indus
## Min. : 1.0 Min. : 0.00632 Min. : 0.00 Min. : 0.46
## 1st Qu.:126.2 1st Qu.: 0.07880 1st Qu.: 0.00 1st Qu.: 5.13
## Median :254.5 Median : 0.25651 Median : 0.00 Median : 8.56
## Mean :253.5 Mean : 3.73046 Mean : 11.53 Mean :11.06
## 3rd Qu.:380.8 3rd Qu.: 3.68939 3rd Qu.: 12.50 3rd Qu.:18.10
## Max. :506.0 Max. :88.97620 Max. :100.00 Max. :27.74
##
## chas nox rm age
## Min. :0.00000 Min. :0.3850 Min. :3.561 Min. : 2.90
## 1st Qu.:0.00000 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.62
## Median :0.00000 Median :0.5380 Median :6.211 Median : 76.80
## Mean :0.06438 Mean :0.5536 Mean :6.284 Mean : 68.72
## 3rd Qu.:0.00000 3rd Qu.:0.6240 3rd Qu.:6.630 3rd Qu.: 94.38
## Max. :1.00000 Max. :0.8710 Max. :8.780 Max. :100.00
##
## dis rad tax ptratio
## Min. : 1.130 Min. : 1.000 Min. :187.0 Min. :12.60
## 1st Qu.: 2.091 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40
## Median : 3.299 Median : 5.000 Median :330.0 Median :19.10
## Mean : 3.824 Mean : 9.599 Mean :408.7 Mean :18.47
## 3rd Qu.: 5.215 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20
## Max. :12.127 Max. :24.000 Max. :711.0 Max. :22.00
## NA's :35
## b lstat medv
## Min. : 0.32 Min. : 1.730 Min. : 5.00
## 1st Qu.:375.24 1st Qu.: 6.923 1st Qu.:16.73
## Median :391.38 Median :11.235 Median :21.20
## Mean :358.05 Mean :12.662 Mean :22.53
## 3rd Qu.:396.23 3rd Qu.:17.043 3rd Qu.:25.00
## Max. :396.90 Max. :37.970 Max. :50.00
##
# Strategy 3 - drop a whole column by assigning NULL to it.
# The deletion is performed on a copy so that `housing.data` keeps its `rad`
# column: removing it from the master data frame would turn the later
# `impute(...$rad, ...)` calls into silent no-ops and would hide the missing
# `rad` values from the missing-data diagnostics further down.
housing.data.no.rad <- housing.data
housing.data.no.rad$rad <- NULL
summary(housing.data.no.rad)
## X crim zn indus
## Min. : 1.0 Min. : 0.00632 Min. : 0.00 Min. : 0.46
## 1st Qu.:127.2 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19
## Median :253.5 Median : 0.25651 Median : 0.00 Median : 9.69
## Mean :253.5 Mean : 3.61352 Mean : 11.36 Mean :11.14
## 3rd Qu.:379.8 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10
## Max. :506.0 Max. :88.97620 Max. :100.00 Max. :27.74
##
## chas nox rm age
## Min. :0.00000 Min. :0.3850 Min. :3.561 Min. : 2.90
## 1st Qu.:0.00000 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02
## Median :0.00000 Median :0.5380 Median :6.208 Median : 77.50
## Mean :0.06917 Mean :0.5547 Mean :6.285 Mean : 68.57
## 3rd Qu.:0.00000 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08
## Max. :1.00000 Max. :0.8710 Max. :8.780 Max. :100.00
##
## dis tax ptratio b
## Min. : 1.130 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 2.100 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 3.207 Median :330.0 Median :19.10 Median :391.44
## Mean : 3.795 Mean :408.2 Mean :18.47 Mean :356.67
## 3rd Qu.: 5.188 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :12.127 Max. :711.0 Max. :22.00 Max. :396.90
## NA's :40
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
##
# Drop several columns at once by name: keep every column whose name is not
# listed in `drops` (names in `drops` that are not present are simply ignored).
drops <- c("rad", "ptratio")
housing.data.3 <- housing.data[, setdiff(names(housing.data), drops)]
summary(housing.data.3)
## X crim zn indus
## Min. : 1.0 Min. : 0.00632 Min. : 0.00 Min. : 0.46
## 1st Qu.:127.2 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19
## Median :253.5 Median : 0.25651 Median : 0.00 Median : 9.69
## Mean :253.5 Mean : 3.61352 Mean : 11.36 Mean :11.14
## 3rd Qu.:379.8 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10
## Max. :506.0 Max. :88.97620 Max. :100.00 Max. :27.74
## chas nox rm age
## Min. :0.00000 Min. :0.3850 Min. :3.561 Min. : 2.90
## 1st Qu.:0.00000 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02
## Median :0.00000 Median :0.5380 Median :6.208 Median : 77.50
## Mean :0.06917 Mean :0.5547 Mean :6.285 Mean : 68.57
## 3rd Qu.:0.00000 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08
## Max. :1.00000 Max. :0.8710 Max. :8.780 Max. :100.00
## dis tax b lstat
## Min. : 1.130 Min. :187.0 Min. : 0.32 Min. : 1.73
## 1st Qu.: 2.100 1st Qu.:279.0 1st Qu.:375.38 1st Qu.: 6.95
## Median : 3.207 Median :330.0 Median :391.44 Median :11.36
## Mean : 3.795 Mean :408.2 Mean :356.67 Mean :12.65
## 3rd Qu.: 5.188 3rd Qu.:666.0 3rd Qu.:396.23 3rd Qu.:16.95
## Max. :12.127 Max. :711.0 Max. :396.90 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.1.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# Strategy 4a - mean imputation with Hmisc::impute on a working copy.
# NOTE(review): when the source data frame has no `rad` column, the `rad` line
# appears to be a no-op (the recorded output reports only the `ptratio`
# imputation) - confirm `rad` is still present at this point.
housing.data.copy1 <- housing.data
housing.data.copy1[["ptratio"]] <- impute(
  housing.data.copy1[["ptratio"]],
  fun = mean
)
housing.data.copy1[["rad"]] <- impute(housing.data.copy1[["rad"]], fun = mean)
summary(housing.data.copy1)
##
## 40 values imputed to 18.4676
## X crim zn indus
## Min. : 1.0 Min. : 0.00632 Min. : 0.00 Min. : 0.46
## 1st Qu.:127.2 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19
## Median :253.5 Median : 0.25651 Median : 0.00 Median : 9.69
## Mean :253.5 Mean : 3.61352 Mean : 11.36 Mean :11.14
## 3rd Qu.:379.8 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10
## Max. :506.0 Max. :88.97620 Max. :100.00 Max. :27.74
## chas nox rm age
## Min. :0.00000 Min. :0.3850 Min. :3.561 Min. : 2.90
## 1st Qu.:0.00000 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02
## Median :0.00000 Median :0.5380 Median :6.208 Median : 77.50
## Mean :0.06917 Mean :0.5547 Mean :6.285 Mean : 68.57
## 3rd Qu.:0.00000 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08
## Max. :1.00000 Max. :0.8710 Max. :8.780 Max. :100.00
## dis tax ptratio b
## Min. : 1.130 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 2.100 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 3.207 Median :330.0 Median :18.60 Median :391.44
## Mean : 3.795 Mean :408.2 Mean :18.47 Mean :356.67
## 3rd Qu.: 5.188 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :12.127 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
# Strategy 4b - median imputation with Hmisc::impute on a working copy.
# NOTE(review): when the source data frame has no `rad` column, the `rad` line
# appears to be a no-op (the recorded output reports only the `ptratio`
# imputation) - confirm `rad` is still present at this point.
housing.data.copy2 <- housing.data
housing.data.copy2[["ptratio"]] <- impute(
  housing.data.copy2[["ptratio"]],
  fun = median
)
housing.data.copy2[["rad"]] <- impute(housing.data.copy2[["rad"]], fun = median)
summary(housing.data.copy2)
##
## 40 values imputed to 19.1
## X crim zn indus
## Min. : 1.0 Min. : 0.00632 Min. : 0.00 Min. : 0.46
## 1st Qu.:127.2 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19
## Median :253.5 Median : 0.25651 Median : 0.00 Median : 9.69
## Mean :253.5 Mean : 3.61352 Mean : 11.36 Mean :11.14
## 3rd Qu.:379.8 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10
## Max. :506.0 Max. :88.97620 Max. :100.00 Max. :27.74
## chas nox rm age
## Min. :0.00000 Min. :0.3850 Min. :3.561 Min. : 2.90
## 1st Qu.:0.00000 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02
## Median :0.00000 Median :0.5380 Median :6.208 Median : 77.50
## Mean :0.06917 Mean :0.5547 Mean :6.285 Mean : 68.57
## 3rd Qu.:0.00000 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08
## Max. :1.00000 Max. :0.8710 Max. :8.780 Max. :100.00
## dis tax ptratio b
## Min. : 1.130 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 2.100 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 3.207 Median :330.0 Median :19.10 Median :391.44
## Mean : 3.795 Mean :408.2 Mean :18.52 Mean :356.67
## 3rd Qu.: 5.188 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :12.127 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
# Strategy 4c - constant imputation: fill missing `ptratio` with 18 and missing
# `rad` with 7, on a working copy.
# NOTE(review): when the source data frame has no `rad` column, the `rad` line
# appears to be a no-op (the recorded output reports only the `ptratio`
# imputation) - confirm `rad` is still present at this point.
housing.data.copy3 <- housing.data
housing.data.copy3[["ptratio"]] <- impute(
  housing.data.copy3[["ptratio"]],
  fun = 18
)
housing.data.copy3[["rad"]] <- impute(housing.data.copy3[["rad"]], fun = 7)
summary(housing.data.copy3)
##
## 40 values imputed to 18
## X crim zn indus
## Min. : 1.0 Min. : 0.00632 Min. : 0.00 Min. : 0.46
## 1st Qu.:127.2 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19
## Median :253.5 Median : 0.25651 Median : 0.00 Median : 9.69
## Mean :253.5 Mean : 3.61352 Mean : 11.36 Mean :11.14
## 3rd Qu.:379.8 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10
## Max. :506.0 Max. :88.97620 Max. :100.00 Max. :27.74
## chas nox rm age
## Min. :0.00000 Min. :0.3850 Min. :3.561 Min. : 2.90
## 1st Qu.:0.00000 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02
## Median :0.00000 Median :0.5380 Median :6.208 Median : 77.50
## Mean :0.06917 Mean :0.5547 Mean :6.285 Mean : 68.57
## 3rd Qu.:0.00000 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08
## Max. :1.00000 Max. :0.8710 Max. :8.780 Max. :100.00
## dis tax ptratio b
## Min. : 1.130 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 2.100 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 3.207 Median :330.0 Median :18.60 Median :391.44
## Mean : 3.795 Mean :408.2 Mean :18.43 Mean :356.67
## 3rd Qu.: 5.188 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :12.127 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
library(mice)
## Warning: package 'mice' was built under R version 4.1.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
# Tabulate the missing-data patterns: one row per distinct pattern
# (1 = observed, 0 = missing) with its row count, and per-column NA totals in
# the last row.
mice::md.pattern(housing.data)
## X crim zn indus chas nox rm age dis tax b lstat medv ptratio
## 466 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
## 40 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
## 0 0 0 0 0 0 0 0 0 0 0 0 0 40 40
library(VIM)
## Warning: package 'VIM' was built under R version 4.1.3
## Loading required package: colorspace
## Warning: package 'colorspace' was built under R version 4.1.3
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Plot the share of missing values per variable next to the combinations in
# which they occur; variables are sorted by their amount of missingness and
# the sorted table is printed to the console.
aggr(
  x = housing.data,
  sortVars = TRUE,
  numbers = TRUE,
  col = c("green", "red"),
  labels = names(housing.data),
  ylab = c("Histograma de NAs", "Patrón"),
  cex.axis = 0.75,
  gap = 1
)
##
## Variables sorted by number of missings:
## Variable Count
## ptratio 0.07905138
## X 0.00000000
## crim 0.00000000
## zn 0.00000000
## indus 0.00000000
## chas 0.00000000
## nox 0.00000000
## rm 0.00000000
## age 0.00000000
## dis 0.00000000
## tax 0.00000000
## b 0.00000000
## lstat 0.00000000
## medv 0.00000000