# Raw GitHub URL of the Boston data set in CSV form.
url <- "https://raw.githubusercontent.com/forvis/Labs-2021/main/Datasets/boston.csv"
# Download the CSV and parse it into a data frame.
data <- read.csv(url)
The Boston data frame has 506 rows and 14 columns: 13 predictors and the target variable medv. The variables are described as follows:
- crim: per capita crime rate by town.
- zn: proportion of residential land zoned for lots over 25,000 sq.ft.
- indus: proportion of non-retail business acres per town.
- chas: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
- nox: nitrogen oxides concentration (parts per 10 million).
- rm: average number of rooms per dwelling.
- age: proportion of owner-occupied units built prior to 1940.
- dis: weighted mean of distances to five Boston employment centres.
- rad: index of accessibility to radial highways.
- tax: full-value property-tax rate per $10,000.
- ptratio: pupil-teacher ratio by town.
- black: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
- lstat: lower status of the population (percent).
- medv: median value of owner-occupied homes in $1000s.
# первые 6 строк датасета
print(head(data))
# Drop the X column (presumably the row index stored in the CSV - confirm).
data[["X"]] <- NULL
print(head(data))
# размер датасета - 506 строк, 13 колонок признаков, 1 колонка целевой величины
dim(data)
[1] 506 14
# Список колонок
colnames(data)
[1] "crim" "zn" "indus" "chas" "nox" "rm" "age" "dis" "rad"
[10] "tax" "ptratio" "black" "lstat" "medv"
# Список колонок с типами данных
str(data)
'data.frame': 506 obs. of 14 variables:
$ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
$ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
$ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
$ chas : int 0 0 0 0 0 0 0 0 0 0 ...
$ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
$ rm : num 6.58 6.42 7.18 7 7.15 ...
$ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
$ dis : num 4.09 4.97 4.97 6.06 6.06 ...
$ rad : int 1 2 2 3 3 3 5 5 5 5 ...
$ tax : int 296 242 242 222 222 222 311 311 311 311 ...
$ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
$ black : num 397 397 393 395 397 ...
$ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
$ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Или
sapply(data, class)
crim zn indus chas nox rm age dis rad tax
"numeric" "numeric" "numeric" "integer" "numeric" "numeric" "numeric" "numeric" "integer" "integer"
ptratio black lstat medv
"numeric" "numeric" "numeric" "numeric"
# Проверка на наличие пропущенных значений
any(is.na(data))
[1] FALSE
colSums(is.na(data))
crim zn indus chas nox rm age dis rad tax ptratio black
0 0 0 0 0 0 0 0 0 0 0 0
lstat medv
0 0
# Основные статистические характеристики набора данных
summary(data)
crim zn indus chas nox
Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000 Min. :0.3850
1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000 1st Qu.:0.4490
Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000 Median :0.5380
Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917 Mean :0.5547
3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000 3rd Qu.:0.6240
Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000 Max. :0.8710
rm age dis rad tax ptratio
Min. :3.561 Min. : 2.90 Min. : 1.130 Min. : 1.000 Min. :187.0 Min. :12.60
1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40
Median :6.208 Median : 77.50 Median : 3.207 Median : 5.000 Median :330.0 Median :19.05
Mean :6.285 Mean : 68.57 Mean : 3.795 Mean : 9.549 Mean :408.2 Mean :18.46
3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20
Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.000 Max. :711.0 Max. :22.00
black lstat medv
Min. : 0.32 Min. : 1.73 Min. : 5.00
1st Qu.:375.38 1st Qu.: 6.95 1st Qu.:17.02
Median :391.44 Median :11.36 Median :21.20
Mean :356.67 Mean :12.65 Mean :22.53
3rd Qu.:396.23 3rd Qu.:16.95 3rd Qu.:25.00
Max. :396.90 Max. :37.97 Max. :50.00
# Определим эмпирическое среднее и стандартное отклонение целевого признака
mean(data$medv)
[1] 22.53281
sd(data$medv)
[1] 9.197104
# Attach the tidyverse before the first ggplot() call: it provides ggplot2,
# tidyr and the %>% pipe used below.
library(tidyverse)

# Distribution of the per-capita crime rate. bins = 30 is the default value,
# written out so that ggplot2 does not emit its "pick a better value" message.
ggplot(data, aes(x = crim)) +
  geom_histogram(bins = 30)

# Crime rate against the lower-status share of the population.
ggplot(data, aes(x = lstat, y = crim)) +
  geom_point()

# medv against every other column: reshape to long format (one row per
# observation/column pair) and draw one panel per column with a linear fit.
data %>%
  pivot_longer(cols = -medv, names_to = "key", values_to = "val") %>%
  ggplot(aes(x = val, y = medv)) +
  geom_point() +
  stat_smooth(method = "lm", formula = y ~ x, se = TRUE, colour = "blue") +
  facet_wrap(~key, scales = "free") +
  theme_gray() +
  ggtitle("Scatter plot of dependent variables vs Median Value (medv)")
# Pairwise scatterplots of medv and a selection of predictors.
pairs(
  ~ medv + ptratio + black + lstat + dis + rm + crim,
  data = data,
  main = "Boston Data"
)

# One boxplot per column except the binary chas dummy. The column is excluded
# by name so the selection does not depend on column order.
boxplot_vars <- setdiff(colnames(data), "chas")
# Switch to a 3 x 5 grid and keep the previous settings for restoring later.
old_par <- par(mfrow = c(3, 5))
# invisible() keeps the returned boxplot statistics out of the console.
invisible(Map(
  function(values, var_name) {
    boxplot(values, main = paste("Boxplot of", var_name), xlab = var_name)
  },
  data[boxplot_vars],
  boxplot_vars
))
# Restore the layout so later plots are not drawn into a leftover grid cell.
par(old_par)
X crim zn chas nox rm age dis rad tax ptratio
stats Numeric,5 Numeric,5 Numeric,5 Numeric,5 Numeric,5 Numeric,5 Numeric,5 Numeric,5 Numeric,5 Numeric,5 Numeric,5
n 506 506 506 506 506 506 506 506 506 506 506
conf Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2 Numeric,2
out Numeric,0 Numeric,66 Numeric,68 Numeric,35 Numeric,0 Numeric,30 Numeric,0 Numeric,5 Numeric,0 Numeric,0 Numeric,15
group Numeric,0 Numeric,66 Numeric,68 Numeric,35 Numeric,0 Numeric,30 Numeric,0 Numeric,5 Numeric,0 Numeric,0 Numeric,15
names "" "" "" "" "" "" "" "" "" "" ""
black lstat medv
stats Numeric,5 Numeric,5 Numeric,5
n 506 506 506
conf Numeric,2 Numeric,2 Numeric,2
out Numeric,76 Numeric,6 Numeric,37
group Numeric,76 Numeric,6 Numeric,37
names "" "" ""
# Base-graphics scatterplot of medv against lstat.
plot(medv ~ lstat, data = data)

# Layers shared by the three figures below: crime rate against median value,
# coloured by the chas river dummy.
river_plot <- ggplot(data, aes(x = medv, y = crim, colour = as.factor(chas))) +
  geom_point() +
  ggtitle("Fig 1.Crime, Property Value and River Proximity of Boston Towns")

# Variant 1: black-and-white theme.
river_plot + theme_bw()

# Variant 2: Economist theme, which comes from ggthemes.
library(ggthemes)
river_plot + theme_economist()

# Variant 3: default theme with descriptive axis and legend labels.
river_plot +
  labs(
    x = "Median Property Value (in US Dollars x 1000)",
    y = "Per capita crime rate",
    colour = "Borders the river"
  ) +
  scale_colour_discrete(labels = c("No", "Yes"))
# Create a new data frame, Boston_spm, that contains only three variables from the Boston dataset: crim, medv and lstat.
Boston_spm <- data[, c("crim", "medv", "lstat")]
# Draw the scatterplot matrix of the selected columns with GGally's ggpairs().
library(GGally)
ggpairs(data = Boston_spm)
# Correlation matrix of all columns (cor() uses Pearson by default).
corr_matrix <- cor(x = data)
print(corr_matrix)
crim zn indus chas nox rm age
crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171 -0.21924670 0.35273425
zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371 0.31199059 -0.56953734
indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145 -0.39167585 0.64477851
chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281 0.09125123 0.08651777
nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000 -0.30218819 0.73147010
rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819 1.00000000 -0.24026493
age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010 -0.24026493 1.00000000
dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011 0.20524621 -0.74788054
rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056 -0.20984667 0.45602245
tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320 -0.29204783 0.50645559
ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268 -0.35550149 0.26151501
black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064 0.12806864 -0.27353398
lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892 -0.61380827 0.60233853
medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077 0.69535995 -0.37695457
dis rad tax ptratio black lstat medv
crim -0.37967009 0.625505145 0.58276431 0.2899456 -0.38506394 0.4556215 -0.3883046
zn 0.66440822 -0.311947826 -0.31456332 -0.3916785 0.17552032 -0.4129946 0.3604453
indus -0.70802699 0.595129275 0.72076018 0.3832476 -0.35697654 0.6037997 -0.4837252
chas -0.09917578 -0.007368241 -0.03558652 -0.1215152 0.04878848 -0.0539293 0.1752602
nox -0.76923011 0.611440563 0.66802320 0.1889327 -0.38005064 0.5908789 -0.4273208
rm 0.20524621 -0.209846668 -0.29204783 -0.3555015 0.12806864 -0.6138083 0.6953599
age -0.74788054 0.456022452 0.50645559 0.2615150 -0.27353398 0.6023385 -0.3769546
dis 1.00000000 -0.494587930 -0.53443158 -0.2324705 0.29151167 -0.4969958 0.2499287
rad -0.49458793 1.000000000 0.91022819 0.4647412 -0.44441282 0.4886763 -0.3816262
tax -0.53443158 0.910228189 1.00000000 0.4608530 -0.44180801 0.5439934 -0.4685359
ptratio -0.23247054 0.464741179 0.46085304 1.0000000 -0.17738330 0.3740443 -0.5077867
black 0.29151167 -0.444412816 -0.44180801 -0.1773833 1.00000000 -0.3660869 0.3334608
lstat -0.49699583 0.488676335 0.54399341 0.3740443 -0.36608690 1.0000000 -0.7376627
medv 0.24992873 -0.381626231 -0.46853593 -0.5077867 0.33346082 -0.7376627 1.0000000
library(corrplot)

# Full correlation matrix: default glyphs on the upper triangle, then the
# numeric coefficients on the upper triangle and on the whole matrix.
corrplot(corr_matrix, type = "upper")
corrplot(corr_matrix, type = "upper", method = "number", diag = FALSE)
corrplot(corr_matrix, method = "number", diag = FALSE)

# Correlations among columns 1, 3, 4 and 6 only, computed once and drawn
# first as numbers and then as coloured tiles.
subset_corr <- cor(data[, c(1, 3, 4, 6)])
for (glyph in c("number", "color")) {
  corrplot(subset_corr, method = glyph, diag = FALSE)
}
Можно наблюдать сильную корреляцию между фичей DIS и фичами AGE, NOX, INDUS; имеет смысл оставить только одну из четырех.
#' Min-max rescale a numeric vector to the [0, 1] interval
#'
#' @param x A numeric vector.
#' @param na.rm If `TRUE`, missing values are ignored when the range is
#'   computed and stay `NA` in the result. Defaults to `FALSE`, in which case
#'   any `NA` in `x` makes the whole result `NA`.
#' @return A numeric vector the same length as `x`. An empty input is
#'   returned unchanged, and a vector whose values are all equal maps to
#'   zeros instead of `NaN`.
normalize <- function(x, na.rm = FALSE) {
  if (length(x) == 0L) {
    # min()/max() of an empty vector would warn and return Inf/-Inf.
    return(x)
  }
  bounds <- range(x, na.rm = na.rm)
  span <- bounds[2] - bounds[1]
  if (!is.na(span) && span == 0) {
    # All values are equal: dividing would give 0 / 0 = NaN.
    return(x - bounds[1])
  }
  (x - bounds[1]) / span
}
# Rescale every column of the data set to [0, 1]. lapply() visits a data
# frame column by column and returns a list that as.data.frame() reassembles;
# apply() would need a MARGIN argument and coerces its input to a matrix.
# NOTE(review): the original call referenced an undefined placeholder
# (`df$name`); `data` is assumed to be the intended input - confirm.
as.data.frame(lapply(data, normalize))