Проведемразведочный анализ наших даннных, а именно Boston методом главных компонент.
names(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
apply(Boston, 2, mean)
## crim zn indus chas nox
## 3.61352356 11.36363636 11.13677866 0.06916996 0.55469506
## rm age dis rad tax
## 6.28463439 68.57490119 3.79504269 9.54940711 408.23715415
## ptratio black lstat medv
## 18.45553360 356.67403162 12.65306324 22.53280632
apply(Boston, 2, var)
## crim zn indus chas nox
## 7.398658e+01 5.439368e+02 4.706444e+01 6.451297e-02 1.342764e-02
## rm age dis rad tax
## 4.936709e-01 7.923584e+02 4.434015e+00 7.581637e+01 2.840476e+04
## ptratio black lstat medv
## 4.686989e+00 8.334752e+03 5.099476e+01 8.458672e+01
pr.out=prcomp(Boston, scale=TRUE)
names(pr.out)
## [1] "sdev" "rotation" "center" "scale" "x"
pr.out$center
## crim zn indus chas nox
## 3.61352356 11.36363636 11.13677866 0.06916996 0.55469506
## rm age dis rad tax
## 6.28463439 68.57490119 3.79504269 9.54940711 408.23715415
## ptratio black lstat medv
## 18.45553360 356.67403162 12.65306324 22.53280632
pr.out$scale
## crim zn indus chas nox rm
## 8.6015451 23.3224530 6.8603529 0.2539940 0.1158777 0.7026171
## age dis rad tax ptratio black
## 28.1488614 2.1057101 8.7072594 168.5371161 2.1649455 91.2948644
## lstat medv
## 7.1410615 9.1971041
pr.out$rotation
## PC1 PC2 PC3 PC4 PC5
## crim 0.242284451 -0.065873108 0.395077419 -0.100366211 0.004957659
## zn -0.245435005 -0.148002653 0.394545713 -0.342958421 0.114495002
## indus 0.331859746 0.127075668 -0.066081913 0.009626936 -0.022583692
## chas -0.005027133 0.410668763 -0.125305293 -0.700406497 -0.535197817
## nox 0.325193880 0.254276363 -0.046475549 -0.053707583 0.194605570
## rm -0.202816554 0.434005810 0.353406095 0.293357309 -0.008320481
## age 0.296976574 0.260303205 -0.200823078 0.078426326 0.149750092
## dis -0.298169809 -0.359149977 0.157068710 -0.184747787 -0.106219480
## rad 0.303412754 0.031149596 0.418510334 0.051374381 -0.230352185
## tax 0.324033052 0.008851406 0.343232194 0.026810695 -0.163425820
## ptratio 0.207679535 -0.314623061 0.000399092 0.342036328 -0.615707380
## black -0.196638358 0.026481032 -0.361375914 0.201741185 -0.367460674
## lstat 0.311397955 -0.201245177 -0.161060336 -0.242621217 0.178358870
## medv -0.266636396 0.444924411 0.163188735 0.180297553 -0.050659893
## PC6 PC7 PC8 PC9 PC10
## crim -0.22462703 0.777083366 -0.15740140 0.254211798 -0.071384615
## zn -0.33574694 -0.274178365 0.38031404 0.382899480 0.245579673
## indus -0.08082495 -0.340273839 -0.17174578 0.627048264 -0.254827026
## chas 0.16264906 0.074075775 0.03292700 -0.018642967 -0.041706916
## nox -0.14899191 -0.198092965 -0.04745838 -0.043024391 -0.211620349
## rm 0.13108056 0.074084938 0.43761566 -0.003666947 -0.526133916
## age -0.06086960 0.118580363 0.58810569 -0.043265822 0.245647942
## dis 0.01162335 -0.104397844 0.12823060 -0.175802196 -0.299412026
## rad -0.13493732 -0.137080107 -0.07464872 -0.463439397 0.115793486
## tax -0.18847146 -0.313984433 -0.07099212 -0.179446555 -0.008366413
## ptratio 0.27901731 0.001485608 0.28346960 0.274525949 0.160474164
## black -0.78590728 0.074842780 0.04444175 -0.060975651 -0.146292237
## lstat -0.09197211 0.083213083 0.35748247 -0.171810921 0.066647267
## medv -0.05402804 -0.009964973 -0.15230879 0.070751083 0.575547284
## PC11 PC12 PC13 PC14
## crim -0.071068781 0.06327612 0.097032312 0.059114176
## zn -0.127709065 -0.22112210 -0.132375830 -0.096296807
## indus 0.273797614 0.34840828 0.083716854 -0.235472877
## chas -0.009968402 -0.01903975 -0.049917454 0.023488966
## nox -0.437475550 -0.44909357 0.524974687 0.087649148
## rm 0.223951923 -0.12560554 -0.049893596 0.007190515
## age -0.329630928 0.48633905 -0.051462562 -0.038227027
## dis -0.114600078 0.49356822 0.552292172 0.047124029
## rad 0.042213348 0.01863641 -0.006278474 -0.634975332
## tax 0.042794054 0.17042179 -0.242987756 0.698822190
## ptratio -0.099991841 -0.23214842 0.188347079 0.055738160
## black 0.039194858 -0.04152885 -0.021078199 -0.016165280
## lstat 0.683032690 -0.18189209 0.249489863 0.083143795
## medv 0.242001064 0.09828580 0.469629324 0.134127182
dim(pr.out$x)
## [1] 506 14
biplot(pr.out, scale=0)
pr.out$rotation=-pr.out$rotation
pr.out$x=-pr.out$x
biplot(pr.out, scale=0)
pr.out$sdev
## [1] 2.5585132 1.2843410 1.1614241 0.9415625 0.9224421 0.8124105 0.7317177
## [8] 0.6348831 0.5265582 0.5022524 0.4612919 0.4277704 0.3660733 0.2456149
pr.var=pr.out$sdev^2
pr.var
## [1] 6.54598958 1.64953191 1.34890592 0.88653987 0.85089944 0.66001077
## [7] 0.53541080 0.40307658 0.27726358 0.25225744 0.21279025 0.18298750
## [13] 0.13400970 0.06032666
pve=pr.var/sum(pr.var)
pve
## [1] 0.467570684 0.117823708 0.096350423 0.063324276 0.060778531
## [6] 0.047143627 0.038243629 0.028791184 0.019804542 0.018018389
## [11] 0.015199304 0.013070536 0.009572121 0.004309047
plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained", ylim=c(0,1),type='b')
plot(cumsum(pve), xlab="Principal Component", ylab="Cumulative Proportion of Variance Explained", ylim=c(0,1),type='b')
Из графиков можно сделать вывод, что у нас останется 2, максимум 4 главные компоненты. Суммарно они объясняют от 60 до 70% суммарной дисперсии факторов.
Выполним кластеризацию методом k-средних.
set.seed(2)
km.out=kmeans(Boston,2,nstart=20)
km.out$cluster
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504
## 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
## 505 506
## 1 1
plot(Boston, col=(km.out$cluster+1), main="K-Means Clustering Results with K=2", pch=20, cex=2)
set.seed(4)
km.out=kmeans(Boston,3,nstart=20)
km.out
## K-means clustering with 3 clusters of sizes 102, 38, 366
##
## Cluster means:
## crim zn indus chas nox rm age
## 1 10.9105113 0.00000 18.572549 0.07843137 0.6712255 5.982265 89.91373
## 2 15.2190382 0.00000 17.926842 0.02631579 0.6737105 6.065500 89.90526
## 3 0.3749927 15.71038 8.359536 0.07103825 0.5098626 6.391653 60.41339
## dis rad tax ptratio black lstat medv
## 1 2.077164 23.01961 668.2059 20.19510 371.80304 17.87402 17.42941
## 2 1.994429 22.50000 644.7368 19.92895 57.78632 20.44868 13.12632
## 3 4.460745 4.45082 311.2322 17.81776 383.48981 10.38866 24.93169
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## 3 3 3 3 3 3 3 3 3 3 3 2 2 3 3 3 3 3
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
## 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1
## 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414
## 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
## 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432
## 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2 2 2 2
## 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
## 2 2 2 2 2 2 2 1 1 1 1 1 1 2 1 1 1 1
## 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468
## 2 1 1 1 2 2 2 2 1 1 1 1 1 1 1 1 2 1
## 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504
## 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3
## 505 506
## 3 3
##
## Within cluster sum of squares by cluster:
## [1] 181891.7 313208.7 2573399.1
## (between_SS / total_SS = 84.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
plot(Boston, col=(km.out$cluster+1), main="K-Means Clustering Results with K=3", pch=20, cex=2)
set.seed(3)
km.out=kmeans(Boston,2,nstart=1)
km.out$tot.withinss
## [1] 5764994
km.out=kmeans(Boston,3,nstart=1)
km.out$tot.withinss
## [1] 4460944
km.out=kmeans(Boston,3,nstart=20)
km.out$tot.withinss
## [1] 3068499
По результатам кластерного анализа методом K-средних можно сделать вывод, что наиболее оптимальным является разбиение на 3 кластера при выборе 20 случайных наблюдений в качестве ядер. Такой вывод мы делаем при подсчете внутриклассовой дисперсии. Чем она меньше, тем лучше качество разбиения.
Теперь выполним иерархическую кластеризацию.
hc.complete=hclust(dist(Boston), method="complete")
hc.average=hclust(dist(Boston), method="average")
hc.single=hclust(dist(Boston), method="single")
par(mfrow=c(1,3))
plot(hc.complete,main="Complete Linkage", xlab="", sub="", cex=.9)
plot(hc.average, main="Average Linkage", xlab="", sub="", cex=.9)
plot(hc.single, main="Single Linkage", xlab="", sub="", cex=.9)
par(mfrow=c(1,1))
cutree(hc.complete, 2)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## 1 2 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504
## 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
## 505 506
## 1 1
cutree(hc.average, 2)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## 1 2 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504
## 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
## 505 506
## 1 1
cutree(hc.single, 2)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504
## 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
## 505 506
## 1 1
cutree(hc.single, 4)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## 1 1 1 1 1 1 1 1 1 1 1 3 3 1 1 1 1 1
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 4 4
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504
## 4 4 4 4 4 4 4 1 1 1 1 1 1 1 1 1 1 1
## 505 506
## 1 1
xsc=scale(Boston)
plot(hclust(dist(xsc), method="complete"), main="Hierarchical Clustering with Scaled Features")
Построим модели. Сначала, с помощью метода главных компонент.
set.seed(2)
pcr.fit <- pcr(crim ~ ., data = Boston, scale = T, validation = 'CV')
summary(pcr.fit)
## Data: X dimension: 506 13
## Y dimension: 506 1
## Fit method: svdpc
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.61 7.207 7.211 6.779 6.772 6.786 6.815
## adjCV 8.61 7.204 7.208 6.771 6.765 6.780 6.807
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 6.816 6.676 6.707 6.705 6.716 6.677 6.598
## adjCV 6.807 6.667 6.696 6.694 6.704 6.664 6.585
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 47.70 60.36 69.67 76.45 82.99 88.00 91.14
## crim 30.69 30.87 39.27 39.61 39.61 39.86 40.14
## 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## X 93.45 95.40 97.04 98.46 99.52 100.0
## crim 42.47 42.55 42.78 43.04 44.13 45.4
y11 <- predict(pcr.fit)
mse <- round(mean((Boston$crim-y11)^2))
mse
## [1] 44
validationplot(pcr.fit, val.type = 'MSEP')
Выведем отдельно каждый из кластеров.
km.out=kmeans(Boston,3,nstart=100)
data1 <- data.table(Boston, km.out$cluster)
str(data1)
## Classes 'data.table' and 'data.frame': 506 obs. of 15 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
## $ V2 : int 3 3 3 3 3 3 3 3 3 3 ...
## - attr(*, ".internal.selfref")=<externalptr>
clust1 <- data1[V2==1, -c("rad","tax", "ptratio","zn")]
clust2 <- data1[V2==2, -c("rad","tax", "ptratio","zn")]
clust3 <- data1[V2==3, -c("rad","tax", "ptratio","zn")]
Построим регрессию на каждом кластере.
model1 <- glm(crim ~ age+dis+medv , data = clust1)
summary(model1)
##
## Call:
## glm(formula = crim ~ age + dis + medv, data = clust1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -16.392 -4.118 -0.721 2.224 68.671
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65.2607 16.0522 4.066 9.68e-05 ***
## age -0.2800 0.1258 -2.226 0.028328 *
## dis -10.2536 2.4416 -4.200 5.90e-05 ***
## medv -0.4520 0.1173 -3.854 0.000207 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 111.0538)
##
## Null deviance: 14838 on 101 degrees of freedom
## Residual deviance: 10883 on 98 degrees of freedom
## AIC: 775.8
##
## Number of Fisher Scoring iterations: 2
model2 <- lm(crim ~ nox+age+dis+medv, data = clust2)
summary(model2)
##
## Call:
## lm(formula = crim ~ nox + age + dis + medv, data = clust2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.170 -6.827 -2.674 2.362 45.698
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.6101 31.4054 1.739 0.0914 .
## nox -53.8456 28.6092 -1.882 0.0687 .
## age 0.4631 0.2291 2.021 0.0514 .
## dis -14.0070 5.8605 -2.390 0.0227 *
## medv -1.2807 0.4950 -2.587 0.0143 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.04 on 33 degrees of freedom
## Multiple R-squared: 0.4028, Adjusted R-squared: 0.3305
## F-statistic: 5.565 on 4 and 33 DF, p-value: 0.001545
model3 <- lm(crim ~ nox+rm+age+dis+black+medv, data = clust3)
summary(model3)
##
## Call:
## lm(formula = crim ~ nox + rm + age + dis + black + medv, data = clust3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.02506 -0.20534 -0.02612 0.11637 2.32837
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.9389484 0.5027688 1.868 0.062639 .
## nox 4.2205135 0.3288843 12.833 < 2e-16 ***
## rm -0.2064991 0.0663917 -3.110 0.002018 **
## age 0.0020544 0.0010620 1.935 0.053830 .
## dis 0.0471744 0.0148378 3.179 0.001604 **
## black -0.0057732 0.0007237 -7.977 2.03e-14 ***
## medv 0.0193915 0.0054771 3.540 0.000452 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3667 on 359 degrees of freedom
## Multiple R-squared: 0.6322, Adjusted R-squared: 0.626
## F-statistic: 102.8 on 6 and 359 DF, p-value: < 2.2e-16
y1 <- predict(model1)
y1_1 <- clust1$crim
mse1 <- round(mean((y1-y1_1)^2))
mse1
## [1] 107
y2 <- predict(model2)
y2_1 <- clust2$crim
mse2 <- round(mean((y2-y2_1)^2))
mse2
## [1] 126
y3 <- predict(model3)
y3_1 <- clust3$crim
mse3 <- round(mean((y3-y3_1)^2))
mse3
## [1] 0
Наименьшая регрессия наблюдается в регрессии по главным компонентам. Она равна 44.