Đọc dữ liệu
setwd("d:/DATA2020/jam_nguyen")
dulieu <-read.delim("uscities.txt")
head(dulieu)
## y x1 x2 x3 x4 x5 x6
## 1 10 70.3 213 582 6.0 7.05 36
## 2 13 61.0 91 132 8.2 48.52 100
## 3 12 56.7 453 716 8.7 20.66 67
## 4 17 51.9 454 515 9.0 12.95 86
## 5 56 49.1 412 158 9.0 43.37 127
## 6 36 54.0 80 80 9.0 40.25 114
Tạo biến LogY
attach(dulieu)
dulieu$LogY <-log(y)
head(dulieu)
## y x1 x2 x3 x4 x5 x6 LogY
## 1 10 70.3 213 582 6.0 7.05 36 2.302585
## 2 13 61.0 91 132 8.2 48.52 100 2.564949
## 3 12 56.7 453 716 8.7 20.66 67 2.484907
## 4 17 51.9 454 515 9.0 12.95 86 2.833213
## 5 56 49.1 412 158 9.0 43.37 127 4.025352
## 6 36 54.0 80 80 9.0 40.25 114 3.583519
Hồi quy đa biến
congthuc <- LogY ~ x1 + x2 + x3 + x4 + x5 + x6
hoiquy <-lm(data=dulieu,congthuc)
summary(hoiquy)
##
## Call:
## lm(formula = congthuc, data = dulieu)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.79697 -0.27812 -0.03398 0.28827 0.98730
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.2599547 1.4677044 4.946 2.16e-05 ***
## x1 -0.0600496 0.0192708 -3.116 0.00378 **
## x2 0.0012553 0.0004891 2.566 0.01500 *
## x3 -0.0006989 0.0004701 -1.487 0.14655
## x4 -0.1695248 0.0562957 -3.011 0.00496 **
## x5 0.0169887 0.0113071 1.502 0.14248
## x6 0.0005101 0.0050297 0.101 0.91983
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4539 on 33 degrees of freedom
## Multiple R-squared: 0.6553, Adjusted R-squared: 0.5927
## F-statistic: 10.46 on 6 and 33 DF, p-value: 1.72e-06
Tính AIC và BIC
AIC(hoiquy)
## [1] 58.6365
BIC(hoiquy)
## [1] 72.14754
library(leaps)
hoiquysub <- regsubsets(LogY~.,data=dulieu, nvmax=6)
names(hoiquysub)
## [1] "np" "nrbar" "d" "rbar" "thetab" "first"
## [7] "last" "vorder" "tol" "rss" "bound" "nvmax"
## [13] "ress" "ir" "nbest" "lopt" "il" "ier"
## [19] "xnames" "method" "force.in" "force.out" "sserr" "intercept"
## [25] "lindep" "nullrss" "nn" "call"
ketqua <-summary(hoiquysub)
names(ketqua)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
ketqua$bic
## [1] -75.08222 -80.88843 -90.25294 -92.47034 -92.55689 -94.05602
library(lmSubsets)
hquysub <- lmSubsets(data=dulieu, LogY~.)
hquysub
## Call:
## lmSubsets(formula = LogY ~ ., data = dulieu)
##
## Deviance:
## [best, size (tolerance)] = RSS
## 2 (0) 3 (0) 4 (0) 5 (0) 6 (0) 7 (0) 8 (0)
## 1st 2.510681 1.980158 1.42881 1.232673 1.121648 0.9852096 0.9778533
##
## Subset:
## [variable, best] = size
## 1st
## +(Intercept) 2-8
## y 2-8
## x1 3-8
## x2 6-8
## x3 7-8
## x4 4-8
## x5 8
## x6 5-8
names(hquysub)
## [1] "submodel" "subset" ".interrupted" ".nodes" "NOBS"
## [6] "nobs" "nvar" "intercept" "include" "exclude"
## [11] "size" "tolerance" "nbest" "call" "xlevels"
## [16] "terms" "model"
hquy <-summary(hquysub)
hquy
## Call:
## lmSubsets(formula = LogY ~ ., data = dulieu)
##
## Statistics:
## SIZE BEST sigma R2 R2adj pval Cp AIC
## 2 1 0.2570419 0.8727370 0.8693880 < 2.22e-16 46.161393 8.782066
## 3 1 0.2313391 0.8996285 0.8942030 < 2.22e-16 30.800166 1.286972
## 4 1 0.1992214 0.9275756 0.9215402 < 2.22e-16 14.757443 -9.766417
## 5 1 0.1876679 0.9375175 0.9303766 < 2.22e-16 10.338906 -13.672698
## 6 1 0.1816305 0.9431452 0.9347842 < 2.22e-16 8.705652 -15.448125
## 7 1 0.1727855 0.9500611 0.9409813 < 2.22e-16 6.240733 -18.636129
## 8 1 0.1748082 0.9504339 0.9395914 < 2.22e-16 8.000000 -16.935919
## BIC
## 13.848704
## 8.042490
## -1.322020
## -3.539422
## -3.625969
## -5.125093
## -1.736004
names(hquy)
## [1] "call" "terms" "nvar" "nbest" "size" "stats"
library(MuMIn)
hq <-lm(congthuc, data=dulieu, na.action = na.fail)
kethop <-dredge(hq)
## Fixed term is "(Intercept)"
nrow(kethop)
## [1] 64
kethop
## Global model call: lm(formula = congthuc, data = dulieu, na.action = na.fail)
## ---
## Model selection table
## (Intrc) x1 x2 x3 x4 x5 x6 df
## 28 7.785 -0.07025 0.0005559 -0.180200 0.019700 6
## 32 7.373 -0.06148 0.0012560 -0.0006984 -0.170500 0.017900 7
## 60 7.688 -0.06902 0.0005552 -0.179400 0.018910 0.0004393 7
## 48 5.805 -0.03805 0.0013280 -0.0007796 -0.150400 0.0065310 7
## 44 6.103 -0.04538 0.0005463 -0.159000 0.0072210 6
## 64 7.260 -0.06005 0.0012550 -0.0006989 -0.169500 0.016990 0.0005101 8
## 30 8.084 -0.07699 0.0004635 -0.179700 0.020980 6
## 16 7.028 -0.04680 0.0014810 -0.0009160 -0.148200 6
## 12 7.539 -0.05667 0.0005637 -0.158300 5
## 24 5.138 -0.04710 0.0013520 -0.0008682 0.014680 6
## 62 7.947 -0.07525 0.0004625 -0.178500 0.019870 0.0006187 7
## 46 6.275 -0.05031 0.0004469 -0.156400 0.0077390 6
## 40 3.871 -0.02765 0.0013830 -0.0009007 0.0063810 6
## 20 5.497 -0.05712 0.0004773 0.016720 5
## 36 4.088 -0.03549 0.0004747 0.0071750 5
## 8 5.093 -0.03635 0.0015310 -0.0010320 5
## 47 2.927 0.0018580 -0.0012760 -0.099960 0.0095310 6
## 39 2.088 0.0017920 -0.0012650 0.0088210 5
## 56 4.531 -0.03893 0.0013470 -0.0008651 0.009271 0.0030850 7
## 52 4.869 -0.04865 0.0004745 0.011120 0.0031830 6
## 4 5.524 -0.04676 0.0004923 4
## 63 3.047 0.0017240 -0.0011570 -0.105700 -0.009779 0.0120000 7
## 14 7.820 -0.06260 0.0004520 -0.154700 5
## 22 5.777 -0.06304 0.0003786 0.017830 5
## 55 2.150 0.0016710 -0.0011610 -0.008561 0.0109500 6
## 38 4.279 -0.03999 0.0003730 0.0076340 5
## 35 1.630 0.0005348 0.0111800 4
## 26 7.938 -0.07589 -0.138000 0.020270 5
## 43 2.450 0.0005893 -0.098020 0.0118900 5
## 59 2.678 0.0005668 -0.106200 -0.013590 0.0150200 6
## 54 5.115 -0.05410 0.0003761 0.011940 0.0033480 6
## 51 1.776 0.0005102 -0.012380 0.0139700 5
## 6 5.826 -0.05223 0.0003788 4
## 7 3.126 0.0022260 -0.0016520 4
## 42 6.110 -0.04945 -0.117100 0.0079030 5
## 15 3.843 0.0023050 -0.0016850 -0.077600 5
## 18 6.109 -0.06486 0.017840 4
## 34 4.573 -0.04151 0.0078000 4
## 58 7.538 -0.07081 -0.134800 0.017030 0.0018050 6
## 23 2.973 0.0022340 -0.0016580 0.004196 5
## 10 7.687 -0.06200 -0.114900 4
## 2 6.158 -0.05404 3
## 50 5.357 -0.05473 0.011170 0.0037900 5
## 31 3.690 0.0023130 -0.0016900 -0.077630 0.004202 6
## 53 1.681 0.0003809 -0.014420 0.0155000 5
## 37 1.509 0.0003933 0.0123200 4
## 61 2.472 0.0004277 -0.094770 -0.015680 0.0165700 6
## 45 2.195 0.0004356 -0.083700 0.0130200 5
## 3 2.871 0.0006043 3
## 49 1.886 -0.015500 0.0160900 4
## 33 1.709 0.0126800 3
## 11 3.484 0.0006445 -0.066860 4
## 57 2.392 -0.058790 -0.016370 0.0168000 5
## 19 2.734 0.0006065 0.003724 4
## 41 2.101 -0.046530 0.0130900 4
## 27 3.348 0.0006467 -0.066850 0.003722 5
## 5 2.897 0.0004171 3
## 13 3.302 0.0004403 -0.044350 4
## 21 2.775 0.0004186 0.003334 4
## 1 3.152 2
## 29 3.180 0.0004417 -0.044280 0.003321 5
## 17 3.047 0.002885 3
## 9 3.214 -0.006551 3
## 25 3.108 -0.006387 0.002880 4
## logLik AICc delta weight
## 28 -22.619 59.8 0.00 0.322
## 32 -21.324 60.1 0.36 0.268
## 60 -22.615 62.7 2.95 0.074
## 48 -22.642 62.8 3.00 0.072
## 44 -24.163 62.9 3.09 0.069
## 64 -21.318 63.3 3.50 0.056
## 30 -24.966 64.5 4.69 0.031
## 16 -25.102 64.7 4.97 0.027
## 12 -26.978 65.7 5.94 0.017
## 24 -26.357 67.3 7.48 0.008
## 62 -24.958 67.4 7.63 0.007
## 46 -26.475 67.5 7.71 0.007
## 40 -26.508 67.6 7.78 0.007
## 20 -27.924 67.6 7.83 0.006
## 36 -28.190 68.1 8.36 0.005
## 8 -28.469 68.7 8.92 0.004
## 47 -27.288 69.1 9.34 0.003
## 39 -28.869 69.5 9.72 0.002
## 56 -26.174 69.8 10.06 0.002
## 52 -27.743 70.0 10.25 0.002
## 4 -30.492 70.1 10.34 0.002
## 63 -26.477 70.5 10.67 0.002
## 14 -29.358 70.5 10.70 0.002
## 22 -29.682 71.1 11.34 0.001
## 55 -28.294 71.1 11.35 0.001
## 38 -29.970 71.7 11.92 0.001
## 35 -32.075 73.3 13.51 0.000
## 26 -30.769 73.3 13.52 0.000
## 43 -30.789 73.3 13.56 0.000
## 59 -29.399 73.3 13.56 0.000
## 54 -29.499 73.5 13.76 0.000
## 51 -30.995 73.8 13.97 0.000
## 6 -32.356 73.9 14.07 0.000
## 7 -32.497 74.1 14.35 0.000
## 42 -31.574 74.9 15.13 0.000
## 15 -31.705 75.2 15.39 0.000
## 18 -33.081 75.3 15.52 0.000
## 34 -33.233 75.6 15.82 0.000
## 58 -30.720 76.0 16.20 0.000
## 23 -32.337 76.4 16.65 0.000
## 10 -33.936 77.0 17.23 0.000
## 2 -35.363 77.4 17.61 0.000
## 50 -32.883 77.5 17.75 0.000
## 31 -31.537 77.6 17.84 0.000
## 53 -33.175 78.1 18.33 0.000
## 37 -34.495 78.1 18.35 0.000
## 61 -32.046 78.6 18.85 0.000
## 45 -33.669 79.1 19.32 0.000
## 3 -37.266 81.2 21.41 0.000
## 49 -36.098 81.3 21.55 0.000
## 33 -37.419 81.5 21.72 0.000
## 11 -36.805 82.8 22.97 0.000
## 57 -35.713 83.2 23.41 0.000
## 19 -37.166 83.5 23.69 0.000
## 41 -37.192 83.5 23.74 0.000
## 27 -36.703 85.2 25.39 0.000
## 5 -40.105 86.9 27.09 0.000
## 13 -39.929 89.0 29.22 0.000
## 21 -40.036 89.2 29.43 0.000
## 1 -42.621 89.6 29.78 0.000
## 29 -39.859 91.5 31.70 0.000
## 17 -42.575 91.8 32.03 0.000
## 9 -42.617 91.9 32.12 0.000
## 25 -42.572 94.3 34.50 0.000
## Models ranked by AICc(x)
model.sel(kethop, rank=AIC) ->dulieuAIC
## New rank 'AIC' applied to logLik objects
model.sel(kethop, rank=BIC) ->dulieuBIC
## New rank 'BIC' applied to logLik objects
dulieuAIC$AIC
## [1] 56.64897 57.23889 58.63650 59.23023 59.28327 60.32690 61.93139 62.20454
## [9] 63.91611 63.95600 64.71440 64.95010 65.01599 65.84775 66.34779 66.38065
## [17] 66.57670 66.93869 66.95356 67.48675 67.73889 68.58834 68.71599 68.98488
## [25] 69.36348 69.93973 70.79855 70.99762 71.53735 71.57835 71.98931 72.15011
## [33] 72.71166 72.99448 73.14721 73.40929 73.43950 74.16211 74.46551 74.67352
## [41] 75.07421 75.76611 75.87111 76.09166 76.34981 76.72660 76.99068 77.33863
## [49] 80.19523 80.53170 80.83897 81.42616 81.60974 82.33273 82.38475 83.40636
## [57] 86.21066 87.85702 88.07231 89.24204 89.71856 91.15070 91.23495 93.14394
dulieuBIC$BIC
## [1] 67.37217 68.47112 70.46018 71.05238 71.10542 72.06467 72.14754 72.33782
## [9] 72.40040 74.29215 74.82505 74.84768 75.08338 75.14927 75.38309 75.73827
## [17] 75.74040 76.18329 76.70998 77.16039 77.62002 77.80788 78.16994 78.38413
## [25] 78.72161 78.77571 78.90563 79.46718 79.75000 79.98175 80.02274 80.43371
## [33] 80.91763 80.93183 81.13090 81.22103 81.59161 81.79324 81.85369 82.62663
## [41] 83.11792 83.57278 83.74620 84.21051 84.79421 85.20749 85.59834 85.78303
## [49] 85.90561 86.22494 86.95075 88.36526 89.08825 89.14027 89.87056 91.27730
## [57] 91.85075 92.61980 94.61254 94.82783 96.21734 96.30159 98.16296 99.89946
n <-nrow(dulieu)
n
## [1] 40
t<-ncol(dulieu)-1
t
## [1] 7
dulieu$yhat <-predict(hoiquy)
dulieu$ri<-dulieu$LogY-dulieu$yhat
dulieu$phandu<-residuals(hoiquy)
dulieu$ri2 <- dulieu$ri*dulieu$ri
head(dulieu)
## y x1 x2 x3 x4 x5 x6 LogY yhat ri phandu
## 1 10 70.3 213 582 6.0 7.05 36 2.302585 2.020064 0.28252095 0.28252095
## 2 13 61.0 91 132 8.2 48.52 100 2.564949 3.104102 -0.53915236 -0.53915236
## 3 12 56.7 453 716 8.7 20.66 67 2.484907 2.833659 -0.34875244 -0.34875244
## 4 17 51.9 454 515 9.0 12.95 86 2.833213 3.091483 -0.25826976 -0.25826976
## 5 56 49.1 412 158 9.0 43.37 127 4.025352 3.994118 0.03123335 0.03123335
## 6 36 54.0 80 80 9.0 40.25 114 3.583519 3.278009 0.30550985 0.30550985
## ri2
## 1 0.0798180899
## 2 0.2906852656
## 3 0.1216282650
## 4 0.0667032681
## 5 0.0009755219
## 6 0.0933362657
CV100 <-sum(dulieu$ri2)/n
CV100
## [1] 0.1699987
RSS <-function(x){
tong <-sum(residuals(x)^2)
return(tong)
}
CV100 <-RSS(hoiquy)/n
CV100
## [1] 0.1699987
get.models(kethop,subset=TRUE) ->tachModel
names(tachModel)
## [1] "28" "32" "60" "48" "44" "64" "30" "16" "12" "24" "62" "46" "40" "20" "36"
## [16] "8" "47" "39" "56" "52" "4" "63" "14" "22" "55" "38" "35" "26" "43" "59"
## [31] "54" "51" "6" "7" "42" "15" "18" "34" "58" "23" "10" "2" "50" "31" "53"
## [46] "37" "61" "45" "3" "49" "33" "11" "57" "19" "41" "27" "5" "13" "21" "1"
## [61] "29" "17" "9" "25"
tachModel[[1]]$residuals
## 1 2 3 4 5 6
## 0.279365726 -0.464153191 -0.408660525 -0.192174055 0.227453121 0.375847413
## 7 8 9 10 11 12
## 0.275477832 0.095670708 0.149525788 -0.193893638 0.004767504 -0.157481856
## 13 14 15 16 17 18
## -0.113469764 0.006158303 -0.508364798 0.496970005 -0.106185774 -0.351928519
## 19 20 21 22 23 24
## -0.456630908 0.740743374 -0.259687711 0.027542609 0.291083810 -0.773205721
## 25 26 27 28 29 30
## -0.603054740 0.595260018 -0.236848099 0.286833738 0.653333964 1.147478938
## 31 32 33 34 35 36
## -0.652371548 -0.357834251 -0.038087953 -0.046840295 0.322044967 0.790757773
## 37 38 39 40
## -0.046029032 -0.109482174 -0.124674454 -0.565256581
RSS(tachModel[[1]])
## [1] 7.257059
#residuals(tachModel[1])
#tachModel[1]$residuals
phantach <- sapply(tachModel, residuals, datamoi = tachModel)
phantach <-data.frame(phantach)
#phantach
attach(phantach)
m=0
kqua=data.frame(Model=0,CV100=0, AIC=0, BIC=0)
for(i in 1:64){
m=m+1
z = RSS(tachModel[[i]])/n
a = AIC(tachModel[[i]])
b = BIC(tachModel[[i]])
kqua[m,]=c(m,z,a,b)
}
kqua
## Model CV100 AIC BIC
## 1 1 0.1814265 57.23889 67.37217
## 2 2 0.1700517 56.64897 68.47112
## 3 3 0.1813872 59.23023 71.05238
## 4 4 0.1816279 59.28327 71.10542
## 5 5 0.1959875 60.32690 70.46018
## 6 6 0.1699987 58.63650 72.14754
## 7 7 0.2040088 61.93139 72.06467
## 8 8 0.2054067 62.20454 72.33782
## 9 9 0.2256033 63.95600 72.40040
## 10 10 0.2187082 64.71440 74.84768
## 11 11 0.2039308 63.91611 75.73827
## 12 12 0.2200007 64.95010 75.08338
## 13 13 0.2203634 65.01599 75.14927
## 14 14 0.2365293 65.84775 74.29215
## 15 15 0.2397015 66.38065 74.82505
## 16 16 0.2430690 66.93869 75.38309
## 17 17 0.2291314 66.57670 76.70998
## 18 18 0.2479806 67.73889 76.18329
## 19 19 0.2167128 66.34779 78.16994
## 20 20 0.2344042 67.48675 77.62002
## 21 21 0.2689432 68.98488 75.74040
## 22 22 0.2200197 66.95356 78.77571
## 23 23 0.2541128 68.71599 77.16039
## 24 24 0.2582596 69.36348 77.80788
## 25 25 0.2409493 68.58834 78.72161
## 26 26 0.2620071 69.93973 78.38413
## 27 27 0.2910896 72.15011 78.90563
## 28 28 0.2726836 71.53735 79.98175
## 29 29 0.2729632 71.57835 80.02274
## 30 30 0.2546377 70.79855 80.93183
## 31 31 0.2559082 70.99762 81.13090
## 32 32 0.2757821 71.98931 80.43371
## 33 33 0.2952049 72.71166 79.46718
## 34 34 0.2972996 72.99448 79.75000
## 35 35 0.2838819 73.14721 81.59161
## 36 36 0.2857480 73.40929 81.85369
## 37 37 0.3061059 74.16211 80.91763
## 38 38 0.3084365 74.46551 81.22103
## 39 39 0.2720173 73.43950 83.57278
## 40 40 0.2949236 74.67352 83.11792
## 41 41 0.3194677 75.87111 82.62663
## 42 42 0.3431073 76.72660 81.79324
## 43 43 0.3030904 75.76611 84.21051
## 44 44 0.2833644 75.07421 85.20749
## 45 45 0.3075457 76.34981 84.79421
## 46 46 0.3285356 76.99068 83.74620
## 47 47 0.2906645 76.09166 86.22494
## 48 48 0.3152431 77.33863 85.78303
## 49 49 0.3773491 80.53170 85.59834
## 50 50 0.3559388 80.19523 86.95075
## 51 51 0.3802589 80.83897 85.90561
## 52 52 0.3687511 81.60974 88.36526
## 53 53 0.3491607 81.42616 89.87056
## 54 54 0.3754767 82.33273 89.08825
## 55 55 0.3759653 82.38475 89.14027
## 56 56 0.3668809 83.40636 91.85075
## 57 57 0.4349124 86.21066 91.27730
## 58 58 0.4310843 87.85702 94.61254
## 59 59 0.4334108 88.07231 94.82783
## 60 60 0.4932072 89.24204 92.61980
## 61 61 0.4295947 89.71856 98.16296
## 62 62 0.4920821 91.15070 96.21734
## 63 63 0.4931197 91.23495 96.30159
## 64 64 0.4919990 93.14394 99.89946
Lựa chọn model
kqua[which.min(kqua$CV100),]
## Model CV100 AIC BIC
## 6 6 0.1699987 58.6365 72.14754
kqua[which.min(kqua$AIC),]
## Model CV100 AIC BIC
## 2 2 0.1700517 56.64897 68.47112
kqua[which.min(kqua$BIC),]
## Model CV100 AIC BIC
## 1 1 0.1814265 57.23889 67.37217