Đọc dữ liệu

setwd("d:/DATA2020/jam_nguyen")
dulieu <-read.delim("uscities.txt")
head(dulieu)
##    y   x1  x2  x3  x4    x5  x6
## 1 10 70.3 213 582 6.0  7.05  36
## 2 13 61.0  91 132 8.2 48.52 100
## 3 12 56.7 453 716 8.7 20.66  67
## 4 17 51.9 454 515 9.0 12.95  86
## 5 56 49.1 412 158 9.0 43.37 127
## 6 36 54.0  80  80 9.0 40.25 114

Tạo biến LogY

attach(dulieu)
dulieu$LogY <-log(y)
head(dulieu)
##    y   x1  x2  x3  x4    x5  x6     LogY
## 1 10 70.3 213 582 6.0  7.05  36 2.302585
## 2 13 61.0  91 132 8.2 48.52 100 2.564949
## 3 12 56.7 453 716 8.7 20.66  67 2.484907
## 4 17 51.9 454 515 9.0 12.95  86 2.833213
## 5 56 49.1 412 158 9.0 43.37 127 4.025352
## 6 36 54.0  80  80 9.0 40.25 114 3.583519

Hồi quy đa biến

congthuc <- LogY ~ x1 + x2 + x3 + x4 + x5 + x6
hoiquy <-lm(data=dulieu,congthuc)
summary(hoiquy)
## 
## Call:
## lm(formula = congthuc, data = dulieu)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.79697 -0.27812 -0.03398  0.28827  0.98730 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.2599547  1.4677044   4.946 2.16e-05 ***
## x1          -0.0600496  0.0192708  -3.116  0.00378 ** 
## x2           0.0012553  0.0004891   2.566  0.01500 *  
## x3          -0.0006989  0.0004701  -1.487  0.14655    
## x4          -0.1695248  0.0562957  -3.011  0.00496 ** 
## x5           0.0169887  0.0113071   1.502  0.14248    
## x6           0.0005101  0.0050297   0.101  0.91983    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4539 on 33 degrees of freedom
## Multiple R-squared:  0.6553, Adjusted R-squared:  0.5927 
## F-statistic: 10.46 on 6 and 33 DF,  p-value: 1.72e-06

Tính AIC và BIC

AIC(hoiquy)
## [1] 58.6365
BIC(hoiquy)
## [1] 72.14754

Tính regsubsets

library(leaps)
hoiquysub <- regsubsets(LogY~.,data=dulieu, nvmax=6)
names(hoiquysub)
##  [1] "np"        "nrbar"     "d"         "rbar"      "thetab"    "first"    
##  [7] "last"      "vorder"    "tol"       "rss"       "bound"     "nvmax"    
## [13] "ress"      "ir"        "nbest"     "lopt"      "il"        "ier"      
## [19] "xnames"    "method"    "force.in"  "force.out" "sserr"     "intercept"
## [25] "lindep"    "nullrss"   "nn"        "call"
ketqua <-summary(hoiquysub)
names(ketqua)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
ketqua$bic
## [1] -75.08222 -80.88843 -90.25294 -92.47034 -92.55689 -94.05602

Tính lmSubssets

library(lmSubsets)
hquysub <- lmSubsets(data=dulieu, LogY~.)
hquysub
## Call:
##   lmSubsets(formula = LogY ~ ., data = dulieu)
## 
## Deviance:
##   [best, size (tolerance)] = RSS
##       2 (0)    3 (0)    4 (0)   5 (0)    6 (0)    7 (0)     8 (0)    
##   1st 2.510681 1.980158 1.42881 1.232673 1.121648 0.9852096 0.9778533
## 
## Subset:
##   [variable, best] = size
##                1st
##   +(Intercept) 2-8
##   y            2-8
##   x1           3-8
##   x2           6-8
##   x3           7-8
##   x4           4-8
##   x5           8  
##   x6           5-8
names(hquysub)
##  [1] "submodel"     "subset"       ".interrupted" ".nodes"       "NOBS"        
##  [6] "nobs"         "nvar"         "intercept"    "include"      "exclude"     
## [11] "size"         "tolerance"    "nbest"        "call"         "xlevels"     
## [16] "terms"        "model"
hquy <-summary(hquysub)
hquy
## Call:
##   lmSubsets(formula = LogY ~ ., data = dulieu)
## 
## Statistics:
##    SIZE BEST sigma     R2        R2adj     pval       Cp        AIC       
##    2    1    0.2570419 0.8727370 0.8693880 < 2.22e-16 46.161393   8.782066
##    3    1    0.2313391 0.8996285 0.8942030 < 2.22e-16 30.800166   1.286972
##    4    1    0.1992214 0.9275756 0.9215402 < 2.22e-16 14.757443  -9.766417
##    5    1    0.1876679 0.9375175 0.9303766 < 2.22e-16 10.338906 -13.672698
##    6    1    0.1816305 0.9431452 0.9347842 < 2.22e-16  8.705652 -15.448125
##    7    1    0.1727855 0.9500611 0.9409813 < 2.22e-16  6.240733 -18.636129
##    8    1    0.1748082 0.9504339 0.9395914 < 2.22e-16  8.000000 -16.935919
##    BIC      
##    13.848704
##     8.042490
##    -1.322020
##    -3.539422
##    -3.625969
##    -5.125093
##    -1.736004
names(hquy)
## [1] "call"  "terms" "nvar"  "nbest" "size"  "stats"

Tinh MuMIn

library(MuMIn)
hq <-lm(congthuc, data=dulieu, na.action = na.fail)
kethop <-dredge(hq)
## Fixed term is "(Intercept)"
nrow(kethop)
## [1] 64
kethop
## Global model call: lm(formula = congthuc, data = dulieu, na.action = na.fail)
## ---
## Model selection table 
##    (Intrc)       x1        x2         x3        x4        x5        x6 df
## 28   7.785 -0.07025 0.0005559            -0.180200  0.019700            6
## 32   7.373 -0.06148 0.0012560 -0.0006984 -0.170500  0.017900            7
## 60   7.688 -0.06902 0.0005552            -0.179400  0.018910 0.0004393  7
## 48   5.805 -0.03805 0.0013280 -0.0007796 -0.150400           0.0065310  7
## 44   6.103 -0.04538 0.0005463            -0.159000           0.0072210  6
## 64   7.260 -0.06005 0.0012550 -0.0006989 -0.169500  0.016990 0.0005101  8
## 30   8.084 -0.07699            0.0004635 -0.179700  0.020980            6
## 16   7.028 -0.04680 0.0014810 -0.0009160 -0.148200                      6
## 12   7.539 -0.05667 0.0005637            -0.158300                      5
## 24   5.138 -0.04710 0.0013520 -0.0008682            0.014680            6
## 62   7.947 -0.07525            0.0004625 -0.178500  0.019870 0.0006187  7
## 46   6.275 -0.05031            0.0004469 -0.156400           0.0077390  6
## 40   3.871 -0.02765 0.0013830 -0.0009007                     0.0063810  6
## 20   5.497 -0.05712 0.0004773                       0.016720            5
## 36   4.088 -0.03549 0.0004747                                0.0071750  5
## 8    5.093 -0.03635 0.0015310 -0.0010320                                5
## 47   2.927          0.0018580 -0.0012760 -0.099960           0.0095310  6
## 39   2.088          0.0017920 -0.0012650                     0.0088210  5
## 56   4.531 -0.03893 0.0013470 -0.0008651            0.009271 0.0030850  7
## 52   4.869 -0.04865 0.0004745                       0.011120 0.0031830  6
## 4    5.524 -0.04676 0.0004923                                           4
## 63   3.047          0.0017240 -0.0011570 -0.105700 -0.009779 0.0120000  7
## 14   7.820 -0.06260            0.0004520 -0.154700                      5
## 22   5.777 -0.06304            0.0003786            0.017830            5
## 55   2.150          0.0016710 -0.0011610           -0.008561 0.0109500  6
## 38   4.279 -0.03999            0.0003730                     0.0076340  5
## 35   1.630          0.0005348                                0.0111800  4
## 26   7.938 -0.07589                      -0.138000  0.020270            5
## 43   2.450          0.0005893            -0.098020           0.0118900  5
## 59   2.678          0.0005668            -0.106200 -0.013590 0.0150200  6
## 54   5.115 -0.05410            0.0003761            0.011940 0.0033480  6
## 51   1.776          0.0005102                      -0.012380 0.0139700  5
## 6    5.826 -0.05223            0.0003788                                4
## 7    3.126          0.0022260 -0.0016520                                4
## 42   6.110 -0.04945                      -0.117100           0.0079030  5
## 15   3.843          0.0023050 -0.0016850 -0.077600                      5
## 18   6.109 -0.06486                                 0.017840            4
## 34   4.573 -0.04151                                          0.0078000  4
## 58   7.538 -0.07081                      -0.134800  0.017030 0.0018050  6
## 23   2.973          0.0022340 -0.0016580            0.004196            5
## 10   7.687 -0.06200                      -0.114900                      4
## 2    6.158 -0.05404                                                     3
## 50   5.357 -0.05473                                 0.011170 0.0037900  5
## 31   3.690          0.0023130 -0.0016900 -0.077630  0.004202            6
## 53   1.681                     0.0003809           -0.014420 0.0155000  5
## 37   1.509                     0.0003933                     0.0123200  4
## 61   2.472                     0.0004277 -0.094770 -0.015680 0.0165700  6
## 45   2.195                     0.0004356 -0.083700           0.0130200  5
## 3    2.871          0.0006043                                           3
## 49   1.886                                         -0.015500 0.0160900  4
## 33   1.709                                                   0.0126800  3
## 11   3.484          0.0006445            -0.066860                      4
## 57   2.392                               -0.058790 -0.016370 0.0168000  5
## 19   2.734          0.0006065                       0.003724            4
## 41   2.101                               -0.046530           0.0130900  4
## 27   3.348          0.0006467            -0.066850  0.003722            5
## 5    2.897                     0.0004171                                3
## 13   3.302                     0.0004403 -0.044350                      4
## 21   2.775                     0.0004186            0.003334            4
## 1    3.152                                                              2
## 29   3.180                     0.0004417 -0.044280  0.003321            5
## 17   3.047                                          0.002885            3
## 9    3.214                               -0.006551                      3
## 25   3.108                               -0.006387  0.002880            4
##     logLik AICc delta weight
## 28 -22.619 59.8  0.00  0.322
## 32 -21.324 60.1  0.36  0.268
## 60 -22.615 62.7  2.95  0.074
## 48 -22.642 62.8  3.00  0.072
## 44 -24.163 62.9  3.09  0.069
## 64 -21.318 63.3  3.50  0.056
## 30 -24.966 64.5  4.69  0.031
## 16 -25.102 64.7  4.97  0.027
## 12 -26.978 65.7  5.94  0.017
## 24 -26.357 67.3  7.48  0.008
## 62 -24.958 67.4  7.63  0.007
## 46 -26.475 67.5  7.71  0.007
## 40 -26.508 67.6  7.78  0.007
## 20 -27.924 67.6  7.83  0.006
## 36 -28.190 68.1  8.36  0.005
## 8  -28.469 68.7  8.92  0.004
## 47 -27.288 69.1  9.34  0.003
## 39 -28.869 69.5  9.72  0.002
## 56 -26.174 69.8 10.06  0.002
## 52 -27.743 70.0 10.25  0.002
## 4  -30.492 70.1 10.34  0.002
## 63 -26.477 70.5 10.67  0.002
## 14 -29.358 70.5 10.70  0.002
## 22 -29.682 71.1 11.34  0.001
## 55 -28.294 71.1 11.35  0.001
## 38 -29.970 71.7 11.92  0.001
## 35 -32.075 73.3 13.51  0.000
## 26 -30.769 73.3 13.52  0.000
## 43 -30.789 73.3 13.56  0.000
## 59 -29.399 73.3 13.56  0.000
## 54 -29.499 73.5 13.76  0.000
## 51 -30.995 73.8 13.97  0.000
## 6  -32.356 73.9 14.07  0.000
## 7  -32.497 74.1 14.35  0.000
## 42 -31.574 74.9 15.13  0.000
## 15 -31.705 75.2 15.39  0.000
## 18 -33.081 75.3 15.52  0.000
## 34 -33.233 75.6 15.82  0.000
## 58 -30.720 76.0 16.20  0.000
## 23 -32.337 76.4 16.65  0.000
## 10 -33.936 77.0 17.23  0.000
## 2  -35.363 77.4 17.61  0.000
## 50 -32.883 77.5 17.75  0.000
## 31 -31.537 77.6 17.84  0.000
## 53 -33.175 78.1 18.33  0.000
## 37 -34.495 78.1 18.35  0.000
## 61 -32.046 78.6 18.85  0.000
## 45 -33.669 79.1 19.32  0.000
## 3  -37.266 81.2 21.41  0.000
## 49 -36.098 81.3 21.55  0.000
## 33 -37.419 81.5 21.72  0.000
## 11 -36.805 82.8 22.97  0.000
## 57 -35.713 83.2 23.41  0.000
## 19 -37.166 83.5 23.69  0.000
## 41 -37.192 83.5 23.74  0.000
## 27 -36.703 85.2 25.39  0.000
## 5  -40.105 86.9 27.09  0.000
## 13 -39.929 89.0 29.22  0.000
## 21 -40.036 89.2 29.43  0.000
## 1  -42.621 89.6 29.78  0.000
## 29 -39.859 91.5 31.70  0.000
## 17 -42.575 91.8 32.03  0.000
## 9  -42.617 91.9 32.12  0.000
## 25 -42.572 94.3 34.50  0.000
## Models ranked by AICc(x)
model.sel(kethop, rank=AIC) ->dulieuAIC
## New rank 'AIC' applied to logLik objects
model.sel(kethop, rank=BIC) ->dulieuBIC
## New rank 'BIC' applied to logLik objects
dulieuAIC$AIC
##  [1] 56.64897 57.23889 58.63650 59.23023 59.28327 60.32690 61.93139 62.20454
##  [9] 63.91611 63.95600 64.71440 64.95010 65.01599 65.84775 66.34779 66.38065
## [17] 66.57670 66.93869 66.95356 67.48675 67.73889 68.58834 68.71599 68.98488
## [25] 69.36348 69.93973 70.79855 70.99762 71.53735 71.57835 71.98931 72.15011
## [33] 72.71166 72.99448 73.14721 73.40929 73.43950 74.16211 74.46551 74.67352
## [41] 75.07421 75.76611 75.87111 76.09166 76.34981 76.72660 76.99068 77.33863
## [49] 80.19523 80.53170 80.83897 81.42616 81.60974 82.33273 82.38475 83.40636
## [57] 86.21066 87.85702 88.07231 89.24204 89.71856 91.15070 91.23495 93.14394
dulieuBIC$BIC
##  [1] 67.37217 68.47112 70.46018 71.05238 71.10542 72.06467 72.14754 72.33782
##  [9] 72.40040 74.29215 74.82505 74.84768 75.08338 75.14927 75.38309 75.73827
## [17] 75.74040 76.18329 76.70998 77.16039 77.62002 77.80788 78.16994 78.38413
## [25] 78.72161 78.77571 78.90563 79.46718 79.75000 79.98175 80.02274 80.43371
## [33] 80.91763 80.93183 81.13090 81.22103 81.59161 81.79324 81.85369 82.62663
## [41] 83.11792 83.57278 83.74620 84.21051 84.79421 85.20749 85.59834 85.78303
## [49] 85.90561 86.22494 86.95075 88.36526 89.08825 89.14027 89.87056 91.27730
## [57] 91.85075 92.61980 94.61254 94.82783 96.21734 96.30159 98.16296 99.89946

Câu 2.a

n <-nrow(dulieu)
n
## [1] 40
t<-ncol(dulieu)-1
t
## [1] 7
dulieu$yhat <-predict(hoiquy)
dulieu$ri<-dulieu$LogY-dulieu$yhat
dulieu$phandu<-residuals(hoiquy)
dulieu$ri2 <- dulieu$ri*dulieu$ri
head(dulieu)
##    y   x1  x2  x3  x4    x5  x6     LogY     yhat          ri      phandu
## 1 10 70.3 213 582 6.0  7.05  36 2.302585 2.020064  0.28252095  0.28252095
## 2 13 61.0  91 132 8.2 48.52 100 2.564949 3.104102 -0.53915236 -0.53915236
## 3 12 56.7 453 716 8.7 20.66  67 2.484907 2.833659 -0.34875244 -0.34875244
## 4 17 51.9 454 515 9.0 12.95  86 2.833213 3.091483 -0.25826976 -0.25826976
## 5 56 49.1 412 158 9.0 43.37 127 4.025352 3.994118  0.03123335  0.03123335
## 6 36 54.0  80  80 9.0 40.25 114 3.583519 3.278009  0.30550985  0.30550985
##            ri2
## 1 0.0798180899
## 2 0.2906852656
## 3 0.1216282650
## 4 0.0667032681
## 5 0.0009755219
## 6 0.0933362657
CV100 <-sum(dulieu$ri2)/n
CV100
## [1] 0.1699987

Tìm hiểu thêm

RSS <-function(x){
  tong <-sum(residuals(x)^2)
  return(tong)
}

CV100 <-RSS(hoiquy)/n
CV100
## [1] 0.1699987
get.models(kethop,subset=TRUE) ->tachModel
names(tachModel)
##  [1] "28" "32" "60" "48" "44" "64" "30" "16" "12" "24" "62" "46" "40" "20" "36"
## [16] "8"  "47" "39" "56" "52" "4"  "63" "14" "22" "55" "38" "35" "26" "43" "59"
## [31] "54" "51" "6"  "7"  "42" "15" "18" "34" "58" "23" "10" "2"  "50" "31" "53"
## [46] "37" "61" "45" "3"  "49" "33" "11" "57" "19" "41" "27" "5"  "13" "21" "1" 
## [61] "29" "17" "9"  "25"
tachModel[[1]]$residuals
##            1            2            3            4            5            6 
##  0.279365726 -0.464153191 -0.408660525 -0.192174055  0.227453121  0.375847413 
##            7            8            9           10           11           12 
##  0.275477832  0.095670708  0.149525788 -0.193893638  0.004767504 -0.157481856 
##           13           14           15           16           17           18 
## -0.113469764  0.006158303 -0.508364798  0.496970005 -0.106185774 -0.351928519 
##           19           20           21           22           23           24 
## -0.456630908  0.740743374 -0.259687711  0.027542609  0.291083810 -0.773205721 
##           25           26           27           28           29           30 
## -0.603054740  0.595260018 -0.236848099  0.286833738  0.653333964  1.147478938 
##           31           32           33           34           35           36 
## -0.652371548 -0.357834251 -0.038087953 -0.046840295  0.322044967  0.790757773 
##           37           38           39           40 
## -0.046029032 -0.109482174 -0.124674454 -0.565256581
RSS(tachModel[[1]])
## [1] 7.257059
#residuals(tachModel[1])
#tachModel[1]$residuals

phantach <- sapply(tachModel, residuals, datamoi = tachModel)
phantach <-data.frame(phantach)
#phantach

attach(phantach)
m=0

kqua=data.frame(Model=0,CV100=0, AIC=0, BIC=0)
for(i in 1:64){
  m=m+1
  
  z = RSS(tachModel[[i]])/n
  a = AIC(tachModel[[i]])
  b = BIC(tachModel[[i]])
  kqua[m,]=c(m,z,a,b)
  
}

kqua
##    Model     CV100      AIC      BIC
## 1      1 0.1814265 57.23889 67.37217
## 2      2 0.1700517 56.64897 68.47112
## 3      3 0.1813872 59.23023 71.05238
## 4      4 0.1816279 59.28327 71.10542
## 5      5 0.1959875 60.32690 70.46018
## 6      6 0.1699987 58.63650 72.14754
## 7      7 0.2040088 61.93139 72.06467
## 8      8 0.2054067 62.20454 72.33782
## 9      9 0.2256033 63.95600 72.40040
## 10    10 0.2187082 64.71440 74.84768
## 11    11 0.2039308 63.91611 75.73827
## 12    12 0.2200007 64.95010 75.08338
## 13    13 0.2203634 65.01599 75.14927
## 14    14 0.2365293 65.84775 74.29215
## 15    15 0.2397015 66.38065 74.82505
## 16    16 0.2430690 66.93869 75.38309
## 17    17 0.2291314 66.57670 76.70998
## 18    18 0.2479806 67.73889 76.18329
## 19    19 0.2167128 66.34779 78.16994
## 20    20 0.2344042 67.48675 77.62002
## 21    21 0.2689432 68.98488 75.74040
## 22    22 0.2200197 66.95356 78.77571
## 23    23 0.2541128 68.71599 77.16039
## 24    24 0.2582596 69.36348 77.80788
## 25    25 0.2409493 68.58834 78.72161
## 26    26 0.2620071 69.93973 78.38413
## 27    27 0.2910896 72.15011 78.90563
## 28    28 0.2726836 71.53735 79.98175
## 29    29 0.2729632 71.57835 80.02274
## 30    30 0.2546377 70.79855 80.93183
## 31    31 0.2559082 70.99762 81.13090
## 32    32 0.2757821 71.98931 80.43371
## 33    33 0.2952049 72.71166 79.46718
## 34    34 0.2972996 72.99448 79.75000
## 35    35 0.2838819 73.14721 81.59161
## 36    36 0.2857480 73.40929 81.85369
## 37    37 0.3061059 74.16211 80.91763
## 38    38 0.3084365 74.46551 81.22103
## 39    39 0.2720173 73.43950 83.57278
## 40    40 0.2949236 74.67352 83.11792
## 41    41 0.3194677 75.87111 82.62663
## 42    42 0.3431073 76.72660 81.79324
## 43    43 0.3030904 75.76611 84.21051
## 44    44 0.2833644 75.07421 85.20749
## 45    45 0.3075457 76.34981 84.79421
## 46    46 0.3285356 76.99068 83.74620
## 47    47 0.2906645 76.09166 86.22494
## 48    48 0.3152431 77.33863 85.78303
## 49    49 0.3773491 80.53170 85.59834
## 50    50 0.3559388 80.19523 86.95075
## 51    51 0.3802589 80.83897 85.90561
## 52    52 0.3687511 81.60974 88.36526
## 53    53 0.3491607 81.42616 89.87056
## 54    54 0.3754767 82.33273 89.08825
## 55    55 0.3759653 82.38475 89.14027
## 56    56 0.3668809 83.40636 91.85075
## 57    57 0.4349124 86.21066 91.27730
## 58    58 0.4310843 87.85702 94.61254
## 59    59 0.4334108 88.07231 94.82783
## 60    60 0.4932072 89.24204 92.61980
## 61    61 0.4295947 89.71856 98.16296
## 62    62 0.4920821 91.15070 96.21734
## 63    63 0.4931197 91.23495 96.30159
## 64    64 0.4919990 93.14394 99.89946

Lựa chọn model

kqua[which.min(kqua$CV100),]
##   Model     CV100     AIC      BIC
## 6     6 0.1699987 58.6365 72.14754
kqua[which.min(kqua$AIC),]
##   Model     CV100      AIC      BIC
## 2     2 0.1700517 56.64897 68.47112
kqua[which.min(kqua$BIC),]
##   Model     CV100      AIC      BIC
## 1     1 0.1814265 57.23889 67.37217