Regresi Robust

Data

Educational Testing Service Study America’s Smallest School menyelidiki hubungan prestasi pendidikan siswa dengan lingkungan rumah mereka di 37 Negara bagian. Hubungan prestasi pendidikan siswa kelas delapan dalam matematika dengan lima variabel penjelas berikut:

• Parents (X1)- Persentase siswa kelas delapan dengan kedua orang tua yang tinggal di rumah

• Homelib (X2) - Persentase siswa kelas delapan dengan tiga atau lebih jenis bahan bacaan di rumah (buku, ensiklopedi, majalah, surat kabar)

• Reading (X3) - Persentase siswa kelas delapan yang membaca lebih dari 10 halaman sehari

• TVwatch (X4) - Persentase siswa kelas delapan yang menonton TV selama enam jam atau lebih per hari

• Absences (X5) - Persentase siswa kelas delapan absen tiga hari atau lebih bulan lalu

• Mathprof (Y) – data tentang rata-rata kemahiran matematika

mathdata<-read.table("D:\\mathproficiencydata.txt",head=T)
str(mathdata)
## 'data.frame':    40 obs. of  7 variables:
##  $ State   : chr  "Alabama" "Arizona" "Arkansas" "California" ...
##  $ Mathprof: int  252 259 256 256 267 270 261 231 255 258 ...
##  $ Parents : int  75 75 77 78 78 79 75 47 75 73 ...
##  $ Homelib : int  78 73 77 68 85 86 83 76 73 80 ...
##  $ Reading : int  34 41 28 42 38 43 32 24 31 36 ...
##  $ TVwatch : int  18 12 20 11 9 12 18 33 19 17 ...
##  $ Absences: int  18 26 23 28 25 22 28 37 27 22 ...

Regresi dengan OLS

library(GGally)
ggpairs(mathdata[,-1]) #variabel state tidak digunakan

ols <- lm(Mathprof~Parents+Homelib+Reading+TVwatch+Absences,data=mathdata)
summary(ols)
## 
## Call:
## lm(formula = Mathprof ~ Parents + Homelib + Reading + TVwatch + 
##     Absences, data = mathdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.0131  -2.9395   0.4694   2.5336   9.3248 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 155.0304    36.2383   4.278 0.000145 ***
## Parents       0.3911     0.2571   1.521 0.137399    
## Homelib       0.8639     0.1797   4.807 3.05e-05 ***
## Reading       0.3616     0.2690   1.345 0.187679    
## TVwatch      -0.8467     0.3525  -2.402 0.021927 *  
## Absences      0.1923     0.2636   0.729 0.470718    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.268 on 34 degrees of freedom
## Multiple R-squared:  0.861,  Adjusted R-squared:  0.8406 
## F-statistic: 42.13 on 5 and 34 DF,  p-value: 1.276e-13

Pemeriksaan Outlier

library(MASS)
cook <- cooks.distance(ols)
plot(cook,ylab="Cooks distances")
identify(1:40,cook,mathdata$State)

## integer(0)

Cook’s Distance

• Kriteria pengamatan berpengaruh:

• Jika Di > 0.5, maka pengamatan ke-i mungkin berpengaruh

• Jika Di > 1, maka pengamatan ke-i berpengaruh

• Jika Di > 4/n, dengan n adalah banyaknya pengamatan, maka pengamatan ke-i berpengaruh

data.frame(State=mathdata$State,Cook.Distance=round(cook,6))
##             State Cook.Distance
## 1         Alabama      0.000095
## 2         Arizona      0.006089
## 3        Arkansas      0.062559
## 4      California      0.000730
## 5        Colorado      0.005878
## 6     Connecticut      0.000023
## 7        Delaware      0.009313
## 8            D.C.      0.719135
## 9         Florida      0.034872
## 10        Georgia      0.003248
## 11           Guam      0.568809
## 12         Hawaii      0.173229
## 13          Idaho      0.021580
## 14       Illinois      0.021965
## 15        Indiana      0.000178
## 16           Iowa      0.002131
## 17       Kentucky      0.002069
## 18      Louisiana      0.012272
## 19       Maryland      0.005713
## 20       Michigan      0.004357
## 21      Minnesota      0.002846
## 22        Montana      0.001765
## 23       Nebraska      0.000181
## 24  New_Hampshire      0.005878
## 25     New_Jersey      0.002604
## 26     New_Mexico      0.005004
## 27       New_York      0.010345
## 28 North_Carolina      0.003839
## 29   North_Dakota      0.011154
## 30           Ohio      0.002274
## 31       Oklahoma      0.005365
## 32         Oregon      0.000164
## 33   Pennsylvania      0.009485
## 34   Rhode_Island      0.007053
## 35          Texas      0.329190
## 36 Virgin_Islands      1.205460
## 37       Virginia      0.009441
## 38  West_Virginia      0.020053
## 39      Wisconsin      0.002309
## 40        Wyoming      0.011920
library(foreign)
library(MASS)
cook.dist <- cooks.distance(ols)
r <- stdres(ols)
a <- cbind(mathdata, cook.dist, r)
n <- nrow(mathdata)
a[cook.dist > 4/n, ]
##             State Mathprof Parents Homelib Reading TVwatch Absences cook.dist
## 8            D.C.      231      47      76      24      33       37 0.7191353
## 11           Guam      231      81      64      32      20       28 0.5688087
## 12         Hawaii      251      78      69      36      23       26 0.1732292
## 35          Texas      258      77      70      34      15       18 0.3291897
## 36 Virgin_Islands      218      63      76      23      27       22 1.2054599
##            r
## 8   1.391454
## 11 -2.575110
## 12  1.635161
## 35  2.122823
## 36 -3.917336

Menghitung Pembobot Huber

head(abs(ols$residuals-median(ols$residuals)))
##          1          2          3          4          5          6 
## 0.73322417 1.43615532 6.25007136 0.01083135 3.36678993 0.31287137
A<-ols$residuals
B<-abs(ols$residuals-median(ols$residuals))
median(abs(ols$residuals-median(ols$residuals)))
## [1] 3.133648
MAD<-median(abs(ols$residuals-median(ols$residuals)))/0.6745
MAD
## [1] 4.645883
head(ui<-ols$residuals/MAD)
##           1           2           3           4           5           6 
## -0.05677825  0.41016835  1.44633642  0.09871270 -0.62363831  0.03370030
U<-ols$residuals/MAD
head(cbind(A,B,U))
##            A          B           U
## 1 -0.2637851 0.73322417 -0.05677825
## 2  1.9055943 1.43615532  0.41016835
## 3  6.7195104 6.25007136  1.44633642
## 4  0.4586077 0.01083135  0.09871270
## 5 -2.8973509 3.36678993 -0.62363831
## 6  0.1565677 0.31287137  0.03370030
wi<-rep(1,nrow(mathdata))
for (i in 1:length(wi))
if (abs(ui[i])>1.345) {wi[i]=1.345/abs(ui[i])}
print(wi)
##  [1] 1.0000000 1.0000000 0.9299358 1.0000000 1.0000000 1.0000000 1.0000000
##  [8] 1.0000000 0.8611127 1.0000000 0.5668983 0.8548525 1.0000000 1.0000000
## [15] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [22] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [29] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.6701211
## [36] 0.3672876 1.0000000 1.0000000 1.0000000 1.0000000

Estimasi parameter dengan Regresi Robust IRLS Huber

X<-as.matrix(cbind(1,mathdata$Parents,mathdata$Homelib,mathdata$Reading,mathdata$TVwatch,mathdata$Absences))
Y<-as.matrix(mathdata$Mathprof)
W<-diag(wi)
b_w <- solve(t(X)%*%W%*%X)%*%t(X)%*%W%*%Y
b_w
##              [,1]
## [1,] 168.88877249
## [2,]   0.34879132
## [3,]   0.83807986
## [4,]   0.21587280
## [5,]  -0.82701462
## [6,]   0.06444969
Ytopi2<-X%*%b_w
ei2<-Y-Ytopi2

Regresi Robust dengan Pembobot Huber

library(MASS)
rr.huber <- rlm(Mathprof~Parents+Homelib+Reading+TVwatch+Absences, data =mathdata, psi = psi.huber)
summary(rr.huber)
## 
## Call: rlm(formula = Mathprof ~ Parents + Homelib + Reading + TVwatch + 
##     Absences, data = mathdata, psi = psi.huber)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.1969  -3.1202   0.9977   2.5250   5.8669 
## 
## Coefficients:
##             Value    Std. Error t value 
## (Intercept) 178.1624  28.3043     6.2945
## Parents       0.3573   0.2008     1.7795
## Homelib       0.7703   0.1404     5.4875
## Reading       0.1495   0.2101     0.7118
## TVwatch      -0.8218   0.2754    -2.9846
## Absences     -0.0121   0.2059    -0.0588
## 
## Residual standard error: 4.094 on 34 degrees of freedom

Regresi Robust dengan Pembobot Bisquare

library(MASS)
rr.bisquare <- rlm(Mathprof~Parents+Homelib+Reading+TVwatch+Absences, data = mathdata, psi = psi.bisquare)

summary(rr.bisquare)
## 
## Call: rlm(formula = Mathprof ~ Parents + Homelib + Reading + TVwatch + 
##     Absences, data = mathdata, psi = psi.bisquare)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.9476  -3.2096   0.5676   2.2121   4.0740 
## 
## Coefficients:
##             Value    Std. Error t value 
## (Intercept) 186.5746  25.0714     7.4417
## Parents       0.4344   0.1779     2.4424
## Homelib       0.6734   0.1243     5.4162
## Reading       0.0116   0.1861     0.0621
## TVwatch      -0.7559   0.2439    -3.0991
## Absences     -0.0924   0.1824    -0.5066
## 
## Residual standard error: 3.518 on 34 degrees of freedom