Perhitungan langsung : Pembentukan Model Matriks

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
df = read.csv("C:/Users/MUTHI'AH IFFA/Downloads/data liver (1).csv", header = TRUE, sep = ",")
data_selected <- select(df, Y, X1, X2, X4)
data_selected
##         Y    X1    X2    X4
## 1  158.76 16.36  8.90  6.02
## 2  197.19 26.68 21.22 12.07
## 3  144.73 12.49 16.62  8.88
## 4  140.06  8.45 22.86  7.46
## 5  129.71 10.19 14.23  2.06
## 6  162.59 19.53 17.35  7.54
## 7  178.48 20.65 10.48  4.88
## 8  120.90 22.96 14.23  3.69
## 9  191.24 21.22 21.64 11.94
## 10 150.03  8.11  3.16  8.82
## 11 173.44 24.74  7.84  3.68
## 12 211.98 11.38 15.71  7.20
## 13 193.49 15.82 15.04  9.89
## 14 164.04  8.36  9.01  3.40
## 15 156.97 12.04  9.72  6.03
## 16 208.36 10.97  4.58  5.55
## 17 154.62  7.97  9.33  4.17
## 18 137.38  7.46  6.11  2.99
## 19 180.15 29.09 15.71  9.35
## 20 228.47 10.30  8.54 10.78
## 21 153.62  7.82  4.41  4.19
## 22 121.31 14.71  6.29  6.16
## 23 157.37  8.54  6.73  5.52
## 24 211.27 23.05 11.34  3.00
## 25 178.16 13.12  5.86 10.92
## 26 174.89  7.41  9.11  5.50
## 27 142.98 14.59  5.59  3.75
## 28 165.59  8.52  6.52  6.92
## 29 141.54 18.97  6.35  5.61
## 30 238.22 35.41 36.36 15.00
## 31 138.42  4.55  1.27  2.83
## 32 247.45 22.59 28.70 10.35
## 33 140.27  9.21  4.55  7.92
## 34 216.06 18.32 11.61  8.07
## 35 144.18  5.69  6.88  2.78
## 36 156.22 11.21 11.92 10.29
attach(data_selected)
reg <- lm(Y ~ X1 + X2 + X4, data = df)
summary(reg)
## 
## Call:
## lm(formula = Y ~ X1 + X2 + X4, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.799 -16.805  -1.352   9.696  51.355 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 122.1006    11.9983  10.177 1.48e-11 ***
## X1            0.9242     0.8127   1.137   0.2639    
## X2            0.6180     0.9055   0.682   0.4999    
## X4            3.9523     1.8119   2.181   0.0366 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.97 on 32 degrees of freedom
## Multiple R-squared:  0.386,  Adjusted R-squared:  0.3284 
## F-statistic: 6.706 on 3 and 32 DF,  p-value: 0.001223

Visualisasi Data

par(mfrow=c(1,3))
par(mfrow=c(1,3))
plot(df$X1, df$Y, pch=19, main = "Scatter Plot of X1 vs Y")
plot(df$X2, df$Y, pch=7, main = "Scatter Plot of X2 vs Y")
plot(df$X4, df$Y, pch=4, main = "Scatter Plot of X4 vs Y")

Perhitungan Manual : Pembentukan Model Matriks

library(readr)
df = read.csv("C:/Users/MUTHI'AH IFFA/Downloads/data liver (1).csv", header = TRUE, sep = ",")
attach(data_selected)
## The following objects are masked from data_selected (pos = 3):
## 
##     X1, X2, X4, Y
data_selected <- select(df, Y, X1, X2, X4)
data_selected
##         Y    X1    X2    X4
## 1  158.76 16.36  8.90  6.02
## 2  197.19 26.68 21.22 12.07
## 3  144.73 12.49 16.62  8.88
## 4  140.06  8.45 22.86  7.46
## 5  129.71 10.19 14.23  2.06
## 6  162.59 19.53 17.35  7.54
## 7  178.48 20.65 10.48  4.88
## 8  120.90 22.96 14.23  3.69
## 9  191.24 21.22 21.64 11.94
## 10 150.03  8.11  3.16  8.82
## 11 173.44 24.74  7.84  3.68
## 12 211.98 11.38 15.71  7.20
## 13 193.49 15.82 15.04  9.89
## 14 164.04  8.36  9.01  3.40
## 15 156.97 12.04  9.72  6.03
## 16 208.36 10.97  4.58  5.55
## 17 154.62  7.97  9.33  4.17
## 18 137.38  7.46  6.11  2.99
## 19 180.15 29.09 15.71  9.35
## 20 228.47 10.30  8.54 10.78
## 21 153.62  7.82  4.41  4.19
## 22 121.31 14.71  6.29  6.16
## 23 157.37  8.54  6.73  5.52
## 24 211.27 23.05 11.34  3.00
## 25 178.16 13.12  5.86 10.92
## 26 174.89  7.41  9.11  5.50
## 27 142.98 14.59  5.59  3.75
## 28 165.59  8.52  6.52  6.92
## 29 141.54 18.97  6.35  5.61
## 30 238.22 35.41 36.36 15.00
## 31 138.42  4.55  1.27  2.83
## 32 247.45 22.59 28.70 10.35
## 33 140.27  9.21  4.55  7.92
## 34 216.06 18.32 11.61  8.07
## 35 144.18  5.69  6.88  2.78
## 36 156.22 11.21 11.92 10.29
x = as.matrix(cbind(1, data_selected[, c("X1", "X2", "X4")]))
y = as.matrix(data_selected$Y)
b = solve(t(x)%*%x)%*%t(x)%*%y;round(b,4)
##        [,1]
## 1  122.1006
## X1   0.9242
## X2   0.6180
## X4   3.9523
b = solve(t(x) %*% x) %*% t(x) %*% y
round(b, 4)
##        [,1]
## 1  122.1006
## X1   0.9242
## X2   0.6180
## X4   3.9523

Koefisien Determinasi dan Penyesuaian

Menghitung varians residual dan standard error

n = nrow(x) # Jumlah observasi
p = ncol(x) # Jumlah parameter (termasuk intersep)
sigma_kuadrat <- (t(y) %*% y - t(b) %*% t(x) %*% y) / (n - p)
Res_se = sqrt(sigma_kuadrat)
round(Res_se, 3)
##        [,1]
## [1,] 26.966
b0 = b[1]
b1 = b[2]
b2 = b[3]
b3 = b[4]
dbG <- n-p
dbG
## [1] 32

Menampilkan nilai b0 dan b1

print(paste("b0:",b[1]))
## [1] "b0: 122.1006349907"
print(paste("b1:",b[2]))
## [1] "b1: 0.924234526233408"
print(paste("b2:",b[3]))
## [1] "b2: 0.617950334052231"
print(paste("b3:",b[4]))
## [1] "b3: 3.95233664011293"

Menghitung R-square

y_duga = b0+ b1*X1 + b2*X2 + b3*X4 #persamaan regresi linear berganda 
Y = data.frame(y,y_duga);head(Y)
##        y   y_duga
## 1 158.76 166.5139
## 2 197.19 207.5768
## 3 144.73 179.0114
## 4 140.06 173.5212
## 5 129.71 148.4538
## 6 162.59 180.6730
R_sq = (cor(y,y_duga))^2;round(R_sq,4)
##       [,1]
## [1,] 0.386

nilai R-square sebesar 0.386 artinya bahwa meskipun ada penalti untuk jumlah variabel, model masih mampu untuk menjelaskan 38.6% dari variasi dalam y

Uji F & Standard Error Parameter Regresi

db (derajat bebas) Regresi dan Galat

dbR <- p-1 #db Regresi
dbG <- n-p #db Galat
dbR
## [1] 3
dbG
## [1] 32

Kuadrat Tengah Regresi

galat <- y-(b0 + b1*X1 + b2*X2 + b3*X4)
KTR = sum((y_duga-mean(y))^2)/(p-1)
print(paste("KTR:",KTR))
## [1] "KTR: 4876.19884277776"
galat
##              [,1]
##  [1,]  -7.7539364
##  [2,] -10.3868215
##  [3,] -34.2814081
##  [4,] -33.4611927
##  [5,] -18.7438315
##  [6,] -18.0829919
##  [7,]  11.5303997
##  [8,] -45.7986152
##  [9,] -11.0362363
## [10,] -16.3785092
## [11,]   9.0844734
## [12,]  41.1967525
## [13,]   8.3853924
## [14,]  15.2070873
## [15,]  -6.0974859
## [16,]  51.3548314
## [17,]   2.9064954
## [18,]  -7.2085877
## [19,] -15.4989647
## [20,]  48.9662646
## [21,]   5.0063995
## [22,] -42.6194262
## [23,]   1.4006982
## [24,]  49.0011925
## [25,]  -2.8472970
## [26,]  18.5734081
## [27,] -10.8808215
## [28,]   4.2356811
## [29,] -24.1899571
## [30,]   1.6384967
## [31,]   0.1441883
## [32,]  45.8290482
## [33,] -24.4570152
## [34,]  37.9576284
## [35,]   1.5814764
## [36,] -24.2768160

Kuadrat Tengah Galat

KTG = sum(dbR^2)/(dbG)
print(paste("KTG",KTG))
## [1] "KTG 0.28125"

Menghitung F-hitung

Fhit = KTR/KTG
print(paste("Fhit:",round(Fhit,2)))
## [1] "Fhit: 17337.6"

pastikan Fhit, dbR, dan dbG terdefinisi dengan benar

pf(Fhit, dbR, dbG, lower.tail = F)
## [1] 1.926304e-51

Standard Error

se_b <- sqrt(sigma_kuadrat[1]*solve(t(x)%*%x))
## Warning in sqrt(sigma_kuadrat[1] * solve(t(x) %*% x)): NaNs produced
se_b
##            1        X1        X2       X4
## 1  11.998293       NaN 1.1829188      NaN
## X1       NaN 0.8126577       NaN      NaN
## X2  1.182919       NaN 0.9055069      NaN
## X4       NaN       NaN       NaN 1.811898
se_b0 = se_b[1,1];print(paste("se_b0:",round(se_b0,4)))
## [1] "se_b0: 11.9983"
se_b1 = se_b[2,2];print(paste("se_b1:",round(se_b1,4)))
## [1] "se_b1: 0.8127"
se_b2 = se_b[3,3];print(paste("se_b2:",round(se_b2,4)))
## [1] "se_b2: 0.9055"
se_b3 = se_b[4,4];print(paste("se_b3:",round(se_b3,4)))
## [1] "se_b3: 1.8119"

Signifikansi Parameter (nilai-t)

t-value

t_b0 = b0/se_b0;print(paste("t_b0:",round(t_b0,2)))
## [1] "t_b0: 10.18"
t_b1 = b1/se_b1;print(paste("t_b1:",round(t_b1,2)))
## [1] "t_b1: 1.14"
t_b2 = b2/se_b2;print(paste("t_b2:",round(t_b2,2)))
## [1] "t_b2: 0.68"
t_b3 = b3/se_b3;print(paste("t_b3:",round(t_b3,2)))
## [1] "t_b3: 2.18"

p-value

print(paste("p_b0:",2*pt(-abs(t_b0),df = n-p)))
## [1] "p_b0: 1.47506344832897e-11"
print(paste("p_b1:",2*pt(-abs(t_b1),df = n-p)))
## [1] "p_b1: 0.263858037514289"
print(paste("p_b2:",2*pt(-abs(t_b2),df = n-p)))
## [1] "p_b2: 0.499874045483506"
print(paste("p_b3:",2*pt(-abs(t_b3),df = n-p)))
## [1] "p_b3: 0.0366193614765748"

Selang Kepercayaan

 t <- qt(.975, df <- n-p)

BB_b0 = b0-t*se_b0
BA_b0 = b0+t*se_b0

BB_b1 = b1-t*se_b1
BA_b1 = b1+t*se_b1

BB_b2 = b2-t*se_b2
BA_b2 = b2+t*se_b2

BB_b3 = b3-t*se_b3
BA_b3 = b3+t*se_b3

Batas.Bawah = as.matrix(c(round(BB_b0,6),round(BB_b1,6),round(BB_b2,6), (round(BB_b3,6))))
Batas.Atas = as.matrix(c(round(BA_b0,6),round(BA_b1,6),round(BA_b2,6), (round(BB_b3,6))))
Fit = (c(round(b0,6),round(b1,6),round(b2,6),round(BB_b3,6)))

Menghitung batas atas dan batas bawah selang kepercayaan

Batas.Bawah = as.matrix(c(round(BB_b0,6),round(BB_b1,6),round(BB_b2,6), (round(BB_b3,6))))
Batas.Atas = as.matrix(c(round(BA_b0,6),round(BA_b1,6),round(BA_b2,6), (round(BB_b3,6))))
Fit = (c(round(b0,6),round(b1,6),round(b2,6),round(BB_b3,6)))

Menghitung Selang Kepercayaan (SK)

Selang.Kepercayaan = cbind(Batas.Bawah, Fit, Batas.Atas)
colnames(Selang.Kepercayaan ) <- c("Batas bawah Selang (2.5%)", "Fit", "Batas atas Selang (97.5%)")
rownames(Selang.Kepercayaan ) <- c("Intersep", "b1", "b2", "b3")
Selang.Kepercayaan
##          Batas bawah Selang (2.5%)        Fit Batas atas Selang (97.5%)
## Intersep                 97.660912 122.100635                146.540358
## b1                       -0.731095   0.924235                  2.579564
## b2                       -1.226507   0.617950                  2.462407
## b3                        0.261620   0.261620                  0.261620

Pembentukan Model Fungsi lm()

Fungsi lm() di R digunakan untuk menyesuaikan model regresi linear dengan data

reg = lm (y ~ X1 + X2 + X4, data = data_selected)
summary(reg)
## 
## Call:
## lm(formula = y ~ X1 + X2 + X4, data = data_selected)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.799 -16.805  -1.352   9.696  51.355 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 122.1006    11.9983  10.177 1.48e-11 ***
## X1            0.9242     0.8127   1.137   0.2639    
## X2            0.6180     0.9055   0.682   0.4999    
## X4            3.9523     1.8119   2.181   0.0366 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.97 on 32 degrees of freedom
## Multiple R-squared:  0.386,  Adjusted R-squared:  0.3284 
## F-statistic: 6.706 on 3 and 32 DF,  p-value: 0.001223

Melakukan Analisis of Variance (tabel sidik ragam)

anova(reg)
## Analysis of Variance Table
## 
## Response: y
##           Df  Sum Sq Mean Sq F value   Pr(>F)   
## X1         1  8501.9  8501.9 11.6922 0.001731 **
## X2         1  2666.8  2666.8  3.6674 0.064461 . 
## X4         1  3459.9  3459.9  4.7582 0.036619 * 
## Residuals 32 23268.7   727.1                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Menghitung interval kepercayaan

confint(reg)
##                  2.5 %     97.5 %
## (Intercept) 97.6609117 146.540358
## X1          -0.7310951   2.579564
## X2          -1.2265068   2.462407
## X4           0.2616205   7.643053

Perbandingan Manual (matriks) dan Fungsi lm()

koef <- as.matrix(reg$coefficient)
penduga <- cbind(b, koef)
colnames(penduga) <- c('matriks', 'fungsi lm')
rownames(penduga) <- c("intersep", "b1", "b2", "b3")
penduga
##              matriks   fungsi lm
## intersep 122.1006350 122.1006350
## b1         0.9242345   0.9242345
## b2         0.6179503   0.6179503
## b3         3.9523366   3.9523366

Perbandingan 2 model

# Model 1
a = summary(reg)
r.sq1 = a$r.squared
adj_r.sq1 = a$adj.r.squared
se1 = a$sigma

# Model 2
reg2 = lm(y~.-1, data = data_selected)
b = summary(reg2)
## Warning in summary.lm(reg2): essentially perfect fit: summary may be unreliable
r.sq2 = b$r.squared
adj_r.sq2 = b$adj.r.squared
se2 = b$sigma

# Membandingkan kebaikan model
model1 = as.matrix(c(r.sq1,adj_r.sq1,se1))
model2 = as.matrix(c(r.sq2,adj_r.sq2,se2))

tabel= cbind(model1, model2)
colnames(tabel) = c("Model 1", "Model 2")
rownames(tabel) = c("R-Square", "Adj R-Square", "Standar Error Sisaan")
tabel
##                        Model 1      Model 2
## R-Square              0.386006 1.000000e+00
## Adj R-Square          0.328444 1.000000e+00
## Standar Error Sisaan 26.965682 1.355674e-14

Hasil terindikasi essentially perfect fit. Artinya hasil analisis memiliki tingkat kecocokan yang terlalu sempurna dengan data. Hal ini bisa disebabkan karena overfitting (terlalu banyak variabel relatif terhadap jumlah data yang tersedia), multikolinearitas (korelasi tinggi antara variabel dan prediktor), ataupun adanya variabel yang terlalu kuat.