library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
df = read.csv("C:/Users/MUTHI'AH IFFA/Downloads/data liver (1).csv", header = TRUE, sep = ",")
data_selected <- select(df, Y, X1, X2, X4)
data_selected
## Y X1 X2 X4
## 1 158.76 16.36 8.90 6.02
## 2 197.19 26.68 21.22 12.07
## 3 144.73 12.49 16.62 8.88
## 4 140.06 8.45 22.86 7.46
## 5 129.71 10.19 14.23 2.06
## 6 162.59 19.53 17.35 7.54
## 7 178.48 20.65 10.48 4.88
## 8 120.90 22.96 14.23 3.69
## 9 191.24 21.22 21.64 11.94
## 10 150.03 8.11 3.16 8.82
## 11 173.44 24.74 7.84 3.68
## 12 211.98 11.38 15.71 7.20
## 13 193.49 15.82 15.04 9.89
## 14 164.04 8.36 9.01 3.40
## 15 156.97 12.04 9.72 6.03
## 16 208.36 10.97 4.58 5.55
## 17 154.62 7.97 9.33 4.17
## 18 137.38 7.46 6.11 2.99
## 19 180.15 29.09 15.71 9.35
## 20 228.47 10.30 8.54 10.78
## 21 153.62 7.82 4.41 4.19
## 22 121.31 14.71 6.29 6.16
## 23 157.37 8.54 6.73 5.52
## 24 211.27 23.05 11.34 3.00
## 25 178.16 13.12 5.86 10.92
## 26 174.89 7.41 9.11 5.50
## 27 142.98 14.59 5.59 3.75
## 28 165.59 8.52 6.52 6.92
## 29 141.54 18.97 6.35 5.61
## 30 238.22 35.41 36.36 15.00
## 31 138.42 4.55 1.27 2.83
## 32 247.45 22.59 28.70 10.35
## 33 140.27 9.21 4.55 7.92
## 34 216.06 18.32 11.61 8.07
## 35 144.18 5.69 6.88 2.78
## 36 156.22 11.21 11.92 10.29
attach(data_selected)
reg <- lm(Y ~ X1 + X2 + X4, data = df)
summary(reg)
##
## Call:
## lm(formula = Y ~ X1 + X2 + X4, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.799 -16.805 -1.352 9.696 51.355
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 122.1006 11.9983 10.177 1.48e-11 ***
## X1 0.9242 0.8127 1.137 0.2639
## X2 0.6180 0.9055 0.682 0.4999
## X4 3.9523 1.8119 2.181 0.0366 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.97 on 32 degrees of freedom
## Multiple R-squared: 0.386, Adjusted R-squared: 0.3284
## F-statistic: 6.706 on 3 and 32 DF, p-value: 0.001223
par(mfrow=c(1,3))
plot(df$X1, df$Y, pch=19, main = "Scatter Plot of X1 vs Y")
plot(df$X2, df$Y, pch=7, main = "Scatter Plot of X2 vs Y")
plot(df$X4, df$Y, pch=4, main = "Scatter Plot of X4 vs Y")
x = as.matrix(cbind(1, data_selected[, c("X1", "X2", "X4")]))
y = as.matrix(data_selected$Y)
b = solve(t(x)%*%x)%*%t(x)%*%y;round(b,4)
## [,1]
## 1 122.1006
## X1 0.9242
## X2 0.6180
## X4 3.9523
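As a small cross-check (a sketch; b_alt is an illustrative name, not part of the original analysis), the normal equations (X'X)b = X'y can be solved directly instead of explicitly inverting X'X, which is slightly more stable numerically:
b_alt = solve(t(x) %*% x, t(x) %*% y)  # solve the normal equations without forming the inverse
round(b_alt, 4)                        # should reproduce the same coefficients as b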
n = nrow(x) # number of observations
p = ncol(x) # number of parameters (including the intercept)
sigma_kuadrat <- (t(y) %*% y - t(b) %*% t(x) %*% y) / (n - p)
Res_se = sqrt(sigma_kuadrat)
round(Res_se, 3)
## [,1]
## [1,] 26.966
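As another sketch (sisa and sigma_alt are illustrative names), the same estimate can be obtained from the sum of squared residuals, since y'y - b'X'y equals the residual sum of squares:
sisa = y - x %*% b                  # residual vector
sigma_alt = sum(sisa^2) / (n - p)   # residual mean square
sqrt(sigma_alt)                     # should match Res_se (about 26.966)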
b0 = b[1]
b1 = b[2]
b2 = b[3]
b3 = b[4]
dbG <- n-p
dbG
## [1] 32
print(paste("b0:",b[1]))
## [1] "b0: 122.1006349907"
print(paste("b1:",b[2]))
## [1] "b1: 0.924234526233408"
print(paste("b2:",b[3]))
## [1] "b2: 0.617950334052231"
print(paste("b3:",b[4]))
## [1] "b3: 3.95233664011293"
y_duga = b0+ b1*X1 + b2*X2 + b3*X4 # the fitted multiple linear regression equation
Y = data.frame(y,y_duga);head(Y)
## y y_duga
## 1 158.76 166.5139
## 2 197.19 207.5768
## 3 144.73 179.0114
## 4 140.06 173.5212
## 5 129.71 148.4538
## 6 162.59 180.6730
R_sq = (cor(y,y_duga))^2;round(R_sq,4)
## [,1]
## [1,] 0.386
The R-squared of 0.386 means the model explains about 38.6% of the variation in Y; the adjusted R-squared reported by summary(reg) (0.3284) applies a penalty for the number of predictors and tells a similar story.
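A minimal sketch of that adjusted R-squared, using n, p, and R_sq defined above (adj_R_sq is an illustrative name; it should match the 0.3284 from summary(reg)):
adj_R_sq = 1 - (1 - R_sq) * (n - 1) / (n - p)  # penalizes the number of predictors
round(adj_R_sq, 4)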
dbR <- p-1 # regression degrees of freedom
dbG <- n-p # error degrees of freedom
dbR
## [1] 3
dbG
## [1] 32
galat <- y-(b0 + b1*X1 + b2*X2 + b3*X4)
KTR = sum((y_duga-mean(y))^2)/(p-1)
print(paste("KTR:",KTR))
## [1] "KTR: 4876.19884277776"
galat
## [,1]
## [1,] -7.7539364
## [2,] -10.3868215
## [3,] -34.2814081
## [4,] -33.4611927
## [5,] -18.7438315
## [6,] -18.0829919
## [7,] 11.5303997
## [8,] -45.7986152
## [9,] -11.0362363
## [10,] -16.3785092
## [11,] 9.0844734
## [12,] 41.1967525
## [13,] 8.3853924
## [14,] 15.2070873
## [15,] -6.0974859
## [16,] 51.3548314
## [17,] 2.9064954
## [18,] -7.2085877
## [19,] -15.4989647
## [20,] 48.9662646
## [21,] 5.0063995
## [22,] -42.6194262
## [23,] 1.4006982
## [24,] 49.0011925
## [25,] -2.8472970
## [26,] 18.5734081
## [27,] -10.8808215
## [28,] 4.2356811
## [29,] -24.1899571
## [30,] 1.6384967
## [31,] 0.1441883
## [32,] 45.8290482
## [33,] -24.4570152
## [34,] 37.9576284
## [35,] 1.5814764
## [36,] -24.2768160
KTG = sum(galat^2)/dbG
print(paste("KTG:", round(KTG, 1)))
## [1] "KTG: 727.1"
Fhit = KTR/KTG
print(paste("Fhit:", round(Fhit, 3)))
## [1] "Fhit: 6.706"
With Fhit, dbR, and dbG defined, the p-value of the overall F test is
signif(pf(Fhit, dbR, dbG, lower.tail = FALSE), 4)
## [1] 0.001223
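For readability, a sketch (tabel_F is an illustrative name) that lays the overall F test out as a small ANOVA-style table; the sums of squares are recovered from the mean squares computed above:
tabel_F = data.frame(
  db = c(dbR, dbG),               # degrees of freedom
  JK = c(KTR * dbR, KTG * dbG),   # sums of squares (regression, error)
  KT = c(KTR, KTG),               # mean squares
  Fhit = c(Fhit, NA),
  row.names = c("Regresi", "Galat")
)
round(tabel_F, 3)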
se_b <- sqrt(diag(sigma_kuadrat[1]*solve(t(x)%*%x))) # take the diagonal of the covariance matrix before the square root
round(se_b, 4)
##       1      X1      X2      X4
## 11.9983  0.8127  0.9055  1.8119
se_b0 = se_b[1];print(paste("se_b0:",round(se_b0,4)))
## [1] "se_b0: 11.9983"
se_b1 = se_b[2];print(paste("se_b1:",round(se_b1,4)))
## [1] "se_b1: 0.8127"
se_b2 = se_b[3];print(paste("se_b2:",round(se_b2,4)))
## [1] "se_b2: 0.9055"
se_b3 = se_b[4];print(paste("se_b3:",round(se_b3,4)))
## [1] "se_b3: 1.8119"
t_b0 = b0/se_b0;print(paste("t_b0:",round(t_b0,2)))
## [1] "t_b0: 10.18"
t_b1 = b1/se_b1;print(paste("t_b1:",round(t_b1,2)))
## [1] "t_b1: 1.14"
t_b2 = b2/se_b2;print(paste("t_b2:",round(t_b2,2)))
## [1] "t_b2: 0.68"
t_b3 = b3/se_b3;print(paste("t_b3:",round(t_b3,2)))
## [1] "t_b3: 2.18"
print(paste("p_b0:",2*pt(-abs(t_b0),df = n-p)))
## [1] "p_b0: 1.47506344832897e-11"
print(paste("p_b1:",2*pt(-abs(t_b1),df = n-p)))
## [1] "p_b1: 0.263858037514289"
print(paste("p_b2:",2*pt(-abs(t_b2),df = n-p)))
## [1] "p_b2: 0.499874045483506"
print(paste("p_b3:",2*pt(-abs(t_b3),df = n-p)))
## [1] "p_b3: 0.0366193614765748"
t <- qt(.975, df = n-p)
BB_b0 = b0-t*se_b0
BA_b0 = b0+t*se_b0
BB_b1 = b1-t*se_b1
BA_b1 = b1+t*se_b1
BB_b2 = b2-t*se_b2
BA_b2 = b2+t*se_b2
BB_b3 = b3-t*se_b3
BA_b3 = b3+t*se_b3
Compute the lower and upper confidence limits and collect the point estimates (Fit):
Batas.Bawah = as.matrix(c(round(BB_b0,6),round(BB_b1,6),round(BB_b2,6),round(BB_b3,6)))
Batas.Atas = as.matrix(c(round(BA_b0,6),round(BA_b1,6),round(BA_b2,6),round(BA_b3,6)))
Fit = c(round(b0,6),round(b1,6),round(b2,6),round(b3,6))
Assemble the confidence interval (SK) table:
Selang.Kepercayaan = cbind(Batas.Bawah, Fit, Batas.Atas)
colnames(Selang.Kepercayaan ) <- c("Batas bawah Selang (2.5%)", "Fit", "Batas atas Selang (97.5%)")
rownames(Selang.Kepercayaan ) <- c("Intersep", "b1", "b2", "b3")
Selang.Kepercayaan
## Batas bawah Selang (2.5%) Fit Batas atas Selang (97.5%)
## Intersep 97.660912 122.100635 146.540358
## b1 -0.731095 0.924235 2.579564
## b2 -1.226507 0.617950 2.462407
## b3 0.261620 3.952337 7.643053
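The same intervals can also be built in one step from the coefficient vector and the standard errors, for comparison with confint(reg) below; a sketch (est and se_semua are illustrative names):
est = drop(b)                              # coefficients as a plain vector
se_semua = c(se_b0, se_b1, se_b2, se_b3)
SK = cbind(`2.5 %` = est - t * se_semua, Fit = est, `97.5 %` = est + t * se_semua)
round(SK, 6)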
The lm() function in R is used to fit a linear regression model to the data.
reg = lm (y ~ X1 + X2 + X4, data = data_selected)
summary(reg)
##
## Call:
## lm(formula = y ~ X1 + X2 + X4, data = data_selected)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.799 -16.805 -1.352 9.696 51.355
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 122.1006 11.9983 10.177 1.48e-11 ***
## X1 0.9242 0.8127 1.137 0.2639
## X2 0.6180 0.9055 0.682 0.4999
## X4 3.9523 1.8119 2.181 0.0366 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.97 on 32 degrees of freedom
## Multiple R-squared: 0.386, Adjusted R-squared: 0.3284
## F-statistic: 6.706 on 3 and 32 DF, p-value: 0.001223
Perform the analysis of variance (ANOVA table):
anova(reg)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## X1 1 8501.9 8501.9 11.6922 0.001731 **
## X2 1 2666.8 2666.8 3.6674 0.064461 .
## X4 1 3459.9 3459.9 4.7582 0.036619 *
## Residuals 32 23268.7 727.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Compute the confidence intervals:
confint(reg)
## 2.5 % 97.5 %
## (Intercept) 97.6609117 146.540358
## X1 -0.7310951 2.579564
## X2 -1.2265068 2.462407
## X4 0.2616205 7.643053
koef <- as.matrix(reg$coefficient)
penduga <- cbind(b, koef)
colnames(penduga) <- c('matriks', 'fungsi lm')
rownames(penduga) <- c("intersep", "b1", "b2", "b3")
penduga
## matriks fungsi lm
## intersep 122.1006350 122.1006350
## b1 0.9242345 0.9242345
## b2 0.6179503 0.6179503
## b3 3.9523366 3.9523366
# Model 1
a = summary(reg)
r.sq1 = a$r.squared
adj_r.sq1 = a$adj.r.squared
se1 = a$sigma
# Model 2
reg2 = lm(y~.-1, data = data_selected)
b = summary(reg2)
## Warning in summary.lm(reg2): essentially perfect fit: summary may be unreliable
r.sq2 = b$r.squared
adj_r.sq2 = b$adj.r.squared
se2 = b$sigma
# Compare the goodness of fit of the two models
model1 = as.matrix(c(r.sq1,adj_r.sq1,se1))
model2 = as.matrix(c(r.sq2,adj_r.sq2,se2))
tabel= cbind(model1, model2)
colnames(tabel) = c("Model 1", "Model 2")
rownames(tabel) = c("R-Square", "Adj R-Square", "Standar Error Sisaan")
tabel
## Model 1 Model 2
## R-Square 0.386006 1.000000e+00
## Adj R-Square 0.328444 1.000000e+00
## Standar Error Sisaan 26.965682 1.355674e-14
The output warns of an essentially perfect fit, meaning Model 2 reproduces the data almost exactly (R-squared of 1 and a residual standard error near zero). Here the cause lies in the formula itself: because the response was written as the lowercase workspace vector y rather than the column Y of data_selected, the dot in y ~ . - 1 expands to Y + X1 + X2 + X4, so the response effectively appears among the predictors and the fit is trivially perfect. More generally, such warnings can also indicate overfitting (too many variables relative to the number of observations) or severe multicollinearity among the predictors.
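If the intent of Model 2 was simply a no-intercept regression on X1, X2, and X4, a hedged sketch (my assumption, not part of the original analysis) would keep the response inside data_selected so that Y never appears on the right-hand side:
reg2b = lm(Y ~ X1 + X2 + X4 - 1, data = data_selected)  # no-intercept model on the predictors only
summary(reg2b)  # no longer a trivially perfect fit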