Dosen Pengampu: Prof. Dr. Suhartono, M.Kom
Mata Kuliah: Linear Algebra
Prodi: Teknik Informatika
Lembaga: Universitas Islam Negeri Maulana Malik Ibrahim Malang
Smoothing splines adalah pendekatan yang ampuh untuk memperkirakan hubungan fungsional antara prediktor X dan respons Y. Smoothing splines dapat ditampung baik menggunakan fungsi smooth.spline (dalam paket stats) atau fungsi ss (dalam paket npreg). Dokumen ini memberikan latar belakang teoritis tentang smoothing splines, serta contoh-contoh yang menggambarkan bagaimana menggunakan fungsi smooth.spline dan ss. Seperti yang saya tunjukkan dalam tutorial ini, kedua fungsi memiliki sintaks yang sangat mirip, tetapi fungsi ss menawarkan beberapa opsi tambahan. Dibandingkan dengan fungsi smooth.spline, fungsi ss memiliki
lebih banyak metode pemilihan parameter penghalusan lebih banyak jenis spline (linier, kubik, quintic) opsi untuk batasan periodisitas opsi untuk nilai simpul yang ditentukan pengguna ringkasan yang sesuai dan metode plot
Untuk melihat bagaimana kinerja fungsi-fungsi ini dalam praktiknya, mari kita lihat contoh simulasinya. Secara khusus, mari simulasikan beberapa data dengan hubungan fungsional (berkala) yang memiliki beberapa gangguan. Berikut Analisa Data dengan Regresi Nonparametrik dengan Pendekatan Smooting Spline Pada Data Google Mobility Index dan Covid-19 di Jakarta Agustus 2020.
## Import the data
library(readxl)
datacovidmobility <- read_excel(path = "Data_Covid_DKI_Agustus.xlsx")
datacovidmobility
plot(datacovidmobility$Tanggal,datacovidmobility$POSITIF)
library(jmv)
## Warning: package 'jmv' was built under R version 4.1.3
# Use the descritptives function to get the descritptive data
descriptives(datacovidmobility, vars = vars(POSITIF, retail_and_recreation_percent_change_from_baseline), freq = TRUE)
##
## DESCRIPTIVES
##
## Descriptives
## ----------------------------------------------------------------------------------------
## POSITIF retail_and_recreation_percent_change_from_baseline
## ----------------------------------------------------------------------------------------
## N 31 31
## Missing 0 0
## Mean 29854.23 -28.32258
## Median 29554.00 -27.00000
## Standard deviation 5474.232 4.061093
## Minimum 21575.00 -37.00000
## Maximum 40309.00 -21.00000
## ----------------------------------------------------------------------------------------
hist(datacovidmobility$POSITIF, main="Distribusi Positif di RS untuk Covid 19", xlab="Positif di RS Untuk Covid 19", xlim=c(3000,6000),ylim=c(0,15)) # histogram variabel wt
mean=mean(datacovidmobility$POSITIF) # Menghitung nilai mean variabel wt
abline(v=mean, col="green",lwd=2) # Menambahkan garis vertical pada plot
# 2. Density plot
d<-density(datacovidmobility$POSITIF) # menghitung density variabel wt
plot(d, xlab="Positif di RS untuk Covid 19", main="Distribusi Positif di RS untuk Covid 19")
library(npreg)
## Warning: package 'npreg' was built under R version 4.1.3
# fit using ss
mod.ss <- ss( datacovidmobility$Tanggal,datacovidmobility$POSITIF, nknots = 10)
mod.ss
##
## Call:
## ss(x = datacovidmobility$Tanggal, y = datacovidmobility$POSITIF,
## nknots = 10)
##
## Smoothing Parameter spar = 0.1570775 lambda = 8.130566e-07
## Equivalent Degrees of Freedom (Df) 9.619984
## Penalized Criterion (RSS) 77336.97
## Generalized Cross-Validation (GCV) 5244.846
# fit using smooth.spline
mod.smsp <- smooth.spline( datacovidmobility$Tanggal,datacovidmobility$POSITIF, nknots = 10)
mod.smsp
## Call:
## smooth.spline(x = datacovidmobility$Tanggal, y = datacovidmobility$POSITIF,
## nknots = 10)
##
## Smoothing Parameter spar= 0.09083055 lambda= 1.184222e-05 (16 iterations)
## Equivalent Degrees of Freedom (Df): 10.58814
## Penalized Criterion (RSS): 66356.69
## GCV: 4937.205
# rmse between solutions
sqrt(mean(( mod.ss$y - mod.smsp$y )^2))
## [1] 10.33558
# rmse between solutions and f(x)
sqrt(mean(( datacovidmobility$POSITIF - mod.ss$y )^2))
## [1] 49.94738
sqrt(mean(( datacovidmobility$POSITIF - mod.smsp$y )^2))
## [1] 46.26595
datacovidmobility$prediksi_ss <- mod.ss$y
datacovidmobility$prediksi_smsp <- mod.smsp$y
datacovidmobility
# plot results
plot(datacovidmobility$Tanggal, datacovidmobility$POSITIF)
lines(datacovidmobility$Tanggal, datacovidmobility$prediksi_ss , lty = 2, col = 'black', lwd = 4)
lines(datacovidmobility$Tanggal, datacovidmobility$prediksi_smsp, lty = 4, col = 'pink', lwd = 2)
# plot method
plot(datacovidmobility$Tanggal, datacovidmobility$POSITIF, lty = 10, col = 'black', lwd =5)
#plot(mod.ss)
# add lm fit
abline(coef(lm( datacovidmobility$POSITIF ~ datacovidmobility$Tanggal , data = datacovidmobility)), lty = 10, col = 'blue', lwd =5)
rug(datacovidmobility$Tanggal) # add rug to plot
#points(datacovidmobility$Tanggal, datacovidmobility$Dirawat, lty = 2, col = 2, lwd = 2)
points(datacovidmobility$Tanggal,mod.ss$y , lty = 2, col = 'red', lwd = 4)
#points(datacovidmobility$Tanggal,mod.smsp$y , lty = 2, col = 4, lwd = 2)
legend("topright",
legend = c("Real", "Model SS", "Trends"),
lty = 1:3, col = c("Black","Red","Blue"), lwd = 3, bty = "p")
library(npreg)
# fit using ss
mod.ss1 <- ss( datacovidmobility$Tanggal,datacovidmobility$retail_and_recreation_percent_change_from_baseline, nknots = 10)
mod.ss1
##
## Call:
## ss(x = datacovidmobility$Tanggal, y = datacovidmobility$retail_and_recreation_percent_change_from_baseline,
## nknots = 10)
##
## Smoothing Parameter spar = 0.3686358 lambda = 2.74518e-05
## Equivalent Degrees of Freedom (Df) 5.788914
## Penalized Criterion (RSS) 249.0038
## Generalized Cross-Validation (GCV) 12.14464
# fit using smooth.spline
mod.smsp1 <- smooth.spline( datacovidmobility$Tanggal,datacovidmobility$retail_and_recreation_percent_change_from_baseline, nknots = 10)
mod.smsp1
## Call:
## smooth.spline(x = datacovidmobility$Tanggal, y = datacovidmobility$retail_and_recreation_percent_change_from_baseline,
## nknots = 10)
##
## Smoothing Parameter spar= 0.3479396 lambda= 0.0008521042 (10 iterations)
## Equivalent Degrees of Freedom (Df): 5.797622
## Penalized Criterion (RSS): 248.853
## GCV: 12.14567
datacovidmobility$prediksi_ss1 <- mod.ss1$y
datacovidmobility$prediksi_smsp1 <- mod.smsp1$y
datacovidmobility
# plot results
plot(datacovidmobility$Tanggal, datacovidmobility$retail_and_recreation_percent_change_from_baseline, lty = 10, col = 'black', lwd =5)
# add lm fit
abline(coef(lm( datacovidmobility$retail_and_recreation_percent_change_from_baseline ~ datacovidmobility$Tanggal , data = datacovidmobility)), lty = 10, col = 'blue', lwd =5)
lines(datacovidmobility$Tanggal, datacovidmobility$prediksi_ss1 , lty = 2, col = 'black', lwd = 4)
lines(datacovidmobility$Tanggal, datacovidmobility$prediksi_smsp1, lty = 4, col = 'red', lwd = 2)
Untuk melakukan analisis korelasi Pearson cukup sederhana. Semisal kita ingin mengetahui hubungan antara mobilityjakartaPOSITIFdanmobilityjakartaretail_and_recreation_percent_change_from_baseline,
cor.test(datacovidmobility$POSITIF,datacovidmobility$retail_and_recreation_percent_change_from_baseline)
##
## Pearson's product-moment correlation
##
## data: datacovidmobility$POSITIF and datacovidmobility$retail_and_recreation_percent_change_from_baseline
## t = 1.9056, df = 29, p-value = 0.06666
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.02353599 0.61520699
## sample estimates:
## cor
## 0.3335862
Dari output tersebut dapat kita simpulkan bahwa tidak ada hubungan yang signifikan antara mobilityjakartaPOSITIF dan mobilityjakartaretail_and_recreation_percent_change_from_baseline (p-value<0,01). Nilai p-value dalam output R dituliskan 0.06666 Nilai koefisien korelasi r adalah sebesar 0.3335862 yang menunjukkan hubungan yang lemah dan terbalik.
model <- lm(datacovidmobility$POSITIF ~ datacovidmobility$retail_and_recreation_percent_change_from_baseline, data = datacovidmobility)
summary(model)
##
## Call:
## lm(formula = datacovidmobility$POSITIF ~ datacovidmobility$retail_and_recreation_percent_change_from_baseline,
## data = datacovidmobility)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8005.9 -4187.8 263.4 4081.7 9280.7
##
## Coefficients:
## Estimate
## (Intercept) 42589.9
## datacovidmobility$retail_and_recreation_percent_change_from_baseline 449.7
## Std. Error
## (Intercept) 6749.5
## datacovidmobility$retail_and_recreation_percent_change_from_baseline 236.0
## t value
## (Intercept) 6.310
## datacovidmobility$retail_and_recreation_percent_change_from_baseline 1.906
## Pr(>|t|)
## (Intercept) 6.83e-07
## datacovidmobility$retail_and_recreation_percent_change_from_baseline 0.0667
##
## (Intercept) ***
## datacovidmobility$retail_and_recreation_percent_change_from_baseline .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5249 on 29 degrees of freedom
## Multiple R-squared: 0.1113, Adjusted R-squared: 0.08063
## F-statistic: 3.631 on 1 and 29 DF, p-value: 0.06666
Rumus linear regresi menjadi y = 42589.9 + (449.7) datacovidmobility$retail_and_recreation_percent_change_from_baseline
model$coefficients
## (Intercept)
## 42589.8773
## datacovidmobility$retail_and_recreation_percent_change_from_baseline
## 449.6642
model$fitted.values
## 1 2 3 4 5 6 7 8
## 25952.30 25952.30 30448.94 29999.28 30448.94 30448.94 32247.60 30448.94
## 9 10 11 12 13 14 15 16
## 28200.62 31348.27 29999.28 30898.61 28200.62 32247.60 29999.28 27750.96
## 17 18 19 20 21 22 23 24
## 28200.62 28650.29 30898.61 28200.62 29549.61 28200.62 26851.63 30448.94
## 25 26 27 28 29 30 31
## 30898.61 30898.61 31348.27 33146.93 31348.27 29999.28 32247.60
datacovidmobility$prediksi_linear <- model$fitted.values
datacovidmobility
# Function to add histograms
panel.hist <- function(x, ...) {
usr <- par("usr")
on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5))
his <- hist(x, plot = FALSE)
breaks <- his$breaks
nB <- length(breaks)
y <- his$counts
y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col = rgb(0, 1, 1, alpha = 0.5), ...)
# lines(density(x), col = 2, lwd = 2) # Uncomment to add density lines
}
# Equivalent with a formula
pairs(datacovidmobility$POSITIF~datacovidmobility$retail_and_recreation_percent_change_from_baseline, data = datacovidmobility,
upper.panel = NULL, # Disabling the upper panel
diag.panel = panel.hist) # Adding the histograms
# plot method
plot(datacovidmobility$Tanggal, datacovidmobility$POSITIF, lty = 10, col = 'black', lwd =5)
#plot(mod.ss)
# add lm fit
abline(coef(lm( datacovidmobility$POSITIF ~ datacovidmobility$Tanggal , data = datacovidmobility)), lty = 10, col = 'blue', lwd =5)
rug(datacovidmobility$Tanggal) # add rug to plot
#points(datacovidmobility$Tanggal, datacovidmobility$Dirawat, lty = 2, col = 2, lwd = 2)
lines(datacovidmobility$Tanggal,mod.ss$y , lty = 2, col = 'red', lwd = 4)
#points(datacovidmobility$Tanggal,mod.smsp$y , lty = 2, col = 4, lwd = 2)
lines(datacovidmobility$Tanggal, datacovidmobility$prediksi_linear, lty = 2, col = 6, lwd = 4)
legend("topright",
legend = c("Real", "Model SS", "Trends", "Linear Regresi"),
lty = 1:4, col = c("Black","Red","Blue","Pink"), lwd = 3, bty = "p")