Smoothing merupakan salah satu metode yang digunakan dalam analisis data non parametrik. Tujuan dari smoothing adalah untuk meminimalkan keragaman karakteristik data dari data yang tidak memiliki pengaruh sehingga ciri-ciri dari data akan tampak lebih jelas.
Smooth Spline Model adalah Salah satu model regresi dengan pendekatan non parametrik yang dapat digunakan untuk menduga kurva regresi adalah regresi spline.
Regresi spline merupakan smoothing untuk memplot data dengan mempertimbangkan kemulusan kurva. Spline adalah model polinomial yang tersegmentasi atau terbagi, dan sifat segmen ini memberikan fleksibilitas yang lebih besar daripada model polinomial biasa. Properti ini memungkinkan model regresi spline untuk secara efektif disesuaikan dengan properti lokal data.
Tujuan dari Analisa Data saat ini adalah membandingkan Kasus Positif Covid-19 dengan pergerakan orang di tempat Bekerja. Masing-masing unsur menggunakan Smooth Spline sehingga akan didapat Regresi Linear diantaranya dan memprediksi adakah hubungan keterikatan antara Kasus Positif Covid-19 dan Lokasi Tempat Bekerja.
Sumber Data diambil dari Data Covid-19 DKI Jakarta dari website Pemprov DKI Jakarta dan Google mobility index pada bulan Oktober 2021
#membaca data excel
library(readxl)
data.corona <- read_excel(path = "datacovidDKI.xlsx")
data.corona
## # A tibble: 31 x 13
## `Nama Kota` Tanggal Positif Dirawat Sembuh Meninggal
## <chr> <dttm> <dbl> <dbl> <dbl> <dbl>
## 1 Jakarta 2021-10-01 00:00:00 857916 504 842715 13524
## 2 Jakarta 2021-10-02 00:00:00 858071 483 842842 13529
## 3 Jakarta 2021-10-03 00:00:00 858198 500 842929 13534
## 4 Jakarta 2021-10-04 00:00:00 858347 461 843091 13534
## 5 Jakarta 2021-10-05 00:00:00 858419 444 843239 13538
## 6 Jakarta 2021-10-06 00:00:00 858622 451 843414 13540
## 7 Jakarta 2021-10-07 00:00:00 858771 448 843529 13541
## 8 Jakarta 2021-10-08 00:00:00 858921 436 843738 13541
## 9 Jakarta 2021-10-09 00:00:00 859021 427 843822 13543
## 10 Jakarta 2021-10-10 00:00:00 859161 438 843891 13547
## # ... with 21 more rows, and 7 more variables: SelfIsolation <dbl>,
## # retail_and_recreation_percent_change_from_baseline <dbl>,
## # grocery_and_pharmacy_percent_change_from_baseline <dbl>,
## # parks_percent_change_from_baseline <dbl>,
## # transit_stations_percent_change_from_baseline <dbl>,
## # workplaces_percent_change_from_baseline <dbl>,
## # residential_percent_change_from_baseline <dbl>
library(jmv)
descriptives(data.corona, vars = vars(Dirawat, parks_percent_change_from_baseline), freq = TRUE)
##
## DESCRIPTIVES
##
## Descriptives
## ------------------------------------------------------------------------
## Dirawat parks_percent_change_from_baseline
## ------------------------------------------------------------------------
## N 31 31
## Missing 0 0
## Mean 357.7742 -30.77419
## Median 348.0000 -32.00000
## Standard deviation 80.70841 6.651865
## Minimum 256.0000 -46.00000
## Maximum 504.0000 -15.00000
## ------------------------------------------------------------------------
d<-density(data.corona$Dirawat) # menghitung density variabel wt
plot(d, xlab="Dirawat di RS untuk Covid 19", main="Distribusi Dirawat di RS untuk Covid 19")
library(npreg)
mod.ss <- ss( data.corona$Tanggal,data.corona$Dirawat, nknots = 10)
mod.ss
##
## Call:
## ss(x = data.corona$Tanggal, y = data.corona$Dirawat, nknots = 10)
##
## Smoothing Parameter spar = 0.5439813 lambda = 0.0005074494
## Equivalent Degrees of Freedom (Df) 3.398366
## Penalized Criterion (RSS) 9926.544
## Generalized Cross-Validation (GCV) 403.9152
mod.smsp <- smooth.spline( data.corona$Tanggal,data.corona$Dirawat, nknots = 10)
mod.smsp
## Call:
## smooth.spline(x = data.corona$Tanggal, y = data.corona$Dirawat,
## nknots = 10)
##
## Smoothing Parameter spar= 0.5235182 lambda= 0.01582997 (15 iterations)
## Equivalent Degrees of Freedom (Df): 3.395409
## Penalized Criterion (RSS): 9930.293
## GCV: 403.9811
sqrt(mean(( mod.ss$y - mod.smsp$y )^2))
## [1] 0.01229592
sqrt(mean(( data.corona$Dirawat - mod.ss$y )^2))
## [1] 17.89444
sqrt(mean(( data.corona$Dirawat - mod.smsp$y )^2))
## [1] 17.89782
data.corona$prediksi_ss <- mod.ss$y
data.corona$prediksi_smsp <- mod.smsp$y
data.corona
## # A tibble: 31 x 15
## `Nama Kota` Tanggal Positif Dirawat Sembuh Meninggal
## <chr> <dttm> <dbl> <dbl> <dbl> <dbl>
## 1 Jakarta 2021-10-01 00:00:00 857916 504 842715 13524
## 2 Jakarta 2021-10-02 00:00:00 858071 483 842842 13529
## 3 Jakarta 2021-10-03 00:00:00 858198 500 842929 13534
## 4 Jakarta 2021-10-04 00:00:00 858347 461 843091 13534
## 5 Jakarta 2021-10-05 00:00:00 858419 444 843239 13538
## 6 Jakarta 2021-10-06 00:00:00 858622 451 843414 13540
## 7 Jakarta 2021-10-07 00:00:00 858771 448 843529 13541
## 8 Jakarta 2021-10-08 00:00:00 858921 436 843738 13541
## 9 Jakarta 2021-10-09 00:00:00 859021 427 843822 13543
## 10 Jakarta 2021-10-10 00:00:00 859161 438 843891 13547
## # ... with 21 more rows, and 9 more variables: SelfIsolation <dbl>,
## # retail_and_recreation_percent_change_from_baseline <dbl>,
## # grocery_and_pharmacy_percent_change_from_baseline <dbl>,
## # parks_percent_change_from_baseline <dbl>,
## # transit_stations_percent_change_from_baseline <dbl>,
## # workplaces_percent_change_from_baseline <dbl>,
## # residential_percent_change_from_baseline <dbl>, prediksi_ss <dbl>, ...
# plot results
plot(data.corona$Tanggal, data.corona$Dirawat)
lines(data.corona$Tanggal, data.corona$prediksi_ss , lty = 2, col = 'red', lwd = 4)
lines(data.corona$Tanggal, data.corona$prediksi_smsp, lty = 4, col = 'green', lwd = 2)
# plot method
plot(data.corona$Tanggal, data.corona$Dirawat, lty = 10, col = 'Green', lwd =5)
#plot(mod.ss)
# add lm fit
abline(coef(lm( data.corona$Dirawat ~ data.corona$Tanggal , data = data.corona)), lty = 10, col = 'Yellow', lwd =5)
rug(data.corona$Tanggal) # add rug to plot
#points(data.corona$Tanggal, data.corona$Dirawat, lty = 2, col = 2, lwd = 2)
points(data.corona$Tanggal,mod.ss$y , lty = 2, col = 'red', lwd = 4)
#points(data.corona$Tanggal,mod.smsp$y , lty = 2, col = 4, lwd = 2)
legend("topright",
legend = c("Real", "Model SS", "Trends"),
lty = 1:3, col = c("Green","Red","Yellow"), lwd = 3, bty = "p")
library(npreg)
# fit using ss
mod.ss1 <- ss( data.corona$Tanggal,data.corona$parks_percent_change_from_baseline, nknots = 10)
mod.ss1
##
## Call:
## ss(x = data.corona$Tanggal, y = data.corona$parks_percent_change_from_baseline,
## nknots = 10)
##
## Smoothing Parameter spar = 0.04617079 lambda = 1.284846e-07
## Equivalent Degrees of Freedom (Df) 10.63747
## Penalized Criterion (RSS) 358.7872
## Generalized Cross-Validation (GCV) 26.82471
mod.smsp1 <- smooth.spline( data.corona$Tanggal,data.corona$parks_percent_change_from_baseline, nknots = 10)
mod.smsp1
## Call:
## smooth.spline(x = data.corona$Tanggal, y = data.corona$parks_percent_change_from_baseline,
## nknots = 10)
##
## Smoothing Parameter spar= 0.08753934 lambda= 1.121128e-05 (15 iterations)
## Equivalent Degrees of Freedom (Df): 10.63323
## Penalized Criterion (RSS): 391.4619
## GCV: 29.25547
data.corona
data.corona$prediksi_ss1 <- mod.ss1$y
data.corona$prediksi_smsp1 <- mod.smsp1$y
data.corona
## # A tibble: 31 x 17
## `Nama Kota` Tanggal Positif Dirawat Sembuh Meninggal
## <chr> <dttm> <dbl> <dbl> <dbl> <dbl>
## 1 Jakarta 2021-10-01 00:00:00 857916 504 842715 13524
## 2 Jakarta 2021-10-02 00:00:00 858071 483 842842 13529
## 3 Jakarta 2021-10-03 00:00:00 858198 500 842929 13534
## 4 Jakarta 2021-10-04 00:00:00 858347 461 843091 13534
## 5 Jakarta 2021-10-05 00:00:00 858419 444 843239 13538
## 6 Jakarta 2021-10-06 00:00:00 858622 451 843414 13540
## 7 Jakarta 2021-10-07 00:00:00 858771 448 843529 13541
## 8 Jakarta 2021-10-08 00:00:00 858921 436 843738 13541
## 9 Jakarta 2021-10-09 00:00:00 859021 427 843822 13543
## 10 Jakarta 2021-10-10 00:00:00 859161 438 843891 13547
## # ... with 21 more rows, and 11 more variables: SelfIsolation <dbl>,
## # retail_and_recreation_percent_change_from_baseline <dbl>,
## # grocery_and_pharmacy_percent_change_from_baseline <dbl>,
## # parks_percent_change_from_baseline <dbl>,
## # transit_stations_percent_change_from_baseline <dbl>,
## # workplaces_percent_change_from_baseline <dbl>,
## # residential_percent_change_from_baseline <dbl>, prediksi_ss <dbl>, ...
# plot results
plot(data.corona$Tanggal, data.corona$parks_percent_change_from_baseline, lty = 10, col = 'yellow', lwd =5)
# add lm fit
abline(coef(lm( data.corona$parks_percent_change_from_baseline ~ data.corona$Tanggal , data = data.corona)), lty = 10, col = 'green', lwd =5)
lines(data.corona$Tanggal, data.corona$prediksi_ss1 , lty = 2, col = 'yellow', lwd = 4)
lines(data.corona$Tanggal, data.corona$prediksi_smsp1, lty = 4, col = 'red', lwd = 2)
Korelasi Pearson digunakan untuk melakukan analisis korelasi Pearson cukup sederhana. Semisal kita ingin mengetahui hubungan antara mobilityjakartaDirawat dan mobilityjakartaparks_percent_change_from_baseline
cor.test(data.corona$Dirawat,data.corona$parks_percent_change_from_baseline)
##
## Pearson's product-moment correlation
##
## data: data.corona$Dirawat and data.corona$parks_percent_change_from_baseline
## t = -2.5381, df = 29, p-value = 0.01678
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.67821562 -0.08480143
## sample estimates:
## cor
## -0.4263309
model <- lm(data.corona$Dirawat ~ data.corona$parks_percent_change_from_baseline, data = data.corona)
summary(model)
##
## Call:
## lm(formula = data.corona$Dirawat ~ data.corona$parks_percent_change_from_baseline,
## data = data.corona)
##
## Residuals:
## Min 1Q Median 3Q Max
## -109.29 -49.94 -14.12 59.52 155.40
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 198.587 64.122 3.097
## data.corona$parks_percent_change_from_baseline -5.173 2.038 -2.538
## Pr(>|t|)
## (Intercept) 0.00431 **
## data.corona$parks_percent_change_from_baseline 0.01678 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 74.25 on 29 degrees of freedom
## Multiple R-squared: 0.1818, Adjusted R-squared: 0.1535
## F-statistic: 6.442 on 1 and 29 DF, p-value: 0.01678
model$coefficients
## (Intercept)
## 198.586731
## data.corona$parks_percent_change_from_baseline
## -5.172758
model$fitted.values
## 1 2 3 4 5 6 7 8
## 348.5967 364.1150 400.3243 400.3243 400.3243 379.6333 364.1150 327.9057
## 9 10 11 12 13 14 15 16
## 353.7695 379.6333 379.6333 379.6333 369.2878 364.1150 327.9057 364.1150
## 17 18 19 20 21 22 23 24
## 395.1515 436.5336 379.6333 296.8691 358.9422 322.7329 322.7329 338.2512
## 25 26 27 28 29 30 31
## 364.1150 364.1150 369.2878 338.2512 276.1781 312.3874 312.3874
data.corona$prediksi_linear <- model$fitted.values
data.corona
## # A tibble: 31 x 18
## `Nama Kota` Tanggal Positif Dirawat Sembuh Meninggal
## <chr> <dttm> <dbl> <dbl> <dbl> <dbl>
## 1 Jakarta 2021-10-01 00:00:00 857916 504 842715 13524
## 2 Jakarta 2021-10-02 00:00:00 858071 483 842842 13529
## 3 Jakarta 2021-10-03 00:00:00 858198 500 842929 13534
## 4 Jakarta 2021-10-04 00:00:00 858347 461 843091 13534
## 5 Jakarta 2021-10-05 00:00:00 858419 444 843239 13538
## 6 Jakarta 2021-10-06 00:00:00 858622 451 843414 13540
## 7 Jakarta 2021-10-07 00:00:00 858771 448 843529 13541
## 8 Jakarta 2021-10-08 00:00:00 858921 436 843738 13541
## 9 Jakarta 2021-10-09 00:00:00 859021 427 843822 13543
## 10 Jakarta 2021-10-10 00:00:00 859161 438 843891 13547
## # ... with 21 more rows, and 12 more variables: SelfIsolation <dbl>,
## # retail_and_recreation_percent_change_from_baseline <dbl>,
## # grocery_and_pharmacy_percent_change_from_baseline <dbl>,
## # parks_percent_change_from_baseline <dbl>,
## # transit_stations_percent_change_from_baseline <dbl>,
## # workplaces_percent_change_from_baseline <dbl>,
## # residential_percent_change_from_baseline <dbl>, prediksi_ss <dbl>, ...
# Function to add histograms
panel.hist <- function(x, ...) {
usr <- par("usr")
on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5))
his <- hist(x, plot = FALSE)
breaks <- his$breaks
nB <- length(breaks)
y <- his$counts
y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col = rgb(0, 0, 1, alpha = 0.5), ...)
# lines(density(x), col = 2, lwd = 2) # Uncomment to add density lines
}
# Equivalent with a formula
pairs(data.corona$Dirawat~data.corona$parks_percent_change_from_baseline, data = data.corona,
upper.panel = NULL, # Disabling the upper panel
diag.panel = panel.hist) # Adding the histograms
# plot method
plot(data.corona$Tanggal, data.corona$Dirawat, lty = 10, col = 'black', lwd =5)
#plot(mod.ss)
# add lm fit
abline(coef(lm( data.corona$Dirawat ~ data.corona$Tanggal , data = data.corona)), lty = 10, col = 'green', lwd =5)
rug(data.corona$Tanggal) # add rug to plot
#points(data.corona$Tanggal, mobilityjakarta$Dirawat, lty = 2, col = 2, lwd = 2)
lines(data.corona$Tanggal,mod.ss$y , lty = 2, col = 'Orange', lwd = 4)
#points(data.corona$Tanggal,mod.smsp$y , lty = 2, col = 4, lwd = 2)
lines(data.corona$Tanggal, data.corona$prediksi_linear, lty = 2, col = 6, lwd = 4)
legend("topright",
legend = c("Real", "Model SS", "Trends", "Linear Regresi"),
lty = 1:4, col = c("Black","Orange","green","Pink"), lwd = 3, bty = "p")
https://github.com/juba/rmdformats
https://medium.com/machine-learning-id/simple-linear-regression-r-ea42aed17c52