Memuat Package yang Digunakan
library(tidyverse)
library(mlbench)
Berikut contoh linear regression menggunakan dataset cars.
# Preview the built-in `cars` data set (columns `speed` and `dist`) as a tibble.
cars %>%
  as_tibble()
## # A tibble: 50 x 2
## speed dist
## <dbl> <dbl>
## 1 4 2
## 2 4 10
## 3 7 4
## 4 7 22
## 5 8 16
## 6 9 10
## 7 10 18
## 8 10 26
## 9 10 34
## 10 11 17
## # ... with 40 more rows
Plot data dengan model linear.
# Scatter plot of the observations with a linear-model fit drawn on top.
ggplot(data = cars, mapping = aes(x = speed, y = dist)) +
  geom_point() +
  # Use a linear model as the smoothing method.
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
Berikut merupakan informasi dari model.
# Fit dist ~ speed (with intercept) on `cars` and print the model summary.
cars %>%
  lm(formula = dist ~ speed, data = .) %>%
  summary()
##
## Call:
## lm(formula = dist ~ speed, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.069 -9.525 -2.272 9.215 43.201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.5791 6.7584 -2.601 0.0123 *
## speed 3.9324 0.4155 9.464 1.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
## F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12
# Fit dist ~ speed on `cars` and print only the estimated coefficients.
cars %>%
  lm(formula = dist ~ speed, data = .) %>%
  coefficients()
## (Intercept) speed
## -17.579095 3.932409
# Fit dist ~ speed on `cars` and print the coefficient confidence intervals.
cars %>%
  lm(formula = dist ~ speed, data = .) %>%
  confint()
## 2.5 % 97.5 %
## (Intercept) -31.167850 -3.990340
## speed 3.096964 4.767853
Selanjutnya, kita membuat model dengan nilai parameter yang berbeda. Untuk menyederhanakan masalah, intercept ditetapkan sama dengan 0, kemudian digambar garis lurus untuk beberapa nilai koefisien speed yang berbeda.
Berikut plot dari model dengan parameter yang berbeda dari beberapa nilai koefisien speed.
# Predict distances from a no-intercept linear model: dist = theta_1 * speed.
#
# @param speed   Numeric vector of speeds.
# @param theta_1 Slope applied to `speed`.
# @return A data frame with columns `speed`, `dist` (the prediction) and
#   `theta` (the slope stored as a factor, so it can drive a discrete colour).
predict_dist <- function(speed, theta_1) {
  predicted <- theta_1 * speed
  slope_label <- as.factor(theta_1)
  data.frame(speed = speed, dist = predicted, theta = slope_label)
}
# Overlay no-intercept candidate lines for several slopes on the observed data.
ggplot(cars, aes(x = speed, y = dist, colour = theta)) +
  # Points get a fixed colour, so only the line layers use `theta`.
  geom_point(colour = "black") +
  # One line layer per candidate slope; each layer supplies its own data frame.
  lapply(
    c(2, 3, 4),
    function(slope) geom_line(data = predict_dist(cars$speed, slope))
  ) +
  scale_color_discrete(name = expression(theta[1]))
Nilai koefisien yang menghasilkan error terkecil adalah estimasi terbaik.
Berikut plot error dari masing-masing koefisien speed.
# Candidate slope values at which the fitting error is evaluated.
thetas <- seq(0, 5, length.out = 50)

# Sum of squared errors of the no-intercept model dist = theta * speed,
# evaluated on the `cars` data set.
#
# @param thetas Numeric vector of candidate slopes.
# @return Numeric vector with one sum of squared errors per element of
#   `thetas`. vapply() guarantees a numeric result even for empty input,
#   where the previous Vectorize()-based version returned an empty list.
fitting_error <- function(thetas) {
  vapply(
    thetas,
    function(theta) sum((theta * cars$speed - cars$dist)^2),
    numeric(1)
  )
}
# Plot the fitting error against each candidate slope value.
ggplot(
  data.frame(thetas = thetas, errors = fitting_error(thetas)),
  aes(x = thetas, y = errors)
) +
  geom_line() +
  # Label the x axis with the theta[1] symbol; the y axis is left blank.
  labs(x = expression(theta[1]), y = "")
Berikut merupakan model dengan parameter lain (intercept = 0).
# Fit dist ~ speed without an intercept (the `- 1` term) and print the summary.
cars %>%
  lm(formula = dist ~ speed - 1, data = .) %>%
  summary()
##
## Call:
## lm(formula = dist ~ speed - 1, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.183 -12.637 -5.455 4.590 50.181
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## speed 2.9091 0.1414 20.58 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.26 on 49 degrees of freedom
## Multiple R-squared: 0.8963, Adjusted R-squared: 0.8942
## F-statistic: 423.5 on 1 and 49 DF, p-value: < 2.2e-16
# Fit the no-intercept model and print only the estimated slope.
cars %>%
  lm(formula = dist ~ speed - 1, data = .) %>%
  coefficients()
## speed
## 2.909132
# Fit the no-intercept model and print the confidence interval of the slope.
cars %>%
  lm(formula = dist ~ speed - 1, data = .) %>%
  confint()
## 2.5 % 97.5 %
## speed 2.625041 3.193223
# Scatter plot of `cars` with a linear fit that has no intercept term.
ggplot(data = cars, mapping = aes(x = speed, y = dist)) +
  geom_point() +
  # `y ~ x - 1` removes the intercept from the smoothing model.
  geom_smooth(method = "lm", formula = y ~ x - 1)
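Persamaan regresi logistik adalah:
$$\log\left(\frac{p}{1 - p}\right) = \theta_0 + \theta_1 x$$
dengan $p$ adalah peluang kejadian dan $x$ adalah variabel prediktor.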
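Peluang (probabilitas) pada regresi logistik adalah:
$$p = \frac{1}{1 + e^{-(\theta_0 + \theta_1 x)}}$$
dengan $p$ adalah peluang kejadian dan $x$ adalah variabel prediktor.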
# Load the BreastCancer data set into the workspace and preview it as a tibble.
data("BreastCancer")
BreastCancer %>%
  as_tibble()
## # A tibble: 699 x 11
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## <chr> <ord> <ord> <ord> <ord> <ord>
## 1 1000~ 5 1 1 1 2
## 2 1002~ 5 4 4 5 7
## 3 1015~ 3 1 1 1 2
## 4 1016~ 6 8 8 1 3
## 5 1017~ 4 1 1 3 2
## 6 1017~ 8 10 10 8 7
## 7 1018~ 1 1 1 1 2
## 8 1018~ 2 1 2 1 2
## 9 1033~ 2 1 1 1 2
## 10 1033~ 4 2 1 1 2
## # ... with 689 more rows, and 5 more variables: Bare.nuclei <fct>,
## # Bl.cromatin <fct>, Normal.nucleoli <fct>, Mitoses <fct>, Class <fct>
# Jittered scatter plot of clump thickness against tumour class.
ggplot(data = BreastCancer, mapping = aes(x = Cl.thickness, y = Class)) +
  # Jitter and transparency reduce overplotting of the discrete values.
  geom_jitter(height = 0.05, width = 0.3, alpha = 0.4)
# Plot a numeric malignancy indicator against numeric clump thickness and
# overlay a binomial GLM fit.
BreastCancer %>%
  mutate(
    # Convert the ordered factor to its numeric value via its character label.
    Cl.thickness.numeric = as.numeric(as.character(Cl.thickness)),
    # Encode the class as 0 for "benign" and 1 otherwise.
    IsMalignant = ifelse(Class == "benign", 0, 1)
  ) %>%
  ggplot(mapping = aes(x = Cl.thickness.numeric, y = IsMalignant)) +
  geom_jitter(height = 0.05, width = 0.3, alpha = 0.4) +
  geom_smooth(method = "glm", method.args = list(family = "binomial"))
## `geom_smooth()` using formula 'y ~ x'
# Fit a binomial GLM of the malignancy indicator on numeric clump thickness
# and print the fitted model.
BreastCancer %>%
  mutate(
    # Convert the ordered factor to its numeric value via its character label.
    Cl.thickness.numeric = as.numeric(as.character(Cl.thickness)),
    # Encode the class as 0 for "benign" and 1 otherwise.
    IsMalignant = ifelse(Class == "benign", 0, 1)
  ) %>%
  glm(
    formula = IsMalignant ~ Cl.thickness.numeric,
    family = "binomial",
    data = .
  )
##
## Call: glm(formula = IsMalignant ~ Cl.thickness.numeric, family = "binomial",
## data = .)
##
## Coefficients:
## (Intercept) Cl.thickness.numeric
## -5.1602 0.9355
##
## Degrees of Freedom: 698 Total (i.e. Null); 697 Residual
## Null Deviance: 900.5
## Residual Deviance: 464.1 AIC: 468.1