SUPERVISED LEARNING

Memuat Package yang Digunakan

library(tidyverse)
library(mlbench)

Linear Regression (Regression)

Berikut contoh linear regression menggunakan dataset cars.

# Preview the `cars` data set used throughout the regression example.
cars %>%
  as_tibble()
## # A tibble: 50 x 2
##    speed  dist
##    <dbl> <dbl>
##  1     4     2
##  2     4    10
##  3     7     4
##  4     7    22
##  5     8    16
##  6     9    10
##  7    10    18
##  8    10    26
##  9    10    34
## 10    11    17
## # ... with 40 more rows

Plot data dengan model linear.

# Scatter plot of dist against speed with a fitted linear-model line on top.
ggplot(data = cars, mapping = aes(x = speed, y = dist)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

Berikut merupakan informasi dari model.

# Fit the linear model dist ~ speed and print its full summary.
# `data = .` is kept so the printed Call matches the piped form.
cars %>%
  lm(formula = dist ~ speed, data = .) %>%
  summary()
## 
## Call:
## lm(formula = dist ~ speed, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.069  -9.525  -2.272   9.215  43.201 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17.5791     6.7584  -2.601   0.0123 *  
## speed         3.9324     0.4155   9.464 1.49e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared:  0.6511, Adjusted R-squared:  0.6438 
## F-statistic: 89.57 on 1 and 48 DF,  p-value: 1.49e-12
# Point estimates of the intercept and the speed coefficient.
cars %>%
  lm(formula = dist ~ speed, data = .) %>%
  coef()
## (Intercept)       speed 
##  -17.579095    3.932409
# Confidence intervals for both coefficients (confint's default level).
cars %>%
  lm(formula = dist ~ speed, data = .) %>%
  confint()
##                  2.5 %    97.5 %
## (Intercept) -31.167850 -3.990340
## speed         3.096964  4.767853

Selanjutnya, buat model dengan parameter yang berbeda. Untuk menyederhanakan masalah, intercept ditetapkan sama dengan 0, kemudian digambar garis lurus untuk beberapa nilai koefisien speed.

Berikut plot dari model dengan parameter yang berbeda dari beberapa nilai koefisien speed.

#' Predictions of the zero-intercept line dist = theta_1 * speed
#'
#' @param speed Numeric vector of speeds; may be empty.
#' @param theta_1 Slope applied to `speed`.
#' @return A data frame with columns `speed`, `dist` (`theta_1 * speed`) and
#'   `theta` (`theta_1` as a factor, so it can drive a discrete colour scale).
predict_dist <- function(speed, theta_1) {
  theta <- as.factor(theta_1)
  if (length(speed) == 0L) {
    # data.frame() refuses to recycle a length-one column to zero rows, so
    # drop the value but keep its level to return an empty, well-typed frame.
    theta <- theta[0L]
  }
  data.frame(
    speed = speed,
    dist = theta_1 * speed,
    theta = theta
  )
}


# Observations as black points, overlaid with the zero-intercept lines
# dist = theta_1 * speed for theta_1 = 2, 3 and 4; line colour encodes theta_1.
# Adding a list to a ggplot appends each layer in order.
ggplot(cars, aes(x = speed, y = dist, colour = theta)) +
  geom_point(colour = "black") +
  lapply(
    c(2, 3, 4),
    function(slope) geom_line(data = predict_dist(cars$speed, slope))
  ) +
  scale_color_discrete(name = expression(theta[1]))

Estimasi terbaik adalah nilai koefisien yang menghasilkan error terkecil.

Berikut plot error dari masing-masing koefisien speed.

# Grid of 50 evenly spaced candidate slopes from 0 to 5.
thetas <- seq(from = 0, to = 5, length.out = 50)

# Sum of squared residuals of the zero-intercept model dist = thetas * speed
# on `cars`. Vectorize() makes a vector of slopes return one error per slope.
fitting_error <- Vectorize(function(thetas) {
  residual <- thetas * cars$speed - cars$dist
  sum(residual^2)
})

# Squared-error curve over the grid of candidate slopes; the x axis is
# labelled theta[1] and the y axis label is left empty.
ggplot(
  data.frame(thetas = thetas, errors = fitting_error(thetas)),
  aes(x = thetas, y = errors)
) +
  geom_line() +
  labs(x = expression(theta[1]), y = "")

Berikut merupakan model dengan parameter lain (intercept = 0).

# Fit dist ~ speed without an intercept (the `- 1` term) and print the summary.
# `data = .` is kept so the printed Call matches the piped form.
cars %>%
  lm(formula = dist ~ speed - 1, data = .) %>%
  summary()
## 
## Call:
## lm(formula = dist ~ speed - 1, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.183 -12.637  -5.455   4.590  50.181 
## 
## Coefficients:
##       Estimate Std. Error t value Pr(>|t|)    
## speed   2.9091     0.1414   20.58   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.26 on 49 degrees of freedom
## Multiple R-squared:  0.8963, Adjusted R-squared:  0.8942 
## F-statistic: 423.5 on 1 and 49 DF,  p-value: < 2.2e-16
# Point estimate of the speed coefficient in the zero-intercept model.
cars %>%
  lm(formula = dist ~ speed - 1, data = .) %>%
  coef()
##    speed 
## 2.909132
# Confidence interval for the speed coefficient in the zero-intercept model.
cars %>%
  lm(formula = dist ~ speed - 1, data = .) %>%
  confint()
##          2.5 %   97.5 %
## speed 2.625041 3.193223
# Scatter plot of dist against speed with the zero-intercept linear fit.
ggplot(data = cars, mapping = aes(x = speed, y = dist)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x - 1)


Logistic Regression (Classifications)

Persamaan regresi logistik adalah:

Persamaan Regresi Logistik

$$\log\left(\frac{p}{1 - p}\right) = \theta_0 + \theta_1 x, \qquad p = P(y = 1 \mid x)$$

Peluang yang dihasilkan oleh persamaan regresi logistik adalah:

Probabilitas Regresi Logistik

$$P(y = 1 \mid x) = \frac{1}{1 + e^{-(\theta_0 + \theta_1 x)}}$$

# Load the BreastCancer data set into the workspace, then preview it.
data("BreastCancer")

BreastCancer %>%
  as_tibble()
## # A tibble: 699 x 11
##    Id    Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
##    <chr> <ord>        <ord>     <ord>      <ord>         <ord>       
##  1 1000~ 5            1         1          1             2           
##  2 1002~ 5            4         4          5             7           
##  3 1015~ 3            1         1          1             2           
##  4 1016~ 6            8         8          1             3           
##  5 1017~ 4            1         1          3             2           
##  6 1017~ 8            10        10         8             7           
##  7 1018~ 1            1         1          1             2           
##  8 1018~ 2            1         2          1             2           
##  9 1033~ 2            1         1          1             2           
## 10 1033~ 4            2         1          1             2           
## # ... with 689 more rows, and 5 more variables: Bare.nuclei <fct>,
## #   Bl.cromatin <fct>, Normal.nucleoli <fct>, Mitoses <fct>, Class <fct>
# Jittered scatter of Class against Cl.thickness; jitter and transparency
# keep overlapping observations distinguishable.
ggplot(BreastCancer, aes(x = Cl.thickness, y = Class)) +
  geom_jitter(width = 0.3, height = 0.05, alpha = 0.4)

# Recode the ordered factor Cl.thickness to numbers (through its labels) and
# Class to a 0/1 indicator (0 = "benign"), then draw the jittered points with
# a binomial GLM curve on top.
BreastCancer %>%
  mutate(
    Cl.thickness.numeric = as.numeric(as.character(Cl.thickness)),
    IsMalignant = ifelse(Class == "benign", 0, 1)
  ) %>%
  ggplot(aes(x = Cl.thickness.numeric, y = IsMalignant)) +
  geom_jitter(width = 0.3, height = 0.05, alpha = 0.4) +
  geom_smooth(method = "glm", method.args = list(family = "binomial"))
## `geom_smooth()` using formula 'y ~ x'

# Fit a logistic regression of the 0/1 malignancy indicator on the numeric
# clump thickness. `data = .` and the quoted family are kept so the printed
# Call is unchanged.
BreastCancer %>%
  mutate(
    Cl.thickness.numeric = as.numeric(as.character(Cl.thickness)),
    IsMalignant = ifelse(Class == "benign", 0, 1)
  ) %>%
  glm(
    formula = IsMalignant ~ Cl.thickness.numeric,
    family = "binomial",
    data = .
  )
## 
## Call:  glm(formula = IsMalignant ~ Cl.thickness.numeric, family = "binomial", 
##     data = .)
## 
## Coefficients:
##          (Intercept)  Cl.thickness.numeric  
##              -5.1602                0.9355  
## 
## Degrees of Freedom: 698 Total (i.e. Null);  697 Residual
## Null Deviance:       900.5 
## Residual Deviance: 464.1     AIC: 468.1