Check out data normalization

Column

Histogram

Bootstrap Resampling

Column

Evaluating and comparing the models with bootstrap resampling

---
title: "Austin Incidents: Machine Learning with Regression model"
author: "Viviane Schneider"
output: 
  flexdashboard::flex_dashboard:
    orientation: columns
    vertical_layout: fill
    social: menu
    source_code: embed
---

```{r setup, include=FALSE}

library(flexdashboard)
library(tidyverse)
library(dplyr)
library(purrr)
library(rjson)
library(httr)
library(jsonlite)
library(tidymodels)

# NOTE: the former setwd("~/Machine learning/ML") call was removed. Nothing in
# this document reads or writes local files, and a hard-coded, user-specific
# working directory makes the knit fail on any machine without that folder.

# Request the incident data from the open-data API endpoint.
res <- GET("https://data.austintexas.gov/resource/diju-g5tv.json")

# Fail fast on a 4xx/5xx response instead of trying to parse an error body
# as if it were the dataset.
stop_for_status(res)

# Both rjson and jsonlite export fromJSON(); qualify the call so the result
# does not depend on the order of the library() calls. `flatten = TRUE` is a
# jsonlite argument that flattens nested data frames into regular columns.
data <- jsonlite::fromJSON(
  httr::content(res, as = "text", encoding = "UTF-8"),
  flatten = TRUE
)



```


```{r}

# Tidy data ----
# One pipeline: derive `year` / `month` from `month_key`, then drop columns by
# position. The positional selections refer to the column layout returned by
# the API at the time of writing (TODO confirm if the upstream schema changes).
data <- data %>%
  mutate(
    # characters 1-4 and 5-6 of month_key (presumably a YYYYMM key)
    year = substr(month_key, start = 1, stop = 4),
    month = substr(month_key, start = 5, stop = 6)
  ) %>%
  # discard columns 17 through 31
  select(-(17:31)) %>%
  # take out the column with missing values (second column)
  select(-2)
   
```



```{r}
# Convert data into a regular (numeric) format ----

# Positions of the columns to convert from character to numeric.
i <- seq_len(17)

# Convert column by column with lapply() rather than apply(): apply() first
# coerces the whole data frame to a matrix, which is an unnecessary round trip
# and can reformat values when column types are mixed.
data[i] <- lapply(data[i], function(x) as.numeric(as.character(x)))

# Verify the conversion: one class label per column (multiple classes are
# collapsed with "/"). vapply() guarantees a character vector result.
# NOTE: the variable name `class` is kept for compatibility; it shadows
# base::class() as a variable, hence the explicit base:: prefix below.
class <- vapply(
  data,
  function(x) paste(base::class(x), collapse = "/"),
  character(1)
)

# Verify whether there is missing data: number of NA values per column.
na <- vapply(data, function(x) sum(is.na(x)), integer(1))




```


Check out data normalization
=======================================================================

Column {data-width=900}
-----------------------------------------------------------------------

### Histogram

```{r}

# Histogram of count_incidents_coa_p1 using 25 bins.
data %>%
  ggplot(aes(x = count_incidents_coa_p1)) +
  geom_histogram(bins = 25) +
  labs(
    x = "Incidents priority 1",
    y = "Number of Incidents priority 1"
  )

```


```{r}
# Simple exploratory model to understand the data.

# Fit an ordinary linear model of the raw (untransformed)
# count_incidents_coa_p1 on every other column of `data`.
fit_all <- lm(count_incidents_coa_p1 ~ ., data = data)

# Store the model summary in `look` (it is assigned, not printed, so nothing
# is rendered in the dashboard from this chunk).
look <- summary(fit_all)


```

```{r}
# Hold-out split used for fitting single models without resampling ----
#
# Evaluating on data the model has not seen gives a better estimate of how it
# will perform on new data and helps detect overfitting.
#
# 80% of the rows go to data_train and 20% to data_test; the split is
# stratified on `year`. The seed makes the partition reproducible.
set.seed(1234)
data_split <- initial_split(data, prop = 0.8, strata = year)

data_train <- training(data_split)
data_test <- testing(data_split)



```


```{r}

# Linear regression specification backed by the "lm" engine.
lm_mod <- set_engine(linear_reg(), "lm")

# Fit the specification on the training partition. The outcome is
# log(count_incidents_coa_p1), regressed on all remaining columns
# (the log scale was chosen by the author; presumably the counts are
# right-skewed -- see the histogram).
fit_lm <- fit(
  lm_mod,
  log(count_incidents_coa_p1) ~ .,
  data = data_train
)

# Keep the fitted model object in `look` for inspection.
look <- fit_lm


```


```{r}
### Random forest fitted once, without data resampling.

# Random forest specification: "randomForest" engine, regression mode.
rf_mod <- set_mode(
  set_engine(rand_forest(), "randomForest"),
  "regression"
)

# Train on the training partition with the same log-scale outcome and
# predictors as the linear model.
fit_rf <- fit(
  rf_mod,
  log(count_incidents_coa_p1) ~ .,
  data = data_train
)

# Keep the fitted model object in `look` for inspection.
look <- fit_rf


```


```{r}

### Evaluate model performance (training data)

## metrics() reports, among others, the root mean squared error; lower values
## indicate a better fit to the data.

# Both models were fitted to log(count_incidents_coa_p1), so their predictions
# are on that log scale. Add the true outcome on the same scale, plus one
# prediction column per model (linear regression and random forest).
results <- data_train %>%
  mutate(log_incidents_p1 = log(count_incidents_coa_p1)) %>%
  bind_cols(
    predict(fit_lm, data_train) %>% rename(.pred_lm = .pred),
    predict(fit_rf, data_train) %>% rename(.pred_rf = .pred)
  )

# Compare each model's predictions with the log-scale truth. Previously the
# truth column was the raw count_incidents_coa_or_tc, i.e. a different variable
# on a different scale from the predictions, which made the metrics
# meaningless.
metrics_lm_train <- metrics(
  results,
  truth = log_incidents_p1,
  estimate = .pred_lm
)
metrics_rf_train <- metrics(
  results,
  truth = log_incidents_p1,
  estimate = .pred_rf
)

# Keep both sets of metrics in `look` (the linear-model metrics used to be
# overwritten by the random-forest ones); `model` identifies the source.
look <- bind_rows(
  list(
    "Linear Model" = metrics_lm_train,
    "Random Forest model" = metrics_rf_train
  ),
  .id = "model"
)

```



```{r}
### Using test data

## Evaluate how the two simple models perform on the held-out testing data.

# Both models predict log(count_incidents_coa_p1). Add the true outcome on
# that same log scale, plus one prediction column per model.
results <- data_test %>%
  mutate(log_incidents_p1 = log(count_incidents_coa_p1)) %>%
  bind_cols(
    predict(fit_lm, data_test) %>% rename(.pred_lm = .pred),
    predict(fit_rf, data_test) %>% rename(.pred_rf = .pred)
  )

# Compare predictions with the log-scale truth. Previously the truth column
# was the raw count_incidents_coa_or_tc, which is neither the modelled
# variable nor on the scale of the predictions.
metrics_lm_test <- metrics(
  results,
  truth = log_incidents_p1,
  estimate = .pred_lm
)
metrics_rf_test <- metrics(
  results,
  truth = log_incidents_p1,
  estimate = .pred_rf
)

# Keep both sets of metrics in `look` (the linear-model metrics used to be
# overwritten by the random-forest ones); `model` identifies the source.
look <- bind_rows(
  list(
    "Linear Model" = metrics_lm_test,
    "Random Forest model" = metrics_rf_test
  ),
  .id = "model"
)

```

Bootstrap Resampling
=======================================================================

Column {data-width=650}
-----------------------------------------------------------------------

### Evaluating and comparing the models with bootstrap resampling



```{r}

# Bootstrap resampling draws, with replacement, datasets of the same size as
# the original. In tidymodels the default is 25 resamples.

## Bootstrap resamples of the training partition
data_boot <- bootstraps(data_train)

# Pieces shared by both resampling runs: the model formula and a control
# object asking tune to keep the out-of-sample predictions.
boot_formula <- log(count_incidents_coa_p1) ~ .
keep_preds <- control_resamples(save_pred = TRUE)

# Evaluate the linear regression first, then the random forest, on the same
# set of resamples.
lm_res <- fit_resamples(
  lm_mod,
  boot_formula,
  resamples = data_boot,
  control = keep_preds
)

rf_res <- fit_resamples(
  rf_mod,
  boot_formula,
  resamples = data_boot,
  control = keep_preds
)

# Stack the saved predictions of both models, labelled by model.
lm_preds <- mutate(collect_predictions(lm_res), model = "Linear Model")
rf_preds <- mutate(collect_predictions(rf_res), model = "Random Forest model")
results <- bind_rows(lm_preds, rf_preds)

# Predicted vs. observed (log scale), one panel per model. The dashed line is
# the identity line; points are coloured by bootstrap resample id.
ggplot(results, aes(x = `log(count_incidents_coa_p1)`, y = .pred)) +
  geom_abline(linetype = 2, color = "gray50") +
  geom_point(
    aes(color = id),
    size = 1.5,
    alpha = 0.3,
    show.legend = FALSE
  ) +
  geom_smooth(method = "lm") +
  facet_wrap(~ model)


```