---
title: "Austin Incidents: Machine Learning with Regression model"
author: "Viviane Schneider"
output:
  flexdashboard::flex_dashboard:
    orientation: columns
    vertical_layout: fill
    social: menu
    source_code: embed
---
```{r setup, include=FALSE}
library(flexdashboard)
library(tidyverse)
library(dplyr)
library(purrr)
library(httr)
library(jsonlite)
library(tidymodels)
setwd("~/Machine learning/ML")
# Request the incidents data from the City of Austin open data API
res <- GET("https://data.austintexas.gov/resource/diju-g5tv.json")
data <- fromJSON(rawToChar(res$content), flatten = TRUE)
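# Note: Socrata endpoints like this one return a limited number of rows by
# default. If more history is needed, a $limit query parameter can be added;
# a sketch only (not run here, and the row count 5000 is just an assumption):
# res <- GET("https://data.austintexas.gov/resource/diju-g5tv.json",
#            query = list("$limit" = 5000))
# data <- fromJSON(rawToChar(res$content), flatten = TRUE)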
```
```{r}
# Tidy data: derive year and month from month_key
data <- data %>%
  mutate(year = substr(month_key, 1, 4), month = substr(month_key, 5, 6))
# Drop unused columns (positions 17 to 31)
data <- select(data, -c(17:31))
# Drop the column with missing values (position 2)
data <- select(data, -c(2))
```
```{r}
# Convert the data into a regular format where necessary:
# convert character columns to numeric
# Specify the columns to convert
i <- c(1:17)
# Apply as.numeric(as.character(...)) to each selected column
data[ , i] <- apply(data[ , i], 2,
                    function(x) as.numeric(as.character(x)))
# Verify that the conversion worked
col_classes <- sapply(data, class)
# Verify whether there is any missing data
na_counts <- sapply(data, function(x) sum(is.na(x)))
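# An equivalent, more tidyverse-style version of the conversion above
# (sketch only; assumes the first 17 columns are the ones to convert):
# data <- data %>%
#   mutate(across(1:17, ~ as.numeric(as.character(.x))))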
```
Check the data distribution
=======================================================================
Column {data-width=900}
-----------------------------------------------------------------------
### Histogram
```{r}
# Plot the histogram of priority 1 incident counts
ggplot(data, aes(x = count_incidents_coa_p1)) +
  geom_histogram(bins = 25) +
  labs(x = "Priority 1 incidents",
       y = "Frequency")
```
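A later chunk models the log of this count because its distribution looks roughly log-normal; a quick way to eyeball that (a sketch, not evaluated here) is to repeat the histogram on the log scale:

```{r, eval=FALSE}
# Sketch: histogram of the log-transformed priority 1 incident counts,
# to support the log transformation used in the models below
ggplot(data, aes(x = log(count_incidents_coa_p1))) +
  geom_histogram(bins = 25) +
  labs(x = "log(Priority 1 incidents)",
       y = "Frequency")
```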
```{r}
# Simple model to understand the data
# Fit a linear model with all predictors
fit_all <- lm(count_incidents_coa_p1 ~ ., data = data)
# Store the summary of the model
look <- summary(fit_all)
```
```{r}
# Fit one linear regression model without data resampling.
### Training and testing subsets of the data
## Creating training/testing splits reduces overfitting and gives a better estimate of how the model will perform on new data.
# Split the data into training and test sets:
# divide the original data into 80%/20% partitions, stratified by year so the years are (roughly) evenly represented in both partitions.
# Assign the 80% partition to data_train and the 20% partition to data_test.
set.seed(1234)
data_split <- data %>%
  initial_split(prop = 0.8, strata = year)
data_train <- training(data_split)
data_test <- testing(data_split)
```
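A quick sanity check on the split (a sketch, not evaluated here) is to confirm the partition sizes and that each year appears in both sets:

```{r, eval=FALSE}
# Sketch: check the 80%/20% partition sizes and the year stratification
dim(data_train)
dim(data_test)
count(data_train, year)
count(data_test, year)
```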
```{r}
# Build a linear regression model specification
lm_mod <- linear_reg() %>%
  set_engine("lm")
# Fit a basic linear regression model to the data_train data.
# The outcome is log(count_incidents_coa_p1) since it had a log-normal distribution.
fit_lm <- lm_mod %>%
  fit(log(count_incidents_coa_p1) ~ .,
      data = data_train)
# Store the model object
look <- fit_lm
```
```{r}
### Fit one random forest model without data resampling.
rf_mod <- rand_forest() %>%
  set_engine("randomForest") %>%
  set_mode("regression")
# Train a random forest model on the same log-transformed outcome
fit_rf <- rf_mod %>%
  fit(log(count_incidents_coa_p1) ~ .,
      data = data_train)
# Store the model object
look <- fit_rf
```
```{r}
### Evaluate model performance
## Regression models are evaluated using the root mean squared error metric. Lower values indicate a better fit to the data.
# Create a (log-transformed) truth column plus one prediction column per model: linear regression and random forest.
results <- data_train %>%
  mutate(log_incidents = log(count_incidents_coa_p1)) %>%
  bind_cols(predict(fit_lm, data_train) %>%
              rename(.pred_lm = .pred)) %>%
  bind_cols(predict(fit_rf, data_train) %>%
              rename(.pred_rf = .pred))
# Evaluate the performance of these models using metrics(), specifying the column that contains the real (log-transformed) incident counts.
metrics_lm_train <- metrics(results, truth = log_incidents, estimate = .pred_lm)
metrics_rf_train <- metrics(results, truth = log_incidents, estimate = .pred_rf)
```
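For reference, the root mean squared error reported by `metrics()` is the square root of the average squared difference between the observed and predicted (log) counts:

$$\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2}$$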
```{r}
### Using test data
## Evaluate how these simple models perform on the held-out testing data instead.
# Create the same columns with data_test
results <- data_test %>%
  mutate(log_incidents = log(count_incidents_coa_p1)) %>%
  bind_cols(predict(fit_lm, data_test) %>%
              rename(.pred_lm = .pred)) %>%
  bind_cols(predict(fit_rf, data_test) %>%
              rename(.pred_rf = .pred))
# Evaluate the performance on the test set
metrics_lm_test <- metrics(results, truth = log_incidents, estimate = .pred_lm)
metrics_rf_test <- metrics(results, truth = log_incidents, estimate = .pred_rf)
```
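One way to compare the two models at a glance (a sketch, not evaluated here) is to stack the two metric tables produced above:

```{r, eval=FALSE}
# Sketch: stack the test-set metric tables for a side-by-side comparison;
# assumes metrics_lm_test and metrics_rf_test from the previous chunk
bind_rows(
  metrics_lm_test %>% mutate(model = "Linear Model"),
  metrics_rf_test %>% mutate(model = "Random Forest model")
) %>%
  select(model, .metric, .estimate)
```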
Bootstrap Resampling
=======================================================================
Column {data-width=650}
-----------------------------------------------------------------------
### Evaluating and comparing the models with bootstrap resampling
```{r}
# Bootstrap resampling creates datasets of the same size as the original by randomly drawing observations with replacement. In tidymodels, the default is 25 resamples.
## Create bootstrap resamples to evaluate these models
data_boot <- bootstraps(data_train)
# Evaluate the models (the linear regression and the random forest) on the resamples
lm_res <- lm_mod %>%
  fit_resamples(
    log(count_incidents_coa_p1) ~ .,
    resamples = data_boot,
    control = control_resamples(save_pred = TRUE)
  )
rf_res <- rf_mod %>%
  fit_resamples(
    log(count_incidents_coa_p1) ~ .,
    resamples = data_boot,
    control = control_resamples(save_pred = TRUE)
  )
# Combine the out-of-sample predictions from both models
results <- bind_rows(lm_res %>%
                       collect_predictions() %>%
                       mutate(model = "Linear Model"),
                     rf_res %>%
                       collect_predictions() %>%
                       mutate(model = "Random Forest model"))
# Plot predicted vs. observed (log) counts for each model
results %>%
  ggplot(aes(`log(count_incidents_coa_p1)`, .pred)) +
  geom_abline(lty = 2, color = "gray50") +
  geom_point(aes(color = id), size = 1.5, alpha = 0.3, show.legend = FALSE) +
  geom_smooth(method = "lm") +
  facet_wrap(~ model)
```
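Beyond the plot, the resampled performance can also be summarized numerically. A minimal sketch (not evaluated here), assuming the `lm_res` and `rf_res` objects from the chunk above:

```{r, eval=FALSE}
# Sketch: average RMSE and R-squared across the 25 bootstrap resamples,
# collected from the lm_res and rf_res objects above
bind_rows(
  collect_metrics(lm_res) %>% mutate(model = "Linear Model"),
  collect_metrics(rf_res) %>% mutate(model = "Random Forest model")
) %>%
  select(model, .metric, mean, std_err)
```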