Here is the list of objectives met in this markdown document. Navigate to the respective code section to see the work done for each objective.
Objective | Topic | Code Section |
---|---|---|
Objective 1 | Probability & Inference | Project 1 - Multiple Linear Regression; Homework 1 - Simple Linear Regression |
Objective 2 | Generalized Linear Models | Project 2 - Multinomial Logistic Regression; Project 2 - LDA; Poisson Regression; Homework 4 - Multinomial Logistic Regression |
Objective 3 | Model Selection & Diagnostics | Homework 6 - Model Selection; Homework 7 - Lasso, Ridge & PCA; Project 1 - Multinomial & LDA; Project 2 - Multiple Regression |
Objective 4 | Communicating to Non-Technical Audiences | Project 1; Project 2 |
Objective 5 | tidymodels Framework | All examples listed in this markdown |
set.seed(2025)
# Load necessary libraries
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(mosaic))
suppressPackageStartupMessages(library(caret))
suppressPackageStartupMessages(library(ISLR2))
suppressPackageStartupMessages(library(discrim))
suppressPackageStartupMessages(library(poissonreg))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(ggformula))
suppressPackageStartupMessages(library(GGally))
suppressPackageStartupMessages(library(future))
suppressPackageStartupMessages(library(leaps))
suppressPackageStartupMessages(library(olsrr))
residualAnalysis <- function(model = NULL) {
  # Residual diagnostics for a parsnip fit: standardized residuals vs. predictions and a normal Q-Q plot
  if (!require(gridExtra)) stop('Install the gridExtra package.')
  if (!require(ggformula)) stop('Install the ggformula package.')
  df <- data.frame(Prediction = predict(model$fit),
                   Residual = rstandard(model$fit))
  p1 <- gf_point(Residual ~ Prediction, data = df) %>% gf_hline(yintercept = 0)
  p2 <- gf_qq(~Residual, data = df) %>% gf_qqline()
  grid.arrange(p1, p2, ncol = 2)
}
data("Auto")  # load the Auto data frame from ISLR2
lm_spec <- linear_reg() |>
set_mode("regression") |>
set_engine("lm")
slr_mod <- lm_spec |>
fit(mpg ~ horsepower, data = Auto)
tidy(slr_mod)
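The `residualAnalysis()` helper defined earlier takes a parsnip fit (it reaches into `model$fit`), so it can be used to check this model's residuals directly:

```r
# Residuals vs. fitted values and a normal Q-Q plot for the SLR fit
residualAnalysis(slr_mod)
```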
pred_intervals <- cbind(
Auto,
predict(slr_mod,new_data=Auto),
predict(slr_mod,new_data=Auto,type="conf_int"),
predict(slr_mod,new_data=Auto,type="pred_int"))
colnames(pred_intervals) <- make.unique(names(pred_intervals))
pred_intervals |>
  gf_point(mpg ~ horsepower, alpha = 0.5) |>
  gf_line(.pred ~ horsepower, color = "blue") |>
  gf_ribbon(.pred_lower + .pred_upper ~ horsepower, alpha = 0.2, fill = "red") |>
  gf_ribbon(.pred_lower.1 + .pred_upper.1 ~ horsepower, alpha = 0.1, fill = "red") |>
  gf_theme(theme_classic()) |>
  gf_labs(title = "Plot with confidence and prediction intervals")
The plot shows the raw data points, the regression line, and the 95% confidence and prediction intervals. The confidence interval represents the uncertainty in the estimation of the mean response for a given horsepower value, while the prediction interval represents the uncertainty in predicting a single observation for a given horsepower value. We can see that the prediction interval is wider than the confidence interval, reflecting the additional uncertainty associated with predicting a single observation.
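To make the width difference concrete, both intervals can be computed at a single horsepower value; 100 here is an arbitrary illustration:

```r
# Compare 95% interval widths at horsepower = 100 (arbitrary example value)
new_pt <- tibble(horsepower = 100)
ci <- predict(slr_mod, new_data = new_pt, type = "conf_int")
pri <- predict(slr_mod, new_data = new_pt, type = "pred_int")
ci$.pred_upper - ci$.pred_lower    # confidence interval width
pri$.pred_upper - pri$.pred_lower  # prediction interval width (larger)
```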
data("diamonds")
diamonds <- diamonds %>%
mutate(
magnitude = x * y * z,
quality_cut = if_else(cut %in% c("Ideal", "Premium"), "High", "Low"),
quality_color = if_else(color %in% c("D", "E", "F"), "High", "Low"),
quality_clarity = if_else(clarity %in% c("IF", "VVS1", "VVS2"), "High", "Low"),
quality_cut = as.factor(quality_cut),
quality_color = as.factor(quality_color),
quality_clarity = as.factor(quality_clarity)
)
diamonds_split <- createDataPartition(diamonds$carat, p = 0.80, list = FALSE)
# createDataPartition returns a one-column matrix; coerce to a vector before indexing the tibble
train_data <- diamonds[as.vector(diamonds_split), ]
test_data <- diamonds[-as.vector(diamonds_split), ]
mlr_mod <- lm_spec %>%
fit(carat ~ magnitude * quality_cut + quality_clarity, data = train_data)
tidy(mlr_mod)
Here’s the estimated regression equation for predicting carat:
\[ \begin{aligned} \hat{y}_{\text{Carat}} = & \ 0.0418 \\ & + 0.0056 \times \text{magnitude} \\ & - 0.0563 \times \text{quality\_cutLow} \\ & + 0.0199 \times \text{quality\_clarityLow} \\ & + 0.00038 \times (\text{magnitude:quality\_cutLow}) \end{aligned} \]
pred_intervals <- cbind(
test_data,
predict(mlr_mod, new_data=test_data),
predict(mlr_mod, new_data=test_data, type="conf_int"),
predict(mlr_mod, new_data=test_data, type="pred_int"))
colnames(pred_intervals) <- make.unique(names(pred_intervals))
pred_intervals <- pred_intervals |>
rename(.conf_lower = .pred_lower, .conf_upper = .pred_upper,
.pred_lower = .pred_lower.1, .pred_upper = .pred_upper.1)
pred_intervals |>
  gf_point(carat ~ magnitude, col = "grey") |>
  gf_line(.pred ~ magnitude) |>
  gf_line(.conf_lower ~ magnitude, col = 'red') |>
  gf_line(.conf_upper ~ magnitude, col = 'red') |>
  gf_line(.pred_lower ~ magnitude, col = 'blue') |>
  gf_line(.pred_upper ~ magnitude, col = 'blue') |>
  gf_facet_wrap(~ quality_cut + quality_clarity) |>
  gf_theme(theme_classic()) |>
  gf_labs(title = "Confidence and prediction intervals", x = "Magnitude", y = "Carat")
The plots show how diamond carat weight (y-axis) changes with volume (magnitude), split by cut and clarity quality (High/Low). The black line is the model’s prediction, the red lines bound the confidence interval for the mean carat weight, and the blue lines bound the prediction interval for an individual diamond, which is wider because of individual variability.
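A quick sanity check on those intervals is empirical coverage: roughly 95% of the held-out diamonds should fall inside the prediction interval. A minimal sketch using the `pred_intervals` frame built above:

```r
# Fraction of test observations inside the 95% prediction interval (expect ~0.95)
pred_intervals |>
  summarize(coverage = mean(carat >= .pred_lower & carat <= .pred_upper))
```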
polData <- diamonds %>%
  dplyr::select(carat, table, depth, x, y, z) %>%
  mutate(
    log_x = log(x + 1),
    log_y = log(y + 1),
    log_z = log(z + 1)
  )
trainIndex <- createDataPartition(polData$carat, p = 0.8, list = FALSE)
trainData <- polData[as.vector(trainIndex), ]
testData <- polData[-as.vector(trainIndex), ]
polyModel <- lm(carat ~ poly(x, 2, raw = TRUE) +
poly(y, 2, raw = TRUE) +
poly(z, 2, raw = TRUE) +
depth + table,
data = trainData)
tidy(polyModel)
Make predictions on the test data and compute the test RMSE:
testData$predicted <- predict(polyModel, newdata = testData)  # predictions for the held-out data
rmse <- sqrt(mean((testData$predicted - testData$carat)^2))
print(paste("Test RMSE:", round(rmse, 4)))
## [1] "Test RMSE: 0.7625"
The root mean squared error on the test data is smaller than the residual standard error of the training fit, which suggests the model is not overfitting.
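For reference, the residual standard error of the training fit can be pulled straight from the `lm` object to make that comparison explicit:

```r
# Residual standard error of the training fit, to compare against the test RMSE
sigma(polyModel)
```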
Confidence Intervals
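The intervals below are presumably produced by `confint()` on the fitted model; a minimal call:

```r
# 95% confidence intervals for the polynomial model coefficients
confint(polyModel)
```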
## 2.5 % 97.5 %
## (Intercept) -0.5986503294 -0.5624668019
## poly(x, 2, raw = TRUE)1 -0.3321265794 -0.3042329998
## poly(x, 2, raw = TRUE)2 0.0456369513 0.0480814384
## poly(y, 2, raw = TRUE)1 -0.1489510804 -0.1208453162
## poly(y, 2, raw = TRUE)2 0.0232125819 0.0257087508
## poly(z, 2, raw = TRUE)1 0.0323262963 0.0392027077
## poly(z, 2, raw = TRUE)2 -0.0009868181 -0.0007594962
## depth 0.0197563752 0.0202301127
## table 0.0032480351 0.0034767025
Check the assumptions
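The warnings below come from base R’s `lm` diagnostic plots; a sketch of what was presumably run (the exact call is an assumption):

```r
# Residuals vs. fitted, normal Q-Q, scale-location, and residuals vs. leverage
par(mfrow = c(2, 2))
plot(polyModel)
```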
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
# Import the datasets
pfi_2016 <- readxl::read_xlsx("pfi-data.xlsx", sheet = "curated 2016")
pfi_2019 <- readxl::read_xlsx("pfi-data.xlsx", sheet = "curated 2019")
# Merge both the datasets
pfi_data <- rbind(pfi_2016 |>
mutate(HHPARNX = HHPARN16X) |>
dplyr::select(SEGRADES, HHPARNX, PARGRADEX,
EDCPUB, INTACC, SEABSNT, TTLHHINC),
pfi_2019 |>
mutate(HHPARNX = HHPARN19X) |>
dplyr::select(SEGRADES, HHPARNX, PARGRADEX,
EDCPUB, INTACC, SEABSNT, TTLHHINC)
)
# Encode all the variables and convert them to factors
pfi <- pfi_data |>
mutate(GRADE = case_match(SEGRADES,
1 ~ "A",
2 ~ "B",
3 ~ "C",
4 ~ "D",
c(5,-1) ~ "Other"),
PARENT = case_match(HHPARNX,
1 ~ "Both",
2 ~ "Mother",
3 ~ "Father",
4 ~ "None"),
HAS_POST_SEC = case_match(PARGRADEX,
c(1,2) ~ "No",
c(3,4,5) ~ "Yes"),
PUBLIC_SCHOOL = case_match(EDCPUB,
1 ~ "Yes",
2 ~ "No"),
INTERNET_ACCESS = case_match(INTACC,
c(1,2,3) ~ "Yes",
4 ~ "No"),
NO_OF_ABSENT = case_match(SEABSNT,
c(1,2) ~ "0-10 days",
c(3,4) ~ "11-20 days",
.default = "Other"),
INCOME_CAT = case_match(TTLHHINC,
c(1:8) ~ "$0-100K",
c(9:10) ~ "$100-200K",
c(11:12) ~ "$200K or more"),
GRADE = factor(GRADE, levels = c("Other", "D", "C", "B", "A")),
PARENT = factor(PARENT, levels = c("None", "Father", "Mother", "Both")),
HAS_POST_SEC = factor(HAS_POST_SEC, levels = c("No", "Yes")),
PUBLIC_SCHOOL = factor(PUBLIC_SCHOOL, levels = c("No", "Yes")),
INTERNET_ACCESS = factor(INTERNET_ACCESS, levels = c("No", "Yes")),
NO_OF_ABSENT = factor(NO_OF_ABSENT, levels = c("0-10 days","11-20 days","Other")),
INCOME_CAT = factor(INCOME_CAT, levels = c("$0-100K", "$100-200K", "$200K or more"))
) |>
dplyr::select(GRADE, PARENT, HAS_POST_SEC, PUBLIC_SCHOOL,
INTERNET_ACCESS, NO_OF_ABSENT, INCOME_CAT)
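Before modeling, it is worth confirming that the recoding produced sensible factor levels; a quick frequency check:

```r
# Sanity-check the distribution of the recoded outcome
pfi |> count(GRADE)
```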
Split the dataset into training and testing sets and define k-fold cross-validation
# Split the dataset into training (75%) and testing (25%)
pfi_split <- initial_split(pfi, prop = 0.75, strata = GRADE)
pfi_train <- training(pfi_split)
pfi_test <- testing(pfi_split)
# Define k-fold cross validation with 5 levels
pfi.folds <- vfold_cv(pfi_train, v=5, strata = GRADE)
# Multinomial Regression
multi_spec <- multinom_reg() |>
set_engine("nnet")
multi_wf <- workflow() |>
add_model(multi_spec) |>
add_formula(as.formula("GRADE ~ ."))
fit_multi <- multi_wf |>
  fit_resamples(pfi.folds,
                control = control_resamples(save_pred = TRUE,
                                            save_workflow = TRUE,
                                            extract = extract_fit_engine))
multi_metrics_train <- collect_metrics(fit_multi) |>
mutate(Model = "Multinomial Regression", Data = "Train")
# Confusion Matrix for Multinomial train data
cv_predictions_multi <- collect_predictions(fit_multi)
# Confusion matrix
conf_mat_multi <- cv_predictions_multi %>%
count(GRADE, .pred_class, .drop = FALSE)
conf_mat_multi %>%
pivot_wider(
names_from = .pred_class,
values_from = n
)
# Fit the model
multi_model <- multinom_reg() |>
set_engine("nnet") |>
fit(GRADE ~ ., data = pfi_train)
tidy(multi_model)
The estimated log-odds of receiving an “A” grade relative to the baseline “Other” category:
\[ \begin{aligned} \log\left(\frac{\hat{p}_{\texttt{"A"}}}{\hat{p}_{\texttt{"Other"}}}\right) = & \ 1.30662064 \\ & + 0.34581034 \times \text{PARENTFather} \\ & + 0.06716781 \times \text{PARENTMother} \\ & + 0.20814503 \times \text{PARENTBoth} \\ & + 0.08797909 \times \text{HAS\_POST\_SECYes} \\ & - 0.26304969 \times \text{PUBLIC\_SCHOOLYes} \\ & + 0.03367262 \times \text{INTERNET\_ACCESSYes} \\ & - 0.19298460 \times \text{NO\_OF\_ABSENT11-20 days} \\ & - 0.10715015 \times \text{NO\_OF\_ABSENTOther} \\ & + 0.03941675 \times \text{INCOME\_CAT\$100-200K} \\ & + 0.01853510 \times \text{INCOME\_CAT\$200K or more} \end{aligned} \]
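Exponentiating the coefficients turns log-odds into odds multipliers, which are easier to read; for example, attending public school multiplies the odds of an “A” versus “Other” by exp(-0.263) ≈ 0.77:

```r
# Convert log-odds estimates to odds ratios for interpretation
tidy(multi_model) |>
  mutate(odds_ratio = exp(estimate))
```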
Predict using the test data and check accuracy scores
# predict using the test data
pfi_multi_aug <- augment(multi_model, new_data = pfi_test)
# check accuracy and roc_auc
multi_test_metrics <- rbind(
pfi_multi_aug |>
metrics(truth = GRADE, estimate = .pred_class) |>
dplyr::filter(.metric == "accuracy") |>
mutate(Model = "Multinomial Regression", Data = "Test"),
pfi_multi_aug |>
roc_auc(truth = GRADE, .pred_A, .pred_B, .pred_C, .pred_D, .pred_Other) |>
mutate(Model = "Multinomial Regression", Data = "Test")
)
multi_test_metrics
lda_spec <- discrim_linear() %>%
set_mode("classification") %>%
set_engine("MASS")
lda_wf <- workflow() |>
add_model(lda_spec) |>
add_formula(as.formula("GRADE ~ ."))
fit_lda <- lda_wf |>
  fit_resamples(pfi.folds,
                control = control_resamples(save_pred = TRUE,
                                            save_workflow = TRUE,
                                            extract = extract_fit_engine))
lda_metrics_train <- collect_metrics(fit_lda) |>
mutate(Model = "LDA", Data = "Train")
cv_predictions_lda <- collect_predictions(fit_lda)
# Confusion matrix
conf_mat_lda <- cv_predictions_lda %>%
count(GRADE, .pred_class, .drop = FALSE)
conf_mat_lda %>%
pivot_wider(
names_from = .pred_class,
values_from = n
)
Fit the model, predict using the test data and check accuracy scores
# Fit the model
lda_model <- discrim_linear() |>
  set_mode("classification") |>
  set_engine("MASS") |>
  fit(GRADE ~ ., data = pfi_train)
# predict using test data
pfi_lda_aug <- augment(lda_model, new_data = pfi_test)
# check accuracy and roc_auc
lda_test_metrics <- rbind(
pfi_lda_aug |>
metrics(truth = GRADE, estimate = .pred_class) |>
dplyr::filter(.metric == "accuracy") |>
mutate(Model = "LDA", Data = "Test"),
pfi_lda_aug |>
roc_auc(truth = GRADE, .pred_A, .pred_B, .pred_C, .pred_D, .pred_Other) |>
mutate(Model = "LDA", Data = "Test")
)
Comparison of model performance on the train and test data
# Combine train and test metrics
all_metrics <- bind_rows(multi_metrics_train |>
mutate(.estimate = mean) |>
dplyr::select(.metric, .estimate, Model, Data),
lda_metrics_train |>
mutate(.estimate = mean) |>
dplyr::select(.metric, .estimate, Model, Data),
multi_test_metrics |>
dplyr::select(.metric, .estimate, Model, Data),
lda_test_metrics |>
dplyr::select(.metric, .estimate, Model, Data)) |>
filter(.metric %in% c("accuracy", "roc_auc"))
ggplot(all_metrics, aes(x = Model, y = .estimate, fill = .metric)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(. ~ Data) +
labs(title = "Model Performance Comparison (Train vs Test)",
y = "Metric Value", x = "Model", fill = "Metric") +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.2))
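The output below comes from loading the FiveThirtyEight nonvoters survey with `readr::read_csv()`; the file name here is an assumption:

```r
# Load the nonvoters survey data (file name assumed)
nonvoters <- readr::read_csv("nonvoters_data.csv")
```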
## Rows: 5836 Columns: 119
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): educ, race, gender, income_cat, voter_category
## dbl (114): RespId, weight, Q1, Q2_1, Q2_2, Q2_3, Q2_4, Q2_5, Q2_6, Q2_7, Q2_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nonvoters <- nonvoters %>%
mutate(
party =
case_match(Q30,
1 ~ "Republican",
2 ~ "Democrat",
3 ~ "Independent",
c(4,5,-1) ~ "Other"),
party = factor(party, levels=c("Other","Independent","Democrat","Republican")),
voter_category = factor(voter_category,
levels=c("rarely/never","sporadic","always"))
)
Fit the model
multi_mod_party <- multinom_reg() %>%
set_engine("nnet") %>%
fit(voter_category ~ ppage + educ + race + gender + income_cat + party, data = nonvoters)
multi_mod_party <- repair_call(multi_mod_party, data = nonvoters)
tidy(multi_mod_party)
The estimated regression equation for “always” with respect to “rarely/never”:
\[ \begin{aligned} \log\left(\frac{\hat{p}_{\texttt{"always"}}}{\hat{p}_{\texttt{"rarely/never"}}}\right) = & -2.9195567635 \\ & + 0.0582072009 \times \text{ppage} \\ & - 1.2671996403 \times \text{educHigh school or less} \\ & - 0.3303178875 \times \text{educSome college} \\ & - 0.3410062108 \times \text{raceHispanic} \\ & - 0.6004728052 \times \text{raceOther/Mixed} \\ & + 0.1271973319 \times \text{raceWhite} \\ & - 0.1922012038 \times \text{genderMale} \\ & - 0.0003797663 \times \text{income\_cat\$40-75k} \\ & + 0.1651906989 \times \text{income\_cat\$75-125k} \\ & - 0.6641096103 \times \text{income\_catLess than \$40k} \\ & + 0.8387114497 \times \text{partyIndependent} \\ & + 1.4010025518 \times \text{partyDemocrat} \\ & + 1.2392411768 \times \text{partyRepublican} \end{aligned} \]
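To see what the model implies for a concrete profile, predicted category probabilities can be computed for a hypothetical respondent (the level labels below are assumed to match the survey’s coding):

```r
# Predicted voter-category probabilities for one hypothetical respondent
new_voter <- tibble(ppage = 35, educ = "College", race = "White",
                    gender = "Female", income_cat = "$75-125k", party = "Democrat")
predict(multi_mod_party, new_data = new_voter, type = "prob")
```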
auto <- Auto %>%
dplyr::select(-name) %>%
dplyr::select(acceleration,mpg:weight,year,origin) %>%
mutate(origin=factor(origin,levels=1:3,labels=c('American','European','Japanese')))
model <- lm(acceleration ~
horsepower + weight + cylinders + mpg + displacement + year + origin,
data = auto)
# Best subset selection
best_subset_model <- regsubsets(
acceleration ~ horsepower + weight + cylinders + mpg + displacement + year + origin,
data = auto,
nvmax = 10)
best_subset_summary <- summary(best_subset_model)
coef(best_subset_model, which.max(best_subset_summary$adjr2))
## (Intercept) horsepower weight displacement year
## 20.197361522 -0.085400840 0.003137086 -0.010431306 -0.040105886
# Forward Selection
forward_selection_model <- ols_step_forward_aic(model)
forward_selection_model[["model"]][["coefficients"]]
## (Intercept) horsepower weight displacement year
## 20.197361522 -0.085400840 0.003137086 -0.010431306 -0.040105886
# Backward Selection
backward_selection_model <- ols_step_backward_aic(model)
backward_selection_model[["model"]][["coefficients"]]
## (Intercept) horsepower weight displacement year
## 20.197361522 -0.085400840 0.003137086 -0.010431306 -0.040105886
# Load data and setup training/test/fold partitions
data(Auto)
auto <- Auto |>
drop_na() |>
dplyr::select(-name)
auto_split <- initial_split(auto, strata = "acceleration")
auto_train <- training(auto_split)
auto_test <- testing(auto_split)
auto_fold <- vfold_cv(auto_train, v = 10)
# multitasking setup
cores <- availableCores()
plan(strategy=multisession, workers=cores-1)
ridge_recipe <-
recipe(formula = acceleration ~ ., data = auto_train) %>%
step_novel(all_nominal_predictors()) %>% # add 'new' level to factors and chr -> fctr
step_dummy(all_nominal_predictors()) %>% # factors to dummy variables
step_zv(all_predictors()) %>% # remove zero variance predictors
step_normalize(all_predictors()) # normalize all predictors
ridge_spec <-
linear_reg(penalty = tune(), mixture = 0) %>%
set_mode("regression") %>%
set_engine("glmnet")
ridge_workflow <- workflow() %>%
add_recipe(ridge_recipe) %>%
add_model(ridge_spec)
penalty_grid <- grid_regular(penalty(range = c(-5, 5)), levels = 50)
tune_res <- tune_grid(
  ridge_workflow,
  resamples = auto_fold,
  grid = penalty_grid,
  control = control_grid(parallel_over = "everything")
)
best_penalty <- select_best(tune_res, metric = "rmse")
ridge_final <- finalize_workflow(ridge_workflow, best_penalty)
ridge_final_fit <- fit(ridge_final, data = auto_train)
ridge_aug <- augment(ridge_final_fit, new_data = auto_test)
ridge_result <- rbind(
rmse(ridge_aug,truth = acceleration, estimate = .pred),
rsq(ridge_aug, truth = acceleration, estimate = .pred))
ridge_result
Final ridge regression model
ridge_aug <- augment(ridge_final_fit, new_data=auto) %>% mutate(.resid=acceleration-.pred)
ridge_aug %>%
gf_qq(~.resid) %>% gf_qqline()
lm_spec <-
linear_reg() %>%
set_mode("regression") %>%
set_engine("lm")
pca_recipe <-
recipe(formula = acceleration ~ ., data = auto_train) %>%
step_novel(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors()) %>%
step_pca(all_predictors(), threshold = tune())
pca_workflow <-
workflow() %>%
add_recipe(pca_recipe) %>%
add_model(lm_spec)
threshold_grid <- grid_regular(threshold(), levels = 10)
tune_res <- tune_grid(
  pca_workflow,
  resamples = auto_fold,
  grid = threshold_grid,
  control = control_grid(parallel_over = "everything")
)
best_threshold <- select_best(tune_res, metric = "rmse")
pca_final <- finalize_workflow(pca_workflow, best_threshold)
pca_final_fit <- fit(pca_final, data = auto_train)
pca_aug <- augment(pca_final_fit, new_data = auto_test)
pca_result <- rbind(
rmse(pca_aug,truth = acceleration, estimate = .pred),
rsq(pca_aug, truth = acceleration, estimate = .pred))
pca_result
Final PCA model
pca_aug <- augment(pca_final_fit, new_data=auto) %>% mutate(.resid=acceleration-.pred)
pca_aug %>%
gf_qq(~.resid) %>% gf_qqline()
poly_recipe <- recipe(formula = acceleration ~ ., data = auto_train) %>%
step_rm(year, origin) %>%
step_poly(weight, displacement, horsepower, degree = tune()) %>%
step_normalize(all_predictors())
poly_spec <- linear_reg() %>%
set_engine("lm") %>%
set_mode("regression")
poly_workflow <- workflow() %>%
add_recipe(poly_recipe) %>%
add_model(poly_spec)
poly_grid <- grid_regular(degree(range = c(1, 10)), levels = 10)
poly_res <- tune_grid(
poly_workflow,
resamples = auto_fold,
grid = poly_grid,
control = control_grid(save_pred = TRUE, parallel_over = "everything")
)
best_degree <- select_best(poly_res, metric = "rmse")
poly_final <- finalize_workflow(poly_workflow, best_degree)
poly_final_fit <- fit(poly_final, data = auto_train)
poly_aug <- augment(poly_final_fit, new_data = auto_test)
poly_result <- rbind(
rmse(poly_aug,truth = acceleration, estimate = .pred),
rsq(poly_aug, truth = acceleration, estimate = .pred))
poly_result
Final tuned polynomial model
poly_aug <- augment(poly_final_fit, new_data=auto) %>% mutate(.resid=acceleration-.pred)
poly_aug %>%
gf_qq(~.resid) %>% gf_qqline()
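Since `ridge_result`, `pca_result`, and `poly_result` hold the same yardstick metrics on the same test split, a small convenience sketch puts them side by side:

```r
# Side-by-side test-set metrics for the three tuned models
bind_rows(ridge_result |> mutate(Model = "Ridge"),
          pca_result |> mutate(Model = "PCA"),
          poly_result |> mutate(Model = "Polynomial")) |>
  dplyr::select(Model, .metric, .estimate)
```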
bikeshare <- Bikeshare
contrasts(bikeshare$hr) <- contr.sum(24)
contrasts(bikeshare$mnth) <- contr.sum(12)
pr <- poisson_reg() %>%
  set_mode("regression") %>%
  set_engine("glm")  # the glm engine fits with family = poisson automatically
mod.pois <- pr %>%
  fit(bikers ~ mnth + hr + workingday + temp + weathersit, data = bikeshare)
tidy(mod.pois)
Plot the coefficients associated with `mnth` and `hr`
# With sum contrasts, the last level's coefficient is the negative sum of the others
coef.months <- c(tidy(mod.pois)$estimate[2:12], -sum(tidy(mod.pois)$estimate[2:12]))
coef.hours <- c(tidy(mod.pois)$estimate[13:35], -sum(tidy(mod.pois)$estimate[13:35]))
grid.arrange(
  gf_point(coef.months ~ 1:12, shape = 19) %>% gf_line(col = 'blue') %>%
    gf_labs(x = "Month", y = "Coefficient") +
    scale_x_continuous(breaks = seq(1, 12, 1),
                       labels = c("J","F","M","A","M","J","J","A","S","O","N","D")),
  gf_point(coef.hours ~ 1:24, shape = 19) %>% gf_line(col = 'blue') %>%
    gf_labs(x = "Hour", y = "Coefficient") +
    scale_x_continuous(breaks = seq(1, 24, 1)),
  nrow = 1
)