R Markdown
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(dplyr)
library(ggplot2)
library(Metrics)
##
## Attaching package: 'Metrics'
##
## The following objects are masked from 'package:yardstick':
##
## accuracy, mae, mape, mase, precision, recall, rmse, smape
library(readxl)
Car_Sales <-read.csv("C:/Users/Baha/OneDrive/Documents/Car_Sales_data.csv") %>%
tibble::as_tibble() %>%
janitor::clean_names()
##Data cleaning and data preparation
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
Car_Sales <- Car_Sales %>% drop_na()
## dataset for modeling
Car_Sales$is_new <-factor(Car_Sales$is_new, levels =
c("TRUE", "FALSE"))
Car_Sales$body_type <-as.factor(Car_Sales$body_type)
Car_Sales$make_name <-as.factor(Car_Sales$make_name)
Car_Sales <- data.frame(Car_Sales)
class(Car_Sales)
## [1] "data.frame"
# Split the Data
set.seed(7)
Car_Sales_split <- Car_Sales %>% rsample::initial_split(strata=
"price",
prop = 0.8)
Car_Sales_train <- rsample::training(Car_Sales_split)
Car_Sales_test <- rsample::testing(Car_Sales_split)
# Fitting GLMs to model Price of used cars
#GAMMA DISTRIBUTION
# Define the recipe
attach(Car_Sales)
gamma_rec <- recipes::recipe(price ~ body_type +
fuel_tank_gallons +
highway_fuel_economy + city_fuel_economy + wheelbase_inches +
back_legroom_inches + front_legroom_inches + length_inches + width_inches +
height_inches + engine_displacement + horsepower + daysonmarket + maximum_seating + is_new +
owner_count + mileage +
seller_rating,data = Car_Sales_train
) %>%
step_zv(all_predictors(), -all_outcomes()) %>%
step_naomit(all_predictors(), -all_nominal()) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_center(all_predictors(), -all_nominal()) %>%
step_scale(all_predictors(), -all_nominal()) %>%
step_log(price, base = 10) %>%
prep(training = Car_Sales_train, retain = TRUE)
# Fit a glm with gamma distribution and log link
fit_gamma <- glm(formula = price ~fuel_tank_gallons +
wheelbase_inches +
back_legroom_inches + front_legroom_inches + length_inches + width_inches +
height_inches + daysonmarket ,data = recipes::
juice(gamma_rec),
family = Gamma(link = "log") ## Specify gamma distribution and log link
)
summary(fit_gamma)
##
## Call:
## glm(formula = price ~ fuel_tank_gallons + wheelbase_inches +
## back_legroom_inches + front_legroom_inches + length_inches +
## width_inches + height_inches + daysonmarket, family = Gamma(link = "log"),
## data = recipes::juice(gamma_rec))
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.4465292 0.0029593 488.811 < 2e-16 ***
## fuel_tank_gallons -0.0007795 0.0061839 -0.126 0.899762
## wheelbase_inches 0.0213827 0.0102640 2.083 0.037973 *
## back_legroom_inches 0.0051131 0.0034558 1.480 0.139911
## front_legroom_inches 0.0068612 0.0033545 2.045 0.041591 *
## length_inches -0.0147155 0.0108704 -1.354 0.176724
## width_inches 0.0171335 0.0038586 4.440 1.22e-05 ***
## height_inches -0.0024379 0.0049092 -0.497 0.619799
## daysonmarket -0.0115920 0.0029965 -3.868 0.000131 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.003056317)
##
## Null deviance: 1.3148 on 348 degrees of freedom
## Residual deviance: 1.0623 on 340 degrees of freedom
## AIC: -3.1131
##
## Number of Fisher Scoring iterations: 4
test_gamma <- recipes::bake(gamma_rec,new_data = Car_Sales_test,
all_predictors()
)
# get predictions
pred <- predict(fit_gamma, test_gamma, se.fit = FALSE, scale = NULL, df = Inf,
interval = "prediction",level = 0.95, type = "response")
results <-Car_Sales_test %>% select(id, price) %>%
mutate(price = log10(price)) %>%
bind_cols(pred)
## New names:
## • `` -> `...3`
mse <- mean((fit_gamma$residuals)^2)
cat("Mean Squared Error:", mse)
## Mean Squared Error: 0.002977501
rmse <-sqrt(mse);rmse
## [1] 0.05456648
# construct submission_File from the predictions
submissionFile <- data.frame(id = results$id, price = results$...3)
submissionFile$price <-10^submissionFile$price
submissionFile
## id price
## 1 4137401 16277.07
## 2 9237556 18860.50
## 3 6772057 13859.81
## 4 3338806 22926.22
## 5 8589684 27130.95
## 6 3506803 20130.25
## 7 3077779 15632.31
## 8 4969822 11072.24
## 9 8762397 21457.06
## 10 7012823 20399.17
## 11 6643008 16362.14
## 12 7812621 13772.70
## 13 4427107 22146.11
## 14 5659103 25113.43
## 15 8214234 17422.07
## 16 5313468 17201.87
## 17 4065486 16445.18
## 18 3848311 15973.41
## 19 9367621 21121.53
## 20 1130414 15016.12
## 21 1227852 16079.49
## 22 5150031 16819.79
## 23 8019755 20833.38
## 24 1006804 21849.57
## 25 3056740 14916.25
## 26 8782032 15676.59
## 27 6537055 16397.58
## 28 9614363 16186.38
## 29 5359373 26374.68
## 30 3288014 19896.12
## 31 6651149 16269.88
## 32 6915682 12584.60
## 33 8311194 15133.04
## 34 8807989 15348.85
## 35 1155839 13840.49
## 36 3559497 21018.77
## 37 2490771 15825.41
## 38 7803421 14447.87
## 39 8710707 46246.86
## 40 2235753 13589.27
## 41 8624715 20883.27
## 42 2749742 17373.81
## 43 8596169 17895.34
## 44 3237840 28213.28
## 45 9630730 13565.65
## 46 9481224 16721.69
## 47 6354319 18217.31
## 48 5181108 15853.42
## 49 2870716 16912.17
## 50 9468910 19107.93
## 51 5662317 18775.31
## 52 6503718 32096.07
## 53 9928796 20553.73
## 54 6056572 21885.83
## 55 6288342 21575.32
## 56 2553248 19474.82
## 57 8679207 15706.94
## 58 4916045 26374.68
## 59 6368450 19332.76
## 60 1545500 15046.69
## 61 1722443 32405.00
## 62 7492355 19107.79
## 63 2008699 14826.38
## 64 5739328 14280.67
## 65 4492228 17033.70
## 66 1050055 14417.42
## 67 7692398 19861.39
## 68 4919001 16406.02
## 69 8308679 17062.04
## 70 1447874 21726.56
## 71 9969150 20839.84
## 72 7004812 21746.27
## 73 9764300 13357.39
## 74 8337700 13088.12
## 75 9084352 18160.67
## 76 9404051 20906.72
## 77 2569803 26490.62
## 78 2056011 17039.69
## 79 2127811 15535.00
## 80 4057290 16091.48
## 81 8797364 17007.81
## 82 1990887 14294.61
## 83 4404462 15510.05
## 84 6784301 17371.59
## 85 4299309 10036.87
## 86 6746150 16055.51
## 87 4689466 19712.74
## 88 2603303 14395.43
## 89 1426785 20997.62
write.csv(submissionFile, 'sample_submission.csv',row.names = F)