R Markdown

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(dplyr)
library(ggplot2)
library(Metrics)
## 
## Attaching package: 'Metrics'
## 
## The following objects are masked from 'package:yardstick':
## 
##     accuracy, mae, mape, mase, precision, recall, rmse, smape
library(readxl)
Car_Sales <-read.csv("C:/Users/Baha/OneDrive/Documents/Car_Sales_data.csv") %>% 
  tibble::as_tibble() %>% 
  janitor::clean_names()
##Data cleaning and data preparation
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
Car_Sales <- Car_Sales %>% drop_na()
## dataset for modeling
Car_Sales$is_new <-factor(Car_Sales$is_new, levels = 
                                c("TRUE", "FALSE"))
Car_Sales$body_type <-as.factor(Car_Sales$body_type)
Car_Sales$make_name <-as.factor(Car_Sales$make_name)
Car_Sales <- data.frame(Car_Sales)
class(Car_Sales)
## [1] "data.frame"
# Split the Data
set.seed(7)
Car_Sales_split <- Car_Sales %>% rsample::initial_split(strata= 
         "price",
             prop = 0.8)
Car_Sales_train <- rsample::training(Car_Sales_split)
Car_Sales_test <- rsample::testing(Car_Sales_split)
# Fitting GLMs to model Price of used cars
#GAMMA DISTRIBUTION
# Define the recipe
attach(Car_Sales)
gamma_rec <- recipes::recipe(price ~ body_type +
                               fuel_tank_gallons +
                               highway_fuel_economy + city_fuel_economy + wheelbase_inches +
                               back_legroom_inches + front_legroom_inches + length_inches + width_inches +
                               height_inches + engine_displacement + horsepower + daysonmarket + maximum_seating + is_new +
                               owner_count + mileage +
                               seller_rating,data = Car_Sales_train
) %>%
  step_zv(all_predictors(), -all_outcomes()) %>% 
  step_naomit(all_predictors(), -all_nominal()) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_center(all_predictors(), -all_nominal()) %>%
  step_scale(all_predictors(), -all_nominal()) %>%
  step_log(price, base = 10) %>%
  prep(training = Car_Sales_train, retain = TRUE)
# Fit a glm with gamma distribution and log link
fit_gamma <- glm(formula = price ~fuel_tank_gallons +
                                wheelbase_inches +
                               back_legroom_inches + front_legroom_inches + length_inches + width_inches +
                               height_inches + daysonmarket  ,data = recipes::
  juice(gamma_rec),
family = Gamma(link = "log")  ## Specify gamma distribution and log link
)
summary(fit_gamma)
## 
## Call:
## glm(formula = price ~ fuel_tank_gallons + wheelbase_inches + 
##     back_legroom_inches + front_legroom_inches + length_inches + 
##     width_inches + height_inches + daysonmarket, family = Gamma(link = "log"), 
##     data = recipes::juice(gamma_rec))
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.4465292  0.0029593 488.811  < 2e-16 ***
## fuel_tank_gallons    -0.0007795  0.0061839  -0.126 0.899762    
## wheelbase_inches      0.0213827  0.0102640   2.083 0.037973 *  
## back_legroom_inches   0.0051131  0.0034558   1.480 0.139911    
## front_legroom_inches  0.0068612  0.0033545   2.045 0.041591 *  
## length_inches        -0.0147155  0.0108704  -1.354 0.176724    
## width_inches          0.0171335  0.0038586   4.440 1.22e-05 ***
## height_inches        -0.0024379  0.0049092  -0.497 0.619799    
## daysonmarket         -0.0115920  0.0029965  -3.868 0.000131 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.003056317)
## 
##     Null deviance: 1.3148  on 348  degrees of freedom
## Residual deviance: 1.0623  on 340  degrees of freedom
## AIC: -3.1131
## 
## Number of Fisher Scoring iterations: 4
test_gamma <- recipes::bake(gamma_rec,new_data = Car_Sales_test,
                            all_predictors()
)
# get predictions
pred <- predict(fit_gamma, test_gamma, se.fit = FALSE, scale = NULL, df = Inf,
interval = "prediction",level = 0.95, type = "response")
results <-Car_Sales_test %>% select(id, price) %>% 
  mutate(price = log10(price)) %>% 
bind_cols(pred)
## New names:
## • `` -> `...3`
mse <- mean((fit_gamma$residuals)^2)
cat("Mean Squared Error:", mse)
## Mean Squared Error: 0.002977501
rmse <-sqrt(mse);rmse
## [1] 0.05456648
# construct submission_File from the predictions
submissionFile <- data.frame(id = results$id, price = results$...3)
submissionFile$price <-10^submissionFile$price
submissionFile
##         id    price
## 1  4137401 16277.07
## 2  9237556 18860.50
## 3  6772057 13859.81
## 4  3338806 22926.22
## 5  8589684 27130.95
## 6  3506803 20130.25
## 7  3077779 15632.31
## 8  4969822 11072.24
## 9  8762397 21457.06
## 10 7012823 20399.17
## 11 6643008 16362.14
## 12 7812621 13772.70
## 13 4427107 22146.11
## 14 5659103 25113.43
## 15 8214234 17422.07
## 16 5313468 17201.87
## 17 4065486 16445.18
## 18 3848311 15973.41
## 19 9367621 21121.53
## 20 1130414 15016.12
## 21 1227852 16079.49
## 22 5150031 16819.79
## 23 8019755 20833.38
## 24 1006804 21849.57
## 25 3056740 14916.25
## 26 8782032 15676.59
## 27 6537055 16397.58
## 28 9614363 16186.38
## 29 5359373 26374.68
## 30 3288014 19896.12
## 31 6651149 16269.88
## 32 6915682 12584.60
## 33 8311194 15133.04
## 34 8807989 15348.85
## 35 1155839 13840.49
## 36 3559497 21018.77
## 37 2490771 15825.41
## 38 7803421 14447.87
## 39 8710707 46246.86
## 40 2235753 13589.27
## 41 8624715 20883.27
## 42 2749742 17373.81
## 43 8596169 17895.34
## 44 3237840 28213.28
## 45 9630730 13565.65
## 46 9481224 16721.69
## 47 6354319 18217.31
## 48 5181108 15853.42
## 49 2870716 16912.17
## 50 9468910 19107.93
## 51 5662317 18775.31
## 52 6503718 32096.07
## 53 9928796 20553.73
## 54 6056572 21885.83
## 55 6288342 21575.32
## 56 2553248 19474.82
## 57 8679207 15706.94
## 58 4916045 26374.68
## 59 6368450 19332.76
## 60 1545500 15046.69
## 61 1722443 32405.00
## 62 7492355 19107.79
## 63 2008699 14826.38
## 64 5739328 14280.67
## 65 4492228 17033.70
## 66 1050055 14417.42
## 67 7692398 19861.39
## 68 4919001 16406.02
## 69 8308679 17062.04
## 70 1447874 21726.56
## 71 9969150 20839.84
## 72 7004812 21746.27
## 73 9764300 13357.39
## 74 8337700 13088.12
## 75 9084352 18160.67
## 76 9404051 20906.72
## 77 2569803 26490.62
## 78 2056011 17039.69
## 79 2127811 15535.00
## 80 4057290 16091.48
## 81 8797364 17007.81
## 82 1990887 14294.61
## 83 4404462 15510.05
## 84 6784301 17371.59
## 85 4299309 10036.87
## 86 6746150 16055.51
## 87 4689466 19712.74
## 88 2603303 14395.43
## 89 1426785 20997.62
write.csv(submissionFile, 'sample_submission.csv',row.names = F)