project two data 110

Source:https://www.tvtoyota.com

Introduction

My data set is called “MVA Vehicle Sales Counts by Month for Calendar Year 2002 through December 2025. It has 288 rows and 6 columns” and it can be founded in https://opendata.maryland.gov. It was created on November 20, 2015 and was last updated on March 25, 2026. It has 288 rows and 6 columns. With the quantitative variables being New, Used, Total Sales New and Total Sales Old. And the categorical variable are the Month and Year. I choose this data set because I was curious about the amount of used cars and new cars being sold. And I wanted to know the total sales of the cars being sold.

Research question: How do the number of new and used vehicles sold, along with total used vehicle sales, influence total new vehicle sales?

library(tidyverse) # Load all the functions everything else that I need to do the codes
library(dplyr)
library(ggplot2)
library(highcharter)
library(RColorBrewer)

setwd("C:/Users/kenne/Downloads") # Set the working directory
MVA_sales <- read.csv("MVA Vehicle Sales.csv")

EDA

str(MVA_sales) # Looking at the dimensions on the data set

'data.frame':   288 obs. of  6 variables:
 $ Year.           : int  2002 2002 2002 2002 2002 2002 2002 2002 2002 2002 ...
 $ Month.          : chr  "JAN" "FEB" "MAR" "APR" ...
 $ New             : chr  "31,106" "27,520" "34,225" "36,452" ...
 $ Used            : chr  "49,927" "50,982" "58,794" "59,817" ...
 $ Total.Sales.New : chr  "$755,015,820" "$664,454,223" "$805,666,244" "$846,368,297" ...
 $ Total.Sales.Used: chr  "$386,481,929" "$361,353,242" "$419,385,387" "$433,061,150" ...

head(MVA_sales, 10) # Looking at the first ten rows of my data set, so I can see what I can fix/change

   Year. Month.    New   Used Total.Sales.New Total.Sales.Used
1   2002    JAN 31,106 49,927    $755,015,820     $386,481,929
2   2002    FEB 27,520 50,982    $664,454,223     $361,353,242
3   2002    MAR 34,225 58,794    $805,666,244     $419,385,387
4   2002    APR 36,452 59,817    $846,368,297     $433,061,150
5   2002    MAY 37,359 60,577    $855,005,784     $442,569,410
6   2002    JUN 36,348 55,415    $830,251,613     $414,731,166
7   2002    JUL 30,367 55,235    $700,530,891     $384,673,023
8   2002    AUG 38,965 60,310    $934,484,212     $455,453,720
9   2002    SEP 39,740 55,485    $950,502,055     $432,112,270
10  2002    OCT 35,276 58,020    $865,326,154     $456,738,532

names(MVA_sales) <- gsub("[\\$,\\.]","_",names(MVA_sales)) # Fixing all the variables that has a . in it
MVA_sales$New <- as.numeric(gsub(",", "", MVA_sales$New))
MVA_sales$Used <- as.numeric(gsub(",", "", MVA_sales$Used))
MVA_sales$Total_Sales_New <- as.numeric(gsub("[\\$,]", "", MVA_sales$Total_Sales_New)) # Turning it into a number for the graph
MVA_sales$Total_Sales_Used <- as.numeric(gsub("[\\$,]", "", MVA_sales$Total_Sales_Used))

summary(MVA_sales) # To see the summary of the data set and see if the gsub works

     Year_         Month_               New             Used      
 Min.   :2002   Length:288         Min.   :13700   Min.   :29420  
 1st Qu.:2008   Class :character   1st Qu.:23770   1st Qu.:49493  
 Median :2014   Mode  :character   Median :27240   Median :55166  
 Mean   :2014                      Mean   :27862   Mean   :54796  
 3rd Qu.:2019                      3rd Qu.:31204   3rd Qu.:60618  
 Max.   :2025                      Max.   :45547   Max.   :73163  
 Total_Sales_New     Total_Sales_Used   
 Min.   :4.052e+08   Min.   :295521964  
 1st Qu.:7.637e+08   1st Qu.:445021274  
 Median :9.061e+08   Median :526031452  
 Mean   :8.943e+08   Mean   :575673982  
 3rd Qu.:1.009e+09   3rd Qu.:718581313  
 Max.   :1.518e+09   Max.   :934785809

Multiple linear regression

model2 <- lm(Total_Sales_New ~ New + ., data = MVA_sales) # The equation that includes all the variables

summary(model2) #seeing the results of the equation, and what variables I will need to remove


Call:
lm(formula = Total_Sales_New ~ New + ., data = MVA_sales)

Residuals:
       Min         1Q     Median         3Q        Max 
-141362617  -19604301   -2458149   19294834  257849428 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -3.296e+10  2.302e+09 -14.315  < 2e-16 ***
New               3.213e+04  8.006e+02  40.129  < 2e-16 ***
Year_             1.644e+07  1.144e+06  14.372  < 2e-16 ***
Month_AUG         2.934e+07  1.160e+07   2.530 0.011977 *  
Month_DEC         4.766e+07  1.254e+07   3.801 0.000178 ***
Month_FEB         1.648e+07  1.238e+07   1.331 0.184226    
Month_JAN         1.768e+06  1.237e+07   0.143 0.886424    
Month_JUL         3.495e+06  1.159e+07   0.302 0.763260    
Month_JUN        -1.368e+07  1.162e+07  -1.177 0.240208    
Month_MAR         3.490e+07  1.168e+07   2.987 0.003073 ** 
Month_MAY        -1.633e+07  1.156e+07  -1.413 0.158675    
Month_NOV         2.476e+07  1.228e+07   2.016 0.044807 *  
Month_OCT         3.941e+07  1.168e+07   3.375 0.000847 ***
Month_SEP         1.856e+07  1.165e+07   1.593 0.112308    
Used             -8.339e+03  5.260e+02 -15.852  < 2e-16 ***
Total_Sales_Used  5.183e-01  4.560e-02  11.367  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 39950000 on 272 degrees of freedom
Multiple R-squared:  0.9581,    Adjusted R-squared:  0.9558 
F-statistic: 415.1 on 15 and 272 DF,  p-value: < 2.2e-16

model <- lm(Total_Sales_New ~ New + Used + Total_Sales_Used, data = MVA_sales) # Using only the variables needed and removing the unimportant ones.

summary(model) # The results of the new equation


Call:
lm(formula = Total_Sales_New ~ New + Used + Total_Sales_Used, 
    data = MVA_sales)

Residuals:
       Min         1Q     Median         3Q        Max 
-190352874  -24307266     -95263   24615523  341568488 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)       1.404e+08  2.384e+07   5.889  1.1e-08 ***
New               2.458e+04  7.696e+02  31.937  < 2e-16 ***
Used             -1.060e+04  5.655e+02 -18.745  < 2e-16 ***
Total_Sales_Used  1.129e+00  2.138e-02  52.807  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 53590000 on 284 degrees of freedom
Multiple R-squared:  0.9213,    Adjusted R-squared:  0.9205 
F-statistic:  1109 on 3 and 284 DF,  p-value: < 2.2e-16

Equation

Total_Sales_New=140,400,000 + 24,580(New) − 10,600(Used) + 1.129(Total_Sales_Used)

Interpretation: About 92.05% of the variation in total new vehicle sales is explained by the model looking at the adjusted R-squared. And the p-value is 2.2e-16( 0.00000000000000022) which is a lot less than 0.05, which means that the model is significant. there may be multicollinearity because Used and Total_Sales_Used are closely related. And that could affect the results.

Graphs

MVA_sales <- MVA_sales |>
 group_by(Month_) # Grouping the months for the graph

options(scipen = 999) # Fixing the number so I can see it
ggplot(MVA_sales, aes(x = New, y = Total_Sales_New, color = Total_Sales_New)) +
  geom_point(size = 3, alpha = 0.5) + # A graph of new cars being sold and the total sales of it.
  
  labs(
    title = "Relationship Between New Vehicle Sales and Total Sales Revenue",
    x = "Number of New Vehicles Sold",
    y = "Total Sales of New Vehicles ($)",
    caption = "Source: Maryland Vehicle Administration (MVA) Vehicle Sales Data"
  ) +
  
 theme_dark()

highchart() |> # The improved version of the first graph
  hc_title(text = "Relationship Between New Vehicle Sales and Total Sales Revenue") |>
  hc_xAxis(
    title = list(text = "Number of New Vehicles Sold")
  ) |>
  hc_yAxis(
    title = list(text = "Total Sales of New Vehicles ($)")
  ) |>
  hc_add_series(
    data = MVA_sales,
    type = "scatter",
    hcaes(x = New, y = Total_Sales_New, group = Month_), 
    ) |>
  hc_plotOptions(series = list(marker = list(symbol = "circle"))) |>
  hc_caption(text = "Source: Maryland Vehicle Administration (MVA) Vehicle Sales Data")

Conclusion

The multiple linear regression model explains approximately 92% of the variation in total new vehicle sales, indicating a strong fit. All predictor variables are statistically significant. The number of new vehicles sold has a strong positive effect on total new sales, while used vehicle sales have a negative relationship, suggesting a substitution effect between used and new vehicles. Additionally, total used vehicle sales positively influence new vehicle sales, reflecting overall market demand trends. Despite the strong model performance, potential multicollinearity between predictors should be considered when interpreting the coefficients.

References

https://nyu-cdsc.github.io/learningr/assets/data-transformation.pdf

https://opendata.maryland.gov/Transportation/MVA-Vehicle-Sales-Counts-by-Month-for-Calendar-Yea/un65-7ipd/about_data