library(tidyverse) # Load all the functions everything else that I need to do the codes
library(dplyr)
library(ggplot2)
library(highcharter)
library(RColorBrewer)project two data 110
Source:https://www.tvtoyota.com
Introduction
My data set is called “MVA Vehicle Sales Counts by Month for Calendar Year 2002 through December 2025. It has 288 rows and 6 columns” and it can be founded in https://opendata.maryland.gov. It was created on November 20, 2015 and was last updated on March 25, 2026. It has 288 rows and 6 columns. With the quantitative variables being New, Used, Total Sales New and Total Sales Old. And the categorical variable are the Month and Year. I choose this data set because I was curious about the amount of used cars and new cars being sold. And I wanted to know the total sales of the cars being sold.
Research question: How do the number of new and used vehicles sold, along with total used vehicle sales, influence total new vehicle sales?
setwd("C:/Users/kenne/Downloads") # Set the working directory
MVA_sales <- read.csv("MVA Vehicle Sales.csv") EDA
str(MVA_sales) # Looking at the dimensions on the data set'data.frame': 288 obs. of 6 variables:
$ Year. : int 2002 2002 2002 2002 2002 2002 2002 2002 2002 2002 ...
$ Month. : chr "JAN" "FEB" "MAR" "APR" ...
$ New : chr "31,106" "27,520" "34,225" "36,452" ...
$ Used : chr "49,927" "50,982" "58,794" "59,817" ...
$ Total.Sales.New : chr "$755,015,820" "$664,454,223" "$805,666,244" "$846,368,297" ...
$ Total.Sales.Used: chr "$386,481,929" "$361,353,242" "$419,385,387" "$433,061,150" ...
head(MVA_sales, 10) # Looking at the first ten rows of my data set, so I can see what I can fix/change Year. Month. New Used Total.Sales.New Total.Sales.Used
1 2002 JAN 31,106 49,927 $755,015,820 $386,481,929
2 2002 FEB 27,520 50,982 $664,454,223 $361,353,242
3 2002 MAR 34,225 58,794 $805,666,244 $419,385,387
4 2002 APR 36,452 59,817 $846,368,297 $433,061,150
5 2002 MAY 37,359 60,577 $855,005,784 $442,569,410
6 2002 JUN 36,348 55,415 $830,251,613 $414,731,166
7 2002 JUL 30,367 55,235 $700,530,891 $384,673,023
8 2002 AUG 38,965 60,310 $934,484,212 $455,453,720
9 2002 SEP 39,740 55,485 $950,502,055 $432,112,270
10 2002 OCT 35,276 58,020 $865,326,154 $456,738,532
names(MVA_sales) <- gsub("[\\$,\\.]","_",names(MVA_sales)) # Fixing all the variables that has a . in it
MVA_sales$New <- as.numeric(gsub(",", "", MVA_sales$New))
MVA_sales$Used <- as.numeric(gsub(",", "", MVA_sales$Used))
MVA_sales$Total_Sales_New <- as.numeric(gsub("[\\$,]", "", MVA_sales$Total_Sales_New)) # Turning it into a number for the graph
MVA_sales$Total_Sales_Used <- as.numeric(gsub("[\\$,]", "", MVA_sales$Total_Sales_Used))summary(MVA_sales) # To see the summary of the data set and see if the gsub works Year_ Month_ New Used
Min. :2002 Length:288 Min. :13700 Min. :29420
1st Qu.:2008 Class :character 1st Qu.:23770 1st Qu.:49493
Median :2014 Mode :character Median :27240 Median :55166
Mean :2014 Mean :27862 Mean :54796
3rd Qu.:2019 3rd Qu.:31204 3rd Qu.:60618
Max. :2025 Max. :45547 Max. :73163
Total_Sales_New Total_Sales_Used
Min. :4.052e+08 Min. :295521964
1st Qu.:7.637e+08 1st Qu.:445021274
Median :9.061e+08 Median :526031452
Mean :8.943e+08 Mean :575673982
3rd Qu.:1.009e+09 3rd Qu.:718581313
Max. :1.518e+09 Max. :934785809
Multiple linear regression
model2 <- lm(Total_Sales_New ~ New + ., data = MVA_sales) # The equation that includes all the variablessummary(model2) #seeing the results of the equation, and what variables I will need to remove
Call:
lm(formula = Total_Sales_New ~ New + ., data = MVA_sales)
Residuals:
Min 1Q Median 3Q Max
-141362617 -19604301 -2458149 19294834 257849428
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.296e+10 2.302e+09 -14.315 < 2e-16 ***
New 3.213e+04 8.006e+02 40.129 < 2e-16 ***
Year_ 1.644e+07 1.144e+06 14.372 < 2e-16 ***
Month_AUG 2.934e+07 1.160e+07 2.530 0.011977 *
Month_DEC 4.766e+07 1.254e+07 3.801 0.000178 ***
Month_FEB 1.648e+07 1.238e+07 1.331 0.184226
Month_JAN 1.768e+06 1.237e+07 0.143 0.886424
Month_JUL 3.495e+06 1.159e+07 0.302 0.763260
Month_JUN -1.368e+07 1.162e+07 -1.177 0.240208
Month_MAR 3.490e+07 1.168e+07 2.987 0.003073 **
Month_MAY -1.633e+07 1.156e+07 -1.413 0.158675
Month_NOV 2.476e+07 1.228e+07 2.016 0.044807 *
Month_OCT 3.941e+07 1.168e+07 3.375 0.000847 ***
Month_SEP 1.856e+07 1.165e+07 1.593 0.112308
Used -8.339e+03 5.260e+02 -15.852 < 2e-16 ***
Total_Sales_Used 5.183e-01 4.560e-02 11.367 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 39950000 on 272 degrees of freedom
Multiple R-squared: 0.9581, Adjusted R-squared: 0.9558
F-statistic: 415.1 on 15 and 272 DF, p-value: < 2.2e-16
model <- lm(Total_Sales_New ~ New + Used + Total_Sales_Used, data = MVA_sales) # Using only the variables needed and removing the unimportant ones.summary(model) # The results of the new equation
Call:
lm(formula = Total_Sales_New ~ New + Used + Total_Sales_Used,
data = MVA_sales)
Residuals:
Min 1Q Median 3Q Max
-190352874 -24307266 -95263 24615523 341568488
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.404e+08 2.384e+07 5.889 1.1e-08 ***
New 2.458e+04 7.696e+02 31.937 < 2e-16 ***
Used -1.060e+04 5.655e+02 -18.745 < 2e-16 ***
Total_Sales_Used 1.129e+00 2.138e-02 52.807 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 53590000 on 284 degrees of freedom
Multiple R-squared: 0.9213, Adjusted R-squared: 0.9205
F-statistic: 1109 on 3 and 284 DF, p-value: < 2.2e-16
Equation
Total_Sales_New=140,400,000 + 24,580(New) − 10,600(Used) + 1.129(Total_Sales_Used)
Interpretation: About 92.05% of the variation in total new vehicle sales is explained by the model looking at the adjusted R-squared. And the p-value is 2.2e-16( 0.00000000000000022) which is a lot less than 0.05, which means that the model is significant. there may be multicollinearity because Used and Total_Sales_Used are closely related. And that could affect the results.
Graphs
MVA_sales <- MVA_sales |>
group_by(Month_) # Grouping the months for the graphoptions(scipen = 999) # Fixing the number so I can see it
ggplot(MVA_sales, aes(x = New, y = Total_Sales_New, color = Total_Sales_New)) +
geom_point(size = 3, alpha = 0.5) + # A graph of new cars being sold and the total sales of it.
labs(
title = "Relationship Between New Vehicle Sales and Total Sales Revenue",
x = "Number of New Vehicles Sold",
y = "Total Sales of New Vehicles ($)",
caption = "Source: Maryland Vehicle Administration (MVA) Vehicle Sales Data"
) +
theme_dark()highchart() |> # The improved version of the first graph
hc_title(text = "Relationship Between New Vehicle Sales and Total Sales Revenue") |>
hc_xAxis(
title = list(text = "Number of New Vehicles Sold")
) |>
hc_yAxis(
title = list(text = "Total Sales of New Vehicles ($)")
) |>
hc_add_series(
data = MVA_sales,
type = "scatter",
hcaes(x = New, y = Total_Sales_New, group = Month_),
) |>
hc_plotOptions(series = list(marker = list(symbol = "circle"))) |>
hc_caption(text = "Source: Maryland Vehicle Administration (MVA) Vehicle Sales Data")Conclusion
The multiple linear regression model explains approximately 92% of the variation in total new vehicle sales, indicating a strong fit. All predictor variables are statistically significant. The number of new vehicles sold has a strong positive effect on total new sales, while used vehicle sales have a negative relationship, suggesting a substitution effect between used and new vehicles. Additionally, total used vehicle sales positively influence new vehicle sales, reflecting overall market demand trends. Despite the strong model performance, potential multicollinearity between predictors should be considered when interpreting the coefficients.
References
https://nyu-cdsc.github.io/learningr/assets/data-transformation.pdf
https://opendata.maryland.gov/Transportation/MVA-Vehicle-Sales-Counts-by-Month-for-Calendar-Yea/un65-7ipd/about_data