Changing Directory

# Switch the session to the project folder so the relative CSV path used
# further down resolves.
# NOTE(review): machine-specific absolute path; confirm it exists wherever
# this script is run.
project_dir <- "C:/Users/HashT/ITEC4220"
setwd(project_dir)

Loading Dataset

# Read the used-car listings from the working directory.
# Blank cells and "N/A" markers are treated as missing, matching how this same
# file is read again for the manufacturing-year histogram, so both loads agree
# on what counts as NA.
car_data <- read.csv("used_car_listings.csv", na.strings = c("", "NA", "N/A"))

1 Plotting the dataset

I am using the mileage and price columns to see whether prices drop as the mileage of the car increases.

# Scatter plot of listing price against mileage with a smoothed trend line.
# Mileage is rescaled to thousands of miles so the axis ticks stay readable.
mileage_thousands <- car_data$mileage / 1000

# Tick positions every 10 (thousand miles) from 0 up to the largest mileage.
xticks <- seq(0, max(mileage_thousands, na.rm = TRUE), by = 10)

plot(mileage_thousands, car_data$price,
     main = "Used Car Price vs Mileage",
     # The plotted values are mileage / 1000, so the label states that unit.
     xlab = "Mileage (thousands of miles)",
     ylab = "Price (in USD)",
     pch = 16, col = rgb(0, 0, 1, 0.4),
     cex = 0.7,
     cex.lab = 1.2, cex.axis = 1.1, cex.main = 1.4,
     xaxt = "n")  # default x axis suppressed; drawn manually just below

axis(1, at = xticks, labels = xticks)

#Add a trend line
# smooth.spline() stops on missing or infinite inputs, so fit it only on
# listings where both mileage and price are present.
has_both <- is.finite(mileage_thousands) & is.finite(car_data$price)
lines(smooth.spline(mileage_thousands[has_both], car_data$price[has_both]),
      col = "red", lwd = 2)

legend("topright", legend = c("Car Listings", "Trend Line"),
       col = c("blue", "red"), pch = c(16, NA), lty = c(NA, 1), lwd = c(NA, 2))

This graph shows each car's mileage and the price it is typically listed at, which gives good insight into the car market. It seems that once a car gets past 30,000 miles, it starts to depreciate more.

2 Finding the mean and the median of low mileage cars and high mileage cars

# Section 2: compare typical prices of low- and high-mileage listings.
# Listings from 50,000 up to (but not including) 150,000 miles are in neither
# group; rows with a missing mileage are excluded from both.
under_50k <- !is.na(car_data$mileage) & car_data$mileage < 50000
from_150k <- !is.na(car_data$mileage) & car_data$mileage >= 150000
low_mileage <- car_data[under_50k, , drop = FALSE]
high_mileage <- car_data[from_150k, , drop = FALSE]

# Mean price per group, ignoring listings whose price is missing
mean_low <- mean(low_mileage[["price"]], na.rm = TRUE)
mean_high <- mean(high_mileage[["price"]], na.rm = TRUE)

# Median price per group, same NA handling
median_low <- median(low_mileage[["price"]], na.rm = TRUE)
median_high <- median(high_mileage[["price"]], na.rm = TRUE)

# Report the four figures; each "##" line is the output recorded for the
# cat() call directly above it.
cat("The mean price for low mileage cars is $", mean_low, "\n")
## The mean price for low mileage cars is $ 22820.05
cat("The mean price for high mileage cars is $", mean_high, "\n")
## The mean price for high mileage cars is $ 2336.302
cat("The median price for low mileage cars is $", median_low, "\n")
## The median price for low mileage cars is $ 20412
cat("The median price for high mileage cars is $", median_high, "\n")
## The median price for high mileage cars is $ 1854

Plotting the results

# Side-by-side box plots of listing price for the two mileage groups.
# The list names become the labels under each box.
price_groups <- list(
  "Low Mileage (<50k)" = low_mileage$price,
  "High Mileage (150k+)" = high_mileage$price
)
boxplot(price_groups,
        col = c("lightblue", "lightgreen"),
        ylab = "Price (USD)",
        main = "Price Comparison: Low vs High Mileage Cars")

This box plot compares the price distributions of low-mileage cars (under 50k miles) and high-mileage cars (150k miles or more); the line inside each box is the median price. The mean price is $22,820 for low-mileage cars and $2,336 for high-mileage cars, and the difference between them is night and day.

3 Correlation Analysis

# Keep only the two analysis columns and drop listings missing either value.
# Note: this overwrites car_data with the two-column, complete-case table.
analysis_cols <- c("mileage", "price")
car_data <- na.omit(car_data[, analysis_cols])

# Pearson correlation test between mileage and price
cor.test(car_data$mileage, car_data$price)
## 
##  Pearson's product-moment correlation
## 
## data:  car_data$mileage and car_data$price
## t = -38.121, df = 2066, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.6672243 -0.6165757
## sample estimates:
##        cor 
## -0.6426014
# Fit a straight-line model predicting price from mileage
model <- lm(formula = price ~ mileage, data = car_data)

# Print the coefficient table, residual summary and fit statistics
summary(model)
## 
## Call:
## lm(formula = price ~ mileage, data = car_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -12117  -4880  -1883   2595  54315 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.833e+04  2.928e+02   62.59   <2e-16 ***
## mileage     -7.908e-02  2.074e-03  -38.12   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7636 on 2066 degrees of freedom
## Multiple R-squared:  0.4129, Adjusted R-squared:  0.4127 
## F-statistic:  1453 on 1 and 2066 DF,  p-value: < 2.2e-16

This is a negative correlation. What that means is that as the mileage of the car increases, the price of the car decreases.

4 Histogram of the car manufacturing years

# Reload the listings so this section does not depend on earlier sections,
# treating blank cells and "N/A" markers as missing.
car_data <- read.csv("used_car_listings.csv", na.strings = c("", "NA", "N/A"))

# Derive a numeric year alongside the original column (which is left as-is):
#   1. view the year as text,
#   2. delete every non-digit character,
#   3. convert what remains to integer (anything unparseable becomes NA;
#      the conversion warnings are deliberately silenced).
year_text <- as.character(car_data$year)
year_digits <- gsub("\\D", "", year_text)
year_num <- suppressWarnings(as.integer(year_digits))

# Store the cleaned year as a new column
car_data$year_num <- year_num

# Keep only plausible model years (1990-2025) so stray values do not
# flatten the histogram.
plausible_year <- !is.na(car_data$year_num) &
  car_data$year_num >= 1990 &
  car_data$year_num <= 2025
df_years <- car_data[plausible_year, , drop = FALSE]


# Average model year of the retained listings, used for the marker line
mu <- mean(df_years$year_num)

# Histogram of manufacturing years (roughly 20 bins requested)
hist(df_years$year_num,
     breaks = 20,
     col = "lightblue",
     border = "black",
     xlab = "Year",
     ylab = "Number of Cars",
     main = "Manufacturing Years of Used Car Listings")

# Mark the mean year with a vertical red line and label it in the corner
abline(v = mu, col = "red", lwd = 2)
mean_label <- paste0("Mean Year = ", round(mu, 1))
legend("topleft", legend = mean_label, col = "red", lwd = 2, bty = "n")

This histogram shows the number of cars in the data set for each manufacturing year. I wanted to see how old the cars listed for sale are. As you can see, there’s a spike in listings for cars made between 2012 and 2020.

5 Using T-Test to compare the mean price of Electric Vehicles and Gasoline Vehicles

# Listings that have both a price and a fuel type recorded
has_price_and_fuel <- !is.na(car_data$price) & !is.na(car_data$fuel_type)
df <- car_data[has_price_and_fuel, , drop = FALSE]

# Restrict to the two groups being compared: electric vs petrol (gasoline)
fuel_groups <- c("Electric", "Petrol")
df_subset <- df[df$fuel_type %in% fuel_groups, , drop = FALSE]

# Two-sample t-test of price by fuel type
t_test_result <- t.test(price ~ fuel_type, data = df_subset)

print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  price by fuel_type
## t = 1.9388, df = 307.81, p-value = 0.05343
## alternative hypothesis: true difference in means between group Electric and group Petrol is not equal to 0
## 95 percent confidence interval:
##   -24.51501 3317.97765
## sample estimates:
## mean in group Electric   mean in group Petrol 
##              11092.556               9445.824

Plotting the t-test results

# install.packages(c("dplyr", "ggplot2")) if not installed
library(dplyr)
# ggplot(), geom_*(), labs(), scale_fill_manual() and theme_minimal() below
# come from ggplot2, which must be attached for the plot to run.
library(ggplot2)

# Mean price and standard error of the mean for each fuel type
summary_stats <- df_subset %>%
  group_by(fuel_type) %>%
  summarise(mean_price = mean(price, na.rm = TRUE),
            se = sd(price, na.rm = TRUE) / sqrt(n()))

# Bar chart of mean price per fuel type; the error bars extend one standard
# error above and below each mean.
ggplot(summary_stats, aes(x = fuel_type, y = mean_price, fill = fuel_type)) +
  geom_col(alpha = 0.7) +
  geom_errorbar(aes(ymin = mean_price - se, ymax = mean_price + se),
                width = 0.2, colour = "black") +
  labs(title = "Average Price with Error Bars",
       x = "Fuel Type", y = "Mean Price (USD)") +
  scale_fill_manual(values = c("Electric" = "blue", "Petrol" = "green")) +
  theme_minimal()

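With this I wanted to compare the average prices of electric vehicles and gas vehicles to see which one holds its value better. I would say that electric vehicles hold their value better because they are fairly new and there aren’t many electric vehicles compared to regular gas vehicles. Note, however, that the t-test p-value (0.053) is just above 0.05, so the difference in mean price is not statistically significant at the 5% level.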

In conclusion, the biggest determining factor of a car’s value is its mileage. Another hypothesis I had was whether the type of vehicle (gas, electric, hybrid, diesel, etc.) affects its value; in particular, I was looking at whether electric cars can hold their value like gasoline cars. It seems like the average price of electric cars is higher than that of gasoline cars, but I also realized that there aren’t that many used electric vehicles for sale, so I can’t find a true answer for that.