library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(effsize)
library(pwrss)
##
## Attaching package: 'pwrss'
##
## The following object is masked from 'package:stats':
##
## power.t.test
df <- read.csv("C:/Users/toyha/Downloads/vehicle/car details v4.csv")
# converting to US units: price to USD, km -> miles, mm -> inches, litres -> gallons
df <- df |>
  mutate(years_since = year(now()) - Year,
         PriceUSD = Price * 0.012,
         Mileage = Kilometer * 0.621371,
         LengthInch = Length * 0.0393701,
         WidthInch = Width * 0.0393701,
         HeightInch = Height * 0.0393701,
         FuelGallons = Fuel.Tank.Capacity * 0.264172,
         Volume = LengthInch * WidthInch * HeightInch)
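As an aside, the three dimension conversions could also be collapsed with across(); a sketch assuming the same column names:
# equivalent mm -> inch conversions in a single across() call
df <- df |>
  mutate(across(c(Length, Width, Height), \(x) x * 0.0393701, .names = "{.col}Inch"))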
#Cleaning up Owner attribute
df['Owner'][df['Owner'] == 'Fourth'] <- '4 or More'
#df <- subset(df, Owner != "UnRegistered Car")
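For reference, the same Owner recode as above in dplyr style (a sketch; either form gives the same result):
# fold 'Fourth' owners into the '4 or More' bucket
df <- df |> mutate(Owner = if_else(Owner == "Fourth", "4 or More", Owner))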
To start things off, I will make a few adjustments to the dataframe so it suits a binary null hypothesis, then calculate the mean price of cars with one previous owner versus cars with multiple previous owners.
#clearing away unregistered vehicles since they aren't necessary for our hypothesis
df_owner <- subset(df, Owner != "UnRegistered Car")
df_owner <- df_owner |> mutate(OwnerBinary = ifelse(Owner != "First", "Multiple", "Single"))
options(scipen=10000)
boxplot(PriceUSD ~ OwnerBinary, data = df_owner,
        main = "Price (USD) by Number of Previous Owners",
        xlab = "# of Previous Owners", ylab = "Price (USD)")
avg_prices <- df_owner |>
group_by(OwnerBinary) |>
summarize(avg_price = mean(PriceUSD)) |>
arrange(OwnerBinary)
avg_prices
## # A tibble: 2 × 2
## OwnerBinary avg_price
## <chr> <dbl>
## 1 Multiple 17764.
## 2 Single 20570.
observed_diff <- (avg_prices$avg_price[2] -
avg_prices$avg_price[1])
paste("Observed Difference: ", observed_diff)
## [1] "Observed Difference: 2806.36090885531"
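Indexing avg_prices by row position works because arrange() put Multiple in row 1, but looking the groups up by name is sturdier; a sketch that yields the same number:
# single-owner mean minus multiple-owner mean, keyed by group name rather than row position
with(avg_prices, avg_price[OwnerBinary == "Single"] - avg_price[OwnerBinary == "Multiple"])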
This is valuable information, but a raw difference in means is not enough to test a hypothesis on its own.
# borrowing the bootstrapping function from the lab_06 notes
bootstrap <- function (x, func=mean, n_iter=10^4) {
# empty vector to be filled with values from each iteration
func_values <- c(NULL)
# we simulate sampling `n_iter` times
for (i in 1:n_iter) {
# pull the sample (a vector)
x_sample <- sample(x, size = length(x), replace = TRUE)
# add on this iteration's value to the collection
func_values <- c(func_values, func(x_sample))
}
return(func_values)
}
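Growing func_values with c() inside the loop is quadratic in n_iter; for larger runs, a preallocated variant of the same function (a sketch) does the same job faster:
bootstrap_fast <- function(x, func = mean, n_iter = 10^4) {
  # allocate the result vector once instead of growing it every iteration
  func_values <- numeric(n_iter)
  for (i in seq_len(n_iter)) {
    func_values[i] <- func(sample(x, size = length(x), replace = TRUE))
  }
  func_values
}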
avgs_singl <- df_owner |>
  filter(OwnerBinary == "Single") |>
  pluck("PriceUSD") |>
  bootstrap(n_iter = 100)
avgs_mult <- df_owner |>
  filter(OwnerBinary == "Multiple") |>
  pluck("PriceUSD") |>
  bootstrap(n_iter = 100)
diffs_in_avgs <- avgs_singl - avgs_mult
Now I will plot the estimated sampling distribution, approximating it with a normal curve centered at zero whose spread comes from the bootstrapped differences.
ggplot() +
geom_function(xlim = c(-3000, 3000),
fun = function(x) dnorm(x, mean = 0,
sd = sd(diffs_in_avgs))) +
geom_vline(mapping = aes(xintercept = observed_diff,
color = paste("observed: ",
round(observed_diff)))) +
labs(title = "Bootstrapped Sampling Distribution of Pricing Differences",
x = "Difference in Price Calculated",
y = "Probability Density",
color = "") +
scale_x_continuous(breaks = seq(-3000, 3000, 1000)) +
theme_minimal()
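The dnorm() curve above assumes the bootstrapped differences are roughly normal; a quick sketch to eyeball that assumption overlays the draws themselves, recentered at zero to match the null curve:
# compare the recentered bootstrap draws against the fitted normal curve
ggplot(data.frame(diff = diffs_in_avgs - mean(diffs_in_avgs)), aes(x = diff)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "lightblue") +
  geom_function(fun = function(x) dnorm(x, mean = 0, sd = sd(diffs_in_avgs))) +
  theme_minimal()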
The line for the observed difference actually falls on the curve, so that’s not too bad. Now I will calculate the effect size using Cohen’s d.
cohen.d(d = filter(df_owner, OwnerBinary == "Multiple") |> pluck("PriceUSD"),
f = filter(df_owner, OwnerBinary == "Single") |> pluck("PriceUSD"))
##
## Cohen's d
##
## d estimate: -0.09904823 (negligible)
## 95 percent confidence interval:
## lower upper
## -0.206583741 0.008487279
The Cohen’s d estimate shows a negligible effect size for this relationship.
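As a sanity check, Cohen’s d is just the difference in means divided by the pooled standard deviation; computing it by hand (a sketch) should land near the reported -0.099:
# d = (mean of Multiple - mean of Single) / pooled SD
g <- df_owner |>
  group_by(OwnerBinary) |>
  summarize(n = n(), m = mean(PriceUSD), s = sd(PriceUSD))
s_pooled <- sqrt(((g$n[1] - 1) * g$s[1]^2 + (g$n[2] - 1) * g$s[2]^2) / (sum(g$n) - 2))
(g$m[1] - g$m[2]) / s_pooled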
An alternative hypothesis to my null hypothesis would be “There is a significant difference in the mean price of vehicles with one previous owner and vehicles with multiple previous owners.” I will now determine the sample size and testing procedure needed to get interpretable results.
df_owner |>
group_by(OwnerBinary) |>
summarize(sd = sd(PriceUSD),
mean = mean(PriceUSD))
## # A tibble: 2 × 3
## OwnerBinary sd mean
## <chr> <dbl> <dbl>
## 1 Multiple 28810. 17764.
## 2 Single 28209. 20570.
Since the two groups’ standard deviations are roughly equal, we’re safe to use the standard deviation of the whole dataset to approximate the required sample size.
There isn’t much riding on a false positive or a false negative here beyond lost profits, or in the very worst case a lost customer, so I can be fairly lenient with the error rates: I will use \(\alpha = 0.1\) and \(1 - \beta = 0.80\). My “eyeball” estimate of a meaningful price difference in used vehicles is about 1000 USD, so I’ll use that as the effect size for the test.
# mu2 defaults to 0, so mu1 = 1000 encodes the hypothesized 1000 USD difference
test <- pwrss.t.2means(mu1 = 1000,
sd1 = sd(pluck(df_owner, "PriceUSD")),
kappa = 1,
power = .80, alpha = 0.1,
alternative = "not equal")
## Difference between Two means
## (Independent Samples t Test)
## H0: mu1 = mu2
## HA: mu1 != mu2
## ------------------------------
## Statistical power = 0.8
## n1 = 9939
## n2 = 9939
## ------------------------------
## Alternative = "not equal"
## Degrees of freedom = 19876
## Non-centrality parameter = 2.487
## Type I error rate = 0.1
## Type II error rate = 0.2
plot(test)
## Warning in qt(1 - prob.extreme, df = df, ncp = ncp, lower.tail = TRUE): full
## precision may not have been achieved in 'pnt{final}'
It would appear that we’ll need a sample size of at least 9939 entries per group. Unfortunately, this means that our dataset isn’t nearly large enough to test this hypothesis reliably.
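To make the shortfall concrete, a quick count of what we actually have per group (the exact totals depend on the data):
# actual group sizes, to compare against the required 9939 per group
df_owner |> count(OwnerBinary)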
I’ll start by doing mostly the same thing as before and calculate the mean price of manual and automatic transmission cars.
options(scipen=10000)
boxplot(PriceUSD ~ Transmission, data = df,
        main = "Price (USD) by Transmission",
        xlab = "Transmission", ylab = "Price (USD)")
avg_prices <- df |>
group_by(Transmission) |>
summarize(avg_price = mean(PriceUSD)) |>
arrange(Transmission)
avg_prices
## # A tibble: 2 × 2
## Transmission avg_price
## <chr> <dbl>
## 1 Automatic 35749.
## 2 Manual 7921.
observed_diff <- (avg_prices$avg_price[1] -
avg_prices$avg_price[2])
paste("Observed Difference: ", observed_diff)
## [1] "Observed Difference: 27827.9137889241"
The observed difference between the means is extremely large, but we’ll still need to follow the proper procedure to test our hypothesis.
avgs_auto <- df |>
filter(Transmission == "Automatic") |>
pluck("PriceUSD") |>
bootstrap(n_iter = 10000)
avgs_man <- df |>
filter(Transmission == "Manual") |>
pluck("PriceUSD") |>
bootstrap(n_iter = 10000)
diffs_in_avgs <- avgs_auto - avgs_man
f_sampling <- function(x) dnorm(x, mean = 0,
sd = sd(diffs_in_avgs))
ggplot() +
stat_function(mapping = aes(fill = 'more extreme samples'),
fun = f_sampling,
xlim = c(observed_diff, 30000),
geom = "area") +
stat_function(mapping = aes(fill = 'more extreme samples'),
fun = f_sampling,
xlim = c(-30000, -observed_diff),
geom = "area") +
geom_function(xlim = c(-30000, 30000),
fun = f_sampling) +
geom_vline(mapping = aes(xintercept = observed_diff,
color = paste("observed: ",
round(observed_diff, 1)))) +
labs(title = "Bootstrapped Sampling Distribution of Pricing Differences",
x = "Difference in Price Calculated",
y = "Probability Density",
color = "",
fill = "") +
scale_x_continuous(breaks = seq(-30000, 30000, 10000)) +
scale_fill_manual(values = 'lightblue') +
theme_minimal()
Once again we see an extreme gap between the observed difference in means and the sampling distribution; the observed line sits far out in the tail.
# "demean" the bootstrapped samples to simulate mu = 0
diffs_in_avgs_d <- diffs_in_avgs - mean(diffs_in_avgs)
# proportion of times the difference is more extreme
paste("p-value ",
sum(abs(observed_diff) < abs(diffs_in_avgs_d)) /
length(diffs_in_avgs_d))
## [1] "p-value 0"
A p-value of exactly 0 just means that none of the 10,000 bootstrapped differences were as extreme as the one we observed; with a gap in means this large, the data very heavily reject the null hypothesis.
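One caveat: a simulated p-value can never be smaller than the resolution of the simulation, so “p-value 0” is best read as an upper bound:
# with 10,000 iterations the smallest reportable nonzero p-value is
1 / length(diffs_in_avgs_d)   # = 1e-04, so the result means p < 0.0001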