library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(effsize)
library(pwrss)
##
## Attaching package: 'pwrss'
##
## The following object is masked from 'package:stats':
##
## power.t.test
df <- read.csv("C:/Users/toyha/Downloads/vehicle/car details v4.csv")
# converting to US units: price to USD, km -> miles, mm -> inches, litres -> gallons
df <- df |>
  mutate(years_since = year(now()) - Year,
         PriceUSD = Price * 0.012,
         Mileage = Kilometer * 0.621371,
         LengthInch = Length * 0.0393701,
         WidthInch = Width * 0.0393701,
         HeightInch = Height * 0.0393701,
         FuelGallons = Fuel.Tank.Capacity * 0.264172,
         Volume = LengthInch * WidthInch * HeightInch)
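As an aside, the three dimension conversions could also be collapsed with across(); a sketch assuming the same column names:
# equivalent mm -> inch conversions in a single across() call
df <- df |>
  mutate(across(c(Length, Width, Height), \(x) x * 0.0393701, .names = "{.col}Inch"))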
#Cleaning up Owner attribute
df['Owner'][df['Owner'] == 'Fourth'] <- '4 or More'
#df <- subset(df, Owner != "UnRegistered Car")
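For reference, the same Owner recode as above in dplyr style (a sketch; either form gives the same result):
# fold 'Fourth' owners into the '4 or More' bucket
df <- df |> mutate(Owner = if_else(Owner == "Fourth", "4 or More", Owner))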
To start things off, I will make a few adjustments to the dataframe so it suits a binary null hypothesis, then calculate the mean price of cars with one previous owner versus cars with multiple previous owners.
#clearing away unregistered vehicles since they aren't necessary for our hypothesis
df_owner <- subset(df, Owner != "UnRegistered Car")
df_owner <- df_owner |> mutate(OwnerBinary = ifelse(Owner != "First", "Multiple", "Single"))
options(scipen=10000)
boxplot(PriceUSD ~ OwnerBinary, data = df_owner,
        main = "Price (USD) by Number of Previous Owners",
        xlab = "# of Previous Owners", ylab = "Price (USD)")
avg_prices <- df_owner |>
group_by(OwnerBinary) |>
summarize(avg_price = mean(PriceUSD)) |>
arrange(OwnerBinary)
avg_prices
## # A tibble: 2 × 2
## OwnerBinary avg_price
## <chr> <dbl>
## 1 Multiple 17764.
## 2 Single 20570.
observed_diff <- (avg_prices$avg_price[2] -
avg_prices$avg_price[1])
paste("Observed Difference: ", observed_diff)
## [1] "Observed Difference: 2806.36090885531"
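Indexing avg_prices by row position works because arrange() put Multiple in row 1, but looking the groups up by name is sturdier; a sketch that yields the same number:
# single-owner mean minus multiple-owner mean, keyed by group name rather than row position
with(avg_prices, avg_price[OwnerBinary == "Single"] - avg_price[OwnerBinary == "Multiple"])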
This is valuable information, but a raw difference in means is not enough to test a hypothesis on its own.
# borrowing the bootstrapping function from the lab_06 notes
bootstrap <- function (x, func=mean, n_iter=10^4) {
# empty vector to be filled with values from each iteration
func_values <- c(NULL)
# we simulate sampling `n_iter` times
for (i in 1:n_iter) {
# pull the sample (a vector)
x_sample <- sample(x, size = length(x), replace = TRUE)
# add on this iteration's value to the collection
func_values <- c(func_values, func(x_sample))
}
return(func_values)
}
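Growing func_values with c() inside the loop is quadratic in n_iter; for larger runs, a preallocated variant of the same function (a sketch) does the same job faster:
bootstrap_fast <- function(x, func = mean, n_iter = 10^4) {
  # allocate the result vector once instead of growing it every iteration
  func_values <- numeric(n_iter)
  for (i in seq_len(n_iter)) {
    func_values[i] <- func(sample(x, size = length(x), replace = TRUE))
  }
  func_values
}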
avgs_singl <- df_owner |>
  filter(OwnerBinary == "Single") |>
  pluck("PriceUSD") |>
  bootstrap(n_iter = 100)
avgs_mult <- df_owner |>
  filter(OwnerBinary == "Multiple") |>
  pluck("PriceUSD") |>
  bootstrap(n_iter = 100)
diffs_in_avgs <- avgs_singl - avgs_mult
Now I will plot the estimated sampling distribution, approximating it with a normal curve centered at zero whose spread comes from the bootstrapped differences.
ggplot() +
geom_function(xlim = c(-3000, 3000),
fun = function(x) dnorm(x, mean = 0,
sd = sd(diffs_in_avgs))) +
geom_vline(mapping = aes(xintercept = observed_diff,
color = paste("observed: ",
round(observed_diff)))) +
labs(title = "Bootstrapped Sampling Distribution of Pricing Differences",
x = "Difference in Price Calculated",
y = "Probability Density",
color = "") +
scale_x_continuous(breaks = seq(-3000, 3000, 1000)) +
theme_minimal()
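The dnorm() curve above assumes the bootstrapped differences are roughly normal; a quick sketch to eyeball that assumption overlays the draws themselves, recentered at zero to match the null curve:
# compare the recentered bootstrap draws against the fitted normal curve
ggplot(data.frame(diff = diffs_in_avgs - mean(diffs_in_avgs)), aes(x = diff)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "lightblue") +
  geom_function(fun = function(x) dnorm(x, mean = 0, sd = sd(diffs_in_avgs))) +
  theme_minimal()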
The line for the observed difference actually falls on the curve, so that’s not too bad. Now I will calculate the effect size using Cohen’s d.
cohen.d(d = filter(df_owner, OwnerBinary == "Multiple") |> pluck("PriceUSD"),
f = filter(df_owner, OwnerBinary == "Single") |> pluck("PriceUSD"))
##
## Cohen's d
##
## d estimate: -0.09904823 (negligible)
## 95 percent confidence interval:
## lower upper
## -0.206583741 0.008487279
The Cohen’s d estimate shows a negligible effect size for this relationship.
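As a sanity check, Cohen’s d is just the difference in means divided by the pooled standard deviation; computing it by hand (a sketch) should land near the reported -0.099:
# d = (mean of Multiple - mean of Single) / pooled SD
g <- df_owner |>
  group_by(OwnerBinary) |>
  summarize(n = n(), m = mean(PriceUSD), s = sd(PriceUSD))
s_pooled <- sqrt(((g$n[1] - 1) * g$s[1]^2 + (g$n[2] - 1) * g$s[2]^2) / (sum(g$n) - 2))
(g$m[1] - g$m[2]) / s_pooled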
An alternative hypothesis to my null hypothesis would be “There is a significant difference in the mean price of vehicles with one previous owner and vehicles with multiple previous owners.” I will now determine the sample size and testing procedure needed to get interpretable results.
df_owner |>
group_by(OwnerBinary) |>
summarize(sd = sd(PriceUSD),
mean = mean(PriceUSD))
## # A tibble: 2 × 3
## OwnerBinary sd mean
## <chr> <dbl> <dbl>
## 1 Multiple 28810. 17764.
## 2 Single 28209. 20570.
Since the two groups’ standard deviations are roughly equal, we’re safe to use the standard deviation of the whole dataset to approximate the required sample size.
There isn’t much riding on a false positive or a false negative here beyond lost profits, or in the very worst case a lost customer, so I can be fairly lenient with the error rates: I will use \(\alpha = 0.1\) and \(1 - \beta = 0.80\). My “eyeball” estimate of a meaningful price difference in used vehicles is about 1000 USD, so I’ll use that as the effect size for the test.
# mu2 defaults to 0, so mu1 = 1000 encodes the hypothesized 1000 USD difference
test <- pwrss.t.2means(mu1 = 1000,
sd1 = sd(pluck(df_owner, "PriceUSD")),
kappa = 1,
power = .80, alpha = 0.1,
alternative = "not equal")
## Difference between Two means
## (Independent Samples t Test)
## H0: mu1 = mu2
## HA: mu1 != mu2
## ------------------------------
## Statistical power = 0.8
## n1 = 9939
## n2 = 9939
## ------------------------------
## Alternative = "not equal"
## Degrees of freedom = 19876
## Non-centrality parameter = 2.487
## Type I error rate = 0.1
## Type II error rate = 0.2
plot(test)
## Warning in qt(1 - prob.extreme, df = df, ncp = ncp, lower.tail = TRUE): full
## precision may not have been achieved in 'pnt{final}'
It would appear that we’ll need a sample size of at least 9939 entries per group. Unfortunately, this means that our dataset isn’t nearly large enough to test this hypothesis reliably.
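To make the shortfall concrete, a quick count of what we actually have per group (the exact totals depend on the data):
# actual group sizes, to compare against the required 9939 per group
df_owner |> count(OwnerBinary)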
I’ll start by doing mostly the same thing as before and calculate the mean price of manual and automatic transmission cars.
options(scipen=10000)
boxplot(PriceUSD ~ Transmission, data = df,
        main = "Price (USD) by Transmission",
        xlab = "Transmission", ylab = "Price (USD)")
avg_prices <- df |>
group_by(Transmission) |>
summarize(avg_price = mean(PriceUSD)) |>
arrange(Transmission)
avg_prices
## # A tibble: 2 × 2
## Transmission avg_price
## <chr> <dbl>
## 1 Automatic 35749.
## 2 Manual 7921.
observed_diff <- (avg_prices$avg_price[1] -
avg_prices$avg_price[2])
paste("Observed Difference: ", observed_diff)
## [1] "Observed Difference: 27827.9137889241"
The observed difference between the means is extremely large, but we’ll still need to follow the proper procedure to test our hypothesis.
avgs_auto <- df |>
filter(Transmission == "Automatic") |>
pluck("PriceUSD") |>
bootstrap(n_iter = 10000)
avgs_man <- df |>
filter(Transmission == "Manual") |>
pluck("PriceUSD") |>
bootstrap(n_iter = 10000)
diffs_in_avgs <- avgs_auto - avgs_man
f_sampling <- function(x) dnorm(x, mean = 0,
sd = sd(diffs_in_avgs))
ggplot() +
stat_function(mapping = aes(fill = 'more extreme samples'),
fun = f_sampling,
xlim = c(observed_diff, 30000),
geom = "area") +
stat_function(mapping = aes(fill = 'more extreme samples'),
fun = f_sampling,
xlim = c(-30000, -observed_diff),
geom = "area") +
geom_function(xlim = c(-30000, 30000),
fun = f_sampling) +
geom_vline(mapping = aes(xintercept = observed_diff,
color = paste("observed: ",
round(observed_diff, 1)))) +
labs(title = "Bootstrapped Sampling Distribution of Pricing Differences",
x = "Difference in Price Calculated",
y = "Probability Density",
color = "",
fill = "") +
scale_x_continuous(breaks = seq(-30000, 30000, 10000)) +
scale_fill_manual(values = 'lightblue') +
theme_minimal()
Once again we see an extreme gap between the observed difference in means and the sampling distribution; the observed line sits far out in the tail.
# "demean" the bootstrapped samples to simulate mu = 0
diffs_in_avgs_d <- diffs_in_avgs - mean(diffs_in_avgs)
# proportion of times the difference is more extreme
paste("p-value ",
sum(abs(observed_diff) < abs(diffs_in_avgs_d)) /
length(diffs_in_avgs_d))
## [1] "p-value 0"
A p-value of exactly 0 just means that none of the 10,000 bootstrapped differences were as extreme as the one we observed; with a gap in means this large, the data very heavily reject the null hypothesis.
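One caveat: a simulated p-value can never be smaller than the resolution of the simulation, so “p-value 0” is best read as an upper bound:
# with 10,000 iterations the smallest reportable nonzero p-value is
1 / length(diffs_in_avgs_d)   # = 1e-04, so the result means p < 0.0001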