The dataset was obtained from Kaggle: https://www.kaggle.com/datasets/pushpakhinglaspure/used-car-price-prediction

Research Question

Does the average selling price of a car differ between manual and automatic cars?

#install.packages("readxl")

library(readxl)
## Warning: package 'readxl' was built under R version 4.3.2
# Import the car workbook as a tibble and preview its first six rows.
mydata <- readxl::read_xlsx(path = "~/IMB/Mutivariat analysis/car data.xlsx")
head(mydata, n = 6L)
## # A tibble: 6 × 9
##   Car_Name     Year Selling_Price Present_Price Kms_Driven Fuel_Type Seller_Type
##   <chr>       <dbl>         <dbl>         <dbl>      <dbl> <chr>     <chr>      
## 1 ritz         2014           335           559      27000 Petrol    Dealer     
## 2 sx4          2013           475           954      43000 Diesel    Dealer     
## 3 ciaz         2017           725           985       6900 Petrol    Dealer     
## 4 wagon r      2011           285           415       5200 Petrol    Dealer     
## 5 swift        2014            46           687      42450 Diesel    Dealer     
## 6 vitara bre…  2018           925           983       2071 Diesel    Dealer     
## # ℹ 2 more variables: Transmission <chr>, Owner <dbl>

Explanation of dataset

The dataset contains 301 units, so I will draw a random sample of 100 units for the purpose of testing the hypothesis and checking normality.

# Fix the random seed so the same 100 rows are drawn on every run.
set.seed(1)
# Draw 100 row indices without replacement and keep only those rows.
mydata <- mydata[sample.int(nrow(mydata), size = 100), ]

Data manipulations

# Keep only the variables used in the analysis. Columns are selected by name
# rather than by negative position, so the result does not depend on the
# column order of the spreadsheet.
mydata2 <- mydata[, c("Car_Name", "Selling_Price", "Present_Price",
                      "Kms_Driven", "Transmission")]
head(mydata2)
## # A tibble: 6 × 5
##   Car_Name             Selling_Price Present_Price Kms_Driven Transmission
##   <chr>                        <dbl>         <dbl>      <dbl> <chr>       
## 1 Hero Passion Pro                45            55       1000 Manual      
## 2 Honda CB Hornet 160R             8            87       3000 Manual      
## 3 city                           335            11      87934 Manual      
## 4 city                            67            10      18828 Manual      
## 5 TVS Wego                        25            52      22000 Automatic   
## 6 innova                         349          1346     197176 Manual
# Store Transmission as a factor so R treats it as a categorical variable;
# "Manual" is the first level, "Automatic" the second.
mydata2[["TransmissionF"]] <- factor(
  mydata2[["Transmission"]],
  levels = c("Manual", "Automatic")
)

head(mydata2)
## # A tibble: 6 × 6
##   Car_Name     Selling_Price Present_Price Kms_Driven Transmission TransmissionF
##   <chr>                <dbl>         <dbl>      <dbl> <chr>        <fct>        
## 1 Hero Passio…            45            55       1000 Manual       Manual       
## 2 Honda CB Ho…             8            87       3000 Manual       Manual       
## 3 city                   335            11      87934 Manual       Manual       
## 4 city                    67            10      18828 Manual       Manual       
## 5 TVS Wego                25            52      22000 Automatic    Automatic    
## 6 innova                 349          1346     197176 Manual       Manual
# Summarise the three numeric variables and the transmission factor. Columns
# are picked by name so the call does not break if columns are added or
# reordered.
summary(mydata2[, c("Selling_Price", "Present_Price", "Kms_Driven",
                    "TransmissionF")])
##  Selling_Price    Present_Price      Kms_Driven       TransmissionF
##  Min.   :   2.0   Min.   :  10.0   Min.   :  1000   Manual   :86   
##  1st Qu.:  25.0   1st Qu.:  76.0   1st Qu.: 14875   Automatic:14   
##  Median :  63.0   Median : 136.0   Median : 33494                  
##  Mean   : 216.7   Mean   : 478.5   Mean   : 35699                  
##  3rd Qu.: 272.5   3rd Qu.: 655.8   3rd Qu.: 48825                  
##  Max.   :1999.0   Max.   :3596.0   Max.   :197176
library(psych)
# Descriptive statistics for the numeric variables only. Columns are picked by
# name so the call does not break if columns are added or reordered.
describe(mydata2[, c("Selling_Price", "Present_Price", "Kms_Driven")])
##               vars   n     mean       sd median  trimmed      mad  min    max
## Selling_Price    1 100   216.67   338.30     63   140.72    84.51    2   1999
## Present_Price    2 100   478.51   717.40    136   308.20   153.45   10   3596
## Kms_Driven       3 100 35698.61 29287.81  33494 32288.20 25788.34 1000 197176
##                range skew kurtosis      se
## Selling_Price   1997 2.61     8.15   33.83
## Present_Price   3586 2.65     7.41   71.74
## Kms_Driven    196176 2.15     8.47 2928.78

Explanations of parameters

Research Question

Does the average selling price of a car differ significantly between manual and automatic cars?

Statistical hypothesis

Assumptions:

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
# Histogram of selling price, one panel per transmission type, stacked in a
# single column.
ggplot(data = mydata2, mapping = aes(x = Selling_Price)) +
  geom_histogram(binwidth = 30, colour = "gray") +
  facet_wrap(~TransmissionF, ncol = 1) +
  labs(y = "Frequency")

Shapiro-Wilk Test for Normality

library(rstatix)
## Warning: package 'rstatix' was built under R version 4.3.2
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Shapiro-Wilk normality test of Selling_Price within each transmission group.
mydata2 %>%
  dplyr::group_by(TransmissionF) %>%
  rstatix::shapiro_test(Selling_Price)
## # A tibble: 2 × 4
##   TransmissionF variable      statistic        p
##   <fct>         <chr>             <dbl>    <dbl>
## 1 Manual        Selling_Price     0.704 6.97e-12
## 2 Automatic     Selling_Price     0.528 9.94e- 6
library(psych)
# Descriptive statistics of Selling_Price, reported separately for each
# transmission group.
psych::describeBy(
  x = mydata2[["Selling_Price"]],
  group = mydata2[["TransmissionF"]]
)
## 
##  Descriptive statistics by group 
## group: Manual
##    vars  n   mean     sd median trimmed   mad min  max range skew kurtosis   se
## X1    1 86 208.06 300.49     65   144.4 88.96   2 1425  1423 1.99     3.63 32.4
## ------------------------------------------------------------ 
## group: Automatic
##    vars  n   mean     sd median trimmed   mad min  max range skew kurtosis
## X1    1 14 269.57 526.94     51  146.67 40.03  15 1999  1984 2.48     5.32
##        se
## X1 140.83

Alternative non-parametric test: Wilcoxon Rank Sum Test

# Wilcoxon rank sum test of Selling_Price between the two transmission groups.
# `paired` is intentionally not passed: FALSE is the default, and R >= 4.4.0
# stops with "cannot use 'paired' in formula method" when it is supplied
# together with a formula.
# correct = FALSE turns off the continuity correction; exact = FALSE requests
# the normal approximation instead of an exact p-value.
wilcox.test(mydata2$Selling_Price ~ mydata2$TransmissionF,
            correct = FALSE,
            exact = FALSE,
            alternative = "two.sided")
## 
##  Wilcoxon rank sum test
## 
## data:  mydata2$Selling_Price by mydata2$TransmissionF
## W = 574.5, p-value = 0.7847
## alternative hypothesis: true location shift is not equal to 0
library(effectsize)
## 
## Attaching package: 'effectsize'
## The following objects are masked from 'package:rstatix':
## 
##     cohens_d, eta_squared
## The following object is masked from 'package:psych':
## 
##     phi
# Rank-biserial effect size for the Wilcoxon rank sum test.
# `paired` is intentionally not passed to wilcox.test(): FALSE is the default,
# and R >= 4.4.0 stops with "cannot use 'paired' in formula method" when it is
# supplied together with a formula.
effectsize(wilcox.test(mydata2$Selling_Price ~ mydata2$TransmissionF,
                       correct = FALSE,
                       exact = FALSE,
                       alternative = "two.sided"))
## r (rank biserial) |        95% CI
## ---------------------------------
## -0.05             | [-0.36, 0.27]
# Label the magnitude of the rank-biserial correlation (r = -0.05) using the
# funder2019 rule set.
effectsize::interpret_rank_biserial(-0.05, rules = "funder2019")
## [1] "very small"
## (Rules: funder2019)

Conclusion

Based on the sample data, we cannot reject the null hypothesis that the location of the selling-price distribution is the same for manual and automatic cars (p = 0.78); the effect size is very small, |r| = 0.05.

Because the normality assumption of the parametric test is not met, the non-parametric test is preferred; however, we can still show the results of the parametric test:

Independent Samples

# Welch two-sample t-test (var.equal = FALSE) of mean Selling_Price between
# the two transmission groups.
# `paired` is intentionally not passed: FALSE is the default, and R >= 4.4.0
# stops with "cannot use 'paired' in formula method" when it is supplied
# together with a formula.
t.test(mydata2$Selling_Price ~ mydata2$TransmissionF,
       var.equal = FALSE,
       alternative = "two.sided")
## 
##  Welch Two Sample t-test
## 
## data:  mydata2$Selling_Price by mydata2$TransmissionF
## t = -0.42566, df = 14.407, p-value = 0.6766
## alternative hypothesis: true difference in means between group Manual and group Automatic is not equal to 0
## 95 percent confidence interval:
##  -370.6404  247.6139
## sample estimates:
##    mean in group Manual mean in group Automatic 
##                208.0581                269.5714
library(effectsize)
# Cohen's d for the difference in Selling_Price between the transmission
# groups, estimated with the un-pooled SD.
effectsize::cohens_d(
  mydata2$Selling_Price ~ mydata2$TransmissionF,
  pooled_sd = FALSE
)
## Cohen's d |        95% CI
## -------------------------
## -0.14     | [-0.80, 0.52]
## 
## - Estimated using un-pooled SD.
# Label the magnitude of the effect (|d| = 0.14) using the sawilowsky2009 rule
# set.
effectsize::interpret_cohens_d(d = 0.14, rules = "sawilowsky2009")
## [1] "very small"
## (Rules: sawilowsky2009)

Conclusion