The dataset was obtained from: https://www.kaggle.com/datasets/pushpakhinglaspure/used-car-price-prediction

Research Questions

#install.packages("readxl")

library(readxl)
## Warning: package 'readxl' was built under R version 4.3.2
# Import the used-car workbook (the path is local to the author's machine)
# and preview the first six rows to confirm that the import worked.
mydata <- read_xlsx(path = "~/IMB/Mutivariat analysis/car data.xlsx")
head(mydata, n = 6)
## # A tibble: 6 × 9
##   Car_Name     Year Selling_Price Present_Price Kms_Driven Fuel_Type Seller_Type
##   <chr>       <dbl>         <dbl>         <dbl>      <dbl> <chr>     <chr>      
## 1 ritz         2014           335           559      27000 Petrol    Dealer     
## 2 sx4          2013           475           954      43000 Diesel    Dealer     
## 3 ciaz         2017           725           985       6900 Petrol    Dealer     
## 4 wagon r      2011           285           415       5200 Petrol    Dealer     
## 5 swift        2014            46           687      42450 Diesel    Dealer     
## 6 vitara bre…  2018           925           983       2071 Diesel    Dealer     
## # ℹ 2 more variables: Transmission <chr>, Owner <dbl>

Explanation of dataset

The data set contains 301 units, so I will draw a random sample of 100 units for the purpose of testing hypotheses and normality.

# Fix the random-number seed so the same rows are drawn on every run.
set.seed(1)
# Draw 100 row indices without replacement and keep only those rows.
mydata <- mydata[sample(nrow(mydata), size = 100, replace = FALSE), ]

Data manipulations

# For easier analysis, exclude the variables that are not used later on.
# The columns are dropped by name (Year, Seller_Type, Owner) instead of by
# position, so the selection stays correct if the column order of the
# workbook ever changes. The order of the remaining columns is preserved.
mydata2 <- mydata[, !(names(mydata) %in% c("Year", "Seller_Type", "Owner"))]
head(mydata2)
## # A tibble: 6 × 6
##   Car_Name         Selling_Price Present_Price Kms_Driven Fuel_Type Transmission
##   <chr>                    <dbl>         <dbl>      <dbl> <chr>     <chr>       
## 1 Hero Passion Pro            45            55       1000 Petrol    Manual      
## 2 Honda CB Hornet…             8            87       3000 Petrol    Manual      
## 3 city                       335            11      87934 Petrol    Manual      
## 4 city                        67            10      18828 Petrol    Manual      
## 5 TVS Wego                    25            52      22000 Petrol    Automatic   
## 6 innova                     349          1346     197176 Diesel    Manual
# Encode the two categorical variables as factors so that R treats them as
# non-numerical data. The level order is given explicitly; the labels are
# left at their default, which is the level names themselves.
mydata2[["TransmissionF"]] <- factor(
  mydata2[["Transmission"]],
  levels = c("Manual", "Automatic")
)

mydata2[["Fuel_TypeF"]] <- factor(
  mydata2[["Fuel_Type"]],
  levels = c("Petrol", "Diesel")
)

# Preview the data with the two new factor columns appended.
head(mydata2)
## # A tibble: 6 × 8
##   Car_Name         Selling_Price Present_Price Kms_Driven Fuel_Type Transmission
##   <chr>                    <dbl>         <dbl>      <dbl> <chr>     <chr>       
## 1 Hero Passion Pro            45            55       1000 Petrol    Manual      
## 2 Honda CB Hornet…             8            87       3000 Petrol    Manual      
## 3 city                       335            11      87934 Petrol    Manual      
## 4 city                        67            10      18828 Petrol    Manual      
## 5 TVS Wego                    25            52      22000 Petrol    Automatic   
## 6 innova                     349          1346     197176 Diesel    Manual      
## # ℹ 2 more variables: TransmissionF <fct>, Fuel_TypeF <fct>
# Summarise the three numeric variables and the two factors; the character
# columns Car_Name, Fuel_Type and Transmission are left out.
summary(
  mydata2[, c("Selling_Price", "Present_Price", "Kms_Driven",
              "TransmissionF", "Fuel_TypeF")]
)
##  Selling_Price    Present_Price      Kms_Driven       TransmissionF  Fuel_TypeF
##  Min.   :   2.0   Min.   :  10.0   Min.   :  1000   Manual   :86    Petrol:82  
##  1st Qu.:  25.0   1st Qu.:  76.0   1st Qu.: 14875   Automatic:14    Diesel:18  
##  Median :  63.0   Median : 136.0   Median : 33494                              
##  Mean   : 216.7   Mean   : 478.5   Mean   : 35699                              
##  3rd Qu.: 272.5   3rd Qu.: 655.8   3rd Qu.: 48825                              
##  Max.   :1999.0   Max.   :3596.0   Max.   :197176
library(psych)
# Detailed descriptive statistics for the three numeric variables only.
describe(mydata2[, c("Selling_Price", "Present_Price", "Kms_Driven")])
##               vars   n     mean       sd median  trimmed      mad  min    max
## Selling_Price    1 100   216.67   338.30     63   140.72    84.51    2   1999
## Present_Price    2 100   478.51   717.40    136   308.20   153.45   10   3596
## Kms_Driven       3 100 35698.61 29287.81  33494 32288.20 25788.34 1000 197176
##                range skew kurtosis      se
## Selling_Price   1997 2.61     8.15   33.83
## Present_Price   3586 2.65     7.41   71.74
## Kms_Driven    196176 2.15     8.47 2928.78

Explanations of parameters

Research Question

Is there a correlation between present price and kilometers driven?

Assumptions:

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Scatter plot of present price (x) against kilometers driven (y), used to
# inspect the linearity assumption; the smoother is switched off.
scatterplot(
  x = mydata2$Present_Price,
  y = mydata2$Kms_Driven,
  main = "Relationship between Present price and Kilometers driven",
  xlab = "Present price",
  ylab = "Kilometers driven",
  xlim = c(0, 3596),
  ylim = c(1000, 197176),
  smooth = FALSE
)

- Based on the scatter plot we can conclude that the linearity assumption is violated; however, for educational purposes we will assume linearity and proceed with the Pearson correlation test.

# Pearson correlation coefficient between present price and kilometers
# driven. Both vectors are taken from mydata2, the data frame used in the
# rest of the analysis, so the two inputs are guaranteed to refer to the
# same rows.
cor(mydata2$Present_Price, mydata2$Kms_Driven,
    method = "pearson",
    use = "complete.obs")
## [1] 0.3229115
# Test H0: the true correlation between present price and kilometers driven
# is zero (two-sided Pearson test).
# cor.test() has no `use` argument (it would be silently ignored); the
# function removes incomplete pairs on its own, so none is needed here.
cor.test(mydata2$Present_Price, mydata2$Kms_Driven,
         method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  mydata2$Present_Price and mydata2$Kms_Driven
## t = 3.3776, df = 98, p-value = 0.00105
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1350596 0.4883554
## sample estimates:
##       cor 
## 0.3229115

Based on the sample data we can reject H0 (p = 0.001) and conclude that there is a linear relationship between the present price of the car and the kilometers driven by the car.

Categorical variables analysis

Pearson Chi2 test

Research question

Assumptions:

# Pearson chi-squared test of independence between transmission type and
# fuel type; correct = TRUE requests Yates' continuity correction.
results <- chisq.test(mydata2$TransmissionF, mydata2$Fuel_TypeF, 
                      correct = TRUE)
## Warning in chisq.test(mydata2$TransmissionF, mydata2$Fuel_TypeF, correct =
## TRUE): Chi-squared approximation may be incorrect
# Print the stored test result.
results
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mydata2$TransmissionF and mydata2$Fuel_TypeF
## X-squared = 0.54043, df = 1, p-value = 0.4623

Hypothesis

Based on the sample data we cannot reject the null hypothesis (p = 0.46); we cannot say that there is an association between the transmission type of the car and the fuel type of the car.

# Observed (empirical) frequencies with row and column totals added.
addmargins(results[["observed"]])
##                      mydata2$Fuel_TypeF
## mydata2$TransmissionF Petrol Diesel Sum
##             Manual        72     14  86
##             Automatic     10      4  14
##             Sum           82     18 100

Expected/theoretical frequencies

# Expected frequencies under independence, rounded to two decimals.
round(results[["expected"]], digits = 2)
##                      mydata2$Fuel_TypeF
## mydata2$TransmissionF Petrol Diesel
##             Manual     70.52  15.48
##             Automatic  11.48   2.52

Because not all expected frequencies are greater than 5 (the Automatic/Diesel cell is 2.52), I have to use Fisher’s Exact Probability Test of Independence, which is a nonparametric test.

However, I will still show the code for the proportion tables (structure) and interpret the frequencies from each of the proportion tables, for educational purposes.

# Cell proportions of the grand total, rounded to three decimals, with row
# and column sums of the rounded values added.
results$observed |>
  prop.table() |>
  round(digits = 3) |>
  addmargins()
##                      mydata2$Fuel_TypeF
## mydata2$TransmissionF Petrol Diesel  Sum
##             Manual      0.72   0.14 0.86
##             Automatic   0.10   0.04 0.14
##             Sum         0.82   0.18 1.00
# Row proportions (fuel-type structure within each transmission type),
# rounded to three decimals, with a Sum column appended.
results$observed |>
  prop.table(margin = 1) |>
  round(digits = 3) |>
  addmargins(margin = 2)
##                      mydata2$Fuel_TypeF
## mydata2$TransmissionF Petrol Diesel   Sum
##             Manual     0.837  0.163 1.000
##             Automatic  0.714  0.286 1.000
# Column proportions (transmission structure within each fuel type),
# rounded to three decimals, with a Sum row appended.
results$observed |>
  prop.table(margin = 2) |>
  round(digits = 3) |>
  addmargins(margin = 1)
##                      mydata2$Fuel_TypeF
## mydata2$TransmissionF Petrol Diesel
##             Manual     0.878  0.778
##             Automatic  0.122  0.222
##             Sum        1.000  1.000
library(effectsize)
## 
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
## 
##     phi
# Effect size of the association between transmission type and fuel type
# (Cramer's V as computed by the effectsize package).
effectsize::cramers_v(
  x = mydata2$TransmissionF,
  y = mydata2$Fuel_TypeF
)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.05              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
# Verbal interpretation of a Cramer's V of 0.05 (value typed in by hand).
effectsize::interpret_cramers_v(0.05)
## [1] "very small"
## (Rules: funder2019)

Fisher’s exact probability test

# Fisher's exact test of independence between transmission type and fuel
# type, used because the chi-squared approximation is unreliable here.
fisher.test(
  x = mydata2$TransmissionF,
  y = mydata2$Fuel_TypeF
)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  mydata2$TransmissionF and mydata2$Fuel_TypeF
## p-value = 0.2728
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.4088362 8.4529474
## sample estimates:
## odds ratio 
##   2.039865

Based on the sample data we cannot reject the null hypothesis (p = 0.27). We therefore cannot rule out that the odds ratio is equal to one (there is not enough evidence to suggest that the odds of having a certain transmission type are different between the two fuel types). There is not enough evidence to conclude that there is a significant association between the transmission and the fuel type of the car.

# Verbal interpretation of an odds ratio of 2.04 (value typed in by hand).
effectsize::interpret_oddsratio(2.04)
## [1] "small"
## (Rules: chen2010)