#use sind data 22.04.2024

## Warning: package 'mice' was built under R version 4.2.3
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
## Warning: package 'corrplot' was built under R version 4.2.3
## corrplot 0.92 loaded
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks mice::filter(), stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'Amelia' was built under R version 4.2.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.2.3
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ## 
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## Warning: package 'sqldf' was built under R version 4.2.3
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 4.2.3
## Loading required package: proto
## Warning: package 'proto' was built under R version 4.2.3
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 4.2.3
## Warning: package 'reshape2' was built under R version 4.2.3
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
## Warning: package 'Hmisc' was built under R version 4.2.3
## 
## Attaching package: 'Hmisc'
## 
## The following object is masked from 'package:psych':
## 
##     describe
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Warning: package 'doBy' was built under R version 4.2.3
## 
## Attaching package: 'doBy'
## 
## The following object is masked from 'package:dplyr':
## 
##     order_by
## Warning: package 'gmodels' was built under R version 4.2.3
## Warning: package 'car' was built under R version 4.2.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.2.3
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:psych':
## 
##     logit
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
## Warning: package 'effects' was built under R version 4.2.3
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
## Warning: package 'polycor' was built under R version 4.2.3
## 
## Attaching package: 'polycor'
## 
## The following object is masked from 'package:psych':
## 
##     polyserial
## 
## 
## Attaching package: 'boot'
## 
## The following object is masked from 'package:car':
## 
##     logit
## 
## The following object is masked from 'package:psych':
## 
##     logit
## 
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## Warning: package 'modelr' was built under R version 4.2.3
## Warning: package 'gapminder' was built under R version 4.2.3
## Warning: package 'tree' was built under R version 4.2.3
## Warning: package 'gbm' was built under R version 4.2.3
## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:psych':
## 
##     outlier
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
## Warning: package 'ggrepel' was built under R version 4.2.3
## Warning: package 'fastDummies' was built under R version 4.2.3
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
## Warning: package 'vcd' was built under R version 4.2.3
## Loading required package: grid
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
#load data
pc <- read.csv("Pak_cars.csv")
dim(pc)
## [1] 24973    10
colnames(pc)
##  [1] "X_id"             "Brand"            "Condition"        "Fuel"            
##  [5] "KMs.Driven"       "Model"            "Price"            "Registered.City" 
##  [9] "Transaction.Type" "Year"
md.pattern(pc, rotate.names = T)

##       X_id Brand Condition Fuel Model Price Registered.City Transaction.Type
## 22687    1     1         1    1     1     1               1                1
## 2        1     1         1    1     1     1               1                1
## 2284     1     1         1    1     1     1               1                1
##          0     0         0    0     0     0               0                0
##       Year KMs.Driven     
## 22687    1          1    0
## 2        1          0    1
## 2284     0          0    2
##       2284       2286 4570
summary(pc)
##       X_id          Brand            Condition             Fuel          
##  Min.   :    1   Length:24973       Length:24973       Length:24973      
##  1st Qu.: 6244   Class :character   Class :character   Class :character  
##  Median :12487   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :12487                                                           
##  3rd Qu.:18730                                                           
##  Max.   :24973                                                           
##                                                                          
##    KMs.Driven          Model               Price          Registered.City   
##  Min.   :       1   Length:24973       Min.   :   50000   Length:24973      
##  1st Qu.:   16000   Class :character   1st Qu.:  380000   Class :character  
##  Median :   66510   Mode  :character   Median :  650000   Mode  :character  
##  Mean   :  127811                      Mean   :  912895                     
##  3rd Qu.:  100000                      3rd Qu.: 1145000                     
##  Max.   :10000000                      Max.   :87654321                     
##  NA's   :2286                                                               
##  Transaction.Type        Year     
##  Length:24973       Min.   :1915  
##  Class :character   1st Qu.:2002  
##  Mode  :character   Median :2008  
##                     Mean   :2006  
##                     3rd Qu.:2013  
##                     Max.   :2020  
##                     NA's   :2284
head(pc)
##   X_id  Brand Condition   Fuel KMs.Driven          Model   Price
## 1    1 Toyota      Used Diesel          1          Prado 2100000
## 2    2 Suzuki      Used Petrol     100000          Bolan  380000
## 3    3 Suzuki      Used    CNG      12345          Bolan  340000
## 4    4 Suzuki      Used Petrol      94000           Alto  535000
## 5    5 Toyota      Used Petrol     100000    Corolla XLI 1430000
## 6    6 Toyota      Used Petrol      80000 Corrolla Altis 1620000
##   Registered.City Transaction.Type Year
## 1         Karachi             Cash 1997
## 2         Karachi             Cash 2006
## 3         Karachi             Cash 1998
## 4         Karachi             Cash 2010
## 5         Karachi             Cash 2013
## 6         Karachi             Cash 2012
colnames(pc)
##  [1] "X_id"             "Brand"            "Condition"        "Fuel"            
##  [5] "KMs.Driven"       "Model"            "Price"            "Registered.City" 
##  [9] "Transaction.Type" "Year"
dim(pc)
## [1] 24973    10

find unique data in variables

pc %>% group_by(X_id) %>% unique() %>% summarise(count = n())
## # A tibble: 24,973 × 2
##     X_id count
##    <int> <int>
##  1     1     1
##  2     2     1
##  3     3     1
##  4     4     1
##  5     5     1
##  6     6     1
##  7     7     1
##  8     8     1
##  9     9     1
## 10    10     1
## # ℹ 24,963 more rows
#all unique ids found which says no entry duplication

pc %>% group_by(Brand) %>% unique() %>% summarise(count = n())
## # A tibble: 24 × 2
##    Brand                count
##    <chr>                <int>
##  1 ""                    2137
##  2 "Audi"                  18
##  3 "BMW"                   31
##  4 "Changan"                9
##  5 "Chevrolet"             47
##  6 "Classic & Antiques"    13
##  7 "Daewoo"                72
##  8 "Daihatsu"            2476
##  9 "FAW"                   80
## 10 "Honda"               3324
## # ℹ 14 more rows
#2137 brand names are unlabelled

pc %>% group_by(Condition) %>% unique() %>% summarise(count = n())
## # A tibble: 3 × 2
##   Condition count
##   <chr>     <int>
## 1 ""         2136
## 2 "New"      4365
## 3 "Used"    18472
#2136 car's condition are unlabelled

pc %>% group_by(Fuel) %>% unique() %>% summarise(count = n())
## # A tibble: 6 × 2
##   Fuel     count
##   <chr>    <int>
## 1 ""        2445
## 2 "CNG"     6797
## 3 "Diesel"   450
## 4 "Hybrid"   943
## 5 "LPG"       29
## 6 "Petrol" 14309
#2445 car's condition are unlabelled

pc %>% group_by(KMs.Driven) %>% unique() %>% summarise(count = n())
## # A tibble: 3,146 × 2
##    KMs.Driven count
##         <int> <int>
##  1          1  1699
##  2          2    56
##  3          3    11
##  4          4    16
##  5          5    89
##  6          6    10
##  7          7    14
##  8          8    31
##  9          9    13
## 10         10   165
## # ℹ 3,136 more rows
pc %>% group_by(Model) %>% unique() %>% summarise(count = n())
## # A tibble: 304 × 2
##    Model      count
##    <chr>      <int>
##  1 ""          2448
##  2 "120 Y"       24
##  3 "2 Series"     2
##  4 "200 D"        8
##  5 "240 Gd"       3
##  6 "250 D"        2
##  7 "3 Series"    11
##  8 "323"         12
##  9 "350Z"         3
## 10 "5 Series"     6
## # ℹ 294 more rows
#2448 car's model are unlabelled

pc %>% group_by(Price) %>% unique() %>% summarise(count = n())
## # A tibble: 1,131 × 2
##    Price count
##    <int> <int>
##  1 50000    19
##  2 52000     1
##  3 55000    13
##  4 57000     1
##  5 60000    11
##  6 62000     1
##  7 65000     6
##  8 68000     2
##  9 70000    21
## 10 72000     1
## # ℹ 1,121 more rows
pc %>% group_by(Registered.City) %>% unique() %>% summarise(count = n())
## # A tibble: 62 × 2
##    Registered.City count
##    <chr>           <int>
##  1 ""               4636
##  2 "Abbottabad"       13
##  3 "Ali Masjid"        4
##  4 "Askoley"           3
##  5 "Attock"            6
##  6 "Badin"             3
##  7 "Bagh"              3
##  8 "Bahawalnagar"      3
##  9 "Bahawalpur"       39
## 10 "Bela"              5
## # ℹ 52 more rows
#4636 city are unlabelled

pc %>% group_by(Transaction.Type) %>% unique() %>% summarise(count = n())
## # A tibble: 3 × 2
##   Transaction.Type      count
##   <chr>                 <int>
## 1 ""                     2445
## 2 "Cash"                21513
## 3 "Installment/Leasing"  1015
#2445 transactions are unlabelled

pc %>% group_by(Year) %>% unique() %>% summarise(count = n())
## # A tibble: 66 × 2
##     Year count
##    <int> <int>
##  1  1915     1
##  2  1925     1
##  3  1943     2
##  4  1951     2
##  5  1952     4
##  6  1956     1
##  7  1960     1
##  8  1962     2
##  9  1963     2
## 10  1964     2
## # ℹ 56 more rows
#2284 year is missing
#check missing data
options(repr.plot.height = 10, repr.plot.width = 12)
psych::describe(pc)
##                   vars     n      mean         sd median   trimmed       mad
## X_id                 1 24973  12487.00    7209.23  12487  12487.00   9255.87
## Brand*               2 24973     17.38       7.88     23     18.50      1.48
## Condition*           3 24973      2.65       0.63      3      2.80      0.00
## Fuel*                4 24973      4.29       2.05      6      4.49      0.00
## KMs.Driven           5 22687 127811.16  599672.62  66510  63807.94  61542.73
## Model*               6 24973    121.27      80.51    110    117.28     99.33
## Price                7 24973 912895.28 1537133.61 650000 739939.16 518910.00
## Registered.City*     8 24973     22.27      10.97     26     23.04      0.00
## Transaction.Type*    9 24973      1.94       0.37      2      2.00      0.00
## Year                10 22689   2005.90       9.56   2008   2007.06      7.41
##                     min      max    range  skew kurtosis      se
## X_id                  1    24973    24972  0.00    -1.20   45.62
## Brand*                1       24       23 -0.83    -0.84    0.05
## Condition*            1        3        2 -1.62     1.33    0.00
## Fuel*                 1        6        5 -0.46    -1.64    0.01
## KMs.Driven            1 10000000  9999999 14.37   222.85 3981.31
## Model*                1      304      303  0.37    -0.69    0.51
## Price             50000 87654321 87604321 26.44  1158.16 9726.94
## Registered.City*      1       62       61 -0.94     0.51    0.07
## Transaction.Type*     1        3        2 -0.68     4.00    0.00
## Year               1915     2020      105 -1.13     1.42    0.06
#visualize missing data
missmap(pc)

#2% missing data is there in years and km-driven variables
#exclude all non-numeric columns
num.cols <- sapply(pc, is.numeric)
pc2 <- pc[, num.cols]
colnames(pc2)
## [1] "X_id"       "KMs.Driven" "Price"      "Year"
#now have 2 data sets pc (all variables) and pc2 (only numeric data)
md.pattern(pc2, rotate.names = T)

##       X_id Price Year KMs.Driven     
## 22687    1     1    1          1    0
## 2        1     1    1          0    1
## 2284     1     1    0          0    2
##          0     0 2284       2286 4570
library(tidyverse) 
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.2.3
library(maps)
## Warning: package 'maps' was built under R version 4.2.3
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:Hmisc':
## 
##     subplot
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(DT)
## Warning: package 'DT' was built under R version 4.2.3
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.2.3
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
#library(factoextra)
introduce(pc)
##    rows columns discrete_columns continuous_columns all_missing_columns
## 1 24973      10                6                  4                   0
##   total_missing_values complete_rows total_observations memory_usage
## 1                 4570         22687             249730      1624944
plot_intro(pc)

plot_missing(pc)

dim(pc)
## [1] 24973    10
head(pc)
##   X_id  Brand Condition   Fuel KMs.Driven          Model   Price
## 1    1 Toyota      Used Diesel          1          Prado 2100000
## 2    2 Suzuki      Used Petrol     100000          Bolan  380000
## 3    3 Suzuki      Used    CNG      12345          Bolan  340000
## 4    4 Suzuki      Used Petrol      94000           Alto  535000
## 5    5 Toyota      Used Petrol     100000    Corolla XLI 1430000
## 6    6 Toyota      Used Petrol      80000 Corrolla Altis 1620000
##   Registered.City Transaction.Type Year
## 1         Karachi             Cash 1997
## 2         Karachi             Cash 2006
## 3         Karachi             Cash 1998
## 4         Karachi             Cash 2010
## 5         Karachi             Cash 2013
## 6         Karachi             Cash 2012

#Visualize data

t <- pc %>% group_by(Brand) %>% summarise(AvgPrice = mean(Price), MaxPrice = max(Price), minPrice = min(Price))

options(repr.plot.height = 8, repr.plot.width = 8)

ggplot() +
  geom_histogram(t, mapping = aes(AvgPrice, fill = Brand))+
  geom_vline(t, mapping = aes(xintercept = AvgPrice), color = "red", size = 1.5) + 
  geom_text(t, mapping = aes(x = AvgPrice+3, y = 65, label = round(AvgPrice,digits = 2)))+
  facet_wrap(Brand~.)+
  theme_minimal()+
  theme(legend.position = "bottom")+
  labs(y = "Frequency", title = "Distribution & The Average Price of The Cars in each Brand", caption = "PBS-data")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

colnames(pc)
##  [1] "X_id"             "Brand"            "Condition"        "Fuel"            
##  [5] "KMs.Driven"       "Model"            "Price"            "Registered.City" 
##  [9] "Transaction.Type" "Year"

#total value in each brand (market share)

pc %>% 
  group_by(Brand) %>% 
  summarise(Total.share = mean(Price), na.rm = TRUE) %>% 
  ggplot(aes(reorder(Brand, Total.share),Total.share, fill = Total.share))+
  geom_col(show.legend = FALSE)+
  coord_flip()+
  theme_minimal()+
  labs(y = "Average price", x = "Brand names", title = "Average price of car as per Brand", caption = "PBS-data")+
  scale_fill_gradient(low = "khaki", high = "seagreen")+
  theme(axis.line.y = element_line(colour = "darkslategray"),
        axis.ticks.x = element_line(colour = "darkslategray"))

  scale_x_continuous(label = scales::comma_format())
## <ScaleContinuousPosition>
##  Range:  
##  Limits:    0 --    1
  pc %>% 
  group_by(Condition) %>% 
  summarise(Total.share = mean(Price), na.rm = TRUE) %>% 
  ggplot(aes(reorder(Condition, Total.share),Total.share, fill = Total.share))+
  geom_col(show.legend = FALSE)+
  #coord_flip()+
  theme_minimal()+
  labs(y = "Average price", x = "Condition",title = "Average price of car as per Condition", caption = "PBS-data")+
  scale_fill_gradient(low = "khaki", high = "seagreen")+
  theme(axis.line.y = element_line(colour = "darkslategray"),
        axis.ticks.x = element_line(colour = "darkslategray"))

  scale_x_continuous(label = scales::comma_format())
## <ScaleContinuousPosition>
##  Range:  
##  Limits:    0 --    1
  colnames(pc)
##  [1] "X_id"             "Brand"            "Condition"        "Fuel"            
##  [5] "KMs.Driven"       "Model"            "Price"            "Registered.City" 
##  [9] "Transaction.Type" "Year"
ggplot(pc, aes(Price, Brand, color = Condition)) + 
geom_point()+
  facet_wrap(~Condition)+
  theme(legend.title = element_text(size = 14),
    legend.text = element_text(size = 13),
    axis.title.x = element_text(size = 15),
      axis.title.y = element_text(size = 15),
     axis.text.x = element_text(size = 10),
     axis.text.y = element_text(size = 10),
     plot.title = element_text(hjust = .5,lineheight = .20, size = 15),
        plot.subtitle = element_text(hjust = .5,lineheight = .20, size = 12),
     strip.text = element_text(size = 12)) +
scale_x_continuous(label = scales::comma_format()) +
labs(title = "Car brands average prices as per Condition)",
    subtitle = "The most expensive ones are Toyota and Suzuki irrespective of conditions", caption = "PBS-data")

Take out data with no brands

noBrands <- pc %>% filter(Brand == "")
head(noBrands)
##   X_id Brand Condition Fuel KMs.Driven Model   Price Registered.City
## 1 3566            Used           99999        350000                
## 2 3570                              NA       1040000                
## 3 3571                              NA       1065000                
## 4 3573                              NA        765000                
## 5 3575                              NA        428000                
## 6 3577                              NA       1725000                
##   Transaction.Type Year
## 1                  1995
## 2                    NA
## 3                    NA
## 4                    NA
## 5                    NA
## 6                    NA
md.pattern(noBrands, rotate.names = T)

##      X_id Brand Condition Fuel Model Price Registered.City Transaction.Type
## 1       1     1         1    1     1     1               1                1
## 2136    1     1         1    1     1     1               1                1
##         0     0         0    0     0     0               0                0
##      KMs.Driven Year     
## 1             1    1    0
## 2136          0    0    2
##            2136 2136 4272
dim(noBrands)
## [1] 2137   10
#2136 data of id and price only no other information


options(repr.plot.height = 10, repr.plot.width = 12)
psych::describe(noBrands)
##                   vars    n      mean        sd median   trimmed       mad
## X_id                 1 2137  14485.31   7456.64  16254  14603.36  10584.28
## Brand*               2 2137      1.00      0.00      1      1.00      0.00
## Condition*           3 2137      1.00      0.02      1      1.00      0.00
## Fuel*                4 2137      1.00      0.00      1      1.00      0.00
## KMs.Driven           5    1  99999.00        NA  99999  99999.00      0.00
## Model*               6 2137      1.00      0.00      1      1.00      0.00
## Price                7 2137 843451.37 777637.72 656000 732799.79 527805.60
## Registered.City*     8 2137      1.00      0.00      1      1.00      0.00
## Transaction.Type*    9 2137      1.00      0.00      1      1.00      0.00
## Year                10    1   1995.00        NA   1995   1995.00      0.00
##                     min      max    range  skew kurtosis      se
## X_id               3566    24968    21402 -0.13     -1.5   161.3
## Brand*                1        1        0   NaN      NaN     0.0
## Condition*            1        2        1 46.16   2130.0     0.0
## Fuel*                 1        1        0   NaN      NaN     0.0
## KMs.Driven        99999    99999        0    NA       NA      NA
## Model*                1        1        0   NaN      NaN     0.0
## Price             50000 10600000 10550000  4.63     40.4 16821.9
## Registered.City*      1        1        0   NaN      NaN     0.0
## Transaction.Type*     1        1        0   NaN      NaN     0.0
## Year               1995     1995        0    NA       NA      NA
#visualize missing data
missmap(noBrands) 

#take out data with brands
withBrands <- pc %>% filter(Brand != "")
head(withBrands)
##   X_id  Brand Condition   Fuel KMs.Driven          Model   Price
## 1    1 Toyota      Used Diesel          1          Prado 2100000
## 2    2 Suzuki      Used Petrol     100000          Bolan  380000
## 3    3 Suzuki      Used    CNG      12345          Bolan  340000
## 4    4 Suzuki      Used Petrol      94000           Alto  535000
## 5    5 Toyota      Used Petrol     100000    Corolla XLI 1430000
## 6    6 Toyota      Used Petrol      80000 Corrolla Altis 1620000
##   Registered.City Transaction.Type Year
## 1         Karachi             Cash 1997
## 2         Karachi             Cash 2006
## 3         Karachi             Cash 1998
## 4         Karachi             Cash 2010
## 5         Karachi             Cash 2013
## 6         Karachi             Cash 2012
md.pattern(withBrands, rotate.names = T)

##       X_id Brand Condition Fuel Model Price Registered.City Transaction.Type
## 22686    1     1         1    1     1     1               1                1
## 2        1     1         1    1     1     1               1                1
## 148      1     1         1    1     1     1               1                1
##          0     0         0    0     0     0               0                0
##       Year KMs.Driven    
## 22686    1          1   0
## 2        1          0   1
## 148      0          0   2
##        148        150 298
dim(withBrands)
## [1] 22836    10
introduce(withBrands)
##    rows columns discrete_columns continuous_columns all_missing_columns
## 1 22836      10                6                  4                   0
##   total_missing_values complete_rows total_observations memory_usage
## 1                  298         22686             228360      1488048
plot_intro(withBrands)

plot_missing(withBrands)

From now on work on with Brands

#check missing data
options(repr.plot.height = 10, repr.plot.width = 12)
psych::describe(withBrands)
##                   vars     n      mean         sd   median   trimmed       mad
## X_id                 1 22836  12300.00    7157.32  12312.5  12291.41   9198.05
## Brand*               2 22836     17.91       6.36     22.0     18.67      1.48
## Condition*           3 22836      1.81       0.39      2.0      1.89      0.00
## Fuel*                4 22836      4.60       1.87      6.0      4.77      0.00
## KMs.Driven           5 22686 127812.39  599685.81  66505.0  63805.94  61535.31
## Model*               6 22836    132.52      74.88    110.0    127.86     90.44
## Price                7 22836 919393.87 1589603.60 650000.0 740675.65 518910.00
## Registered.City*     8 22836     24.26       9.23     26.0     25.49      0.00
## Transaction.Type*    9 22836      2.03       0.24      2.0      2.00      0.00
## Year                10 22688   2005.90       9.56   2008.0   2007.06      7.41
##                     min      max    range  skew kurtosis       se
## X_id                  1    24973    24972  0.00    -1.18    47.36
## Brand*                1       23       22 -0.86    -1.08     0.04
## Condition*            1        2        1 -1.57     0.47     0.00
## Fuel*                 1        6        5 -0.65    -1.48     0.01
## KMs.Driven            1 10000000  9999999 14.37   222.84  3981.48
## Model*                1      304      303  0.44    -0.65     0.50
## Price             50000 87654321 87604321 26.09  1106.63 10519.11
## Registered.City*      1       62       61 -1.23     2.80     0.06
## Transaction.Type*     1        3        2  1.89    13.77     0.00
## Year               1915     2020      105 -1.13     1.42     0.06
#visualize missing data
missmap(withBrands)   #(150 values of Km and 148 of year is missing)

  withBrands %>% 
  group_by(Condition) %>% 
  summarise(Total.share = mean(Price), na.rm = TRUE) %>% 
  ggplot(aes(reorder(Condition, Total.share),Total.share, fill = Total.share))+
  geom_col(show.legend = FALSE)+
  coord_flip()+
  theme_minimal()+
  labs(y = "Average price", x = "Condition",title = "Average price of car as per Condition", caption = "PBS-data")+
  scale_fill_gradient(low = "khaki", high = "seagreen")+
  theme(axis.line.y = element_line(colour = "darkslategray"),
        axis.ticks.x = element_line(colour = "darkslategray"))

  scale_x_continuous(label = scales::comma_format())
## <ScaleContinuousPosition>
##  Range:  
##  Limits:    0 --    1
  withBrands %>% 
  group_by(Brand) %>% 
  summarise(Total.share = mean(Price), na.rm = TRUE) %>% 
  ggplot(aes(reorder(Brand, Total.share),Total.share, fill = Total.share))+
  geom_col(show.legend = FALSE)+
  coord_flip()+
  theme_minimal()+
  labs(y = "Average price", x = "Brand",title = "Average price of car as per Condition", caption = "PBS-data")+
  scale_fill_gradient(low = "khaki", high = "seagreen")+
  theme(axis.line.y = element_line(colour = "darkslategray"),
        axis.ticks.x = element_line(colour = "darkslategray"))

  scale_x_continuous(label = scales::comma_format())
## <ScaleContinuousPosition>
##  Range:  
##  Limits:    0 --    1
colnames(withBrands)
##  [1] "X_id"             "Brand"            "Condition"        "Fuel"            
##  [5] "KMs.Driven"       "Model"            "Price"            "Registered.City" 
##  [9] "Transaction.Type" "Year"
ordered_price <- withBrands %>% arrange(desc(Price))

#most expensive
high_price <- ordered_price %>% filter(Price > mean(Price)) %>% 
  dplyr::select(Price,Brand, Condition, Fuel, Model, Registered.City, Transaction.Type, Year, KMs.Driven)
 
ordered_price %>% 
  group_by(Brand) %>% 
  summarise(mean = round(mean(Price),0), sd = round(sd(Price), 0)) %>% 
  arrange(desc(mean))
## # A tibble: 23 × 3
##    Brand          mean      sd
##    <chr>         <dbl>   <dbl>
##  1 Range Rover 4657000 4339836
##  2 Audi        4124167 4553829
##  3 Lexus       3200870 3921879
##  4 Porsche     2500000 2404163
##  5 BMW         2252419 2724229
##  6 Mercedes    2046467 1923814
##  7 Toyota      1653264 2311591
##  8 Honda       1137342  979474
##  9 Nissan       772144  593684
## 10 Mazda        762704  405140
## # ℹ 13 more rows
table(high_price$Brand)
## 
##               Audi                BMW          Chevrolet Classic & Antiques 
##                 12                 19                  1                  4 
##             Daewoo           Daihatsu                FAW              Honda 
##                  3                720                 19               1633 
##                KIA              Lexus              Mazda           Mercedes 
##                  3                 16                 36                 60 
##         Mitsubishi             Nissan       Other Brands            Porsche 
##                124                284                 35                  1 
##        Range Rover             Subaru             Suzuki             Toyota 
##                  5                  4               1176               3634
#plot cars about mean price
ggplot(high_price, aes(Price, reorder(Brand, Price))) + 
geom_point() + 
facet_grid(
    factor(Brand, levels = 
                  c("Audi", 'BMW', 'Chevrolet','Classic & Antiques', 'Daewoo','Daihatsu','FAW','Honda','KIA','Lexus','Mazda','Mercedes','Mitsubishi','Nissan','Other Brands','Porsche','Range Rover','Subaru','Suzuki','Toyota')) ~., scales = "free", space = "free") +
  theme(strip.text.y = element_text(angle = 0),strip.text= element_text(size=13),axis.text.y = element_text(size = 11),
     plot.title = element_text(hjust = .5,lineheight = .20, size = 15),
        plot.subtitle = element_text(hjust = .5,lineheight = .20, size = 12),
     axis.title.x = element_text(size = 15),
      axis.title.y = element_text(size = 15),
     axis.text.x = element_text(size = 10)) +
scale_x_continuous(label = scales::comma_format()) + 
labs(title="Car brands destributed as per Price",
    subtitle = "The cars plotted here - are above the average price of cars in our data set") + ylab("Brandname")

ggplot(withBrands, aes(Price, Fuel, color = Condition))+
  geom_point()+
  facet_wrap(~Fuel) +
theme(strip.text.y = element_text(angle = 0),strip.text= element_text(size=13),axis.text.y = element_text(size = 11),
     plot.title = element_text(hjust = .5,lineheight = .20, size = 15),
        plot.subtitle = element_text(hjust = .5,lineheight = .20, size = 12),
     axis.title.x = element_text(size = 15),
      axis.title.y = element_text(size = 15),
     axis.text.x = element_text(size = 10)) +
scale_x_continuous(label = scales::comma_format()) +
  labs(title="Cars Fuel type and Condition sold",
    subtitle = "The cars plotted here - are destributed by Fuel type and condition") + ylab("Brandname")+
  theme(axis.text.x = element_text(angle = 90,vjust=0.5, size=7))

brand_nested <- ordered_price %>% 
  group_by(Brand) %>% 
  summarise(mean_price = mean(Price), sd = round(sd(Price),0),
         mean_mileage = mean(KMs.Driven), sd =  round(sd(KMs.Driven),0)) %>% 
arrange(desc(mean_price))
  
#make a dataframe
brand_nested <- as.data.frame(brand_nested, header = TRUE)

#use brand as row name
row.names(brand_nested) <- brand_nested$Brand

#remove brand names since have made row
brand_nested <- brand_nested[,-1]

head(brand_nested, 5)
##             mean_price     sd mean_mileage
## Range Rover    4657000  41955     66697.00
## Audi           4124167  64111     50485.44
## Lexus          3200870 205711    130533.09
## Porsche        2500000  54447     88500.00
## BMW            2252419  59641     61499.35
clustering_brand <- brand_nested

#scale the data
scaled_brand <- scale(clustering_brand) 

#perform hierarchical clustering
HC_brand <- hclust(dist(scaled_brand))

#canvas size - plot size
options(repr.plot.width = 8, repr.plot.height = 8)

#plot
plot(HC_brand, col = "red", main = "Car brands that share the same branch have a lot in common across all variables")

options(repr.plot.width = 12, repr.plot.height = 8)

withBrands %>% 
  group_by(Fuel, Condition) %>% 
  summarise(Total.Value = mean(Price), na.rm = TRUE) %>% 
  ggplot(aes(reorder(Fuel, Total.Value), Total.Value, fill = Total.Value))+
  geom_col(show.legend = FALSE)+
  facet_grid(~Condition)+
  coord_flip()+
  theme_minimal()+
  labs(x = NULL, y = "Market Values of rhe Leagues")+
  scale_fill_gradient(low = "khaki", high = "seagreen")+
  theme(axis.line.y = element_line(colour = "darkslategray"),
        axis.ticks.x = element_line(colour = "darkslategray"))+
  scale_y_continuous(label = scales::comma_format())+
  labs(title="Cars Fuel type and Condition sold",
    subtitle = "The cars plotted here - are destributed by Fuel type and condition") + ylab("Price")
## `summarise()` has grouped output by 'Fuel'. You can override using the
## `.groups` argument.

#Comparison of two brands

options(repr.plot.width = 12, repr.plot.height = 8)

withBrands %>% 
  group_by(Registered.City, Condition) %>% 
  summarise(Total.Value = mean(Price)) %>%
ggplot(aes(reorder(Registered.City, Total.Value), Total.Value, fill = Total.Value))+
  geom_col(show.legend = FALSE)+
  facet_grid(~Condition)+
  coord_flip()+
  theme_minimal()+
  labs(x = NULL, y = "Market Values of rhe Leagues")+
  scale_fill_gradient(low = "khaki", high = "seagreen")+
  theme(axis.line.y = element_line(colour = "darkslategray"),
        axis.ticks.x = element_line(colour = "darkslategray"))+
  scale_y_continuous(label = scales::comma_format())+
  labs(title="Cars Fuel type and Condition of sold cars",
    subtitle = "The cars plotted here - are destributed by Fuel type and condition") + ylab("Price")
## `summarise()` has grouped output by 'Registered.City'. You can override using
## the `.groups` argument.

withBrands %>% group_by(Registered.City, Transaction.Type) %>% filter(Condition == "New") %>% summarise(count = n()) %>% arrange(desc(count)) %>% 
  ggplot(aes(mean(count), Registered.City, color = Transaction.Type))+
  geom_point()+
  facet_grid(~Transaction.Type)+
  theme_minimal()+
  labs(x = NULL, y = "Market Values of rhe Leagues")+
  scale_fill_gradient(low = "blue", high = "seagreen")+
  theme(axis.line.y = element_line(colour = "darkslategray"),
        axis.ticks.x = element_line(colour = "darkslategray"))+
  scale_x_continuous(label = scales::comma_format())+
  labs(title="Number of New cars sold as per city using different transaction types",
    subtitle = "Cash is most predominant type of transaction type") + ylab("Registered cities") + xlab("Number of Cars")+
    theme(axis.text.x = element_text(angle = 90,vjust=0.5, size=7))
## `summarise()` has grouped output by 'Registered.City'. You can override using
## the `.groups` argument.

withBrands %>% group_by(Registered.City, Transaction.Type) %>% filter(Condition == "Used") %>% summarise(count = n()) %>% arrange(desc(count)) %>% 
  ggplot(aes(mean(count), Registered.City, color = Transaction.Type))+
  geom_point()+
  facet_grid(~Transaction.Type)+
  theme_minimal()+
  labs(x = NULL, y = "Market Values of rhe Leagues")+
  scale_fill_gradient(low = "blue", high = "seagreen")+
  theme(axis.line.y = element_line(colour = "darkslategray"),
        axis.ticks.x = element_line(colour = "darkslategray"))+
  scale_x_continuous(label = scales::comma_format())+
  labs(title="Number of Used cars sold as per city using different transaction types",
    subtitle = "Cash is most predominant type of transaction type") + ylab("Registered cities") +
  xlab("Number of Cars")+
    theme(axis.text.x = element_text(angle = 90,vjust=0.5, size=7))
## `summarise()` has grouped output by 'Registered.City'. You can override using
## the `.groups` argument.