Importing & Libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)

# Bias number printing toward fixed notation rather than scientific notation.
options(scipen = 6)

# Load the used-car listings into a data frame.
# NOTE(review): this is an absolute path on one machine; the script will not
# run elsewhere until the path is adjusted.
used_cars <- read.csv(
  file = "C:/Users/toyha/Downloads/vehicle/car details v4.csv"
)

Random Sampling

Here, I’ll take five random samples from the total dataset, each about 50% of its size, to create five sample sets. The samples are drawn with replacement, so the same entry can appear more than once within a sample.

# Draw one sample half the size of `used_cars`, with replacement, after fixing
# the random seed so the draw can be reproduced on its own.
draw_half_sample <- function(seed) {
  set.seed(seed)
  slice_sample(used_cars, prop = 0.5, replace = TRUE)
}

# Five independently seeded samples (seeds 1 through 5).
sample1 <- draw_half_sample(1)
sample2 <- draw_half_sample(2)
sample3 <- draw_half_sample(3)
sample4 <- draw_half_sample(4)
sample5 <- draw_half_sample(5)

Looking at the Year attribute

# Draw a line chart, with point markers, of occurrence counts.
#
# Arguments:
#   df          Data frame supplied to ggplot() as the plot data.
#   df_x        Vector of x values, typically a column such as `year1$Year`.
#   df_y        Vector of y values (the counts), such as `year1$n`.
#   chart_title Plot title.
#   x_label     Optional x-axis title. When NULL, the title is taken from the
#               `df_x` expression: the first column name following a `$`
#               (`year1$Year` gives "Year"), otherwise the expression text.
#
# Returns the ggplot object.
make_linechart <- function(df, df_x, df_y, chart_title, x_label = NULL) {
  if (is.null(x_label)) {
    # The axis title must be a single string. Passing `df_x` itself to labs()
    # would hand it the whole data vector, so the name is recovered from the
    # caller's expression instead.
    x_text <- deparse1(substitute(df_x))
    column_name <- regmatches(
      x_text,
      regexpr("(?<=\\$)[A-Za-z.][A-Za-z0-9._]*", x_text, perl = TRUE)
    )
    x_label <- if (length(column_name) == 1L) column_name else x_text
  }

  # `df_x` and `df_y` are vectors held in this function's environment; aes()
  # captures them from here. `group = 1` joins all points into one line.
  ggplot(df, aes(x = df_x, y = df_y, group = 1)) +
    geom_line(stat = "identity") +
    geom_point() +
    labs(title = chart_title, x = x_label, y = "# of occurrences")
}
# Count sample 1 rows per model year, largest counts first, and show the table.
year1 <- count(sample1, Year, sort = TRUE)
year1
##    Year   n
## 1  2018 154
## 2  2017 129
## 3  2019 110
## 4  2016  97
## 5  2015  86
## 6  2014  85
## 7  2021  76
## 8  2013  66
## 9  2020  60
## 10 2012  45
## 11 2022  38
## 12 2011  34
## 13 2010  20
## 14 2009  12
## 15 2008  11
## 16 2007   2
## 17 1988   1
## 18 1996   1
## 19 2000   1
## 20 2006   1
# Line chart of the sample 1 model-year counts.
make_linechart(
  df = year1,
  df_x = year1$Year,
  df_y = year1$n,
  chart_title = "# of vehicles by model year (sample 1)"
)

# Count sample 2 rows per model year, largest counts first, and show the table.
year2 <- count(sample2, Year, sort = TRUE)
year2
##    Year   n
## 1  2017 138
## 2  2018 126
## 3  2019 109
## 4  2015 100
## 5  2016  96
## 6  2014  92
## 7  2013  78
## 8  2021  71
## 9  2020  48
## 10 2012  47
## 11 2011  42
## 12 2022  38
## 13 2009  15
## 14 2010  15
## 15 2008   8
## 16 2007   2
## 17 1988   1
## 18 2002   1
## 19 2004   1
## 20 2006   1
# Line chart of the sample 2 model-year counts.
make_linechart(
  df = year2,
  df_x = year2$Year,
  df_y = year2$n,
  chart_title = "# of vehicles by model year (sample 2)"
)

# Count sample 3 rows per model year, largest counts first, and show the table.
year3 <- count(sample3, Year, sort = TRUE)
year3
##    Year   n
## 1  2018 148
## 2  2019 125
## 3  2017 120
## 4  2014 103
## 5  2015  87
## 6  2021  81
## 7  2016  77
## 8  2013  64
## 9  2022  54
## 10 2020  52
## 11 2011  40
## 12 2012  35
## 13 2009  16
## 14 2010  15
## 15 2008   5
## 16 2006   3
## 17 2007   3
## 18 2002   1
# Line chart of the sample 3 model-year counts.
make_linechart(
  df = year3,
  df_x = year3$Year,
  df_y = year3$n,
  chart_title = "# of vehicles by model year (sample 3)"
)

# Count sample 4 rows per model year, largest counts first, and show the table.
year4 <- count(sample4, Year, sort = TRUE)
year4
##    Year   n
## 1  2018 124
## 2  2017 121
## 3  2019 119
## 4  2014 100
## 5  2015  93
## 6  2021  89
## 7  2016  87
## 8  2013  64
## 9  2020  64
## 10 2022  52
## 11 2012  46
## 12 2011  35
## 13 2009  15
## 14 2010   8
## 15 2006   4
## 16 1988   2
## 17 2007   2
## 18 2008   2
## 19 1996   1
## 20 2000   1
# Line chart of the sample 4 model-year counts.
make_linechart(
  df = year4,
  df_x = year4$Year,
  df_y = year4$n,
  chart_title = "# of vehicles by model year (sample 4)"
)

# Count sample 5 rows per model year, largest counts first, and show the table.
year5 <- count(sample5, Year, sort = TRUE)
year5
##    Year   n
## 1  2018 132
## 2  2017 111
## 3  2019 103
## 4  2014 102
## 5  2016  96
## 6  2015  93
## 7  2021  89
## 8  2020  73
## 9  2013  68
## 10 2012  51
## 11 2022  34
## 12 2011  32
## 13 2009  17
## 14 2010  16
## 15 2008   6
## 16 2007   4
## 17 2000   1
## 18 2006   1
# Line chart of the sample 5 model-year counts.
make_linechart(
  df = year5,
  df_x = year5$Year,
  df_y = year5$n,
  chart_title = "# of vehicles by model year (sample 5)"
)

Seeing as there are 22 unique values for Year in the population, each of these samples has a few year values missing. Most of this is expected, since 2007 and earlier have very few entries. Sample #2 has a sharper increase and #3 has a more gradual increase, but the general shape of the line is mostly the same in all samples: it starts with a long, flat tail, then increases before gradually slowing down and decreasing at the end. India appears to have mostly newer 21st-century vehicles on its roads. It may be worth looking into how India has developed differently in comparison to the USA.

Looking at the Location attribute

# Draw a horizontal bar chart of occurrence counts per category.
#
# Arguments:
#   df       Data frame supplied to ggplot() as the plot data.
#   df_x     Vector of categories, such as `own1$Owner` or
#            `reorder(loc1$Location, -loc1$n)`.
#   df_y     Vector of bar heights (the counts), such as `loc1$n`.
#   df_title Plot title.
#   x_label  Optional title for the category axis. When NULL, the title is
#            taken from the `df_x` expression: the first column name following
#            a `$` (`reorder(loc1$Location, -loc1$n)` gives "Location"),
#            otherwise the expression text.
#
# Returns the ggplot object.
make_barchart <- function(df, df_x, df_y, df_title, x_label = NULL) {
  if (is.null(x_label)) {
    # The axis title must be a single string. Passing `df_x` itself to labs()
    # would hand it the whole category vector, so the name is recovered from
    # the caller's expression instead.
    x_text <- deparse1(substitute(df_x))
    column_name <- regmatches(
      x_text,
      regexpr("(?<=\\$)[A-Za-z.][A-Za-z0-9._]*", x_text, perl = TRUE)
    )
    x_label <- if (length(column_name) == 1L) column_name else x_text
  }

  # `df_x` and `df_y` are vectors held in this function's environment; aes()
  # captures them from here. coord_flip() turns the bars horizontal, so the
  # category labels end up on the vertical axis, where the text is shrunk.
  ggplot(df, aes(x = df_x, y = df_y, group = 1)) +
    geom_col(width = 0.75) +
    coord_flip() +
    labs(title = df_title, x = x_label, y = "# of occurrences") +
    theme(axis.text.y = element_text(size = 5))
}
# Count sample 1 rows per location, largest counts first, and show the table.
loc1 <- count(sample1, Location, sort = TRUE)
loc1
##            Location   n
## 1             Delhi 172
## 2            Mumbai 157
## 3              Pune  71
## 4         Bangalore  65
## 5         Hyderabad  50
## 6           Lucknow  47
## 7         Ahmedabad  43
## 8           Chennai  42
## 9           Kolkata  29
## 10         Ludhiana  26
## 11            Patna  25
## 12           Mohali  19
## 13           Kanpur  17
## 14           Ranchi  16
## 15       Coimbatore  14
## 16         Varanasi  14
## 17       Chandigarh  13
## 18             Agra  12
## 19        Faridabad  12
## 20          Gurgaon  12
## 21         Zirakpur  12
## 22         Dehradun  10
## 23           Jaipur   9
## 24        Jalandhar   9
## 25           Nashik   9
## 26           Raipur   8
## 27            Surat   8
## 28            Thane   7
## 29           Karnal   6
## 30      Navi Mumbai   6
## 31        Panchkula   5
## 32     Ambala Cantt   4
## 33         Amritsar   4
## 34      Bhubaneswar   4
## 35           Indore   4
## 36           Mysore   4
## 37            Noida   4
## 38         Vadodara   4
## 39           Bhopal   3
## 40        Ernakulam   3
## 41         Guwahati   3
## 42       Jamshedpur   3
## 43            Kheda   3
## 44        Mangalore   3
## 45      Muzaffurpur   3
## 46           Nagpur   3
## 47            Salem   3
## 48          Dharwad   2
## 49        Gorakhpur   2
## 50         Haldwani   2
## 51           Meerut   2
## 52         Mirzapur   2
## 53           Panvel   2
## 54           Purnea   2
## 55          Roorkee   2
## 56         Warangal   2
## 57      Yamunanagar   2
## 58     Bulandshahar   1
## 59          Deoghar   1
## 60           Kharar   1
## 61           Kollam   1
## 62             Kota   1
## 63 Pimpri-Chinchwad   1
## 64         Rudrapur   1
## 65       Samastipur   1
## 66            Udupi   1
# Bar chart of the sample 1 location counts, locations ordered by count.
make_barchart(
  df = loc1,
  df_x = reorder(loc1$Location, -loc1$n),
  df_y = loc1$n,
  df_title = "# of vehicles by location (sample 1)"
)

# Count sample 2 rows per location, largest counts first, and show the table.
loc2 <- count(sample2, Location, sort = TRUE)
loc2
##        Location   n
## 1        Mumbai 178
## 2         Delhi 153
## 3     Bangalore  72
## 4          Pune  64
## 5     Hyderabad  46
## 6       Chennai  40
## 7       Kolkata  38
## 8     Ahmedabad  36
## 9       Lucknow  32
## 10       Kanpur  30
## 11   Coimbatore  21
## 12        Patna  21
## 13       Mohali  19
## 14    Faridabad  18
## 15     Ludhiana  18
## 16       Ranchi  17
## 17     Dehradun  16
## 18      Gurgaon  14
## 19       Jaipur  13
## 20       Raipur  13
## 21        Thane  13
## 22         Agra  12
## 23     Varanasi  11
## 24        Noida  10
## 25 Ambala Cantt   9
## 26   Chandigarh   9
## 27    Jalandhar   9
## 28   Jamshedpur   8
## 29     Guwahati   7
## 30       Mysore   7
## 31     Zirakpur   7
## 32  Navi Mumbai   6
## 33       Meerut   4
## 34       Nashik   4
## 35        Salem   4
## 36        Surat   4
## 37       Indore   3
## 38       Karnal   3
## 39     Mirzapur   3
## 40     Rudrapur   3
## 41    Allahabad   2
## 42       Bhopal   2
## 43          Goa   2
## 44  Muzaffurpur   2
## 45    Panchkula   2
## 46       Purnea   2
## 47       Rohtak   2
## 48        Udupi   2
## 49        Unnao   2
## 50     Vadodara   2
## 51     Warangal   2
## 52     Amritsar   1
## 53  Bhubaneswar   1
## 54 Dak. Kannada   1
## 55      Deoghar   1
## 56     Faizabad   1
## 57    Ghaziabad   1
## 58    Gorakhpur   1
## 59       Kharar   1
## 60       Kollam   1
## 61         Kota   1
## 62   Samastipur   1
## 63  Yamunanagar   1
# Bar chart of the sample 2 location counts, locations ordered by count.
make_barchart(
  df = loc2,
  df_x = reorder(loc2$Location, -loc2$n),
  df_y = loc2$n,
  df_title = "# of vehicles by location (sample 2)"
)

# Count sample 3 rows per location, largest counts first, and show the table.
loc3 <- count(sample3, Location, sort = TRUE)
loc3
##        Location   n
## 1         Delhi 159
## 2        Mumbai 156
## 3     Bangalore  71
## 4          Pune  61
## 5     Hyderabad  54
## 6       Lucknow  48
## 7      Ludhiana  35
## 8       Kolkata  34
## 9         Patna  32
## 10       Kanpur  29
## 11    Ahmedabad  26
## 12      Chennai  23
## 13       Mohali  18
## 14    Faridabad  17
## 15   Coimbatore  16
## 16   Chandigarh  15
## 17       Raipur  14
## 18        Noida  13
## 19     Dehradun  12
## 20      Gurgaon  12
## 21       Jaipur  11
## 22        Surat  11
## 23     Varanasi  11
## 24         Agra  10
## 25       Ranchi  10
## 26        Thane   9
## 27     Zirakpur   9
## 28       Karnal   8
## 29  Navi Mumbai   7
## 30    Jalandhar   6
## 31   Jamshedpur   6
## 32       Meerut   6
## 33       Nashik   6
## 34 Ambala Cantt   5
## 35     Guwahati   5
## 36     Vadodara   5
## 37       Bhopal   4
## 38     Mirzapur   4
## 39      Dharwad   3
## 40       Indore   3
## 41        Kheda   3
## 42    Mangalore   3
## 43       Mysore   3
## 44        Salem   3
## 45     Amritsar   2
## 46   Aurangabad   2
## 47 Dak. Kannada   2
## 48      Deoghar   2
## 49          Goa   2
## 50       Kharar   2
## 51    Panchkula   2
## 52       Panvel   2
## 53       Purnea   2
## 54        Udupi   2
## 55    Allahabad   1
## 56 Bulandshahar   1
## 57    Ernakulam   1
## 58    Ghaziabad   1
## 59       Kollam   1
## 60         Kota   1
## 61  Muzaffurpur   1
## 62       Nagpur   1
## 63  Ranga Reddy   1
## 64      Roorkee   1
## 65   Samastipur   1
## 66     Warangal   1
## 67  Yamunanagar   1
# Bar chart of the sample 3 location counts, locations ordered by count.
make_barchart(
  df = loc3,
  df_x = reorder(loc3$Location, -loc3$n),
  df_y = loc3$n,
  df_title = "# of vehicles by location (sample 3)"
)

# Count sample 4 rows per location, largest counts first, and show the table.
loc4 <- count(sample4, Location, sort = TRUE)
loc4
##            Location   n
## 1            Mumbai 168
## 2             Delhi 154
## 3         Bangalore  72
## 4              Pune  68
## 5         Hyderabad  54
## 6           Chennai  34
## 7             Patna  34
## 8           Kolkata  31
## 9          Ludhiana  31
## 10        Ahmedabad  30
## 11          Lucknow  27
## 12           Kanpur  21
## 13        Faridabad  19
## 14       Coimbatore  16
## 15         Dehradun  15
## 16           Mohali  15
## 17       Chandigarh  14
## 18          Gurgaon  14
## 19           Ranchi  14
## 20            Thane  13
## 21         Varanasi  11
## 22           Jaipur  10
## 23     Ambala Cantt   9
## 24           Karnal   9
## 25           Raipur   9
## 26         Zirakpur   9
## 27        Jalandhar   8
## 28      Navi Mumbai   8
## 29              Goa   7
## 30            Noida   7
## 31             Agra   6
## 32         Guwahati   6
## 33            Surat   6
## 34       Aurangabad   5
## 35       Jamshedpur   5
## 36           Bhopal   4
## 37           Indore   4
## 38           Meerut   4
## 39           Nashik   4
## 40        Panchkula   4
## 41        Ernakulam   3
## 42        Gorakhpur   3
## 43           Mysore   3
## 44        Allahabad   2
## 45         Amritsar   2
## 46          Dharwad   2
## 47        Ghaziabad   2
## 48             Kota   2
## 49         Mirzapur   2
## 50      Muzaffurpur   2
## 51           Panvel   2
## 52 Pimpri-Chinchwad   2
## 53         Rudrapur   2
## 54            Salem   2
## 55            Unnao   2
## 56         Vadodara   2
## 57         Warangal   2
## 58      Yamunanagar   2
## 59     Bulandshahar   1
## 60          Deoghar   1
## 61         Faizabad   1
## 62         Haldwani   1
## 63           Kharar   1
## 64        Mangalore   1
## 65           Nagpur   1
## 66           Purnea   1
## 67      Ranga Reddy   1
## 68          Roorkee   1
## 69       Samastipur   1
# Bar chart of the sample 4 location counts, locations ordered by count.
make_barchart(
  df = loc4,
  df_x = reorder(loc4$Location, -loc4$n),
  df_y = loc4$n,
  df_title = "# of vehicles by location (sample 4)"
)

# Count sample 5 rows per location, largest counts first, and show the table.
loc5 <- count(sample5, Location, sort = TRUE)
loc5
##            Location   n
## 1            Mumbai 181
## 2             Delhi 161
## 3              Pune  67
## 4         Bangalore  56
## 5         Hyderabad  55
## 6         Ahmedabad  43
## 7           Lucknow  38
## 8          Ludhiana  34
## 9             Patna  28
## 10          Chennai  27
## 11          Kolkata  25
## 12           Kanpur  21
## 13        Faridabad  19
## 14           Mohali  19
## 15             Agra  15
## 16          Gurgaon  14
## 17       Chandigarh  13
## 18     Ambala Cantt  12
## 19       Coimbatore  12
## 20         Dehradun  12
## 21        Jalandhar  12
## 22         Varanasi  12
## 23      Navi Mumbai  11
## 24           Ranchi  11
## 25           Jaipur   9
## 26           Karnal   9
## 27           Raipur   9
## 28            Thane   9
## 29         Zirakpur   8
## 30         Guwahati   6
## 31           Mysore   6
## 32        Mangalore   5
## 33      Yamunanagar   5
## 34           Bhopal   4
## 35       Jamshedpur   4
## 36            Surat   4
## 37           Nashik   3
## 38           Panvel   3
## 39       Samastipur   3
## 40         Vadodara   3
## 41       Aurangabad   2
## 42      Bhubaneswar   2
## 43          Dharwad   2
## 44              Goa   2
## 45         Haldwani   2
## 46           Indore   2
## 47           Kollam   2
## 48           Meerut   2
## 49         Mirzapur   2
## 50            Noida   2
## 51      Ranga Reddy   2
## 52            Salem   2
## 53            Udupi   2
## 54        Allahabad   1
## 55         Amritsar   1
## 56     Bulandshahar   1
## 57          Deoghar   1
## 58        Ghaziabad   1
## 59        Gorakhpur   1
## 60           Kharar   1
## 61            Kheda   1
## 62             Kota   1
## 63      Muzaffurpur   1
## 64           Nagpur   1
## 65        Panchkula   1
## 66 Pimpri-Chinchwad   1
## 67           Rohtak   1
## 68         Warangal   1
# Bar chart of the sample 5 location counts, locations ordered by count.
make_barchart(
  df = loc5,
  df_x = reorder(loc5$Location, -loc5$n),
  df_y = loc5$n,
  df_title = "# of vehicles by location (sample 5)"
)

As mentioned in the previous week, there is a very, very large number of different locations, so I will only look at the most frequent ones. Delhi, Mumbai, Pune, Bangalore, and Hyderabad make up the top five in every sample set, and Lucknow also ranks near the top (between sixth and eleventh), so it’s likely that finding a used vehicle is easier in these cities.

Looking at the Number of Previous Owners

# Count sample 1 rows per owner category, largest counts first, and show it.
own1 <- count(sample1, Owner, sort = TRUE)
own1
##              Owner   n
## 1            First 815
## 2           Second 180
## 3            Third  18
## 4 UnRegistered Car  12
## 5           Fourth   3
## 6        4 or More   1
# Bar chart of the sample 1 owner-category counts.
make_barchart(
  df = own1,
  df_x = own1$Owner,
  df_y = own1$n,
  df_title = "# of vehicles by previous owners (sample 1)"
)

# Count sample 2 rows per owner category, largest counts first, and show it.
own2 <- count(sample2, Owner, sort = TRUE)
own2
##              Owner   n
## 1            First 800
## 2           Second 198
## 3            Third  15
## 4 UnRegistered Car  13
## 5        4 or More   2
## 6           Fourth   1
# Bar chart of the sample 2 owner-category counts.
make_barchart(
  df = own2,
  df_x = own2$Owner,
  df_y = own2$n,
  df_title = "# of vehicles by previous owners (sample 2)"
)

# Count sample 3 rows per owner category, largest counts first, and show it.
own3 <- count(sample3, Owner, sort = TRUE)
own3
##              Owner   n
## 1            First 814
## 2           Second 180
## 3            Third  18
## 4 UnRegistered Car  11
## 5           Fourth   4
## 6        4 or More   2
# Bar chart of the sample 3 owner-category counts.
make_barchart(
  df = own3,
  df_x = own3$Owner,
  df_y = own3$n,
  df_title = "# of vehicles by previous owners (sample 3)"
)

# Count sample 4 rows per owner category, largest counts first, and show it.
own4 <- count(sample4, Owner, sort = TRUE)
own4
##              Owner   n
## 1            First 827
## 2           Second 169
## 3            Third  24
## 4 UnRegistered Car   7
## 5        4 or More   1
## 6           Fourth   1
# Bar chart of the sample 4 owner-category counts.
make_barchart(
  df = own4,
  df_x = own4$Owner,
  df_y = own4$n,
  df_title = "# of vehicles by previous owners (sample 4)"
)

# Count sample 5 rows per owner category, largest counts first, and show it.
own5 <- count(sample5, Owner, sort = TRUE)
own5
##              Owner   n
## 1            First 807
## 2           Second 184
## 3            Third  29
## 4 UnRegistered Car   8
## 5           Fourth   1
# Bar chart of the sample 5 owner-category counts.
make_barchart(
  df = own5,
  df_x = own5$Owner,
  df_y = own5$n,
  df_title = "# of vehicles by previous owners (sample 5)"
)

All sample sets appear to follow roughly the same shape, with the vast majority of cars having had one previous owner and a sharp decrease in number for each successive owner. Sample #5 has no “4 or More” value and only a single vehicle with four previous owners. There is also a small number of unregistered cars in each sample set. The vast majority of used cars still have only one previous owner, so if you choose a used car at random, it will very likely have one previous owner, with a smaller chance of more previous owners.

Conclusion

By drawing samples from a very large set of data, you always risk losing a value that occurs only very infrequently. Such missing values can be dealt with either by removing them from the samples entirely or by manually including the missing rare values, but the right approach depends on why you’re sampling the data. If a model is being created from samples with missing values, then it will fail to capture those values when making predictions. Even with computers, programs, and algorithms becoming more advanced, there still needs to be a human element to effectively model and draw conclusions from statistics.