library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
# Penalise scientific notation when printing: fixed notation is preferred
# unless it would be more than 6 digits wider than the scientific form.
options(scipen = 6)
# Load the used-car listings into a base data.frame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# script only runs where that exact file exists -- consider a project-relative
# path.
used_cars <- read.csv("C:/Users/toyha/Downloads/vehicle/car details v4.csv")
Here, I’ll draw five random samples from the total dataset, each containing approximately 50% as many entries as the dataset and drawn with replacement, to create five sample sets.
# Seed the RNG with `seed`, then draw half as many rows as `used_cars` has,
# sampling with replacement (so the same row may appear more than once).
draw_half_sample <- function(seed) {
  set.seed(seed)
  slice_sample(used_cars, prop = 0.5, replace = TRUE)
}

# Five reproducible sample sets, one per seed.
sample1 <- draw_half_sample(1)
sample2 <- draw_half_sample(2)
sample3 <- draw_half_sample(3)
sample4 <- draw_half_sample(4)
sample5 <- draw_half_sample(5)
# Draw a line chart, with a point marker on every observation, of
# pre-computed counts.
#
# Arguments:
#   df          Data frame passed to ggplot() as the plot data.
#   df_x        Vector of x-axis values, one per row of `df` (e.g. `df$Year`).
#   df_y        Vector of y-axis values (the counts), one per row of `df`.
#   chart_title Plot title.
#   x_label     Optional x-axis title. When NULL (the default) the title is
#               the name of the first column of `df` whose values equal
#               `df_x`; if no column matches, the expression supplied for
#               `df_x` is used instead.
#
# Returns: a ggplot object.
make_linechart <- function(df, df_x, df_y, chart_title, x_label = NULL) {
  if (is.null(x_label)) {
    # Passing `df_x` itself to labs(x = ) would use the data values as the
    # axis title, so recover a proper name for the axis instead.
    is_match <- vapply(
      df,
      function(column) identical(as.character(column), as.character(df_x)),
      logical(1)
    )
    x_label <- if (any(is_match)) {
      names(df)[which(is_match)[1]]
    } else {
      paste(deparse(substitute(df_x)), collapse = " ")
    }
  }

  # group = 1 joins all observations into a single line.
  ggplot(df, aes(x = df_x, y = df_y, group = 1)) +
    geom_line() +
    geom_point() +
    labs(title = chart_title, x = x_label, y = "# of occurrences")
}
year1 <- sample1 |> count(Year, sort = TRUE)
year1
## Year n
## 1 2018 154
## 2 2017 129
## 3 2019 110
## 4 2016 97
## 5 2015 86
## 6 2014 85
## 7 2021 76
## 8 2013 66
## 9 2020 60
## 10 2012 45
## 11 2022 38
## 12 2011 34
## 13 2010 20
## 14 2009 12
## 15 2008 11
## 16 2007 2
## 17 1988 1
## 18 1996 1
## 19 2000 1
## 20 2006 1
make_linechart(year1, year1$Year, year1$n, "# of vehicles by model year (sample 1)")
year2 <- sample2 |> count(Year, sort = TRUE)
year2
## Year n
## 1 2017 138
## 2 2018 126
## 3 2019 109
## 4 2015 100
## 5 2016 96
## 6 2014 92
## 7 2013 78
## 8 2021 71
## 9 2020 48
## 10 2012 47
## 11 2011 42
## 12 2022 38
## 13 2009 15
## 14 2010 15
## 15 2008 8
## 16 2007 2
## 17 1988 1
## 18 2002 1
## 19 2004 1
## 20 2006 1
make_linechart(year2, year2$Year, year2$n, "# of vehicles by model year (sample 2)")
year3 <- sample3 |> count(Year, sort = TRUE)
year3
## Year n
## 1 2018 148
## 2 2019 125
## 3 2017 120
## 4 2014 103
## 5 2015 87
## 6 2021 81
## 7 2016 77
## 8 2013 64
## 9 2022 54
## 10 2020 52
## 11 2011 40
## 12 2012 35
## 13 2009 16
## 14 2010 15
## 15 2008 5
## 16 2006 3
## 17 2007 3
## 18 2002 1
make_linechart(year3, year3$Year, year3$n, "# of vehicles by model year (sample 3)")
year4 <- sample4 |> count(Year, sort = TRUE)
year4
## Year n
## 1 2018 124
## 2 2017 121
## 3 2019 119
## 4 2014 100
## 5 2015 93
## 6 2021 89
## 7 2016 87
## 8 2013 64
## 9 2020 64
## 10 2022 52
## 11 2012 46
## 12 2011 35
## 13 2009 15
## 14 2010 8
## 15 2006 4
## 16 1988 2
## 17 2007 2
## 18 2008 2
## 19 1996 1
## 20 2000 1
make_linechart(year4, year4$Year, year4$n, "# of vehicles by model year (sample 4)")
year5 <- sample5 |> count(Year, sort = TRUE)
year5
## Year n
## 1 2018 132
## 2 2017 111
## 3 2019 103
## 4 2014 102
## 5 2016 96
## 6 2015 93
## 7 2021 89
## 8 2020 73
## 9 2013 68
## 10 2012 51
## 11 2022 34
## 12 2011 32
## 13 2009 17
## 14 2010 16
## 15 2008 6
## 16 2007 4
## 17 2000 1
## 18 2006 1
make_linechart(year5, year5$Year, year5$n, "# of vehicles by model year (sample 5)")
Seeing as there are 22 unique values for Year in the population, each of these samples has a few year values missing. Most of this is expected, since 2007 and earlier have very few values. Sample #2 has a sharper increase and sample #3 has a more gradual increase, but the general shapes of the lines in all samples are mostly the same: starting with a long tail, then increasing before gradually slowing down and decreasing at the end. India appears to have mostly newer 21st-century vehicles on its roads. It may be worth looking into how India has developed differently in comparison to the USA.
## Looking at the Location attribute
# Draw a horizontal bar chart of pre-computed counts, one bar per category.
#
# Arguments:
#   df       Data frame passed to ggplot() as the plot data.
#   df_x     Vector of categories, one per row of `df` (e.g. `df$Owner`, or
#            a reorder()-ed factor to control bar order).
#   df_y     Vector of bar lengths (the counts), one per row of `df`.
#   df_title Plot title.
#   x_label  Optional title for the category axis. When NULL (the default)
#            the title is the name of the first column of `df` whose values
#            equal `df_x`; if no column matches, the expression supplied for
#            `df_x` is used instead.
#
# Returns: a ggplot object.
make_barchart <- function(df, df_x, df_y, df_title, x_label = NULL) {
  if (is.null(x_label)) {
    # Passing `df_x` itself to labs(x = ) would use the category values as
    # the axis title, so recover a proper name for the axis instead.
    # Comparing as character lets a reorder()-ed factor still match its
    # source column.
    is_match <- vapply(
      df,
      function(column) identical(as.character(column), as.character(df_x)),
      logical(1)
    )
    x_label <- if (any(is_match)) {
      names(df)[which(is_match)[1]]
    } else {
      paste(deparse(substitute(df_x)), collapse = " ")
    }
  }

  ggplot(df, aes(x = df_x, y = df_y, group = 1)) +
    geom_col(width = 0.75) +
    labs(title = df_title, x = x_label, y = "# of occurrences") +
    # Flip so the category names run down the vertical axis; the small font
    # keeps long category lists legible.
    coord_flip() +
    theme(axis.text.y = element_text(size = 5))
}
loc1 <- sample1 |> count(Location, sort = TRUE)
loc1
## Location n
## 1 Delhi 172
## 2 Mumbai 157
## 3 Pune 71
## 4 Bangalore 65
## 5 Hyderabad 50
## 6 Lucknow 47
## 7 Ahmedabad 43
## 8 Chennai 42
## 9 Kolkata 29
## 10 Ludhiana 26
## 11 Patna 25
## 12 Mohali 19
## 13 Kanpur 17
## 14 Ranchi 16
## 15 Coimbatore 14
## 16 Varanasi 14
## 17 Chandigarh 13
## 18 Agra 12
## 19 Faridabad 12
## 20 Gurgaon 12
## 21 Zirakpur 12
## 22 Dehradun 10
## 23 Jaipur 9
## 24 Jalandhar 9
## 25 Nashik 9
## 26 Raipur 8
## 27 Surat 8
## 28 Thane 7
## 29 Karnal 6
## 30 Navi Mumbai 6
## 31 Panchkula 5
## 32 Ambala Cantt 4
## 33 Amritsar 4
## 34 Bhubaneswar 4
## 35 Indore 4
## 36 Mysore 4
## 37 Noida 4
## 38 Vadodara 4
## 39 Bhopal 3
## 40 Ernakulam 3
## 41 Guwahati 3
## 42 Jamshedpur 3
## 43 Kheda 3
## 44 Mangalore 3
## 45 Muzaffurpur 3
## 46 Nagpur 3
## 47 Salem 3
## 48 Dharwad 2
## 49 Gorakhpur 2
## 50 Haldwani 2
## 51 Meerut 2
## 52 Mirzapur 2
## 53 Panvel 2
## 54 Purnea 2
## 55 Roorkee 2
## 56 Warangal 2
## 57 Yamunanagar 2
## 58 Bulandshahar 1
## 59 Deoghar 1
## 60 Kharar 1
## 61 Kollam 1
## 62 Kota 1
## 63 Pimpri-Chinchwad 1
## 64 Rudrapur 1
## 65 Samastipur 1
## 66 Udupi 1
make_barchart(loc1, reorder(loc1$Location, -loc1$n), loc1$n, "# of vehicles by location (sample 1)")
loc2 <- sample2 |> count(Location, sort = TRUE)
loc2
## Location n
## 1 Mumbai 178
## 2 Delhi 153
## 3 Bangalore 72
## 4 Pune 64
## 5 Hyderabad 46
## 6 Chennai 40
## 7 Kolkata 38
## 8 Ahmedabad 36
## 9 Lucknow 32
## 10 Kanpur 30
## 11 Coimbatore 21
## 12 Patna 21
## 13 Mohali 19
## 14 Faridabad 18
## 15 Ludhiana 18
## 16 Ranchi 17
## 17 Dehradun 16
## 18 Gurgaon 14
## 19 Jaipur 13
## 20 Raipur 13
## 21 Thane 13
## 22 Agra 12
## 23 Varanasi 11
## 24 Noida 10
## 25 Ambala Cantt 9
## 26 Chandigarh 9
## 27 Jalandhar 9
## 28 Jamshedpur 8
## 29 Guwahati 7
## 30 Mysore 7
## 31 Zirakpur 7
## 32 Navi Mumbai 6
## 33 Meerut 4
## 34 Nashik 4
## 35 Salem 4
## 36 Surat 4
## 37 Indore 3
## 38 Karnal 3
## 39 Mirzapur 3
## 40 Rudrapur 3
## 41 Allahabad 2
## 42 Bhopal 2
## 43 Goa 2
## 44 Muzaffurpur 2
## 45 Panchkula 2
## 46 Purnea 2
## 47 Rohtak 2
## 48 Udupi 2
## 49 Unnao 2
## 50 Vadodara 2
## 51 Warangal 2
## 52 Amritsar 1
## 53 Bhubaneswar 1
## 54 Dak. Kannada 1
## 55 Deoghar 1
## 56 Faizabad 1
## 57 Ghaziabad 1
## 58 Gorakhpur 1
## 59 Kharar 1
## 60 Kollam 1
## 61 Kota 1
## 62 Samastipur 1
## 63 Yamunanagar 1
make_barchart(loc2, reorder(loc2$Location, -loc2$n), loc2$n, "# of vehicles by location (sample 2)")
loc3 <- sample3 |> count(Location, sort = TRUE)
loc3
## Location n
## 1 Delhi 159
## 2 Mumbai 156
## 3 Bangalore 71
## 4 Pune 61
## 5 Hyderabad 54
## 6 Lucknow 48
## 7 Ludhiana 35
## 8 Kolkata 34
## 9 Patna 32
## 10 Kanpur 29
## 11 Ahmedabad 26
## 12 Chennai 23
## 13 Mohali 18
## 14 Faridabad 17
## 15 Coimbatore 16
## 16 Chandigarh 15
## 17 Raipur 14
## 18 Noida 13
## 19 Dehradun 12
## 20 Gurgaon 12
## 21 Jaipur 11
## 22 Surat 11
## 23 Varanasi 11
## 24 Agra 10
## 25 Ranchi 10
## 26 Thane 9
## 27 Zirakpur 9
## 28 Karnal 8
## 29 Navi Mumbai 7
## 30 Jalandhar 6
## 31 Jamshedpur 6
## 32 Meerut 6
## 33 Nashik 6
## 34 Ambala Cantt 5
## 35 Guwahati 5
## 36 Vadodara 5
## 37 Bhopal 4
## 38 Mirzapur 4
## 39 Dharwad 3
## 40 Indore 3
## 41 Kheda 3
## 42 Mangalore 3
## 43 Mysore 3
## 44 Salem 3
## 45 Amritsar 2
## 46 Aurangabad 2
## 47 Dak. Kannada 2
## 48 Deoghar 2
## 49 Goa 2
## 50 Kharar 2
## 51 Panchkula 2
## 52 Panvel 2
## 53 Purnea 2
## 54 Udupi 2
## 55 Allahabad 1
## 56 Bulandshahar 1
## 57 Ernakulam 1
## 58 Ghaziabad 1
## 59 Kollam 1
## 60 Kota 1
## 61 Muzaffurpur 1
## 62 Nagpur 1
## 63 Ranga Reddy 1
## 64 Roorkee 1
## 65 Samastipur 1
## 66 Warangal 1
## 67 Yamunanagar 1
make_barchart(loc3, reorder(loc3$Location, -loc3$n), loc3$n, "# of vehicles by location (sample 3)")
loc4 <- sample4 |> count(Location, sort = TRUE)
loc4
## Location n
## 1 Mumbai 168
## 2 Delhi 154
## 3 Bangalore 72
## 4 Pune 68
## 5 Hyderabad 54
## 6 Chennai 34
## 7 Patna 34
## 8 Kolkata 31
## 9 Ludhiana 31
## 10 Ahmedabad 30
## 11 Lucknow 27
## 12 Kanpur 21
## 13 Faridabad 19
## 14 Coimbatore 16
## 15 Dehradun 15
## 16 Mohali 15
## 17 Chandigarh 14
## 18 Gurgaon 14
## 19 Ranchi 14
## 20 Thane 13
## 21 Varanasi 11
## 22 Jaipur 10
## 23 Ambala Cantt 9
## 24 Karnal 9
## 25 Raipur 9
## 26 Zirakpur 9
## 27 Jalandhar 8
## 28 Navi Mumbai 8
## 29 Goa 7
## 30 Noida 7
## 31 Agra 6
## 32 Guwahati 6
## 33 Surat 6
## 34 Aurangabad 5
## 35 Jamshedpur 5
## 36 Bhopal 4
## 37 Indore 4
## 38 Meerut 4
## 39 Nashik 4
## 40 Panchkula 4
## 41 Ernakulam 3
## 42 Gorakhpur 3
## 43 Mysore 3
## 44 Allahabad 2
## 45 Amritsar 2
## 46 Dharwad 2
## 47 Ghaziabad 2
## 48 Kota 2
## 49 Mirzapur 2
## 50 Muzaffurpur 2
## 51 Panvel 2
## 52 Pimpri-Chinchwad 2
## 53 Rudrapur 2
## 54 Salem 2
## 55 Unnao 2
## 56 Vadodara 2
## 57 Warangal 2
## 58 Yamunanagar 2
## 59 Bulandshahar 1
## 60 Deoghar 1
## 61 Faizabad 1
## 62 Haldwani 1
## 63 Kharar 1
## 64 Mangalore 1
## 65 Nagpur 1
## 66 Purnea 1
## 67 Ranga Reddy 1
## 68 Roorkee 1
## 69 Samastipur 1
make_barchart(loc4, reorder(loc4$Location, -loc4$n), loc4$n, "# of vehicles by location (sample 4)")
loc5 <- sample5 |> count(Location, sort = TRUE)
loc5
## Location n
## 1 Mumbai 181
## 2 Delhi 161
## 3 Pune 67
## 4 Bangalore 56
## 5 Hyderabad 55
## 6 Ahmedabad 43
## 7 Lucknow 38
## 8 Ludhiana 34
## 9 Patna 28
## 10 Chennai 27
## 11 Kolkata 25
## 12 Kanpur 21
## 13 Faridabad 19
## 14 Mohali 19
## 15 Agra 15
## 16 Gurgaon 14
## 17 Chandigarh 13
## 18 Ambala Cantt 12
## 19 Coimbatore 12
## 20 Dehradun 12
## 21 Jalandhar 12
## 22 Varanasi 12
## 23 Navi Mumbai 11
## 24 Ranchi 11
## 25 Jaipur 9
## 26 Karnal 9
## 27 Raipur 9
## 28 Thane 9
## 29 Zirakpur 8
## 30 Guwahati 6
## 31 Mysore 6
## 32 Mangalore 5
## 33 Yamunanagar 5
## 34 Bhopal 4
## 35 Jamshedpur 4
## 36 Surat 4
## 37 Nashik 3
## 38 Panvel 3
## 39 Samastipur 3
## 40 Vadodara 3
## 41 Aurangabad 2
## 42 Bhubaneswar 2
## 43 Dharwad 2
## 44 Goa 2
## 45 Haldwani 2
## 46 Indore 2
## 47 Kollam 2
## 48 Meerut 2
## 49 Mirzapur 2
## 50 Noida 2
## 51 Ranga Reddy 2
## 52 Salem 2
## 53 Udupi 2
## 54 Allahabad 1
## 55 Amritsar 1
## 56 Bulandshahar 1
## 57 Deoghar 1
## 58 Ghaziabad 1
## 59 Gorakhpur 1
## 60 Kharar 1
## 61 Kheda 1
## 62 Kota 1
## 63 Muzaffurpur 1
## 64 Nagpur 1
## 65 Panchkula 1
## 66 Pimpri-Chinchwad 1
## 67 Rohtak 1
## 68 Warangal 1
make_barchart(loc5, reorder(loc5$Location, -loc5$n), loc5$n, "# of vehicles by location (sample 5)")
As mentioned in the previous week, there is a very large number of different locations, so I will only look at the top five. Delhi, Mumbai, Pune, Bangalore, and Hyderabad are the five most frequent locations in every sample set (Lucknow ranks sixth in two of them), so it’s likely that finding a used vehicle is easier in these cities.
## Looking at the Number of Previous Owners
own1 <- sample1 |> count(Owner, sort = TRUE)
own1
## Owner n
## 1 First 815
## 2 Second 180
## 3 Third 18
## 4 UnRegistered Car 12
## 5 Fourth 3
## 6 4 or More 1
make_barchart(own1, own1$Owner, own1$n, "# of vehicles by previous owners (sample 1)")
own2 <- sample2 |> count(Owner, sort = TRUE)
own2
## Owner n
## 1 First 800
## 2 Second 198
## 3 Third 15
## 4 UnRegistered Car 13
## 5 4 or More 2
## 6 Fourth 1
make_barchart(own2, own2$Owner, own2$n, "# of vehicles by previous owners (sample 2)")
own3 <- sample3 |> count(Owner, sort = TRUE)
own3
## Owner n
## 1 First 814
## 2 Second 180
## 3 Third 18
## 4 UnRegistered Car 11
## 5 Fourth 4
## 6 4 or More 2
make_barchart(own3, own3$Owner, own3$n, "# of vehicles by previous owners (sample 3)")
own4 <- sample4 |> count(Owner, sort = TRUE)
own4
## Owner n
## 1 First 827
## 2 Second 169
## 3 Third 24
## 4 UnRegistered Car 7
## 5 4 or More 1
## 6 Fourth 1
make_barchart(own4, own4$Owner, own4$n, "# of vehicles by previous owners (sample 4)")
own5 <- sample5 |> count(Owner, sort = TRUE)
own5
## Owner n
## 1 First 807
## 2 Second 184
## 3 Third 29
## 4 UnRegistered Car 8
## 5 Fourth 1
make_barchart(own5, own5$Owner, own5$n, "# of vehicles by previous owners (sample 5)")
All sample sets appear to follow roughly the same shape: the vast majority of cars have one previous owner, with a sharp decrease in number for each successive owner. The “4 or More” value is absent from sample #5, which also contains only a single vehicle with four previous owners. There is also a small number of unregistered cars in each sample set. Because the vast majority of used cars still have only one previous owner, a used car chosen at random will very likely have one previous owner, with a smaller chance of more previous owners.
By drawing samples from a very large set of data, you always risk losing a value that occurs only very infrequently. Missing values can be dealt with either by removing them from the samples entirely or by manually including the missing rare values, but both approaches depend on why you’re sampling the data. If a model is built from samples with missing values, it will fail to capture those values when making predictions. Even with computers, programs, and algorithms becoming more advanced, there still needs to be a human element to effectively model and draw conclusions from statistics.