Data101Project1

For this project, I used a dataset I found on Kaggle (https://www.kaggle.com/lepchenkov/usedcarscatalog), which looks at 30 different aspects of used cars being offered for sale. The data were scraped from Belarus on December 2, 2019.

I chose to focus solely on a handful of variables from this dataset: manufacturer, model, year, transmission and color.

Some questions I had were: Which car manufacturers are worth more, used, on average? Is there a certain transmission type worth more than the other on average? Which years of cars are worth the most on average? Are specific colors of cars worth more than others on average? Most importantly, as someone who owns a used car, and would be highly interested in selling said used car, I wanted to see if my chances of selling my 2007 black Jeep Wrangler with a manual transmission for a reasonable price were good.

setwd("/cloud/project/")

install.packages("tidyverse")

## Installing package into '/home/rstudio-user/R/x86_64-pc-linux-gnu-library/4.0'
## (as 'lib' is unspecified)

Here, I load in the dataset using read_csv.

Before loading the dataset into RStudio Cloud, I removed the “location_region” column in Excel, because it contained random and strange symbols that I didn’t want to create issues when I tried to read it into RStudio, so the dataset now contains 29 variables.

library(tidyverse)

## ── Attaching packages ─────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.1
## ✓ tidyr   1.1.1     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(readr)
cars <- read_csv("cars2.csv")

## Parsed with column specification:
## cols(
##   .default = col_logical(),
##   manufacturer_name = col_character(),
##   model_name = col_character(),
##   transmission = col_character(),
##   color = col_character(),
##   odometer_value = col_double(),
##   year_produced = col_double(),
##   engine_fuel = col_character(),
##   engine_type = col_character(),
##   engine_capacity = col_double(),
##   body_type = col_character(),
##   state = col_character(),
##   drivetrain = col_character(),
##   price_usd = col_double(),
##   number_of_photos = col_double(),
##   up_counter = col_double(),
##   duration_listed = col_double()
## )

## See spec(...) for full column specifications.

Let’s take a look at the structure of the data.

str(cars)

## tibble [38,531 × 29] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ manufacturer_name: chr [1:38531] "Subaru" "Subaru" "Subaru" "Subaru" ...
##  $ model_name       : chr [1:38531] "Outback" "Outback" "Forester" "Impreza" ...
##  $ transmission     : chr [1:38531] "automatic" "automatic" "automatic" "mechanical" ...
##  $ color            : chr [1:38531] "silver" "blue" "red" "blue" ...
##  $ odometer_value   : num [1:38531] 190000 290000 402000 10000 280000 ...
##  $ year_produced    : num [1:38531] 2010 2002 2001 1999 2001 ...
##  $ engine_fuel      : chr [1:38531] "gasoline" "gasoline" "gasoline" "gasoline" ...
##  $ engine_has_gas   : logi [1:38531] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ engine_type      : chr [1:38531] "gasoline" "gasoline" "gasoline" "gasoline" ...
##  $ engine_capacity  : num [1:38531] 2.5 3 2.5 3 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ body_type        : chr [1:38531] "universal" "universal" "suv" "sedan" ...
##  $ has_warranty     : logi [1:38531] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ state            : chr [1:38531] "owned" "owned" "owned" "owned" ...
##  $ drivetrain       : chr [1:38531] "all" "all" "all" "all" ...
##  $ price_usd        : num [1:38531] 10900 5000 2800 9999 2134 ...
##  $ is_exchangeable  : logi [1:38531] FALSE TRUE TRUE TRUE TRUE TRUE ...
##  $ number_of_photos : num [1:38531] 9 12 4 9 14 20 8 7 17 8 ...
##  $ up_counter       : num [1:38531] 13 54 72 42 7 56 147 29 33 11 ...
##  $ feature_0        : logi [1:38531] FALSE FALSE FALSE TRUE FALSE FALSE ...
##  $ feature_1        : logi [1:38531] TRUE TRUE TRUE FALSE TRUE TRUE ...
##  $ feature_2        : logi [1:38531] TRUE FALSE FALSE FALSE FALSE FALSE ...
##  $ feature_3        : logi [1:38531] TRUE FALSE FALSE FALSE TRUE FALSE ...
##  $ feature_4        : logi [1:38531] FALSE TRUE FALSE FALSE TRUE FALSE ...
##  $ feature_5        : logi [1:38531] TRUE TRUE FALSE FALSE FALSE TRUE ...
##  $ feature_6        : logi [1:38531] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ feature_7        : logi [1:38531] TRUE FALSE FALSE FALSE FALSE TRUE ...
##  $ feature_8        : logi [1:38531] TRUE FALSE TRUE FALSE FALSE TRUE ...
##  $ feature_9        : logi [1:38531] TRUE TRUE TRUE FALSE TRUE TRUE ...
##  $ duration_listed  : num [1:38531] 16 83 151 86 7 67 307 73 87 43 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   manufacturer_name = col_character(),
##   ..   model_name = col_character(),
##   ..   transmission = col_character(),
##   ..   color = col_character(),
##   ..   odometer_value = col_double(),
##   ..   year_produced = col_double(),
##   ..   engine_fuel = col_character(),
##   ..   engine_has_gas = col_logical(),
##   ..   engine_type = col_character(),
##   ..   engine_capacity = col_double(),
##   ..   body_type = col_character(),
##   ..   has_warranty = col_logical(),
##   ..   state = col_character(),
##   ..   drivetrain = col_character(),
##   ..   price_usd = col_double(),
##   ..   is_exchangeable = col_logical(),
##   ..   number_of_photos = col_double(),
##   ..   up_counter = col_double(),
##   ..   feature_0 = col_logical(),
##   ..   feature_1 = col_logical(),
##   ..   feature_2 = col_logical(),
##   ..   feature_3 = col_logical(),
##   ..   feature_4 = col_logical(),
##   ..   feature_5 = col_logical(),
##   ..   feature_6 = col_logical(),
##   ..   feature_7 = col_logical(),
##   ..   feature_8 = col_logical(),
##   ..   feature_9 = col_logical(),
##   ..   duration_listed = col_double()
##   .. )

Let’s look at the first 6 rows.

head(cars)

## # A tibble: 6 x 29
##   manufacturer_na… model_name transmission color odometer_value year_produced
##   <chr>            <chr>      <chr>        <chr>          <dbl>         <dbl>
## 1 Subaru           Outback    automatic    silv…         190000          2010
## 2 Subaru           Outback    automatic    blue          290000          2002
## 3 Subaru           Forester   automatic    red           402000          2001
## 4 Subaru           Impreza    mechanical   blue           10000          1999
## 5 Subaru           Legacy     automatic    black         280000          2001
## 6 Subaru           Outback    automatic    silv…         132449          2011
## # … with 23 more variables: engine_fuel <chr>, engine_has_gas <lgl>,
## #   engine_type <chr>, engine_capacity <dbl>, body_type <chr>,
## #   has_warranty <lgl>, state <chr>, drivetrain <chr>, price_usd <dbl>,
## #   is_exchangeable <lgl>, number_of_photos <dbl>, up_counter <dbl>,
## #   feature_0 <lgl>, feature_1 <lgl>, feature_2 <lgl>, feature_3 <lgl>,
## #   feature_4 <lgl>, feature_5 <lgl>, feature_6 <lgl>, feature_7 <lgl>,
## #   feature_8 <lgl>, feature_9 <lgl>, duration_listed <dbl>

And the last 6 rows.

tail(cars)

## # A tibble: 6 x 29
##   manufacturer_na… model_name transmission color odometer_value year_produced
##   <chr>            <chr>      <chr>        <chr>          <dbl>         <dbl>
## 1 Chrysler         Voyager    mechanical   viol…         317000          1999
## 2 Chrysler         300        automatic    silv…         290000          2000
## 3 Chrysler         PT Cruiser mechanical   blue          321000          2004
## 4 Chrysler         300        automatic    blue          777957          2000
## 5 Chrysler         PT Cruiser mechanical   black          20000          2001
## 6 Chrysler         Voyager    automatic    silv…         297729          2000
## # … with 23 more variables: engine_fuel <chr>, engine_has_gas <lgl>,
## #   engine_type <chr>, engine_capacity <dbl>, body_type <chr>,
## #   has_warranty <lgl>, state <chr>, drivetrain <chr>, price_usd <dbl>,
## #   is_exchangeable <lgl>, number_of_photos <dbl>, up_counter <dbl>,
## #   feature_0 <lgl>, feature_1 <lgl>, feature_2 <lgl>, feature_3 <lgl>,
## #   feature_4 <lgl>, feature_5 <lgl>, feature_6 <lgl>, feature_7 <lgl>,
## #   feature_8 <lgl>, feature_9 <lgl>, duration_listed <dbl>

Next, I used dplyr to select the columns of manufacturer_name and price_usd, group by manufacturer_name, and take the means. I called this new data frame “df_manufacturer”.

df_manufacturer <- cars %>% 
select(manufacturer_name, price_usd) %>%
group_by(manufacturer_name) %>%
summarise_all(funs(mean))

## Warning: `funs()` is deprecated as of dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

df_manufacturer

## # A tibble: 55 x 2
##    manufacturer_name                  price_usd
##    <chr>                                  <dbl>
##  1 "_\xec__\xe3\x81_____\x9f\xe3\x88"      979.
##  2 "_\xa3_\x90_\x84"                      3443.
##  3 "_\xcd_\x90_\x84"                      1519.
##  4 "_\x84_\x90_\x84"                      1360.
##  5 "_\xf1_\x90_\x84"                      3890.
##  6 "Acura"                               12773.
##  7 "Alfa Romeo"                           2689.
##  8 "Audi"                                 7155.
##  9 "BMW"                                  9550.
## 10 "Buick"                               12876.
## # … with 45 more rows

For the purposes of my project, I wanted to see which manufacturers were worth more on average, so I arranged df_manufacturer in descening order.

df_manufacturer_desc <- df_manufacturer %>%
  arrange(desc(price_usd))
df_manufacturer_desc

## # A tibble: 55 x 2
##    manufacturer_name price_usd
##    <chr>                 <dbl>
##  1 Porsche              18630.
##  2 Jaguar               17813 
##  3 Lexus                17131.
##  4 Land Rover           15195.
##  5 Skoda                13795.
##  6 Infiniti             13795.
##  7 Mini                 13134.
##  8 Buick                12876.
##  9 Acura                12773.
## 10 Cadillac             11093.
## # … with 45 more rows

As we can see, the top 3 highest selling manufactureres on average are Porsche, Jaguar and Lexus. Sadly, Jeep was not one of the top 10 highest values. But it was number 11, which isn’t too shabby!

Next, I used dplyr again to select the columns of transmission and price_usd, group by transmission, and take the means. I called this new data frame “df_transmission”.

df_transmission <- cars %>% 
select(transmission, price_usd) %>%
group_by(transmission) %>%
summarise_all(funs(mean))
df_transmission

## # A tibble: 2 x 2
##   transmission price_usd
##   <chr>            <dbl>
## 1 automatic       10957.
## 2 mechanical       4468.

Right away, we can see that having a manual transmission isn’t ideal when selling a used car. Automatic transmissions, on average, sell for over twice as much!

Next, I used dplyr to select the columns of year_produced and price_usd, group by year_produced, and take the means. I called this new data frame “df_year”.

df_year <- cars %>% 
select(year_produced, price_usd) %>%
group_by(year_produced) %>%
summarise_all(funs(mean))
df_year

## # A tibble: 64 x 2
##    year_produced price_usd
##            <dbl>     <dbl>
##  1          1942    22012.
##  2          1950     2000 
##  3          1956     1000 
##  4          1959      800 
##  5          1960     4517.
##  6          1961     6500 
##  7          1962     4259.
##  8          1963     2331.
##  9          1964     3688.
## 10          1965     9773.
## # … with 54 more rows

Again, I used the arrange function to look at the years in order of descending price.

df_year_desc <- df_year %>%
  arrange(desc(price_usd))
df_year_desc

## # A tibble: 64 x 2
##    year_produced price_usd
##            <dbl>     <dbl>
##  1          2019    24007.
##  2          1942    22012.
##  3          2018    18546.
##  4          2017    17227.
##  5          2016    15906.
##  6          2015    15699.
##  7          2014    14937.
##  8          2013    13736.
##  9          2012    13308.
## 10          2011    11871.
## # … with 54 more rows

The values seem to follow a chronological pattern, with the newer the car the higher the value, with the exception of 1942 ranking second! Maybe I just have to wait a couple decades…

Next, I used dplyr again to select the columns of model_name and price_usd, group by model_name, and take the means. I called this new data frame “df_model”.

df_model <- cars %>% 
select(model_name, price_usd) %>%
group_by(model_name) %>%
summarise_all(funs(mean))
df_model

## # A tibble: 1,118 x 2
##    model_name                          price_usd
##    <chr>                                   <dbl>
##  1 "__\xe3\x80_\x9f_\xc7__\xe3\xf3"       18450 
##  2 "_\xe7___\xb1___\xe9\xe3\xeb"           6310.
##  3 "_\xa2_\xc1__\xe3\xf3_\x9f\xe3\x8f"      428.
##  4 "_\xcd_\xd6_\xe7"                       1944.
##  5 "_\xec20"                               5803.
##  6 "_\x9021"                              16632.
##  7 "_\x9022"                              17784.
##  8 "_\xec5"                               22875.
##  9 "100"                                   1559.
## 10 "1007"                                  4483.
## # … with 1,108 more rows

Assuming Wrangler wouldn’t be near the top, I saved myself some time by using the tail function, with n = 50, to find the average Wrangler value.

tail(df_model, n = 50)

## # A tibble: 50 x 2
##    model_name price_usd
##    <chr>          <dbl>
##  1 Vibe           5470.
##  2 Vision          950 
##  3 Vitara         8484.
##  4 Vito           8785.
##  5 Vivaro        10517.
##  6 Volt          15664.
##  7 Voyager        3781.
##  8 Wagon R        1626.
##  9 Windstar       2240 
## 10 Wrangler      18841.
## # … with 40 more rows

I wanted to see which models were the most valuable on average, so again I arranged them in descencing order of price_usd.

df_model_desc <- df_model %>%
  arrange(desc(price_usd))
df_model_desc

## # A tibble: 1,118 x 2
##    model_name price_usd
##    <chr>          <dbl>
##  1 F-Type        50000 
##  2 CLS63 AMG     49900 
##  3 GLC200        49416.
##  4 GL63          48500.
##  5 SLC200        48000 
##  6 V90           44800 
##  7 GLE350        43407.
##  8 GLE300        42375 
##  9 V250          41995.
## 10 Macan         40767.
## # … with 1,108 more rows

Next, I used dplyr to select the columns of color and price_usd, group by color, and take the means. I called this new data frame “df_color”.

df_color <- cars %>% 
select(color, price_usd) %>%
group_by(color) %>%
summarise_all(funs(mean))
df_color

## # A tibble: 12 x 2
##    color  price_usd
##    <chr>      <dbl>
##  1 black      9220.
##  2 blue       4801.
##  3 brown     12032.
##  4 green      3273.
##  5 grey       7684.
##  6 orange     7255.
##  7 other      5796.
##  8 red        4320.
##  9 silver     5919.
## 10 violet     3874.
## 11 white      8344.
## 12 yellow     4581.

I used arrange to list them in descending order of value, and hooray! Black was second highest value color on average. I didn’t know so many people loved brown cars…

df_color_desc <- df_color %>%
  arrange(desc(price_usd))
df_color_desc

## # A tibble: 12 x 2
##    color  price_usd
##    <chr>      <dbl>
##  1 brown     12032.
##  2 black      9220.
##  3 white      8344.
##  4 grey       7684.
##  5 orange     7255.
##  6 silver     5919.
##  7 other      5796.
##  8 blue       4801.
##  9 yellow     4581.
## 10 red        4320.
## 11 violet     3874.
## 12 green      3273.

Finally, in my own way of inaccurately calculating my car’s resale value, I added together the mean values of Jeep’s, Wrangler’s, 2007 cars, black cars and manual transmission cars, and found that on average, a car like my own would sell for $10,303.20! What a dream come true that would be.

mean_jeep <- 10912.4894
mean_wrangler <- 18840.7700
mean_2007 <- 8075.335
mean_black <- 9219.778
mean_manual <- 4467.64
sum(mean_jeep, mean_wrangler, mean_2007, mean_black, mean_manual)

## [1] 51516.01

sum(mean_jeep, mean_wrangler, mean_2007, mean_black, mean_manual) / 5

## [1] 10303.2