Exploring and understanding data ——————–

Data exploration example using used car data

This project performs exploratory data analysis (EDA)

to understand the structure, distribution, and variability

of a used car dataset before applying any modeling techniques.

The dataset is loaded from a CSV file and inspected using str()

to identify variable types and confirm which features are numeric.

Summary statistics are used to analyze key numeric variables

such as price, mileage, and year.

Measures of central tendency (mean and median) describe

typical values in the dataset.

Measures of spread (min, max, range, and IQR) help identify

variability and potential outliers in used car prices.

Quantiles and percentiles are calculated to produce a

five-number summary and detect extreme values (e.g., 1st and 99th percentiles).

Quintiles divide the dataset into five equal-sized groups,

making it easier to analyze the distribution of prices.

Boxplots and histograms are used to visualize the distribution of

used car prices and mileage, highlighting medians,

spread, skewness, and outliers.

These exploratory steps are essential for understanding

data quality and guiding feature selection and

future modeling decisions.

# working-directory helpers, left disabled; enable only if the CSV lives elsewhere
# getwd()
# setwd("C:/Users/npenaper/Documents")
# read the used car data, keeping text columns as character vectors
usedcars <- read.csv(file = "usedcars.csv", stringsAsFactors = FALSE)
# print the structure of the data frame: one line per column with its type
str(usedcars)
'data.frame':   150 obs. of  6 variables:
 $ year        : int  2011 2011 2011 2011 2012 2010 2011 2010 2011 2010 ...
 $ model       : chr  "SEL" "SEL" "SEL" "SEL" ...
 $ price       : int  21992 20995 19995 17809 17500 17495 17000 16995 16995 16995 ...
 $ mileage     : int  7413 10926 7351 11613 8367 25125 27393 21026 32655 36116 ...
 $ color       : chr  "Yellow" "Gray" "Silver" "Gray" ...
 $ transmission: chr  "AUTO" "AUTO" "AUTO" "AUTO" ...

Exploring numeric variables —–

# summary statistics (min, quartiles, median, mean, max) for the model year
summary(usedcars[["year"]])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   2000    2008    2009    2009    2010    2012 
# summarize price and mileage together, one column per variable
summary(usedcars[, c("price", "mileage")])
     price          mileage      
 Min.   : 3800   Min.   :  4867  
 1st Qu.:10995   1st Qu.: 27200  
 Median :13592   Median : 36385  
 Mean   :12962   Mean   : 44261  
 3rd Qu.:14904   3rd Qu.: 55125  
 Max.   :21992   Max.   :151479  
# mean of three example incomes, computed by hand: total divided by count
sum(36000, 44000, 56000) / 3
[1] 45333.33
# the same mean via the built-in function
mean(x = c(36000, 44000, 56000))
[1] 45333.33
# median (middle value) of the three example incomes
median(x = c(36000, 44000, 56000))
[1] 44000
# smallest and largest used car price, returned as a length-two vector
range(usedcars[["price"]])
[1]  3800 21992
# width of the price range: maximum minus minimum
diff(range(usedcars[["price"]]))
[1] 18192
# interquartile range of price: 3rd quartile minus 1st quartile
IQR(usedcars[["price"]])
[1] 3909.5
# five-number summary of price: quantile() defaults to 0%, 25%, 50%, 75%, 100%
quantile(usedcars[["price"]])
     0%     25%     50%     75%    100% 
 3800.0 10995.0 13591.5 14904.5 21992.0 
# the 1st and 99th percentiles of price, marking the extreme tails
quantile(usedcars[["price"]], probs = c(0.01, 0.99))
      1%      99% 
 5428.69 20505.00 
# quintiles of price: cut points at every 20% from 0% to 100%
quantile(usedcars[["price"]], probs = seq(0, 1, by = 0.2))
     0%     20%     40%     60%     80%    100% 
 3800.0 10759.4 12993.8 13992.0 14999.0 21992.0 
# boxplot of used car prices
boxplot(
  usedcars[["price"]],
  main = "Boxplot of Used Car Prices",
  ylab = "Price ($)"
)


# boxplot of used car mileage
boxplot(
  usedcars[["mileage"]],
  main = "Boxplot of Used Car Mileage",
  ylab = "Odometer (mi.)"
)

# histogram of used car prices
hist(
  usedcars[["price"]],
  main = "Histogram of Used Car Prices",
  xlab = "Price ($)"
)


# histogram of used car mileage
hist(
  usedcars[["mileage"]],
  main = "Histogram of Used Car Mileage",
  xlab = "Odometer (mi.)"
)

# sample variance of used car prices
var(usedcars[["price"]])
[1] 9749892
# standard deviation of used car prices
sd(usedcars[["price"]])
[1] 3122.482
# sample variance of used car mileage
var(usedcars[["mileage"]])
[1] 728033954
# standard deviation of used car mileage
sd(usedcars[["mileage"]])
[1] 26982.1

Exploring categorical variables —–

# one-way frequency table: number of cars per model year
table(usedcars[["year"]])

2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 
   3    1    1    1    3    2    6   11   14   42   49   16    1 
# one-way frequency table: number of cars per model
table(usedcars[["model"]])

 SE SEL SES 
 78  23  49 
# one-way frequency table: number of cars per color
table(usedcars[["color"]])

 Black   Blue   Gold   Gray  Green    Red Silver  White Yellow 
    35     17      1     16      5     25     32     16      3 
# share of each model: counts converted to proportions of the whole table
model_table <- table(usedcars[["model"]])
prop.table(x = model_table)

       SE       SEL       SES 
0.5200000 0.1533333 0.3266667 
# share of each color as a percentage, shown to one decimal place
color_table <- table(usedcars[["color"]])
color_pct <- 100 * prop.table(color_table)
round(color_pct, 1)

 Black   Blue   Gold   Gray  Green    Red Silver  White Yellow 
  23.3   11.3    0.7   10.7    3.3   16.7   21.3   10.7    2.0 

Exploring relationships between variables —–

# scatterplot with mileage on the x-axis and price on the y-axis
plot(
  x = usedcars[["mileage"]],
  y = usedcars[["price"]],
  main = "Scatterplot of Price vs. Mileage",
  xlab = "Used Car Odometer (mi.)",
  ylab = "Used Car Price ($)"
)

# logical flag: TRUE when the car's color is Black, Gray, Silver, or White
usedcars[["conservative"]] <-
  usedcars[["color"]] %in% c("Black", "Gray", "Silver", "White")
# sanity check: count of cars with and without a conservative color
table(usedcars[["conservative"]])

FALSE  TRUE 
   51    99 
# one-time setup, left disabled: install gmodels if it is not yet available
#install.packages("gmodels")
# two-way cross-tabulation: car model in rows, conservative-color flag in columns
library(gmodels)
# NOTE: the printed table labels its axes "usedcars$model" and
# "usedcars$conservative", so rewording these arguments changes the output
CrossTable(x = usedcars$model, y = usedcars$conservative)

 
   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  150 

 
               | usedcars$conservative 
usedcars$model |     FALSE |      TRUE | Row Total | 
---------------|-----------|-----------|-----------|
            SE |        27 |        51 |        78 | 
               |     0.009 |     0.004 |           | 
               |     0.346 |     0.654 |     0.520 | 
               |     0.529 |     0.515 |           | 
               |     0.180 |     0.340 |           | 
---------------|-----------|-----------|-----------|
           SEL |         7 |        16 |        23 | 
               |     0.086 |     0.044 |           | 
               |     0.304 |     0.696 |     0.153 | 
               |     0.137 |     0.162 |           | 
               |     0.047 |     0.107 |           | 
---------------|-----------|-----------|-----------|
           SES |        17 |        32 |        49 | 
               |     0.007 |     0.004 |           | 
               |     0.347 |     0.653 |     0.327 | 
               |     0.333 |     0.323 |           | 
               |     0.113 |     0.213 |           | 
---------------|-----------|-----------|-----------|
  Column Total |        51 |        99 |       150 | 
               |     0.340 |     0.660 |           | 
---------------|-----------|-----------|-----------|

 
LS0tDQp0aXRsZTogIkV4cGxvcmluZyBhbmQgVW5kZXJzdGFuZGluZyBEYXRhIHdpdGggUiINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCiMjIyMjIEV4cGxvcmluZyBhbmQgdW5kZXJzdGFuZGluZyBkYXRhIC0tLS0tLS0tLS0tLS0tLS0tLS0tDQoNCiMjIGRhdGEgZXhwbG9yYXRpb24gZXhhbXBsZSB1c2luZyB1c2VkIGNhciBkYXRhDQoNCg0KIyBUaGlzIHByb2plY3QgcGVyZm9ybXMgZXhwbG9yYXRvcnkgZGF0YSBhbmFseXNpcyAoRURBKQ0KIyB0byB1bmRlcnN0YW5kIHRoZSBzdHJ1Y3R1cmUsIGRpc3RyaWJ1dGlvbiwgYW5kIHZhcmlhYmlsaXR5DQojIG9mIGEgdXNlZCBjYXIgZGF0YXNldCBiZWZvcmUgYXBwbHlpbmcgYW55IG1vZGVsaW5nIHRlY2huaXF1ZXMuDQoNCiMgVGhlIGRhdGFzZXQgaXMgbG9hZGVkIGZyb20gYSBDU1YgZmlsZSBhbmQgaW5zcGVjdGVkIHVzaW5nIHN0cigpDQojIHRvIGlkZW50aWZ5IHZhcmlhYmxlIHR5cGVzIGFuZCBjb25maXJtIHdoaWNoIGZlYXR1cmVzIGFyZSBudW1lcmljLg0KDQojIFN1bW1hcnkgc3RhdGlzdGljcyBhcmUgdXNlZCB0byBhbmFseXplIGtleSBudW1lcmljIHZhcmlhYmxlcw0KIyMjIHN1Y2ggYXMgcHJpY2UsIG1pbGVhZ2UsIGFuZCB5ZWFyLg0KDQojIyMgTWVhc3VyZXMgb2YgY2VudHJhbCB0ZW5kZW5jeSAobWVhbiBhbmQgbWVkaWFuKSBkZXNjcmliZQ0KIyMjIHR5cGljYWwgdmFsdWVzIGluIHRoZSBkYXRhc2V0Lg0KDQojIyMgTWVhc3VyZXMgb2Ygc3ByZWFkIChtaW4sIG1heCwgcmFuZ2UsIGFuZCBJUVIpIGhlbHAgaWRlbnRpZnkNCiMjIyB2YXJpYWJpbGl0eSBhbmQgcG90ZW50aWFsIG91dGxpZXJzIGluIHVzZWQgY2FyIHByaWNlcy4NCg0KIyMjIFF1YW50aWxlcyBhbmQgcGVyY2VudGlsZXMgYXJlIGNhbGN1bGF0ZWQgdG8gcHJvZHVjZSBhDQojIyMgZml2ZS1udW1iZXIgc3VtbWFyeSBhbmQgZGV0ZWN0IGV4dHJlbWUgdmFsdWVzIChlLmcuLCAxc3QgYW5kIDk5dGggcGVyY2VudGlsZXMpLg0KDQojIyMgUXVpbnRpbGVzIGRpdmlkZSB0aGUgZGF0YXNldCBpbnRvIGVxdWFsLXNpemVkIGdyb3VwcywNCiMjIyBtYWtpbmcgaXQgZWFzaWVyIHRvIGFuYWx5emUgdGhlIGRpc3RyaWJ1dGlvbiBvZiBwcmljZXMuDQoNCiMjIyBCb3hwbG90cyBhcmUgdXNlZCB0byB2aXN1YWxpemUgdGhlIGRpc3RyaWJ1dGlvbiBvZg0KIyMjIHVzZWQgY2FyIHByaWNlcyBhbmQgbWlsZWFnZSwgaGlnaGxpZ2h0aW5nIG1lZGlhbnMsDQojIyMgc3ByZWFkLCBza2V3bmVzcywgYW5kIG91dGxpZXJzLg0KDQojIyMgVGhlc2UgZXhwbG9yYXRvcnkgc3RlcHMgYXJlIGVzc2VudGlhbCBmb3IgdW5kZXJzdGFuZGluZw0KIyMjIGRhdGEgcXVhbGl0eSBhbmQgZ3VpZGluZyBmZWF0dXJlIHNlbGVjdGlvbiBhbmQNCiMjIyBmdXR1cmUgbW9kZWxpbmcgZGVjaXNpb25zLg0KDQpgYGB7cn0NCiNnZXR3ZCgpDQpgYGANCg0KYGBge3J9DQojc2V0d2QoIkM6L1VzZXJzL25wZW5hcGVyL0RvY3VtZW50cyIpDQpgYGAN
Cg0KDQpgYGB7cn0NCnVzZWRjYXJzIDwtIHJlYWQuY3N2KCJ1c2VkY2Fycy5jc3YiLCBzdHJpbmdzQXNGYWN0b3JzID0gRkFMU0UpDQpgYGANCg0KDQpgYGB7cn0NCiMgZ2V0IHN0cnVjdHVyZSBvZiB1c2VkIGNhciBkYXRhDQpzdHIodXNlZGNhcnMpDQpgYGANCg0KDQojIyBFeHBsb3JpbmcgbnVtZXJpYyB2YXJpYWJsZXMgLS0tLS0NCg0KYGBge3J9DQojIHN1bW1hcml6ZSBudW1lcmljIHZhcmlhYmxlcw0Kc3VtbWFyeSh1c2VkY2FycyR5ZWFyKQ0Kc3VtbWFyeSh1c2VkY2Fyc1tjKCJwcmljZSIsICJtaWxlYWdlIildKQ0KYGBgDQoNCg0KYGBge3J9DQojIGNhbGN1bGF0ZSB0aGUgbWVhbiBpbmNvbWUNCigzNjAwMCArIDQ0MDAwICsgNTYwMDApIC8gMw0KbWVhbihjKDM2MDAwLCA0NDAwMCwgNTYwMDApKQ0KYGBgDQoNCg0KYGBge3J9DQojIHRoZSBtZWRpYW4gaW5jb21lDQptZWRpYW4oYygzNjAwMCwgNDQwMDAsIDU2MDAwKSkNCmBgYA0KDQoNCmBgYHtyfQ0KIyB0aGUgbWluL21heCBvZiB1c2VkIGNhciBwcmljZXMNCnJhbmdlKHVzZWRjYXJzJHByaWNlKQ0KYGBgDQoNCg0KYGBge3J9DQojIHRoZSBkaWZmZXJlbmNlIG9mIHRoZSByYW5nZQ0KZGlmZihyYW5nZSh1c2VkY2FycyRwcmljZSkpDQpgYGANCg0KDQpgYGB7cn0NCiMgSVFSIGZvciB1c2VkIGNhciBwcmljZXMNCklRUih1c2VkY2FycyRwcmljZSkNCmBgYA0KDQoNCmBgYHtyfQ0KIyB1c2UgcXVhbnRpbGUgdG8gY2FsY3VsYXRlIGZpdmUtbnVtYmVyIHN1bW1hcnkNCnF1YW50aWxlKHVzZWRjYXJzJHByaWNlKQ0KYGBgDQoNCg0KYGBge3J9DQojIHRoZSA5OXRoIHBlcmNlbnRpbGUNCnF1YW50aWxlKHVzZWRjYXJzJHByaWNlLCBwcm9icyA9IGMoMC4wMSwgMC45OSkpDQpgYGANCg0KDQpgYGB7cn0NCiMgcXVpbnRpbGVzDQpxdWFudGlsZSh1c2VkY2FycyRwcmljZSwgc2VxKGZyb20gPSAwLCB0byA9IDEsIGJ5ID0gMC4yMCkpDQpgYGANCg0KDQpgYGB7cn0NCiMgYm94cGxvdCBvZiB1c2VkIGNhciBwcmljZXMgYW5kIG1pbGVhZ2UNCmJveHBsb3QodXNlZGNhcnMkcHJpY2UsIG1haW49IkJveHBsb3Qgb2YgVXNlZCBDYXIgUHJpY2VzIiwNCiAgICAgIHlsYWI9IlByaWNlICgkKSIpDQoNCmJveHBsb3QodXNlZGNhcnMkbWlsZWFnZSwgbWFpbj0iQm94cGxvdCBvZiBVc2VkIENhciBNaWxlYWdlIiwNCiAgICAgIHlsYWI9Ik9kb21ldGVyIChtaS4pIikNCmBgYA0KDQoNCmBgYHtyfQ0KIyBoaXN0b2dyYW1zIG9mIHVzZWQgY2FyIHByaWNlcyBhbmQgbWlsZWFnZQ0KaGlzdCh1c2VkY2FycyRwcmljZSwgbWFpbiA9ICJIaXN0b2dyYW0gb2YgVXNlZCBDYXIgUHJpY2VzIiwNCiAgICAgeGxhYiA9ICJQcmljZSAoJCkiKQ0KDQpoaXN0KHVzZWRjYXJzJG1pbGVhZ2UsIG1haW4gPSAiSGlzdG9ncmFtIG9mIFVzZWQgQ2FyIE1pbGVhZ2UiLA0KICAgICB4bGFiID0gIk9kb21ldGVyIChtaS4pIikNCmBgYA0KDQoNCmBgYHtyfQ0KIyB2YXJpYW5jZSBhbmQgc3RhbmRhcmQgZGV2aWF0aW9uIG9mIHRo
ZSB1c2VkIGNhciBkYXRhDQp2YXIodXNlZGNhcnMkcHJpY2UpDQpzZCh1c2VkY2FycyRwcmljZSkNCnZhcih1c2VkY2FycyRtaWxlYWdlKQ0Kc2QodXNlZGNhcnMkbWlsZWFnZSkNCmBgYA0KDQoNCiMjIEV4cGxvcmluZyBudW1lcmljIHZhcmlhYmxlcyAtLS0tLQ0KDQpgYGB7cn0NCiMgb25lLXdheSB0YWJsZXMgZm9yIHRoZSB1c2VkIGNhciBkYXRhDQp0YWJsZSh1c2VkY2FycyR5ZWFyKQ0KdGFibGUodXNlZGNhcnMkbW9kZWwpDQp0YWJsZSh1c2VkY2FycyRjb2xvcikNCmBgYA0KDQoNCg0KYGBge3J9DQojIGNvbXB1dGUgdGFibGUgcHJvcG9ydGlvbnMNCm1vZGVsX3RhYmxlIDwtIHRhYmxlKHVzZWRjYXJzJG1vZGVsKQ0KcHJvcC50YWJsZShtb2RlbF90YWJsZSkNCmBgYA0KDQoNCmBgYHtyfQ0KIyByb3VuZCB0aGUgZGF0YQ0KY29sb3JfdGFibGUgPC0gdGFibGUodXNlZGNhcnMkY29sb3IpDQpjb2xvcl9wY3QgPC0gcHJvcC50YWJsZShjb2xvcl90YWJsZSkgKiAxMDANCnJvdW5kKGNvbG9yX3BjdCwgZGlnaXRzID0gMSkNCmBgYA0KDQoNCiMjIEV4cGxvcmluZyByZWxhdGlvbnNoaXBzIGJldHdlZW4gdmFyaWFibGVzIC0tLS0tDQoNCmBgYHtyfQ0KIyBzY2F0dGVycGxvdCBvZiBwcmljZSB2cy4gbWlsZWFnZQ0KcGxvdCh4ID0gdXNlZGNhcnMkbWlsZWFnZSwgeSA9IHVzZWRjYXJzJHByaWNlLA0KICAgICBtYWluID0gIlNjYXR0ZXJwbG90IG9mIFByaWNlIHZzLiBNaWxlYWdlIiwNCiAgICAgeGxhYiA9ICJVc2VkIENhciBPZG9tZXRlciAobWkuKSIsDQogICAgIHlsYWIgPSAiVXNlZCBDYXIgUHJpY2UgKCQpIikNCmBgYA0KDQoNCmBgYHtyfQ0KIyBuZXcgdmFyaWFibGUgaW5kaWNhdGluZyBjb25zZXJ2YXRpdmUgY29sb3JzDQp1c2VkY2FycyRjb25zZXJ2YXRpdmUgPC0NCiAgdXNlZGNhcnMkY29sb3IgJWluJSBjKCJCbGFjayIsICJHcmF5IiwgIlNpbHZlciIsICJXaGl0ZSIpDQpgYGANCg0KDQoNCmBgYHtyfQ0KIyBjaGVja2luZyBvdXIgdmFyaWFibGUNCnRhYmxlKHVzZWRjYXJzJGNvbnNlcnZhdGl2ZSkNCmBgYA0KDQpgYGB7cn0NCiNpbnN0YWxsLnBhY2thZ2VzKCJnbW9kZWxzIikNCmBgYA0KDQoNCmBgYHtyfQ0KIyBDcm9zc3RhYiBvZiBjb25zZXJ2YXRpdmUgYnkgbW9kZWwNCmxpYnJhcnkoZ21vZGVscykNCkNyb3NzVGFibGUoeCA9IHVzZWRjYXJzJG1vZGVsLCB5ID0gdXNlZGNhcnMkY29uc2VydmF0aXZlKQ0KYGBgDQoNCiANCg0K