Exploring and Understanding Data with R

Exploring and understanding data ——————–

data exploration example using used car data

getwd()

## [1] "C:/Users/maria/OneDrive/Desktop/Data Mining - Spring 24'"

# Folder that holds the course data. Building the full path with file.path()
# replaces the original setwd() call, so loading the data no longer changes the
# session's working directory as a side effect.
data_dir <- "C:/Users/maria/OneDrive/Desktop/Data Mining - Spring 24'"

# import the data set and assign it to the variable usedcars
usedcars <- read.csv(file.path(data_dir, "usedcars.csv"),
                     stringsAsFactors = FALSE)

# compact overview of the data frame: dimensions, column names and column types
str(object = usedcars)

## 'data.frame':    150 obs. of  6 variables:
##  $ year        : int  2011 2011 2011 2011 2012 2010 2011 2010 2011 2010 ...
##  $ model       : chr  "SEL" "SEL" "SEL" "SEL" ...
##  $ price       : int  21992 20995 19995 17809 17500 17495 17000 16995 16995 16995 ...
##  $ mileage     : int  7413 10926 7351 11613 8367 25125 27393 21026 32655 36116 ...
##  $ color       : chr  "Yellow" "Gray" "Silver" "Gray" ...
##  $ transmission: chr  "AUTO" "AUTO" "AUTO" "AUTO" ...

Exploring numeric variables —–

# minimum, quartiles, mean and maximum of the model year
summary(usedcars[["year"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2000    2008    2009    2009    2010    2012

# the same summary for price and mileage, side by side
summary(usedcars[, c("price", "mileage")])

##      price          mileage      
##  Min.   : 3800   Min.   :  4867  
##  1st Qu.:10995   1st Qu.: 27200  
##  Median :13592   Median : 36385  
##  Mean   :12962   Mean   : 44261  
##  3rd Qu.:14904   3rd Qu.: 55125  
##  Max.   :21992   Max.   :151479

# mean income for the three salaries 36,000, 44,000 and 56,000
mean(x = c(36000, 44000, 56000))

## [1] 45333.33

# median income for the same three salaries
median(x = c(36000, 44000, 56000))

## [1] 44000

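#in this case the mean (45,333) sits above the median (44,000) because it is pulled up by the highest salary; the median is the more robust measure since it is not affected by extreme values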

# lowest and highest used car price
range(usedcars[["price"]])

## [1]  3800 21992

# width of the price range: most expensive car minus cheapest car
max(usedcars$price) - min(usedcars$price)

## [1] 18192

#The range gives a quick sense of the overall spread of the inventory prices, but it depends only on the two most extreme values

# interquartile range (Q3 - Q1) of the used car prices
with(usedcars, IQR(price))

## [1] 3909.5

#The interquartile range explains the spread between the first quartile and third quartile of a data set

# five-number summary: minimum, Q1, median, Q3 and maximum
quantile(usedcars[["price"]])

##      0%     25%     50%     75%    100% 
##  3800.0 10995.0 13591.5 14904.5 21992.0

#This is a quick overview of the data spread: the five-number summary gives the lowest and highest car prices, the first and third quartiles, and the median.

# the 1st and 99th percentiles of price
with(usedcars, quantile(price, probs = c(0.01, 0.99)))

##       1%      99% 
##  5428.69 20505.00

#These 2 numbers mark the extreme tails of the data: 1% of the prices fall below 5,428.69 and 1% fall above 20,505.00

# quintiles: cut points at every 20% of the price distribution
quantile(usedcars[["price"]], probs = seq(0, 1, by = 0.20))

##      0%     20%     40%     60%     80%    100% 
##  3800.0 10759.4 12993.8 13992.0 14999.0 21992.0

#Quintiles are quantiles that split the data into five equal groups (steps of 20% instead of the 25% steps of quartiles), which gives a slightly more detailed summary of the data spread

#depict the boxplots for the numerical variables price and mileage

# boxplots of used car price and of used car mileage
boxplot(
  usedcars[["price"]],
  ylab = "Price ($)",                  # y-axis label
  main = "Boxplot of Used Car Prices"  # plot title
)

boxplot(
  usedcars[["mileage"]],
  ylab = "Odometer (mi.)",              # y-axis label
  main = "Boxplot of Used Car Mileage"  # plot title
)

#Histograms show the distribution of the numerical variables in a data set

# histogram of used car prices (the mileage histogram follows below)
hist(
  usedcars[["price"]],
  xlab = "Price ($)",                    # x-axis label
  main = "Histogram of Used Car Prices"  # plot title
)

#in this first histogram we can see that most of the prices fall within the 12,000 to 15,000 range

# histogram of used car mileage
hist(
  usedcars[["mileage"]],
  xlab = "Odometer (mi.)",                # x-axis label
  main = "Histogram of Used Car Mileage"  # plot title
)

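#the histogram is right-skewed: most cars have fewer than about 55,000 miles (the third quartile is 55,125), while a few high-mileage cars stretch the tail up to 151,479. No car is nearly new, since the minimum mileage is 4,867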

# variance and standard deviation of the car prices. 

# variance: roughly the average squared deviation of the prices from their mean
var(usedcars[["price"]])

## [1] 9749892

# standard deviation: the square root of the variance, so it is expressed in
# the same unit as the prices themselves
sd(usedcars[["price"]])

## [1] 3122.482

# variance and standard deviation of the mileage
# the variance is in squared miles, so it is hard to read on the original scale
var(usedcars[["mileage"]])

## [1] 728033954

# the standard deviation is in miles, the same unit as the data, which makes it
# the easier measure of variability to interpret
sd(usedcars[["mileage"]])

## [1] 26982.1

Exploring categorical variables —–

# one-way frequency tables for the used car data
table(usedcars[["year"]]) # count of used cars for each model year

## 
## 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 
##    3    1    1    1    3    2    6   11   14   42   49   16    1

table(usedcars[["model"]]) # count of cars for each model

## 
##  SE SEL SES 
##  78  23  49

table(usedcars[["color"]]) # count of available cars for each color

## 
##  Black   Blue   Gold   Gray  Green    Red Silver  White Yellow 
##     35     17      1     16      5     25     32     16      3

# share of each model: every count divided by the total number of cars
model_table <- table(usedcars[["model"]])
model_table / sum(model_table)

## 
##        SE       SEL       SES 
## 0.5200000 0.1533333 0.3266667

#this table shows the percentage of cars per model in the data set, the most common model in the entire inventory is SE(52%)

# color shares expressed as percentages, rounded to one decimal place
color_table <- table(usedcars[["color"]])
color_pct <- 100 * prop.table(color_table)
round(color_pct, 1)

## 
##  Black   Blue   Gold   Gray  Green    Red Silver  White Yellow 
##   23.3   11.3    0.7   10.7    3.3   16.7   21.3   10.7    2.0

# this table shows the percentage of cars in the data set per color, the most common color is Black

Exploring relationships between variables —–

# scatterplot with mileage on the x-axis and price on the y-axis
with(
  usedcars,
  plot(
    x = mileage,
    y = price,
    xlab = "Used Car Odometer (mi.)",
    ylab = "Used Car Price ($)",
    main = "Scatterplot of Price vs. Mileage"
  )
)

#here price is treated as the dependent variable and mileage as the independent variable. The plot shows a negative relationship: cars with less mileage tend to be more expensive.

# logical vector flagging cars painted in a conservative color
usedcarsconservative <- is.element(
  usedcars$color,
  c("Black", "Gray", "Silver", "White")
)

print(usedcarsconservative) # TRUE marks a car with a conservative color

##   [1] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [13] FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [25] FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE
##  [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE
##  [49]  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [61] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE
##  [73]  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE
##  [85]  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [97] FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE
## [109] FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE
## [121]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE
## [133]  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE
## [145]  TRUE  TRUE FALSE FALSE FALSE FALSE

# checking our variable
# The flag was stored in the standalone vector usedcarsconservative, not as a
# column of usedcars, so usedcars$conservative is NULL and tabulating it gives
# an empty table. Tabulate the vector that actually exists.
table(usedcarsconservative) #there are 99 cars with a conservative color

## usedcarsconservative
## FALSE  TRUE 
##    51    99

# Crosstab of conservative by model
# CrossTable() is provided by the gmodels package loaded on the next line.
# The row and column headers of the printed table repeat the argument
# expressions (the output shows "usedcars$model" and "usedcarsconservative"),
# so the arguments are intentionally left exactly as written.
library(gmodels)
CrossTable(x = usedcars$model, y = usedcarsconservative)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  150 
## 
##  
##                | usedcarsconservative 
## usedcars$model |     FALSE |      TRUE | Row Total | 
## ---------------|-----------|-----------|-----------|
##             SE |        27 |        51 |        78 | 
##                |     0.009 |     0.004 |           | 
##                |     0.346 |     0.654 |     0.520 | 
##                |     0.529 |     0.515 |           | 
##                |     0.180 |     0.340 |           | 
## ---------------|-----------|-----------|-----------|
##            SEL |         7 |        16 |        23 | 
##                |     0.086 |     0.044 |           | 
##                |     0.304 |     0.696 |     0.153 | 
##                |     0.137 |     0.162 |           | 
##                |     0.047 |     0.107 |           | 
## ---------------|-----------|-----------|-----------|
##            SES |        17 |        32 |        49 | 
##                |     0.007 |     0.004 |           | 
##                |     0.347 |     0.653 |     0.327 | 
##                |     0.333 |     0.323 |           | 
##                |     0.113 |     0.213 |           | 
## ---------------|-----------|-----------|-----------|
##   Column Total |        51 |        99 |       150 | 
##                |     0.340 |     0.660 |           | 
## ---------------|-----------|-----------|-----------|
## 
##

#the first number in each cell is the count of cars (N) for that model and color group. The counts add up to 150 cars
#the second number is that cell's contribution to the Chi-square statistic
#the third, fourth and fifth numbers are the cell count (27 in the first cell) divided by the row total (78), the column total (51) and the table total (150)