# Exploring and understanding data --------------------

# data exploration example using used car data

# Load the used car data.
# The CSV is looked up first at the original, machine-specific location;
# when that file does not exist, "usedcars.csv" in the current working
# directory (see getwd()) is used instead, so the script also runs on
# machines other than the author's.
usedcars_path <- file.path(
  "C:/Users/User/OneDrive/Desktop",
  "1 - STU - DATA ANALYTICS", "7 - DMML", "usedcars.csv"
)
if (!file.exists(usedcars_path)) {
  usedcars_path <- "usedcars.csv"
}
if (!file.exists(usedcars_path)) {
  # Fail with an actionable message rather than read.csv()'s generic
  # "cannot open file" error.
  stop(
    "usedcars.csv not found; copy it into the working directory: ",
    getwd(),
    call. = FALSE
  )
}
# stringsAsFactors = FALSE keeps text columns as character vectors.
usedcars <- read.csv(usedcars_path, stringsAsFactors = FALSE)
# get structure of used car data
str(usedcars)
## 'data.frame':    150 obs. of  6 variables:
##  $ year        : int  2011 2011 2011 2011 2012 2010 2011 2010 2011 2010 ...
##  $ model       : chr  "SEL" "SEL" "SEL" "SEL" ...
##  $ price       : int  21992 20995 19995 17809 17500 17495 17000 16995 16995 16995 ...
##  $ mileage     : int  7413 10926 7351 11613 8367 25125 27393 21026 32655 36116 ...
##  $ color       : chr  "Yellow" "Gray" "Silver" "Gray" ...
##  $ transmission: chr  "AUTO" "AUTO" "AUTO" "AUTO" ...

# Exploring numeric variables -----

# numeric summaries: a single column, then two columns at once
summary(usedcars[["year"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2000    2008    2009    2009    2010    2012
summary(usedcars[, c("price", "mileage")])
##      price          mileage      
##  Min.   : 3800   Min.   :  4867  
##  1st Qu.:10995   1st Qu.: 27200  
##  Median :13592   Median : 36385  
##  Mean   :12962   Mean   : 44261  
##  3rd Qu.:14904   3rd Qu.: 55125  
##  Max.   :21992   Max.   :151479
# mean income computed by hand ...
(36000 + 44000 + 56000) / 3
## [1] 45333.33
# ... and with the built-in mean()
mean(c(36000, 44000, 56000))
## [1] 45333.33
# median income of the same three values
median(c(36000, 44000, 56000))
## [1] 44000
# smallest and largest used car price
range(usedcars[["price"]])
## [1]  3800 21992
# width of the price range (max minus min)
diff(range(usedcars[["price"]]))
## [1] 18192
# interquartile range of used car prices
IQR(usedcars[["price"]])
## [1] 3909.5
# five-number summary via quantile() with its default probabilities
quantile(usedcars[["price"]])
##      0%     25%     50%     75%    100% 
##  3800.0 10995.0 13591.5 14904.5 21992.0
# the 1st and 99th percentiles
quantile(usedcars[["price"]], probs = c(0.01, 0.99))
##       1%      99% 
##  5428.69 20505.00
# quintiles: cut points every 20%
quantile(usedcars[["price"]], probs = seq(from = 0, to = 1, by = 0.20))
##      0%     20%     40%     60%     80%    100% 
##  3800.0 10759.4 12993.8 13992.0 14999.0 21992.0
# boxplots of price and of mileage
boxplot(
  usedcars[["price"]],
  main = "Boxplot of Used Car Prices",
  ylab = "Price ($)"
)

boxplot(
  usedcars[["mileage"]],
  main = "Boxplot of Used Car Mileage",
  ylab = "Odometer (mi.)"
)

# histograms of price and of mileage
hist(
  usedcars[["price"]],
  main = "Histogram of Used Car Prices",
  xlab = "Price ($)"
)

hist(
  usedcars[["mileage"]],
  main = "Histogram of Used Car Mileage",
  xlab = "Odometer (mi.)"
)

# spread of price and mileage: variance and standard deviation
var(usedcars[["price"]])
## [1] 9749892
sd(usedcars[["price"]])
## [1] 3122.482
var(usedcars[["mileage"]])
## [1] 728033954
sd(usedcars[["mileage"]])
## [1] 26982.1

# Exploring categorical variables -----

# frequency counts (one-way tables) for year, model and color
table(usedcars[["year"]])
## 
## 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 
##    3    1    1    1    3    2    6   11   14   42   49   16    1
table(usedcars[["model"]])
## 
##  SE SEL SES 
##  78  23  49
table(usedcars[["color"]])
## 
##  Black   Blue   Gold   Gray  Green    Red Silver  White Yellow 
##     35     17      1     16      5     25     32     16      3
# share of each model as a proportion of all cars
model_table <- table(usedcars[["model"]])
prop.table(model_table)
## 
##        SE       SEL       SES 
## 0.5200000 0.1533333 0.3266667
# share of each color as a percentage, shown to one decimal place
color_table <- table(usedcars[["color"]])
color_pct <- prop.table(color_table) * 100
round(color_pct, 1)
## 
##  Black   Blue   Gold   Gray  Green    Red Silver  White Yellow 
##   23.3   11.3    0.7   10.7    3.3   16.7   21.3   10.7    2.0

# Exploring relationships between variables -----

# scatterplot with mileage on the x axis and price on the y axis
plot(
  x = usedcars[["mileage"]],
  y = usedcars[["price"]],
  main = "Scatterplot of Price vs. Mileage",
  xlab = "Used Car Odometer (mi.)",
  ylab = "Used Car Price ($)"
)

# logical flag: TRUE when the car's color is black, gray, silver or white
usedcars[["conservative"]] <-
  usedcars[["color"]] %in% c("Black", "Gray", "Silver", "White")
# count how many cars fall on each side of the flag
table(usedcars[["conservative"]])
## 
## FALSE  TRUE 
##    51    99
# Optional cross-tabulation of model by the conservative flag; left
# commented out because it needs the third-party gmodels package.
#install.packages("gmodels")
#library(gmodels)
#CrossTable(x = usedcars$model, y = usedcars$conservative)