Exploring and understanding data ——————–

data exploration example using used car data

getwd()
[1] "C:/Users/npenaper/Downloads"
setwd("C:/Users/npenaper/Documents")
usedcars <- read.csv("usedcars.csv", stringsAsFactors = FALSE)
# get structure of used car data
str(usedcars)
'data.frame':   150 obs. of  6 variables:
 $ year        : int  2011 2011 2011 2011 2012 2010 2011 2010 2011 2010 ...
 $ model       : chr  "SEL" "SEL" "SEL" "SEL" ...
 $ price       : int  21992 20995 19995 17809 17500 17495 17000 16995 16995 16995 ...
 $ mileage     : int  7413 10926 7351 11613 8367 25125 27393 21026 32655 36116 ...
 $ color       : chr  "Yellow" "Gray" "Silver" "Gray" ...
 $ transmission: chr  "AUTO" "AUTO" "AUTO" "AUTO" ...

Exploring numeric variables —–

# summarize numeric variables
summary(usedcars$year)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   2000    2008    2009    2009    2010    2012 
summary(usedcars[c("price", "mileage")])
     price          mileage      
 Min.   : 3800   Min.   :  4867  
 1st Qu.:10995   1st Qu.: 27200  
 Median :13592   Median : 36385  
 Mean   :12962   Mean   : 44261  
 3rd Qu.:14904   3rd Qu.: 55125  
 Max.   :21992   Max.   :151479  
# calculate the mean income
(36000 + 44000 + 56000) / 3
[1] 45333.33
mean(c(36000, 44000, 56000))
[1] 45333.33
# the median income
median(c(36000, 44000, 56000))
[1] 44000
# the min/max of used car prices
range(usedcars$price)
[1]  3800 21992
# the difference of the range
diff(range(usedcars$price))
[1] 18192
# IQR for used car prices
IQR(usedcars$price)
[1] 3909.5
# use quantile to calculate five-number summary
quantile(usedcars$price)
     0%     25%     50%     75%    100% 
 3800.0 10995.0 13591.5 14904.5 21992.0 
# the 99th percentile
quantile(usedcars$price, probs = c(0.01, 0.99))
      1%      99% 
 5428.69 20505.00 
# quintiles
quantile(usedcars$price, seq(from = 0, to = 1, by = 0.20))
     0%     20%     40%     60%     80%    100% 
 3800.0 10759.4 12993.8 13992.0 14999.0 21992.0 
# Box-and-whisker plots of the price column and the mileage column.
boxplot(
  x = usedcars[["price"]],
  main = "Boxplot of Used Car Prices",
  ylab = "Price ($)"
)

boxplot(
  x = usedcars[["mileage"]],
  main = "Boxplot of Used Car Mileage",
  ylab = "Odometer (mi.)"
)

# Histograms of the price column and the mileage column.
hist(
  x = usedcars[["price"]],
  main = "Histogram of Used Car Prices",
  xlab = "Price ($)"
)

hist(
  x = usedcars[["mileage"]],
  main = "Histogram of Used Car Mileage",
  xlab = "Odometer (mi.)"
)

# variance and standard deviation of the used car data
var(usedcars$price)
[1] 9749892
sd(usedcars$price)
[1] 3122.482
var(usedcars$mileage)
[1] 728033954
sd(usedcars$mileage)
[1] 26982.1

Exploring categorical variables —–

# one-way tables for the used car data
table(usedcars$year)

2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 
   3    1    1    1    3    2    6   11   14   42   49   16    1 
table(usedcars$model)

 SE SEL SES 
 78  23  49 
table(usedcars$color)
 Black   Blue   Gold   Gray  Green    Red Silver  White Yellow 
    35     17      1     16      5     25     32     16      3 
# compute table proportions
model_table <- table(usedcars$model)
prop.table(model_table)

       SE       SEL       SES 
0.5200000 0.1533333 0.3266667 
# round the data
color_table <- table(usedcars$color)
color_pct <- prop.table(color_table) * 100
round(color_pct, digits = 1)

 Black   Blue   Gold   Gray  Green    Red Silver  White Yellow 
  23.3   11.3    0.7   10.7    3.3   16.7   21.3   10.7    2.0 

Exploring relationships between variables —–

# Scatterplot with mileage on the horizontal axis and price on the vertical.
plot(
  usedcars[["mileage"]],
  usedcars[["price"]],
  main = "Scatterplot of Price vs. Mileage",
  xlab = "Used Car Odometer (mi.)",
  ylab = "Used Car Price ($)"
)

# Add a logical column that is TRUE when the car's color is one of
# Black, Gray, Silver or White, and FALSE otherwise.
usedcars[["conservative"]] <- is.element(
  usedcars[["color"]],
  c("Black", "Gray", "Silver", "White")
)
# checking our variable
table(usedcars$conservative)

FALSE  TRUE 
   51    99 
install.packages("gmodels")
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:

https://cran.rstudio.com/bin/windows/Rtools/
Installing package into ‘C:/Users/npenaper/AppData/Local/R/win-library/4.2’
(as ‘lib’ is unspecified)
also installing the dependency ‘gdata’

trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.2/gdata_3.0.0.zip'
Content type 'application/zip' length 495777 bytes (484 KB)
downloaded 484 KB

trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.2/gmodels_2.18.1.1.zip'
Content type 'application/zip' length 114240 bytes (111 KB)
downloaded 111 KB
package ‘gdata’ successfully unpacked and MD5 sums checked
package ‘gmodels’ successfully unpacked and MD5 sums checked
# Crosstab of conservative by model
library(gmodels)
Warning: package ‘gmodels’ was built under R version 4.2.3
CrossTable(x = usedcars$model, y = usedcars$conservative)

 
   Cell Contents
|-------------------------|
|                       N |
| Chi-square contribution |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  150 

 
               | usedcars$conservative 
usedcars$model |     FALSE |      TRUE | Row Total | 
---------------|-----------|-----------|-----------|
            SE |        27 |        51 |        78 | 
               |     0.009 |     0.004 |           | 
               |     0.346 |     0.654 |     0.520 | 
               |     0.529 |     0.515 |           | 
               |     0.180 |     0.340 |           | 
---------------|-----------|-----------|-----------|
           SEL |         7 |        16 |        23 | 
               |     0.086 |     0.044 |           | 
               |     0.304 |     0.696 |     0.153 | 
               |     0.137 |     0.162 |           | 
               |     0.047 |     0.107 |           | 
---------------|-----------|-----------|-----------|
           SES |        17 |        32 |        49 | 
               |     0.007 |     0.004 |           | 
               |     0.347 |     0.653 |     0.327 | 
               |     0.333 |     0.323 |           | 
               |     0.113 |     0.213 |           | 
---------------|-----------|-----------|-----------|
  Column Total |        51 |        99 |       150 | 
               |     0.340 |     0.660 |           | 
---------------|-----------|-----------|-----------|
LS0tDQp0aXRsZTogIkV4cGxvcmluZyBhbmQgVW5kZXJzdGFuZGluZyBEYXRhIHdpdGggUiINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCiMjIyMjIEV4cGxvcmluZyBhbmQgdW5kZXJzdGFuZGluZyBkYXRhIC0tLS0tLS0tLS0tLS0tLS0tLS0tDQoNCiMjIGRhdGEgZXhwbG9yYXRpb24gZXhhbXBsZSB1c2luZyB1c2VkIGNhciBkYXRhDQoNCmBgYHtyfQ0KI2dldHdkKCkNCmBgYA0KDQpgYGB7cn0NCiNzZXR3ZCgiQzovVXNlcnMvbnBlbmFwZXIvRG9jdW1lbnRzIikNCmBgYA0KDQoNCmBgYHtyfQ0KdXNlZGNhcnMgPC0gcmVhZC5jc3YoInVzZWRjYXJzLmNzdiIsIHN0cmluZ3NBc0ZhY3RvcnMgPSBGQUxTRSkNCmBgYA0KDQoNCmBgYHtyfQ0KIyBnZXQgc3RydWN0dXJlIG9mIHVzZWQgY2FyIGRhdGENCnN0cih1c2VkY2FycykNCmBgYA0KDQoNCiMjIEV4cGxvcmluZyBudW1lcmljIHZhcmlhYmxlcyAtLS0tLQ0KDQpgYGB7cn0NCiMgc3VtbWFyaXplIG51bWVyaWMgdmFyaWFibGVzDQpzdW1tYXJ5KHVzZWRjYXJzJHllYXIpDQpzdW1tYXJ5KHVzZWRjYXJzW2MoInByaWNlIiwgIm1pbGVhZ2UiKV0pDQpgYGANCg0KDQpgYGB7cn0NCiMgY2FsY3VsYXRlIHRoZSBtZWFuIGluY29tZQ0KKDM2MDAwICsgNDQwMDAgKyA1NjAwMCkgLyAzDQptZWFuKGMoMzYwMDAsIDQ0MDAwLCA1NjAwMCkpDQpgYGANCg0KDQpgYGB7cn0NCiMgdGhlIG1lZGlhbiBpbmNvbWUNCm1lZGlhbihjKDM2MDAwLCA0NDAwMCwgNTYwMDApKQ0KYGBgDQoNCg0KYGBge3J9DQojIHRoZSBtaW4vbWF4IG9mIHVzZWQgY2FyIHByaWNlcw0KcmFuZ2UodXNlZGNhcnMkcHJpY2UpDQpgYGANCg0KDQpgYGB7cn0NCiMgdGhlIGRpZmZlcmVuY2Ugb2YgdGhlIHJhbmdlDQpkaWZmKHJhbmdlKHVzZWRjYXJzJHByaWNlKSkNCmBgYA0KDQoNCmBgYHtyfQ0KIyBJUVIgZm9yIHVzZWQgY2FyIHByaWNlcw0KSVFSKHVzZWRjYXJzJHByaWNlKQ0KYGBgDQoNCg0KYGBge3J9DQojIHVzZSBxdWFudGlsZSB0byBjYWxjdWxhdGUgZml2ZS1udW1iZXIgc3VtbWFyeQ0KcXVhbnRpbGUodXNlZGNhcnMkcHJpY2UpDQpgYGANCg0KDQpgYGB7cn0NCiMgdGhlIDk5dGggcGVyY2VudGlsZQ0KcXVhbnRpbGUodXNlZGNhcnMkcHJpY2UsIHByb2JzID0gYygwLjAxLCAwLjk5KSkNCmBgYA0KDQoNCmBgYHtyfQ0KIyBxdWludGlsZXMNCnF1YW50aWxlKHVzZWRjYXJzJHByaWNlLCBzZXEoZnJvbSA9IDAsIHRvID0gMSwgYnkgPSAwLjIwKSkNCmBgYA0KDQoNCmBgYHtyfQ0KIyBib3hwbG90IG9mIHVzZWQgY2FyIHByaWNlcyBhbmQgbWlsZWFnZQ0KYm94cGxvdCh1c2VkY2FycyRwcmljZSwgbWFpbj0iQm94cGxvdCBvZiBVc2VkIENhciBQcmljZXMiLA0KICAgICAgeWxhYj0iUHJpY2UgKCQpIikNCg0KYm94cGxvdCh1c2VkY2FycyRtaWxlYWdlLCBtYWluPSJCb3hwbG90IG9mIFVzZWQgQ2FyIE1pbGVhZ2UiLA0KICAgICAgeWxhYj0iT2RvbWV0ZXIgKG1pLikiKQ0KYGBgDQoNCg0KYGBge3J9DQojIGhpc3RvZ3Jh
bXMgb2YgdXNlZCBjYXIgcHJpY2VzIGFuZCBtaWxlYWdlDQpoaXN0KHVzZWRjYXJzJHByaWNlLCBtYWluID0gIkhpc3RvZ3JhbSBvZiBVc2VkIENhciBQcmljZXMiLA0KICAgICB4bGFiID0gIlByaWNlICgkKSIpDQoNCmhpc3QodXNlZGNhcnMkbWlsZWFnZSwgbWFpbiA9ICJIaXN0b2dyYW0gb2YgVXNlZCBDYXIgTWlsZWFnZSIsDQogICAgIHhsYWIgPSAiT2RvbWV0ZXIgKG1pLikiKQ0KYGBgDQoNCg0KYGBge3J9DQojIHZhcmlhbmNlIGFuZCBzdGFuZGFyZCBkZXZpYXRpb24gb2YgdGhlIHVzZWQgY2FyIGRhdGENCnZhcih1c2VkY2FycyRwcmljZSkNCnNkKHVzZWRjYXJzJHByaWNlKQ0KdmFyKHVzZWRjYXJzJG1pbGVhZ2UpDQpzZCh1c2VkY2FycyRtaWxlYWdlKQ0KYGBgDQoNCg0KIyMgRXhwbG9yaW5nIG51bWVyaWMgdmFyaWFibGVzIC0tLS0tDQoNCmBgYHtyfQ0KIyBvbmUtd2F5IHRhYmxlcyBmb3IgdGhlIHVzZWQgY2FyIGRhdGENCnRhYmxlKHVzZWRjYXJzJHllYXIpDQp0YWJsZSh1c2VkY2FycyRtb2RlbCkNCnRhYmxlKHVzZWRjYXJzJGNvbG9yKQ0KYGBgDQoNCg0KDQpgYGB7cn0NCiMgY29tcHV0ZSB0YWJsZSBwcm9wb3J0aW9ucw0KbW9kZWxfdGFibGUgPC0gdGFibGUodXNlZGNhcnMkbW9kZWwpDQpwcm9wLnRhYmxlKG1vZGVsX3RhYmxlKQ0KYGBgDQoNCg0KYGBge3J9DQojIHJvdW5kIHRoZSBkYXRhDQpjb2xvcl90YWJsZSA8LSB0YWJsZSh1c2VkY2FycyRjb2xvcikNCmNvbG9yX3BjdCA8LSBwcm9wLnRhYmxlKGNvbG9yX3RhYmxlKSAqIDEwMA0Kcm91bmQoY29sb3JfcGN0LCBkaWdpdHMgPSAxKQ0KYGBgDQoNCg0KIyMgRXhwbG9yaW5nIHJlbGF0aW9uc2hpcHMgYmV0d2VlbiB2YXJpYWJsZXMgLS0tLS0NCg0KYGBge3J9DQojIHNjYXR0ZXJwbG90IG9mIHByaWNlIHZzLiBtaWxlYWdlDQpwbG90KHggPSB1c2VkY2FycyRtaWxlYWdlLCB5ID0gdXNlZGNhcnMkcHJpY2UsDQogICAgIG1haW4gPSAiU2NhdHRlcnBsb3Qgb2YgUHJpY2UgdnMuIE1pbGVhZ2UiLA0KICAgICB4bGFiID0gIlVzZWQgQ2FyIE9kb21ldGVyIChtaS4pIiwNCiAgICAgeWxhYiA9ICJVc2VkIENhciBQcmljZSAoJCkiKQ0KYGBgDQoNCg0KYGBge3J9DQojIG5ldyB2YXJpYWJsZSBpbmRpY2F0aW5nIGNvbnNlcnZhdGl2ZSBjb2xvcnMNCnVzZWRjYXJzJGNvbnNlcnZhdGl2ZSA8LQ0KICB1c2VkY2FycyRjb2xvciAlaW4lIGMoIkJsYWNrIiwgIkdyYXkiLCAiU2lsdmVyIiwgIldoaXRlIikNCmBgYA0KDQoNCg0KYGBge3J9DQojIGNoZWNraW5nIG91ciB2YXJpYWJsZQ0KdGFibGUodXNlZGNhcnMkY29uc2VydmF0aXZlKQ0KYGBgDQoNCmBgYHtyfQ0KI2luc3RhbGwucGFja2FnZXMoImdtb2RlbHMiKQ0KYGBgDQoNCg0KYGBge3J9DQojIENyb3NzdGFiIG9mIGNvbnNlcnZhdGl2ZSBieSBtb2RlbA0KbGlicmFyeShnbW9kZWxzKQ0KQ3Jvc3NUYWJsZSh4ID0gdXNlZGNhcnMkbW9kZWwsIHkgPSB1c2VkY2FycyRjb25zZXJ2YXRpdmUpDQpgYGANCg0KIA0KDQo=