Exploring and understanding data ——————–
data exploration example using used car data
#
getwd()
[1] "C:/Users/marly/Downloads"
#This sets the working directory. In other words, it tells R where to look for files and where to save output files by default
setwd("C:/Users/marly/Downloads")
usedcars <- read.csv("usedcars.csv", stringsAsFactors = FALSE)
# get structure of used car data
str(usedcars)
'data.frame': 150 obs. of 6 variables:
$ year : int 2011 2011 2011 2011 2012 2010 2011 2010 2011 2010 ...
$ model : chr "SEL" "SEL" "SEL" "SEL" ...
$ price : int 21992 20995 19995 17809 17500 17495 17000 16995 16995 16995 ...
$ mileage : int 7413 10926 7351 11613 8367 25125 27393 21026 32655 36116 ...
$ color : chr "Yellow" "Gray" "Silver" "Gray" ...
$ transmission: chr "AUTO" "AUTO" "AUTO" "AUTO" ...
Exploring numeric variables —–
# summarize numeric variables
summary(usedcars$year)
Min. 1st Qu. Median Mean 3rd Qu. Max.
2000 2008 2009 2009 2010 2012
summary(usedcars[c("price", "mileage")])
price mileage
Min. : 3800 Min. : 4867
1st Qu.:10995 1st Qu.: 27200
Median :13592 Median : 36385
Mean :12962 Mean : 44261
3rd Qu.:14904 3rd Qu.: 55125
Max. :21992 Max. :151479
# calculate the mean income (a toy three-value example, unrelated to the used car data)
(36000 + 44000 + 56000) / 3
[1] 45333.33
mean(c(36000, 44000, 56000))
[1] 45333.33
# the median income
median(c(36000, 44000, 56000))
[1] 44000
# the min/max of used car prices
range(usedcars$price)
[1] 3800 21992
# the span of used car prices (maximum minus minimum)
diff(range(usedcars$price))
[1] 18192
# IQR for used car prices
IQR(usedcars$price)
[1] 3909.5
# use quantile to calculate five-number summary
quantile(usedcars$price)
0% 25% 50% 75% 100%
3800.0 10995.0 13591.5 14904.5 21992.0
# the 1st and 99th percentiles
quantile(usedcars$price, probs = c(0.01, 0.99))
1% 99%
5428.69 20505.00
# quintiles
quantile(usedcars$price, seq(from = 0, to = 1, by = 0.20))
0% 20% 40% 60% 80% 100%
3800.0 10759.4 12993.8 13992.0 14999.0 21992.0
# boxplot of used car prices and mileage
boxplot(usedcars$price, main="Boxplot of Used Car Prices",
ylab="Price ($)")

boxplot(usedcars$mileage, main="Boxplot of Used Car Mileage",
ylab="Odometer (mi.)")

# histograms of used car prices and mileage
# fill the histogram bars in blue and outline them with a black border
hist(usedcars$price, col = "blue", border ="black", main = "Histogram of Used Car Prices",
xlab = "Price ($)")

hist(usedcars$mileage, col ="green", main = "Histogram of Used Car Mileage",
xlab = "Odometer (mi.)")

# variance and standard deviation of the used car data
var(usedcars$price)
[1] 9749892
sd(usedcars$price)
[1] 3122.482
var(usedcars$mileage)
[1] 728033954
sd(usedcars$mileage)
[1] 26982.1
Exploring categorical variables —–
# one-way tables for the used car data
table(usedcars$year)
2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
3 1 1 1 3 2 6 11 14 42 49 16 1
table(usedcars$model)
SE SEL SES
78 23 49
table(usedcars$color)
Black Blue Gold Gray Green Red Silver White Yellow
35 17 1 16 5 25 32 16 3
# compute table proportions
model_table <- table(usedcars$model)
prop.table(model_table)
SE SEL SES
0.5200000 0.1533333 0.3266667
# express the color proportions as percentages, rounded to one decimal place
color_table <- table(usedcars$color)
color_pct <- prop.table(color_table) * 100
round(color_pct, digits = 1)
Black Blue Gold Gray Green Red Silver White Yellow
23.3 11.3 0.7 10.7 3.3 16.7 21.3 10.7 2.0
Exploring relationships between variables —–

# new variable indicating conservative colors
usedcars$conservative <-
usedcars$color %in% c("Black", "Gray", "Silver", "White")
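# checking our variable (if this prints "< table of extent 0 >", the conservative
# column did not exist when this chunk ran; re-run the chunk above and try again)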
table(usedcars$conservative)
< table of extent 0 >
#install.packages("gmodels")
# Crosstab of conservative by model
library(gmodels)
Warning: package ‘gmodels’ was built under R version 4.3.3
CrossTable(x = usedcars$model, y = usedcars$conservative)
Cell Contents
|-------------------------|
| N |
| Chi-square contribution |
| N / Row Total |
| N / Col Total |
| N / Table Total |
|-------------------------|
Total Observations in Table: 150
| usedcars$conservative
usedcars$model | FALSE | TRUE | Row Total |
---------------|-----------|-----------|-----------|
SE | 27 | 51 | 78 |
| 0.009 | 0.004 | |
| 0.346 | 0.654 | 0.520 |
| 0.529 | 0.515 | |
| 0.180 | 0.340 | |
---------------|-----------|-----------|-----------|
SEL | 7 | 16 | 23 |
| 0.086 | 0.044 | |
| 0.304 | 0.696 | 0.153 |
| 0.137 | 0.162 | |
| 0.047 | 0.107 | |
---------------|-----------|-----------|-----------|
SES | 17 | 32 | 49 |
| 0.007 | 0.004 | |
| 0.347 | 0.653 | 0.327 |
| 0.333 | 0.323 | |
| 0.113 | 0.213 | |
---------------|-----------|-----------|-----------|
Column Total | 51 | 99 | 150 |
| 0.340 | 0.660 | |
---------------|-----------|-----------|-----------|
LS0tDQp0aXRsZTogIkV4cGxvcmluZyBhbmQgVW5kZXJzdGFuZGluZyBEYXRhIHdpdGggUiINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCiMjIyMjIEV4cGxvcmluZyBhbmQgdW5kZXJzdGFuZGluZyBkYXRhIC0tLS0tLS0tLS0tLS0tLS0tLS0tDQoNCiMjIGRhdGEgZXhwbG9yYXRpb24gZXhhbXBsZSB1c2luZyB1c2VkIGNhciBkYXRhDQoNCmBgYHtyfQ0KI3RoaXMgZnVuY3Rpb24gb3V0cHV0cyB0aGUgcGF0aCBvZiB0aGUgZGlyZWN0b3J5IG9mIHdoZXJlIFIgaXMgY3VycmVudGx5IGxvb2tpbmcgZm9yIGZpbGVzDQpnZXR3ZCgpDQpgYGANCg0KYGBge3J9DQojVGhpcyBzZXRzIHRoZSB3b3JraW5nIGRpcmVjdG9yeS4gSW4gb3RoZXIgd29yZHMsIGl0IHRlbGxzIFIgd2hlcmUgdG8gbG9vayBmb3IgZmlsZXMgYW5kIHdoZXJlIHRvIHNhdmUgb3V0cHV0IGZpbGVzIGJ5IGRlZmF1bHQNCnNldHdkKCJDOi9Vc2Vycy9tYXJseS9Eb3dubG9hZHMiKQ0KYGBgDQoNCg0KYGBge3J9DQp1c2VkY2FycyA8LSByZWFkLmNzdigidXNlZGNhcnMuY3N2Iiwgc3RyaW5nc0FzRmFjdG9ycyA9IEZBTFNFKQ0KYGBgDQoNCg0KYGBge3J9DQojIGdldCBzdHJ1Y3R1cmUgb2YgdXNlZCBjYXIgZGF0YQ0Kc3RyKHVzZWRjYXJzKQ0KYGBgDQoNCg0KIyMgRXhwbG9yaW5nIG51bWVyaWMgdmFyaWFibGVzIC0tLS0tDQoNCmBgYHtyfQ0KIyBzdW1tYXJpemUgbnVtZXJpYyB2YXJpYWJsZXMNCnN1bW1hcnkodXNlZGNhcnMkeWVhcikNCnN1bW1hcnkodXNlZGNhcnNbYygicHJpY2UiLCAibWlsZWFnZSIpXSkNCmBgYA0KDQoNCmBgYHtyfQ0KIyBjYWxjdWxhdGUgdGhlIG1lYW4gaW5jb21lDQooMzYwMDAgKyA0NDAwMCArIDU2MDAwKSAvIDMNCm1lYW4oYygzNjAwMCwgNDQwMDAsIDU2MDAwKSkNCmBgYA0KDQoNCmBgYHtyfQ0KIyB0aGUgbWVkaWFuIGluY29tZQ0KbWVkaWFuKGMoMzYwMDAsIDQ0MDAwLCA1NjAwMCkpDQpgYGANCg0KDQpgYGB7cn0NCiMgdGhlIG1pbi9tYXggb2YgdXNlZCBjYXIgcHJpY2VzDQpyYW5nZSh1c2VkY2FycyRwcmljZSkNCmBgYA0KDQoNCmBgYHtyfQ0KIyB0aGUgZGlmZmVyZW5jZSBvZiB0aGUgcmFuZ2UNCmRpZmYocmFuZ2UodXNlZGNhcnMkcHJpY2UpKQ0KYGBgDQoNCg0KYGBge3J9DQojIElRUiBmb3IgdXNlZCBjYXIgcHJpY2VzDQpJUVIodXNlZGNhcnMkcHJpY2UpDQpgYGANCg0KDQpgYGB7cn0NCiMgdXNlIHF1YW50aWxlIHRvIGNhbGN1bGF0ZSBmaXZlLW51bWJlciBzdW1tYXJ5DQpxdWFudGlsZSh1c2VkY2FycyRwcmljZSkNCmBgYA0KDQoNCmBgYHtyfQ0KIyB0aGUgOTl0aCBwZXJjZW50aWxlDQpxdWFudGlsZSh1c2VkY2FycyRwcmljZSwgcHJvYnMgPSBjKDAuMDEsIDAuOTkpKQ0KYGBgDQoNCg0KYGBge3J9DQojIHF1aW50aWxlcw0KcXVhbnRpbGUodXNlZGNhcnMkcHJpY2UsIHNlcShmcm9tID0gMCwgdG8gPSAxLCBieSA9IDAuMjApKQ0KYGBgDQoNCg0KYGBge3J9DQojIGJveHBsb3Qgb2YgdXNlZCBjYXIgcHJpY2VzIGFuZCBt
aWxlYWdlDQpib3hwbG90KHVzZWRjYXJzJHByaWNlLCBtYWluPSJCb3hwbG90IG9mIFVzZWQgQ2FyIFByaWNlcyIsDQogICAgICB5bGFiPSJQcmljZSAoJCkiKQ0KDQpib3hwbG90KHVzZWRjYXJzJG1pbGVhZ2UsIG1haW49IkJveHBsb3Qgb2YgVXNlZCBDYXIgTWlsZWFnZSIsDQogICAgICB5bGFiPSJPZG9tZXRlciAobWkuKSIpDQpgYGANCg0KDQpgYGB7cn0NCiMgaGlzdG9ncmFtcyBvZiB1c2VkIGNhciBwcmljZXMgYW5kIG1pbGVhZ2UNCiNjaGFuZ2UgY29sb3Igb2Ygc2NhdHRlciB0byBibHVlIGFuZCBib3JkZXIgYWRkIGJvcmRlciBibGFjaw0KaGlzdCh1c2VkY2FycyRwcmljZSwgY29sID0gImJsdWUiLCBib3JkZXIgPSJibGFjayIsIG1haW4gPSAiSGlzdG9ncmFtIG9mIFVzZWQgQ2FyIFByaWNlcyIsDQogICAgIHhsYWIgPSAiUHJpY2UgKCQpIikNCiNtYWtlIGhpc3RvZ3JhbSBncmVlbiANCmhpc3QodXNlZGNhcnMkbWlsZWFnZSwgY29sID0iZ3JlZW4iLCBtYWluID0gIkhpc3RvZ3JhbSBvZiBVc2VkIENhciBNaWxlYWdlIiwNCiAgICAgeGxhYiA9ICJPZG9tZXRlciAobWkuKSIpDQpgYGANCg0KDQpgYGB7cn0NCiMgdmFyaWFuY2UgYW5kIHN0YW5kYXJkIGRldmlhdGlvbiBvZiB0aGUgdXNlZCBjYXIgZGF0YQ0KdmFyKHVzZWRjYXJzJHByaWNlKQ0Kc2QodXNlZGNhcnMkcHJpY2UpDQp2YXIodXNlZGNhcnMkbWlsZWFnZSkNCnNkKHVzZWRjYXJzJG1pbGVhZ2UpDQpgYGANCg0KDQojIyBFeHBsb3JpbmcgbnVtZXJpYyB2YXJpYWJsZXMgLS0tLS0NCg0KYGBge3J9DQojIG9uZS13YXkgdGFibGVzIGZvciB0aGUgdXNlZCBjYXIgZGF0YQ0KdGFibGUodXNlZGNhcnMkeWVhcikNCnRhYmxlKHVzZWRjYXJzJG1vZGVsKQ0KdGFibGUodXNlZGNhcnMkY29sb3IpDQpgYGANCg0KDQoNCmBgYHtyfQ0KIyBjb21wdXRlIHRhYmxlIHByb3BvcnRpb25zDQptb2RlbF90YWJsZSA8LSB0YWJsZSh1c2VkY2FycyRtb2RlbCkNCnByb3AudGFibGUobW9kZWxfdGFibGUpDQpgYGANCg0KDQpgYGB7cn0NCiMgcm91bmQgdGhlIGRhdGENCmNvbG9yX3RhYmxlIDwtIHRhYmxlKHVzZWRjYXJzJGNvbG9yKQ0KY29sb3JfcGN0IDwtIHByb3AudGFibGUoY29sb3JfdGFibGUpICogMTAwDQpyb3VuZChjb2xvcl9wY3QsIGRpZ2l0cyA9IDEpDQpgYGANCg0KDQojIyBFeHBsb3JpbmcgcmVsYXRpb25zaGlwcyBiZXR3ZWVuIHZhcmlhYmxlcyAtLS0tLQ0KDQpgYGB7cn0NCiMgc2NhdHRlcnBsb3Qgb2YgcHJpY2UgdnMuIG1pbGVhZ2UgDQojY2hhbmdlIHNjYXR0ZXIgdG8gY29sb3IgYmx1ZQ0KcGxvdCh4ID0gdXNlZGNhcnMkbWlsZWFnZSwgeSA9IHVzZWRjYXJzJHByaWNlLCBjb2wgPSJibHVlIiwNCiAgICAgbWFpbiA9ICJTY2F0dGVycGxvdCBvZiBQcmljZSB2cy4gTWlsZWFnZSIsDQogICAgIHhsYWIgPSAiVXNlZCBDYXIgT2RvbWV0ZXIgKG1pLikiLA0KICAgICB5bGFiID0gIlVzZWQgQ2FyIFByaWNlICgkKSIpDQpgYGANCg0KDQpgYGB7cn0NCiMgbmV3IHZhcmlhYmxl
IGluZGljYXRpbmcgY29uc2VydmF0aXZlIGNvbG9ycw0KdXNlZGNhcnMkY29uc2VydmF0aXZlIDwtDQogIHVzZWRjYXJzJGNvbG9yICVpbiUgYygiQmxhY2siLCAiR3JheSIsICJTaWx2ZXIiLCAiV2hpdGUiKQ0KYGBgDQoNCg0KDQpgYGB7cn0NCiMgY2hlY2tpbmcgb3VyIHZhcmlhYmxlDQp0YWJsZSh1c2VkY2FycyRjb25zZXJ2YXRpdmUpDQpgYGANCg0KYGBge3J9DQojaW5zdGFsbC5wYWNrYWdlcygiZ21vZGVscyIpDQpgYGANCg0KDQpgYGB7cn0NCiMgQ3Jvc3N0YWIgb2YgY29uc2VydmF0aXZlIGJ5IG1vZGVsDQpsaWJyYXJ5KGdtb2RlbHMpDQpDcm9zc1RhYmxlKHggPSB1c2VkY2FycyRtb2RlbCwgeSA9IHVzZWRjYXJzJGNvbnNlcnZhdGl2ZSkNCmBgYA0KDQogDQoNCg==