# Set the working directory; read.csv() below loads "usedcars.csv" relative to it
# NOTE(review): this absolute path is specific to one machine and will fail
# elsewhere; consider a project-relative path instead of setwd()
setwd("/Users/gretacapelletti/Downloads")
# Print the working directory to confirm it was set as intended
getwd()
[1] "/Users/gretacapelletti/Downloads"
# Load the CSV file, keeping text columns as character vectors (not factors)
usedcars <- read.csv("usedcars.csv", stringsAsFactors = FALSE)
# Show the structure of the used car data: dimensions, column types, first values
str(usedcars)
'data.frame': 150 obs. of 6 variables:
$ year : int 2011 2011 2011 2011 2012 2010 2011 2010 2011 2010 ...
$ model : chr "SEL" "SEL" "SEL" "SEL" ...
$ price : int 21992 20995 19995 17809 17500 17495 17000 16995 16995 16995 ...
$ mileage : int 7413 10926 7351 11613 8367 25125 27393 21026 32655 36116 ...
$ color : chr "Yellow" "Gray" "Silver" "Gray" ...
$ transmission: chr "AUTO" "AUTO" "AUTO" "AUTO" ...
# Exploring numeric variables
# Summary statistics (min, quartiles, median, mean, max) for the model year
summary(usedcars$year)
Min. 1st Qu. Median Mean 3rd Qu. Max.
2000 2008 2009 2009 2010 2012
# Same summary for price and mileage, selecting both columns by name
summary(usedcars[c("price", "mileage")])
price mileage
Min. : 3800 Min. : 4867
1st Qu.:10995 1st Qu.: 27200
Median :13592 Median : 36385
Mean :12962 Mean : 44261
3rd Qu.:14904 3rd Qu.: 55124
Max. :21992 Max. :151479
# Calculate the mean of three example incomes by hand: sum divided by count
# (illustrative values; they do not come from the usedcars data)
(36000 + 44000 + 56000) / 3
[1] 45333.33
# Same mean computed with the built-in mean()
mean(c(36000, 44000, 56000))
[1] 45333.33
# Median of the same three example incomes (the middle value)
median(c(36000, 44000, 56000))
[1] 44000
# Minimum and maximum of the used car prices
range(usedcars$price)
[1] 3800 21992
# Width of the price range: maximum price minus minimum price
diff(range(usedcars$price))
[1] 18192
# Interquartile range (third quartile minus first quartile) of used car prices
IQR(usedcars$price)
[1] 3909.5
# Five-number summary of price: quantile() defaults to 0%, 25%, 50%, 75%, 100%
quantile(usedcars$price)
0% 25% 50% 75% 100%
3800.0 10995.0 13591.5 14904.5 21992.0
# The 1st and 99th percentiles of price
quantile(usedcars$price, probs = c(0.01, 0.99))
1% 99%
5428.69 20505.00
# Quintiles: price cut points at every 20% from 0% to 100%
quantile(usedcars$price, seq(from = 0, to = 1, by = 0.20))
0% 20% 40% 60% 80% 100%
3800.0 10759.4 12993.8 13992.0 14999.0 21992.0
# Boxplots of used car prices and mileage
boxplot(usedcars$price, main="Boxplot of Used Car Prices",
ylab="Price ($)")

# This boxplot gives a visual summary of the distribution of used car prices.
# The box spans the interquartile range (IQR), which contains the middle 50% of the data,
# and the horizontal line inside the box marks the median price.
# Each whisker extends to the most extreme price that lies within 1.5 times the IQR of the box;
# points drawn beyond a whisker are outliers. Judging by the summary statistics above, both the
# minimum ($3,800) and the maximum ($21,992) fall outside those limits, so outliers appear at both ends.
boxplot(usedcars$mileage, main="Boxplot of Used Car Mileage",
ylab="Odometer (mi.)")

# This boxplot shows the distribution of used car mileage (in miles).
# The box spans the interquartile range (IQR), i.e. the middle 50% of the data,
# and the horizontal line inside the box marks the median mileage.
# The maximum mileage (151,479) lies far above the third quartile (55,124), so outliers are expected above the upper whisker.
# Histograms of used car prices and mileage
hist(usedcars$price, main = "Histogram of Used Car Prices",
xlab = "Price ($)")

# This histogram displays the distribution of used car prices in dollars.
# The x-axis shows car prices grouped into bins; the y-axis shows how many cars fall in each bin.
# The plot shows the spread and central tendency of prices; the quartiles above place the middle half between about $11,000 and $15,000.
# It also reveals any gaps or concentrations of prices in specific ranges, which helps in judging price variability.
# The mileage histogram below uses the same layout. The mean mileage (44,261) exceeds the median (36,385),
# which suggests a longer tail toward high-mileage cars.
hist(usedcars$mileage, main = "Histogram of Used Car Mileage",
xlab = "Odometer (mi.)")

# Variance and standard deviation of the used car data
# Variance of price (expressed in squared dollars)
var(usedcars$price)
[1] 9749892
# Standard deviation of price (square root of the variance, back in dollars)
sd(usedcars$price)
[1] 3122.482
# Variance of mileage (expressed in squared miles)
var(usedcars$mileage)
[1] 728033954
# Standard deviation of mileage (square root of the variance, back in miles)
sd(usedcars$mileage)
[1] 26982.1
# Exploring categorical variables
# One-way tables for the used car data: number of cars per model year
table(usedcars$year)
2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
3 1 1 1 3 2 6 11 14 42 49 16 1
# Number of cars per model (trim) code
table(usedcars$model)
SE SEL SES
78 23 49
# Number of cars per color
table(usedcars$color)
Black Blue Gold Gray Green Red Silver White Yellow
35 17 1 16 5 25 32 16 3
# Share of each model among all cars: each count divided by the total count
model_table <- table(usedcars[["model"]])
model_table / sum(model_table)
SE SEL SES
0.5200000 0.1533333 0.3266667
# Percentage of cars per color, printed rounded to one decimal place
color_table <- table(usedcars[["color"]])
color_pct <- 100 * (color_table / sum(color_table))
round(color_pct, 1)
Black Blue Gold Gray Green Red Silver White Yellow
23.3 11.3 0.7 10.7 3.3 16.7 21.3 10.7 2.0
# Exploring relationships between variables
# Scatterplot of price vs. mileage (mileage on the x-axis, price on the y-axis)
plot(x = usedcars$mileage, y = usedcars$price,
main = "Scatterplot of Price vs. Mileage",
xlab = "Used Car Odometer (mi.)",
ylab = "Used Car Price ($)")

# This scatterplot shows the relationship between odometer readings (in miles) and prices (in dollars).
# Each point is one car: its x-position is the mileage and its y-position is the price.
# The plot illustrates a negative trend: cars with higher mileage tend to have lower prices.
# Add a logical column that is TRUE when the car's color is one of the
# conservative colors (black, gray, silver, white)
usedcars[["conservative"]] <- is.element(
  usedcars[["color"]],
  c("Black", "Gray", "Silver", "White")
)
# Sanity check: count how many cars are flagged TRUE vs. FALSE
table(usedcars[["conservative"]])
FALSE TRUE
51 99
# Install gmodels only when it is not already available.
# The unconditional install.packages("gmodels") call that stood here was
# aborted in this session with
#   "Error in install.packages : Updating loaded packages"
# and would also re-download the package on every run.
if (!requireNamespace("gmodels", quietly = TRUE)) {
  install.packages("gmodels")
}
# Crosstab of conservative by model
# Attach gmodels, which provides CrossTable()
library(gmodels)
# Rows are the model codes, columns the conservative flag; each cell reports
# the count, chi-square contribution, and row/column/table proportions.
# The argument expressions are printed verbatim as the table's axis labels.
CrossTable(x = usedcars$model, y = usedcars$conservative)
Cell Contents
|-------------------------|
| N |
| Chi-square contribution |
| N / Row Total |
| N / Col Total |
| N / Table Total |
|-------------------------|
Total Observations in Table: 150
| usedcars$conservative
usedcars$model | FALSE | TRUE | Row Total |
---------------|-----------|-----------|-----------|
SE | 27 | 51 | 78 |
| 0.009 | 0.004 | |
| 0.346 | 0.654 | 0.520 |
| 0.529 | 0.515 | |
| 0.180 | 0.340 | |
---------------|-----------|-----------|-----------|
SEL | 7 | 16 | 23 |
| 0.086 | 0.044 | |
| 0.304 | 0.696 | 0.153 |
| 0.137 | 0.162 | |
| 0.047 | 0.107 | |
---------------|-----------|-----------|-----------|
SES | 17 | 32 | 49 |
| 0.007 | 0.004 | |
| 0.347 | 0.653 | 0.327 |
| 0.333 | 0.323 | |
| 0.113 | 0.213 | |
---------------|-----------|-----------|-----------|
Column Total | 51 | 99 | 150 |
| 0.340 | 0.660 | |
---------------|-----------|-----------|-----------|
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQpgYGB7cn0KI3dvcmtpbmcgZGlyZWN0b3J5CnNldHdkKCIvVXNlcnMvZ3JldGFjYXBlbGxldHRpL0Rvd25sb2FkcyIpCmBgYAoKYGBge3J9CiNzcGVjaWZ5IHRoZSB3b3JraW5nIGRpcmVjdG9yeSBpdCBpcyBnb29kCmdldHdkKCkKYGBgCmBgYHtyfQojIExvYWQgdGhlIENTViBmaWxlCnVzZWRjYXJzIDwtIHJlYWQuY3N2KCJ1c2VkY2Fycy5jc3YiLCBzdHJpbmdzQXNGYWN0b3JzID0gRkFMU0UpCmBgYApgYGB7cn0KIyBnZXQgc3RydWN0dXJlIG9mIHVzZWQgY2FyIGRhdGEKc3RyKHVzZWRjYXJzKQpgYGAKYGBge3J9CiNFeHBsb3JpbmcgbnVtZXJpYyB2YXJpYWJsZXMKIyBzdW1tYXJpemUgbnVtZXJpYyB2YXJpYWJsZXMKc3VtbWFyeSh1c2VkY2FycyR5ZWFyKQpgYGAKYGBge3J9CnN1bW1hcnkodXNlZGNhcnNbYygicHJpY2UiLCAibWlsZWFnZSIpXSkKYGBgCmBgYHtyfQojIGNhbGN1bGF0ZSB0aGUgbWVhbiBpbmNvbWUKKDM2MDAwICsgNDQwMDAgKyA1NjAwMCkgLyAzCmBgYApgYGB7cn0KbWVhbihjKDM2MDAwLCA0NDAwMCwgNTYwMDApKQpgYGAKYGBge3J9CiMgdGhlIG1lZGlhbiBpbmNvbWUKbWVkaWFuKGMoMzYwMDAsIDQ0MDAwLCA1NjAwMCkpCmBgYApgYGB7cn0KIyB0aGUgbWluL21heCBvZiB1c2VkIGNhciBwcmljZXMKcmFuZ2UodXNlZGNhcnMkcHJpY2UpCmBgYApgYGB7cn0KIyB0aGUgZGlmZmVyZW5jZSBvZiB0aGUgcmFuZ2UKZGlmZihyYW5nZSh1c2VkY2FycyRwcmljZSkpCmBgYApgYGB7cn0KIyBJUVIgZm9yIHVzZWQgY2FyIHByaWNlcwpJUVIodXNlZGNhcnMkcHJpY2UpCmBgYApgYGB7cn0KIyB1c2UgcXVhbnRpbGUgdG8gY2FsY3VsYXRlIGZpdmUtbnVtYmVyIHN1bW1hcnkKcXVhbnRpbGUodXNlZGNhcnMkcHJpY2UpCmBgYApgYGB7cn0KIyB0aGUgOTl0aCBwZXJjZW50aWxlCnF1YW50aWxlKHVzZWRjYXJzJHByaWNlLCBwcm9icyA9IGMoMC4wMSwgMC45OSkpCmBgYApgYGB7cn0KIyBxdWludGlsZXMKcXVhbnRpbGUodXNlZGNhcnMkcHJpY2UsIHNlcShmcm9tID0gMCwgdG8gPSAxLCBieSA9IDAuMjApKQpgYGAKYGBge3J9CiMgYm94cGxvdCBvZiB1c2VkIGNhciBwcmljZXMgYW5kIG1pbGVhZ2UKYm94cGxvdCh1c2VkY2FycyRwcmljZSwgbWFpbj0iQm94cGxvdCBvZiBVc2VkIENhciBQcmljZXMiLAogICAgICB5bGFiPSJQcmljZSAoJCkiKQpgYGAKYGBge3J9CiMgVGhpcyBib3hwbG90IHByb3ZpZGVzIGEgdmlzdWFsIHN1bW1hcnkgb2YgdGhlIGRpc3RyaWJ1dGlvbiBvZiB1c2VkIGNhciBwcmljZXMgaW4gdGhlIGRhdGFzZXQuIAojIFRoZSBib3ggcmVwcmVzZW50cyB0aGUgaW50ZXJxdWFydGlsZSByYW5nZSAoSVFSKSwgd2hpY2ggY29udGFpbnMgdGhlIG1pZGRsZSA1MCUgb2YgdGhlIGRhdGEsIAojIHdpdGggdGhlIGhvcml6b250YWwgbGluZSBpbnNpZGUgdGhlIGJveCBpbmRpY2F0aW5nIHRoZSBtZWRpYW4gcHJpY2UuIAojIFRo
ZSB3aGlza2VycyBleHRlbmQgdG8gdGhlIG1pbmltdW0gYW5kIG1heGltdW0gdmFsdWVzIHdpdGhpbiAxLjUgdGltZXMgdGhlIElRUiwgCiMgd2hpbGUgdGhlIGRvdHMgYmVsb3cgdGhlIHdoaXNrZXJzIHJlcHJlc2VudCBvdXRsaWVyc+KAlHByaWNlcyB0aGF0IGFyZSBzaWduaWZpY2FudGx5IGxvd2VyIHRoYW4gdGhlIHJlc3QuIAoKYGBgCgpgYGB7cn0KCmJveHBsb3QodXNlZGNhcnMkbWlsZWFnZSwgbWFpbj0iQm94cGxvdCBvZiBVc2VkIENhciBNaWxlYWdlIiwKICAgICAgeWxhYj0iT2RvbWV0ZXIgKG1pLikiKQpgYGAKYGBge3J9CiMgVGhpcyBib3hwbG90IHZpc3VhbGl6ZXMgdGhlIGRpc3RyaWJ1dGlvbiBvZiB1c2VkIGNhciBtaWxlYWdlIChpbiBtaWxlcykuIAojIFRoZSBib3ggcmVwcmVzZW50cyB0aGUgaW50ZXJxdWFydGlsZSByYW5nZSAoSVFSKSwgc2hvd2luZyB0aGUgbWlkZGxlIDUwJSBvZiB0aGUgZGF0YSwgCiMgd2hpbGUgdGhlIGhvcml6b250YWwgbGluZSBpbnNpZGUgdGhlIGJveCBtYXJrcyB0aGUgbWVkaWFuIG1pbGVhZ2UuIAojIFRoaXMgYm94cGxvdCBzaG93cyB0aGUgZGlzdHJpYnV0aW9uIG9mIHVzZWQgY2FyIG1pbGVhZ2UsIGhpZ2hsaWdodGluZyB0aGUgbWVkaWFuLCBpbnRlcnF1YXJ0aWxlIHJhbmdlLCBhbmQgcG90ZW50aWFsIG91dGxpZXJzLgoKYGBgCgpgYGB7cn0KIyBoaXN0b2dyYW1zIG9mIHVzZWQgY2FyIHByaWNlcyBhbmQgbWlsZWFnZQpoaXN0KHVzZWRjYXJzJHByaWNlLCBtYWluID0gIkhpc3RvZ3JhbSBvZiBVc2VkIENhciBQcmljZXMiLAogICAgIHhsYWIgPSAiUHJpY2UgKCQpIikKYGBgCmBgYHtyfQojIFRoaXMgaGlzdG9ncmFtIGRpc3BsYXlzIHRoZSBkaXN0cmlidXRpb24gb2YgdXNlZCBjYXIgcHJpY2VzIGluIGRvbGxhcnMuIAojIFRoZSB4LWF4aXMgcmVwcmVzZW50cyBjYXIgcHJpY2VzLCBncm91cGVkIGludG8gYmlucywgd2hpbGUgdGhlIHktYXhpcyBzaG93cyB0aGUgZnJlcXVlbmN5IG9mIGNhcnMgaW4gZWFjaCBwcmljZSByYW5nZS4gCiMgVGhlIHBsb3QgaGVscHMgaWRlbnRpZnkgdGhlIHNwcmVhZCBhbmQgY2VudHJhbCB0ZW5kZW5jeSBvZiBjYXIgcHJpY2VzLCB3aXRoIHRoZSBtYWpvcml0eSBjbHVzdGVyZWQgYXJvdW5kICQxMCwwMDAgdG8gJDE1LDAwMC4gCiMgSXQgYWxzbyBoaWdobGlnaHRzIGFueSBnYXBzIG9yIGNvbmNlbnRyYXRpb24gb2YgcHJpY2VzIGluIHNwZWNpZmljIHJhbmdlcywgYWlkaW5nIGluIHVuZGVyc3RhbmRpbmcgcHJpY2UgdmFyaWFiaWxpdHkuCgpgYGAKCmBgYHtyfQoKaGlzdCh1c2VkY2FycyRtaWxlYWdlLCBtYWluID0gIkhpc3RvZ3JhbSBvZiBVc2VkIENhciBNaWxlYWdlIiwKICAgICB4bGFiID0gIk9kb21ldGVyIChtaS4pIikKYGBgCmBgYHtyfQojIHZhcmlhbmNlIGFuZCBzdGFuZGFyZCBkZXZpYXRpb24gb2YgdGhlIHVzZWQgY2FyIGRhdGEKdmFyKHVzZWRjYXJzJHByaWNlKQpgYGAKYGBge3J9CnNkKHVzZWRjYXJzJHByaWNlKQpgYGAKYGBge3J9
CnZhcih1c2VkY2FycyRtaWxlYWdlKQpgYGAKYGBge3J9CnNkKHVzZWRjYXJzJG1pbGVhZ2UpCmBgYApgYGB7cn0KI0V4cGxvcmluZyBudW1lcmljIHZhcmlhYmxlcwojIG9uZS13YXkgdGFibGVzIGZvciB0aGUgdXNlZCBjYXIgZGF0YQp0YWJsZSh1c2VkY2FycyR5ZWFyKQpgYGAKYGBge3J9CnRhYmxlKHVzZWRjYXJzJG1vZGVsKQpgYGAKYGBge3J9CnRhYmxlKHVzZWRjYXJzJGNvbG9yKQpgYGAKYGBge3J9CiMgY29tcHV0ZSB0YWJsZSBwcm9wb3J0aW9ucwptb2RlbF90YWJsZSA8LSB0YWJsZSh1c2VkY2FycyRtb2RlbCkKcHJvcC50YWJsZShtb2RlbF90YWJsZSkKYGBgCmBgYHtyfQojIHJvdW5kIHRoZSBkYXRhCmNvbG9yX3RhYmxlIDwtIHRhYmxlKHVzZWRjYXJzJGNvbG9yKQpjb2xvcl9wY3QgPC0gcHJvcC50YWJsZShjb2xvcl90YWJsZSkgKiAxMDAKcm91bmQoY29sb3JfcGN0LCBkaWdpdHMgPSAxKQpgYGAKYGBge3J9CiNFeHBsb3JpbmcgcmVsYXRpb25zaGlwcyBiZXR3ZWVuIHZhcmlhYmxlcyAKIyBzY2F0dGVycGxvdCBvZiBwcmljZSB2cy4gbWlsZWFnZQpwbG90KHggPSB1c2VkY2FycyRtaWxlYWdlLCB5ID0gdXNlZGNhcnMkcHJpY2UsCiAgICAgbWFpbiA9ICJTY2F0dGVycGxvdCBvZiBQcmljZSB2cy4gTWlsZWFnZSIsCiAgICAgeGxhYiA9ICJVc2VkIENhciBPZG9tZXRlciAobWkuKSIsCiAgICAgeWxhYiA9ICJVc2VkIENhciBQcmljZSAoJCkiKQpgYGAKYGBge3J9CiMgVGhpcyBzY2F0dGVycGxvdCBzaG93cyB0aGUgcmVsYXRpb25zaGlwIGJldHdlZW4gdXNlZCBjYXIgb2RvbWV0ZXIgcmVhZGluZ3MgKGluIG1pbGVzKSBhbmQgdGhlaXIgcHJpY2VzIChpbiBkb2xsYXJzKS4gCiMgRWFjaCBwb2ludCByZXByZXNlbnRzIGEgY2FyLCB3aXRoIHRoZSB4LWF4aXMgaW5kaWNhdGluZyB0aGUgbWlsZWFnZSBhbmQgdGhlIHktYXhpcyBzaG93aW5nIHRoZSBwcmljZS4gCiMgVGhlIHBsb3QgaWxsdXN0cmF0ZXMgYSBuZWdhdGl2ZSB0cmVuZCwgd2hlcmUgaGlnaGVyIG1pbGVhZ2UgdGVuZHMgdG8gY29ycmVzcG9uZCB0byBsb3dlciBwcmljZXMuIAoKYGBgCgpgYGB7cn0KIyBuZXcgdmFyaWFibGUgaW5kaWNhdGluZyBjb25zZXJ2YXRpdmUgY29sb3JzCnVzZWRjYXJzJGNvbnNlcnZhdGl2ZSA8LQogIHVzZWRjYXJzJGNvbG9yICVpbiUgYygiQmxhY2siLCAiR3JheSIsICJTaWx2ZXIiLCAiV2hpdGUiKQpgYGAKYGBge3J9CiMgY2hlY2tpbmcgb3VyIHZhcmlhYmxlCnRhYmxlKHVzZWRjYXJzJGNvbnNlcnZhdGl2ZSkKCmBgYApgYGB7cn0KaW5zdGFsbC5wYWNrYWdlcygiZ21vZGVscyIpCmBgYApgYGB7cn0KIyBDcm9zc3RhYiBvZiBjb25zZXJ2YXRpdmUgYnkgbW9kZWwKbGlicmFyeShnbW9kZWxzKQpgYGAKYGBge3J9CkNyb3NzVGFibGUoeCA9IHVzZWRjYXJzJG1vZGVsLCB5ID0gdXNlZGNhcnMkY29uc2VydmF0aXZlKQpgYGAKYGBge3J9CgpgYGAKCgoKCgoKCgo=