# import the data set and assign it to the variable usedcars
# The file is looked up relative to the current working directory, so start R
# (or open the project) in the folder that holds usedcars.csv. A setwd() call
# hard-coded to one user's Desktop fails on every other machine.
data_file <- "usedcars.csv"
if (!file.exists(data_file)) {
  # fail early with a message that says where the file was expected
  stop("Cannot find ", data_file, " in ", getwd(), call. = FALSE)
}
# keep text columns as character vectors rather than factors
usedcars <- read.csv(data_file, stringsAsFactors = FALSE)
# inspect the structure of the used car data
str(usedcars)
## 'data.frame': 150 obs. of 6 variables:
## $ year : int 2011 2011 2011 2011 2012 2010 2011 2010 2011 2010 ...
## $ model : chr "SEL" "SEL" "SEL" "SEL" ...
## $ price : int 21992 20995 19995 17809 17500 17495 17000 16995 16995 16995 ...
## $ mileage : int 7413 10926 7351 11613 8367 25125 27393 21026 32655 36116 ...
## $ color : chr "Yellow" "Gray" "Silver" "Gray" ...
## $ transmission: chr "AUTO" "AUTO" "AUTO" "AUTO" ...
# summary statistics for the numeric variable year
summary(usedcars[["year"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2000 2008 2009 2009 2010 2012
# the same summary for price and mileage, side by side
summary(usedcars[, c("price", "mileage")])
## price mileage
## Min. : 3800 Min. : 4867
## 1st Qu.:10995 1st Qu.: 27200
## Median :13592 Median : 36385
## Mean :12962 Mean : 44261
## 3rd Qu.:14904 3rd Qu.: 55125
## Max. :21992 Max. :151479
# mean income for the salaries 36,000, 44,000 and 56,000
mean(x = c(36000, 44000, 56000))
## [1] 45333.33
# median income for the same three salaries
median(x = c(36000, 44000, 56000))
## [1] 44000
#in this case, the median is the more robust representation of the dataset: unlike the mean, it is not pulled toward the extreme value (56,000)
# smallest and largest used car price
range(usedcars[["price"]])
## [1] 3800 21992
# width of that range (maximum minus minimum)
diff(range(usedcars[["price"]]))
## [1] 18192
# The width of the range is a simple indicator of how spread out the inventory prices are
# interquartile range of the used car prices
IQR(usedcars[["price"]])
## [1] 3909.5
# The IQR is the distance between the first and third quartiles, i.e. the spread of the middle 50% of the data
# five-number summary via quantile()
quantile(usedcars[["price"]])
## 0% 25% 50% 75% 100%
## 3800.0 10995.0 13591.5 14904.5 21992.0
#This is a quick overview of the data spread: it gives the lowest and highest car prices, the first and third quartiles, and the median (the five-number summary)
# the 1st and 99th percentiles of the used car prices
quantile(x = usedcars[["price"]], probs = c(0.01, 0.99))
## 1% 99%
## 5428.69 20505.00
#These 2 numbers are the 1st and 99th percentiles: about 1% of the prices fall below 5428.69 and about 1% fall above 20505.00
# quintiles (cut points every 20%)
quantile(x = usedcars[["price"]], probs = seq(from = 0, to = 1, by = 0.2))
## 0% 20% 40% 60% 80% 100%
## 3800.0 10759.4 12993.8 13992.0 14999.0 21992.0
#Quintiles are quantiles that split the data into five equal-sized groups, so the cut points increase by 20% instead of the 25% used for quartiles. This gives a slightly finer-grained summary of the data spread
# boxplots of the numeric variables price and mileage
boxplot(
  usedcars[["price"]],
  ylab = "Price ($)", # y-axis label
  main = "Boxplot of Used Car Prices" # plot title
)
boxplot(
  usedcars[["mileage"]],
  ylab = "Odometer (mi.)", # y-axis label
  main = "Boxplot of Used Car Mileage" # plot title
)
# histograms show how the values of a numeric variable are distributed
hist(
  usedcars[["price"]],
  xlab = "Price ($)", # x-axis label
  main = "Histogram of Used Car Prices" # plot title
)
# in this first histogram we can see that most of the prices are within the 12,000 to 15,000 range
hist(
  usedcars[["mileage"]],
  xlab = "Odometer (mi.)", # x-axis label
  main = "Histogram of Used Car Mileage" # plot title
)
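#the histogram is right-skewed: most cars have roughly 20,000 to 60,000 miles (the quartiles are 27,200 and 55,125), no car has fewer than 4,867 miles, and only a few have very high mileage (up to 151,479)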
# variance and standard deviation of the car prices
# variance: the average squared distance of the prices from their mean (var() divides by n - 1)
var(usedcars[["price"]])
## [1] 9749892
# standard deviation: the square root of the variance, expressed in the same units as the prices
sd(usedcars[["price"]])
## [1] 3122.482
# variance and standard deviation of the mileage
var(usedcars[["mileage"]]) # very large because the variance is measured in squared miles
## [1] 728033954
sd(usedcars[["mileage"]]) # back in miles, so it is easier to read as the variability of the data
## [1] 26982.1
# one-way frequency tables for the used car data
table(usedcars[["year"]]) # count of cars per model year
##
## 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
## 3 1 1 1 3 2 6 11 14 42 49 16 1
table(usedcars[["model"]]) # count of cars per model
##
## SE SEL SES
## 78 23 49
table(usedcars[["color"]]) # count of cars per color
##
## Black Blue Gold Gray Green Red Silver White Yellow
## 35 17 1 16 5 25 32 16 3
# proportion of cars per model
model_table <- table(usedcars[["model"]])
prop.table(model_table)
##
## SE SEL SES
## 0.5200000 0.1533333 0.3266667
# this table gives each model's share of the data set; the most common model in the entire inventory is SE (52%)
# color shares as percentages, rounded to one decimal place
color_table <- table(usedcars[["color"]])
color_pct <- 100 * prop.table(color_table)
round(color_pct, 1)
##
## Black Blue Gold Gray Green Red Silver White Yellow
## 23.3 11.3 0.7 10.7 3.3 16.7 21.3 10.7 2.0
# this table shows the percentage of cars per color in the data set; the most common color is Black
# scatterplot of price (y-axis) against mileage (x-axis)
plot(
  usedcars[["mileage"]], usedcars[["price"]],
  xlab = "Used Car Odometer (mi.)",
  ylab = "Used Car Price ($)",
  main = "Scatterplot of Price vs. Mileage"
)
#the dependent variable in this case is the price and the independent variable is the mileage. The plot shows a negative relationship: cars with less mileage tend to be more expensive (this is an association; mileage alone does not determine the price).
# new variable indicating conservative colors
# The indicator is stored as a column of usedcars (not as a separate vector) so
# that it stays aligned with the rows and can be reached as usedcars$conservative.
usedcars$conservative <-
  usedcars$color %in% c("Black", "Gray", "Silver", "White")
usedcars$conservative # TRUE marks a car with a conservative color
## [1] FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## [13] FALSE FALSE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## [25] FALSE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE
## [37] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
## [49] TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE
## [73] TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE
## [85] TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [97] FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE TRUE
## [109] FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE
## [121] TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE
## [133] TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE
## [145] TRUE TRUE FALSE FALSE FALSE FALSE
# checking our variable
table(usedcars$conservative) # there are 99 cars with a conservative color
##
## FALSE TRUE
## 51 99
# Crosstab of conservative by model
library(gmodels)
# rows are the models, columns are the conservative-color indicator
CrossTable(x = usedcars$model, y = usedcars$conservative)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 150
##
##
## | usedcars$conservative
## usedcars$model | FALSE | TRUE | Row Total |
## ---------------|-----------|-----------|-----------|
## SE | 27 | 51 | 78 |
## | 0.009 | 0.004 | |
## | 0.346 | 0.654 | 0.520 |
## | 0.529 | 0.515 | |
## | 0.180 | 0.340 | |
## ---------------|-----------|-----------|-----------|
## SEL | 7 | 16 | 23 |
## | 0.086 | 0.044 | |
## | 0.304 | 0.696 | 0.153 |
## | 0.137 | 0.162 | |
## | 0.047 | 0.107 | |
## ---------------|-----------|-----------|-----------|
## SES | 17 | 32 | 49 |
## | 0.007 | 0.004 | |
## | 0.347 | 0.653 | 0.327 |
## | 0.333 | 0.323 | |
## | 0.113 | 0.213 | |
## ---------------|-----------|-----------|-----------|
## Column Total | 51 | 99 | 150 |
## | 0.340 | 0.660 | |
## ---------------|-----------|-----------|-----------|
##
##
#each cell lists five numbers, in the order given by the "Cell Contents" legend:
#the first number is N, the count of cars for that combination of model and conservative indicator. The six cells add up to 150 cars
#the second is the cell's contribution to the chi-square statistic
#the third, fourth and fifth numbers are the count divided by the row total, the column total and the table total; in the first cell (SE / FALSE) these are 27/78 = 0.346, 27/51 = 0.529 and 27/150 = 0.180