This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
plot(cars)
"C:/Users/Adria/Downloads"
[1] "C:/Users/Adria/Downloads"
# Load the used car data from an explicit path instead of calling setwd():
# a working-directory change made inside a notebook chunk is reset when the
# chunk finishes, so a relative path only works if it is read in that same
# chunk. Building the full path keeps the read independent of the current
# working directory.
data_dir <- "C:/Users/Adria/Downloads"
usedcars <- read.csv(file.path(data_dir, "usedcars.csv"),
                     stringsAsFactors = FALSE)
# get structure of used car data
str(usedcars)
'data.frame': 150 obs. of 6 variables:
$ year : int 2011 2011 2011 2011 2012 2010 2011 2010 2011 2010 ...
$ model : chr "SEL" "SEL" "SEL" "SEL" ...
$ price : int 21992 20995 19995 17809 17500 17495 17000 16995 16995 16995 ...
$ mileage : int 7413 10926 7351 11613 8367 25125 27393 21026 32655 36116 ...
$ color : chr "Yellow" "Gray" "Silver" "Gray" ...
$ transmission: chr "AUTO" "AUTO" "AUTO" "AUTO" ...
# summarize numeric variables
summary(usedcars$year)
Min. 1st Qu. Median Mean 3rd Qu.
2000 2008 2009 2009 2010
Max.
2012
summary(usedcars[c("price", "mileage", "year", "transmission")])
price mileage
Min. : 3800 Min. : 4867
1st Qu.:10995 1st Qu.: 27200
Median :13592 Median : 36385
Mean :12962 Mean : 44261
3rd Qu.:14904 3rd Qu.: 55125
Max. :21992 Max. :151479
year transmission
Min. :2000 Length:150
1st Qu.:2008 Class :character
Median :2009 Mode :character
Mean :2009
3rd Qu.:2010
Max. :2012
mean(c(36000, 44000, 56000))
[1] 45333.33
# the min/max of used car prices
range(usedcars$price)
[1] 3800 21992
# the difference of the range
diff(range(usedcars$price))
[1] 18192
# IQR for used car prices
IQR(usedcars$price)
[1] 3909.5
# use quantile to calculate five-number summary
quantile(usedcars$price)
0% 25% 50% 75% 100%
3800.0 10995.0 13591.5 14904.5 21992.0
# the 1st and 99th percentiles
quantile(usedcars$price, probs = c(0.01, 0.99))
1% 99%
5428.69 20505.00
# quintiles
quantile(usedcars$price, seq(from = 0, to = 1, by = 0.20))
0% 20% 40% 60% 80%
3800.0 10759.4 12993.8 13992.0 14999.0
100%
21992.0
# boxplots showing the spread of used car price and of odometer reading
boxplot(
  usedcars[["price"]],
  main = "Boxplot of Used Car Prices",
  ylab = "Price ($)"
)
boxplot(
  usedcars[["mileage"]],
  main = "Boxplot of Used Car Mileage",
  ylab = "Odometer (mi.)"
)
# histograms of used car prices and mileage
hist(usedcars$price, main = "Histogram of Used Car Prices",
     xlab = "Price ($)")
# the second histogram covers mileage (previously the price histogram was
# drawn twice and mileage was never plotted)
hist(usedcars$mileage, main = "Histogram of Used Car Mileage",
     xlab = "Odometer (mi.)")
# variance and standard deviation of the used car data
var(usedcars$price)
[1] 9749892
sd(usedcars$price)
[1] 3122.482
var(usedcars$mileage)
[1] 728033954
sd(usedcars$mileage)
[1] 26982.1
# EXPLORING CATEGORICAL VARIABLES
# one-way tables for the used car data
table(usedcars$year)
2000 2001 2002 2003 2004 2005 2006 2007
3 1 1 1 3 2 6 11
2008 2009 2010 2011 2012
14 42 49 16 1
table(usedcars$model)
SE SEL SES
78 23 49
# share of cars per model, as a fraction of all rows
model_table <- table(usedcars$model)
proportions(model_table)
SE SEL SES
0.5200000 0.1533333 0.3266667
# share of cars per color, as a percentage rounded to one decimal place
color_table <- table(usedcars$color)
color_pct <- proportions(color_table) * 100
round(color_pct, digits = 1)
Black Blue Gold Gray Green
23.3 11.3 0.7 10.7 3.3
Red Silver White Yellow
16.7 21.3 10.7 2.0
# price (y axis) plotted against odometer reading (x axis)
with(
  usedcars,
  plot(
    x = mileage,
    y = price,
    main = "Scatterplot of Price vs. Mileage",
    xlab = "Used Car Odometer (mi.)",
    ylab = "Used Car Price ($)"
  )
)
# flag each car as TRUE when its color is one of the conservative colors
conservative_colors <- c("Black", "Gray", "Silver", "White")
usedcars$conservative <- is.element(usedcars$color, conservative_colors)
# count how many cars fall on each side of the flag
table(usedcars$conservative)
FALSE TRUE
51 99
# Install gmodels only when it is not already available, so re-running the
# notebook does not download and reinstall the package every time.
if (!requireNamespace("gmodels", quietly = TRUE)) {
  install.packages("gmodels")
}
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:
https://cran.rstudio.com/bin/windows/Rtools/
Installing package into ‘C:/Users/Adria/AppData/Local/R/win-library/4.4’
(as ‘lib’ is unspecified)
also installing the dependencies ‘gtools’, ‘gdata’
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.4/gtools_3.9.5.zip'
Content type 'application/zip' length 368663 bytes (360 KB)
downloaded 360 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.4/gdata_3.0.1.zip'
Content type 'application/zip' length 505048 bytes (493 KB)
downloaded 493 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.4/gmodels_2.19.1.zip'
Content type 'application/zip' length 119350 bytes (116 KB)
downloaded 116 KB
package ‘gtools’ successfully unpacked and MD5 sums checked
package ‘gdata’ successfully unpacked and MD5 sums checked
package ‘gmodels’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\Adria\AppData\Local\Temp\RtmpMJxHc5\downloaded_packages
# Crosstab of conservative by model
# Two-way table with model in the rows and the conservative-color flag in the
# columns; each cell reports the count, its chi-square contribution, and the
# row, column, and table proportions.
library(gmodels)
# The printed row/column headers are taken from the argument expressions
# (usedcars$model, usedcars$conservative), so keep the call in this form if
# the output labels matter.
CrossTable(x = usedcars$model, y = usedcars$conservative)
Cell Contents
|-------------------------|
| N |
| Chi-square contribution |
| N / Row Total |
| N / Col Total |
| N / Table Total |
|-------------------------|
Total Observations in Table: 150
| usedcars$conservative
usedcars$model | FALSE | TRUE | Row Total |
---------------|-----------|-----------|-----------|
SE | 27 | 51 | 78 |
| 0.009 | 0.004 | |
| 0.346 | 0.654 | 0.520 |
| 0.529 | 0.515 | |
| 0.180 | 0.340 | |
---------------|-----------|-----------|-----------|
SEL | 7 | 16 | 23 |
| 0.086 | 0.044 | |
| 0.304 | 0.696 | 0.153 |
| 0.137 | 0.162 | |
| 0.047 | 0.107 | |
---------------|-----------|-----------|-----------|
SES | 17 | 32 | 49 |
| 0.007 | 0.004 | |
| 0.347 | 0.653 | 0.327 |
| 0.333 | 0.323 | |
| 0.113 | 0.213 | |
---------------|-----------|-----------|-----------|
Column Total | 51 | 99 | 150 |
| 0.340 | 0.660 | |
---------------|-----------|-----------|-----------|