# Load the built-in mtcars dataset and keep a working copy in `df`
data(list = "mtcars")
df <- mtcars
# Preview the first five rows
head(df, n = 5L)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
# Basic dimensions & column names
# Report the row and column counts as one formatted string
sprintf("Rows: %d | Columns: %d", nrow(df), ncol(df))
## [1] "Rows: 32 | Columns: 11"
# List the column names of the data frame
colnames(df)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
# Summary statistics: per-column min, quartiles, median, mean and max
summary(object = df)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
# Custom stats: mean, median, sd, IQR (for a few key vars)
nums <- c("mpg", "hp", "wt")
# Build one single-row data frame per variable, then stack them row-wise.
# na.rm = TRUE matches the complete-observation handling used for the
# correlation matrix: without it a single NA turns mean/median/sd into NA
# and makes IQR() stop with an error.
stats <- lapply(nums, function(v) {
  x <- df[[v]]
  data.frame(
    variable = v,
    mean = mean(x, na.rm = TRUE),
    median = median(x, na.rm = TRUE),
    sd = sd(x, na.rm = TRUE),
    IQR = IQR(x, na.rm = TRUE)
  )
})
stats <- do.call(rbind, stats)
stats
## variable mean median sd IQR
## 1 mpg 20.09062 19.200 6.0269481 7.37500
## 2 hp 146.68750 123.000 68.5628685 83.50000
## 3 wt 3.21725 3.325 0.9784574 1.02875
# Correlation matrix (numeric columns)
# cor() stops with "'x' must be numeric" when the data frame holds a
# non-numeric column, so select the numeric columns explicitly, as the
# section title promises. Rows with missing values are dropped
# ("complete.obs") before the correlations are computed.
cor_mat <- cor(
  df[vapply(df, is.numeric, logical(1))],
  use = "complete.obs"
)
# Print rounded to three decimals for readability
round(cor_mat, 3)
## mpg cyl disp hp drat wt qsec vs am gear
## mpg 1.000 -0.852 -0.848 -0.776 0.681 -0.868 0.419 0.664 0.600 0.480
## cyl -0.852 1.000 0.902 0.832 -0.700 0.782 -0.591 -0.811 -0.523 -0.493
## disp -0.848 0.902 1.000 0.791 -0.710 0.888 -0.434 -0.710 -0.591 -0.556
## hp -0.776 0.832 0.791 1.000 -0.449 0.659 -0.708 -0.723 -0.243 -0.126
## drat 0.681 -0.700 -0.710 -0.449 1.000 -0.712 0.091 0.440 0.713 0.700
## wt -0.868 0.782 0.888 0.659 -0.712 1.000 -0.175 -0.555 -0.692 -0.583
## qsec 0.419 -0.591 -0.434 -0.708 0.091 -0.175 1.000 0.745 -0.230 -0.213
## vs 0.664 -0.811 -0.710 -0.723 0.440 -0.555 0.745 1.000 0.168 0.206
## am 0.600 -0.523 -0.591 -0.243 0.713 -0.692 -0.230 0.168 1.000 0.794
## gear 0.480 -0.493 -0.556 -0.126 0.700 -0.583 -0.213 0.206 0.794 1.000
## carb -0.551 0.527 0.395 0.750 -0.091 0.428 -0.656 -0.570 0.058 0.274
## carb
## mpg -0.551
## cyl 0.527
## disp 0.395
## hp 0.750
## drat -0.091
## wt 0.428
## qsec -0.656
## vs -0.570
## am 0.058
## gear 0.274
## carb 1.000
# Correlation matrix (numeric columns)
# NOTE(review): this section repeats the correlation section above verbatim
# (same call, same printed output); one of the two copies can likely be
# dropped.
# cor() stops with "'x' must be numeric" when the data frame holds a
# non-numeric column, so select the numeric columns explicitly. Rows with
# missing values are dropped ("complete.obs").
cor_mat <- cor(
  df[vapply(df, is.numeric, logical(1))],
  use = "complete.obs"
)
# Print rounded to three decimals for readability
round(cor_mat, 3)
## mpg cyl disp hp drat wt qsec vs am gear
## mpg 1.000 -0.852 -0.848 -0.776 0.681 -0.868 0.419 0.664 0.600 0.480
## cyl -0.852 1.000 0.902 0.832 -0.700 0.782 -0.591 -0.811 -0.523 -0.493
## disp -0.848 0.902 1.000 0.791 -0.710 0.888 -0.434 -0.710 -0.591 -0.556
## hp -0.776 0.832 0.791 1.000 -0.449 0.659 -0.708 -0.723 -0.243 -0.126
## drat 0.681 -0.700 -0.710 -0.449 1.000 -0.712 0.091 0.440 0.713 0.700
## wt -0.868 0.782 0.888 0.659 -0.712 1.000 -0.175 -0.555 -0.692 -0.583
## qsec 0.419 -0.591 -0.434 -0.708 0.091 -0.175 1.000 0.745 -0.230 -0.213
## vs 0.664 -0.811 -0.710 -0.723 0.440 -0.555 0.745 1.000 0.168 0.206
## am 0.600 -0.523 -0.591 -0.243 0.713 -0.692 -0.230 0.168 1.000 0.794
## gear 0.480 -0.493 -0.556 -0.126 0.700 -0.583 -0.213 0.206 0.794 1.000
## carb -0.551 0.527 0.395 0.750 -0.091 0.428 -0.656 -0.570 0.058 0.274
## carb
## mpg -0.551
## cyl 0.527
## disp 0.395
## hp 0.750
## drat -0.091
## wt 0.428
## qsec -0.656
## vs -0.570
## am 0.058
## gear 0.274
## carb 1.000
# Scatter plot of mpg against weight, with the fitted mpg ~ wt line overlaid
plot(
  x = df[["wt"]],
  y = df[["mpg"]],
  main = "MPG vs Weight",
  xlab = "Weight (1000 lbs)",
  ylab = "Miles per Gallon",
  pch = 19
)
# Overlay the simple linear regression of mpg on wt
abline(reg = lm(mpg ~ wt, data = df), lwd = 2)
