Data Summaries
Example 1: House Prices in Boston
boston.housing.df <- mlba::BostonHousing
head(boston.housing.df, 9)
#> CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO LSTAT MEDV
#> 1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
#> 2 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
#> 3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
#> 4 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
#> 5 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
#> 6 0.02985 0.0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
#> 7 0.08829 12.5 7.87 0 0.524 6.012 66.6 5.5605 5 311 15.2 12.43 22.9
#> 8 0.14455 12.5 7.87 0 0.524 6.172 96.1 5.9505 5 311 15.2 19.15 27.1
#> 9 0.21124 12.5 7.87 0 0.524 5.631 100.0 6.0821 5 311 15.2 29.93 16.5
#> CAT.MEDV
#> 1 0
#> 2 0
#> 3 1
#> 4 1
#> 5 1
#> 6 0
#> 7 0
#> 8 0
#> 9 0
summary(boston.housing.df)
#> CRIM ZN INDUS CHAS
#> Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
#> 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
#> Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
#> Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
#> 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
#> Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
#> NOX RM AGE DIS
#> Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
#> 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
#> Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
#> Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
#> 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
#> Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
#> RAD TAX PTRATIO LSTAT
#> Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
#> 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
#> Median : 5.000 Median :330.0 Median :19.05 Median :11.36
#> Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
#> 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
#> Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
#> MEDV CAT.MEDV
#> Min. : 5.00 Min. :0.000
#> 1st Qu.:17.02 1st Qu.:0.000
#> Median :21.20 Median :0.000
#> Mean :22.53 Mean :0.166
#> 3rd Qu.:25.00 3rd Qu.:0.000
#> Max. :50.00 Max. :1.000
# compute mean, standard dev., min, max, median, length, and missing values of CRIM
mean(boston.housing.df$CRIM)
#> [1] 3.613524
sd(boston.housing.df$CRIM)
#> [1] 8.601545
min(boston.housing.df$CRIM)
#> [1] 0.00632
max(boston.housing.df$CRIM)
#> [1] 88.9762
median(boston.housing.df$CRIM)
#> [1] 0.25651
length(boston.housing.df$CRIM)
#> [1] 506
# find the number of missing values of variable CRIM
sum(is.na(boston.housing.df$CRIM))
#> [1] 0
# compute mean, standard dev., min, max, median, length, and missing values for all
# variables
data.frame(mean=sapply(boston.housing.df, mean),
           sd=sapply(boston.housing.df, sd),
           min=sapply(boston.housing.df, min),
           max=sapply(boston.housing.df, max),
           median=sapply(boston.housing.df, median),
           length=sapply(boston.housing.df, length),
           miss.val=sapply(boston.housing.df, function(x) sum(is.na(x))))
#> mean sd min max median length miss.val
#> CRIM 3.61352356 8.6015451 0.00632 88.9762 0.25651 506 0
#> ZN 11.36363636 23.3224530 0.00000 100.0000 0.00000 506 0
#> INDUS 11.13677866 6.8603529 0.46000 27.7400 9.69000 506 0
#> CHAS 0.06916996 0.2539940 0.00000 1.0000 0.00000 506 0
#> NOX 0.55469506 0.1158777 0.38500 0.8710 0.53800 506 0
#> RM 6.28463439 0.7026171 3.56100 8.7800 6.20850 506 0
#> AGE 68.57490119 28.1488614 2.90000 100.0000 77.50000 506 0
#> DIS 3.79504269 2.1057101 1.12960 12.1265 3.20745 506 0
#> RAD 9.54940711 8.7072594 1.00000 24.0000 5.00000 506 0
#> TAX 408.23715415 168.5371161 187.00000 711.0000 330.00000 506 0
#> PTRATIO 18.45553360 2.1649455 12.60000 22.0000 19.05000 506 0
#> LSTAT 12.65306324 7.1410615 1.73000 37.9700 11.36000 506 0
#> MEDV 22.53280632 9.1971041 5.00000 50.0000 21.20000 506 0
#> CAT.MEDV 0.16600791 0.3724560 0.00000 1.0000 0.00000 506 0
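An equivalent table can be produced with tidyverse verbs; the following is a sketch (not part of the original code) that reshapes the data to long format and summarizes each variable.
# tidyverse sketch: one row of summary statistics per variable
# (note: rows come out in alphabetical order of variable name)
library(tidyverse)
boston.housing.df %>%
  pivot_longer(everything(), names_to="variable") %>%
  group_by(variable) %>%
  summarise(mean=mean(value), sd=sd(value), min=min(value), max=max(value),
            median=median(value), length=n(), miss.val=sum(is.na(value)))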
Summary Statistics
round(cor(boston.housing.df),2)
#> CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO
#> CRIM 1.00 -0.20 0.41 -0.06 0.42 -0.22 0.35 -0.38 0.63 0.58 0.29
#> ZN -0.20 1.00 -0.53 -0.04 -0.52 0.31 -0.57 0.66 -0.31 -0.31 -0.39
#> INDUS 0.41 -0.53 1.00 0.06 0.76 -0.39 0.64 -0.71 0.60 0.72 0.38
#> CHAS -0.06 -0.04 0.06 1.00 0.09 0.09 0.09 -0.10 -0.01 -0.04 -0.12
#> NOX 0.42 -0.52 0.76 0.09 1.00 -0.30 0.73 -0.77 0.61 0.67 0.19
#> RM -0.22 0.31 -0.39 0.09 -0.30 1.00 -0.24 0.21 -0.21 -0.29 -0.36
#> AGE 0.35 -0.57 0.64 0.09 0.73 -0.24 1.00 -0.75 0.46 0.51 0.26
#> DIS -0.38 0.66 -0.71 -0.10 -0.77 0.21 -0.75 1.00 -0.49 -0.53 -0.23
#> RAD 0.63 -0.31 0.60 -0.01 0.61 -0.21 0.46 -0.49 1.00 0.91 0.46
#> TAX 0.58 -0.31 0.72 -0.04 0.67 -0.29 0.51 -0.53 0.91 1.00 0.46
#> PTRATIO 0.29 -0.39 0.38 -0.12 0.19 -0.36 0.26 -0.23 0.46 0.46 1.00
#> LSTAT 0.46 -0.41 0.60 -0.05 0.59 -0.61 0.60 -0.50 0.49 0.54 0.37
#> MEDV -0.39 0.36 -0.48 0.18 -0.43 0.70 -0.38 0.25 -0.38 -0.47 -0.51
#> CAT.MEDV -0.15 0.37 -0.37 0.11 -0.23 0.64 -0.19 0.12 -0.20 -0.27 -0.44
#> LSTAT MEDV CAT.MEDV
#> CRIM 0.46 -0.39 -0.15
#> ZN -0.41 0.36 0.37
#> INDUS 0.60 -0.48 -0.37
#> CHAS -0.05 0.18 0.11
#> NOX 0.59 -0.43 -0.23
#> RM -0.61 0.70 0.64
#> AGE 0.60 -0.38 -0.19
#> DIS -0.50 0.25 0.12
#> RAD 0.49 -0.38 -0.20
#> TAX 0.54 -0.47 -0.27
#> PTRATIO 0.37 -0.51 -0.44
#> LSTAT 1.00 -0.74 -0.47
#> MEDV -0.74 1.00 0.79
#> CAT.MEDV -0.47 0.79 1.00
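A heatmap is often easier to scan than the full correlation table; the sketch below (not part of the original code) plots the same matrix with ggplot2.
# heatmap of the correlation matrix (illustrative sketch)
library(ggplot2)
cor.df <- as.data.frame(as.table(round(cor(boston.housing.df), 2)))
ggplot(cor.df, aes(x=Var1, y=Var2, fill=Freq)) +
  geom_tile() +
  scale_fill_gradient2(low="blue", mid="white", high="red", limits=c(-1, 1)) +
  labs(x="", y="", fill="Correlation") +
  theme_bw() +
  theme(axis.text.x=element_text(angle=45, hjust=1))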
Aggregation and Pivot Tables
# load tidyverse (used below for %>%, count(), mutate(), group_by(), and summarise())
library(tidyverse)
boston.housing.df <- mlba::BostonHousing
table(boston.housing.df$CHAS)
#>
#> 0 1
#> 471 35
# tidyverse version
boston.housing.df %>% count(CHAS)
#> CHAS n
#> 1 0 471
#> 2 1 35
# create bins of size 1
boston.housing.df <- boston.housing.df %>%
  mutate(RM.bin = cut(RM, c(1:9), labels=FALSE))
# compute the average of MEDV by (binned) RM and CHAS
# in aggregate() use the argument by= to define the list of aggregating variables,
# and FUN= as an aggregating function.
aggregate(boston.housing.df$MEDV, by=list(RM=boston.housing.df$RM.bin,
                                          CHAS=boston.housing.df$CHAS), FUN=mean)
#> RM CHAS x
#> 1 3 0 25.30000
#> 2 4 0 15.40714
#> 3 5 0 17.20000
#> 4 6 0 21.76917
#> 5 7 0 35.96444
#> 6 8 0 45.70000
#> 7 5 1 22.21818
#> 8 6 1 25.91875
#> 9 7 1 44.06667
#> 10 8 1 35.95000
# tidyverse version
boston.housing.df %>%
  group_by(RM.bin, CHAS) %>%
  summarise(mean(MEDV))
#> # A tibble: 10 × 3
#> # Groups: RM.bin [6]
#> RM.bin CHAS `mean(MEDV)`
#> <int> <int> <dbl>
#> 1 3 0 25.3
#> 2 4 0 15.4
#> 3 5 0 17.2
#> 4 5 1 22.2
#> 5 6 0 21.8
#> 6 6 1 25.9
#> 7 7 0 36.0
#> 8 7 1 44.1
#> 9 8 0 45.7
#> 10 8 1 36.0
# use install.packages("reshape") the first time the package is used
library(reshape)
boston.housing.df <- mlba::BostonHousing
# create bins of size 1
boston.housing.df <- boston.housing.df %>%
  mutate(RM.bin = cut(RM, c(1:9), labels=FALSE))
# use melt() to stack a set of columns into a single column of data.
# stack MEDV values for each combination of (binned) RM and CHAS
mlt <- melt(boston.housing.df, id=c("RM.bin", "CHAS"), measure=c("MEDV"))
head(mlt, 5)
#> RM.bin CHAS variable value
#> 1 6 0 MEDV 24.0
#> 2 6 0 MEDV 21.6
#> 3 7 0 MEDV 34.7
#> 4 6 0 MEDV 33.4
#> 5 7 0 MEDV 36.2
# use cast() to reshape data and generate pivot table
cast(mlt, RM.bin ~ CHAS, subset=variable=="MEDV",
margins=c("grand_row", "grand_col"), mean)
#> RM.bin 0 1 (all)
#> 1 3 25.30000 NaN 25.30000
#> 2 4 15.40714 NaN 15.40714
#> 3 5 17.20000 22.21818 17.55159
#> 4 6 21.76917 25.91875 22.01599
#> 5 7 35.96444 44.06667 36.91765
#> 6 8 45.70000 35.95000 44.20000
#> 7 (all) 22.09384 28.44000 22.53281
# tidyverse version
boston.housing.df %>%
  group_by(RM.bin, CHAS) %>%
  summarize(mean=mean(MEDV)) %>%
  spread(CHAS, mean)
#> # A tibble: 6 × 3
#> # Groups: RM.bin [6]
#> RM.bin `0` `1`
#> <int> <dbl> <dbl>
#> 1 3 25.3 NA
#> 2 4 15.4 NA
#> 3 5 17.2 22.2
#> 4 6 21.8 25.9
#> 5 7 36.0 44.1
#> 6 8 45.7 36.0
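In current versions of tidyr, spread() is superseded by pivot_wider(); an equivalent formulation of the pivot table, assuming tidyr >= 1.0.0 and dplyr >= 1.0.0, is shown below.
# same pivot table using pivot_wider() instead of the superseded spread()
boston.housing.df %>%
  group_by(RM.bin, CHAS) %>%
  summarize(mean=mean(MEDV), .groups="drop") %>%
  pivot_wider(names_from=CHAS, values_from=mean)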
Reducing the Number of Categories in Categorical Variables
boston.housing.df <- mlba::BostonHousing
tbl <- table(boston.housing.df$CAT.MEDV, boston.housing.df$ZN)
prop.tbl <- prop.table(tbl, margin=2)
barplot(prop.tbl, xlab="ZN", ylab="", yaxt="n",main="Distribution of CAT.MEDV by ZN")
axis(2, at=(seq(0,1, 0.2)), paste(seq(0,100,20), "%"))

library(tidyverse)
df <- data.frame(prop.tbl)
ggplot(df, aes(x=Var2, y=Freq, group=Var1, fill=Var1)) +
  geom_bar(stat="identity", color="grey", width=1) +
  scale_y_continuous(labels=scales::percent, expand=expansion()) +
  scale_fill_manual("CAT.MEDV", values=c("0"="#eeeeee", "1"="darkgrey")) +
  labs(x="ZN", y="", title="Distribution of CAT.MEDV by ZN")

g <- last_plot() + theme_bw()
ggsave(file=file.path(getwd(), "figures", "chapter_04", "reduction-pivot-bar.pdf"),
g, width=9, height=4, units="in")
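The barchart can guide the merging of rare ZN categories; one possible way to collapse them (a sketch, not from the original code) uses forcats::fct_lump_n(), which keeps the most frequent levels and lumps the rest into an "Other" level.
# collapse rare ZN categories into "Other" (illustrative; forcats ships with the tidyverse)
boston.housing.df %>%
  mutate(ZN.reduced=forcats::fct_lump_n(factor(ZN), n=5)) %>%
  count(ZN.reduced)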
library(forecast)
tru.data <- mlba::ToysRUsRevenues
tru.ts <- ts(tru.data[, 3], start = c(1992, 1), end = c(1995, 4), freq = 4)
autoplot(tru.ts) +
  geom_point(size=0.5) +
  labs(x="Time", y="Revenue ($ millions)") +
  theme_bw()

ggsave(file=file.path(getwd(), "figures", "chapter_04", "ToysRUs.pdf"),
last_plot())
Principal Components Analysis
Example 2: Breakfast Cereals
library(tidyverse)
cereals.df <- mlba::Cereals %>% select(calories, rating)
# compute PCs on two dimensions
pcs <- prcomp(cereals.df %>% select(calories, rating))
summary(pcs)
#> Importance of components:
#> PC1 PC2
#> Standard deviation 22.3165 8.8844
#> Proportion of Variance 0.8632 0.1368
#> Cumulative Proportion 0.8632 1.0000
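The proportions of variance reported by summary() are simply the squared component standard deviations normalized to sum to one.
# proportion of variance explained, computed directly from the component sdevs
pcs$sdev^2 / sum(pcs$sdev^2)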
pcs$rotation
#> PC1 PC2
#> calories 0.8470535 0.5315077
#> rating -0.5315077 0.8470535
scores <- pcs$x
head(scores, 5)
#> PC1 PC2
#> [1,] -44.921528 2.1971833
#> [2,] 15.725265 -0.3824165
#> [3,] -40.149935 -5.4072123
#> [4,] -75.310772 12.9991256
#> [5,] 7.041508 -5.3576857
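The scores are the mean-centered data multiplied by the rotation matrix; the following check (a sketch, relying on prcomp()'s default of centering without scaling) reproduces the first five rows of the scores.
# verify: scores = (data - column means) %*% rotation
head(scale(as.matrix(cereals.df), center=pcs$center, scale=FALSE) %*% pcs$rotation, 5)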
# two points defining a principal component axis: center +/- f * loading vector
getPCaxis <- function(f, pcs, pcLabel) {
  data.frame(rbind(pcs$center + f * pcs$rotation[, pcLabel],
                   pcs$center - f * pcs$rotation[, pcLabel]))
}
PC1 <- getPCaxis(90, pcs, "PC1")
PC2 <- getPCaxis(50, pcs, "PC2")
ggplot(cereals.df, aes(x=calories, y=rating)) +
  geom_point() +
  geom_line(data=PC1) +
  geom_line(data=PC2) +
  coord_cartesian(xlim=c(0, 200), ylim=c(0, 110)) +
  labs(x="Calories", y="Rating") +
  annotate(geom="text", x=30, y=80, label="z[1]", parse=TRUE) +
  annotate(geom="text", x=120, y=80, label="z[2]", parse=TRUE) +
  theme_bw()

ggsave(file=file.path(getwd(), "figures", "chapter_04", "pca_subset.pdf"),
width=5, height=3, last_plot())
Principal Components
# load and preprocess the data
cereals.df <- mlba::Cereals %>%
  column_to_rownames("name") %>%
  select(-c(mfr, type)) %>%
  drop_na()
pcs <- prcomp(cereals.df)
summary(pcs)
#> Importance of components:
#> PC1 PC2 PC3 PC4 PC5 PC6
#> Standard deviation 83.7641 70.9143 22.64375 19.18148 8.42323 2.09167
#> Proportion of Variance 0.5395 0.3867 0.03943 0.02829 0.00546 0.00034
#> Cumulative Proportion 0.5395 0.9262 0.96560 0.99389 0.99935 0.99968
#> PC7 PC8 PC9 PC10 PC11 PC12 PC13
#> Standard deviation 1.69942 0.77963 0.65783 0.37043 0.1864 0.06302 5.334e-08
#> Proportion of Variance 0.00022 0.00005 0.00003 0.00001 0.0000 0.00000 0.000e+00
#> Cumulative Proportion 0.99991 0.99995 0.99999 1.00000 1.0000 1.00000 1.000e+00
pcs$rotation[,1:5]
#> PC1 PC2 PC3 PC4 PC5
#> calories 0.0779841812 0.0093115874 -0.6292057595 -0.6010214629 0.454958508
#> protein -0.0007567806 -0.0088010282 -0.0010261160 0.0031999095 0.056175970
#> fat -0.0001017834 -0.0026991522 -0.0161957859 -0.0252622140 -0.016098458
#> sodium 0.9802145422 -0.1408957901 0.1359018583 -0.0009680741 0.013948118
#> fiber -0.0054127550 -0.0306807512 0.0181910456 0.0204721894 0.013605026
#> carbo 0.0172462607 0.0167832981 -0.0173699816 0.0259482087 0.349266966
#> sugars 0.0029888631 0.0002534853 -0.0977049979 -0.1154809105 -0.299066459
#> potass -0.1349000039 -0.9865619808 -0.0367824989 -0.0421757390 -0.047150529
#> vitamins 0.0942933187 -0.0167288404 -0.6919777623 0.7141179984 -0.037008623
#> shelf -0.0015414195 -0.0043603994 -0.0124888415 0.0056471836 -0.007876459
#> weight 0.0005120017 -0.0009992138 -0.0038059565 -0.0025464145 0.003022113
#> cups 0.0005101111 0.0015910125 -0.0006943214 0.0009853800 0.002148458
#> rating -0.0752962922 -0.0717421528 0.3079471212 0.3345338994 0.757708025
Normalizing the Data
# Use function prcomp() with scale.=TRUE to run PCA on normalized data
pcs.cor <- prcomp(cereals.df, scale.=TRUE)
summary(pcs.cor)
#> Importance of components:
#> PC1 PC2 PC3 PC4 PC5 PC6 PC7
#> Standard deviation 1.9062 1.7743 1.3818 1.00969 0.9947 0.84974 0.81946
#> Proportion of Variance 0.2795 0.2422 0.1469 0.07842 0.0761 0.05554 0.05166
#> Cumulative Proportion 0.2795 0.5217 0.6685 0.74696 0.8231 0.87861 0.93026
#> PC8 PC9 PC10 PC11 PC12 PC13
#> Standard deviation 0.64515 0.56192 0.30301 0.25194 0.13897 1.499e-08
#> Proportion of Variance 0.03202 0.02429 0.00706 0.00488 0.00149 0.000e+00
#> Cumulative Proportion 0.96228 0.98657 0.99363 0.99851 1.00000 1.000e+00
pcs.cor$rotation[,1:5]
#> PC1 PC2 PC3 PC4 PC5
#> calories 0.29954236 0.3931479 -0.114857453 0.20435870 0.20389885
#> protein -0.30735632 0.1653233 -0.277281953 0.30074318 0.31974897
#> fat 0.03991542 0.3457243 0.204890102 0.18683311 0.58689327
#> sodium 0.18339651 0.1372205 -0.389431009 0.12033726 -0.33836424
#> fiber -0.45349036 0.1798119 -0.069766079 0.03917361 -0.25511906
#> carbo 0.19244902 -0.1494483 -0.562452458 0.08783547 0.18274252
#> sugars 0.22806849 0.3514345 0.355405174 -0.02270716 -0.31487243
#> potass -0.40196429 0.3005442 -0.067620183 0.09087843 -0.14836048
#> vitamins 0.11598020 0.1729092 -0.387858660 -0.60411064 -0.04928672
#> shelf -0.17126336 0.2650503 0.001531036 -0.63887859 0.32910135
#> weight 0.05029930 0.4503085 -0.247138314 0.15342874 -0.22128334
#> cups 0.29463553 -0.2122479 -0.139999705 0.04748909 0.12081645
#> rating -0.43837841 -0.2515389 -0.181842433 0.03831622 0.05758420
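A scree plot of the component variances can help decide how many principal components to retain; a minimal sketch using base R's screeplot() follows.
# scree plot of the variances of the normalized-data components (illustrative)
screeplot(pcs.cor, npcs=13, type="lines", main="Scree plot")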
library(ggrepel)
ggplot(data.frame(pcs.cor$x), aes(x=PC1, y=PC2, label=rownames(pcs.cor$x))) +
  geom_point(shape=21) +
  geom_text_repel(size=2, max.overlaps=7) +
  theme_bw()

f <- 1.3
ggsave(file=file.path(getwd(), "figures", "chapter_04", "pca_full.pdf"),
width=f * 4, height=f * 5, last_plot())