if (!require(mlba)) {
  library(devtools)
  install_github("gedeck/mlba/mlba", force=TRUE)
}

options(dplyr.summarise.inform = FALSE)
library(tidyverse)

Data Summaries

Example 1: House Prices in Boston

boston.housing.df <- mlba::BostonHousing
head(boston.housing.df, 9)
#>      CRIM   ZN INDUS CHAS   NOX    RM   AGE    DIS RAD TAX PTRATIO LSTAT MEDV
#> 1 0.00632 18.0  2.31    0 0.538 6.575  65.2 4.0900   1 296    15.3  4.98 24.0
#> 2 0.02731  0.0  7.07    0 0.469 6.421  78.9 4.9671   2 242    17.8  9.14 21.6
#> 3 0.02729  0.0  7.07    0 0.469 7.185  61.1 4.9671   2 242    17.8  4.03 34.7
#> 4 0.03237  0.0  2.18    0 0.458 6.998  45.8 6.0622   3 222    18.7  2.94 33.4
#> 5 0.06905  0.0  2.18    0 0.458 7.147  54.2 6.0622   3 222    18.7  5.33 36.2
#> 6 0.02985  0.0  2.18    0 0.458 6.430  58.7 6.0622   3 222    18.7  5.21 28.7
#> 7 0.08829 12.5  7.87    0 0.524 6.012  66.6 5.5605   5 311    15.2 12.43 22.9
#> 8 0.14455 12.5  7.87    0 0.524 6.172  96.1 5.9505   5 311    15.2 19.15 27.1
#> 9 0.21124 12.5  7.87    0 0.524 5.631 100.0 6.0821   5 311    15.2 29.93 16.5
#>   CAT.MEDV
#> 1        0
#> 2        0
#> 3        1
#> 4        1
#> 5        1
#> 6        0
#> 7        0
#> 8        0
#> 9        0
summary(boston.housing.df)
#>       CRIM                ZN             INDUS            CHAS        
#>  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
#>  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
#>  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
#>  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
#>  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
#>  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
#>       NOX               RM             AGE              DIS        
#>  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
#>  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
#>  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
#>  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
#>  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
#>  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
#>       RAD              TAX           PTRATIO          LSTAT      
#>  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   : 1.73  
#>  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.: 6.95  
#>  Median : 5.000   Median :330.0   Median :19.05   Median :11.36  
#>  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :12.65  
#>  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:16.95  
#>  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :37.97  
#>       MEDV          CAT.MEDV    
#>  Min.   : 5.00   Min.   :0.000  
#>  1st Qu.:17.02   1st Qu.:0.000  
#>  Median :21.20   Median :0.000  
#>  Mean   :22.53   Mean   :0.166  
#>  3rd Qu.:25.00   3rd Qu.:0.000  
#>  Max.   :50.00   Max.   :1.000
# compute mean, standard dev., min, max, median, length, and missing values of CRIM
mean(boston.housing.df$CRIM)
#> [1] 3.613524
sd(boston.housing.df$CRIM)
#> [1] 8.601545
min(boston.housing.df$CRIM)
#> [1] 0.00632
max(boston.housing.df$CRIM)
#> [1] 88.9762
median(boston.housing.df$CRIM)
#> [1] 0.25651
length(boston.housing.df$CRIM)
#> [1] 506
# find the number of missing values of variable CRIM
sum(is.na(boston.housing.df$CRIM))
#> [1] 0
# compute mean, standard dev., min, max, median, length, and missing values for all
# variables
data.frame(mean=sapply(boston.housing.df, mean),
           sd=sapply(boston.housing.df, sd),
           min=sapply(boston.housing.df, min),
           max=sapply(boston.housing.df, max),
           median=sapply(boston.housing.df, median),
           length=sapply(boston.housing.df, length),
           miss.val=sapply(boston.housing.df,
                           function(x) sum(length(which(is.na(x))))))
#>                  mean          sd       min      max    median length miss.val
#> CRIM       3.61352356   8.6015451   0.00632  88.9762   0.25651    506        0
#> ZN        11.36363636  23.3224530   0.00000 100.0000   0.00000    506        0
#> INDUS     11.13677866   6.8603529   0.46000  27.7400   9.69000    506        0
#> CHAS       0.06916996   0.2539940   0.00000   1.0000   0.00000    506        0
#> NOX        0.55469506   0.1158777   0.38500   0.8710   0.53800    506        0
#> RM         6.28463439   0.7026171   3.56100   8.7800   6.20850    506        0
#> AGE       68.57490119  28.1488614   2.90000 100.0000  77.50000    506        0
#> DIS        3.79504269   2.1057101   1.12960  12.1265   3.20745    506        0
#> RAD        9.54940711   8.7072594   1.00000  24.0000   5.00000    506        0
#> TAX      408.23715415 168.5371161 187.00000 711.0000 330.00000    506        0
#> PTRATIO   18.45553360   2.1649455  12.60000  22.0000  19.05000    506        0
#> LSTAT     12.65306324   7.1410615   1.73000  37.9700  11.36000    506        0
#> MEDV      22.53280632   9.1971041   5.00000  50.0000  21.20000    506        0
#> CAT.MEDV   0.16600791   0.3724560   0.00000   1.0000   0.00000    506        0

Summary Statistics

round(cor(boston.housing.df),2)
#>           CRIM    ZN INDUS  CHAS   NOX    RM   AGE   DIS   RAD   TAX PTRATIO
#> CRIM      1.00 -0.20  0.41 -0.06  0.42 -0.22  0.35 -0.38  0.63  0.58    0.29
#> ZN       -0.20  1.00 -0.53 -0.04 -0.52  0.31 -0.57  0.66 -0.31 -0.31   -0.39
#> INDUS     0.41 -0.53  1.00  0.06  0.76 -0.39  0.64 -0.71  0.60  0.72    0.38
#> CHAS     -0.06 -0.04  0.06  1.00  0.09  0.09  0.09 -0.10 -0.01 -0.04   -0.12
#> NOX       0.42 -0.52  0.76  0.09  1.00 -0.30  0.73 -0.77  0.61  0.67    0.19
#> RM       -0.22  0.31 -0.39  0.09 -0.30  1.00 -0.24  0.21 -0.21 -0.29   -0.36
#> AGE       0.35 -0.57  0.64  0.09  0.73 -0.24  1.00 -0.75  0.46  0.51    0.26
#> DIS      -0.38  0.66 -0.71 -0.10 -0.77  0.21 -0.75  1.00 -0.49 -0.53   -0.23
#> RAD       0.63 -0.31  0.60 -0.01  0.61 -0.21  0.46 -0.49  1.00  0.91    0.46
#> TAX       0.58 -0.31  0.72 -0.04  0.67 -0.29  0.51 -0.53  0.91  1.00    0.46
#> PTRATIO   0.29 -0.39  0.38 -0.12  0.19 -0.36  0.26 -0.23  0.46  0.46    1.00
#> LSTAT     0.46 -0.41  0.60 -0.05  0.59 -0.61  0.60 -0.50  0.49  0.54    0.37
#> MEDV     -0.39  0.36 -0.48  0.18 -0.43  0.70 -0.38  0.25 -0.38 -0.47   -0.51
#> CAT.MEDV -0.15  0.37 -0.37  0.11 -0.23  0.64 -0.19  0.12 -0.20 -0.27   -0.44
#>          LSTAT  MEDV CAT.MEDV
#> CRIM      0.46 -0.39    -0.15
#> ZN       -0.41  0.36     0.37
#> INDUS     0.60 -0.48    -0.37
#> CHAS     -0.05  0.18     0.11
#> NOX       0.59 -0.43    -0.23
#> RM       -0.61  0.70     0.64
#> AGE       0.60 -0.38    -0.19
#> DIS      -0.50  0.25     0.12
#> RAD       0.49 -0.38    -0.20
#> TAX       0.54 -0.47    -0.27
#> PTRATIO   0.37 -0.51    -0.44
#> LSTAT     1.00 -0.74    -0.47
#> MEDV     -0.74  1.00     0.79
#> CAT.MEDV -0.47  0.79     1.00

Aggregation and Pivot Tables

boston.housing.df <- mlba::BostonHousing
table(boston.housing.df$CHAS)
#> 
#>   0   1 
#> 471  35
# tidyverse version
boston.housing.df %>% count(CHAS)
#>   CHAS   n
#> 1    0 471
#> 2    1  35
# create bins of size 1
boston.housing.df <- boston.housing.df %>%
  mutate(RM.bin = cut(RM, c(1:9), labels=FALSE))

# compute the average of MEDV by (binned) RM and CHAS
# in aggregate() use the argument by= to define the list of aggregating variables,
# and FUN= as an aggregating function.
aggregate(boston.housing.df$MEDV, by=list(RM=boston.housing.df$RM.bin,
          CHAS=boston.housing.df$CHAS), FUN=mean)
#>    RM CHAS        x
#> 1   3    0 25.30000
#> 2   4    0 15.40714
#> 3   5    0 17.20000
#> 4   6    0 21.76917
#> 5   7    0 35.96444
#> 6   8    0 45.70000
#> 7   5    1 22.21818
#> 8   6    1 25.91875
#> 9   7    1 44.06667
#> 10  8    1 35.95000
# tidyverse version
boston.housing.df %>%
  group_by(RM.bin, CHAS) %>%
  summarise(mean(MEDV))
#> # A tibble: 10 × 3
#> # Groups:   RM.bin [6]
#>    RM.bin  CHAS `mean(MEDV)`
#>     <int> <int>        <dbl>
#>  1      3     0         25.3
#>  2      4     0         15.4
#>  3      5     0         17.2
#>  4      5     1         22.2
#>  5      6     0         21.8
#>  6      6     1         25.9
#>  7      7     0         36.0
#>  8      7     1         44.1
#>  9      8     0         45.7
#> 10      8     1         36.0
# use install.packages("reshape") the first time the package is used
library(reshape)
boston.housing.df <- mlba::BostonHousing
# create bins of size 1
boston.housing.df <- boston.housing.df %>%
  mutate(RM.bin = cut(RM, c(1:9), labels=FALSE))

# use melt() to stack a set of columns into a single column of data.
# stack MEDV values for each combination of (binned) RM and CHAS
mlt <- melt(boston.housing.df, id=c("RM.bin", "CHAS"), measure=c("MEDV"))
head(mlt, 5)
#>   RM.bin CHAS variable value
#> 1      6    0     MEDV  24.0
#> 2      6    0     MEDV  21.6
#> 3      7    0     MEDV  34.7
#> 4      6    0     MEDV  33.4
#> 5      7    0     MEDV  36.2
# use cast() to reshape data and generate pivot table
cast(mlt, RM.bin ~ CHAS, subset=variable=="MEDV",
     margins=c("grand_row", "grand_col"), mean)
#>   RM.bin        0        1    (all)
#> 1      3 25.30000      NaN 25.30000
#> 2      4 15.40714      NaN 15.40714
#> 3      5 17.20000 22.21818 17.55159
#> 4      6 21.76917 25.91875 22.01599
#> 5      7 35.96444 44.06667 36.91765
#> 6      8 45.70000 35.95000 44.20000
#> 7  (all) 22.09384 28.44000 22.53281
# tidyverse version
boston.housing.df %>%
  group_by(RM.bin, CHAS) %>%
  summarize(mean=mean(MEDV)) %>%
  spread(CHAS, mean)
#> # A tibble: 6 × 3
#> # Groups:   RM.bin [6]
#>   RM.bin   `0`   `1`
#>    <int> <dbl> <dbl>
#> 1      3  25.3  NA  
#> 2      4  15.4  NA  
#> 3      5  17.2  22.2
#> 4      6  21.8  25.9
#> 5      7  36.0  44.1
#> 6      8  45.7  36.0

Reducing the Number of Categories in Categorical Variables

boston.housing.df <- mlba::BostonHousing

tbl <- table(boston.housing.df$CAT.MEDV, boston.housing.df$ZN)
prop.tbl <- prop.table(tbl, margin=2)
barplot(prop.tbl, xlab="ZN", ylab="", yaxt="n",main="Distribution of CAT.MEDV by ZN")
axis(2, at=(seq(0,1, 0.2)), paste(seq(0,100,20), "%"))

library(tidyverse)
df <- data.frame(prop.tbl)
ggplot(df, aes(x=Var2, y=Freq, group=Var1, fill=Var1)) +
  geom_bar(stat="identity", color="grey", width=1) +
  scale_y_continuous(labels = scales::percent, expand=expansion()) +
  scale_fill_manual("CAT.MEDV", values=c("0"="#eeeeee", "1"="darkgrey")) +
  labs(x="ZN", y="", title="Distribution of CAT.MEDV by ZN")

g <- last_plot() + theme_bw()
  ggsave(file=file.path(getwd(), "figures", "chapter_04", "reduction-pivot-bar.pdf"),
         g, width=9, height=4, units="in")
library(forecast)
tru.data <- mlba::ToysRUsRevenues
tru.ts <- ts(tru.data[, 3], start = c(1992, 1), end = c(1995, 4), freq = 4)
autoplot(tru.ts) +
  geom_point(size=0.5) +
  labs(x="Time", y="Revenue ($ millions)") +
  theme_bw()

ggsave(file=file.path(getwd(), "figures", "chapter_04", "ToysRUs.pdf"),
       last_plot())

Principal Components Analysis

Example 2: Breakfast Cereals

library(tidyverse)
cereals.df <- mlba::Cereals %>% select(calories, rating)
# compute PCs on two dimensions
pcs <- prcomp(cereals.df %>% select(calories, rating))
summary(pcs)
#> Importance of components:
#>                            PC1    PC2
#> Standard deviation     22.3165 8.8844
#> Proportion of Variance  0.8632 0.1368
#> Cumulative Proportion   0.8632 1.0000
pcs$rot
#>                 PC1       PC2
#> calories  0.8470535 0.5315077
#> rating   -0.5315077 0.8470535
scores <- pcs$x
head(scores, 5)
#>             PC1        PC2
#> [1,] -44.921528  2.1971833
#> [2,]  15.725265 -0.3824165
#> [3,] -40.149935 -5.4072123
#> [4,] -75.310772 12.9991256
#> [5,]   7.041508 -5.3576857
getPCaxis <- function(f, pcs, pcLabel) {
  return (data.frame(
    rbind(pcs$center + f * pcs$rotation[, pcLabel],
          pcs$center - f * pcs$rotation[, pcLabel]))
  )
}
PC1 <- getPCaxis(90, pcs, "PC1")
PC2 <- getPCaxis(50, pcs, "PC2")
ggplot(cereals.df, aes(x=calories, y=rating)) +
  geom_point() +
  geom_line(data=PC1) +
  geom_line(data=PC2) +
  coord_cartesian(xlim=c(0, 200), ylim=c(0, 110)) +
  labs(x="Calories", y="Rating") +
  annotate(geom="text", x=30, y=80, label="z[1]",parse=TRUE) +
  annotate(geom="text", x=120, y=80, label="z[2]",parse=TRUE) +
  theme_bw()

ggsave(file=file.path(getwd(), "figures", "chapter_04", "pca_subset.pdf"),
       width=5, height=3, last_plot())

Principal Components

# load and preprocess the data
cereals.df <- mlba::Cereals %>%
  column_to_rownames("name") %>%
  select(-c(mfr, type)) %>%
  drop_na()

pcs <- prcomp(cereals.df)
summary(pcs)
#> Importance of components:
#>                            PC1     PC2      PC3      PC4     PC5     PC6
#> Standard deviation     83.7641 70.9143 22.64375 19.18148 8.42323 2.09167
#> Proportion of Variance  0.5395  0.3867  0.03943  0.02829 0.00546 0.00034
#> Cumulative Proportion   0.5395  0.9262  0.96560  0.99389 0.99935 0.99968
#>                            PC7     PC8     PC9    PC10   PC11    PC12      PC13
#> Standard deviation     1.69942 0.77963 0.65783 0.37043 0.1864 0.06302 5.334e-08
#> Proportion of Variance 0.00022 0.00005 0.00003 0.00001 0.0000 0.00000 0.000e+00
#> Cumulative Proportion  0.99991 0.99995 0.99999 1.00000 1.0000 1.00000 1.000e+00
pcs$rotation[,1:5]
#>                    PC1           PC2           PC3           PC4          PC5
#> calories  0.0779841812  0.0093115874 -0.6292057595 -0.6010214629  0.454958508
#> protein  -0.0007567806 -0.0088010282 -0.0010261160  0.0031999095  0.056175970
#> fat      -0.0001017834 -0.0026991522 -0.0161957859 -0.0252622140 -0.016098458
#> sodium    0.9802145422 -0.1408957901  0.1359018583 -0.0009680741  0.013948118
#> fiber    -0.0054127550 -0.0306807512  0.0181910456  0.0204721894  0.013605026
#> carbo     0.0172462607  0.0167832981 -0.0173699816  0.0259482087  0.349266966
#> sugars    0.0029888631  0.0002534853 -0.0977049979 -0.1154809105 -0.299066459
#> potass   -0.1349000039 -0.9865619808 -0.0367824989 -0.0421757390 -0.047150529
#> vitamins  0.0942933187 -0.0167288404 -0.6919777623  0.7141179984 -0.037008623
#> shelf    -0.0015414195 -0.0043603994 -0.0124888415  0.0056471836 -0.007876459
#> weight    0.0005120017 -0.0009992138 -0.0038059565 -0.0025464145  0.003022113
#> cups      0.0005101111  0.0015910125 -0.0006943214  0.0009853800  0.002148458
#> rating   -0.0752962922 -0.0717421528  0.3079471212  0.3345338994  0.757708025

Normalizing the Data

# Use function prcomp() with scale. = T to run PCA on normalized data
pcs.cor <- prcomp(cereals.df, scale. = T)

summary(pcs.cor)
#> Importance of components:
#>                           PC1    PC2    PC3     PC4    PC5     PC6     PC7
#> Standard deviation     1.9062 1.7743 1.3818 1.00969 0.9947 0.84974 0.81946
#> Proportion of Variance 0.2795 0.2422 0.1469 0.07842 0.0761 0.05554 0.05166
#> Cumulative Proportion  0.2795 0.5217 0.6685 0.74696 0.8231 0.87861 0.93026
#>                            PC8     PC9    PC10    PC11    PC12      PC13
#> Standard deviation     0.64515 0.56192 0.30301 0.25194 0.13897 1.499e-08
#> Proportion of Variance 0.03202 0.02429 0.00706 0.00488 0.00149 0.000e+00
#> Cumulative Proportion  0.96228 0.98657 0.99363 0.99851 1.00000 1.000e+00
pcs.cor$rotation[,1:5]
#>                  PC1        PC2          PC3         PC4         PC5
#> calories  0.29954236  0.3931479 -0.114857453  0.20435870  0.20389885
#> protein  -0.30735632  0.1653233 -0.277281953  0.30074318  0.31974897
#> fat       0.03991542  0.3457243  0.204890102  0.18683311  0.58689327
#> sodium    0.18339651  0.1372205 -0.389431009  0.12033726 -0.33836424
#> fiber    -0.45349036  0.1798119 -0.069766079  0.03917361 -0.25511906
#> carbo     0.19244902 -0.1494483 -0.562452458  0.08783547  0.18274252
#> sugars    0.22806849  0.3514345  0.355405174 -0.02270716 -0.31487243
#> potass   -0.40196429  0.3005442 -0.067620183  0.09087843 -0.14836048
#> vitamins  0.11598020  0.1729092 -0.387858660 -0.60411064 -0.04928672
#> shelf    -0.17126336  0.2650503  0.001531036 -0.63887859  0.32910135
#> weight    0.05029930  0.4503085 -0.247138314  0.15342874 -0.22128334
#> cups      0.29463553 -0.2122479 -0.139999705  0.04748909  0.12081645
#> rating   -0.43837841 -0.2515389 -0.181842433  0.03831622  0.05758420
library(ggrepel)
ggplot(data.frame(pcs.cor$x), aes(x=PC1, y=PC2, label=rownames(pcs.cor$x))) +
  geom_point(shape=21) +
  geom_text_repel(size=2, max.overlaps=7) +
  theme_bw()

f <- 1.3
ggsave(file=file.path(getwd(), "figures", "chapter_04", "pca_full.pdf"),
       width=f * 4, height=f * 5, last_plot())

Dimension Reduction Using Classification and Regression Trees

Using Principal Components for Classification and Prediction

wine.df <- mlba::Wine %>% select(-Type)
  pcs.cor <- prcomp(wine.df)
  summary(pcs.cor)
#> Importance of components:
#>                             PC1      PC2     PC3     PC4     PC5     PC6    PC7
#> Standard deviation     314.9632 13.13527 3.07215 2.23409 1.10853 0.91710 0.5282
#> Proportion of Variance   0.9981  0.00174 0.00009 0.00005 0.00001 0.00001 0.0000
#> Cumulative Proportion    0.9981  0.99983 0.99992 0.99997 0.99998 0.99999 1.0000
#>                           PC8    PC9   PC10   PC11   PC12    PC13
#> Standard deviation     0.3891 0.3348 0.2678 0.1938 0.1452 0.09057
#> Proportion of Variance 0.0000 0.0000 0.0000 0.0000 0.0000 0.00000
#> Cumulative Proportion  1.0000 1.0000 1.0000 1.0000 1.0000 1.00000
  pcs.cor$rotation[,1:4]
#>                                PC1           PC2          PC3          PC4
#> Alcohol              -0.0016592647 -1.203406e-03 -0.016873809  0.141446778
#> Malic_Acid            0.0006810156 -2.154982e-03 -0.122003373  0.160389543
#> Ash                  -0.0001949057 -4.593693e-03 -0.051987430 -0.009772810
#> Ash_Alcalinity        0.0046713006 -2.645039e-02 -0.938593003 -0.330965260
#> Magnesium            -0.0178680075 -9.993442e-01  0.029780248 -0.005393756
#> Total_Phenols        -0.0009898297 -8.779622e-04  0.040484644 -0.074584656
#> Flavanoids           -0.0015672883  5.185073e-05  0.085443339 -0.169086724
#> Nonflavanoid_Phenols  0.0001230867  1.354479e-03 -0.013510780  0.010805561
#> Proanthocyanins      -0.0006006078 -5.004400e-03  0.024659382 -0.050120952
#> Color_Intensity      -0.0023271432 -1.510035e-02 -0.291398464  0.878893693
#> Hue                  -0.0001713800  7.626731e-04  0.025977662 -0.060034945
#> OD280_OD315          -0.0007049316  3.495364e-03  0.070323969 -0.178200254
#> Proline              -0.9998229365  1.777381e-02 -0.004528682 -0.003112916