fileName <- 
 "http://fisher.stats.uwo.ca/faculty/aim/2018/3859A/data/WiscLottery.csv"
readLines(fileName, 16)
##  [1] "#Data is descibed in Frees' textbook (p. 23 and Table 3.11, p.102)"   
##  [2] "#State of Wisconsin lottery sales from 50 randomly selected areas"    
##  [3] "#identified by their zip codes. Socio-demographic variables that"     
##  [4] "#may be of interest to administrators are also provided."             
##  [5] "#ZIP - zip code"                                                      
##  [6] "#PERPERHN - persons per household"                                    
##  [7] "#MEDSCHYR - median years of schooling"                                
##  [8] "#MEDHVL- median home value in $1000's for owner occupied homes"       
##  [9] "#PRCRENT - percent of housing that is owner occupied"                 
## [10] "#PRC55P - percent of population that is over 55"                      
## [11] "#HHMEDAGE - household median age"                                     
## [12] "#MEDINC - estimated median income in $1,000"                          
## [13] "#SALES - online lottery sales to individuals"                         
## [14] "#POP - population in thousands"                                       
## [15] "ZIP,PERPERHH,MEDSCHYR,MEDHVL,PRCRENT,PRC55P,HHMEDAGE,MEDINC,SALES,POP"
## [16] "53003,3,12.6,71.3,21,38,48,54.2,1285.4,435"
wdata <- read_csv(fileName, skip=14)
## Parsed with column specification:
## cols(
##   ZIP = col_integer(),
##   PERPERHH = col_double(),
##   MEDSCHYR = col_double(),
##   MEDHVL = col_double(),
##   PRCRENT = col_integer(),
##   PRC55P = col_integer(),
##   HHMEDAGE = col_integer(),
##   MEDINC = col_double(),
##   SALES = col_double(),
##   POP = col_integer()
## )
wdata
## # A tibble: 50 x 10
##      ZIP PERPERHH MEDSCHYR MEDHVL PRCRENT PRC55P HHMEDAGE MEDINC  SALES
##    <int>    <dbl>    <dbl>  <dbl>   <int>  <int>    <int>  <dbl>  <dbl>
##  1 53003      3       12.6   71.3      21     38       48   54.2  1285.
##  2 53033      3.2     12.9   98         6     28       46   70.7  3571.
##  3 53038      2.8     12.4   58.7      25     35       45   43.6  2407.
##  4 53059      3.1     12.5   65.7      24     29       45   51.9  1224.
##  5 53072      2.6     13.1   96.7      32     27       42   63.1 15046.
##  6 53083      2.7     12.8   66.4      25     38       48   55.7  9129.
##  7 53095      2.8     12.9   91        31     37       48   54.9 33181.
##  8 53098      2.9     12.5   61        26     40       50   46.9  2243.
##  9 53104      2.8     12.8   91.5      18     35       48   62.3 21588.
## 10 53172      2.6     12.7   68.8      37     39       47   49.1 15693.
## # ... with 40 more rows, and 1 more variable: POP <int>
wdata %>%
  ggplot(aes(x=POP, y=SALES)) +
  geom_point(size=1.5, shape=1, stroke=1.25) +
  geom_smooth(method="lm", se=FALSE) + 
  geom_rug() +
  ggtitle(label=NULL, subtitle="Random Sample of 50 Zip Codes") +
  labs(x="Poplulation", y="Sales in $")

wdata %>%
  mutate(
    logPOP = log(POP),
    logSALES = log(SALES)
  ) %>%
  ggplot(aes(x=logPOP, y=logSALES)) +
  geom_point(size=1.5, shape=1, stroke=1.25) +
  geom_smooth(method="lm", se=FALSE) + 
  geom_rug() +
  ggtitle(label=NULL, subtitle="Random Sample of 50 Zip Codes") +
  labs(x="log Poplulation", y="log Sales in $")
## Warning: package 'bindrcpp' was built under R version 3.4.4

Let’s drill down and find the observations corresponding to the 5 largest ratios of SALES/POP. We see that the second largest ratio corresponds to the ZIP code 53003. Using Google we find that this for the small town of Ashippun.

wdata2 <- 
 wdata %>%
   mutate(
  logPOP = log(POP),
  logSALES = log(SALES),
  SALESPerCapita=SALES/POP
 ) %>%
 arrange(desc(SALESPerCapita)) %>%
 select(ZIP, contains("POP"), contains("SALES"))
 wdata2[1:5,]
## # A tibble: 5 x 6
##     ZIP   POP logPOP  SALES logSALES SALESPerCapita
##   <int> <int>  <dbl>  <dbl>    <dbl>          <dbl>
## 1 53104  4464   8.40 21588.     9.98           4.84
## 2 53003   435   6.08  1285.     7.16           2.95
## 3 53952  1217   7.10  2001.     7.60           1.64
## 4 53072 13337   9.50 15046.     9.62           1.13
## 5 53520  5825   8.67  6209.     8.73           1.07

Here is one way to find the number of distinct elements in each column.

map_int(map(wdata, unique), length)
##      ZIP PERPERHH MEDSCHYR   MEDHVL  PRCRENT   PRC55P HHMEDAGE   MEDINC 
##       50       11       13       47       26       25       16       48 
##    SALES      POP 
##       50       50

But using the tidyverse function, dplyr::n_distinct(), provides a better solution.

map_int(wdata, n_distinct)
##      ZIP PERPERHH MEDSCHYR   MEDHVL  PRCRENT   PRC55P HHMEDAGE   MEDINC 
##       50       11       13       47       26       25       16       48 
##    SALES      POP 
##       50       50

Note that we should think of a dataframe as well as the tibble wdata has a specialized list - definitely not a matrix!

class(wdata)
## [1] "tbl_df"     "tbl"        "data.frame"
m <- as.matrix.data.frame(wdata)
class(m)
## [1] "matrix"

Here m and wdata both look like matrices - all columns are numeric - but m is a matrix and wdata is a list.

Tabulate per capita sales

Let’s examine per capita sales by median age. For each distinct value of HHMEDAGE we find the mean per capita lottery sales over all zip codes in this group. Also we find the standard deviaion and the number of zip codes corresponding to the specified age.

wdata %>% 
   mutate(SALESPerCapita=SALES/POP) %>%
   group_by(HHMEDAGE) %>%
   summarize(average=mean(SALESPerCapita), sd=sd(SALESPerCapita), n=n())
## # A tibble: 16 x 4
##    HHMEDAGE average       sd     n
##       <int>   <dbl>    <dbl> <int>
##  1       41   0.497 NaN          1
##  2       42   0.878   0.308      3
##  3       44   0.322   0.0841     3
##  4       45   0.624   0.206      5
##  5       46   0.795   0.180      4
##  6       47   0.518   0.327      2
##  7       48   1.40    1.65       8
##  8       49   0.858   0.164      2
##  9       50   0.514   0.180      7
## 10       51   0.538   0.133      4
## 11       52   0.742   0.240      4
## 12       54   0.437   0.280      2
## 13       55   0.251   0.0335     2
## 14       57   0.482 NaN          1
## 15       58   0.879 NaN          1
## 16       59   1.64  NaN          1

Variable selection with bestglm, step and lasso

The package bestglm can be used to find the best subset regression with GLM and OLS. The input dataframe (or tibble) must be arranged in Xy-format, that is the last column must be for the y-variable and the other columns are x-variables. We can use select() with helper adverb everything() to do this

wdata2 <- select(wdata, -SALES, everything(), SALES, -ZIP) %>%
 mutate(logPOP=log(POP),
        logSALES=log(SALES)
        ) %>%
 select(-SALES, -POP)
wdata2
## # A tibble: 50 x 9
##    PERPERHH MEDSCHYR MEDHVL PRCRENT PRC55P HHMEDAGE MEDINC logPOP logSALES
##       <dbl>    <dbl>  <dbl>   <int>  <int>    <int>  <dbl>  <dbl>    <dbl>
##  1      3       12.6   71.3      21     38       48   54.2   6.08     7.16
##  2      3.2     12.9   98         6     28       46   70.7   8.48     8.18
##  3      2.8     12.4   58.7      25     35       45   43.6   7.81     7.79
##  4      3.1     12.5   65.7      24     29       45   51.9   7.63     7.11
##  5      2.6     13.1   96.7      32     27       42   63.1   9.50     9.62
##  6      2.7     12.8   66.4      25     38       48   55.7   9.74     9.12
##  7      2.8     12.9   91        31     37       48   54.9  10.6     10.4 
##  8      2.9     12.5   61        26     40       50   46.9   9.20     7.72
##  9      2.8     12.8   91.5      18     35       48   62.3   8.40     9.98
## 10      2.6     12.7   68.8      37     39       47   49.1   9.95     9.66
## # ... with 40 more rows

Since bestglm() needs a data.frame we need to use as.data.frame(). Using BIC for model selection only 2 inputs are selected.

bestglm::bestglm(as.data.frame(wdata2), IC="BIC")
## BIC
## BICq equivalent for q in (0.174163567335153, 0.655660207058578)
## Best Model:
##                Estimate  Std. Error   t value     Pr(>|t|)
## (Intercept) -0.67865414 0.565514974 -1.200064 2.361287e-01
## MEDHVL       0.01431292 0.005375437  2.662653 1.058145e-02
## logPOP       0.92182625 0.077468167 11.899420 8.753908e-16

With AIC, 3 inputs are selected.

bestglm::bestglm(as.data.frame(wdata2), IC="AIC")
## AIC
## BICq equivalent for q in (0.655660207058578, 0.761245610302138)
## Best Model:
##                Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) -4.35865071 1.87332248 -2.326695 2.444001e-02
## HHMEDAGE     0.05074064 0.02779033  1.825838 7.436996e-02
## MEDINC       0.03795128 0.01213043  3.128601 3.045618e-03
## logPOP       0.95870854 0.07557869 12.684906 1.261417e-16

Let’s compare with backward stagewise regression using BIC selection. Only logPOP is selected. The backward stagewise regression is non-optimal.

ans <- lm(logSALES ~ ., data=wdata2)
step(ans, trace=0, k=nrow(wdata2))
## 
## Call:
## lm(formula = logSALES ~ logPOP, data = wdata2)
## 
## Coefficients:
## (Intercept)       logPOP  
##     -0.7594       1.0285
step(ans, trace=0)
## 
## Call:
## lm(formula = logSALES ~ PERPERHH + MEDSCHYR + MEDHVL + PRC55P + 
##     HHMEDAGE + logPOP, data = wdata2)
## 
## Coefficients:
## (Intercept)     PERPERHH     MEDSCHYR       MEDHVL       PRC55P  
##     2.98601     -0.85773     -0.34725      0.01693     -0.08297  
##    HHMEDAGE       logPOP  
##     0.12749      0.92101

For practice, I only compared with LASSO. Using the caret package with bootstrap cross-validation.

I use the option CACHE=TRUE to save time re-knitting.

set.seed(775513) #for reproducibility!
X <- as.matrix(wdata2[,-ncol(wdata2)]) #need matrix or dataframe
y <- wdata2[,ncol(wdata2), drop=TRUE] #need vector
glmnetGrid <- expand.grid(lambda=seq(0.001, 0.9, length=40), alpha=1)
ansLASSO <- train(x = X, y = y,
              method = "glmnet",
              tuneGrid = glmnetGrid,
              preProc = c("center", "scale"))
tmp <- ansLASSO$bestTune
lambdaHat <- as.numeric(tmp["lambda"])
ind <- as.numeric(row.names(tmp))
(ansLASSO$finalModel)$beta[,ind]
##  PERPERHH  MEDSCHYR    MEDHVL   PRCRENT    PRC55P  HHMEDAGE    MEDINC 
## 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 
##    logPOP 
## 0.5608683

Optimal cross-validated shrinkage parameter 0.1393077.

Interestingly both the subset approaches suggested the \(\beta\) for logPOP was included in the 95% confidence interval. This implies that we could simplify the model by using per capita SALES and removing POP.

ggplot extras

You can find a lot of ggplot extras simply using google. One of the most popular extensions is GGally. From the package Description:

Some of these functions include a pairwise plot matrix, a two group pairwise plot matrix, a parallel coordinates plot, a survival plot, and several functions to plot networks.

The scatterplot matrix extension, GGally::ggpairs(), handles categorical variables as well as numeric as illustrated with an application to the Iris dataset.

GGally::ggpairs(iris)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.