I was poking through some job requirements and found an interesting website - StatSoft Textbook. Admittedly, it doesn’t get into the fine details and mathematics I prefer, but it does give an overview of many techniques used in data science.
How can I find the relationship between multiple independent variables and a dependent variable? Obviously, multiple regression.
Before we go straight for multiple (linear) regression, let's try something that is new to me: MARSplines (Multivariate Adaptive Regression Splines), which are mentioned in the StatSoft textbook.
MARSplines is a nonparametric regression procedure that makes no assumption about the underlying functional relationship between the dependent and independent variables. Instead, MARSplines construct the relation from a set of coefficients and basis functions that are driven entirely by the regression data. MARSplines are particularly suitable for problems with higher input dimensions (i.e., with more than 2 variables), where the curse of dimensionality would likely create problems for other techniques.
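To make "basis functions" concrete: MARS builds its fit from hinge functions of the form max(0, x - c) and max(0, c - x), where c is a knot (cut point) chosen from the data. Here is a minimal illustrative sketch of my own (not the package's internals):
# Hinge (basis) functions with a knot at "knot".
hinge_up   <- function(x, knot) pmax(0, x - knot)   # max(0, x - knot)
hinge_down <- function(x, knot) pmax(0, knot - x)   # max(0, knot - x)
# A MARS model is a weighted sum of such terms (and, for degree > 1, their products):
# f(x) = b0 + b1 * hinge_up(x1, c1) + b2 * hinge_down(x1, c1) + ...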
There are at least two packages in R that support MARSplines: earth (which I believe also has a Python port) and mda. Perhaps it is just the available information, but I was able to do much better with the mda package.
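For reference, a roughly equivalent fit with the earth package would look like this (a sketch only, assuming earth is installed; the rest of this post uses mda):
library(earth)
library(MASS)
data(Boston)
# Additive (degree = 1) MARS fit of medv on the other 13 variables.
earth.fit <- earth(medv ~ ., data = Boston, degree = 1)
summary(earth.fit)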
library(MASS)
library(mda)
## Loading required package: class
## Loaded mda 0.4-9
data(Boston)
# FIT AN ADDITIVE MARS MODEL
mars.fit <- mars(Boston[, -14], Boston[14], degree = 1, prune = TRUE, forward.step = TRUE)
# SHOW CUT POINTS OF MARS
cuts <- mars.fit$cuts[mars.fit$selected.terms, ]
dimnames(cuts) <- list(NULL, names(Boston)[-14])
print(cuts)
## crim zn indus chas nox rm age dis rad tax ptratio black
## [1,] 0.00000 0 0 0 0.000 0.000 0 0.0000 0 0 0.0 0.0
## [2,] 0.00000 0 0 0 0.000 0.000 0 0.0000 0 0 0.0 0.0
## [3,] 0.00000 0 0 0 0.000 0.000 0 0.0000 0 0 0.0 0.0
## [4,] 0.00000 0 0 0 0.000 6.430 0 0.0000 0 0 0.0 0.0
## [5,] 0.00000 0 0 0 0.000 0.000 0 0.0000 0 0 14.7 0.0
## [6,] 0.00000 0 0 0 0.000 0.000 0 0.0000 0 0 14.7 0.0
## [7,] 0.00000 0 0 0 0.484 0.000 0 0.0000 0 0 0.0 0.0
## [8,] 4.75237 0 0 0 0.000 0.000 0 0.0000 0 0 0.0 0.0
## [9,] 0.00000 0 0 0 0.000 0.000 0 2.4329 0 0 0.0 0.0
## [10,] 0.00000 0 0 0 0.000 0.000 0 2.4329 0 0 0.0 0.0
## [11,] 11.95110 0 0 0 0.000 0.000 0 0.0000 0 0 0.0 0.0
## [12,] 0.00000 0 0 0 0.000 0.000 0 0.0000 0 289 0.0 0.0
## [13,] 0.00000 0 0 0 0.000 0.000 0 0.0000 0 289 0.0 0.0
## [14,] 0.00000 0 0 0 0.000 0.000 0 0.0000 0 0 0.0 0.0
## [15,] 0.00000 0 0 0 0.000 0.000 0 0.0000 0 0 0.0 393.3
## [16,] 0.00000 0 0 0 0.000 0.000 0 0.0000 0 0 0.0 393.3
## [17,] 0.00000 0 0 0 0.000 7.923 0 0.0000 0 0 0.0 0.0
## [18,] 0.00000 0 0 0 0.000 0.000 0 0.0000 1 0 0.0 0.0
## [19,] 0.00000 0 0 0 0.000 7.412 0 0.0000 0 0 0.0 0.0
## lstat
## [1,] 0.00
## [2,] 6.07
## [3,] 6.07
## [4,] 0.00
## [5,] 0.00
## [6,] 0.00
## [7,] 0.00
## [8,] 0.00
## [9,] 0.00
## [10,] 0.00
## [11,] 0.00
## [12,] 0.00
## [13,] 0.00
## [14,] 25.68
## [15,] 0.00
## [16,] 0.00
## [17,] 0.00
## [18,] 0.00
## [19,] 0.00
factor <- mars.fit$factor[mars.fit$selected.terms, ]
dimnames(factor) <- list(NULL, names(Boston)[-14])
print(factor)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## [1,] 0 0 0 0 0 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0 0 0 0 0 1
## [3,] 0 0 0 0 0 0 0 0 0 0 0 0 -1
## [4,] 0 0 0 0 0 1 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 0 0 1 0 0
## [6,] 0 0 0 0 0 0 0 0 0 0 -1 0 0
## [7,] 0 0 0 0 1 0 0 0 0 0 0 0 0
## [8,] 1 0 0 0 0 0 0 0 0 0 0 0 0
## [9,] 0 0 0 0 0 0 0 1 0 0 0 0 0
## [10,] 0 0 0 0 0 0 0 -1 0 0 0 0 0
## [11,] 1 0 0 0 0 0 0 0 0 0 0 0 0
## [12,] 0 0 0 0 0 0 0 0 0 1 0 0 0
## [13,] 0 0 0 0 0 0 0 0 0 -1 0 0 0
## [14,] 0 0 0 0 0 0 0 0 0 0 0 0 1
## [15,] 0 0 0 0 0 0 0 0 0 0 0 1 0
## [16,] 0 0 0 0 0 0 0 0 0 0 0 -1 0
## [17,] 0 0 0 0 0 1 0 0 0 0 0 0 0
## [18,] 0 0 0 0 0 0 0 0 1 0 0 0 0
## [19,] 0 0 0 0 0 1 0 0 0 0 0 0 0
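To read these two matrices together, each selected term pairs a sign from factor with a knot from cuts. My understanding is that a +1 in factor corresponds to max(0, x - cut) and a -1 to max(0, cut - x); a small helper sketch I'm adding (not part of the original workflow) writes the terms out:
# Turn the cuts and factor matrices into readable hinge-function labels,
# e.g. "max(0, lstat - 6.07)" or "max(0, 6.07 - lstat)".
describe_terms <- function(cuts, factor) {
  vars <- colnames(cuts)
  sapply(seq_len(nrow(cuts)), function(i) {
    j <- which(factor[i, ] != 0)
    if (length(j) == 0) return("(Intercept)")
    paste(ifelse(factor[i, j] > 0,
                 sprintf("max(0, %s - %g)", vars[j], cuts[i, j]),
                 sprintf("max(0, %g - %s)", cuts[i, j], vars[j])),
          collapse = " * ")
  })
}
describe_terms(cuts, factor)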
library(repr)
options(repr.plot.width=10, repr.plot.height=10)
# EXAMINE THE FITTED FUNCTION BETWEEN EACH IV AND DV
par(mfrow = c(4, 4), mar=c(2, 2, 2, 2), mgp=c(1,0,0), pty="m")
for (i in 1:13) {
xp <- matrix(sapply(Boston[1:13], mean), nrow(Boston), ncol(Boston) - 1, byrow = TRUE)
xr <- sapply(Boston, range)
xp[, i] <- seq(xr[1, i], xr[2, i], len=nrow(Boston))
xf <- predict(mars.fit, xp)
plot(xp[, i], xf, xlab = names(Boston)[i], ylab = names(Boston)[14], type = "l")
}
What does this mean? Let's take a closer look at a few of these plots alongside the actual data points for each variable.
par(mfrow = c(2, 2), mar=c(2, 2, 2, 2), mgp=c(1,0,0), pty="m")
intcols <- c(1,2,10,13)
for (i in intcols) {
xp <- matrix(sapply(Boston[1:13], mean), nrow(Boston), ncol(Boston) - 1, byrow = TRUE)
xr <- sapply(Boston, range)
xp[, i] <- seq(xr[1, i], xr[2, i], len=nrow(Boston))
xf <- predict(mars.fit, xp)
plot(Boston[,i], Boston[,14], type="p", xlab=names(Boston)[i], ylab=names(Boston[14]))
plot(xp[, i], xf, xlab = names(Boston)[i], ylab = names(Boston)[14], type = "l")
}
MARSplines then give a quick indication of the type of regression to use and help us understand whether there is perhaps no real relationship at all. Let's try some regression models on these four examples.
Crim
Looking at the MARSpline, it seems like a quadratic function may be best - but I want to try linear regression as well and see what happens.
x <- Boston[,1]
y <- Boston[,14]
crim_linear <- lm(y ~ x)
crim_poly <- lm( y ~ poly(x,2))
crim_poly3 <- lm( y ~ poly(x,3))
fstat <- data.frame(summary(crim_linear)$fstatistic[1], summary(crim_poly)$fstatistic[1],
summary(crim_poly3)$fstatistic[1])
colnames(fstat) <- c('Linear', 'Quadratic', 'Cubic')
row.names(fstat) <- NULL
library(knitr)
kable(fstat, "pandoc", caption="F-Statistic (Lower is Better)", xlab="Crim", ylab="medv")
| Linear | Quadratic | Cubic |
|---|---|---|
| 89.48611 | 66.80333 | 46.56719 |
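As a side check (my addition, not part of the original comparison), the nested polynomial fits can also be compared with sequential F-tests via anova():
# Sequential F-tests: does each added polynomial degree significantly improve the fit?
anova(crim_linear, crim_poly, crim_poly3)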
options(repr.plot.width=5, repr.plot.height=5)
plot(x,y, xlab='crim', ylab='medv')
abline(crim_linear, col='blue')
fx <- with(Boston, seq(min(crim), max(crim), length.out=2000))
lines(fx, predict(crim_poly, data.frame(x=fx)), col='red')
lines(fx, predict(crim_poly3, data.frame(x=fx)), col='green')
Cubic actually gives a better estimate numerically. We don't have enough points to really determine which of the polynomial fits is better.
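One way to settle it with the data we have (a sketch I'm adding, not in the original post) is 10-fold cross-validation of the prediction error for each polynomial degree:
library(boot)
set.seed(1)
# Estimated 10-fold CV mean squared error for medv ~ poly(crim, d), d = 1..3.
cv_error <- rep(0, 3)
for (d in 1:3) {
  fit <- glm(medv ~ poly(crim, d), data = Boston)
  cv_error[d] <- cv.glm(Boston, fit, K = 10)$delta[1]
}
cv_error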
Lstat
x <- Boston[,13]
y <- Boston[,14]
lstat_linear <- lm(y ~ x)
lstat_poly <- lm(y ~ poly(x,2))
lstat_poly3 <- lm(y ~ poly(x,3))
fstat <- data.frame(summary(lstat_linear)$fstatistic[1], summary(lstat_poly)$fstatistic[1],
summary(lstat_poly3)$fstatistic[1])
colnames(fstat) <- c('Linear', 'Quadratic', 'Cubic')
row.names(fstat) <- NULL
kable(fstat, "pandoc", caption="F-Statistic for medv ~ lstat (Lower is Better)")
| Linear | Quadratic | Cubic |
|---|---|---|
| 601.6179 | 448.5051 | 321.7275 |
options(repr.plot.width=5, repr.plot.height=5)
plot(x, y, xlab='lstat', ylab='medv')
abline(lstat_linear, col='blue')
fx <- with(Boston, seq(min(lstat), max(lstat), length.out=2000))
lines(fx, predict(lstat_poly, data.frame(x=fx)), col='red')
lines(fx, predict(lstat_poly3, data.frame(x=fx)), col='green')
Multiple regression in R is very similar to linear regression: instead of listing one independent variable in the formula, several are included. But we are one step ahead because of the MARSplines - we can choose a linear or polynomial term for each variable based on the number of cuts in its MARSpline.
As explained below, we are just going to use this simple rule (and allow a little bit of leeway for data noise):
# We can use this function as needed to get the best polynomial degree (linear
# through cubic, judged by the F-statistic) for a fit of the data.
# If you don't know which degree to use in the lm, call it as
#   lm( y ~ poly(x, best_poly(x, y)) )
best_poly <- function(x, y) {
  f1 <- lm(y ~ poly(x, 1))
  f2 <- lm(y ~ poly(x, 2))
  f3 <- lm(y ~ poly(x, 3))
  fs1 <- summary(f1)$fstatistic[1]
  fs2 <- summary(f2)$fstatistic[1]
  fs3 <- summary(f3)$fstatistic[1]
  if (fs1 < fs2 & fs1 < fs3) { return(1) }
  if (fs2 < fs3) { return(2) }
  return(3)
}
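# Hypothetical usage (a sketch, not run in this post): get the suggested
# degree for every predictor against medv, e.g.
#   sapply(names(Boston)[-14], function(v) best_poly(Boston[[v]], Boston$medv))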
# However, in our case, we are going to use the polynomial indicated by the MARSplines.
# If the line is straight, we just use linear regression.
# If there is one cut, we use quadratic.
# For two or more cuts, use cubic.
#
#* **crim** : Cubic
#* **zn** : Linear
#* **indus** : Linear
#* **chas** : Linear
#* **nox** : Quadratic
#* **rm** : Cubic
#* **age** : Linear
#* **dis** : Quadratic
#* **rad** : Linear
#* **tax** : Quadratic
#* **ptratio**: Quadratic
#* **black** : Quadratic
#* **lstat** : Cubic
multiple_regression <- lm(
Boston$medv ~ poly(Boston$crim, 3) + Boston$zn + Boston$indus + Boston$chas +
poly(Boston$nox, 2) + poly(Boston$rm, 3) + Boston$age +
poly(Boston$dis, 2) + Boston$rad + poly(Boston$tax, 2) +
poly(Boston$ptratio, 2) + poly(Boston$black, 2) + poly(Boston$lstat, 3)
)
print(summary(multiple_regression))
##
## Call:
## lm(formula = Boston$medv ~ poly(Boston$crim, 3) + Boston$zn +
## Boston$indus + Boston$chas + poly(Boston$nox, 2) + poly(Boston$rm,
## 3) + Boston$age + poly(Boston$dis, 2) + Boston$rad + poly(Boston$tax,
## 2) + poly(Boston$ptratio, 2) + poly(Boston$black, 2) + poly(Boston$lstat,
## 3))
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.7540 -2.0311 -0.2444 1.7103 25.6665
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.422521 1.404546 13.116 < 2e-16 ***
## poly(Boston$crim, 3)1 -52.308286 8.103562 -6.455 2.65e-10 ***
## poly(Boston$crim, 3)2 25.545766 7.137653 3.579 0.000380 ***
## poly(Boston$crim, 3)3 -9.512244 5.329829 -1.785 0.074937 .
## Boston$zn -0.001848 0.013524 -0.137 0.891363
## Boston$indus 0.025798 0.052349 0.493 0.622372
## Boston$chas 2.406947 0.713986 3.371 0.000809 ***
## poly(Boston$nox, 2)1 -59.041496 11.694584 -5.049 6.33e-07 ***
## poly(Boston$nox, 2)2 -1.397927 6.444129 -0.217 0.828355
## poly(Boston$rm, 3)1 47.487030 6.185052 7.678 9.12e-14 ***
## poly(Boston$rm, 3)2 37.693569 4.634888 8.133 3.60e-15 ***
## poly(Boston$rm, 3)3 -1.592590 4.299603 -0.370 0.711245
## Boston$age -0.001492 0.012123 -0.123 0.902090
## poly(Boston$dis, 2)1 -62.642670 9.450528 -6.628 9.12e-11 ***
## poly(Boston$dis, 2)2 19.709455 5.384166 3.661 0.000279 ***
## Boston$rad 0.395816 0.078748 5.026 7.06e-07 ***
## poly(Boston$tax, 2)1 -40.123345 14.241724 -2.817 0.005042 **
## poly(Boston$tax, 2)2 6.548420 5.168503 1.267 0.205773
## poly(Boston$ptratio, 2)1 -39.784045 5.435262 -7.320 1.05e-12 ***
## poly(Boston$ptratio, 2)2 13.528320 5.033413 2.688 0.007444 **
## poly(Boston$black, 2)1 12.177944 4.575962 2.661 0.008045 **
## poly(Boston$black, 2)2 -6.987111 4.126244 -1.693 0.091039 .
## poly(Boston$lstat, 3)1 -95.230309 7.542806 -12.625 < 2e-16 ***
## poly(Boston$lstat, 3)2 34.984753 5.150305 6.793 3.26e-11 ***
## poly(Boston$lstat, 3)3 -11.851436 4.316768 -2.745 0.006269 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.857 on 481 degrees of freedom
## Multiple R-squared: 0.8325, Adjusted R-squared: 0.8242
## F-statistic: 99.62 on 24 and 481 DF, p-value: < 2.2e-16
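For context (a baseline comparison I'm adding, not in the original post), the adjusted R-squared of a plain multiple linear regression on all 13 predictors can be put next to the 0.8242 above:
# Baseline: ordinary multiple linear regression with no polynomial terms.
baseline <- lm(medv ~ ., data = Boston)
summary(baseline)$adj.r.squared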
I note that R-Bloggers lists other packages which may also be considered:
TODO - A review of these models.