Linear Regression I

Using scatterplots to look a candidate dependent variables

# String processing to extract variable labels
varNam <- scan(file = "CrimeVarNames.txt", what = "a", sep = "\n")
tmp <- strsplit(varNam, ":")
varLab <- unlist(tmp)[seq(1, 288, by = 2)]
n <- nchar(varLab)
varLab <- substring(varLab, 2, n)
# some labels are too long
varLab[1] <- "place"
varLab[2] <- "state"
varLab[3] <- "population"

# Reading and fixing the data
crimeDat <- read.csv(file = "crimeDat.csv", header = FALSE, na.string = "?")
colnames(crimeDat) <- varLab

# Addressing missing data count the number missing in each column and remove
# variables
myNA <- function(x) sum(is.na(x))
missDat <- apply(crimeDat, 2, myNA)
# Keep the Violent Crime column with 221 or less missing
crimeFix1 <- crimeDat[, missDat < 222]

# Find and remove cases with missing data
caseMiss <- apply(crimeFix1, 1, myNA)
crimeClean <- crimeFix1[caseMiss == 0, ]

## SCATTERPLOTS

subs <- c(106, 108, 110, 112, 114, 116, 118, 120, 121, 122)
violent <- c(106, 108, 110, 112, 121)
pairs(crimeClean[, violent], gap = 0, las = 1)

plot of chunk unnamed-chunk-1


subs <- c(114, 116, 118, 120, 122, 121)
pairs(crimeClean[, subs], gap = 0, las = 1)

plot of chunk unnamed-chunk-1

Exploratory 1D graphics

# Variable Selection
subs = c(3:84, 86:88, 90:104, 121)
crimeReg <- crimeClean[, subs]
library(car)
# source('myDensity.r')

layout(mat = matrix(1:4, ncol = 2))
oldpar <- par(mai = c(0.72, 0.68, 0.4, 0.2))
cexL <- 1.3
cexS <- 1
var <- crimeReg$ViolentCrimesPerPop
varLab1 <- "Violent Crimes"
varLab2 <- "In Communities"
varUnits <- "Rate per 100,000 People"

plot(c(0, 1), c(0, 1), axes = FALSE, type = "n", xlab = "", ylab = "", main = "")
usr <- par("usr")
rect(usr[1], usr[3], usr[2], usr[4], density = -1, col = rgb(0.95, 1, 0.95), 
    border = "black")
text(0.5, 0.7, varLab1, cex = cexL, font = 2)
text(0.5, 0.5, varLab2, cex = cexL, font = 2)
text(0.5, 0.2, varUnits, cex = cexL, font = 1)

qqPlot(var, las = 1, xlab = "Normal Quantiles", ylab = varUnits, main = "Q-Q Plot")

# myDensity(var, units=varUnits)

boxplot(var, horizontal = TRUE, col = rgb(0, 0.85, 1), tck = 0.05, mgp = c(2, 
    0, 0), main = "Box Plot", xlab = varUnits)

plot of chunk unnamed-chunk-2

Power transformations

lm1 <- lm(ViolentCrimesPerPop ~ ., data = crimeReg)
library(MASS)
boxcox(lm1)

plot of chunk unnamed-chunk-3

power <- boxcox(lm1, lambda = seq(-1, 1, 0.05), plot = FALSE)
sub <- which(power$y == max(power$y))
power$x[sub]

## [1] 0.2

layout(mat = matrix(1:4, ncol = 2))
oldpar <- par(mai = c(0.72, 0.68, 0.4, 0.2))
cexL <- 1.3
cexS <- 1
dep <- crimeReg$ViolentCrimesPerPop
var <- dep^0.2
varLab1 <- "Violent Crimes"
varLab2 <- "In Communities"
varUnits <- "(Rate per 100,000 People)**.2"

plot(c(0, 1), c(0, 1), axes = FALSE, type = "n", xlab = "", ylab = "", main = "")
usr <- par("usr")
rect(usr[1], usr[3], usr[2], usr[4], density = -1, col = rgb(0.95, 1, 0.95), 
    border = "black")
text(0.5, 0.7, varLab1, cex = cexL, font = 2)
text(0.5, 0.5, varLab2, cex = cexL, font = 2)
text(0.5, 0.2, varUnits, cex = cexL, font = 1)

qqPlot(var, las = 1, xlab = "Normal Quantiles", ylab = varUnits, main = "Q-Q Plot")

# myDensity(var, units=varUnits)

boxplot(var, horizontal = TRUE, col = rgb(0, 0.85, 1), tck = 0.05, mgp = c(2, 
    0, 0), main = "Box Plot", xlab = varUnits)

plot of chunk unnamed-chunk-3

Regression diagnostics

crimeReg2 <- crimeReg[, -c(11, 28, 50, 52, 90, 91)]
lm2 <- lm(ViolentCrimesPerPop ~ ., data = crimeReg2)
lmPower1 <- lm(ViolentCrimesPerPop^0.2 ~ ., data = crimeReg2)
plot(lmPower1)

plot of chunk unnamed-chunk-4

Stepwise Variable Removal

lmPower2 <- step(lmPower1)

plot(lmPower2)

plot of chunk unnamed-chunk-6

Looking at the univariate distributions

tmp <- lmPower2$terms
termLab <- attr(tmp, which = "term.labels")
termLabPlus <- c(termLab, colnames(crimeReg2)[95])
crimeReg3 <- crimeReg2[, termLabPlus]
library(lattice)
par(ask = TRUE)
for (i in 1:length(termLab)) {
    show(densityplot(crimeReg3[, termLab[i]], xlab = termLab[i]))
}

plot of chunk unnamed-chunk-7

Looking variables pairs and correlations

corMat <- cor(crimeReg3[, termLab])  # get correlations
x <- cmdscale(1 - abs(corMat), k = 1)  # seriation 
ord <- rev(order(x))
corMatOrd <- corMat[ord, ord]
# compute colors
tmp <- as.vector(corMatOrd)
low <- tmp < 0
lowBrk <- quantile(tmp[low], c(0, 0.25, 0.5, 1))
highBrk <- quantile(tmp[!low], c(0, 0.5, 0.75, 1))
brk <- c(-1.01, lowBrk[c(2, 3)], 0, highBrk[2:4])
colorSub <- cut(tmp, brk, label = FALSE)
mat <- matrix(c(118, 42, 131, 175, 141, 195, 231, 212, 232, 217, 240, 211, 127, 
    191, 123, 27, 120, 95), ncol = 3, byrow = TRUE)
colors = rgb(mat[, 1], mat[, 2], mat[, 3], max = 255)

nr <- nrow(corMatOrd)
x <- 1:nr - 0.5
cen <- expand.grid(list(y = rev(x), x = x))
par(mai = c(0.4, 0.5, 0.5, 0.2))
plot(c(-10, 43), c(0, 43), type = "n", axes = "FALSE", ylab = "", main = "Double ended scale:  Shades of purple negative, Shades of green positive")
mtext(side = 1, line = 0, "Two major variable groups. Seriation by 1D MDS, Pearson Distance: 1-abs(cor)")
# could use image()
rect(cen$x - 0.5, cen$y - 0.5, cen$x + 0.5, cen$y + 0.5, col = colors[colorSub], 
    border = rgb(0.2, 0.2, 0.2))
text(rep(-10.5, 43), y = rev(x), rownames(corMatOrd), adj = 0, cex = 0.75)

plot of chunk unnamed-chunk-8

*Bivariate outliers and scagnostics *

library(scagnostics)

## Loading required package: rJava

manyPairs <- scagnostics(crimeReg3)
write.csv(manyPairs, file = "manyPairs.csv", quote = FALSE)
manyPairs <- read.csv("manyPairs.csv", row.names = 1)

colnam <- colnames(crimeReg3)
plot(crimeReg3[, 1], crimeReg3[, 2], xlab = colnam[1], ylab = colnam[2])

plot of chunk unnamed-chunk-9