Using scatterplots to look a candidate dependent variables
# String processing to extract variable labels
varNam <- scan(file = "CrimeVarNames.txt", what = "a", sep = "\n")
tmp <- strsplit(varNam, ":")
varLab <- unlist(tmp)[seq(1, 288, by = 2)]
n <- nchar(varLab)
varLab <- substring(varLab, 2, n)
# some labels are too long
varLab[1] <- "place"
varLab[2] <- "state"
varLab[3] <- "population"
# Reading and fixing the data
crimeDat <- read.csv(file = "crimeDat.csv", header = FALSE, na.string = "?")
colnames(crimeDat) <- varLab
# Addressing missing data count the number missing in each column and remove
# variables
myNA <- function(x) sum(is.na(x))
missDat <- apply(crimeDat, 2, myNA)
# Keep the Violent Crime column with 221 or less missing
crimeFix1 <- crimeDat[, missDat < 222]
# Find and remove cases with missing data
caseMiss <- apply(crimeFix1, 1, myNA)
crimeClean <- crimeFix1[caseMiss == 0, ]
## SCATTERPLOTS
subs <- c(106, 108, 110, 112, 114, 116, 118, 120, 121, 122)
violent <- c(106, 108, 110, 112, 121)
pairs(crimeClean[, violent], gap = 0, las = 1)
subs <- c(114, 116, 118, 120, 122, 121)
pairs(crimeClean[, subs], gap = 0, las = 1)
Exploratory 1D graphics
# Variable Selection
subs = c(3:84, 86:88, 90:104, 121)
crimeReg <- crimeClean[, subs]
library(car)
# source('myDensity.r')
layout(mat = matrix(1:4, ncol = 2))
oldpar <- par(mai = c(0.72, 0.68, 0.4, 0.2))
cexL <- 1.3
cexS <- 1
var <- crimeReg$ViolentCrimesPerPop
varLab1 <- "Violent Crimes"
varLab2 <- "In Communities"
varUnits <- "Rate per 100,000 People"
plot(c(0, 1), c(0, 1), axes = FALSE, type = "n", xlab = "", ylab = "", main = "")
usr <- par("usr")
rect(usr[1], usr[3], usr[2], usr[4], density = -1, col = rgb(0.95, 1, 0.95),
border = "black")
text(0.5, 0.7, varLab1, cex = cexL, font = 2)
text(0.5, 0.5, varLab2, cex = cexL, font = 2)
text(0.5, 0.2, varUnits, cex = cexL, font = 1)
qqPlot(var, las = 1, xlab = "Normal Quantiles", ylab = varUnits, main = "Q-Q Plot")
# myDensity(var, units=varUnits)
boxplot(var, horizontal = TRUE, col = rgb(0, 0.85, 1), tck = 0.05, mgp = c(2,
0, 0), main = "Box Plot", xlab = varUnits)
Power transformations
lm1 <- lm(ViolentCrimesPerPop ~ ., data = crimeReg)
library(MASS)
boxcox(lm1)
power <- boxcox(lm1, lambda = seq(-1, 1, 0.05), plot = FALSE)
sub <- which(power$y == max(power$y))
power$x[sub]
## [1] 0.2
layout(mat = matrix(1:4, ncol = 2))
oldpar <- par(mai = c(0.72, 0.68, 0.4, 0.2))
cexL <- 1.3
cexS <- 1
dep <- crimeReg$ViolentCrimesPerPop
var <- dep^0.2
varLab1 <- "Violent Crimes"
varLab2 <- "In Communities"
varUnits <- "(Rate per 100,000 People)**.2"
plot(c(0, 1), c(0, 1), axes = FALSE, type = "n", xlab = "", ylab = "", main = "")
usr <- par("usr")
rect(usr[1], usr[3], usr[2], usr[4], density = -1, col = rgb(0.95, 1, 0.95),
border = "black")
text(0.5, 0.7, varLab1, cex = cexL, font = 2)
text(0.5, 0.5, varLab2, cex = cexL, font = 2)
text(0.5, 0.2, varUnits, cex = cexL, font = 1)
qqPlot(var, las = 1, xlab = "Normal Quantiles", ylab = varUnits, main = "Q-Q Plot")
# myDensity(var, units=varUnits)
boxplot(var, horizontal = TRUE, col = rgb(0, 0.85, 1), tck = 0.05, mgp = c(2,
0, 0), main = "Box Plot", xlab = varUnits)
Regression diagnostics
crimeReg2 <- crimeReg[, -c(11, 28, 50, 52, 90, 91)]
lm2 <- lm(ViolentCrimesPerPop ~ ., data = crimeReg2)
lmPower1 <- lm(ViolentCrimesPerPop^0.2 ~ ., data = crimeReg2)
plot(lmPower1)
Stepwise Variable Removal
lmPower2 <- step(lmPower1)
plot(lmPower2)
Looking at the univariate distributions
tmp <- lmPower2$terms
termLab <- attr(tmp, which = "term.labels")
termLabPlus <- c(termLab, colnames(crimeReg2)[95])
crimeReg3 <- crimeReg2[, termLabPlus]
library(lattice)
par(ask = TRUE)
for (i in 1:length(termLab)) {
show(densityplot(crimeReg3[, termLab[i]], xlab = termLab[i]))
}
Looking variables pairs and correlations
corMat <- cor(crimeReg3[, termLab]) # get correlations
x <- cmdscale(1 - abs(corMat), k = 1) # seriation
ord <- rev(order(x))
corMatOrd <- corMat[ord, ord]
# compute colors
tmp <- as.vector(corMatOrd)
low <- tmp < 0
lowBrk <- quantile(tmp[low], c(0, 0.25, 0.5, 1))
highBrk <- quantile(tmp[!low], c(0, 0.5, 0.75, 1))
brk <- c(-1.01, lowBrk[c(2, 3)], 0, highBrk[2:4])
colorSub <- cut(tmp, brk, label = FALSE)
mat <- matrix(c(118, 42, 131, 175, 141, 195, 231, 212, 232, 217, 240, 211, 127,
191, 123, 27, 120, 95), ncol = 3, byrow = TRUE)
colors = rgb(mat[, 1], mat[, 2], mat[, 3], max = 255)
nr <- nrow(corMatOrd)
x <- 1:nr - 0.5
cen <- expand.grid(list(y = rev(x), x = x))
par(mai = c(0.4, 0.5, 0.5, 0.2))
plot(c(-10, 43), c(0, 43), type = "n", axes = "FALSE", ylab = "", main = "Double ended scale: Shades of purple negative, Shades of green positive")
mtext(side = 1, line = 0, "Two major variable groups. Seriation by 1D MDS, Pearson Distance: 1-abs(cor)")
# could use image()
rect(cen$x - 0.5, cen$y - 0.5, cen$x + 0.5, cen$y + 0.5, col = colors[colorSub],
border = rgb(0.2, 0.2, 0.2))
text(rep(-10.5, 43), y = rev(x), rownames(corMatOrd), adj = 0, cex = 0.75)
*Bivariate outliers and scagnostics *
library(scagnostics)
## Loading required package: rJava
manyPairs <- scagnostics(crimeReg3)
write.csv(manyPairs, file = "manyPairs.csv", quote = FALSE)
manyPairs <- read.csv("manyPairs.csv", row.names = 1)
colnam <- colnames(crimeReg3)
plot(crimeReg3[, 1], crimeReg3[, 2], xlab = colnam[1], ylab = colnam[2])