## A. Michael Bauer
## CVEN 5454: Quantitative Methods Homework 5
library(MASS)
library(BSDA)
## Loading required package: e1071
## Loading required package: class
## Loading required package: lattice
## Attaching package: 'BSDA'
## The following object is masked from 'package:datasets':
##
## Orange
library(e1071)
library(formatR)
library(akima)
library(knitr)
## Problem 1. 11-77
## 11-77 a)
kwdata = read.table("http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess5/Prob-11-77-data.txt")
x = kwdata[, 1]
y = kwdata[, 2]
plot(x, y, main = "kWh vs. kW", xlab = "kilowatt hours", ylab = "kilowatts")
## 11-77 b)
qqnorm(kwdata[, 1])
qqnorm(kwdata[, 2]) ### Data is normally distributed, with a few outliers
kw.lm = lm(y ~ x)
plot(x, predict(kw.lm), type = "l", ylim = range(y))
points(x, y)
## 11-77 c)
summary(kw.lm)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.326 -0.949 -0.218 1.338 2.967
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.881927 0.452124 -1.95 0.057 .
## x 0.003848 0.000348 11.05 8.8e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.56 on 48 degrees of freedom
## Multiple R-squared: 0.718, Adjusted R-squared: 0.712
## F-statistic: 122 on 1 and 48 DF, p-value: 8.81e-15
## Since p-value = 8.81e-15 << 0.05 = alpha, we reject the null hypothesis
## that Beta1 = 0, concluding that the slope is significantly different
## from zero.
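## As a quick cross-check (not part of the original output), a 95%
## confidence interval for the slope should exclude zero:
confint(kw.lm)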
## 11-77 d)
zz = lsfit(x, y)
x1 = residuals(zz)
yhat = y - zz$res
plot(yhat, x1, xlab = "Y Estimate", ylab = "Residuals")
## The residuals increase in spread with the fitted values, giving the
## scatterplot a quasi-funnel shape. These are random, independent
## samples, but we cannot assume equality of variance, and the residuals
## do not appear normally distributed. We can check whether the residuals
## are related to one another by using the ACF, below.
library(sm)
## Package `sm', version 2.2-4.1 Copyright (C) 1997, 2000, 2005, 2007, 2008,
## A.W.Bowman & A.Azzalini Type help(sm) for summary information
z1 = acf(x1)
## The funnel-shaped residual plot indicates heteroscedasticity, and the
## ACF lets us check the residuals for serial correlation as well,
## confirming the prior conclusions. If we want to test the fit further,
## we can explore the Coefficient of Determination, R^2:
SSR = sum((yhat - mean(y))^2)
SST = sum((y - mean(y))^2)
SSE = sum((zz$res)^2)
COD = SSR/SST
COD
## [1] 0.7177
# The model accounts for only about 72% of the variability in the data;
# this is not a very good model, as we saw in the plot of residuals
# versus y-hat.
## 11-77 e)
yy = log(y)
xx = log(x)
zz = lsfit(xx, yy)
x1 = residuals(zz)
yhat = yy - zz$res
plot(yhat, x1, xlab = "Transformed Y Estimate", ylab = "Residuals")
## This does not seem to have fully stabilized the variance pattern we saw
## in d), but it does produce a more random scatterplot, indicating that
## the transformation improves the fit.
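## To quantify the improvement (a quick check, using the log-log fit zz
## from above), compare the R^2 of the transformed fit with the 0.7177
## found in d):
1 - sum((zz$res)^2)/sum((yy - mean(yy))^2)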
## Problem 2. 11-78
## 11-78 a) For the simple linear regression model, we choose Beta-hat(0)
## and Beta-hat(1) such that the residuals add up to zero. This follows
## from the least-squares normal equations. Let Y(i) = b0 + b1*X(i) + e(i).
## To estimate the betas, we write the error as e(i) = Y(i) - b0 - b1*X(i)
## and minimize the error sum of squares:
## SSE = e(1)^2 + e(2)^2 + ... + e(n)^2
##     = (Y(1) - b0 - b1*X(1))^2 + (Y(2) - b0 - b1*X(2))^2 + ...
##       + (Y(n) - b0 - b1*X(n))^2
## To minimize with respect to b0 and b1, we differentiate with respect to
## both and set the results equal to zero. This gives us:
## (1) Derivative with respect to b0:
## 2(Y(1) - b0 - b1*X(1))(-1) + 2(Y(2) - b0 - b1*X(2))(-1) + ...
## + 2(Y(n) - b0 - b1*X(n))(-1) = 0
## Simplifying gives:
## (Y(1) - b0 - b1*X(1)) + (Y(2) - b0 - b1*X(2)) + ... + (Y(n) - b0 - b1*X(n)) = 0,
## where each term (Y(i) - b0 - b1*X(i)) = e(i).
## Therefore, e(1) + e(2) + ... + e(n) = 0.
## 11-78 b) Similarly to 11-78 a), differentiate with respect to b1, giving
## 2(Y(1) - b0 - b1*X(1))(-X(1)) + 2(Y(2) - b0 - b1*X(2))(-X(2)) + ...
## + 2(Y(n) - b0 - b1*X(n))(-X(n)) = 0
## Therefore, e(1)*X(1) + e(2)*X(2) + ... + e(n)*X(n) = 0.
## 11-78 c) Similarly to a) and b), the mean of the fitted values of y is
## equal to the sample mean, since yhat(i) = b0 + b1*X(i) and
## b0 = ybar - b1*xbar:
## (1/n)*Sigma(yhat(i)) = (1/n)*Sigma(b0 + b1*X(i))
##                      = (1/n)*Sigma(ybar - b1*xbar + b1*X(i))
##                      = ybar - b1*xbar + b1*xbar = ybar
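## A quick numerical check of identities a)-c) (a sketch on simulated
## data, since the identities hold for any least-squares fit):
set.seed(42)
xs = runif(20, 0, 10)
ys = 1 + 2 * xs + rnorm(20)
fit = lm(ys ~ xs)
sum(residuals(fit))             # a) residuals sum to ~0
sum(residuals(fit) * xs)        # b) x-weighted residuals sum to ~0
c(mean(fitted(fit)), mean(ys))  # c) mean of fitted values = mean of y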
## Problem 3. 11-86
## 11-86 a)
temp.data = read.csv("bauerstein/documents/school/2013 spring/numerical methods/R/Data_1186.csv")
## Warning: cannot open file 'bauerstein/documents/school/2013
## spring/numerical methods/R/Data_1186.csv': No such file or directory
## Error: cannot open the connection
x = temp.data[, 1]
## Error: object 'temp.data' not found
y = temp.data[, 2]
## Error: object 'temp.data' not found
plot(x, y, main = "IR temp vs. Thermocouple temp", xlab = "Thermocouple", ylab = "IR")
qqnorm(x)
qqnorm(y) ## QQ Plots show data is close to normally distributed, with a few outliers; this is expected for a small sample such as this.
## 11-86 b)
temp.lm = lm(y ~ x)
plot(x, predict(temp.lm), type = "l", ylim = range(y), main = "Regression of Thermocouple vs. IR",
xlab = "Thermocouple", ylab = "Predicted IR")
points(x, y)
## 11-86 c)
summary(temp.lm)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.326 -0.949 -0.218 1.338 2.967
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.881927 0.452124 -1.95 0.057 .
## x 0.003848 0.000348 11.05 8.8e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.56 on 48 degrees of freedom
## Multiple R-squared: 0.718, Adjusted R-squared: 0.712
## F-statistic: 122 on 1 and 48 DF, p-value: 8.81e-15
z2 = lsfit(x, y)
x2 = residuals(z2)
yhat = y - z2$res
SSR = sum((yhat - mean(y))^2)
SST = sum((y - mean(y))^2)
SSE = sum((z2$res)^2)
COD = SSR/SST
COD
## [1] 0.7177
# It appears that our regression model accounts for only about 72% of the
# variability in the data; this is not a very good model. However, since
# the p-value = 8.8e-15 < alpha = 0.05, we reject the null hypothesis
# that beta1 = 0. The manually computed coefficient of determination
# agrees with the Multiple R-squared from summary(), i.e. 0.718.
## 11-86 d)
c(mean(x), mean(y), var(x), var(y))
## [1] 1.133e+03 3.476e+00 4.103e+05 8.465e+00
anova(temp.lm)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x 1 298 297.7 122 8.8e-15 ***
## Residuals 48 117 2.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## H0: the two devices have the same location, mu(x) = mu(y); H1: they
## differ. Using the Wilcoxon rank-sum test, we reject H0 when the test
## statistic is more extreme than the critical value (the textbook gives
## w0.05 = 78 for the small-sample case nx = ny = 10); here R reports the
## p-value directly.
print(wilcox.test(x, y, alternative = "two.sided", paired = FALSE))
##
## Wilcoxon rank sum test with continuity correction
##
## data: x and y
## W = 2500, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## Since the p-value < 2.2e-16 << 0.05 = alpha, we reject the null
## hypothesis that the location for each device is the same. Given that
## their means and variances are so divergent, this makes intuitive
## sense.
## 11-86 e)
acf(x2)
## The ACF checks the residuals for serial correlation; together with the
## earlier funnel-shaped residual plot, the diagnostics indicate
## heteroscedasticity, which weakens the model.
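## A formal check of heteroscedasticity could use the Breusch-Pagan test
## (a sketch; assumes the lmtest package is installed, which was not part
## of the original analysis):
library(lmtest)
bptest(temp.lm)  # a small p-value rejects constant error variance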
par(mfrow = c(2, 2))
plot(temp.lm)
## In the first plot, we see that a slight pattern (except for outlier 10)
## exists in the residuals, so the linear model may not be the best one,
## but could be OK; however, as we can see from the last plot, there are
## three values that exert undue influence on the regression
## relationship. Let's try a more robust estimator to see if it can
## account for these outliers:
rlm(y ~ x)
## Call:
## rlm(formula = y ~ x)
## Converged in 4 iterations
##
## Coefficients:
## (Intercept) x
## -0.877449 0.003878
##
## Degrees of freedom: 50 total; 48 residual
## Scale estimate: 1.75
## Since the robust coefficients are nearly the same as the least-squares
## ones, the outliers do not drive the fit, and we can still reject the
## null hypothesis, as before.
par(mfrow = c(1, 1))
## Problem 4. 12-92
## 12-92 a) Fitting the Model: Coefficients
kr.data = read.csv("bauerstein/documents/school/2013 spring/numerical methods/R/Data_1292.csv")
## Warning: cannot open file 'bauerstein/documents/school/2013
## spring/numerical methods/R/Data_1292.csv': No such file or directory
## Error: cannot open the connection
y = kr.data[, 2]
## Error: object 'kr.data' not found
x1 = kr.data[, 1]
## Error: object 'kr.data' not found
x2 = (kr.data[, 1])^2
## Error: object 'kr.data' not found
xfull = cbind(kr.data[, 1], (kr.data[, 1])^2)
## Error: object 'kr.data' not found
kr.lmfull = lm(y ~ xfull)
## Error: object 'xfull' not found
kr.lmfull
## Error: object 'kr.lmfull' not found
## 12-92 b) Test for significance. H0: Beta1 = Beta2 = 0; H1: Betaj does
## not equal 0 for at least one j. Reject the null hypothesis if the
## p-value for the overall F-test is less than alpha.
summary(kr.lmfull)
## Error: object 'kr.lmfull' not found
anova(kr.lmfull)
## Error: object 'kr.lmfull' not found
## p-value = 0.0002 << 0.05 = alpha, therefore we reject the null
## hypothesis and conclude that y is linearly related to the regressors.
## 12-92 c) Contribution of the Quadratic Term to the Model. H0: Beta2
## (the quadratic coefficient) = 0; H1: Beta2 does not equal 0.
kr.lm1 = lm(y ~ x1)
print(anova(kr.lm1, kr.lmfull))
## Error: object 'kr.lmfull' not found
## P-value = 0.0001473 < 0.05 = alpha; therefore, we reject the null
## hypothesis and conclude that Beta2 is significantly different from 0.
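## Since the CSV failed to load above, here is a self-contained sketch of
## the same partial F-test on simulated quadratic data (illustrative
## values only):
set.seed(1)
xs = seq(0, 10, length.out = 30)
ys = 2 + 0.5 * xs + 0.3 * xs^2 + rnorm(30)
fit1 = lm(ys ~ xs)            # reduced model: linear term only
fit2 = lm(ys ~ xs + I(xs^2))  # full model: linear + quadratic
anova(fit1, fit2)             # small p-value => quadratic term contributes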
## Problem 5. 12-93
test = read.table("http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess5/prob12-93-data.txt")
jet.data = test[, 2:8]
y = as.matrix(jet.data[, 1])
x1 = as.matrix(jet.data[, 2])
x2 = as.matrix(jet.data[, 3])
x3 = as.matrix(jet.data[, 4])
x4 = as.matrix(jet.data[, 5])
x5 = as.matrix(jet.data[, 6])
xfull = cbind(x1, x2, x3, x4, x5)
N = length(y)
x = xfull
## 12-93 a) Select the Best Regression equation: minimum MS_e
library(leaps)
library(MPV)
## Attaching package: 'MPV'
## The following object is masked from 'package:MASS':
##
## cement
## The following object is masked from 'package:datasets':
##
## stackloss
combs = leaps(x, y, nbest = N)
combos = combs$which
ncombos = length(combos[, 1])
xpress = 1:ncombos
xmse = 1:ncombos
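## For each candidate subset, fit the model and record its PRESS
## statistic and mean squared error: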
for (i in 1:ncombos) {
zz = glm(y ~ x[, combos[i, ]])
xpress[i] = PRESS(zz)
xmse[i] = sum((zz$res)^2)/(N - length(zz$coef))
}
bestpress = order(xpress)[1]
bestmodelp = lsfit(x[, combos[bestpress, ]], y)
summary(bestmodelp)
## Length Class Mode
## coefficients 5 -none- numeric
## residuals 40 -none- numeric
## intercept 1 -none- logical
## qr 6 qr list
combos[bestpress, ]
## 1 2 3 4 5
## TRUE FALSE TRUE TRUE TRUE
## From these results, we can see that the best model includes x1, x3, x4
## and x5. As such, the fitted equation would be y-hat = 7163.3743 +
## 1.3578*x1 - 0.3215*x3 + 9.6104*x4 + 1.2859*x5
## 12-93 b) Selection of a subset regression model using step()
x = as.data.frame(xfull)
zm = glm(y ~ ., data = x)
zms = step(zm)
## Start: AIC=415.4
## y ~ V1 + V2 + V3 + V4 + V5
##
## Df Deviance AIC
## - V2 1 54686 414
## <none> 53377 415
## - V4 1 62331 420
## - V5 1 62974 420
## - V1 1 68910 424
## - V3 1 69650 424
##
## Step: AIC=414.3
## y ~ V1 + V3 + V4 + V5
##
## Df Deviance AIC
## <none> 54686 414
## - V5 1 64476 419
## - V4 1 65242 419
## - V3 1 69953 422
## - V1 1 71658 423
## Using stepwise selection, we see that the best model uses x1, x3, x4
## and x5, as before, giving the lowest AIC = 414.3.
## 12-93 c) Multicollinearity
xp = cbind(x1, x3, x4, x5)
z2 = lsfit(xp, y)
x2 = residuals(z2)
yhat = y - z2$res
SSR = sum((yhat - mean(y))^2)
SST = sum((y - mean(y))^2)
R2p = SSR/SST
VIFp = 1/(1 - R2p)
VIFp
## [1] 181.6
z3 = lsfit(x, y)
x3 = residuals(z3)
yhat = y - z3$res
SSR = sum((yhat - mean(y))^2)
SST = sum((y - mean(y))^2)
R2 = SSR/SST
VIF = 1/(1 - R2)
VIF
## [1] 186.1
## From the above calculations, we can see that the preferred model has a
## lower VIF, indicating less multicollinearity; however, both values are
## far above the common rule-of-thumb cutoff of 10, indicating severe
## multicollinearity in each model.
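## Note that the conventional VIF is computed per predictor, by
## regressing each regressor on the remaining ones; a sketch of that
## cross-check (not in the original analysis):
vifj = sapply(seq_len(ncol(xp)), function(j) {
    r2j = summary(lm(xp[, j] ~ xp[, -j]))$r.squared
    1/(1 - r2j)
})
vifj  # VIF_j > 10 flags predictor j as involved in multicollinearity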
## 12-93 d) ANOVA and Cook's Distance
zz = lm(y ~ xp)
anova(zz)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## xp 4 9877350 2469338 1580 <2e-16 ***
## Residuals 35 54686 1562
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
hatm = hatvalues(zz)  # leverages h_i, returned as a vector
bestmodel = lsfit(xp, y)
SSE = sum((bestmodel$res)^2)
SSR = sum((y - bestmodel$res - mean(y))^2)
MSR = SSR/ncol(xp)
MSE = SSE/(N - length(bestmodel$coef))
ri = bestmodel$res/sqrt((1 - hatm) * MSE)  # studentized residuals
Di = ri * ri * hatm/((1 - hatm) * length(bestmodel$coef))  # Cook's distance
## These give us values for Cook's Distance, none of which approach 1.
## Alternatively, we can plot Cook's distance against leverage:
par(mfrow = c(2, 2))
plot(zz)
par(mfrow = c(1, 1))
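## Cross-check the manual Di against R's built-in cooks.distance()
## (assuming the corrected formula above, the two should agree up to
## rounding):
max(abs(Di - cooks.distance(zz)))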
## Problem 6. Helsel and Hirsch 12.4
## 6 a) Correlation Coefficient
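## mblm() fits a median-based linear model (the Kendall-Theil robust
## line): with repeated = TRUE it uses Siegel's repeated medians,
## otherwise the Theil-Sen median of all pairwise slopes. It is
## reproduced here so the fit does not require the mblm package.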
mblm <- function(formula, dataframe, repeated = TRUE) {
if (missing(dataframe))
dataframe <- environment(formula)
term <- as.character(attr(terms(formula), "variables")[-1])
x = dataframe[[term[2]]]
y = dataframe[[term[1]]]
if (length(term) > 2) {
stop("Only linear models are accepted")
}
xx = sort(x)
yy = y[order(x)]
n = length(xx)
slopes = c()
intercepts = c()
smedians = c()
imedians = c()
if (repeated) {
for (i in 1:n) {
slopes = c()
intercepts = c()
for (j in 1:n) {
if (xx[j] != xx[i]) {
slopes = c(slopes, (yy[j] - yy[i])/(xx[j] - xx[i]))
intercepts = c(intercepts, (xx[j] * yy[i] - xx[i] * yy[j])/(xx[j] -
xx[i]))
}
}
smedians = c(smedians, median(slopes))
imedians = c(imedians, median(intercepts))
}
slope = median(smedians)
intercept = median(imedians)
} else {
for (i in 1:(n - 1)) {
for (j in i:n) {
if (xx[j] != xx[i]) {
slopes = c(slopes, (yy[j] - yy[i])/(xx[j] - xx[i]))
}
}
}
slope = median(slopes)
intercepts = yy - slope * xx
intercept = median(intercepts)
}
res = list()
res$coefficients = c(intercept, slope)
names(res$coefficients) = c("(Intercept)", term[2])
res$residuals = y - slope * x - intercept
names(res$residuals) = as.character(1:length(res$residuals))
res$fitted.values = x * slope + intercept
names(res$fitted.values) = as.character(1:length(res$fitted.values))
if (repeated) {
res$slopes = smedians
res$intercepts = imedians
} else {
res$slopes = slopes
res$intercepts = intercepts
}
res$df.residual = n - 2
res$rank = 2
res$terms = terms(formula)
res$call = match.call()
res$model = data.frame(y, x)
res$assign = c(0, 1)
if (missing(dataframe)) {
res$effects = lm(formula)$effects
res$qr = lm(formula)$qr
} else {
res$effects = lm(formula, dataframe)$effects
res$qr = lm(formula, dataframe)$qr
}
res$effects[2] = sqrt(sum((res$fitted - mean(res$fitted))^2))
res$xlevels = list()
names(res$model) = term
attr(res$model, "terms") = terms(formula)
class(res) = c("mblm", "lm")
res
}
c.data = read.csv("bauerstein/documents/school/2013 spring/numerical methods/R/Data_HH12.4.csv")
## Warning: cannot open file 'bauerstein/documents/school/2013
## spring/numerical methods/R/Data_HH12.4.csv': No such file or directory
## Error: cannot open the connection
carbon = c.data$Carbon.14
## Error: object 'c.data' not found
calcium = c.data$Calcium
## Error: object 'c.data' not found
print(cor(carbon, calcium))
## Error: object 'calcium' not found
## 6 b) Kendall's tau
print(cor.test(carbon, calcium, method = "kendall"))
## Error: object 'carbon' not found
## 6 c) Spearman's Rank
print(cor.test(carbon, calcium, method = "spearman", conf.level = 0.05, two.sided = TRUE))
## Error: object 'carbon' not found
## 6 d)
X = carbon
## Error: object 'carbon' not found
Y = calcium
## Error: object 'calcium' not found
zkt = mblm(Y ~ X)
## Warning: is.na() applied to non-(list or vector) of type 'NULL'
## Error: argument 1 is not a vector
zlin = lsfit(X, Y)
## Error: object 'X' not found
plot(X, Y, xlab = "% Carbon-14", ylab = "% Calcium")
## Error: object 'X' not found
abline(zlin, col = "red")
## Error: object 'zlin' not found
abline(zkt, col = "blue")
## Error: object 'zkt' not found
legend("topright", col = c("red", "blue"), legend = c("Linear Fit", "Kendall-Theile Robust"),
lty = 1, lwd = 1)
## Summary of Data
carbon.lm = lm(calcium ~ carbon, data = c.data)
## Error: object 'c.data' not found
print(summary(carbon.lm))
## Error: object 'carbon.lm' not found
## 6 e)
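## minf() estimates mutual information from kernel density estimates:
## T = Sigma f(x,y) * log2( f(x,y) / (f(x)*f(y)) ), with the joint and
## marginal densities evaluated at the sample points via sm.density().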
minf = function(x, y) {
library(sm)
air3 = cbind(x, y)
result.12 = sm.density(air3, eval.points = air3, display = "none")
result.1 = sm.density(x, eval.points = x, display = "none")
result.2 = sm.density(y, eval.points = y, display = "none")
tobs = sum(result.12$estimate * log2(result.12$estimate/(result.1$estimate *
result.2$estimate)))
tobs
}
library(sm)
library(mvtnorm)
library(akima)
source("http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess3/MI/minf.r")
source("http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess3/MI/mi_confs.r")
source("http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess3/MI/minf_normal.r")
source("http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess3/MI/mi_normal_confs.r")
library(rpanel)
## Loading required package: tcltk
## Package `rpanel', version 1.1-2: type help(rpanel) for summary information
test = read.table("http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess5/prob-12-4-data.txt")
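## The file stores the sample as two side-by-side (x, y) column pairs;
## stack them into a single two-column sample: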
test2 = test[, 1:2]
test3 = test[, 3:4]
colnames(test2) = colnames(test3)
test = rbind(test2, test3)
x = test[, 1]
y = test[, 2]
minf(x, y)
## Loading required package: rgl
## [1] 0.001145
minfconf(x, y)
## [1] 0.001145
## 5% 10%
## "MI range" "4.15781667928057e-05" "0.000144909300351482"
## 50% 90% 95%
## "0.000574415321691489" "0.00140573710599974" "0.00181803216578894"
## 5% 10%
## "Cor range " "-0.43134805589595" "-0.36879462617211"
## 50% 90% 95%
## "0.000921907518499569" "0.353608116563331" "0.458908505972696"
## Error: could not find function "myboxplot"
minfnormal(x, y)
## [1] 0.001333
minfnormalconf(x, y)
## [1] 0.001333
## Error: could not find function "myboxplot"
## Problem 7. Helsel & Herschel 15.1
XandY = read.table("http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess5/prob-15-1.txt")
y = XandY[, 5]
x = as.matrix(XandY[, 1:4]) ## this is the second best combination, AIC = 59.70439
logistic = glm(y ~ x, family = "binomial")
print(summary(logistic))
##
## Call:
## glm(formula = y ~ x, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0945 -0.3349 -0.0857 -0.0462 2.1324
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.2054 3.5581 -3.71 0.00021 ***
## xV1 0.5153 0.1509 3.41 0.00064 ***
## xV2 0.4291 0.2749 1.56 0.11851
## xV3 0.0303 0.3246 0.09 0.92551
## xV4 1.0895 0.2986 3.65 0.00026 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 91.474 on 123 degrees of freedom
## Residual deviance: 49.704 on 119 degrees of freedom
## AIC: 59.7
##
## Number of Fisher Scoring iterations: 7
print(logistic$aic)
## [1] 59.7
x = as.matrix(XandY[, 1:3])
logistic = glm(y ~ x, family = "binomial")
print(summary(logistic))
##
## Call:
## glm(formula = y ~ x, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.198 -0.513 -0.303 -0.209 2.441
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.0840 1.5826 -3.21 0.0013 **
## xV1 0.3304 0.1020 3.24 0.0012 **
## xV2 0.3759 0.1931 1.95 0.0516 .
## xV3 -0.0326 0.2680 -0.12 0.9032
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 91.474 on 123 degrees of freedom
## Residual deviance: 74.755 on 120 degrees of freedom
## AIC: 82.75
##
## Number of Fisher Scoring iterations: 6
print(logistic$aic)
## [1] 82.75
x = as.matrix(XandY[, 1:2])
logistic = glm(y ~ x, family = "binomial")
print(summary(logistic))
##
## Call:
## glm(formula = y ~ x, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.180 -0.512 -0.304 -0.212 2.452
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.217 1.150 -4.54 5.7e-06 ***
## xV1 0.329 0.101 3.25 0.0011 **
## xV2 0.377 0.193 1.95 0.0510 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 91.474 on 123 degrees of freedom
## Residual deviance: 74.769 on 121 degrees of freedom
## AIC: 80.77
##
## Number of Fisher Scoring iterations: 6
print(logistic$aic)
## [1] 80.77
x = as.matrix(XandY[, 2:3])
logistic = glm(y ~ x, family = "binomial")
print(summary(logistic))
##
## Call:
## glm(formula = y ~ x, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.686 -0.570 -0.447 -0.332 2.210
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.4265 1.3936 -2.46 0.014 *
## xV2 0.3090 0.1811 1.71 0.088 .
## xV3 0.0491 0.2579 0.19 0.849
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 91.474 on 123 degrees of freedom
## Residual deviance: 88.231 on 121 degrees of freedom
## AIC: 94.23
##
## Number of Fisher Scoring iterations: 5
print(logistic$aic)
## [1] 94.23
x = as.matrix(XandY[, 2:4])
logistic = glm(y ~ x, family = "binomial")
print(summary(logistic))
##
## Call:
## glm(formula = y ~ x, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.465 -0.412 -0.276 -0.173 2.174
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.658 2.124 -3.61 0.00031 ***
## xV2 0.247 0.223 1.11 0.26814
## xV3 0.109 0.293 0.37 0.70848
## xV4 0.723 0.187 3.87 0.00011 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 91.474 on 123 degrees of freedom
## Residual deviance: 69.162 on 120 degrees of freedom
## AIC: 77.16
##
## Number of Fisher Scoring iterations: 6
print(logistic$aic)
## [1] 77.16
x = as.matrix(XandY[, 3:4])
logistic = glm(y ~ x, family = "binomial")
print(summary(logistic))
##
## Call:
## glm(formula = y ~ x, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.373 -0.387 -0.268 -0.185 2.293
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.6831 1.8048 -3.70 0.00021 ***
## xV3 0.0747 0.2859 0.26 0.79377
## xV4 0.7509 0.1862 4.03 5.5e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 91.474 on 123 degrees of freedom
## Residual deviance: 70.471 on 121 degrees of freedom
## AIC: 76.47
##
## Number of Fisher Scoring iterations: 6
print(logistic$aic)
## [1] 76.47
x = as.matrix(XandY[, c(1, 2, 4)]) ## this is the best combination AIC = 57.71315
logistic = glm(y ~ x, family = "binomial")
print(summary(logistic))
##
## Call:
## glm(formula = y ~ x, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0808 -0.3364 -0.0846 -0.0478 2.1443
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.046 3.098 -4.21 2.5e-05 ***
## xV1 0.516 0.151 3.42 0.00062 ***
## xV2 0.426 0.271 1.57 0.11654
## xV4 1.086 0.295 3.68 0.00023 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 91.474 on 123 degrees of freedom
## Residual deviance: 49.713 on 120 degrees of freedom
## AIC: 57.71
##
## Number of Fisher Scoring iterations: 7
print(logistic$aic)
## [1] 57.71
x = as.matrix(XandY[, c(1, 3, 4)])
logistic = glm(y ~ x, family = "binomial")
print(summary(logistic))
##
## Call:
## glm(formula = y ~ x, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0654 -0.2861 -0.1124 -0.0656 2.0601
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.8297 2.8156 -3.85 0.00012 ***
## xV1 0.4639 0.1360 3.41 0.00065 ***
## xV3 -0.0133 0.3117 -0.04 0.96604
## xV4 1.0728 0.2840 3.78 0.00016 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 91.474 on 123 degrees of freedom
## Residual deviance: 52.543 on 120 degrees of freedom
## AIC: 60.54
##
## Number of Fisher Scoring iterations: 7
print(logistic$aic)
## [1] 60.54
x = as.matrix(XandY[, 1])
logistic = glm(y ~ x, family = "binomial")
print(summary(logistic))
##
## Call:
## glm(formula = y ~ x, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.853 -0.565 -0.290 -0.231 2.699
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.6151 0.6998 -5.17 2.4e-07 ***
## x 0.3101 0.0988 3.14 0.0017 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 91.474 on 123 degrees of freedom
## Residual deviance: 79.092 on 122 degrees of freedom
## AIC: 83.09
##
## Number of Fisher Scoring iterations: 6
print(logistic$aic)
## [1] 83.09
x = as.matrix(XandY[, 2])
logistic = glm(y ~ x, family = "binomial")
print(summary(logistic))
##
## Call:
## glm(formula = y ~ x, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.673 -0.585 -0.439 -0.326 2.186
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.214 0.825 -3.90 9.8e-05 ***
## x 0.307 0.180 1.71 0.088 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 91.474 on 123 degrees of freedom
## Residual deviance: 88.268 on 122 degrees of freedom
## AIC: 92.27
##
## Number of Fisher Scoring iterations: 5
print(logistic$aic)
## [1] 92.27
x = as.matrix(XandY[, 3])
logistic = glm(y ~ x, family = "binomial")
print(summary(logistic))
##
## Call:
## glm(formula = y ~ x, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.517 -0.517 -0.517 -0.495 2.115
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.1694 1.1002 -1.97 0.049 *
## x 0.0446 0.2540 0.18 0.861
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 91.474 on 123 degrees of freedom
## Residual deviance: 91.443 on 122 degrees of freedom
## AIC: 95.44
##
## Number of Fisher Scoring iterations: 4
print(logistic$aic)
## [1] 95.44
x = as.matrix(XandY[, 4])
logistic = glm(y ~ x, family = "binomial")
print(summary(logistic))
##
## Call:
## glm(formula = y ~ x, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.343 -0.377 -0.262 -0.181 2.315
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.347 1.239 -5.12 3.0e-07 ***
## x 0.747 0.185 4.04 5.3e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 91.474 on 123 degrees of freedom
## Residual deviance: 70.540 on 122 degrees of freedom
## AIC: 74.54
##
## Number of Fisher Scoring iterations: 6
print(logistic$aic)
## [1] 74.54
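## The exhaustive search above can be automated with step(), as in
## Problem 5 (a sketch; it should recover the best subset found manually,
## V1 + V2 + V4 with AIC = 57.71):
dat = as.data.frame(XandY[, 1:4])
dat$y = XandY[, 5]
full = glm(y ~ ., data = dat, family = "binomial")
step(full)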