library(ISLR)
data(Carseats)
str(Carseats)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : num 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : num 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : num 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : num 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : num 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
sales.fit = lm(Sales~Advertising+ShelveLoc, data=Carseats) #regress Sales on Advertising and the three-level ShelveLoc factor
summary(sales.fit)
##
## Call:
## lm(formula = Sales ~ Advertising + ShelveLoc, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.6480 -1.6198 -0.0476 1.5308 6.4098
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.89662 0.25207 19.426 < 2e-16 ***
## Advertising 0.10071 0.01692 5.951 5.88e-09 ***
## ShelveLocGood 4.57686 0.33479 13.671 < 2e-16 ***
## ShelveLocMedium 1.75142 0.27475 6.375 5.11e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.244 on 396 degrees of freedom
## Multiple R-squared: 0.3733, Adjusted R-squared: 0.3685
## F-statistic: 78.62 on 3 and 396 DF, p-value: < 2.2e-16
contrasts(Carseats$ShelveLoc)
## Good Medium
## Bad 0 0
## Good 1 0
## Medium 0 1
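#Optional sanity check (illustrative values, not part of the data): predict expected
#sales for a hypothetical store with Advertising of 10 and a "Good" shelf location.
predict(sales.fit, newdata=data.frame(Advertising=10, ShelveLoc="Good"), interval="confidence")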
library(MASS)
data(Boston)
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
value.fit = lm(medv~lstat*age, data=Boston) #lstat*age expands to lstat + age + lstat:age
summary(value.fit)
##
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.806 -4.045 -1.333 2.085 27.552
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.0885359 1.4698355 24.553 < 2e-16 ***
## lstat -1.3921168 0.1674555 -8.313 8.78e-16 ***
## age -0.0007209 0.0198792 -0.036 0.9711
## lstat:age 0.0041560 0.0018518 2.244 0.0252 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared: 0.5557, Adjusted R-squared: 0.5531
## F-statistic: 209.3 on 3 and 502 DF, p-value: < 2.2e-16
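#Optional follow-up: compare the interaction model against a purely additive fit
#with a partial F-test; the name value.main is introduced here only for illustration.
value.main = lm(medv~lstat+age, data=Boston)
anova(value.main, value.fit)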
library(MASS)
data(biopsy)
str(biopsy)
## 'data.frame': 699 obs. of 11 variables:
## $ ID : chr "1000025" "1002945" "1015425" "1016277" ...
## $ V1 : int 5 5 3 6 4 8 1 2 2 4 ...
## $ V2 : int 1 4 1 8 1 10 1 1 1 2 ...
## $ V3 : int 1 4 1 8 1 10 1 2 1 1 ...
## $ V4 : int 1 5 1 1 3 8 1 1 1 1 ...
## $ V5 : int 2 7 2 3 2 7 2 2 2 2 ...
## $ V6 : int 1 10 2 4 1 10 10 1 1 1 ...
## $ V7 : int 3 3 3 3 3 9 3 3 1 2 ...
## $ V8 : int 1 2 1 7 1 7 1 1 1 1 ...
## $ V9 : int 1 1 1 1 1 1 1 1 5 1 ...
## $ class: Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
biopsy$ID = NULL #drop the ID column; it carries no predictive information
names(biopsy) = c("thick", "u.size", "u.shape", "adhsn", "s.size", "nucl", "chrom", "n.nuc", "mit", "class") #give the nine features descriptive names
names(biopsy)
## [1] "thick" "u.size" "u.shape" "adhsn" "s.size" "nucl" "chrom"
## [8] "n.nuc" "mit" "class"
biopsy.v2 = na.omit(biopsy) #remove the 16 observations with missing values
library(reshape2)
library(ggplot2)
biop.m = melt(biopsy.v2, id.vars="class") #reshape to long format for faceted boxplots
ggplot(data=biop.m, aes(x=class, y=value)) + geom_boxplot() + facet_wrap(~variable, ncol=3)

library(corrplot)
bc = cor(biopsy.v2[ ,1:9]) #correlation matrix of the nine features
corrplot.mixed(bc)
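#Optional: list the feature pairs whose absolute correlation exceeds 0.8
#(the 0.8 cutoff is arbitrary, chosen only for illustration).
which(abs(bc) > 0.8 & upper.tri(bc), arr.ind=TRUE)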

#random splitting
set.seed(123) #make the random split reproducible
ind = sample(2, nrow(biopsy.v2), replace=TRUE, prob=c(0.7, 0.3)) #assign each row to group 1 (train) or 2 (test) with roughly 70/30 odds
train = biopsy.v2[ind==1,]
test = biopsy.v2[ind==2,]
str(test)
## 'data.frame': 209 obs. of 10 variables:
## $ thick : int 5 6 4 2 1 7 6 7 1 3 ...
## $ u.size : int 4 8 1 1 1 4 1 3 1 2 ...
## $ u.shape: int 4 8 1 2 1 6 1 2 1 1 ...
## $ adhsn : int 5 1 3 1 1 4 1 10 1 1 ...
## $ s.size : int 7 3 2 2 1 6 2 5 2 1 ...
## $ nucl : int 10 4 1 1 1 1 1 10 1 1 ...
## $ chrom : int 3 3 3 3 3 4 3 5 3 2 ...
## $ n.nuc : int 2 7 1 1 1 3 1 4 1 1 ...
## $ mit : int 1 1 1 1 1 1 1 4 1 1 ...
## $ class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 2 1 1 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
## .. ..- attr(*, "names")= chr [1:16] "24" "41" "140" "146" ...
table(train$class)
##
## benign malignant
## 302 172
table(test$class)
##
## benign malignant
## 142 67
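#Quick check that the realized split is close to the 70/30 target; sample()
#only approximates the requested proportions.
prop.table(table(ind))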
full.fit = glm(class~., family=binomial, data=train) #logistic regression on all nine features
summary(full.fit)
##
## Call:
## glm(formula = class ~ ., family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3397 -0.1387 -0.0716 0.0321 2.3559
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.4293 1.2273 -7.683 1.55e-14 ***
## thick 0.5252 0.1601 3.280 0.001039 **
## u.size -0.1045 0.2446 -0.427 0.669165
## u.shape 0.2798 0.2526 1.108 0.268044
## adhsn 0.3086 0.1738 1.776 0.075722 .
## s.size 0.2866 0.2074 1.382 0.167021
## nucl 0.4057 0.1213 3.344 0.000826 ***
## chrom 0.2737 0.2174 1.259 0.208006
## n.nuc 0.2244 0.1373 1.635 0.102126
## mit 0.4296 0.3393 1.266 0.205402
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 620.989 on 473 degrees of freedom
## Residual deviance: 78.373 on 464 degrees of freedom
## AIC: 98.373
##
## Number of Fisher Scoring iterations: 8
confint(full.fit)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -12.23786660 -7.3421509
## thick 0.23250518 0.8712407
## u.size -0.56108960 0.4212527
## u.shape -0.24551513 0.7725505
## adhsn -0.02257952 0.6760586
## s.size -0.11769714 0.7024139
## nucl 0.17687420 0.6582354
## chrom -0.13992177 0.7232904
## n.nuc -0.03813490 0.5110293
## mit -0.14099177 1.0142786
exp(coef(full.fit))
## (Intercept) thick u.size u.shape adhsn
## 8.033466e-05 1.690879e+00 9.007478e-01 1.322844e+00 1.361533e+00
## s.size nucl chrom n.nuc mit
## 1.331940e+00 1.500309e+00 1.314783e+00 1.251551e+00 1.536709e+00
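#Profile-likelihood confidence intervals on the odds-ratio scale; intervals that
#exclude 1 roughly correspond to the significant coefficients above.
exp(confint(full.fit))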
library(car) #vif() comes from car
library(lme4) #not needed for vif(); this is what loads Matrix below
## Loading required package: Matrix
vif(full.fit) #variance inflation factors; all are below the common threshold of 5, so collinearity is not a concern
## thick u.size u.shape adhsn s.size nucl chrom n.nuc
## 1.235204 3.248811 2.830353 1.302178 1.635668 1.372931 1.523493 1.343145
## mit
## 1.059707
train$probs = predict(full.fit, type="response")
train$probs[1:5] #inspect the first 5 predicted probabilities
## [1] 0.02052820 0.01087838 0.99992668 0.08987453 0.01379266
contrasts(train$class)
## malignant
## benign 0
## malignant 1
train$predict = rep("benign", 474)
train$predict[train$probs>0.5]="malignant"
table(train$predict, train$class)
##
## benign malignant
## benign 294 7
## malignant 8 165
mean(train$predict==train$class)
## [1] 0.9683544
test$prob = predict(full.fit, newdata=test, type="response")
test$predict = rep("benign", 209)
test$predict[test$prob>0.5]="malignant"
table(test$predict, test$class)
##
## benign malignant
## benign 139 2
## malignant 3 65
mean(test$predict==test$class)
## [1] 0.9760766
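#If the caret package is installed (an assumption; it is not used elsewhere here),
#confusionMatrix() reports sensitivity, specificity, and related statistics in one call.
library(caret)
confusionMatrix(factor(test$predict, levels=levels(test$class)), test$class, positive="malignant")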
#Logistic Regression with cross-validation
library(bestglm)
## Loading required package: leaps
train$y = rep(0, 474) #bestglm needs a numeric 0/1 response named y in the last column
train$y[train$class=="malignant"]=1
head(train$y) #confirm the coding: 1 = malignant
## [1] 0 0 1 0 0 0
biopsy.cv = train[ ,-10:-12] #drop class, probs, and predict, leaving the nine features plus y
head(biopsy.cv)
## thick u.size u.shape adhsn s.size nucl chrom n.nuc mit y
## 1 5 1 1 1 2 1 3 1 1 0
## 3 3 1 1 1 2 2 3 1 1 0
## 6 8 10 10 8 7 10 9 7 1 1
## 7 1 1 1 1 2 10 3 1 1 0
## 9 2 1 1 1 2 1 1 1 5 0
## 10 4 2 1 1 2 1 2 1 1 0
bestglm(Xy = biopsy.cv, IC="CV", CVArgs=list(Method="HTF", K=10, REP=1), family=binomial)
## Morgan-Tatar search since family is non-gaussian.
## CV(K = 10, REP = 1)
## BICq equivalent for q in (7.16797006619085e-05, 0.273173435514231)
## Best Model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.8147191 0.90996494 -8.587934 8.854687e-18
## thick 0.6188466 0.14713075 4.206100 2.598159e-05
## u.size 0.6582015 0.15295415 4.303260 1.683031e-05
## nucl 0.5725902 0.09922549 5.770596 7.899178e-09
reduce.fit = glm(class~thick+u.size+nucl, family=binomial, data=train) #refit using only the CV-selected features
train$cv.probs = predict(reduce.fit, type="response")
train$cv.predict = rep("benign", 474)
train$cv.predict[train$cv.probs>0.5]="malignant"
table(train$cv.predict, train$class)
##
## benign malignant
## benign 294 9
## malignant 8 163
test$cv.probs = predict(reduce.fit, newdata=test, type="response")
test$predict = rep("benign", 209)
test$predict[test$cv.probs>0.5]="malignant"
table(test$predict, test$class)
##
## benign malignant
## benign 139 5
## malignant 3 62
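#Test-set accuracy of the cross-validation-selected model, for comparison with the
#full model's 0.976 above.
mean(test$predict==test$class)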
bestglm(Xy= biopsy.cv, IC="BIC", family=binomial)
## Morgan-Tatar search since family is non-gaussian.
## BIC
## BICq equivalent for q in (0.273173435514231, 0.577036596263757)
## Best Model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.6169613 1.03155250 -8.353391 6.633065e-17
## thick 0.7113613 0.14751510 4.822295 1.419160e-06
## adhsn 0.4537948 0.15034294 3.018398 2.541153e-03
## nucl 0.5579922 0.09848156 5.665956 1.462068e-08
## n.nuc 0.4290854 0.11845720 3.622282 2.920152e-04
bic.fit = glm(class~thick+adhsn+nucl+n.nuc, family=binomial, data=train) #refit with the BIC-selected features
test$bic.probs = predict(bic.fit, newdata=test, type="response")
test$bic.predict = rep("benign", 209)
test$bic.predict[test$bic.probs>0.5]="malignant"
table(test$bic.predict, test$class)
##
## benign malignant
## benign 138 1
## malignant 4 66
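#Test-set accuracy of the BIC-selected model.
mean(test$bic.predict==test$class)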
#Discriminant Analysis
lda.train = train[ ,-11:-15] #keep only the nine features and class; drop the logistic-regression columns
lda.train[1:3,]
## thick u.size u.shape adhsn s.size nucl chrom n.nuc mit class
## 1 5 1 1 1 2 1 3 1 1 benign
## 3 3 1 1 1 2 2 3 1 1 benign
## 6 8 10 10 8 7 10 9 7 1 malignant
lda.test = test[ ,-11:-15]
lda.test[1:3,]
## thick u.size u.shape adhsn s.size nucl chrom n.nuc mit class
## 2 5 4 4 5 7 10 3 2 1 benign
## 4 6 8 8 1 3 4 3 7 1 benign
## 5 4 1 1 3 2 1 3 1 1 benign
lda.fit = lda(class~., data=lda.train) #LDA with the default priors taken from the training class proportions
lda.fit
## Call:
## lda(class ~ ., data = lda.train)
##
## Prior probabilities of groups:
## benign malignant
## 0.6371308 0.3628692
##
## Group means:
## thick u.size u.shape adhsn s.size nucl chrom
## benign 2.92053 1.304636 1.413907 1.324503 2.115894 1.397351 2.082781
## malignant 7.19186 6.697674 6.686047 5.668605 5.500000 7.674419 5.959302
## n.nuc mit
## benign 1.225166 1.092715
## malignant 5.906977 2.639535
##
## Coefficients of linear discriminants:
## LD1
## thick 0.19557291
## u.size 0.10555201
## u.shape 0.06327200
## adhsn 0.04752757
## s.size 0.10678521
## nucl 0.26196145
## chrom 0.08102965
## n.nuc 0.11691054
## mit -0.01665454
plot(lda.fit, type="both")

lda.predict = predict(lda.fit)
train$lda = lda.predict$class
table(train$lda, train$class)
##
## benign malignant
## benign 296 13
## malignant 6 159
lda.test = predict(lda.fit, newdata = test) #note: this reuses the name lda.test for the prediction object
test$lda = lda.test$class
table(test$lda, test$class)
##
## benign malignant
## benign 140 6
## malignant 2 61
mean(test$lda==test$class)
## [1] 0.9617225
qda.fit = qda(class~., data=lda.train) #quadratic discriminant analysis on the same training frame
qda.fit
## Call:
## qda(class ~ ., data = lda.train)
##
## Prior probabilities of groups:
## benign malignant
## 0.6371308 0.3628692
##
## Group means:
## thick u.size u.shape adhsn s.size nucl chrom
## benign 2.92053 1.304636 1.413907 1.324503 2.115894 1.397351 2.082781
## malignant 7.19186 6.697674 6.686047 5.668605 5.500000 7.674419 5.959302
## n.nuc mit
## benign 1.225166 1.092715
## malignant 5.906977 2.639535
qda.predict = predict(qda.fit)
train$qda = qda.predict$class
table(train$qda, train$class)
##
## benign malignant
## benign 287 5
## malignant 15 167
qda.test = predict(qda.fit, newdata=test)
test$qda = qda.test$class
table(test$qda, test$class)
##
## benign malignant
## benign 132 1
## malignant 10 66
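#QDA accuracy on the test set, comparable with the LDA figure of 0.962 above.
mean(test$qda==test$class)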
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
bad.fit = glm(class~thick, family=binomial, data=test) #a deliberately weak single-predictor model (fit on the test set) for ROC comparison
test$bad.probs = predict(bad.fit, type="response") #save probabilities
pred.full = prediction(test$prob, test$class) #ROCR prediction object for the full logistic model
perf.full = performance(pred.full, "tpr", "fpr") #true-positive rate vs. false-positive rate
plot(perf.full, main="ROC", col=1)
pred.bic = prediction(test$bic.probs, test$class)
perf.bic = performance(pred.bic, "tpr", "fpr")
plot(perf.bic, col=2, add=TRUE)
pred.bad = prediction(test$bad.probs, test$class)
perf.bad = performance(pred.bad, "tpr", "fpr")
plot(perf.bad, col=3, add=TRUE)
legend(0.6, 0.6, c("FULL", "BIC", "BAD"), col=1:3, lty=1) #match the line colors used above

auc.full = performance(pred.full, "auc")
auc.full
## An object of class "performance"
## Slot "x.name":
## [1] "None"
##
## Slot "y.name":
## [1] "Area under the ROC curve"
##
## Slot "alpha.name":
## [1] "none"
##
## Slot "x.values":
## list()
##
## Slot "y.values":
## [[1]]
## [1] 0.9972672
##
##
## Slot "alpha.values":
## list()
auc.bic = performance(pred.bic, "auc")
auc.bic
## An object of class "performance"
## Slot "x.name":
## [1] "None"
##
## Slot "y.name":
## [1] "Area under the ROC curve"
##
## Slot "alpha.name":
## [1] "none"
##
## Slot "x.values":
## list()
##
## Slot "y.values":
## [[1]]
## [1] 0.9944293
##
##
## Slot "alpha.values":
## list()
auc.bad = performance(pred.bad, "auc")
auc.bad
## An object of class "performance"
## Slot "x.name":
## [1] "None"
##
## Slot "y.name":
## [1] "Area under the ROC curve"
##
## Slot "alpha.name":
## [1] "none"
##
## Slot "x.values":
## list()
##
## Slot "y.values":
## [[1]]
## [1] 0.8962056
##
##
## Slot "alpha.values":
## list()
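#The AUC itself lives in the y.values slot of each performance object; pulling the
#numbers out directly is more compact than printing the whole S4 object.
c(FULL=auc.full@y.values[[1]], BIC=auc.bic@y.values[[1]], BAD=auc.bad@y.values[[1]])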