Attached on separate paper.
I choose the best iteration, 1115, to model the data. The misclassification rate is 0.002282361 on the training set and 0.02542373 on the test set. The non-spam misclassification rate on the training data is 0.001081666. Of all the spam emails in the test set, a fraction 0.0210356 is misclassified; of all the non-spam emails in the test set, a fraction 0.0283828 is misclassified.
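For reference, a minimal sketch (the helper name misclass.rates is mine, not part of the assignment code) of how these rates are read off a 2-by-2 confusion table with predicted class in rows, true class in columns, and spam coded as 1; it mirrors the calculations used further below.
# Overall, non-spam and spam misclassification rates from a confusion table.
misclass.rates <- function(tab) {
    c(overall = (tab[1, 2] + tab[2, 1])/sum(tab),
      nonspam = tab[2, 1]/sum(tab[, 1]),  # true non-spam predicted as spam
      spam = tab[1, 2]/sum(tab[, 2]))  # true spam predicted as non-spam
}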
# Install gbm once if it is not already available (requires a CRAN mirror):
# install.packages("gbm")
library(gbm)
## Loading required package: survival
## Loading required package: splines
## Loading required package: lattice
## Loading required package: parallel
## Loaded gbm 2.1
spam <- read.table("/Users/shijiabian/Dropbox/STATS315B/Data/Spam_Data.txt",
sep = ",")
names(spam) <- c("make", "address", "all", "3d", "our", "over", "remove", "internet",
"order", "mail", "receive", "will", "people", "report", "addresses", "free",
"business", "email", "you", "credit", "your", "font", "000", "money", "hp",
"hpl", "george", "650", "lab", "labs", "telnet", "857", "data", "415", "85",
"technology", "1999", "parts", "pm", "direct", "cs", "meeting", "original",
"project", "re", "edu", "table", "conference", ";", "(", "[", "!", "$",
"#", "CAPAVE", "CAPMAX", "CAPTOT", "type")
dim(spam)
## [1] 4601 58
# 4601 58
train <- read.table("/Users/shijiabian/Dropbox/STATS315B/Data/Spam_Train.txt",
sep = ",")
names(train) <- c("make", "address", "all", "3d", "our", "over", "remove", "internet",
"order", "mail", "receive", "will", "people", "report", "addresses", "free",
"business", "email", "you", "credit", "your", "font", "000", "money", "hp",
"hpl", "george", "650", "lab", "labs", "telnet", "857", "data", "415", "85",
"technology", "1999", "parts", "pm", "direct", "cs", "meeting", "original",
"project", "re", "edu", "table", "conference", ";", "(", "[", "!", "$",
"#", "CAPAVE", "CAPMAX", "CAPTOT", "type")
dim(train)
## [1] 3067 58
# 3067 58
test <- read.table("/Users/shijiabian/Dropbox/STATS315B/Data/Spam_Test.txt",
sep = ",")
names(test) <- c("make", "address", "all", "3d", "our", "over", "remove", "internet",
"order", "mail", "receive", "will", "people", "report", "addresses", "free",
"business", "email", "you", "credit", "your", "font", "000", "money", "hp",
"hpl", "george", "650", "lab", "labs", "telnet", "857", "data", "415", "85",
"technology", "1999", "parts", "pm", "direct", "cs", "meeting", "original",
"project", "re", "edu", "table", "conference", ";", "(", "[", "!", "$",
"#", "CAPAVE", "CAPMAX", "CAPTOT", "type")
dim(test)
## [1] 1534 58
# 1534 58
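As a side note, the same 58 column names are assigned three times above; a small optional refactor (a sketch that simply reuses the names already attached to spam) avoids the repetition:
# Read one of the spam files and reuse the column names from `spam`.
read.spam <- function(path, col.names = names(spam)) {
    d <- read.table(path, sep = ",")
    names(d) <- col.names
    d
}
# train <- read.spam("/Users/shijiabian/Dropbox/STATS315B/Data/Spam_Train.txt")
# test <- read.spam("/Users/shijiabian/Dropbox/STATS315B/Data/Spam_Test.txt")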
set.seed(131)
x <- train[sample(nrow(train)), ]
# Fit the training data into the Gradient Boosting Model
set.seed(444)
gbm0_train <- gbm(type ~ ., data = train, interaction.depth = 4, shrinkage = 0.05,
n.trees = 2500, bag.fraction = 0.5, cv.folds = 5, distribution = "bernoulli",
verbose = T)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2883 nan 0.0500 0.0263
## 2 1.2346 nan 0.0500 0.0264
## 3 1.1824 nan 0.0500 0.0256
## 4 1.1404 nan 0.0500 0.0201
## 5 1.0974 nan 0.0500 0.0209
## 6 1.0569 nan 0.0500 0.0196
## 7 1.0238 nan 0.0500 0.0159
## 8 0.9928 nan 0.0500 0.0152
## 9 0.9648 nan 0.0500 0.0140
## 10 0.9330 nan 0.0500 0.0154
## 20 0.7264 nan 0.0500 0.0063
## 40 0.5117 nan 0.0500 0.0047
## 60 0.4085 nan 0.0500 0.0009
## 80 0.3467 nan 0.0500 0.0007
## 100 0.3114 nan 0.0500 0.0002
## 120 0.2849 nan 0.0500 0.0005
## 140 0.2664 nan 0.0500 0.0000
## 160 0.2504 nan 0.0500 0.0001
## 180 0.2363 nan 0.0500 -0.0000
## 200 0.2233 nan 0.0500 0.0003
## 220 0.2129 nan 0.0500 0.0001
## 240 0.2023 nan 0.0500 -0.0001
## 260 0.1921 nan 0.0500 -0.0001
## 280 0.1839 nan 0.0500 0.0000
## 300 0.1762 nan 0.0500 -0.0000
## 320 0.1708 nan 0.0500 -0.0001
## 340 0.1640 nan 0.0500 -0.0001
## 360 0.1581 nan 0.0500 -0.0001
## 380 0.1525 nan 0.0500 -0.0001
## 400 0.1471 nan 0.0500 -0.0001
## 420 0.1423 nan 0.0500 -0.0000
## 440 0.1366 nan 0.0500 0.0000
## 460 0.1326 nan 0.0500 -0.0002
## 480 0.1284 nan 0.0500 -0.0001
## 500 0.1238 nan 0.0500 0.0000
## 520 0.1199 nan 0.0500 -0.0000
## 540 0.1151 nan 0.0500 -0.0001
## 560 0.1110 nan 0.0500 -0.0000
## 580 0.1075 nan 0.0500 -0.0001
## 600 0.1039 nan 0.0500 -0.0000
## 620 0.1003 nan 0.0500 -0.0001
## 640 0.0969 nan 0.0500 0.0000
## 660 0.0942 nan 0.0500 -0.0001
## 680 0.0912 nan 0.0500 -0.0001
## 700 0.0886 nan 0.0500 -0.0001
## 720 0.0861 nan 0.0500 -0.0001
## 740 0.0832 nan 0.0500 -0.0000
## 760 0.0812 nan 0.0500 -0.0001
## 780 0.0790 nan 0.0500 -0.0001
## 800 0.0766 nan 0.0500 -0.0000
## 820 0.0742 nan 0.0500 -0.0000
## 840 0.0720 nan 0.0500 -0.0001
## 860 0.0699 nan 0.0500 -0.0000
## 880 0.0678 nan 0.0500 -0.0000
## 900 0.0657 nan 0.0500 -0.0001
## 920 0.0639 nan 0.0500 -0.0001
## 940 0.0620 nan 0.0500 -0.0001
## 960 0.0602 nan 0.0500 -0.0000
## 980 0.0586 nan 0.0500 -0.0000
## 1000 0.0573 nan 0.0500 -0.0000
## 1020 0.0560 nan 0.0500 -0.0000
## 1040 0.0546 nan 0.0500 0.0000
## 1060 0.0534 nan 0.0500 -0.0000
## 1080 0.0520 nan 0.0500 -0.0000
## 1100 0.0508 nan 0.0500 -0.0000
## 1120 0.0494 nan 0.0500 -0.0000
## 1140 0.0483 nan 0.0500 -0.0000
## 1160 0.0471 nan 0.0500 -0.0000
## 1180 0.0458 nan 0.0500 -0.0000
## 1200 0.0450 nan 0.0500 -0.0000
## 1220 0.0439 nan 0.0500 -0.0001
## 1240 0.0429 nan 0.0500 -0.0000
## 1260 0.0418 nan 0.0500 -0.0000
## 1280 0.0408 nan 0.0500 -0.0001
## 1300 0.0398 nan 0.0500 -0.0000
## 1320 0.0389 nan 0.0500 -0.0000
## 1340 0.0380 nan 0.0500 -0.0000
## 1360 0.0370 nan 0.0500 0.0000
## 1380 0.0362 nan 0.0500 -0.0001
## 1400 0.0353 nan 0.0500 -0.0000
## 1420 0.0346 nan 0.0500 -0.0000
## 1440 0.0336 nan 0.0500 -0.0000
## 1460 0.0330 nan 0.0500 -0.0000
## 1480 0.0324 nan 0.0500 -0.0000
## 1500 0.0315 nan 0.0500 -0.0000
## 1520 0.0307 nan 0.0500 -0.0000
## 1540 0.0300 nan 0.0500 -0.0000
## 1560 0.0292 nan 0.0500 -0.0000
## 1580 0.0286 nan 0.0500 -0.0000
## 1600 0.0280 nan 0.0500 -0.0000
## 1620 0.0273 nan 0.0500 -0.0000
## 1640 0.0267 nan 0.0500 -0.0000
## 1660 0.0262 nan 0.0500 -0.0000
## 1680 0.0256 nan 0.0500 0.0000
## 1700 0.0250 nan 0.0500 -0.0000
## 1720 0.0244 nan 0.0500 -0.0000
## 1740 0.0238 nan 0.0500 0.0000
## 1760 0.0232 nan 0.0500 -0.0000
## 1780 0.0228 nan 0.0500 -0.0000
## 1800 0.0222 nan 0.0500 -0.0000
## 1820 0.0218 nan 0.0500 -0.0000
## 1840 0.0213 nan 0.0500 -0.0000
## 1860 0.0209 nan 0.0500 -0.0000
## 1880 0.0204 nan 0.0500 -0.0000
## 1900 0.0200 nan 0.0500 -0.0000
## 1920 0.0196 nan 0.0500 -0.0000
## 1940 0.0192 nan 0.0500 -0.0000
## 1960 0.0187 nan 0.0500 -0.0000
## 1980 0.0183 nan 0.0500 -0.0000
## 2000 0.0180 nan 0.0500 -0.0000
## 2020 0.0176 nan 0.0500 -0.0000
## 2040 0.0172 nan 0.0500 -0.0000
## 2060 0.0168 nan 0.0500 -0.0000
## 2080 0.0164 nan 0.0500 -0.0000
## 2100 0.0161 nan 0.0500 -0.0000
## 2120 0.0157 nan 0.0500 -0.0000
## 2140 0.0154 nan 0.0500 -0.0000
## 2160 0.0151 nan 0.0500 -0.0000
## 2180 0.0148 nan 0.0500 0.0000
## 2200 0.0145 nan 0.0500 -0.0000
## 2220 0.0141 nan 0.0500 0.0000
## 2240 0.0139 nan 0.0500 -0.0000
## 2260 0.0135 nan 0.0500 -0.0000
## 2280 0.0132 nan 0.0500 0.0000
## 2300 0.0129 nan 0.0500 -0.0000
## 2320 0.0126 nan 0.0500 -0.0000
## 2340 0.0123 nan 0.0500 -0.0000
## 2360 0.0120 nan 0.0500 -0.0000
## 2380 0.0118 nan 0.0500 -0.0000
## 2400 0.0116 nan 0.0500 -0.0000
## 2420 0.0114 nan 0.0500 -0.0000
## 2440 0.0112 nan 0.0500 0.0000
## 2460 0.0110 nan 0.0500 0.0000
## 2480 0.0107 nan 0.0500 -0.0000
## 2500 0.0106 nan 0.0500 -0.0000
# Optimal number of iterations; the criterion is CV, which is not overly conservative.
best.iter_train <- gbm.perf(gbm0_train, method = "cv")
# 1115
# Prediction on the training set
gbm0.predict <- predict(gbm0_train, x, type = "response", n.trees = best.iter_train)
# Classify using a threshold of 0.5 on the predicted probability
gbm0.predict[gbm0.predict < 0.5] = 0
gbm0.predict[gbm0.predict >= 0.5] = 1
matrix.train <- table(gbm0.predict, x$type)
# The training error rate is 0.002282361.
(matrix.train[1, 2] + matrix.train[2, 1])/nrow(train)
## [1] 0.001956
# non-spam error rate for the training data 0.001081666
matrix.train[2, 1]/(matrix.train[1, 1] + matrix.train[2, 1])
## [1] 0.0005408
# The test set
gbm0.predict.test <- predict(gbm0_train, test, type = "response", n.trees = best.iter_train)
# Keep the same 0.5 threshold
gbm0.predict.test[gbm0.predict.test < 0.5] = 0
gbm0.predict.test[gbm0.predict.test >= 0.5] = 1
matrix.test <- table(gbm0.predict.test, test$type)
testerror <- (matrix.test[1, 2] + matrix.test[2, 1])/nrow(test)
errornonspam <- matrix.test[2, 1]/(matrix.test[1, 1] + matrix.test[2, 1])
errorspam <- matrix.test[1, 2]/(matrix.test[1, 2] + matrix.test[2, 2])
(i) On the training set, the overall classification error rate is 0.00652103, the error rate for non-spam is 0.002704164, and the error rate for spam is 0.01231527. Because the non-spam error rate on the training set is below 0.3%, this is the model we want. Applying this model to the test set, the overall misclassification error rate is 0.03585398, the non-spam error rate is 0.03165939, and the spam error rate is 0.0420712.
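The three test-set error rates for this first model are stored above but never printed; a one-line sketch to display them:
# Print the overall, non-spam and spam test error rates computed above.
c(test.overall = testerror, test.nonspam = errornonspam, test.spam = errorspam)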
# Approximate a loss matrix by weighting each non-spam email 25 times as
# heavily as a spam email. Shuffle first so the weights line up with the
# rows actually passed to gbm.
x <- train[sample(nrow(train)), ]
weights <- rep(1, nrow(x))
weights[x$type == 0] = 25
# Fit the training data into the Gradient Boosting Model
mod.gbm2 <- gbm(type ~ ., data = x, weights = weights, interaction.depth = 4,
shrinkage = 0.05, n.trees = 3000, bag.fraction = 0.5, cv.folds = 5, distribution = "bernoulli",
verbose = T)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2882 nan 0.0500 0.0270
## 2 1.2371 nan 0.0500 0.0234
## 3 1.1867 nan 0.0500 0.0253
## 4 1.1447 nan 0.0500 0.0203
## 5 1.1068 nan 0.0500 0.0185
## 6 1.0656 nan 0.0500 0.0194
## 7 1.0336 nan 0.0500 0.0151
## 8 1.0037 nan 0.0500 0.0141
## 9 0.9705 nan 0.0500 0.0159
## 10 0.9392 nan 0.0500 0.0151
## 20 0.7306 nan 0.0500 0.0077
## 40 0.5190 nan 0.0500 0.0026
## 60 0.4118 nan 0.0500 0.0011
## 80 0.3540 nan 0.0500 0.0005
## 100 0.3185 nan 0.0500 0.0001
## 120 0.2915 nan 0.0500 0.0000
## 140 0.2716 nan 0.0500 -0.0001
## 160 0.2520 nan 0.0500 0.0003
## 180 0.2370 nan 0.0500 -0.0001
## 200 0.2221 nan 0.0500 -0.0000
## 220 0.2098 nan 0.0500 -0.0004
## 240 0.2000 nan 0.0500 -0.0001
## 260 0.1892 nan 0.0500 -0.0001
## 280 0.1798 nan 0.0500 -0.0000
## 300 0.1701 nan 0.0500 0.0001
## 320 0.1621 nan 0.0500 -0.0001
## 340 0.1548 nan 0.0500 -0.0000
## 360 0.1471 nan 0.0500 -0.0000
## 380 0.1404 nan 0.0500 -0.0000
## 400 0.1337 nan 0.0500 -0.0001
## 420 0.1285 nan 0.0500 0.0000
## 440 0.1238 nan 0.0500 -0.0001
## 460 0.1180 nan 0.0500 -0.0000
## 480 0.1127 nan 0.0500 -0.0001
## 500 0.1084 nan 0.0500 -0.0001
## 520 0.1044 nan 0.0500 -0.0001
## 540 0.1003 nan 0.0500 -0.0001
## 560 0.0963 nan 0.0500 -0.0002
## 580 0.0930 nan 0.0500 -0.0001
## 600 0.0897 nan 0.0500 -0.0001
## 620 0.0866 nan 0.0500 -0.0001
## 640 0.0833 nan 0.0500 -0.0001
## 660 0.0803 nan 0.0500 -0.0001
## 680 0.0775 nan 0.0500 -0.0001
## 700 0.0749 nan 0.0500 -0.0000
## 720 0.0717 nan 0.0500 -0.0001
## 740 0.0695 nan 0.0500 -0.0000
## 760 0.0673 nan 0.0500 -0.0000
## 780 0.0650 nan 0.0500 -0.0000
## 800 0.0632 nan 0.0500 -0.0000
## 820 0.0610 nan 0.0500 -0.0000
## 840 0.0592 nan 0.0500 0.0000
## 860 0.0574 nan 0.0500 -0.0000
## 880 0.0555 nan 0.0500 -0.0000
## 900 0.0537 nan 0.0500 -0.0001
## 920 0.0517 nan 0.0500 0.0000
## 940 0.0502 nan 0.0500 -0.0000
## 960 0.0485 nan 0.0500 -0.0001
## 980 0.0470 nan 0.0500 -0.0000
## 1000 0.0458 nan 0.0500 -0.0000
## 1020 0.0446 nan 0.0500 -0.0000
## 1040 0.0432 nan 0.0500 -0.0000
## 1060 0.0419 nan 0.0500 -0.0000
## 1080 0.0409 nan 0.0500 -0.0000
## 1100 0.0398 nan 0.0500 -0.0001
## 1120 0.0387 nan 0.0500 -0.0000
## 1140 0.0378 nan 0.0500 0.0000
## 1160 0.0367 nan 0.0500 -0.0000
## 1180 0.0356 nan 0.0500 -0.0000
## 1200 0.0348 nan 0.0500 0.0000
## 1220 0.0340 nan 0.0500 -0.0000
## 1240 0.0332 nan 0.0500 -0.0000
## 1260 0.0325 nan 0.0500 -0.0000
## 1280 0.0314 nan 0.0500 -0.0000
## 1300 0.0306 nan 0.0500 -0.0000
## 1320 0.0300 nan 0.0500 -0.0000
## 1340 0.0292 nan 0.0500 -0.0000
## 1360 0.0285 nan 0.0500 -0.0000
## 1380 0.0276 nan 0.0500 -0.0000
## 1400 0.0269 nan 0.0500 -0.0000
## 1420 0.0263 nan 0.0500 -0.0000
## 1440 0.0257 nan 0.0500 -0.0000
## 1460 0.0249 nan 0.0500 -0.0000
## 1480 0.0243 nan 0.0500 -0.0000
## 1500 0.0237 nan 0.0500 -0.0000
## 1520 0.0231 nan 0.0500 -0.0000
## 1540 0.0226 nan 0.0500 -0.0000
## 1560 0.0220 nan 0.0500 -0.0000
## 1580 0.0214 nan 0.0500 -0.0000
## 1600 0.0209 nan 0.0500 -0.0000
## 1620 0.0204 nan 0.0500 -0.0000
## 1640 0.0199 nan 0.0500 -0.0000
## 1660 0.0195 nan 0.0500 -0.0000
## 1680 0.0190 nan 0.0500 -0.0000
## 1700 0.0185 nan 0.0500 -0.0000
## 1720 0.0181 nan 0.0500 -0.0000
## 1740 0.0177 nan 0.0500 -0.0000
## 1760 0.0173 nan 0.0500 -0.0000
## 1780 0.0169 nan 0.0500 -0.0000
## 1800 0.0166 nan 0.0500 -0.0000
## 1820 0.0162 nan 0.0500 -0.0000
## 1840 0.0159 nan 0.0500 -0.0000
## 1860 0.0155 nan 0.0500 -0.0000
## 1880 0.0151 nan 0.0500 -0.0000
## 1900 0.0148 nan 0.0500 -0.0000
## 1920 0.0145 nan 0.0500 -0.0000
## 1940 0.0142 nan 0.0500 0.0000
## 1960 0.0138 nan 0.0500 -0.0000
## 1980 0.0135 nan 0.0500 -0.0000
## 2000 0.0132 nan 0.0500 -0.0000
## 2020 0.0129 nan 0.0500 -0.0000
## 2040 0.0126 nan 0.0500 -0.0000
## 2060 0.0123 nan 0.0500 -0.0000
## 2080 0.0120 nan 0.0500 -0.0000
## 2100 0.0117 nan 0.0500 -0.0000
## 2120 0.0115 nan 0.0500 -0.0000
## 2140 0.0113 nan 0.0500 -0.0000
## 2160 0.0111 nan 0.0500 -0.0000
## 2180 0.0109 nan 0.0500 -0.0000
## 2200 0.0106 nan 0.0500 0.0000
## 2220 0.0104 nan 0.0500 -0.0000
## 2240 0.0102 nan 0.0500 -0.0000
## 2260 0.0099 nan 0.0500 -0.0000
## 2280 0.0097 nan 0.0500 -0.0000
## 2300 0.0095 nan 0.0500 -0.0000
## 2320 0.0093 nan 0.0500 -0.0000
## 2340 0.0091 nan 0.0500 -0.0000
## 2360 0.0089 nan 0.0500 -0.0000
## 2380 0.0087 nan 0.0500 -0.0000
## 2400 0.0085 nan 0.0500 -0.0000
## 2420 0.0083 nan 0.0500 0.0000
## 2440 0.0081 nan 0.0500 -0.0000
## 2460 0.0079 nan 0.0500 -0.0000
## 2480 0.0078 nan 0.0500 -0.0000
## 2500 0.0076 nan 0.0500 -0.0000
## 2520 0.0075 nan 0.0500 -0.0000
## 2540 0.0074 nan 0.0500 -0.0000
## 2560 0.0072 nan 0.0500 -0.0000
## 2580 0.0071 nan 0.0500 -0.0000
## 2600 0.0069 nan 0.0500 -0.0000
## 2620 0.0068 nan 0.0500 -0.0000
## 2640 0.0066 nan 0.0500 -0.0000
## 2660 0.0064 nan 0.0500 -0.0000
## 2680 0.0063 nan 0.0500 -0.0000
## 2700 0.0062 nan 0.0500 -0.0000
## 2720 0.0061 nan 0.0500 -0.0000
## 2740 0.0060 nan 0.0500 -0.0000
## 2760 0.0059 nan 0.0500 -0.0000
## 2780 0.0058 nan 0.0500 -0.0000
## 2800 0.0056 nan 0.0500 -0.0000
## 2820 0.0055 nan 0.0500 -0.0000
## 2840 0.0054 nan 0.0500 -0.0000
## 2860 0.0053 nan 0.0500 -0.0000
## 2880 0.0052 nan 0.0500 -0.0000
## 2900 0.0050 nan 0.0500 -0.0000
## 2920 0.0049 nan 0.0500 -0.0000
## 2940 0.0049 nan 0.0500 -0.0000
## 2960 0.0048 nan 0.0500 -0.0000
## 2980 0.0047 nan 0.0500 -0.0000
## 3000 0.0046 nan 0.0500 -0.0000
# Optimal number of iterations; the criterion is CV, which is not overly conservative:
# 632
best.iter2_train <- gbm.perf(mod.gbm2, method = "cv")
# Prediction on the training set: the classification error rate is
# 0.00652103, the error rate for non-spam is 0.002704164, and the error
# rate for spam is 0.01231527.
gbm2.predict <- predict(mod.gbm2, train, type = "response", n.trees = best.iter2_train)
summary(gbm2.predict)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0002 0.0058 0.3950 0.9990 1.0000
gbm2.predict[gbm2.predict < 0.5] = 0
gbm2.predict[gbm2.predict >= 0.5] = 1
matrix.train <- table(gbm2.predict, train$type)
#             train$type
# gbm2.predict    0    1
#            0 1844   15
#            1    5 1203
# The test error rate is 0.03585398
gbm2.predict.test <- predict(mod.gbm2, test, type = "response", n.trees = best.iter_train)
gbm2.predict.test[gbm2.predict.test < 0.5] = 0
gbm2.predict.test[gbm2.predict.test >= 0.5] = 1
matrix.test <- table(gbm2.predict.test, test$type)
matrix.test
##
## gbm2.predict.test 0 1
## 0 893 26
## 1 23 592
# 0.02477184
(matrix.test[1, 2] + matrix.test[2, 1])/nrow(test)
## [1] 0.03194
# Error rate in the non-spam 0.03165939
nons <- matrix.test[2, 1]/(matrix.test[1, 1] + matrix.test[2, 1])
# Error rate in the spam 0.0420712
s <- matrix.test[1, 2]/(matrix.test[1, 2] + matrix.test[2, 2])
(ii) The most important predictor is “!”, followed by remove and $. The detailed list is below.
summary(mod.gbm2, n.tree = best.iter_train, main = "RELATIVE INFLUENCE OF ALL PREDICTORS")
## var rel.inf
## `!` `!` 21.265728
## remove remove 13.163120
## `$` `$` 10.543494
## free free 8.255817
## hp hp 7.817558
## CAPAVE CAPAVE 5.869880
## CAPMAX CAPMAX 4.562482
## money money 3.582327
## your your 3.192165
## our our 2.767499
## CAPTOT CAPTOT 2.363265
## you you 1.811365
## edu edu 1.501527
## `000` `000` 1.420032
## email email 1.317253
## george george 1.230404
## will will 1.026070
## `650` `650` 0.773928
## receive receive 0.763700
## re re 0.691670
## `1999` `1999` 0.648245
## internet internet 0.617790
## `;` `;` 0.610480
## `(` `(` 0.596463
## business business 0.557676
## meeting meeting 0.417150
## all all 0.356686
## mail mail 0.344985
## `#` `#` 0.215144
## report report 0.160104
## original original 0.155549
## font font 0.149922
## hpl hpl 0.142991
## data data 0.139631
## technology technology 0.133998
## `3d` `3d` 0.119878
## order order 0.105537
## labs labs 0.102605
## addresses addresses 0.085125
## pm pm 0.081688
## make make 0.066185
## project project 0.059774
## `[` `[` 0.057546
## address address 0.049215
## over over 0.040246
## people people 0.029670
## cs cs 0.013588
## parts parts 0.008591
## conference conference 0.007644
## `85` `85` 0.003726
## credit credit 0.002884
## lab lab 0.000000
## telnet telnet 0.000000
## `857` `857` 0.000000
## `415` `415` 0.000000
## direct direct 0.000000
## table table 0.000000
(iii)
According to the first three graphs below, "!", "$", and remove are each positively related to the probability of spam. However, the last three graphs show no obvious joint pattern between any pair of these three predictors.
names(spam)
## [1] "make" "address" "all" "3d" "our"
## [6] "over" "remove" "internet" "order" "mail"
## [11] "receive" "will" "people" "report" "addresses"
## [16] "free" "business" "email" "you" "credit"
## [21] "your" "font" "000" "money" "hp"
## [26] "hpl" "george" "650" "lab" "labs"
## [31] "telnet" "857" "data" "415" "85"
## [36] "technology" "1999" "parts" "pm" "direct"
## [41] "cs" "meeting" "original" "project" "re"
## [46] "edu" "table" "conference" ";" "("
## [51] "[" "!" "$" "#" "CAPAVE"
## [56] "CAPMAX" "CAPTOT" "type"
plot(mod.gbm2, i.var = 52, n.trees = best.iter_train, main = "Partial Dependence of '!'")
plot(mod.gbm2, i.var = 53, n.trees = best.iter_train, main = "Partial Dependence of '$'")
plot(mod.gbm2, i.var = 7, n.trees = best.iter_train, main = "Partial Dependence of remove")
plot(mod.gbm2, i.var = c(52, 53), n.trees = best.iter_train, main = "Partial Dependence on '!' and '$'")
plot(mod.gbm2, i.var = c(7, 53), n.trees = best.iter_train, main = "Partial Dependence on remove and '$'")
plot(mod.gbm2, i.var = c(7, 52), n.trees = best.iter_train, main = "Partial Dependence on remove and '!'")
I call gbm to fit the gradient boosting model gbm0, regressing the median housing value on the other variables of california_data. I then call gbm.perf to find the optimal number of iterations, using the criterion method = "test". The optimal number of iterations is 2497, essentially the maximum of 2500 fitted. The root mean squared error on the held-out test set is 0.4620674.
# Read in and clean the california_data
california <- read.table("/Users/shijiabian/Dropbox/STATS315B/Data/california_data.txt",
sep = ",")
names(california) <- c("MedianValue", "MedianIncome", "HouseMedianAge", "AveNoRoom",
"AveNoBdrm", "Population", "AveOccupancy", "Latitude", "Longitude")
set.seed(131)
x <- california[sample(nrow(california)), ]
# Seed for the random subsampling used by bag.fraction
set.seed(444)
# Call gbm on the training set (the first 80% of the sampled rows);
# distribution = "gaussian" minimizes squared error, as appropriate for regression.
gbm0 <- gbm(MedianValue ~ ., data = x, train.fraction = 0.8, interaction.depth = 4,
shrinkage = 0.05, n.trees = 2500, bag.fraction = 0.5, cv.folds = 5, distribution = "gaussian",
verbose = T)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2620 1.2854 0.0500 0.0624
## 2 1.2039 1.2254 0.0500 0.0567
## 3 1.1518 1.1725 0.0500 0.0519
## 4 1.1037 1.1238 0.0500 0.0473
## 5 1.0594 1.0798 0.0500 0.0436
## 6 1.0192 1.0395 0.0500 0.0406
## 7 0.9816 1.0015 0.0500 0.0386
## 8 0.9468 0.9650 0.0500 0.0337
## 9 0.9157 0.9337 0.0500 0.0307
## 10 0.8867 0.9050 0.0500 0.0273
## 20 0.6890 0.7019 0.0500 0.0143
## 40 0.5036 0.5115 0.0500 0.0066
## 60 0.4116 0.4165 0.0500 0.0020
## 80 0.3623 0.3660 0.0500 0.0009
## 100 0.3348 0.3376 0.0500 0.0013
## 120 0.3168 0.3203 0.0500 0.0010
## 140 0.3049 0.3085 0.0500 0.0008
## 160 0.2958 0.3004 0.0500 0.0003
## 180 0.2884 0.2943 0.0500 0.0000
## 200 0.2823 0.2894 0.0500 0.0000
## 220 0.2755 0.2836 0.0500 0.0001
## 240 0.2702 0.2794 0.0500 0.0001
## 260 0.2661 0.2761 0.0500 -0.0001
## 280 0.2620 0.2727 0.0500 -0.0000
## 300 0.2585 0.2701 0.0500 -0.0000
## 320 0.2556 0.2686 0.0500 -0.0001
## 340 0.2524 0.2658 0.0500 0.0000
## 360 0.2499 0.2638 0.0500 0.0000
## 380 0.2470 0.2618 0.0500 -0.0001
## 400 0.2444 0.2592 0.0500 0.0001
## 420 0.2420 0.2577 0.0500 -0.0001
## 440 0.2399 0.2562 0.0500 -0.0001
## 460 0.2375 0.2546 0.0500 -0.0001
## 480 0.2354 0.2530 0.0500 -0.0000
## 500 0.2328 0.2512 0.0500 0.0002
## 520 0.2306 0.2498 0.0500 0.0002
## 540 0.2285 0.2485 0.0500 -0.0001
## 560 0.2262 0.2464 0.0500 -0.0001
## 580 0.2245 0.2457 0.0500 -0.0000
## 600 0.2229 0.2447 0.0500 0.0000
## 620 0.2212 0.2439 0.0500 0.0000
## 640 0.2190 0.2423 0.0500 0.0001
## 660 0.2174 0.2416 0.0500 -0.0000
## 680 0.2157 0.2408 0.0500 0.0000
## 700 0.2144 0.2401 0.0500 -0.0000
## 720 0.2133 0.2397 0.0500 0.0000
## 740 0.2119 0.2391 0.0500 -0.0001
## 760 0.2106 0.2385 0.0500 -0.0000
## 780 0.2091 0.2374 0.0500 0.0000
## 800 0.2082 0.2371 0.0500 -0.0000
## 820 0.2068 0.2366 0.0500 -0.0000
## 840 0.2055 0.2357 0.0500 -0.0001
## 860 0.2045 0.2350 0.0500 -0.0000
## 880 0.2030 0.2342 0.0500 0.0001
## 900 0.2019 0.2332 0.0500 -0.0001
## 920 0.2008 0.2326 0.0500 0.0001
## 940 0.1997 0.2324 0.0500 -0.0001
## 960 0.1986 0.2318 0.0500 -0.0000
## 980 0.1976 0.2315 0.0500 -0.0000
## 1000 0.1966 0.2317 0.0500 -0.0000
## 1020 0.1954 0.2313 0.0500 -0.0000
## 1040 0.1946 0.2310 0.0500 -0.0000
## 1060 0.1937 0.2309 0.0500 -0.0000
## 1080 0.1927 0.2304 0.0500 -0.0001
## 1100 0.1920 0.2301 0.0500 -0.0000
## 1120 0.1912 0.2295 0.0500 -0.0001
## 1140 0.1905 0.2290 0.0500 -0.0000
## 1160 0.1894 0.2285 0.0500 -0.0000
## 1180 0.1887 0.2284 0.0500 -0.0001
## 1200 0.1878 0.2276 0.0500 0.0000
## 1220 0.1870 0.2271 0.0500 -0.0000
## 1240 0.1863 0.2268 0.0500 -0.0000
## 1260 0.1855 0.2265 0.0500 -0.0000
## 1280 0.1846 0.2258 0.0500 -0.0000
## 1300 0.1837 0.2254 0.0500 -0.0001
## 1320 0.1831 0.2253 0.0500 -0.0001
## 1340 0.1825 0.2250 0.0500 -0.0001
## 1360 0.1818 0.2245 0.0500 0.0000
## 1380 0.1811 0.2243 0.0500 -0.0000
## 1400 0.1803 0.2241 0.0500 0.0000
## 1420 0.1796 0.2240 0.0500 -0.0000
## 1440 0.1789 0.2235 0.0500 -0.0000
## 1460 0.1782 0.2234 0.0500 -0.0000
## 1480 0.1776 0.2232 0.0500 0.0000
## 1500 0.1768 0.2228 0.0500 -0.0000
## 1520 0.1761 0.2223 0.0500 0.0001
## 1540 0.1753 0.2220 0.0500 -0.0000
## 1560 0.1747 0.2215 0.0500 -0.0001
## 1580 0.1742 0.2214 0.0500 -0.0001
## 1600 0.1736 0.2214 0.0500 0.0000
## 1620 0.1729 0.2211 0.0500 -0.0000
## 1640 0.1723 0.2211 0.0500 -0.0000
## 1660 0.1717 0.2210 0.0500 -0.0001
## 1680 0.1712 0.2211 0.0500 -0.0000
## 1700 0.1706 0.2209 0.0500 -0.0001
## 1720 0.1699 0.2205 0.0500 -0.0000
## 1740 0.1694 0.2200 0.0500 -0.0001
## 1760 0.1688 0.2197 0.0500 -0.0000
## 1780 0.1683 0.2196 0.0500 -0.0000
## 1800 0.1678 0.2196 0.0500 -0.0000
## 1820 0.1672 0.2193 0.0500 -0.0000
## 1840 0.1667 0.2191 0.0500 -0.0000
## 1860 0.1662 0.2192 0.0500 -0.0000
## 1880 0.1655 0.2186 0.0500 -0.0000
## 1900 0.1650 0.2186 0.0500 -0.0000
## 1920 0.1645 0.2183 0.0500 -0.0000
## 1940 0.1638 0.2179 0.0500 0.0000
## 1960 0.1632 0.2176 0.0500 0.0000
## 1980 0.1626 0.2175 0.0500 -0.0000
## 2000 0.1620 0.2173 0.0500 -0.0000
## 2020 0.1614 0.2170 0.0500 -0.0000
## 2040 0.1610 0.2167 0.0500 -0.0000
## 2060 0.1605 0.2165 0.0500 -0.0000
## 2080 0.1599 0.2164 0.0500 -0.0000
## 2100 0.1594 0.2161 0.0500 -0.0000
## 2120 0.1589 0.2161 0.0500 -0.0000
## 2140 0.1582 0.2156 0.0500 -0.0000
## 2160 0.1577 0.2155 0.0500 -0.0000
## 2180 0.1573 0.2158 0.0500 -0.0000
## 2200 0.1569 0.2156 0.0500 0.0000
## 2220 0.1564 0.2155 0.0500 -0.0000
## 2240 0.1559 0.2152 0.0500 -0.0000
## 2260 0.1554 0.2151 0.0500 0.0000
## 2280 0.1550 0.2150 0.0500 -0.0000
## 2300 0.1545 0.2148 0.0500 -0.0000
## 2320 0.1541 0.2148 0.0500 -0.0000
## 2340 0.1537 0.2147 0.0500 -0.0001
## 2360 0.1532 0.2147 0.0500 -0.0001
## 2380 0.1527 0.2145 0.0500 -0.0000
## 2400 0.1524 0.2143 0.0500 -0.0000
## 2420 0.1520 0.2142 0.0500 -0.0000
## 2440 0.1515 0.2144 0.0500 -0.0000
## 2460 0.1510 0.2141 0.0500 -0.0000
## 2480 0.1507 0.2139 0.0500 -0.0000
## 2500 0.1502 0.2140 0.0500 -0.0000
# The optimal number of iterations is 2497; the criterion is set as
# method='test'
best.iter_test <- gbm.perf(gbm0, method = "test")
# Set aside the test set (the last 20% of the sampled rows)
test = tail(x, dim(x)[1] * 0.2)
# Prediction on the test set
gbm0.predict <- predict(gbm0, test, type = "response", n.trees = best.iter_test)
# Test RMSE: 0.4620674
sqrt(sum((gbm0.predict - test[, 1])^2)/(dim(x)[1] * 0.2))
## [1] 0.4625
The relative importance of the predictors is listed below; MedianIncome is by far the most important variable:
sumgbm <- summary(gbm0, n.trees = best.iter_test, main = "RELATIVE INFLUENCE OF ALL PREDICTORS")
sumgbm
## var rel.inf
## MedianIncome MedianIncome 50.199
## AveOccupancy AveOccupancy 13.691
## Longitude Longitude 13.128
## Latitude Latitude 10.336
## HouseMedianAge HouseMedianAge 3.991
## AveNoRoom AveNoRoom 3.683
## Population Population 2.572
## AveNoBdrm AveNoBdrm 2.399
The single-variable partial dependence plots of the median housing price on four of the variables are shown below. The x-axis represents the deciles of the feature values in the training data. We can clearly see that the median house price has a roughly linear relationship with the median income (top left) and that the house price drops as the average number of occupants per household increases (bottom right). The top right plot shows that latitude has a strong negative influence on the median house price; so does longitude.
best.iter <- gbm.perf(gbm0, method = "cv")
names(california)
## [1] "MedianValue" "MedianIncome" "HouseMedianAge" "AveNoRoom"
## [5] "AveNoBdrm" "Population" "AveOccupancy" "Latitude"
## [9] "Longitude"
par(mfrow = c(2, 2))
plot(x = gbm0, i.var = 1, n.trees = best.iter, main = "Partial Dependence of 'MedianIncome' ")
plot(x = gbm0, i.var = 7, n.trees = best.iter, main = "Partial Dependence of 'Latitude' ")
plot(x = gbm0, i.var = 8, n.trees = best.iter, main = "Partial Dependence of 'Longitude' ")
plot(x = gbm0, i.var = 6, n.trees = best.iter, main = "Partial Dependence of 'AveOccupancy' ")
We now plot two-dimensional partial dependence (lattice) plots.
The partial dependence of the California data on MedianIncome and Latitude: there is a strong positive relationship between the two.
plot(x = gbm0, i.var = c(1, 7), n.trees = best.iter, main = "Partial Dependence of 'MedianIncome' and 'Latitude'")
The partial dependence of the California data on MedianIncome and Longitude: there is a strong positive relationship between the two.
plot(x = gbm0, i.var = c(1, 8), n.trees = best.iter, main = "Partial Dependence of 'Median Income' and 'Longitude'")
The partial dependence of the California data on MedianIncome and AveOccupancy: there is no clear relationship between the two.
plot(x = gbm0, i.var = c(1, 6), n.trees = best.iter, main = "Partial Dependence of 'Ave Occupancy' and 'Median Income'")
The partial dependence of the California data on Latitude and Longitude: there is a strong negative relationship between the two.
plot(x = gbm0, i.var = c(7, 8), n.trees = best.iter, main = "Partial Dependence of 'Latitude' and 'Longitude'")
The partial dependence of the California data on Latitude and AveOccupancy: there is no obvious relationship between the two.
plot(x = gbm0, i.var = c(7, 6), n.trees = best.iter, main = "Partial Dependence of 'Latitude' and 'AveOccupancy'")
The partial dependence of the California data on Longitude and AveOccupancy: there is no obvious relationship between the two.
plot(x = gbm0, i.var = c(8, 6), n.trees = best.iter, main = "Partial Dependence of 'Longitude' and 'AveOccupancy'")
I randomly split the data set into a training part and a test part. The test set has 2000 observations, roughly one fifth of the original data. The criterion I use to evaluate prediction performance is the mean squared error.
The mean squared error for the test set given by the gbm is 3.630762.
Repeating the tree model from the hw1 solution (my own answer to that part was incorrect, so I reuse the approach from the hw1 solution), the root node error reported by rpart is 7.6934 and the smallest cross-validated relative error (xerror) is 0.54. Unscaling the xerror gives the estimated MSE: 7.69 * 0.54 = 4.1526. This is higher than the test MSE we get from the gbm model.
Thus the gbm has better performance.
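A minimal sketch of that comparison, using the numbers quoted above:
# Unscale rpart's cross-validated relative error by the root node error
# (both in squared-error units) and compare with the gbm test MSE.
root.node.err <- 7.6934  # root node error from printcp() below
min.xerror <- 0.54  # smallest cross-validated relative error (xerror)
rpart.mse <- root.node.err * min.xerror  # about 4.15
gbm.mse <- 3.630762  # gbm test MSE computed below
rpart.mse > gbm.mse  # TRUE: the gbm model predicts better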
income <- read.table("/Users/shijiabian/Dropbox/STATS315B/Data/Income_Data.txt",
sep = ",")
names(income) <- c("income", "sex", "mariStatus", "age", "edu", "occu", "lengthLive",
"duakIncome", "person", "personUnder18", "houseSta", "type", "ethnic", "language")
library(gbm)
set.seed(131)
income.perm <- sample(1:nrow(income))
income.train <- income[income.perm[1:6993], ]
income.test <- income[income.perm[6994:8993], ]
set.seed(444)
gbm0 <- gbm(income ~ ., data = income.train, interaction.depth = 4, shrinkage = 0.05,
n.trees = 2500, bag.fraction = 0.5, cv.folds = 5, distribution = "gaussian",
verbose = T)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 7.4090 nan 0.0500 0.2797
## 2 7.1722 nan 0.0500 0.2459
## 3 6.9402 nan 0.0500 0.2280
## 4 6.7236 nan 0.0500 0.2135
## 5 6.5254 nan 0.0500 0.1975
## 6 6.3419 nan 0.0500 0.1799
## 7 6.1800 nan 0.0500 0.1609
## 8 6.0207 nan 0.0500 0.1541
## 9 5.8780 nan 0.0500 0.1381
## 10 5.7499 nan 0.0500 0.1171
## 20 4.8754 nan 0.0500 0.0555
## 40 4.2308 nan 0.0500 0.0170
## 60 3.9985 nan 0.0500 0.0050
## 80 3.8971 nan 0.0500 -0.0011
## 100 3.8336 nan 0.0500 -0.0000
## 120 3.7908 nan 0.0500 -0.0004
## 140 3.7517 nan 0.0500 -0.0001
## 160 3.7146 nan 0.0500 0.0046
## 180 3.6802 nan 0.0500 -0.0007
## 200 3.6395 nan 0.0500 0.0001
## 220 3.6072 nan 0.0500 -0.0006
## 240 3.5815 nan 0.0500 -0.0004
## 260 3.5531 nan 0.0500 -0.0001
## 280 3.5370 nan 0.0500 -0.0007
## 300 3.5188 nan 0.0500 -0.0002
## 320 3.4992 nan 0.0500 -0.0000
## 340 3.4845 nan 0.0500 -0.0004
## 360 3.4720 nan 0.0500 -0.0010
## 380 3.4555 nan 0.0500 -0.0015
## 400 3.4433 nan 0.0500 -0.0011
## 420 3.4325 nan 0.0500 -0.0015
## 440 3.4173 nan 0.0500 -0.0003
## 460 3.4063 nan 0.0500 -0.0008
## 480 3.3928 nan 0.0500 -0.0001
## 500 3.3807 nan 0.0500 -0.0008
## 520 3.3717 nan 0.0500 -0.0013
## 540 3.3582 nan 0.0500 -0.0002
## 560 3.3476 nan 0.0500 -0.0017
## 580 3.3402 nan 0.0500 -0.0006
## 600 3.3304 nan 0.0500 -0.0014
## 620 3.3209 nan 0.0500 -0.0004
## 640 3.3124 nan 0.0500 -0.0009
## 660 3.3028 nan 0.0500 -0.0003
## 680 3.2940 nan 0.0500 -0.0016
## 700 3.2874 nan 0.0500 -0.0011
## 720 3.2780 nan 0.0500 -0.0007
## 740 3.2682 nan 0.0500 -0.0005
## 760 3.2618 nan 0.0500 -0.0008
## 780 3.2538 nan 0.0500 -0.0006
## 800 3.2475 nan 0.0500 -0.0003
## 820 3.2414 nan 0.0500 -0.0006
## 840 3.2348 nan 0.0500 -0.0012
## 860 3.2280 nan 0.0500 -0.0007
## 880 3.2208 nan 0.0500 -0.0004
## 900 3.2123 nan 0.0500 -0.0008
## 920 3.2048 nan 0.0500 -0.0006
## 940 3.1981 nan 0.0500 -0.0012
## 960 3.1897 nan 0.0500 -0.0011
## 980 3.1827 nan 0.0500 -0.0013
## 1000 3.1770 nan 0.0500 -0.0006
## 1020 3.1691 nan 0.0500 -0.0007
## 1040 3.1635 nan 0.0500 -0.0004
## 1060 3.1564 nan 0.0500 -0.0015
## 1080 3.1500 nan 0.0500 -0.0009
## 1100 3.1441 nan 0.0500 -0.0010
## 1120 3.1379 nan 0.0500 -0.0008
## 1140 3.1311 nan 0.0500 -0.0005
## 1160 3.1258 nan 0.0500 -0.0010
## 1180 3.1199 nan 0.0500 -0.0011
## 1200 3.1145 nan 0.0500 -0.0006
## 1220 3.1098 nan 0.0500 -0.0011
## 1240 3.1037 nan 0.0500 -0.0003
## 1260 3.0997 nan 0.0500 -0.0005
## 1280 3.0942 nan 0.0500 -0.0003
## 1300 3.0890 nan 0.0500 -0.0005
## 1320 3.0844 nan 0.0500 -0.0005
## 1340 3.0796 nan 0.0500 -0.0012
## 1360 3.0754 nan 0.0500 -0.0009
## 1380 3.0699 nan 0.0500 -0.0012
## 1400 3.0665 nan 0.0500 -0.0009
## 1420 3.0613 nan 0.0500 -0.0006
## 1440 3.0569 nan 0.0500 -0.0008
## 1460 3.0526 nan 0.0500 -0.0006
## 1480 3.0480 nan 0.0500 -0.0014
## 1500 3.0432 nan 0.0500 -0.0004
## 1520 3.0398 nan 0.0500 -0.0003
## 1540 3.0353 nan 0.0500 -0.0010
## 1560 3.0307 nan 0.0500 -0.0006
## 1580 3.0272 nan 0.0500 -0.0016
## 1600 3.0232 nan 0.0500 -0.0005
## 1620 3.0182 nan 0.0500 -0.0012
## 1640 3.0138 nan 0.0500 -0.0012
## 1660 3.0098 nan 0.0500 -0.0018
## 1680 3.0071 nan 0.0500 -0.0007
## 1700 3.0027 nan 0.0500 -0.0016
## 1720 2.9988 nan 0.0500 -0.0008
## 1740 2.9950 nan 0.0500 -0.0016
## 1760 2.9913 nan 0.0500 -0.0005
## 1780 2.9864 nan 0.0500 -0.0005
## 1800 2.9830 nan 0.0500 -0.0011
## 1820 2.9785 nan 0.0500 -0.0019
## 1840 2.9749 nan 0.0500 -0.0007
## 1860 2.9700 nan 0.0500 -0.0005
## 1880 2.9671 nan 0.0500 -0.0010
## 1900 2.9642 nan 0.0500 -0.0005
## 1920 2.9602 nan 0.0500 -0.0010
## 1940 2.9563 nan 0.0500 -0.0011
## 1960 2.9527 nan 0.0500 -0.0012
## 1980 2.9491 nan 0.0500 -0.0014
## 2000 2.9447 nan 0.0500 -0.0006
## 2020 2.9403 nan 0.0500 -0.0006
## 2040 2.9355 nan 0.0500 -0.0005
## 2060 2.9325 nan 0.0500 -0.0009
## 2080 2.9291 nan 0.0500 -0.0011
## 2100 2.9259 nan 0.0500 -0.0005
## 2120 2.9232 nan 0.0500 -0.0012
## 2140 2.9201 nan 0.0500 -0.0006
## 2160 2.9173 nan 0.0500 -0.0012
## 2180 2.9136 nan 0.0500 -0.0006
## 2200 2.9103 nan 0.0500 -0.0009
## 2220 2.9069 nan 0.0500 -0.0009
## 2240 2.9038 nan 0.0500 -0.0010
## 2260 2.9005 nan 0.0500 -0.0009
## 2280 2.8984 nan 0.0500 -0.0008
## 2300 2.8955 nan 0.0500 -0.0012
## 2320 2.8931 nan 0.0500 -0.0011
## 2340 2.8897 nan 0.0500 -0.0008
## 2360 2.8865 nan 0.0500 -0.0010
## 2380 2.8836 nan 0.0500 -0.0010
## 2400 2.8809 nan 0.0500 -0.0008
## 2420 2.8773 nan 0.0500 -0.0009
## 2440 2.8744 nan 0.0500 -0.0005
## 2460 2.8713 nan 0.0500 -0.0005
## 2480 2.8675 nan 0.0500 -0.0007
## 2500 2.8644 nan 0.0500 -0.0009
best.iter_test <- gbm.perf(gbm0, method = "cv")
# 941
# for the training set
gbm0.predict.train <- predict(gbm0, income.train, type = "response", n.trees = best.iter_test)
sum((income.train$income - gbm0.predict.train)^2)/nrow(income.train)
## [1] 3.255
# 3.208059
# For the test set
gbm0.predict <- predict(gbm0, income.test, type = "response", n.trees = best.iter_test)
sum((income.test$income - gbm0.predict)^2)/nrow(income.test)
## [1] 3.615
# 3.630762
install.packages("rpart")
## Error: trying to use CRAN without setting a mirror
install.packages("ISLR")
## Error: trying to use CRAN without setting a mirror
library(rpart)
library(ISLR)
income.rpart.cv2 <- rpart(income ~ ., data = income.train, control = rpart.control(cp = 1e-04))
cp.list <- income.rpart.cv2$cptable[, 1]
test.err <- rep(0, length(cp.list))
for (i in 2:length(cp.list)) {
cur.rpart <- prune(income.rpart.cv2, cp.list[i])
cur.pred <- predict(cur.rpart, newdata = income.test)
test.err[i] <- sum((cur.pred - income.test$income)^2)
}
min.test.err <- min(test.err[2:length(cp.list)])
index <- which(test.err == min.test.err)
income.rpart2 <- prune.rpart(income.rpart.cv2, cp.list[index])
printcp(income.rpart2)
##
## Regression tree:
## rpart(formula = income ~ ., data = income.train, control = rpart.control(cp = 1e-04))
##
## Variables actually used in tree construction:
## [1] age edu houseSta mariStatus occu type
##
## Root node error: 53800/6993 = 7.7
##
## n= 6993
##
## CP nsplit rel error xerror xstd
## 1 0.2198 0 1.00 1.00 0.0094
## 2 0.0790 1 0.78 0.78 0.0102
## 3 0.0445 2 0.70 0.70 0.0105
## 4 0.0286 3 0.66 0.66 0.0107
## 5 0.0151 4 0.63 0.64 0.0103
## 6 0.0135 5 0.61 0.62 0.0102
## 7 0.0128 6 0.60 0.61 0.0103
## 8 0.0070 7 0.59 0.59 0.0102
## 9 0.0063 8 0.58 0.59 0.0103
## 10 0.0061 9 0.57 0.59 0.0102
## 11 0.0058 11 0.56 0.59 0.0101
## 12 0.0054 12 0.56 0.57 0.0097
## 13 0.0051 13 0.55 0.57 0.0095
## 14 0.0037 14 0.54 0.55 0.0094
## 15 0.0036 15 0.54 0.55 0.0093
## 16 0.0028 16 0.54 0.55 0.0093
## 17 0.0026 17 0.53 0.54 0.0093
## 18 0.0024 18 0.53 0.54 0.0093
## 19 0.0022 19 0.53 0.54 0.0094
## 20 0.0022 20 0.53 0.54 0.0094
## 21 0.0020 21 0.53 0.54 0.0094
## 22 0.0020 22 0.52 0.54 0.0094
## 23 0.0019 23 0.52 0.54 0.0094
## 24 0.0017 24 0.52 0.54 0.0094
## 25 0.0017 25 0.52 0.54 0.0094
According to the importance list below, occupation and age are the two most important predictors (they are nearly tied), followed by household status, marital status, and so on. "sex" appears to be the second least influential variable. This is not consistent with the national average result, because our data are for the California population only. On the other hand, we would need to explore further whether this result is consistent with the national average, for example the relationship between household income and the income of family members of each sex: if a household has both men and women, the household income can still be high even if the female members have no income, as long as the male members have high incomes. California alone may not represent the national picture.
summary(gbm0, main = "RELATIVE INFLUENCE OF ALL PREDICTORS")
## var rel.inf
## occu occu 16.6285
## age age 16.3130
## houseSta houseSta 16.1983
## mariStatus mariStatus 11.9324
## edu edu 8.9902
## person person 6.4029
## ethnic ethnic 6.3050
## type type 6.2575
## lengthLive lengthLive 4.5689
## language language 2.1093
## personUnder18 personUnder18 1.9923
## sex sex 1.3863
## duakIncome duakIncome 0.9153
For the training set, the overall error rate is 0.3858234. The error rates for each occupation class are: 0.1789660, 0.9163987, 0.5877483, 0.6690562, 0.3799622, 0.1640760, 0.6225490, 0.1743119, and 0.7320261. Thus the classifier performs poorly on classes 2, 4, 7, and 9.
For the test set, the overall error rate is 0.4326439. The error rates for each occupation class are: 0.2018692, 0.9390244, 0.6754967, 0.7216981, 0.4507042, 0.2113565, 0.7400000, 0.2156863, and 0.8615385. Again the classifier is poor on classes 2, 4, 7, and 9, which is consistent with the training-set result.
marketing <- read.table("/Users/shijiabian/Dropbox/STATS315B/Data/Occupation_Data.txt",
sep = ",")
names(marketing) <- c("occu", "typeHome", "sex", "mariStatus", "age", "education",
"annualIncome", "timeLength", "dualIncome", "personNum", "personNum18",
"houseSta", "ethnic", "language")
set.seed(111)
market.perm <- sample(1:nrow(marketing))
market.train <- marketing[market.perm[1:7068], ]
market.test <- marketing[market.perm[7069:8857], ]
### Train the model
set.seed(324)
gbm0 <- gbm(occu ~ ., data = market.train, interaction.depth = 4, shrinkage = 0.05,
n.trees = 400, bag.fraction = 0.5, cv.folds = 5, distribution = "multinomial",
verbose = T)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 2.1972 nan 0.0500 0.2468
## 2 2.0641 nan 0.0500 0.1811
## 3 1.9655 nan 0.0500 0.1455
## 4 1.8870 nan 0.0500 0.1209
## 5 1.8218 nan 0.0500 0.1070
## 6 1.7640 nan 0.0500 0.0915
## 7 1.7152 nan 0.0500 0.0767
## 8 1.6729 nan 0.0500 0.0672
## 9 1.6346 nan 0.0500 0.0586
## 10 1.6021 nan 0.0500 0.0522
## 20 1.4073 nan 0.0500 0.0190
## 40 1.2773 nan 0.0500 0.0040
## 60 1.2255 nan 0.0500 -0.0001
## 80 1.1959 nan 0.0500 -0.0002
## 100 1.1730 nan 0.0500 -0.0004
## 120 1.1553 nan 0.0500 -0.0015
## 140 1.1384 nan 0.0500 -0.0015
## 160 1.1251 nan 0.0500 -0.0010
## 180 1.1122 nan 0.0500 -0.0021
## 200 1.0992 nan 0.0500 -0.0015
## 220 1.0875 nan 0.0500 -0.0016
## 240 1.0766 nan 0.0500 -0.0016
## 260 1.0667 nan 0.0500 -0.0012
## 280 1.0569 nan 0.0500 -0.0021
## 300 1.0478 nan 0.0500 -0.0018
## 320 1.0395 nan 0.0500 -0.0013
## 340 1.0309 nan 0.0500 -0.0013
## 360 1.0231 nan 0.0500 -0.0015
## 380 1.0157 nan 0.0500 -0.0014
## 400 1.0085 nan 0.0500 -0.0021
best.iter_test <- gbm.perf(gbm0, method = "test")
## Error: need finite 'ylim' values
# best.iter_test = 136
gbm0.predict <- predict(gbm0, market.train, type = "response", n.trees = best.iter_test)
## Warning: Number of trees not specified or exceeded number fit so far.
## Using 400.
classPred <- rep(0, nrow(market.train))
for (i in 1:nrow(market.train)) {
    # Assign each observation to the class with the largest predicted probability
    classPred[i] <- which(gbm0.predict[i, , 1] == max(gbm0.predict[i, , 1]))
}
matrix = table(classPred, market.train$occu)
# The overall error rate for the training set:
sum = 0
for (i in 1:9) {
sum = sum + matrix[i, i]
}
1 - sum/nrow(market.train)
## [1] 0.3454
# Calculate the training misclassification error for each of the occupation
# types:
ratio.train = rep(0, 9)
for (i in 1:9) {
ratio.train[i] <- 1 - matrix[i, i]/sum(matrix[, i])
}
ratio.train
## [1] 0.1657 0.8264 0.5381 0.6189 0.3157 0.1511 0.4951 0.1450 0.6111
### Test Set
gbm0.pretest <- predict(gbm0, market.test, type = "response", n.trees = best.iter_test)
## Warning: Number of trees not specified or exceeded number fit so far.
## Using 400.
classPred.test <- rep(0, nrow(market.test))
for (i in 1:nrow(market.test)) {
    # Assign each observation to the class with the largest predicted probability
    classPred.test[i] <- which(gbm0.pretest[i, , 1] == max(gbm0.pretest[i, , 1]))
}
matrix.test = table(classPred.test, market.test$occu)
# Calculate the overall test misclassification error:
sum.test = 0
for (i in 1:9) {
sum.test = sum.test + matrix.test[i, i]
}
1 - sum.test/nrow(market.test)
## [1] 0.4449
# Calculate the test misclassification error for each of the occupation
# type:
ratio = rep(0, 9)
for (i in 1:9) {
ratio[i] <- 1 - matrix.test[i, i]/sum(matrix.test[, i])
}
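Equivalently, assuming the confusion table is square (every class appears among the predictions), the per-class error rates can be computed without a loop:
# One minus the diagonal over the column totals gives the per-class error rate.
1 - diag(matrix.test)/colSums(matrix.test)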
The most important variable is age, followed by annual income, education, household status, and dual income. The detailed ranking is listed below.
summary(gbm0, n.tree = best.iter_test, main = "RELATIVE INFLUENCE OF ALL PREDICTORS")
## Warning: Exceeded total number of GBM terms. Results use n.trees=400 terms.
## var rel.inf
## age age 27.6440
## annualIncome annualIncome 18.6786
## education education 17.9468
## houseSta houseSta 10.9074
## dualIncome dualIncome 7.7678
## sex sex 7.0867
## personNum personNum 2.6382
## mariStatus mariStatus 2.2580
## timeLength timeLength 1.2840
## ethnic ethnic 1.0980
## language language 1.0179
## typeHome typeHome 0.8402
## personNum18 personNum18 0.8326