Attached on separate paper.
I choose the best iteration, 1115, to model the data. The misclassification rate is 0.002282361 on the training set and 0.02542373 on the test set. The non-spam misclassification rate on the training data is 0.001081666. Of all the spam emails in the test set, a fraction 0.0210356 is misclassified; of all the non-spam emails in the test set, a fraction 0.0283828 is misclassified.
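For reference, a minimal sketch (the helper name misclass.rates is mine, not part of the assignment code) of how these rates are read off a 2-by-2 confusion table with predicted class in rows, true class in columns, and spam coded as 1; it mirrors the calculations used further below.
# Overall, non-spam and spam misclassification rates from a confusion table.
misclass.rates <- function(tab) {
    c(overall = (tab[1, 2] + tab[2, 1])/sum(tab),
      nonspam = tab[2, 1]/sum(tab[, 1]),  # true non-spam predicted as spam
      spam = tab[1, 2]/sum(tab[, 2]))  # true spam predicted as non-spam
}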
# Install gbm once if it is not already available (requires a CRAN mirror):
# install.packages("gbm")
library(gbm)
## Loading required package: survival
## Loading required package: splines
## Loading required package: lattice
## Loading required package: parallel
## Loaded gbm 2.1
spam <- read.table("/Users/shijiabian/Dropbox/STATS315B/Data/Spam_Data.txt",
sep = ",")
names(spam) <- c("make", "address", "all", "3d", "our", "over", "remove", "internet",
"order", "mail", "receive", "will", "people", "report", "addresses", "free",
"business", "email", "you", "credit", "your", "font", "000", "money", "hp",
"hpl", "george", "650", "lab", "labs", "telnet", "857", "data", "415", "85",
"technology", "1999", "parts", "pm", "direct", "cs", "meeting", "original",
"project", "re", "edu", "table", "conference", ";", "(", "[", "!", "$",
"#", "CAPAVE", "CAPMAX", "CAPTOT", "type")
dim(spam)
## [1] 4601 58
# 4601 58
train <- read.table("/Users/shijiabian/Dropbox/STATS315B/Data/Spam_Train.txt",
sep = ",")
names(train) <- c("make", "address", "all", "3d", "our", "over", "remove", "internet",
"order", "mail", "receive", "will", "people", "report", "addresses", "free",
"business", "email", "you", "credit", "your", "font", "000", "money", "hp",
"hpl", "george", "650", "lab", "labs", "telnet", "857", "data", "415", "85",
"technology", "1999", "parts", "pm", "direct", "cs", "meeting", "original",
"project", "re", "edu", "table", "conference", ";", "(", "[", "!", "$",
"#", "CAPAVE", "CAPMAX", "CAPTOT", "type")
dim(train)
## [1] 3067 58
# 3067 58
test <- read.table("/Users/shijiabian/Dropbox/STATS315B/Data/Spam_Test.txt",
sep = ",")
names(test) <- c("make", "address", "all", "3d", "our", "over", "remove", "internet",
"order", "mail", "receive", "will", "people", "report", "addresses", "free",
"business", "email", "you", "credit", "your", "font", "000", "money", "hp",
"hpl", "george", "650", "lab", "labs", "telnet", "857", "data", "415", "85",
"technology", "1999", "parts", "pm", "direct", "cs", "meeting", "original",
"project", "re", "edu", "table", "conference", ";", "(", "[", "!", "$",
"#", "CAPAVE", "CAPMAX", "CAPTOT", "type")
dim(test)
## [1] 1534 58
# 1534 58
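As a side note, the same 58 column names are assigned three times above; a small optional refactor (a sketch that simply reuses the names already attached to spam) avoids the repetition:
# Read one of the spam files and reuse the column names from `spam`.
read.spam <- function(path, col.names = names(spam)) {
    d <- read.table(path, sep = ",")
    names(d) <- col.names
    d
}
# train <- read.spam("/Users/shijiabian/Dropbox/STATS315B/Data/Spam_Train.txt")
# test <- read.spam("/Users/shijiabian/Dropbox/STATS315B/Data/Spam_Test.txt")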
set.seed(131)
x <- train[sample(nrow(train)), ]
# Fit the training data into the Gradient Boosting Model
set.seed(444)
gbm0_train <- gbm(type ~ ., data = train, interaction.depth = 4, shrinkage = 0.05,
n.trees = 2500, bag.fraction = 0.5, cv.folds = 5, distribution = "bernoulli",
verbose = T)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2883 nan 0.0500 0.0263
## 2 1.2346 nan 0.0500 0.0264
## 3 1.1824 nan 0.0500 0.0256
## 4 1.1404 nan 0.0500 0.0201
## 5 1.0974 nan 0.0500 0.0209
## 6 1.0569 nan 0.0500 0.0196
## 7 1.0238 nan 0.0500 0.0159
## 8 0.9928 nan 0.0500 0.0152
## 9 0.9648 nan 0.0500 0.0140
## 10 0.9330 nan 0.0500 0.0154
## 20 0.7264 nan 0.0500 0.0063
## 40 0.5117 nan 0.0500 0.0047
## 60 0.4085 nan 0.0500 0.0009
## 80 0.3467 nan 0.0500 0.0007
## 100 0.3114 nan 0.0500 0.0002
## 120 0.2849 nan 0.0500 0.0005
## 140 0.2664 nan 0.0500 0.0000
## 160 0.2504 nan 0.0500 0.0001
## 180 0.2363 nan 0.0500 -0.0000
## 200 0.2233 nan 0.0500 0.0003
## 220 0.2129 nan 0.0500 0.0001
## 240 0.2023 nan 0.0500 -0.0001
## 260 0.1921 nan 0.0500 -0.0001
## 280 0.1839 nan 0.0500 0.0000
## 300 0.1762 nan 0.0500 -0.0000
## 320 0.1708 nan 0.0500 -0.0001
## 340 0.1640 nan 0.0500 -0.0001
## 360 0.1581 nan 0.0500 -0.0001
## 380 0.1525 nan 0.0500 -0.0001
## 400 0.1471 nan 0.0500 -0.0001
## 420 0.1423 nan 0.0500 -0.0000
## 440 0.1366 nan 0.0500 0.0000
## 460 0.1326 nan 0.0500 -0.0002
## 480 0.1284 nan 0.0500 -0.0001
## 500 0.1238 nan 0.0500 0.0000
## 520 0.1199 nan 0.0500 -0.0000
## 540 0.1151 nan 0.0500 -0.0001
## 560 0.1110 nan 0.0500 -0.0000
## 580 0.1075 nan 0.0500 -0.0001
## 600 0.1039 nan 0.0500 -0.0000
## 620 0.1003 nan 0.0500 -0.0001
## 640 0.0969 nan 0.0500 0.0000
## 660 0.0942 nan 0.0500 -0.0001
## 680 0.0912 nan 0.0500 -0.0001
## 700 0.0886 nan 0.0500 -0.0001
## 720 0.0861 nan 0.0500 -0.0001
## 740 0.0832 nan 0.0500 -0.0000
## 760 0.0812 nan 0.0500 -0.0001
## 780 0.0790 nan 0.0500 -0.0001
## 800 0.0766 nan 0.0500 -0.0000
## 820 0.0742 nan 0.0500 -0.0000
## 840 0.0720 nan 0.0500 -0.0001
## 860 0.0699 nan 0.0500 -0.0000
## 880 0.0678 nan 0.0500 -0.0000
## 900 0.0657 nan 0.0500 -0.0001
## 920 0.0639 nan 0.0500 -0.0001
## 940 0.0620 nan 0.0500 -0.0001
## 960 0.0602 nan 0.0500 -0.0000
## 980 0.0586 nan 0.0500 -0.0000
## 1000 0.0573 nan 0.0500 -0.0000
## 1020 0.0560 nan 0.0500 -0.0000
## 1040 0.0546 nan 0.0500 0.0000
## 1060 0.0534 nan 0.0500 -0.0000
## 1080 0.0520 nan 0.0500 -0.0000
## 1100 0.0508 nan 0.0500 -0.0000
## 1120 0.0494 nan 0.0500 -0.0000
## 1140 0.0483 nan 0.0500 -0.0000
## 1160 0.0471 nan 0.0500 -0.0000
## 1180 0.0458 nan 0.0500 -0.0000
## 1200 0.0450 nan 0.0500 -0.0000
## 1220 0.0439 nan 0.0500 -0.0001
## 1240 0.0429 nan 0.0500 -0.0000
## 1260 0.0418 nan 0.0500 -0.0000
## 1280 0.0408 nan 0.0500 -0.0001
## 1300 0.0398 nan 0.0500 -0.0000
## 1320 0.0389 nan 0.0500 -0.0000
## 1340 0.0380 nan 0.0500 -0.0000
## 1360 0.0370 nan 0.0500 0.0000
## 1380 0.0362 nan 0.0500 -0.0001
## 1400 0.0353 nan 0.0500 -0.0000
## 1420 0.0346 nan 0.0500 -0.0000
## 1440 0.0336 nan 0.0500 -0.0000
## 1460 0.0330 nan 0.0500 -0.0000
## 1480 0.0324 nan 0.0500 -0.0000
## 1500 0.0315 nan 0.0500 -0.0000
## 1520 0.0307 nan 0.0500 -0.0000
## 1540 0.0300 nan 0.0500 -0.0000
## 1560 0.0292 nan 0.0500 -0.0000
## 1580 0.0286 nan 0.0500 -0.0000
## 1600 0.0280 nan 0.0500 -0.0000
## 1620 0.0273 nan 0.0500 -0.0000
## 1640 0.0267 nan 0.0500 -0.0000
## 1660 0.0262 nan 0.0500 -0.0000
## 1680 0.0256 nan 0.0500 0.0000
## 1700 0.0250 nan 0.0500 -0.0000
## 1720 0.0244 nan 0.0500 -0.0000
## 1740 0.0238 nan 0.0500 0.0000
## 1760 0.0232 nan 0.0500 -0.0000
## 1780 0.0228 nan 0.0500 -0.0000
## 1800 0.0222 nan 0.0500 -0.0000
## 1820 0.0218 nan 0.0500 -0.0000
## 1840 0.0213 nan 0.0500 -0.0000
## 1860 0.0209 nan 0.0500 -0.0000
## 1880 0.0204 nan 0.0500 -0.0000
## 1900 0.0200 nan 0.0500 -0.0000
## 1920 0.0196 nan 0.0500 -0.0000
## 1940 0.0192 nan 0.0500 -0.0000
## 1960 0.0187 nan 0.0500 -0.0000
## 1980 0.0183 nan 0.0500 -0.0000
## 2000 0.0180 nan 0.0500 -0.0000
## 2020 0.0176 nan 0.0500 -0.0000
## 2040 0.0172 nan 0.0500 -0.0000
## 2060 0.0168 nan 0.0500 -0.0000
## 2080 0.0164 nan 0.0500 -0.0000
## 2100 0.0161 nan 0.0500 -0.0000
## 2120 0.0157 nan 0.0500 -0.0000
## 2140 0.0154 nan 0.0500 -0.0000
## 2160 0.0151 nan 0.0500 -0.0000
## 2180 0.0148 nan 0.0500 0.0000
## 2200 0.0145 nan 0.0500 -0.0000
## 2220 0.0141 nan 0.0500 0.0000
## 2240 0.0139 nan 0.0500 -0.0000
## 2260 0.0135 nan 0.0500 -0.0000
## 2280 0.0132 nan 0.0500 0.0000
## 2300 0.0129 nan 0.0500 -0.0000
## 2320 0.0126 nan 0.0500 -0.0000
## 2340 0.0123 nan 0.0500 -0.0000
## 2360 0.0120 nan 0.0500 -0.0000
## 2380 0.0118 nan 0.0500 -0.0000
## 2400 0.0116 nan 0.0500 -0.0000
## 2420 0.0114 nan 0.0500 -0.0000
## 2440 0.0112 nan 0.0500 0.0000
## 2460 0.0110 nan 0.0500 0.0000
## 2480 0.0107 nan 0.0500 -0.0000
## 2500 0.0106 nan 0.0500 -0.0000
# Optimal number of iterations; the criterion is CV, which is not overly conservative.
best.iter_train <- gbm.perf(gbm0_train, method = "cv")
# 1115
# Prediction on the training set
gbm0.predict <- predict(gbm0_train, x, type = "response", n.trees = best.iter_train)
# Classify using a threshold of 0.5 on the predicted probability
gbm0.predict[gbm0.predict < 0.5] = 0
gbm0.predict[gbm0.predict >= 0.5] = 1
matrix.train <- table(gbm0.predict, x$type)
# The training error rate is 0.002282361.
(matrix.train[1, 2] + matrix.train[2, 1])/nrow(train)
## [1] 0.001956
# non-spam error rate for the training data 0.001081666
matrix.train[2, 1]/(matrix.train[1, 1] + matrix.train[2, 1])
## [1] 0.0005408
# The test set
gbm0.predict.test <- predict(gbm0_train, test, type = "response", n.trees = best.iter_train)
# Keep the same 0.5 threshold
gbm0.predict.test[gbm0.predict.test < 0.5] = 0
gbm0.predict.test[gbm0.predict.test >= 0.5] = 1
matrix.test <- table(gbm0.predict.test, test$type)
testerror <- (matrix.test[1, 2] + matrix.test[2, 1])/nrow(test)
errornonspam <- matrix.test[2, 1]/(matrix.test[1, 1] + matrix.test[2, 1])
errorspam <- matrix.test[1, 2]/(matrix.test[1, 2] + matrix.test[2, 2])
(i) On the training set, the overall classification error rate is 0.00652103, the error rate for non-spam is 0.002704164, and the error rate for spam is 0.01231527. Because the non-spam error rate on the training set is below 0.3%, this is the model we want. Applying this model to the test set, the overall misclassification error rate is 0.03585398, the non-spam error rate is 0.03165939, and the spam error rate is 0.0420712.
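The three test-set error rates for this first model are stored above but never printed; a one-line sketch to display them:
# Print the overall, non-spam and spam test error rates computed above.
c(test.overall = testerror, test.nonspam = errornonspam, test.spam = errorspam)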
# Approximate a loss matrix by weighting each non-spam email 25 times as
# heavily as a spam email. Shuffle first so the weights line up with the
# rows actually passed to gbm.
x <- train[sample(nrow(train)), ]
weights <- rep(1, nrow(x))
weights[x$type == 0] = 25
# Fit the training data into the Gradient Boosting Model
mod.gbm2 <- gbm(type ~ ., data = x, weights = weights, interaction.depth = 4,
shrinkage = 0.05, n.trees = 3000, bag.fraction = 0.5, cv.folds = 5, distribution = "bernoulli",
verbose = T)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2882 nan 0.0500 0.0270
## 2 1.2371 nan 0.0500 0.0234
## 3 1.1867 nan 0.0500 0.0253
## 4 1.1447 nan 0.0500 0.0203
## 5 1.1068 nan 0.0500 0.0185
## 6 1.0656 nan 0.0500 0.0194
## 7 1.0336 nan 0.0500 0.0151
## 8 1.0037 nan 0.0500 0.0141
## 9 0.9705 nan 0.0500 0.0159
## 10 0.9392 nan 0.0500 0.0151
## 20 0.7306 nan 0.0500 0.0077
## 40 0.5190 nan 0.0500 0.0026
## 60 0.4118 nan 0.0500 0.0011
## 80 0.3540 nan 0.0500 0.0005
## 100 0.3185 nan 0.0500 0.0001
## 120 0.2915 nan 0.0500 0.0000
## 140 0.2716 nan 0.0500 -0.0001
## 160 0.2520 nan 0.0500 0.0003
## 180 0.2370 nan 0.0500 -0.0001
## 200 0.2221 nan 0.0500 -0.0000
## 220 0.2098 nan 0.0500 -0.0004
## 240 0.2000 nan 0.0500 -0.0001
## 260 0.1892 nan 0.0500 -0.0001
## 280 0.1798 nan 0.0500 -0.0000
## 300 0.1701 nan 0.0500 0.0001
## 320 0.1621 nan 0.0500 -0.0001
## 340 0.1548 nan 0.0500 -0.0000
## 360 0.1471 nan 0.0500 -0.0000
## 380 0.1404 nan 0.0500 -0.0000
## 400 0.1337 nan 0.0500 -0.0001
## 420 0.1285 nan 0.0500 0.0000
## 440 0.1238 nan 0.0500 -0.0001
## 460 0.1180 nan 0.0500 -0.0000
## 480 0.1127 nan 0.0500 -0.0001
## 500 0.1084 nan 0.0500 -0.0001
## 520 0.1044 nan 0.0500 -0.0001
## 540 0.1003 nan 0.0500 -0.0001
## 560 0.0963 nan 0.0500 -0.0002
## 580 0.0930 nan 0.0500 -0.0001
## 600 0.0897 nan 0.0500 -0.0001
## 620 0.0866 nan 0.0500 -0.0001
## 640 0.0833 nan 0.0500 -0.0001
## 660 0.0803 nan 0.0500 -0.0001
## 680 0.0775 nan 0.0500 -0.0001
## 700 0.0749 nan 0.0500 -0.0000
## 720 0.0717 nan 0.0500 -0.0001
## 740 0.0695 nan 0.0500 -0.0000
## 760 0.0673 nan 0.0500 -0.0000
## 780 0.0650 nan 0.0500 -0.0000
## 800 0.0632 nan 0.0500 -0.0000
## 820 0.0610 nan 0.0500 -0.0000
## 840 0.0592 nan 0.0500 0.0000
## 860 0.0574 nan 0.0500 -0.0000
## 880 0.0555 nan 0.0500 -0.0000
## 900 0.0537 nan 0.0500 -0.0001
## 920 0.0517 nan 0.0500 0.0000
## 940 0.0502 nan 0.0500 -0.0000
## 960 0.0485 nan 0.0500 -0.0001
## 980 0.0470 nan 0.0500 -0.0000
## 1000 0.0458 nan 0.0500 -0.0000
## 1020 0.0446 nan 0.0500 -0.0000
## 1040 0.0432 nan 0.0500 -0.0000
## 1060 0.0419 nan 0.0500 -0.0000
## 1080 0.0409 nan 0.0500 -0.0000
## 1100 0.0398 nan 0.0500 -0.0001
## 1120 0.0387 nan 0.0500 -0.0000
## 1140 0.0378 nan 0.0500 0.0000
## 1160 0.0367 nan 0.0500 -0.0000
## 1180 0.0356 nan 0.0500 -0.0000
## 1200 0.0348 nan 0.0500 0.0000
## 1220 0.0340 nan 0.0500 -0.0000
## 1240 0.0332 nan 0.0500 -0.0000
## 1260 0.0325 nan 0.0500 -0.0000
## 1280 0.0314 nan 0.0500 -0.0000
## 1300 0.0306 nan 0.0500 -0.0000
## 1320 0.0300 nan 0.0500 -0.0000
## 1340 0.0292 nan 0.0500 -0.0000
## 1360 0.0285 nan 0.0500 -0.0000
## 1380 0.0276 nan 0.0500 -0.0000
## 1400 0.0269 nan 0.0500 -0.0000
## 1420 0.0263 nan 0.0500 -0.0000
## 1440 0.0257 nan 0.0500 -0.0000
## 1460 0.0249 nan 0.0500 -0.0000
## 1480 0.0243 nan 0.0500 -0.0000
## 1500 0.0237 nan 0.0500 -0.0000
## 1520 0.0231 nan 0.0500 -0.0000
## 1540 0.0226 nan 0.0500 -0.0000
## 1560 0.0220 nan 0.0500 -0.0000
## 1580 0.0214 nan 0.0500 -0.0000
## 1600 0.0209 nan 0.0500 -0.0000
## 1620 0.0204 nan 0.0500 -0.0000
## 1640 0.0199 nan 0.0500 -0.0000
## 1660 0.0195 nan 0.0500 -0.0000
## 1680 0.0190 nan 0.0500 -0.0000
## 1700 0.0185 nan 0.0500 -0.0000
## 1720 0.0181 nan 0.0500 -0.0000
## 1740 0.0177 nan 0.0500 -0.0000
## 1760 0.0173 nan 0.0500 -0.0000
## 1780 0.0169 nan 0.0500 -0.0000
## 1800 0.0166 nan 0.0500 -0.0000
## 1820 0.0162 nan 0.0500 -0.0000
## 1840 0.0159 nan 0.0500 -0.0000
## 1860 0.0155 nan 0.0500 -0.0000
## 1880 0.0151 nan 0.0500 -0.0000
## 1900 0.0148 nan 0.0500 -0.0000
## 1920 0.0145 nan 0.0500 -0.0000
## 1940 0.0142 nan 0.0500 0.0000
## 1960 0.0138 nan 0.0500 -0.0000
## 1980 0.0135 nan 0.0500 -0.0000
## 2000 0.0132 nan 0.0500 -0.0000
## 2020 0.0129 nan 0.0500 -0.0000
## 2040 0.0126 nan 0.0500 -0.0000
## 2060 0.0123 nan 0.0500 -0.0000
## 2080 0.0120 nan 0.0500 -0.0000
## 2100 0.0117 nan 0.0500 -0.0000
## 2120 0.0115 nan 0.0500 -0.0000
## 2140 0.0113 nan 0.0500 -0.0000
## 2160 0.0111 nan 0.0500 -0.0000
## 2180 0.0109 nan 0.0500 -0.0000
## 2200 0.0106 nan 0.0500 0.0000
## 2220 0.0104 nan 0.0500 -0.0000
## 2240 0.0102 nan 0.0500 -0.0000
## 2260 0.0099 nan 0.0500 -0.0000
## 2280 0.0097 nan 0.0500 -0.0000
## 2300 0.0095 nan 0.0500 -0.0000
## 2320 0.0093 nan 0.0500 -0.0000
## 2340 0.0091 nan 0.0500 -0.0000
## 2360 0.0089 nan 0.0500 -0.0000
## 2380 0.0087 nan 0.0500 -0.0000
## 2400 0.0085 nan 0.0500 -0.0000
## 2420 0.0083 nan 0.0500 0.0000
## 2440 0.0081 nan 0.0500 -0.0000
## 2460 0.0079 nan 0.0500 -0.0000
## 2480 0.0078 nan 0.0500 -0.0000
## 2500 0.0076 nan 0.0500 -0.0000
## 2520 0.0075 nan 0.0500 -0.0000
## 2540 0.0074 nan 0.0500 -0.0000
## 2560 0.0072 nan 0.0500 -0.0000
## 2580 0.0071 nan 0.0500 -0.0000
## 2600 0.0069 nan 0.0500 -0.0000
## 2620 0.0068 nan 0.0500 -0.0000
## 2640 0.0066 nan 0.0500 -0.0000
## 2660 0.0064 nan 0.0500 -0.0000
## 2680 0.0063 nan 0.0500 -0.0000
## 2700 0.0062 nan 0.0500 -0.0000
## 2720 0.0061 nan 0.0500 -0.0000
## 2740 0.0060 nan 0.0500 -0.0000
## 2760 0.0059 nan 0.0500 -0.0000
## 2780 0.0058 nan 0.0500 -0.0000
## 2800 0.0056 nan 0.0500 -0.0000
## 2820 0.0055 nan 0.0500 -0.0000
## 2840 0.0054 nan 0.0500 -0.0000
## 2860 0.0053 nan 0.0500 -0.0000
## 2880 0.0052 nan 0.0500 -0.0000
## 2900 0.0050 nan 0.0500 -0.0000
## 2920 0.0049 nan 0.0500 -0.0000
## 2940 0.0049 nan 0.0500 -0.0000
## 2960 0.0048 nan 0.0500 -0.0000
## 2980 0.0047 nan 0.0500 -0.0000
## 3000 0.0046 nan 0.0500 -0.0000
# Optimal number of iterations; the criterion is CV, which is not overly conservative:
# 632
best.iter2_train <- gbm.perf(mod.gbm2, method = "cv")
# Prediction on the training set: the classification error rate is
# 0.00652103, the error rate for non-spam is 0.002704164, and the error
# rate for spam is 0.01231527.
gbm2.predict <- predict(mod.gbm2, train, type = "response", n.trees = best.iter2_train)
summary(gbm2.predict)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0002 0.0058 0.3950 0.9990 1.0000
gbm2.predict[gbm2.predict < 0.5] = 0
gbm2.predict[gbm2.predict >= 0.5] = 1
matrix.train <- table(gbm2.predict, train$type)
#             train$type
# gbm2.predict    0    1
#            0 1844   15
#            1    5 1203
# The test error rate is 0.03585398
gbm2.predict.test <- predict(mod.gbm2, test, type = "response", n.trees = best.iter_train)
gbm2.predict.test[gbm2.predict.test < 0.5] = 0
gbm2.predict.test[gbm2.predict.test >= 0.5] = 1
matrix.test <- table(gbm2.predict.test, test$type)
matrix.test
##
## gbm2.predict.test 0 1
## 0 893 26
## 1 23 592
# 0.02477184
(matrix.test[1, 2] + matrix.test[2, 1])/nrow(test)
## [1] 0.03194
# Error rate in the non-spam 0.03165939
nons <- matrix.test[2, 1]/(matrix.test[1, 1] + matrix.test[2, 1])
# Error rate in the spam 0.0420712
s <- matrix.test[1, 2]/(matrix.test[1, 2] + matrix.test[2, 2])
(ii) The most important predictor is “!”, followed by remove and $. The detailed list is below.
summary(mod.gbm2, n.tree = best.iter_train, main = "RELATIVE INFLUENCE OF ALL PREDICTORS")
## var rel.inf
## `!` `!` 21.265728
## remove remove 13.163120
## `$` `$` 10.543494
## free free 8.255817
## hp hp 7.817558
## CAPAVE CAPAVE 5.869880
## CAPMAX CAPMAX 4.562482
## money money 3.582327
## your your 3.192165
## our our 2.767499
## CAPTOT CAPTOT 2.363265
## you you 1.811365
## edu edu 1.501527
## `000` `000` 1.420032
## email email 1.317253
## george george 1.230404
## will will 1.026070
## `650` `650` 0.773928
## receive receive 0.763700
## re re 0.691670
## `1999` `1999` 0.648245
## internet internet 0.617790
## `;` `;` 0.610480
## `(` `(` 0.596463
## business business 0.557676
## meeting meeting 0.417150
## all all 0.356686
## mail mail 0.344985
## `#` `#` 0.215144
## report report 0.160104
## original original 0.155549
## font font 0.149922
## hpl hpl 0.142991
## data data 0.139631
## technology technology 0.133998
## `3d` `3d` 0.119878
## order order 0.105537
## labs labs 0.102605
## addresses addresses 0.085125
## pm pm 0.081688
## make make 0.066185
## project project 0.059774
## `[` `[` 0.057546
## address address 0.049215
## over over 0.040246
## people people 0.029670
## cs cs 0.013588
## parts parts 0.008591
## conference conference 0.007644
## `85` `85` 0.003726
## credit credit 0.002884
## lab lab 0.000000
## telnet telnet 0.000000
## `857` `857` 0.000000
## `415` `415` 0.000000
## direct direct 0.000000
## table table 0.000000
(iii)
According to the first three graphs below, "!", "$", and remove are each positively related to the probability of spam. However, the last three graphs show no obvious joint pattern between any pair of these three predictors.
names(spam)
## [1] "make" "address" "all" "3d" "our"
## [6] "over" "remove" "internet" "order" "mail"
## [11] "receive" "will" "people" "report" "addresses"
## [16] "free" "business" "email" "you" "credit"
## [21] "your" "font" "000" "money" "hp"
## [26] "hpl" "george" "650" "lab" "labs"
## [31] "telnet" "857" "data" "415" "85"
## [36] "technology" "1999" "parts" "pm" "direct"
## [41] "cs" "meeting" "original" "project" "re"
## [46] "edu" "table" "conference" ";" "("
## [51] "[" "!" "$" "#" "CAPAVE"
## [56] "CAPMAX" "CAPTOT" "type"
plot(mod.gbm2, i.var = 52, n.trees = best.iter_train, main = "Partial Dependence of '!'")
plot(mod.gbm2, i.var = 53, n.trees = best.iter_train, main = "Partial Dependence of '$'")
plot(mod.gbm2, i.var = 7, n.trees = best.iter_train, main = "Partial Dependence of remove")
plot(mod.gbm2, i.var = c(52, 53), n.trees = best.iter_train, main = "Partial Dependence on '!' and '$'")
plot(mod.gbm2, i.var = c(7, 53), n.trees = best.iter_train, main = "Partial Dependence on remove and '$'")
plot(mod.gbm2, i.var = c(7, 52), n.trees = best.iter_train, main = "Partial Dependence on remove and '!'")
I call gbm to fit the gradient boosting model gbm0, regressing the median housing value on the other variables of california_data. I then call gbm.perf to find the optimal number of iterations, using the criterion method = "test". The optimal number of iterations is 2497, essentially the maximum of 2500 fitted. The root mean squared error on the held-out test set is 0.4620674.
# Read in and clean the california_data
california <- read.table("/Users/shijiabian/Dropbox/STATS315B/Data/california_data.txt",
sep = ",")
names(california) <- c("MedianValue", "MedianIncome", "HouseMedianAge", "AveNoRoom",
"AveNoBdrm", "Population", "AveOccupancy", "Latitude", "Longitude")
set.seed(131)
x <- california[sample(nrow(california)), ]
# Seed for the random subsampling used by bag.fraction
set.seed(444)
# Call gbm on the training set (the first 80% of the sampled rows);
# distribution = "gaussian" minimizes squared error, as appropriate for regression.
gbm0 <- gbm(MedianValue ~ ., data = x, train.fraction = 0.8, interaction.depth = 4,
shrinkage = 0.05, n.trees = 2500, bag.fraction = 0.5, cv.folds = 5, distribution = "gaussian",
verbose = T)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2620 1.2854 0.0500 0.0624
## 2 1.2039 1.2254 0.0500 0.0567
## 3 1.1518 1.1725 0.0500 0.0519
## 4 1.1037 1.1238 0.0500 0.0473
## 5 1.0594 1.0798 0.0500 0.0436
## 6 1.0192 1.0395 0.0500 0.0406
## 7 0.9816 1.0015 0.0500 0.0386
## 8 0.9468 0.9650 0.0500 0.0337
## 9 0.9157 0.9337 0.0500 0.0307
## 10 0.8867 0.9050 0.0500 0.0273
## 20 0.6890 0.7019 0.0500 0.0143
## 40 0.5036 0.5115 0.0500 0.0066
## 60 0.4116 0.4165 0.0500 0.0020
## 80 0.3623 0.3660 0.0500 0.0009
## 100 0.3348 0.3376 0.0500 0.0013
## 120 0.3168 0.3203 0.0500 0.0010
## 140 0.3049 0.3085 0.0500 0.0008
## 160 0.2958 0.3004 0.0500 0.0003
## 180 0.2884 0.2943 0.0500 0.0000
## 200 0.2823 0.2894 0.0500 0.0000
## 220 0.2755 0.2836 0.0500 0.0001
## 240 0.2702 0.2794 0.0500 0.0001
## 260 0.2661 0.2761 0.0500 -0.0001
## 280 0.2620 0.2727 0.0500 -0.0000
## 300 0.2585 0.2701 0.0500 -0.0000
## 320 0.2556 0.2686 0.0500 -0.0001
## 340 0.2524 0.2658 0.0500 0.0000
## 360 0.2499 0.2638 0.0500 0.0000
## 380 0.2470 0.2618 0.0500 -0.0001
## 400 0.2444 0.2592 0.0500 0.0001
## 420 0.2420 0.2577 0.0500 -0.0001
## 440 0.2399 0.2562 0.0500 -0.0001
## 460 0.2375 0.2546 0.0500 -0.0001
## 480 0.2354 0.2530 0.0500 -0.0000
## 500 0.2328 0.2512 0.0500 0.0002
## 520 0.2306 0.2498 0.0500 0.0002
## 540 0.2285 0.2485 0.0500 -0.0001
## 560 0.2262 0.2464 0.0500 -0.0001
## 580 0.2245 0.2457 0.0500 -0.0000
## 600 0.2229 0.2447 0.0500 0.0000
## 620 0.2212 0.2439 0.0500 0.0000
## 640 0.2190 0.2423 0.0500 0.0001
## 660 0.2174 0.2416 0.0500 -0.0000
## 680 0.2157 0.2408 0.0500 0.0000
## 700 0.2144 0.2401 0.0500 -0.0000
## 720 0.2133 0.2397 0.0500 0.0000
## 740 0.2119 0.2391 0.0500 -0.0001
## 760 0.2106 0.2385 0.0500 -0.0000
## 780 0.2091 0.2374 0.0500 0.0000
## 800 0.2082 0.2371 0.0500 -0.0000
## 820 0.2068 0.2366 0.0500 -0.0000
## 840 0.2055 0.2357 0.0500 -0.0001
## 860 0.2045 0.2350 0.0500 -0.0000
## 880 0.2030 0.2342 0.0500 0.0001
## 900 0.2019 0.2332 0.0500 -0.0001
## 920 0.2008 0.2326 0.0500 0.0001
## 940 0.1997 0.2324 0.0500 -0.0001
## 960 0.1986 0.2318 0.0500 -0.0000
## 980 0.1976 0.2315 0.0500 -0.0000
## 1000 0.1966 0.2317 0.0500 -0.0000
## 1020 0.1954 0.2313 0.0500 -0.0000
## 1040 0.1946 0.2310 0.0500 -0.0000
## 1060 0.1937 0.2309 0.0500 -0.0000
## 1080 0.1927 0.2304 0.0500 -0.0001
## 1100 0.1920 0.2301 0.0500 -0.0000
## 1120 0.1912 0.2295 0.0500 -0.0001
## 1140 0.1905 0.2290 0.0500 -0.0000
## 1160 0.1894 0.2285 0.0500 -0.0000
## 1180 0.1887 0.2284 0.0500 -0.0001
## 1200 0.1878 0.2276 0.0500 0.0000
## 1220 0.1870 0.2271 0.0500 -0.0000
## 1240 0.1863 0.2268 0.0500 -0.0000
## 1260 0.1855 0.2265 0.0500 -0.0000
## 1280 0.1846 0.2258 0.0500 -0.0000
## 1300 0.1837 0.2254 0.0500 -0.0001
## 1320 0.1831 0.2253 0.0500 -0.0001
## 1340 0.1825 0.2250 0.0500 -0.0001
## 1360 0.1818 0.2245 0.0500 0.0000
## 1380 0.1811 0.2243 0.0500 -0.0000
## 1400 0.1803 0.2241 0.0500 0.0000
## 1420 0.1796 0.2240 0.0500 -0.0000
## 1440 0.1789 0.2235 0.0500 -0.0000
## 1460 0.1782 0.2234 0.0500 -0.0000
## 1480 0.1776 0.2232 0.0500 0.0000
## 1500 0.1768 0.2228 0.0500 -0.0000
## 1520 0.1761 0.2223 0.0500 0.0001
## 1540 0.1753 0.2220 0.0500 -0.0000
## 1560 0.1747 0.2215 0.0500 -0.0001
## 1580 0.1742 0.2214 0.0500 -0.0001
## 1600 0.1736 0.2214 0.0500 0.0000
## 1620 0.1729 0.2211 0.0500 -0.0000
## 1640 0.1723 0.2211 0.0500 -0.0000
## 1660 0.1717 0.2210 0.0500 -0.0001
## 1680 0.1712 0.2211 0.0500 -0.0000
## 1700 0.1706 0.2209 0.0500 -0.0001
## 1720 0.1699 0.2205 0.0500 -0.0000
## 1740 0.1694 0.2200 0.0500 -0.0001
## 1760 0.1688 0.2197 0.0500 -0.0000
## 1780 0.1683 0.2196 0.0500 -0.0000
## 1800 0.1678 0.2196 0.0500 -0.0000
## 1820 0.1672 0.2193 0.0500 -0.0000
## 1840 0.1667 0.2191 0.0500 -0.0000
## 1860 0.1662 0.2192 0.0500 -0.0000
## 1880 0.1655 0.2186 0.0500 -0.0000
## 1900 0.1650 0.2186 0.0500 -0.0000
## 1920 0.1645 0.2183 0.0500 -0.0000
## 1940 0.1638 0.2179 0.0500 0.0000
## 1960 0.1632 0.2176 0.0500 0.0000
## 1980 0.1626 0.2175 0.0500 -0.0000
## 2000 0.1620 0.2173 0.0500 -0.0000
## 2020 0.1614 0.2170 0.0500 -0.0000
## 2040 0.1610 0.2167 0.0500 -0.0000
## 2060 0.1605 0.2165 0.0500 -0.0000
## 2080 0.1599 0.2164 0.0500 -0.0000
## 2100 0.1594 0.2161 0.0500 -0.0000
## 2120 0.1589 0.2161 0.0500 -0.0000
## 2140 0.1582 0.2156 0.0500 -0.0000
## 2160 0.1577 0.2155 0.0500 -0.0000
## 2180 0.1573 0.2158 0.0500 -0.0000
## 2200 0.1569 0.2156 0.0500 0.0000
## 2220 0.1564 0.2155 0.0500 -0.0000
## 2240 0.1559 0.2152 0.0500 -0.0000
## 2260 0.1554 0.2151 0.0500 0.0000
## 2280 0.1550 0.2150 0.0500 -0.0000
## 2300 0.1545 0.2148 0.0500 -0.0000
## 2320 0.1541 0.2148 0.0500 -0.0000
## 2340 0.1537 0.2147 0.0500 -0.0001
## 2360 0.1532 0.2147 0.0500 -0.0001
## 2380 0.1527 0.2145 0.0500 -0.0000
## 2400 0.1524 0.2143 0.0500 -0.0000
## 2420 0.1520 0.2142 0.0500 -0.0000
## 2440 0.1515 0.2144 0.0500 -0.0000
## 2460 0.1510 0.2141 0.0500 -0.0000
## 2480 0.1507 0.2139 0.0500 -0.0000
## 2500 0.1502 0.2140 0.0500 -0.0000
# The optimal number of iterations is 2497; the criterion is set as
# method='test'
best.iter_test <- gbm.perf(gbm0, method = "test")
# Set aside the test set (the last 20% of the sampled rows)
test = tail(x, dim(x)[1] * 0.2)
# Prediction on the test set
gbm0.predict <- predict(gbm0, test, type = "response", n.trees = best.iter_test)
# Test RMSE: 0.4620674
sqrt(sum((gbm0.predict - test[, 1])^2)/(dim(x)[1] * 0.2))
## [1] 0.4625
The relative importance of the predictors is listed below; MedianIncome is by far the most important variable:
sumgbm <- summary(gbm0, n.trees = best.iter_test, main = "RELATIVE INFLUENCE OF ALL PREDICTORS")
sumgbm
## var rel.inf
## MedianIncome MedianIncome 50.199
## AveOccupancy AveOccupancy 13.691
## Longitude Longitude 13.128
## Latitude Latitude 10.336
## HouseMedianAge HouseMedianAge 3.991
## AveNoRoom AveNoRoom 3.683
## Population Population 2.572
## AveNoBdrm AveNoBdrm 2.399
The single-variable partial dependence plots of the median housing price on four of the variables are shown below. The x-axis represents the deciles of the feature values in the training data. We can clearly see that the median house price has a roughly linear relationship with the median income (top left) and that the house price drops as the average number of occupants per household increases (bottom right). The top right plot shows that latitude has a strong negative influence on the median house price; so does longitude.
best.iter <- gbm.perf(gbm0, method = "cv")
names(california)
## [1] "MedianValue" "MedianIncome" "HouseMedianAge" "AveNoRoom"
## [5] "AveNoBdrm" "Population" "AveOccupancy" "Latitude"
## [9] "Longitude"
par(mfrow = c(2, 2))
plot(x = gbm0, i.var = 1, n.trees = best.iter, main = "Partial Dependence of 'MedianIncome' ")
plot(x = gbm0, i.var = 7, n.trees = best.iter, main = "Partial Dependence of 'Latitude' ")
plot(x = gbm0, i.var = 8, n.trees = best.iter, main = "Partial Dependence of 'Longitude' ")
plot(x = gbm0, i.var = 6, n.trees = best.iter, main = "Partial Dependence of 'AveOccupancy' ")
We now plot two-dimensional partial dependence (lattice) plots.
The partial dependence of the California data on MedianIncome and Latitude: there is a strong positive relationship between the two.
plot(x = gbm0, i.var = c(1, 7), n.trees = best.iter, main = "Partial Dependence of 'MedianIncome' and 'Latitude'")
The partial dependence of the California data on MedianIncome and Longitude: there is a strong positive relationship between the two.
plot(x = gbm0, i.var = c(1, 8), n.trees = best.iter, main = "Partial Dependence of 'Median Income' and 'Longitude'")
The partial dependence of the California data on MedianIncome and AveOccupancy: there is no clear relationship between the two.
plot(x = gbm0, i.var = c(1, 6), n.trees = best.iter, main = "Partial Dependence of 'Ave Occupancy' and 'Median Income'")
The partial dependence of the California data on Latitude and Longitude: there is a strong negative relationship between the two.
plot(x = gbm0, i.var = c(7, 8), n.trees = best.iter, main = "Partial Dependence of 'Latitude' and 'Longitude'")
The partial dependence of the California data on Latitude and AveOccupancy: there is no obvious relationship between the two.
plot(x = gbm0, i.var = c(7, 6), n.trees = best.iter, main = "Partial Dependence of 'Latitude' and 'AveOccupancy'")
The partial dependence of the California data on Longitude and AveOccupancy: there is no obvious relationship between the two.
plot(x = gbm0, i.var = c(8, 6), n.trees = best.iter, main = "Partial Dependence of 'Longitude' and 'AveOccupancy'")
I randomly split the data set into a training part and a test part. The test set has 2000 observations, roughly one fifth of the original data. The criterion I use to evaluate prediction performance is the mean squared error.
The mean squared error for the test set given by the gbm is 3.630762.
Repeating the tree model from the hw1 solution (my own answer to that part was incorrect, so I reuse the approach from the hw1 solution), the root node error reported by rpart is 7.6934 and the smallest cross-validated relative error (xerror) is 0.54. Unscaling the xerror gives the estimated MSE: 7.69 * 0.54 = 4.1526. This is higher than the test MSE we get from the gbm model.
Thus the gbm has better performance.
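A minimal sketch of that comparison, using the numbers quoted above:
# Unscale rpart's cross-validated relative error by the root node error
# (both in squared-error units) and compare with the gbm test MSE.
root.node.err <- 7.6934  # root node error from printcp() below
min.xerror <- 0.54  # smallest cross-validated relative error (xerror)
rpart.mse <- root.node.err * min.xerror  # about 4.15
gbm.mse <- 3.630762  # gbm test MSE computed below
rpart.mse > gbm.mse  # TRUE: the gbm model predicts better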
income <- read.table("/Users/shijiabian/Dropbox/STATS315B/Data/Income_Data.txt",
sep = ",")
names(income) <- c("income", "sex", "mariStatus", "age", "edu", "occu", "lengthLive",
"duakIncome", "person", "personUnder18", "houseSta", "type", "ethnic", "language")
library(gbm)
set.seed(131)
income.perm <- sample(1:nrow(income))
income.train <- income[income.perm[1:6993], ]
income.test <- income[income.perm[6994:8993], ]
set.seed(444)
gbm0 <- gbm(income ~ ., data = income.train, interaction.depth = 4, shrinkage = 0.05,
n.trees = 2500, bag.fraction = 0.5, cv.folds = 5, distribution = "gaussian",
verbose = T)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 7.4090 nan 0.0500 0.2797
## 2 7.1722 nan 0.0500 0.2459
## 3 6.9402 nan 0.0500 0.2280
## 4 6.7236 nan 0.0500 0.2135
## 5 6.5254 nan 0.0500 0.1975
## 6 6.3419 nan 0.0500 0.1799
## 7 6.1800 nan 0.0500 0.1609
## 8 6.0207 nan 0.0500 0.1541
## 9 5.8780 nan 0.0500 0.1381
## 10 5.7499 nan 0.0500 0.1171
## 20 4.8754 nan 0.0500 0.0555
## 40 4.2308 nan 0.0500 0.0170
## 60 3.9985 nan 0.0500 0.0050
## 80 3.8971 nan 0.0500 -0.0011
## 100 3.8336 nan 0.0500 -0.0000
## 120 3.7908 nan 0.0500 -0.0004
## 140 3.7517 nan 0.0500 -0.0001
## 160 3.7146 nan 0.0500 0.0046
## 180 3.6802 nan 0.0500 -0.0007
## 200 3.6395 nan 0.0500 0.0001
## 220 3.6072 nan 0.0500 -0.0006
## 240 3.5815 nan 0.0500 -0.0004
## 260 3.5531 nan 0.0500 -0.0001
## 280 3.5370 nan 0.0500 -0.0007
## 300 3.5188 nan 0.0500 -0.0002
## 320 3.4992 nan 0.0500 -0.0000
## 340 3.4845 nan 0.0500 -0.0004
## 360 3.4720 nan 0.0500 -0.0010
## 380 3.4555 nan 0.0500 -0.0015
## 400 3.4433 nan 0.0500 -0.0011
## 420 3.4325 nan 0.0500 -0.0015
## 440 3.4173 nan 0.0500 -0.0003
## 460 3.4063 nan 0.0500 -0.0008
## 480 3.3928 nan 0.0500 -0.0001
## 500 3.3807 nan 0.0500 -0.0008
## 520 3.3717 nan 0.0500 -0.0013
## 540 3.3582 nan 0.0500 -0.0002
## 560 3.3476 nan 0.0500 -0.0017
## 580 3.3402 nan 0.0500 -0.0006
## 600 3.3304 nan 0.0500 -0.0014
## 620 3.3209 nan 0.0500 -0.0004
## 640 3.3124 nan 0.0500 -0.0009
## 660 3.3028 nan 0.0500 -0.0003
## 680 3.2940 nan 0.0500 -0.0016
## 700 3.2874 nan 0.0500 -0.0011
## 720 3.2780 nan 0.0500 -0.0007
## 740 3.2682 nan 0.0500 -0.0005
## 760 3.2618 nan 0.0500 -0.0008
## 780 3.2538 nan 0.0500 -0.0006
## 800 3.2475 nan 0.0500 -0.0003
## 820 3.2414 nan 0.0500 -0.0006
## 840 3.2348 nan 0.0500 -0.0012
## 860 3.2280 nan 0.0500 -0.0007
## 880 3.2208 nan 0.0500 -0.0004
## 900 3.2123 nan 0.0500 -0.0008
## 920 3.2048 nan 0.0500 -0.0006
## 940 3.1981 nan 0.0500 -0.0012
## 960 3.1897 nan 0.0500 -0.0011
## 980 3.1827 nan 0.0500 -0.0013
## 1000 3.1770 nan 0.0500 -0.0006
## 1020 3.1691 nan 0.0500 -0.0007
## 1040 3.1635 nan 0.0500 -0.0004
## 1060 3.1564 nan 0.0500 -0.0015
## 1080 3.1500 nan 0.0500 -0.0009
## 1100 3.1441 nan 0.0500 -0.0010
## 1120 3.1379 nan 0.0500 -0.0008
## 1140 3.1311 nan 0.0500 -0.0005
## 1160 3.1258 nan 0.0500 -0.0010
## 1180 3.1199 nan 0.0500 -0.0011
## 1200 3.1145 nan 0.0500 -0.0006
## 1220 3.1098 nan 0.0500 -0.0011
## 1240 3.1037 nan 0.0500 -0.0003
## 1260 3.0997 nan 0.0500 -0.0005
## 1280 3.0942 nan 0.0500 -0.0003
## 1300 3.0890 nan 0.0500 -0.0005
## 1320 3.0844 nan 0.0500 -0.0005
## 1340 3.0796 nan 0.0500 -0.0012
## 1360 3.0754 nan 0.0500 -0.0009
## 1380 3.0699 nan 0.0500 -0.0012
## 1400 3.0665 nan 0.0500 -0.0009
## 1420 3.0613 nan 0.0500 -0.0006
## 1440 3.0569 nan 0.0500 -0.0008
## 1460 3.0526 nan 0.0500 -0.0006
## 1480 3.0480 nan 0.0500 -0.0014
## 1500 3.0432 nan 0.0500 -0.0004
## 1520 3.0398 nan 0.0500 -0.0003
## 1540 3.0353 nan 0.0500 -0.0010
## 1560 3.0307 nan 0.0500 -0.0006
## 1580 3.0272 nan 0.0500 -0.0016
## 1600 3.0232 nan 0.0500 -0.0005
## 1620 3.0182 nan 0.0500 -0.0012
## 1640 3.0138 nan 0.0500 -0.0012
## 1660 3.0098 nan 0.0500 -0.0018
## 1680 3.0071 nan 0.0500 -0.0007
## 1700 3.0027 nan 0.0500 -0.0016
## 1720 2.9988 nan 0.0500 -0.0008
## 1740 2.9950 nan 0.0500 -0.0016
## 1760 2.9913 nan 0.0500 -0.0005
## 1780 2.9864 nan 0.0500 -0.0005
## 1800 2.9830 nan 0.0500 -0.0011
## 1820 2.9785 nan 0.0500 -0.0019
## 1840 2.9749 nan 0.0500 -0.0007
## 1860 2.9700 nan 0.0500 -0.0005
## 1880 2.9671 nan 0.0500 -0.0010
## 1900 2.9642 nan 0.0500 -0.0005
## 1920 2.9602 nan 0.0500 -0.0010
## 1940 2.9563 nan 0.0500 -0.0011
## 1960 2.9527 nan 0.0500 -0.0012
## 1980 2.9491 nan 0.0500 -0.0014
## 2000 2.9447 nan 0.0500 -0.0006
## 2020 2.9403 nan 0.0500 -0.0006
## 2040 2.9355 nan 0.0500 -0.0005
## 2060 2.9325 nan 0.0500 -0.0009
## 2080 2.9291 nan 0.0500 -0.0011
## 2100 2.9259 nan 0.0500 -0.0005
## 2120 2.9232 nan 0.0500 -0.0012
## 2140 2.9201 nan 0.0500 -0.0006
## 2160 2.9173 nan 0.0500 -0.0012
## 2180 2.9136 nan 0.0500 -0.0006
## 2200 2.9103 nan 0.0500 -0.0009
## 2220 2.9069 nan 0.0500 -0.0009
## 2240 2.9038 nan 0.0500 -0.0010
## 2260 2.9005 nan 0.0500 -0.0009
## 2280 2.8984 nan 0.0500 -0.0008
## 2300 2.8955 nan 0.0500 -0.0012
## 2320 2.8931 nan 0.0500 -0.0011
## 2340 2.8897 nan 0.0500 -0.0008
## 2360 2.8865 nan 0.0500 -0.0010
## 2380 2.8836 nan 0.0500 -0.0010
## 2400 2.8809 nan 0.0500 -0.0008
## 2420 2.8773 nan 0.0500 -0.0009
## 2440 2.8744 nan 0.0500 -0.0005
## 2460 2.8713 nan 0.0500 -0.0005
## 2480 2.8675 nan 0.0500 -0.0007
## 2500 2.8644 nan 0.0500 -0.0009
best.iter_test <- gbm.perf(gbm0, method = "cv")
# 941
# for the training set
gbm0.predict.train <- predict(gbm0, income.train, type = "response", n.trees = best.iter_test)
sum((income.train$income - gbm0.predict.train)^2)/nrow(income.train)
## [1] 3.255
# 3.208059
# For the test set
gbm0.predict <- predict(gbm0, income.test, type = "response", n.trees = best.iter_test)
sum((income.test$income - gbm0.predict)^2)/nrow(income.test)
## [1] 3.615
# 3.630762
install.packages("rpart")
## Error: trying to use CRAN without setting a mirror
install.packages("ISLR")
## Error: trying to use CRAN without setting a mirror
library(rpart)
library(ISLR)
income.rpart.cv2 <- rpart(income ~ ., data = income.train, control = rpart.control(cp = 1e-04))
cp.list <- income.rpart.cv2$cptable[, 1]
test.err <- rep(0, length(cp.list))
for (i in 2:length(cp.list)) {
cur.rpart <- prune(income.rpart.cv2, cp.list[i])
cur.pred <- predict(cur.rpart, newdata = income.test)
test.err[i] <- sum((cur.pred - income.test$income)^2)
}
min.test.err <- min(test.err[2:length(cp.list)])
index <- which(test.err == min.test.err)
income.rpart2 <- prune.rpart(income.rpart.cv2, cp.list[index])
printcp(income.rpart2)
##
## Regression tree:
## rpart(formula = income ~ ., data = income.train, control = rpart.control(cp = 1e-04))
##
## Variables actually used in tree construction:
## [1] age edu houseSta mariStatus occu type
##
## Root node error: 53800/6993 = 7.7
##
## n= 6993
##
## CP nsplit rel error xerror xstd
## 1 0.2198 0 1.00 1.00 0.0094
## 2 0.0790 1 0.78 0.78 0.0102
## 3 0.0445 2 0.70 0.70 0.0105
## 4 0.0286 3 0.66 0.66 0.0107
## 5 0.0151 4 0.63 0.64 0.0103
## 6 0.0135 5 0.61 0.62 0.0102
## 7 0.0128 6 0.60 0.61 0.0103
## 8 0.0070 7 0.59 0.59 0.0102
## 9 0.0063 8 0.58 0.59 0.0103
## 10 0.0061 9 0.57 0.59 0.0102
## 11 0.0058 11 0.56 0.59 0.0101
## 12 0.0054 12 0.56 0.57 0.0097
## 13 0.0051 13 0.55 0.57 0.0095
## 14 0.0037 14 0.54 0.55 0.0094
## 15 0.0036 15 0.54 0.55 0.0093
## 16 0.0028 16 0.54 0.55 0.0093
## 17 0.0026 17 0.53 0.54 0.0093
## 18 0.0024 18 0.53 0.54 0.0093
## 19 0.0022 19 0.53 0.54 0.0094
## 20 0.0022 20 0.53 0.54 0.0094
## 21 0.0020 21 0.53 0.54 0.0094
## 22 0.0020 22 0.52 0.54 0.0094
## 23 0.0019 23 0.52 0.54 0.0094
## 24 0.0017 24 0.52 0.54 0.0094
## 25 0.0017 25 0.52 0.54 0.0094
According to the importance list below, occupation and age are the two most important predictors (they are nearly tied), followed by household status, marital status, and so on. "sex" appears to be the second least influential variable. This is not consistent with the national average result, because our data are for the California population only. On the other hand, we would need to explore further whether this result is consistent with the national average, for example the relationship between household income and the income of family members of each sex: if a household has both men and women, the household income can still be high even if the female members have no income, as long as the male members have high incomes. California alone may not represent the national picture.
summary(gbm0, main = "RELATIVE INFLUENCE OF ALL PREDICTORS")
## var rel.inf
## occu occu 16.6285
## age age 16.3130
## houseSta houseSta 16.1983
## mariStatus mariStatus 11.9324
## edu edu 8.9902
## person person 6.4029
## ethnic ethnic 6.3050
## type type 6.2575
## lengthLive lengthLive 4.5689
## language language 2.1093
## personUnder18 personUnder18 1.9923
## sex sex 1.3863
## duakIncome duakIncome 0.9153
For the training set, the overall error rate is 0.3858234. The error rates for each occupation class are: 0.1789660, 0.9163987, 0.5877483, 0.6690562, 0.3799622, 0.1640760, 0.6225490, 0.1743119, and 0.7320261. Thus the classifier performs poorly on classes 2, 4, 7, and 9.
For the test set, the overall error rate is 0.4326439. The error rates for each occupation class are: 0.2018692, 0.9390244, 0.6754967, 0.7216981, 0.4507042, 0.2113565, 0.7400000, 0.2156863, and 0.8615385. Again the classifier is poor on classes 2, 4, 7, and 9, which is consistent with the training-set result.
marketing <- read.table("/Users/shijiabian/Dropbox/STATS315B/Data/Occupation_Data.txt",
sep = ",")
names(marketing) <- c("occu", "typeHome", "sex", "mariStatus", "age", "education",
"annualIncome", "timeLength", "dualIncome", "personNum", "personNum18",
"houseSta", "ethnic", "language")
set.seed(111)
market.perm <- sample(1:nrow(marketing))
market.train <- marketing[market.perm[1:7068], ]
market.test <- marketing[market.perm[7069:8857], ]
### Train the model
set.seed(324)
gbm0 <- gbm(occu ~ ., data = market.train, interaction.depth = 4, shrinkage = 0.05,
n.trees = 400, bag.fraction = 0.5, cv.folds = 5, distribution = "multinomial",
verbose = T)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 2.1972 nan 0.0500 0.2468
## 2 2.0641 nan 0.0500 0.1811
## 3 1.9655 nan 0.0500 0.1455
## 4 1.8870 nan 0.0500 0.1209
## 5 1.8218 nan 0.0500 0.1070
## 6 1.7640 nan 0.0500 0.0915
## 7 1.7152 nan 0.0500 0.0767
## 8 1.6729 nan 0.0500 0.0672
## 9 1.6346 nan 0.0500 0.0586
## 10 1.6021 nan 0.0500 0.0522
## 20 1.4073 nan 0.0500 0.0190
## 40 1.2773 nan 0.0500 0.0040
## 60 1.2255 nan 0.0500 -0.0001
## 80 1.1959 nan 0.0500 -0.0002
## 100 1.1730 nan 0.0500 -0.0004
## 120 1.1553 nan 0.0500 -0.0015
## 140 1.1384 nan 0.0500 -0.0015
## 160 1.1251 nan 0.0500 -0.0010
## 180 1.1122 nan 0.0500 -0.0021
## 200 1.0992 nan 0.0500 -0.0015
## 220 1.0875 nan 0.0500 -0.0016
## 240 1.0766 nan 0.0500 -0.0016
## 260 1.0667 nan 0.0500 -0.0012
## 280 1.0569 nan 0.0500 -0.0021
## 300 1.0478 nan 0.0500 -0.0018
## 320 1.0395 nan 0.0500 -0.0013
## 340 1.0309 nan 0.0500 -0.0013
## 360 1.0231 nan 0.0500 -0.0015
## 380 1.0157 nan 0.0500 -0.0014
## 400 1.0085 nan 0.0500 -0.0021
best.iter_test <- gbm.perf(gbm0, method = "test")
## Error: need finite 'ylim' values
# best.iter_test = 136
gbm0.predict <- predict(gbm0, market.train, type = "response", n.trees = best.iter_test)
## Warning: Number of trees not specified or exceeded number fit so far.
## Using 400.
classPred <- rep(0, nrow(market.train))
for (i in 1:nrow(market.train)) {
    # Assign each observation to the class with the largest predicted probability
    classPred[i] <- which(gbm0.predict[i, , 1] == max(gbm0.predict[i, , 1]))
}
matrix = table(classPred, market.train$occu)
# The overall error rate for the training set:
sum = 0
for (i in 1:9) {
sum = sum + matrix[i, i]
}
1 - sum/nrow(market.train)
## [1] 0.3454
# Calculate the training misclassification error for each of the occupation
# types:
ratio.train = rep(0, 9)
for (i in 1:9) {
ratio.train[i] <- 1 - matrix[i, i]/sum(matrix[, i])
}
ratio.train
## [1] 0.1657 0.8264 0.5381 0.6189 0.3157 0.1511 0.4951 0.1450 0.6111
### Test Set
gbm0.pretest <- predict(gbm0, market.test, type = "response", n.trees = best.iter_test)
## Warning: Number of trees not specified or exceeded number fit so far.
## Using 400.
classPred.test <- rep(0, nrow(market.test))
for (i in 1:nrow(market.test)) {
    # Assign each observation to the class with the largest predicted probability
    classPred.test[i] <- which(gbm0.pretest[i, , 1] == max(gbm0.pretest[i, , 1]))
}
matrix.test = table(classPred.test, market.test$occu)
# Calculate the overall test misclassification error:
sum.test = 0
for (i in 1:9) {
sum.test = sum.test + matrix.test[i, i]
}
1 - sum.test/nrow(market.test)
## [1] 0.4449
# Calculate the test misclassification error for each of the occupation
# type:
ratio = rep(0, 9)
for (i in 1:9) {
ratio[i] <- 1 - matrix.test[i, i]/sum(matrix.test[, i])
}
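Equivalently, assuming the confusion table is square (every class appears among the predictions), the per-class error rates can be computed without a loop:
# One minus the diagonal over the column totals gives the per-class error rate.
1 - diag(matrix.test)/colSums(matrix.test)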
The most important variable is age, followed by annual income, education, household status, and dual income. The detailed ranking is listed below.
summary(gbm0, n.tree = best.iter_test, main = "RELATIVE INFLUENCE OF ALL PREDICTORS")
## Warning: Exceeded total number of GBM terms. Results use n.trees=400 terms.
## var rel.inf
## age age 27.6440
## annualIncome annualIncome 18.6786
## education education 17.9468
## houseSta houseSta 10.9074
## dualIncome dualIncome 7.7678
## sex sex 7.0867
## personNum personNum 2.6382
## mariStatus mariStatus 2.2580
## timeLength timeLength 1.2840
## ethnic ethnic 1.0980
## language language 1.0179
## typeHome typeHome 0.8402
## personNum18 personNum18 0.8326