Juan Olazaba 10/23/2023
Please see other attachment for handwritten work on this problem. Thank you.
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
plot(Weekly$Volume, main = "Volume over time", col = "red",
     ylab = "Number of Shares Traded (billions)", xlab = "Time (weeks)")
Lag2 appears to be the only statistically significant predictor; all of the other predictors have high p-values.
logit.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                 data = Weekly, family = binomial)
summary(logit.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
The confusion matrix shows that the model has an overall prediction accuracy of about 56%. Its recall for the “Up” class is 92%, meaning it correctly identified 92% of the weeks that actually went up. Its precision, however, is only 56%: of all the weeks predicted “Up”, just 56% truly went up.
# use the fitted model to predict on the full data set
glm.probs <- predict(logit.fit, type = "response")
glm.pred <- rep("Down", length(glm.probs))
glm.pred[glm.probs > .5] <- "Up"
# use table function to produce confusion matrix
table(glm.pred, Weekly$Direction)
##
## glm.pred Down Up
## Down 54 48
## Up 430 557
## [1] 0.5610652
## [1] 0.9206612
## [1] 0.5643364
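The commands that printed the three statistics above were not exported; they can be reproduced from the confusion matrix, for example:
mean(glm.pred == Weekly$Direction)  # overall accuracy, ~0.561
sum(glm.pred == "Up" & Weekly$Direction == "Up") / sum(Weekly$Direction == "Up")  # recall, ~0.921
sum(glm.pred == "Up" & Weekly$Direction == "Up") / sum(glm.pred == "Up")  # precision, ~0.564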
One could infer that splitting the data in this manner improved our prediction accuracy to about 62%. However, the test error is still relatively high at roughly 38%; our goal should be to keep minimizing it by trying different pre-processing techniques.
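The chunk that built the split is hidden; given the 104-row test set below and the 985 training observations implied by the residual degrees of freedom (978, with seven coefficients fitted), it presumably used the conventional ISLR split:
train <- Weekly$Year < 2009      # 1990-2008: 985 training weeks (assumed split)
Weekly.2008 <- Weekly[!train, ]  # 2009-2010: 104 held-out weeks
dim(Weekly.2008)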
## [1] 104 9
reduced.fit <- glm(Direction ~ Lag2 + Lag3 + Lag4 + Lag2*Lag4 + Lag2*Lag3 + Lag3*Lag4,
                   data = Weekly, family = binomial, subset = train)
summary(reduced.fit)
##
## Call:
## glm(formula = Direction ~ Lag2 + Lag3 + Lag4 + Lag2 * Lag4 +
## Lag2 * Lag3 + Lag3 * Lag4, family = binomial, data = Weekly,
## subset = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.2163273 0.0650295 3.327 0.000879 ***
## Lag2 0.0547920 0.0312040 1.756 0.079101 .
## Lag3 -0.0014305 0.0307488 -0.047 0.962894
## Lag4 -0.0233000 0.0297382 -0.784 0.433332
## Lag2:Lag4 -0.0072979 0.0095672 -0.763 0.445582
## Lag2:Lag3 0.0005984 0.0076869 0.078 0.937946
## Lag3:Lag4 0.0109796 0.0084119 1.305 0.191812
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1346.9 on 978 degrees of freedom
## AIC: 1360.9
##
## Number of Fisher Scoring iterations: 4
glmp <- predict(reduced.fit, Weekly.2008, type = "response")
# build the confusion matrix against the held-out weeks
glm.pred <- rep("Down", nrow(Weekly.2008))  # 104 test observations
glm.pred[glmp > .5] <- "Up"
table(glm.pred, Weekly.2008$Direction)
##
## glm.pred Down Up
## Down 9 3
## Up 34 58
## [1] 0.6442308
#Weekly$Lag2 <- sqrt(Weekly$Lag2) transformation weakened model performance
lda.mod <- lda(Direction ~ Lag2, data = Weekly, subset = train)
summary(lda.mod)
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 2 -none- numeric
## scaling 1 -none- numeric
## lev 2 -none- character
## svd 1 -none- numeric
## N 1 -none- numeric
## call 4 -none- call
## terms 3 terms call
## xlevels 0 -none- list
lda.pred <- predict(lda.mod, Weekly.2008)
lda.class <- lda.pred$class
Direction.2008 <- Weekly.2008$Direction
table(lda.class, Direction.2008)
## Direction.2008
## lda.class Down Up
## Down 9 5
## Up 34 56
## [1] 0.625
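The QDA chunk itself is missing from this export; a minimal sketch of what presumably produced the output below (qda() comes from MASS, already loaded for lda()):
qda.mod <- qda(Direction ~ Lag2, data = Weekly, subset = train)
summary(qda.mod)
qda.class <- predict(qda.mod, Weekly.2008)$class
table(qda.class, Direction.2008)
mean(qda.class == Direction.2008)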
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 2 -none- numeric
## scaling 2 -none- numeric
## ldet 2 -none- numeric
## lev 2 -none- character
## N 1 -none- numeric
## call 4 -none- call
## terms 3 terms call
## xlevels 0 -none- list
## Direction.2008
## qda.class Down Up
## Down 0 0
## Up 43 61
## [1] 0.5865385
train.x <- data.frame(Weekly[train, ]$Lag2)
test.x <- data.frame(Weekly[!train, ]$Lag2)
train.Direction <- Weekly[train, ]$Direction
set.seed(1)
knn.func <- knn(train.x, test.x, train.Direction, k=1)
table(knn.func, Weekly[!train, ]$Direction)
##
## knn.func Down Up
## Down 21 30
## Up 22 31
## [1] 0.5
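The naive Bayes chunk is likewise missing; a sketch of what presumably generated the output below (naiveBayes() comes from e1071, which the later liver messages show is loaded):
nb.mod <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
summary(nb.mod)
nb.class <- predict(nb.mod, Weekly.2008)
table(nb.class, Direction.2008)
mean(nb.class == Direction.2008)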
## Length Class Mode
## apriori 2 table numeric
## tables 1 -none- list
## levels 2 -none- character
## isnumeric 1 -none- logical
## call 4 -none- call
## Direction.2008
## nb.class Down Up
## Down 0 0
## Up 43 61
## [1] 0.5865385
After splitting our data 70/30 and using logistic regression to predict whether a census tract falls above or below the median crime rate, our model achieved a prediction accuracy of about 98%. All variables were included in the model.
Logistic Regression
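The fitting chunk was not exported; a sketch under the assumption that crime is the binary above-median indicator and that tr.ind (training indices), df.te (test rows), and df.rs (their responses) were created in an earlier hidden chunk:
# crime is derived from crim, so including crim separates the classes
# perfectly -- which is what triggers the two warnings below
glm.fit <- glm(crime ~ ., data = Boston, family = binomial, subset = tr.ind)
glm.probs <- predict(glm.fit, df.te, type = "response")
boston.glm <- ifelse(glm.probs > 0.5, 1, 0)
table(boston.glm, df.rs)
mean(boston.glm == df.rs)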
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## df.rs
## boston.glm 0 1
## 0 74 1
## 1 2 75
## [1] 0.9802632
Linear Discriminant Analysis (LDA)
Predictors medv, crim, chas, and rm were removed, and the resulting model did not perform as well.
lda.fit <- lda(crime ~ zn + indus + nox + age + dis + rad + tax + black + lstat,
               data = Boston, subset = tr.ind)
# note: predictions are made on the full Boston set (506 tracts), not only
# the held-out rows, so this accuracy mixes training and test observations
lda.pred <- predict(lda.fit, Boston)
lda.class <- lda.pred$class
table(lda.class, Boston$crime)
##
## lda.class 0 1
## 0 243 64
## 1 10 189
## [1] 0.8537549
Naive Bayes
Naive Bayes seems to produce results similar to our logistic regression model. When I remove predictors, the model seems to perform worse, with a prediction accuracy of 84%.
nb.fit <- naiveBayes(crime ~ . -crim -medv, data = Boston, subset = tr.ind)
nb.class <- predict(nb.fit, df.te)
table(nb.class, df.rs)
## df.rs
## nb.class 0 1
## 0 74 18
## 1 2 58
## [1] 0.8684211
KNN
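The KNN chunk's code and results did not survive the export; a minimal sketch of how it would follow the same pattern (the feature list and k are assumptions, and tr.ind is assumed to be a vector of row indices):
library(class)
vars <- c("zn", "indus", "nox", "age", "dis", "rad", "tax", "lstat")
set.seed(1)
knn.pred <- knn(Boston[tr.ind, vars], Boston[-tr.ind, vars],
                Boston$crime[tr.ind], k = 3)
mean(knn.pred == df.rs)  # df.rs holds the test-set responses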
Q4: This churn data set comes from the IBM Sample Data Sets. Customer churn, also known as customer attrition, occurs when customers stop doing business with a company. The data set contains 5,000 rows (customers) and 20 columns (features). The “churn” column is our target, which indicates whether a customer churned (left the company) or not.
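The loading and inspection chunk is hidden; presumably something along these lines produced the messages and output below:
library(liver)  # provides the churn data set
data(churn)
str(churn)
head(churn)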
## Warning: package 'liver' was built under R version 4.3.2
##
## Attaching package: 'liver'
## The following object is masked from 'package:e1071':
##
## skewness
## The following object is masked from 'package:base':
##
## transform
## 'data.frame': 5000 obs. of 20 variables:
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
## $ area.code : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
## $ account.length: int 128 107 137 84 75 118 121 147 117 141 ...
## $ voice.plan : Factor w/ 2 levels "yes","no": 1 1 2 2 2 2 1 2 2 1 ...
## $ voice.messages: int 25 26 0 0 0 0 24 0 0 37 ...
## $ intl.plan : Factor w/ 2 levels "yes","no": 2 2 2 1 1 1 2 1 2 1 ...
## $ intl.mins : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ intl.calls : int 3 3 5 7 3 6 7 6 4 5 ...
## $ intl.charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ day.mins : num 265 162 243 299 167 ...
## $ day.calls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ day.charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ eve.mins : num 197.4 195.5 121.2 61.9 148.3 ...
## $ eve.calls : int 99 103 110 88 122 101 108 94 80 111 ...
## $ eve.charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ night.mins : num 245 254 163 197 187 ...
## $ night.calls : int 91 103 104 89 121 118 118 96 90 97 ...
## $ night.charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ customer.calls: int 1 1 0 2 3 0 3 0 1 0 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
## state area.code account.length voice.plan voice.messages intl.plan
## 1 KS area_code_415 128 yes 25 no
## 2 OH area_code_415 107 yes 26 no
## 3 NJ area_code_415 137 no 0 no
## 4 OH area_code_408 84 no 0 yes
## 5 OK area_code_415 75 no 0 yes
## 6 AL area_code_510 118 no 0 yes
## intl.mins intl.calls intl.charge day.mins day.calls day.charge eve.mins
## 1 10.0 3 2.70 265.1 110 45.07 197.4
## 2 13.7 3 3.70 161.6 123 27.47 195.5
## 3 12.2 5 3.29 243.4 114 41.38 121.2
## 4 6.6 7 1.78 299.4 71 50.90 61.9
## 5 10.1 3 2.73 166.7 113 28.34 148.3
## 6 6.3 6 1.70 223.4 98 37.98 220.6
## eve.calls eve.charge night.mins night.calls night.charge customer.calls churn
## 1 99 16.78 244.7 91 11.01 1 no
## 2 103 16.62 254.4 103 11.45 1 no
## 3 110 10.30 162.6 104 7.32 0 no
## 4 88 5.26 196.9 89 8.86 2 no
## 5 122 12.61 186.9 121 8.41 3 no
## 6 101 18.75 203.9 118 9.18 0 no
There are several methods we could employ to create training and testing sets given a class imbalance. I have chosen to undersample the majority class in our data set, which happens to be “no”. By doing so, one can randomly draw either the same number of observations as the minority class or some fraction of the original majority class.
ggplot(churn, aes(x = account.length)) +
  geom_density(fill = "lightblue", alpha = 0.7) +  # binwidth dropped: not a geom_density parameter
  labs(
    title = "Normally distributed",
    x = "Account Length",
    y = "Density"
  )
ggplot(churn, aes(x = churn, fill = churn)) +
  geom_bar(alpha = 0.5) +
  labs(
    title = "Class Distribution of churn",
    x = "Customer Churn",
    y = "Frequency"
  )
To optimize our classification statistic, I have chosen createDataPartition. This function helps create a balanced split by preserving the ratio of the imbalanced factor classes in both partitions.
set.seed(49)
minority_class <- subset(churn, churn == "yes")
majority_class <- subset(churn, churn == "no")
# sample the majority class down to the size of the minority class
mundersample <- majority_class[sample(1:nrow(majority_class), nrow(minority_class)), ]
aggr <- rbind(minority_class, mundersample)
ggplot(aggr, aes(x = churn, fill = churn)) +
  geom_bar(alpha = 0.5) +
  labs(
    title = "Undersampling Plot",
    x = "Customer Churn",
    y = "Frequency"
  )
Final decision: QDA proved to be the model with the lowest test error rate at 11%. The variables removed from every model were state, area.code, voice.plan, and intl.plan.
Logistic regression - The data was split using ‘createDataPartition’, and pre-processing consisted of removing several variables. The test error rate for this model came out to 14%.
set.seed(247)
trainIndex <- createDataPartition(churn$churn,
p = 0.7,
list = FALSE,
times = 1)
train <- churn[trainIndex, ]
test <- churn[-trainIndex, ]
model <- glm(churn ~ .-state -area.code -voice.plan -intl.plan, data = train, family = binomial)
summary(model)
##
## Call:
## glm(formula = churn ~ . - state - area.code - voice.plan - intl.plan,
## family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.273e+00 6.759e-01 12.240 < 2e-16 ***
## account.length -1.460e-03 1.324e-03 -1.102 0.27025
## voice.messages 2.853e-02 4.657e-03 6.126 9e-10 ***
## intl.mins -7.998e-01 4.965e+00 -0.161 0.87202
## intl.calls 6.227e-02 2.280e-02 2.731 0.00631 **
## intl.charge 2.622e+00 1.839e+01 0.143 0.88661
## day.mins -3.392e+00 3.129e+00 -1.084 0.27827
## day.calls -2.734e-03 2.623e-03 -1.042 0.29725
## day.charge 1.987e+01 1.840e+01 1.080 0.28024
## eve.mins 1.581e-01 1.557e+00 0.102 0.91910
## eve.calls 2.353e-03 2.644e-03 0.890 0.37345
## eve.charge -1.943e+00 1.832e+01 -0.106 0.91552
## night.mins -7.219e-01 8.315e-01 -0.868 0.38529
## night.calls -7.062e-05 2.631e-03 -0.027 0.97858
## night.charge 1.595e+01 1.848e+01 0.863 0.38789
## customer.calls -4.766e-01 3.796e-02 -12.555 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2853.1 on 3500 degrees of freedom
## Residual deviance: 2405.8 on 3485 degrees of freedom
## AIC: 2437.8
##
## Number of Fisher Scoring iterations: 5
predictions <- predict(model, newdata = test, type = "response")
# Evaluate the model. Since levels(churn$churn) are "yes" then "no", glm
# models P(churn == "no"), so probabilities above .5 mean "no"
glm.ch <- rep("yes", nrow(test))  # 1,499 test observations
glm.ch[predictions > .5] <- "no"
table(glm.ch, test$churn)
##
## glm.ch yes no
## no 182 1260
## yes 30 27
## [1] 0.1394263
LDA - the error rate is 14%, similar to logistic regression.
lda.fitt <- lda(churn ~ .-state -area.code -voice.plan -intl.plan, data = train)
lda.pred <- predict(lda.fitt, test)
lda.class <- lda.pred$class
table(lda.class, test$churn)
##
## lda.class yes no
## yes 27 30
## no 185 1257
## [1] 0.143429
QDA - the error rate decreased to about 11%, which is a good sign.
cqda.fit <- qda(churn ~ .-state -area.code -voice.plan -intl.plan, data = train)
cqda.pred <- predict(cqda.fit, test)
qda.class <- cqda.pred$class
table(qda.class, test$churn)
##
## qda.class yes no
## yes 84 31
## no 128 1256
## [1] 0.1060707
Naive Bayes - the error rate is better than LDA and logistic regression but worse than QDA.
nb.fit <- naiveBayes(churn ~ .-state -area.code -voice.plan -intl.plan, data = train)
nb.class <- predict(nb.fit, test)
table(nb.class, test$churn)
##
## nb.class yes no
## yes 94 64
## no 118 1223
## [1] 0.1214143
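For reference, the error rates printed after each confusion matrix can be reproduced directly from the objects fitted above:
mean(glm.ch != test$churn)     # logistic regression, ~0.139
mean(lda.class != test$churn)  # LDA, ~0.143
mean(qda.class != test$churn)  # QDA, ~0.106
mean(nb.class != test$churn)   # naive Bayes, ~0.121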