library(ISLR2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data(Boston)
crim01 <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
Boston_d <- data.frame(Boston, crim01)
cor(Boston_d)
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## crim01 0.40939545 -0.43615103 0.60326017 0.070096774 0.72323480
## rm age dis rad tax ptratio
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431 0.2899456
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018 0.3832476
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320 0.1889327
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783 -0.3555015
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559 0.2615150
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819 0.4647412
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000 0.4608530
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304 1.0000000
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341 0.3740443
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593 -0.5077867
## crim01 -0.15637178 0.61393992 -0.61634164 0.619786249 0.60874128 0.2535684
## lstat medv crim01
## crim 0.4556215 -0.3883046 0.40939545
## zn -0.4129946 0.3604453 -0.43615103
## indus 0.6037997 -0.4837252 0.60326017
## chas -0.0539293 0.1752602 0.07009677
## nox 0.5908789 -0.4273208 0.72323480
## rm -0.6138083 0.6953599 -0.15637178
## age 0.6023385 -0.3769546 0.61393992
## dis -0.4969958 0.2499287 -0.61634164
## rad 0.4886763 -0.3816262 0.61978625
## tax 0.5439934 -0.4685359 0.60874128
## ptratio 0.3740443 -0.5077867 0.25356836
## lstat 1.0000000 -0.7376627 0.45326273
## medv -0.7376627 1.0000000 -0.26301673
## crim01 0.4532627 -0.2630167 1.00000000
Based on the correlation matrix, indus, nox, age, rad, and tax are among the predictors most strongly correlated with crim01; these, together with lstat, are used as predictors in the classifiers below.
set.seed(1)
boston_s <- sample(c(TRUE, FALSE), nrow(Boston_d), replace=TRUE, prob = c(.6,.4))
boston_tr <- Boston_d[boston_s, ]
boston_t <- Boston_d[!boston_s, ]
dim(boston_tr)
## [1] 314 14
dim(boston_t)
## [1] 192 14
glm.bos <- glm(crim01~indus+nox+age+rad+tax+lstat, family=binomial, data=boston_tr)
summary(glm.bos)
##
## Call:
## glm(formula = crim01 ~ indus + nox + age + rad + tax + lstat,
## family = binomial, data = boston_tr)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -22.535448 3.975270 -5.669 1.44e-08 ***
## indus -0.072520 0.056568 -1.282 0.199841
## nox 40.095414 8.392236 4.778 1.77e-06 ***
## age 0.022459 0.012367 1.816 0.069362 .
## rad 0.576305 0.151603 3.801 0.000144 ***
## tax -0.008295 0.003199 -2.593 0.009521 **
## lstat -0.011124 0.043571 -0.255 0.798480
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 435.25 on 313 degrees of freedom
## Residual deviance: 139.35 on 307 degrees of freedom
## AIC: 153.35
##
## Number of Fisher Scoring iterations: 8
glm.probs.b <- predict(glm.bos,boston_t,type="response")
glm.pred.b <- rep(0,nrow(boston_t))
glm.pred.b[glm.probs.b > 0.50]=1
table(glm.pred.b,boston_t$crim01)
##
## glm.pred.b 0 1
## 0 88 13
## 1 10 81
mean(glm.pred.b==boston_t$crim01)
## [1] 0.8802083
1-mean(glm.pred.b==boston_t$crim01)
## [1] 0.1197917
Using a logistic regression model on the test set, we achieved a prediction accuracy of 88.02%, corresponding to a test error rate of 11.98%.
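Beyond overall accuracy, the confusion matrix above also gives class-wise performance; a minimal sketch computing sensitivity and specificity from the objects already in the workspace (the name cm is illustrative):
# Class-wise performance from the logistic regression confusion matrix
cm <- table(glm.pred.b, boston_t$crim01)
sensitivity <- cm["1", "1"] / sum(cm[, "1"]) # high-crime tracts correctly flagged
specificity <- cm["0", "0"] / sum(cm[, "0"]) # low-crime tracts correctly cleared
c(sensitivity = sensitivity, specificity = specificity)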
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked _by_ '.GlobalEnv':
##
## Boston
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:ISLR2':
##
## Boston
lda.fit.b <- lda(crim01~indus+nox+age+rad+tax+lstat, data=boston_tr)
lda.fit.b
## Call:
## lda(crim01 ~ indus + nox + age + rad + tax + lstat, data = boston_tr)
##
## Prior probabilities of groups:
## 0 1
## 0.4936306 0.5063694
##
## Group means:
## indus nox age rad tax lstat
## 0 6.663226 0.4648245 50.03871 4.096774 303.8065 9.240387
## 1 15.310189 0.6426730 86.79308 14.610063 506.4969 15.832264
##
## Coefficients of linear discriminants:
## LD1
## indus 0.019009728
## nox 7.126273863
## age 0.018582977
## rad 0.089442271
## tax -0.001811084
## lstat -0.022915042
lda.class.b <- predict(lda.fit.b,boston_t)$class
table(lda.class.b,boston_t$crim01)
##
## lda.class.b 0 1
## 0 90 23
## 1 8 71
mean(lda.class.b==boston_t$crim01)
## [1] 0.8385417
1-mean(lda.class.b==boston_t$crim01)
## [1] 0.1614583
Using LDA, we obtained a lower accuracy of 83.85%, which corresponds to a test error rate of 16.15%.
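Because predict() on an lda fit also returns posterior probabilities, the implicit 0.5 cutoff can be varied; a small sketch (the 0.7 threshold is purely illustrative):
# Classify as high-crime only when the posterior probability exceeds 0.7
lda.post <- predict(lda.fit.b, boston_t)$posterior[, "1"]
lda.class.strict <- ifelse(lda.post > 0.7, 1, 0)
table(lda.class.strict, boston_t$crim01)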
library(e1071)
nb_model.b <- naiveBayes(crim01~indus+nox+age+rad+tax+lstat, data=boston_tr)
summary(nb_model.b)
## Length Class Mode
## apriori 2 table numeric
## tables 6 -none- list
## levels 2 -none- character
## isnumeric 6 -none- logical
## call 4 -none- call
pred.b <- predict(nb_model.b, boston_t)
table(pred.b, boston_t$crim01)
##
## pred.b 0 1
## 0 90 26
## 1 8 68
mean(pred.b == boston_t$crim01)
## [1] 0.8229167
1-mean(pred.b==boston_t$crim01)
## [1] 0.1770833
Naive Bayes performed slightly worse than logistic regression and LDA, with an accuracy of 82.29% and a corresponding test error rate of 17.71%.
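The fitted naiveBayes object (whose structure is summarized above) stores the class priors and, for each numeric predictor, the per-class mean and standard deviation of its Gaussian model; these can be inspected directly, e.g.:
# Inspect the fitted class priors and the class-conditional Gaussian for nox
nb_model.b$apriori    # counts of each class in the training data
nb_model.b$tables$nox # per-class mean and sd of nox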
library(class)
set.seed(1)
# Subset to the same six predictors used above; note they are not standardized here
train.Boston <- boston_tr[, c("indus","nox","age","rad","tax","lstat")]
test.Boston <- boston_t[, c("indus","nox","age","rad","tax","lstat")]
knn.pred=knn(train.Boston,test.Boston,boston_tr$crim01,k=1)
table(knn.pred,boston_t$crim01)
##
## knn.pred 0 1
## 0 91 7
## 1 7 87
mean(knn.pred==boston_t$crim01)
## [1] 0.9270833
1-mean(knn.pred==boston_t$crim01)
## [1] 0.07291667
knn.pred=knn(train.Boston,test.Boston,boston_tr$crim01,k=2)
table(knn.pred,boston_t$crim01)
##
## knn.pred 0 1
## 0 86 12
## 1 12 82
mean(knn.pred==boston_t$crim01)
## [1] 0.875
knn.pred=knn(train.Boston,test.Boston,boston_tr$crim01,k=3)
table(knn.pred,boston_t$crim01)
##
## knn.pred 0 1
## 0 91 10
## 1 7 84
mean(knn.pred==boston_t$crim01)
## [1] 0.9114583
knn.pred=knn(train.Boston,test.Boston,boston_tr$crim01,k=15)
table(knn.pred,boston_t$crim01)
##
## knn.pred 0 1
## 0 81 13
## 1 17 81
mean(knn.pred==boston_t$crim01)
## [1] 0.84375
Among the tested models, the KNN model with k = 1 achieved the highest classification accuracy of 92.71%, corresponding to a test error rate of 7.29%, outperforming logistic regression, LDA, and Naive Bayes on this subset of predictors.
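Rather than rerunning knn() by hand for each k, a short loop can tabulate test accuracy across several values; and since KNN is distance-based, it is also worth checking whether standardizing the predictors (not done above) changes the ranking. A minimal sketch:
# Test accuracy of KNN over several k, on standardized predictors
set.seed(1)
train.sc <- scale(train.Boston)
test.sc <- scale(test.Boston, center = attr(train.sc, "scaled:center"),
                 scale = attr(train.sc, "scaled:scale")) # reuse training centers/scales
for (k in c(1, 2, 3, 5, 10, 15)) {
  pred <- knn(train.sc, test.sc, boston_tr$crim01, k = k)
  cat("k =", k, "accuracy =", mean(pred == boston_t$crim01), "\n")
}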
library(ISLR)
##
## Attaching package: 'ISLR'
## The following objects are masked from 'package:ISLR2':
##
## Auto, Credit
head(Default)
## default student balance income
## 1 No No 729.5265 44361.625
## 2 No Yes 817.1804 12106.135
## 3 No No 1073.5492 31767.139
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
set.seed(312)
In this section, we fit a logistic regression model using the entire dataset to predict the likelihood of default based on the predictors income and balance.
def.glm = glm(default ~ income + balance, data = Default, family = "binomial")
summary(def.glm)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
To estimate the test error of the logistic regression model using the validation set approach, I follow these steps:
1. Split the dataset into a training set (75%) and a validation set (25%).
2. Fit a multiple logistic regression model on the training data, using income and balance as predictors of default.
3. Predict the default status for each observation in the validation set from its posterior probability of default, classifying it as a default if the predicted probability exceeds 0.5.
4. Compute the validation set error rate, defined as the proportion of validation observations that are misclassified.
train = sample(dim(Default)[1], 0.75*dim(Default)[1])
def.glm = glm(default ~ income + balance, data = Default, subset = train, family = "binomial")
prob.glm = predict(def.glm , Default[-train, ], type = "response")
pred.glm = rep("No", length(prob.glm))
pred.glm[prob.glm > 0.5] = "Yes"
mean(pred.glm != Default[-train, "default"])
## [1] 0.0284
For this train-test split, the validation set error was 0.0284, meaning that 2.84% of the observations in the validation set were misclassified by the model.
We now repeat the procedure from Part 2 three times, each time using a different random split of the data into training (75%) and validation (25%) sets. For each split, we:
1. Fit the logistic regression model using income and balance on the training data.
2. Predict default status on the corresponding validation set.
3. Compute the validation set error rate.
Comparing the three validation errors gives insight into the variability of the model's performance across different data splits.
train = sample(dim(Default)[1], 0.75*dim(Default)[1])
def.glm = glm(default ~ income + balance, data = Default, subset = train, family = "binomial")
prob.glm = predict(def.glm, Default[-train, ], type = "response")
pred.glm = rep("No", length(prob.glm))
pred.glm[prob.glm > 0.5] = "Yes"
mean(pred.glm != Default[-train, "default"])
## [1] 0.0268
train = sample(dim(Default)[1], 0.75*dim(Default)[1])
def.glm = glm(default ~ income + balance, data = Default, subset = train, family = "binomial")
prob.glm = predict(def.glm, Default[-train, ], type = "response")
pred.glm = rep("No", length(prob.glm))
pred.glm[prob.glm > 0.5] = "Yes"
mean(pred.glm != Default[-train, "default"])
## [1] 0.0256
train = sample(dim(Default)[1], 0.75*dim(Default)[1])
def.glm = glm(default ~ income + balance, data = Default, subset = train, family = "binomial")
prob.glm = predict(def.glm, Default[-train, ], type = "response")
pred.glm = rep("No", length(prob.glm))
pred.glm[prob.glm > 0.5] = "Yes"
mean(pred.glm != Default[-train, "default"])
## [1] 0.0252
1 - mean(Default[-train, "default"] == "No")
## [1] 0.0356
Using three additional 75-25 splits of the data, the validation set error remained fairly stable across runs. Including the result from Part 2, the average validation error was approximately 0.0265, with individual errors ranging from 0.0252 to 0.0284. These error rates are roughly 25% lower than the 0.0356 error (computed on the final split) of the naive baseline strategy of predicting that no one will default, highlighting the effectiveness of the logistic regression model.
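To quantify this split-to-split variability beyond a handful of runs, the split-fit-evaluate cycle can be wrapped in replicate(); a minimal sketch (100 repetitions and the object names are arbitrary choices):
# Distribution of the validation error over 100 random 75-25 splits
val.errors <- replicate(100, {
  train <- sample(dim(Default)[1], 0.75 * dim(Default)[1])
  fit <- glm(default ~ income + balance, data = Default, subset = train, family = "binomial")
  probs <- predict(fit, Default[-train, ], type = "response")
  preds <- ifelse(probs > 0.5, "Yes", "No")
  mean(preds != Default[-train, "default"])
})
c(mean = mean(val.errors), sd = sd(val.errors))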
Now, we extend the logistic regression model by including a dummy variable for student status along with income and balance as predictors of default. Using the validation set approach with a 75-25 train-test split, we estimate the test error rate of this updated model.
After fitting the model and evaluating it on the validation set, we compare the resulting error to the previous model (which used only income and balance). This allows us to assess whether adding the student variable improves predictive performance.
If the validation error decreases, it suggests that student status adds useful information and enhances model accuracy. Conversely, if the error remains the same or increases, it implies that student status may not contribute meaningfully to predicting default.
with.student = rep(0, 50)
without.student = rep(0, 50)
for (i in 1:50){
  train = sample(dim(Default)[1], 0.75 * dim(Default)[1])
  # Includes all predictors: income, balance, and student (dummy)
  with.student.fit = glm(default ~ ., data = Default, subset = train, family = "binomial")
  without.student.fit = glm(default ~ income + balance, data = Default, subset = train, family = "binomial")
  with.student.probs = predict(with.student.fit, Default[-train, ], type = "response")
  without.student.probs = predict(without.student.fit, Default[-train, ], type = "response")
  with.student.preds = rep("No", length(with.student.probs))
  without.student.preds = rep("No", length(without.student.probs))
  with.student.preds[with.student.probs > 0.5] = "Yes"
  without.student.preds[without.student.probs > 0.5] = "Yes"
  with.student[i] = mean(with.student.preds != Default[-train, "default"])
  without.student[i] = mean(without.student.preds != Default[-train, "default"])
}
difference = with.student - without.student
errors = data.frame(with.student, without.student, difference)
mean(errors$with.student)
## [1] 0.026664
mean(errors$without.student)
## [1] 0.026368
We performed 50 train-test splits using the validation set approach and compared the test error rates of two logistic regression models:
1. A model using only income and balance as predictors.
2. A model that includes income, balance, and a dummy variable for student status.
Across the 50 iterations, including the student variable did not reduce the average test error rate; in fact, it produced a slightly higher average error (0.02666 vs. 0.02637), suggesting that student status does not add meaningful predictive value in this context. Adding the variable may therefore slightly hurt performance by increasing model complexity without improving accuracy; the per-split differences can be examined directly, as sketched below.
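Because both error rates are computed on the same 50 splits, the per-split differences can be tested directly; a short sketch using the errors data frame built above:
# Per-split comparison of the two models' validation errors
summary(errors$difference) # distribution of (with student) - (without student)
t.test(errors$with.student, errors$without.student, paired = TRUE)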
# Load required packages
library(ISLR2)
library(MASS) # For lm.ridge
library(pls) # For PCR and PLS
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
# Set seed and split data
set.seed(1)
train_index <- sample(1:nrow(College), nrow(College) / 2)
train_data <- College[train_index, ]
test_data <- College[-train_index, ]
# Create model matrices for Ridge
x_train <- model.matrix(Apps ~ ., data = train_data)[, -1]
x_test <- model.matrix(Apps ~ ., data = test_data)[, -1]
y_train <- train_data$Apps
y_test <- test_data$Apps
I split the data into 50% training and 50% test sets. This is essential for evaluating model performance on unseen data.
lm_fit <- lm(Apps ~ ., data = train_data)
lm_pred <- predict(lm_fit, newdata = test_data)
lm_mse <- mean((lm_pred - y_test)^2)
This is a baseline model using simple least squares regression. It assumes a linear relationship between Apps and the other variables.
lambda_seq <- seq(0, 1000, length = 100)
ridge_mod <- lm.ridge(Apps ~ ., data = train_data, lambda = lambda_seq)
best_lambda <- lambda_seq[which.min(ridge_mod$GCV)]          # lambda minimizing GCV
ridge_coefs <- coef(ridge_mod)[which.min(ridge_mod$GCV), ]   # coefficients at that lambda
ridge_pred <- cbind(1, x_test) %*% ridge_coefs
ridge_mse <- mean((ridge_pred - y_test)^2)
Ridge regression adds a penalty to the least squares loss to shrink the coefficients and reduce overfitting. I select the penalty (lambda) by minimizing generalized cross-validation (GCV). Here GCV evidently selects lambda = 0, so the ridge fit reduces to ordinary least squares; this is why the ridge and linear regression test MSEs below coincide.
I could not fit the lasso because glmnet was not available. The lasso performs variable selection by shrinking some coefficients exactly to zero; without glmnet, there is no convenient way to fit it in base R.
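For completeness, here is a minimal sketch of how the lasso fit would look with cv.glmnet, assuming glmnet could be installed (untested in this environment):
# Lasso via glmnet (assumption: glmnet is installed); alpha = 1 selects the lasso penalty
library(glmnet)
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1)
lasso_pred <- predict(cv_lasso, s = "lambda.min", newx = x_test)
lasso_mse <- mean((lasso_pred - y_test)^2)
coef(cv_lasso, s = "lambda.min") # some coefficients shrunk exactly to zero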
pcr_fit <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
# Check validation plot
validationplot(pcr_fit, val.type = "MSEP")
# Choose best number of components (e.g., 10)
pcr_pred <- predict(pcr_fit, test_data, ncomp = 10)
pcr_mse <- mean((pcr_pred - y_test)^2)
PCR reduces dimensionality by transforming predictors into principal components. Only the top M components (chosen by cross-validation) are used.
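Rather than eyeballing the validation plot, the cross-validated MSEP can be extracted programmatically; a small sketch (the first entry of the vector is the zero-component, intercept-only model):
# Pick the number of components minimizing cross-validated MSEP
cv_msep <- MSEP(pcr_fit)$val["CV", 1, ]
best_ncomp <- which.min(cv_msep) - 1 # subtract 1 for the intercept-only entry
best_ncomp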
pls_fit <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
# Check validation plot
validationplot(pls_fit, val.type = "MSEP")
# Choose best number of components (e.g., 10)
pls_pred <- predict(pls_fit, test_data, ncomp = 10)
pls_mse <- mean((pls_pred - y_test)^2)
PLS, like PCR, reduces dimensionality — but it considers the response variable (Apps) when forming components, making it often more efficient than PCR.
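The pls package also provides selectNcomp(), which chooses a parsimonious number of components via, e.g., the one-sigma heuristic; a one-line sketch (available in recent versions of pls):
# Heuristic choice of ncomp: smallest model within one SE of the CV optimum
selectNcomp(pls_fit, method = "onesigma")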
cat("Test MSEs:\n")
## Test MSEs:
cat("Linear Regression:", lm_mse, "\n")
## Linear Regression: 1135758
cat("Ridge Regression:", ridge_mse, "\n")
## Ridge Regression: 1135758
cat("PCR:", pcr_mse, "\n")
## PCR: 1723100
cat("PLS:", pls_mse, "\n")
## PLS: 1131661
cat("Lasso: skipped (glmnet not available)\n")
## Lasso: skipped (glmnet not available)
Comparing all models on test set MSE (lower means better prediction accuracy), PLS attains the lowest test error (1,131,661), narrowly beating least squares and ridge (tied at 1,135,758), while PCR with 10 components performs markedly worse (1,723,100); the lasso was skipped.
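To make the comparison easier to scan, the test MSEs can be collected into a data frame and ranked; a quick sketch using the objects computed above:
# Tabulate and rank the test MSEs
results <- data.frame(model = c("Linear", "Ridge", "PCR", "PLS"),
                      test_mse = c(lm_mse, ridge_mse, pcr_mse, pls_mse))
results[order(results$test_mse), ]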
In finance, investors often want to predict whether the stock market will go Up or Down the next day. By analyzing past market returns and trading volume, I can try to estimate this direction using logistic regression, a model that calculates probabilities for binary outcomes.
Real-world use: This kind of model is used in algorithmic trading, market timing strategies, and quantitative investing, where decisions are made based on historical data patterns.
# Load library and data
library(ISLR)
data(Smarket)
# Explore the dataset
str(Smarket)
## 'data.frame': 1250 obs. of 9 variables:
## $ Year : num 2001 2001 2001 2001 2001 ...
## $ Lag1 : num 0.381 0.959 1.032 -0.623 0.614 ...
## $ Lag2 : num -0.192 0.381 0.959 1.032 -0.623 ...
## $ Lag3 : num -2.624 -0.192 0.381 0.959 1.032 ...
## $ Lag4 : num -1.055 -2.624 -0.192 0.381 0.959 ...
## $ Lag5 : num 5.01 -1.055 -2.624 -0.192 0.381 ...
## $ Volume : num 1.19 1.3 1.41 1.28 1.21 ...
## $ Today : num 0.959 1.032 -0.623 0.614 0.213 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 2 2 1 2 2 2 1 2 2 2 ...
summary(Smarket)
## Year Lag1 Lag2 Lag3
## Min. :2001 Min. :-4.922000 Min. :-4.922000 Min. :-4.922000
## 1st Qu.:2002 1st Qu.:-0.639500 1st Qu.:-0.639500 1st Qu.:-0.640000
## Median :2003 Median : 0.039000 Median : 0.039000 Median : 0.038500
## Mean :2003 Mean : 0.003834 Mean : 0.003919 Mean : 0.001716
## 3rd Qu.:2004 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.596750
## Max. :2005 Max. : 5.733000 Max. : 5.733000 Max. : 5.733000
## Lag4 Lag5 Volume Today
## Min. :-4.922000 Min. :-4.92200 Min. :0.3561 Min. :-4.922000
## 1st Qu.:-0.640000 1st Qu.:-0.64000 1st Qu.:1.2574 1st Qu.:-0.639500
## Median : 0.038500 Median : 0.03850 Median :1.4229 Median : 0.038500
## Mean : 0.001636 Mean : 0.00561 Mean :1.4783 Mean : 0.003138
## 3rd Qu.: 0.596750 3rd Qu.: 0.59700 3rd Qu.:1.6417 3rd Qu.: 0.596750
## Max. : 5.733000 Max. : 5.73300 Max. :3.1525 Max. : 5.733000
## Direction
## Down:602
## Up :648
##
##
##
##
I use the Smarket dataset from ISLR, which contains daily percentage returns for the S&P 500 stock index over 1,250 trading days from 2001 to 2005. The key variables are:
Lag1 to Lag5: Return from 1 to 5 days before
Volume: Trading volume
Direction: Whether the market went Up or Down that day
# Training set: all years before 2005
train_data <- subset(Smarket, Year < 2005)
test_data <- subset(Smarket, Year == 2005)
# Check size
nrow(train_data); nrow(test_data)
## [1] 998
## [1] 252
To test the model’s predictive ability, I simulate a real-life forecasting task:
Training set: Data from 2001 to 2004 (I “learn” here)
Test set: Data from 2005 (I “predict” here)
This mimics using historical data to forecast future outcomes.
# Fit logistic regression using Lag1 and Lag2
log_model <- glm(Direction ~ Lag1 + Lag2, data = train_data, family = "binomial")
summary(log_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.03222 0.06338 0.508 0.611
## Lag1 -0.05562 0.05171 -1.076 0.282
## Lag2 -0.04449 0.05166 -0.861 0.389
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1383.3 on 997 degrees of freedom
## Residual deviance: 1381.4 on 995 degrees of freedom
## AIC: 1387.4
##
## Number of Fisher Scoring iterations: 3
I’m building a model to estimate the probability that the market goes Up, based on Lag1 and Lag2 returns (returns from the last 2 days). The logistic regression gives us coefficients that represent the effect of these predictors on the log-odds of the market going Up.
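Because the coefficients are on the log-odds scale, exponentiating them yields odds ratios, which are often easier to interpret:
# Convert log-odds coefficients to odds ratios
exp(coef(log_model))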
# Predict probabilities on test data
pred_prob <- predict(log_model, newdata = test_data, type = "response")
# Classify using 0.5 cutoff
pred_class <- ifelse(pred_prob > 0.5, "Up", "Down")
pred_class <- as.factor(pred_class)
I use the model to calculate the probability of the market going Up on each trading day in 2005, then convert that probability into a classification:
If P(Up) > 0.5, I predict Up
Otherwise, I predict Down
This decision rule is used in real investment strategies to decide whether to buy, sell, or hold.
# Confusion matrix
actual <- test_data$Direction
conf_matrix <- table(Predicted = pred_class, Actual = actual)
print(conf_matrix)
## Actual
## Predicted Down Up
## Down 35 35
## Up 76 106
# Accuracy
accuracy <- mean(pred_class == actual)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 55.95 %"
I check how well the model performed by comparing predictions to the actual 2005 outcomes.
Confusion Matrix shows how many times the model predicted correctly vs. incorrectly.
Accuracy tells what % of predictions were correct — a key metric in finance when using predictive models for decision-making.
If the model predicts “Up” correctly more often than “Down”, it could be useful for short-term investment strategies.
Though logistic regression is simple, it reveals patterns in market behavior and helps in algorithmic trading or market timing strategies.
You can enhance this with more features, non-linear models (e.g., SVMs, trees), or use lagged returns of individual stocks.
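Before reading too much into the 55.95% accuracy, it is worth comparing it with the no-information baseline of always predicting the majority class; a one-line check on the 2005 test set:
# Baseline: accuracy of always predicting "Up" in 2005
mean(actual == "Up")
From the confusion matrix above, 141 of the 252 test days were Up and the model classified exactly 141 days correctly, so the always-Up baseline matches the model's 55.95% accuracy; the model does not beat the market's upward drift in 2005.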
What I learn:
Logistic regression provides a simple way to model how recent returns affect future market movements.
While its predictive performance here is limited, the method illustrates the foundations of quantitative finance and trading signals.
How it’s used in real life:
Hedge funds and trading firms use similar models, though more advanced, to forecast market behavior, develop signals, and automate trades.
Risk managers might use it to estimate downside risk.