# Read the CSV file into a data frame
# Assuming "Weekly.csv" is in the same directory as this R Markdown file
file_path <- "Weekly.csv"
weekly_data <- read.csv(file_path)

# View the first few rows of the data frame
head(weekly_data)
##   Year   Lag1   Lag2   Lag3   Lag4   Lag5    Volume  Today Direction
## 1 1990  0.816  1.572 -3.936 -0.229 -3.484 0.1549760 -0.270      Down
## 2 1990 -0.270  0.816  1.572 -3.936 -0.229 0.1485740 -2.576      Down
## 3 1990 -2.576 -0.270  0.816  1.572 -3.936 0.1598375  3.514        Up
## 4 1990  3.514 -2.576 -0.270  0.816  1.572 0.1616300  0.712        Up
## 5 1990  0.712  3.514 -2.576 -0.270  0.816 0.1537280  1.178        Up
## 6 1990  1.178  0.712  3.514 -2.576 -0.270 0.1544440 -1.372      Down
# Summary statistics for every variable
summary(weekly_data)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##   Direction        
##  Length:1089       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Frequency table for the categorical variable 'Direction'
table(weekly_data$Direction)
## 
## Down   Up 
##  484  605
par(mfrow=c(3,2)) # Setting up the plotting area to display multiple plots

hist(weekly_data$Lag1, main="Histogram of Lag1", xlab="Lag1")
hist(weekly_data$Lag2, main="Histogram of Lag2", xlab="Lag2")
hist(weekly_data$Lag3, main="Histogram of Lag3", xlab="Lag3")
hist(weekly_data$Lag4, main="Histogram of Lag4", xlab="Lag4")
hist(weekly_data$Lag5, main="Histogram of Lag5", xlab="Lag5")
hist(weekly_data$Volume, main="Histogram of Volume", xlab="Volume")

# Reset the plotting area to a single panel for the scatter plot
par(mfrow=c(1,1))
plot(weekly_data$Year, weekly_data$Volume, main="Volume Over Years", xlab="Year", ylab="Volume", pch=19)

Based on the graphical summaries, a few patterns stand out:

Volume Over Years: The scatter plot shows a general upward trend in trading volume. It starts relatively low in the early 1990s and rises sharply toward 2010. Plausible drivers include the growth of the financial markets, increased trading activity, the advent of online trading platforms, and broader economic trends.

Histograms of Lag Variables and Volume: The histograms for Lag1 through Lag5 are fairly symmetric around zero, with tails extending in both directions. This suggests that past weekly returns fluctuate around a mean of zero with no strong bias toward positive or negative returns. The histogram of Volume is right-skewed: low-volume weeks are common and very high-volume weeks are rare.

Summary Statistics: The numerical summary confirms the patterns in the histograms. The means of the Lag variables are close to zero, consistent with their symmetric distributions. Volume has a mean (1.57) well above its median (1.00), confirming the right skew: most weeks cluster at lower volumes, but enough high-volume weeks exist to pull the mean upward.

Direction Frequency: There are more 'Up' weeks (605) than 'Down' weeks (484), so over this sample period the market ended higher more often than it ended lower; the proportions are computed in the sketch below.
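
A quick way to express that balance as proportions, reusing the frequency table from above:

# Share of Up vs. Down weeks
prop.table(table(weekly_data$Direction))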
    
# Convert Direction to a factor with 'Down' as the reference level
weekly_data$Direction <- factor(weekly_data$Direction, levels = c("Down", "Up"))

# Perform logistic regression
logistic_model <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                      data = weekly_data, family = binomial)

# Print the summary of the logistic regression model
summary(logistic_model)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = weekly_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Among the predictors in the logistic regression model, only Lag2 appears to be statistically significant. Its p-value is 0.0296, below the conventional 0.05 level, indicating a statistically significant association between Lag2 and the direction of the market.

The intercept is also statistically significant, with a p-value of 0.0019.

The remaining predictors (Lag1, Lag3, Lag4, Lag5, and Volume) have p-values greater than 0.05, so there is no evidence of an association with market direction at that significance level. These p-values can also be extracted programmatically, as in the sketch below.
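
A minimal sketch pulling the p-values out of the fitted model rather than reading them off the printout:

# Extract the coefficient table and keep rows significant at the 0.05 level
coefs <- coef(summary(logistic_model))
coefs[coefs[, "Pr(>|z|)"] < 0.05, ]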

# Evaluate the fitted model on the full dataset it was trained on

# Get predicted probabilities
predicted_probs <- predict(logistic_model, type = "response")

# Convert probabilities to predicted classes
predicted_classes <- ifelse(predicted_probs > 0.5, "Up", "Down")

# Actual classes
actual_classes <- weekly_data$Direction

# Create the confusion matrix
confusion_matrix <- table(Predicted = predicted_classes, Actual = actual_classes)

# Print the confusion matrix
print(confusion_matrix)
##          Actual
## Predicted Down  Up
##      Down   54  48
##      Up    430 557
# Calculate the overall fraction of correct predictions
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)

# Print the accuracy
print(accuracy)
## [1] 0.5610652

From the confusion matrix, the accuracy is approximately 56.1%, meaning the model correctly predicts the market direction a little more than half the time.

However, the confusion matrix also reveals a pronounced imbalance in the kinds of mistakes the model makes. Treating 'Up' as the positive class:

The model predicts 'Up' far more often than 'Down', which shows up as a large number of false positives (FP): it predicted the market would go up 430 times when it actually went down.
The number of true negatives (TN = 54) is very low compared to the false positives (FP = 430), so when the market actually went down, the model usually failed to catch it.

This pattern suggests the model is biased toward predicting an 'Up' market. That could reflect the class imbalance in the data (more 'Up' weeks than 'Down' weeks), or the predictors may simply relate more strongly to the 'Up' outcome. The class-specific rates are computed in the sketch below.
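A sketch of those class-specific rates, computed from the confusion matrix above with 'Up' treated as the positive class:

# True positive rate (recall on 'Up') and true negative rate (recall on 'Down')
sensitivity <- confusion_matrix["Up", "Up"] / sum(confusion_matrix[, "Up"])
specificity <- confusion_matrix["Down", "Down"] / sum(confusion_matrix[, "Down"])
c(sensitivity = sensitivity, specificity = specificity)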

# Subset the data into training and test sets
training_data <- subset(weekly_data, Year <= 2008)
test_data <- subset(weekly_data, Year > 2008)

# Convert Direction to a factor if it's not already
training_data$Direction <- factor(training_data$Direction, levels = c("Down", "Up"))

# Fit the logistic regression model on the training data
logistic_model_lag2 <- glm(Direction ~ Lag2, data = training_data, family = binomial)

# Predict on the test data
predicted_probs_test <- predict(logistic_model_lag2, newdata = test_data, type = "response")
predicted_classes_test <- ifelse(predicted_probs_test > 0.5, "Up", "Down")

# Actual classes in the test data
actual_classes_test <- test_data$Direction

# Create the confusion matrix for the test data
confusion_matrix_test <- table(Predicted = predicted_classes_test, Actual = actual_classes_test)

# Print the confusion matrix for the test data
print(confusion_matrix_test)
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
# Calculate the overall fraction of correct predictions for the test data
accuracy_test <- sum(diag(confusion_matrix_test)) / sum(confusion_matrix_test)

# Print the accuracy for the test data
print(accuracy_test)
## [1] 0.625
# Load the MASS package for LDA
library(MASS)

# Subset the data into training and test sets
training_data <- subset(weekly_data, Year <= 2008)
test_data <- subset(weekly_data, Year > 2008)

# Convert Direction to a factor if it's not already
training_data$Direction <- factor(training_data$Direction, levels = c("Down", "Up"))

# Fit the LDA model on the training data using Lag2 as the predictor
lda_model_lag2 <- lda(Direction ~ Lag2, data = training_data)

# Predict on the test data
lda_predictions_test <- predict(lda_model_lag2, newdata = test_data)

# The class predictions are found in lda_predictions_test$class
predicted_classes_test <- lda_predictions_test$class

# Actual classes in the test data
actual_classes_test <- test_data$Direction

# Create the confusion matrix for the test data
confusion_matrix_test <- table(Predicted = predicted_classes_test, Actual = actual_classes_test)

# Print the confusion matrix for the test data
print(confusion_matrix_test)
##          Actual
## Predicted Down Up
##      Down    9  5
##      Up     34 56
# Calculate the overall fraction of correct predictions for the test data
accuracy_test <- sum(diag(confusion_matrix_test)) / sum(confusion_matrix_test)

# Print the accuracy for the test data
print(accuracy_test)
## [1] 0.625
# Load the MASS package for QDA
library(MASS)

# Subset the data into training and test sets
training_data <- subset(weekly_data, Year <= 2008)
test_data <- subset(weekly_data, Year > 2008)

# Convert Direction to a factor if it's not already
training_data$Direction <- factor(training_data$Direction, levels = c("Down", "Up"))

# Fit the QDA model on the training data using Lag2 as the predictor
qda_model_lag2 <- qda(Direction ~ Lag2, data = training_data)

# Predict on the test data
qda_predictions_test <- predict(qda_model_lag2, newdata = test_data)

# The class predictions are found in qda_predictions_test$class
predicted_classes_test <- qda_predictions_test$class

# Actual classes in the test data
actual_classes_test <- test_data$Direction

# Create the confusion matrix for the test data
confusion_matrix_test <- table(Predicted = predicted_classes_test, Actual = actual_classes_test)

# Print the confusion matrix for the test data
print(confusion_matrix_test)
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
# Calculate the overall fraction of correct predictions for the test data
accuracy_test <- sum(diag(confusion_matrix_test)) / sum(confusion_matrix_test)

# Print the accuracy for the test data
print(accuracy_test)
## [1] 0.5865385
# Load the e1071 package for Naive Bayes
library(e1071)

# Subset the data into training and test sets
training_data <- subset(weekly_data, Year <= 2008)
test_data <- subset(weekly_data, Year > 2008)

# Prepare the predictor and response for training and test sets
# Note that Naive Bayes in R via e1071 works with data frames for predictors
train_data <- data.frame(Lag2 = training_data$Lag2)
train_direction <- training_data$Direction
test_data_frame <- data.frame(Lag2 = test_data$Lag2)
test_direction <- test_data$Direction

# Fit the Naive Bayes model on the training data
naive_bayes_model <- naiveBayes(train_data, as.factor(train_direction))

# Predict on the test data
nb_predictions <- predict(naive_bayes_model, test_data_frame)

# Create the confusion matrix for the test data
confusion_matrix_test <- table(Predicted = nb_predictions, Actual = test_direction)

# Print the confusion matrix for the test data
print(confusion_matrix_test)
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
# Calculate the overall fraction of correct predictions for the test data
accuracy_test <- sum(diag(confusion_matrix_test)) / sum(confusion_matrix_test)

# Print the accuracy for the test data
print(accuracy_test)
## [1] 0.5865385
Logistic regression (with Lag2 as the sole predictor) and LDA both achieved a test accuracy of 0.625. QDA and Naive Bayes both achieved an accuracy of approximately 0.587, in each case by predicting 'Up' for every test week.

Based on these accuracies, logistic regression and LDA performed the best on the 2009-2010 hold-out data; the comparison is collected in the sketch below.
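
A minimal sketch gathering the hold-out accuracies into one comparison (values transcribed from the outputs above):

# Hold-out accuracies for the Lag2-only models, 2009-2010 weeks
test_accuracies <- c(Logistic = 0.625, LDA = 0.625, QDA = 0.5865, NaiveBayes = 0.5865)
sort(test_accuracies, decreasing = TRUE)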

# Logistic regression with an interaction term between Lag2 and Lag3
log_model <- glm(Direction ~ Lag2 + Lag3 + I(Lag2*Lag3), data = weekly_data, family = binomial(link = "logit"))
summary(log_model)
## 
## Call:
## glm(formula = Direction ~ Lag2 + Lag3 + I(Lag2 * Lag3), family = binomial(link = "logit"), 
##     data = weekly_data)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     0.217725   0.061407   3.546 0.000392 ***
## Lag2            0.064347   0.027907   2.306 0.021124 *  
## Lag3           -0.015640   0.026167  -0.598 0.550045    
## I(Lag2 * Lag3)  0.001897   0.006585   0.288 0.773289    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1490.0  on 1085  degrees of freedom
## AIC: 1498
## 
## Number of Fisher Scoring iterations: 4
library(MASS)
lda_model <- lda(Direction ~ Lag2 + Lag4, data = weekly_data)
# Note: summary() on an lda object only lists its components;
# print(lda_model) would show the priors, group means, and coefficients
summary(lda_model)
##         Length Class  Mode     
## prior   2      -none- numeric  
## counts  2      -none- numeric  
## means   4      -none- numeric  
## scaling 2      -none- numeric  
## lev     2      -none- character
## svd     1      -none- numeric  
## N       1      -none- numeric  
## call    3      -none- call     
## terms   3      terms  call     
## xlevels 0      -none- list
# sqrt() is undefined for negative Lag2 values, producing the NaN warning
# below; qda() then drops the affected rows via na.action
qda_model <- qda(Direction ~ sqrt(Lag2), data = weekly_data)
## Warning in sqrt(Lag2): NaNs produced
summary(qda_model)
##           Length Class  Mode     
## prior       2    -none- numeric  
## counts      2    -none- numeric  
## means       2    -none- numeric  
## scaling     2    -none- numeric  
## ldet        2    -none- numeric  
## lev         2    -none- character
## N           1    -none- numeric  
## call        3    -none- call     
## terms       3    terms  call     
## xlevels     0    -none- list     
## na.action 484    omit   numeric
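Since weekly returns are frequently negative, sqrt(Lag2) is undefined for those rows, which explains the NaN warning and the na.action entry (484 rows dropped) above. A transformation defined on the whole real line avoids discarding data; a sketch using the absolute value (qda_model_abs is an illustrative name):

# A transformation that is defined for negative returns as well
qda_model_abs <- qda(Direction ~ sqrt(abs(Lag2)), data = weekly_data)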
library(class)
library(caret) # For confusionMatrix()
## Loading required package: ggplot2
## Loading required package: lattice
# Standardize predictors
standardized_data <- scale(weekly_data[, c("Lag2", "Lag3")]) # example with Lag2 and Lag3
train_index <- which(weekly_data$Year <= 2008)
test_index <- which(weekly_data$Year > 2008)

# Subset the standardized data
train_data <- standardized_data[train_index, ]
test_data <- standardized_data[test_index, ]

# Train KNN with K=3 for example
set.seed(123) # For reproducibility
knn_pred <- knn(train = train_data, test = test_data, cl = weekly_data$Direction[train_index], k = 3)

# Confusion Matrix
confusionMatrix(factor(knn_pred), factor(weekly_data$Direction[test_index]))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down   16 20
##       Up     27 41
##                                           
##                Accuracy : 0.5481          
##                  95% CI : (0.4474, 0.6459)
##     No Information Rate : 0.5865          
##     P-Value [Acc > NIR] : 0.8152          
##                                           
##                   Kappa : 0.0453          
##                                           
##  Mcnemar's Test P-Value : 0.3815          
##                                           
##             Sensitivity : 0.3721          
##             Specificity : 0.6721          
##          Pos Pred Value : 0.4444          
##          Neg Pred Value : 0.6029          
##              Prevalence : 0.4135          
##          Detection Rate : 0.1538          
##    Detection Prevalence : 0.3462          
##       Balanced Accuracy : 0.5221          
##                                           
##        'Positive' Class : Down            
## 
library(e1071)
# Note: naiveBayes() does not evaluate I() terms the way glm() does,
# which triggers the type-mismatch warning at predict time below
nb_model <- naiveBayes(Direction ~ Lag2 + I(Lag2^2), data = weekly_data)
nb_pred <- predict(nb_model, weekly_data[weekly_data$Year > 2008, ])
## Warning in predict.naiveBayes(nb_model, weekly_data[weekly_data$Year > 2008, :
## Type mismatch between training and new data for variable 'I(Lag2^2)'. Did you
## use factors with numeric labels for training, and numeric values for new data?
table(Predicted = nb_pred, Actual = weekly_data$Direction[weekly_data$Year > 2008])
##          Actual
## Predicted Down Up
##      Down    0  0
##      Up     43 61
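The warning arises because the transformed I(Lag2^2) column differs between training and prediction. Precomputing the squared term as an ordinary column sidesteps the problem; a sketch, where weekly_nb and Lag2sq are illustrative names:

# Precompute the quadratic term so naiveBayes sees a plain numeric column
weekly_nb <- weekly_data
weekly_nb$Lag2sq <- weekly_nb$Lag2^2
nb_model2 <- naiveBayes(Direction ~ Lag2 + Lag2sq, data = weekly_nb)
nb_pred2  <- predict(nb_model2, weekly_nb[weekly_nb$Year > 2008, ])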
# Recompute the confusion matrix for the original full-data logistic model
# (predicted_classes and actual_classes were defined earlier)
confusion_matrix <- table(Predicted = predicted_classes, Actual = actual_classes)
print(confusion_matrix)
##          Actual
## Predicted Down  Up
##      Down   54  48
##      Up    430 557
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.561065197428834"

Based on the information provided, the original logistic regression model offers the best accuracy (about 0.561) among the models with detailed outcomes here, compared with 0.548 for the KNN model with K = 3. Note, however, that the 0.561 figure is accuracy on the full dataset the model was fit to, while the KNN figure was measured on the 2009-2010 hold-out weeks, so the two are not directly comparable; a fairer comparison would score every model on the hold-out period, as in the sketch below.
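
A sketch of that fairer comparison, scoring the all-predictor logistic model on the 2009-2010 hold-out weeks. (Since this model was fit on all weeks, even this estimate is optimistic; refitting on the training years alone would be cleaner. holdout is an illustrative name.)

# Evaluate the full logistic model on the hold-out period
holdout <- weekly_data[weekly_data$Year > 2008, ]
holdout_probs <- predict(logistic_model, newdata = holdout, type = "response")
holdout_classes <- ifelse(holdout_probs > 0.5, "Up", "Down")
mean(holdout_classes == holdout$Direction)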

# Define the file path
file_path <- "C:/Users/ngaku/Downloads/Auto(2).csv"

# Use read.csv() to load the file into R
auto_data <- read.csv(file_path)

# Check the first few rows to confirm it's loaded correctly
head(auto_data)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500
# Calculate the median of mpg
median_mpg <- median(auto_data$mpg, na.rm = TRUE)

# Create the binary variable mpg01
auto_data$mpg01 <- ifelse(auto_data$mpg > median_mpg, 1, 0)

# Check the first few rows to confirm the new variable is added correctly
head(auto_data)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name mpg01
## 1 chevrolet chevelle malibu     0
## 2         buick skylark 320     0
## 3        plymouth satellite     0
## 4             amc rebel sst     0
## 5               ford torino     0
## 6          ford galaxie 500     0
library(ggplot2)

# Ensure horsepower is numeric (non-numeric placeholders in the raw file
# become NA, hence the coercion warning below)
auto_data$horsepower <- as.numeric(as.character(auto_data$horsepower))
## Warning: NAs introduced by coercion
# Plot Displacement vs mpg01
ggplot(auto_data, aes(x = displacement, y = mpg01, color = factor(mpg01))) +
  geom_jitter(alpha = 0.5) +
  labs(title = "Displacement vs. mpg01", x = "Displacement", y = "mpg01 (Above/Below Median)") +
  theme_minimal()

# Plot Horsepower vs mpg01
ggplot(auto_data, aes(x = horsepower, y = mpg01, color = factor(mpg01))) +
  geom_jitter(alpha = 0.5) +
  labs(title = "Horsepower vs. mpg01", x = "Horsepower", y = "mpg01 (Above/Below Median)") +
  theme_minimal()
## Warning: Removed 5 rows containing missing values (`geom_point()`).

# Plot Weight vs mpg01
ggplot(auto_data, aes(x = weight, y = mpg01, color = factor(mpg01))) +
  geom_jitter(alpha = 0.5) +
  labs(title = "Weight vs. mpg01", x = "Weight", y = "mpg01 (Above/Below Median)") +
  theme_minimal()

# Plot Acceleration vs mpg01
ggplot(auto_data, aes(x = acceleration, y = mpg01, color = factor(mpg01))) +
  geom_jitter(alpha = 0.5) +
  labs(title = "Acceleration vs. mpg01", x = "Acceleration", y = "mpg01 (Above/Below Median)") +
  theme_minimal()

# Plot Cylinders vs mpg01
ggplot(auto_data, aes(x = factor(cylinders), y = mpg01, fill = factor(mpg01))) +
  geom_boxplot() +
  labs(title = "Cylinders vs. mpg01", x = "Cylinders", y = "mpg01 (Above/Below Median)") +
  theme_minimal()

# Plot Origin vs mpg01
ggplot(auto_data, aes(x = factor(origin), y = mpg01, fill = factor(mpg01))) +
  geom_boxplot() +
  labs(title = "Origin vs. mpg01", x = "Origin", y = "mpg01 (Above/Below Median)") +
  theme_minimal()

Overall, displacement, horsepower, and weight show a clear pattern of association with mpg01 and are likely to be strong predictors of fuel efficiency. Acceleration does not show as distinct a separation between mpg01 values, suggesting it may be a weaker predictor. These findings indicate that features tied to the size and power of a vehicle are inversely related to fuel efficiency: larger, more powerful, heavier vehicles tend to have lower miles per gallon, while smaller, less powerful, lighter vehicles tend to have higher. A numerical cross-check follows in the sketch below.
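
These visual impressions can be cross-checked by correlating each candidate predictor with mpg01 (a quick sketch; use = "complete.obs" handles the NAs introduced when horsepower was coerced to numeric):

# Correlation of each numeric predictor with the binary mpg01 indicator
num_vars <- c("cylinders", "displacement", "horsepower", "weight", "acceleration")
cor(auto_data[num_vars], auto_data$mpg01, use = "complete.obs")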

# Set the seed for reproducibility
set.seed(123)

# Decide on a split percentage, e.g., 70% for training and 30% for testing
split_percentage <- 0.7

# Calculate the index that splits the data
split_index <- sample(seq_len(nrow(auto_data)), size = floor(split_percentage * nrow(auto_data)))

# Create the training set
training_set <- auto_data[split_index, ]

# Create the test set
test_set <- auto_data[-split_index, ]
# Impute missing horsepower values with the training-set median
# (the same training median is applied to the test set to avoid leakage)
median_horsepower <- median(training_set$horsepower, na.rm = TRUE)
training_set$horsepower[is.na(training_set$horsepower)] <- median_horsepower
test_set$horsepower[is.na(test_set$horsepower)] <- median_horsepower

# Now fit the LDA model and predict as before
# ...
# Assuming displacement, horsepower, and weight were the variables most associated with mpg01
# And assuming training_set and test_set are already created

# Load the MASS package for LDA
library(MASS)

# Fit the LDA model on the training data
lda_model <- lda(mpg01 ~ displacement + horsepower + weight, data = training_set)

# Predict on the test data
lda_predictions <- predict(lda_model, newdata = test_set)

# The class predictions are found in lda_predictions$class
predicted_classes <- lda_predictions$class

# Actual classes in the test data
actual_classes <- test_set$mpg01

# Create the confusion matrix for the test data
confusion_matrix <- table(Predicted = predicted_classes, Actual = actual_classes)

# Calculate the test error
test_error <- mean(predicted_classes != actual_classes)

# Print the test error
print(test_error)
## [1] 0.1416667
# Load the MASS package for QDA
library(MASS)

# Assuming displacement, horsepower, and weight are the variables most associated with mpg01
# Make sure to handle any NA values before this step, as shown in previous examples

# Fit the QDA model on the training data
qda_model <- qda(mpg01 ~ displacement + horsepower + weight, data = training_set)

# Predict on the test data
qda_predictions <- predict(qda_model, newdata = test_set)

# The class predictions are found in qda_predictions$class
predicted_classes <- qda_predictions$class

# Actual classes in the test data
actual_classes <- test_set$mpg01

# Create the confusion matrix for the test data
confusion_matrix <- table(Predicted = predicted_classes, Actual = actual_classes)

# Calculate the test error
test_error <- mean(predicted_classes != actual_classes)

# Print the test error
print(test_error)
## [1] 0.1083333
# Fit the logistic regression model on the training data
log_model <- glm(mpg01 ~ displacement + horsepower + weight, data = training_set, family = binomial)

# Predict on the test data
log_predictions <- predict(log_model, newdata = test_set, type = "response")

# Convert probabilities to binary class predictions using 0.5 as the threshold
predicted_classes <- ifelse(log_predictions > 0.5, 1, 0)

# Actual classes in the test data
actual_classes <- test_set$mpg01

# Compute the test error
test_error <- mean(predicted_classes != actual_classes)

# Print the test error
print(test_error)
## [1] 0.1083333
# Load the e1071 package for Naive Bayes
library(e1071)

# Assuming displacement, horsepower, and weight are the variables most associated with mpg01
# Ensure you have handled missing values before proceeding

# Fit the Naive Bayes model on the training data
nb_model <- naiveBayes(mpg01 ~ displacement + horsepower + weight, data = training_set)

# Predict on the test data
nb_predictions <- predict(nb_model, newdata = test_set)

# Actual classes in the test data
actual_classes <- test_set$mpg01

# Compute the test error
test_error <- mean(nb_predictions != actual_classes)

# Print the test error
print(test_error)
## [1] 0.1166667
# Load the class package for KNN
library(class)

# Standardize the predictors for training and test data
# (each set is scaled with its own statistics here; a stricter variant
#  reusing the training-set parameters is sketched below)
predictors <- c("displacement", "horsepower", "weight")
training_set[predictors] <- scale(training_set[predictors])
test_set[predictors] <- scale(test_set[predictors])
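
# A stricter scaling pattern (sketch): derive the centers and SDs from the
# training set and reuse them for the test set, so no test-set information
# leaks into preprocessing. train_X / test_X are illustrative names; this
# assumes the original, unscaled columns as input (above, the columns were
# overwritten in place).
train_X <- scale(training_set[predictors])
test_X  <- scale(test_set[predictors],
                 center = attr(train_X, "scaled:center"),
                 scale  = attr(train_X, "scaled:scale"))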

# A vector to store errors for different values of K
k_values <- c(1, 3, 5, 7, 9, 11, 13, 15)
errors <- numeric(length(k_values))

# Loop over several values of K
for (i in seq_along(k_values)) {
  set.seed(123) # For reproducibility
  k <- k_values[i]

  # Perform KNN
  knn_pred <- knn(train = training_set[predictors], test = test_set[predictors],
                  cl = training_set$mpg01, k = k)

  # Compute the test error
  errors[i] <- mean(knn_pred != test_set$mpg01)
}

# Output the errors for each K
errors
## [1] 0.1333333 0.1083333 0.1000000 0.1000000 0.1083333 0.1083333 0.1083333
## [8] 0.1166667
# Determine the K with the lowest test error
best_k <- k_values[which.min(errors)]
best_error <- min(errors)

# Print the best K and its error
print(paste("Best K:", best_k, "with test error:", best_error))
## [1] "Best K: 5 with test error: 0.1"
# Define the file path
file_path <- "C:/Users/ngaku/Downloads/Boston(1).csv"

# Read the CSV file into an R data frame
boston_data <- read.csv(file_path)

# Check the first few rows to confirm it's loaded correctly
head(boston_data)
##   X    crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# Calculate the median crime rate
median_crim <- median(boston_data$crim)

# Create a binary response variable: 1 if 'crim' is above the median, 0 otherwise
boston_data$crim_above_median <- ifelse(boston_data$crim > median_crim, 1, 0)
set.seed(123) # For reproducibility
train_indices <- sample(1:nrow(boston_data), 0.7 * nrow(boston_data))
train_data <- boston_data[train_indices, ]
test_data <- boston_data[-train_indices, ]
# Example using 'rm' and 'lstat' as predictors
log_model <- glm(crim_above_median ~ rm + lstat, data = train_data, family = "binomial")
summary(log_model)
## 
## Call:
## glm(formula = crim_above_median ~ rm + lstat, family = "binomial", 
##     data = train_data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.89017    1.77687  -4.440 8.98e-06 ***
## rm           0.81049    0.24359   3.327 0.000877 ***
## lstat        0.23166    0.02978   7.779 7.33e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 490.74  on 353  degrees of freedom
## Residual deviance: 387.89  on 351  degrees of freedom
## AIC: 393.89
## 
## Number of Fisher Scoring iterations: 4
library(MASS)
lda_model <- lda(crim_above_median ~ rm + lstat, data = train_data)
summary(lda_model)
##         Length Class  Mode     
## prior   2      -none- numeric  
## counts  2      -none- numeric  
## means   4      -none- numeric  
## scaling 2      -none- numeric  
## lev     2      -none- character
## svd     1      -none- numeric  
## N       1      -none- numeric  
## call    3      -none- call     
## terms   3      terms  call     
## xlevels 0      -none- list
library(e1071)
nb_model <- naiveBayes(crim_above_median ~ rm + lstat, data = train_data)
summary(nb_model)
##           Length Class  Mode     
## apriori   2      table  numeric  
## tables    2      -none- list     
## levels    2      -none- character
## isnumeric 2      -none- logical  
## call      4      -none- call
library(class)
# KNN with k = 5 on the raw rm and lstat values; KNN is distance-based,
# so a scaled variant is sketched below
knn_pred <- knn(train = train_data[, c("rm", "lstat")], test = test_data[, c("rm", "lstat")], 
                cl = train_data$crim_above_median, k = 5)
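
# Scaled KNN variant (sketch): unscaled predictors let the variable with the
# larger spread dominate the distance calculation. Illustrative names; the
# training-set centers and SDs are reused for the test set.
train_X <- scale(train_data[, c("rm", "lstat")])
test_X  <- scale(test_data[, c("rm", "lstat")],
                 center = attr(train_X, "scaled:center"),
                 scale  = attr(train_X, "scaled:scale"))
knn_pred_scaled <- knn(train = train_X, test = test_X,
                       cl = train_data$crim_above_median, k = 5)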
# Predict on test data
log_pred_prob <- predict(log_model, newdata = test_data, type = "response")
log_pred_class <- ifelse(log_pred_prob > 0.5, 1, 0)

# Calculate accuracy
mean(log_pred_class == test_data$crim_above_median)
## [1] 0.7302632
# Predict on test data
lda_pred <- predict(lda_model, newdata = test_data)
lda_pred_class <- lda_pred$class

# Calculate accuracy
lda_accuracy <- mean(lda_pred_class == test_data$crim_above_median)
print(paste("LDA Accuracy:", lda_accuracy))
## [1] "LDA Accuracy: 0.717105263157895"
# Predict on the test data; predict() on a naiveBayes object
# returns the predicted classes directly
nb_pred_class <- predict(nb_model, newdata = test_data)

# Calculate accuracy
nb_accuracy <- mean(nb_pred_class == test_data$crim_above_median)
print(paste("Naive Bayes Accuracy:", nb_accuracy))
## [1] "Naive Bayes Accuracy: 0.677631578947368"
# Convert factors to numeric if necessary
knn_pred_class <- as.numeric(as.character(knn_pred))

# Calculate accuracy
knn_accuracy <- mean(knn_pred_class == test_data$crim_above_median)
print(paste("KNN Accuracy:", knn_accuracy))
## [1] "KNN Accuracy: 0.690789473684211"

The results suggest that, for classifying Boston suburbs as above or below the median crime rate using rm and lstat, logistic regression and LDA are more effective than KNN and Naive Bayes. This is consistent with both top performers modeling a decision boundary that is linear in the predictors: logistic regression directly, through the log-odds, and LDA via its equal-covariance Gaussian assumptions. The four accuracies are collected in the sketch below.
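
A minimal sketch collecting the four hold-out accuracies (values transcribed from the outputs above):

# Test accuracies on the Boston hold-out set
boston_accuracies <- c(Logistic = 0.7303, LDA = 0.7171, KNN_k5 = 0.6908, NaiveBayes = 0.6776)
sort(boston_accuracies, decreasing = TRUE)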