getwd()
[1] "/Users/joshuachang/Desktop/Stats 432 R_project/automobile"
setwd("/Users/joshuachang/Desktop/Stats 432 R_project/automobile")
column_names <- c("symboling", "normalized_losses", "make", "fuel_type", "aspiration",
"num_doors", "body_style", "drive_wheels", "engine_location",
"wheel_base", "length", "width", "height", "curb_weight", "engine_type",
"num_cylinders", "engine_size", "fuel_system", "bore", "stroke",
"compression_ratio", "horsepower", "peak_rpm", "city_mpg",
"highway_mpg", "price")
import_85 = read.csv("imports-85.data", header = FALSE, col.names = column_names, na.strings = "?")
import_85 = na.omit(import_85)
# # EPA weighted average: 55% city, 45% highway
import_85$combined_mpg <- 0.55 * import_85$city_mpg + 0.45 * import_85$highway_mpg
library(leaps)
library(ISLR2)
library(glmnet)
library(corrplot)
library(rpart)
library(rpart.plot)
library(caret)
library(randomForest)
library(ggplot2)
library(gridExtra)
library(class)
library(rmarkdown)
# Data Dimensional Analysis
head(import_85)
dim(import_85)
[1] 159 27
names(import_85)
[1] "symboling" "normalized_losses" "make" "fuel_type" "aspiration"
[6] "num_doors" "body_style" "drive_wheels" "engine_location" "wheel_base"
[11] "length" "width" "height" "curb_weight" "engine_type"
[16] "num_cylinders" "engine_size" "fuel_system" "bore" "stroke"
[21] "compression_ratio" "horsepower" "peak_rpm" "city_mpg" "highway_mpg"
[26] "price" "combined_mpg"
#This objective involves identifying hidden patterns and non-obvious relationships within the dataset. While some correlations (e.g., higher horsepower leading to higher price) are expected, we will use correlation matrices and visualization tools to discover less intuitive associations. All variables—numerical and categorical—may serve as features or responses depending on the analysis.
# Select only numeric variables
numeric_vars = import_85[, sapply(import_85, is.numeric)]
# Correlation matrix
cor_matrix <- cor(numeric_vars)
# Heat-map (red)
corrplot::corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.7)
par(mfrow = c(3, 1))
hist(import_85$price, main = "Price Distribution", col = "skyblue", xlab = "Price", breaks = 20)
hist(import_85$combined_mpg, main = "Combined MPG Distribution", col = "orange", xlab = "Combined MPG", breaks = 15)
hist(import_85$horsepower, main = "Horsepower Distribution", col = "lightgreen", xlab = "Horsepower")
par(mfrow = c(3, 1))
barplot(table(import_85$make), las = 2, cex.names = 0.7, main = "Car Make Frequency", col = "gray")
barplot(table(import_85$fuel_type), main = "Fuel Type Frequency", col = "lightblue")
barplot(table(import_85$drive_wheels), main = "Drive Wheel Configuration", col = "lightgreen")
library(ggplot2)
ggplot(import_85, aes(x = combined_mpg, y = price, color = fuel_type)) +
geom_point() +
labs(title = "Combined MPG vs. Price by Fuel Type", x = "Combined MPG", y = "Price")
# The goal is to predict vehicle price using variable selection methods such as Ridge Regression, Lasso,
# and Best Subset Selection. These techniques will help identify the most informative predictors with minimal redundancy.
# By splitting the dataset into training and test sets, we can evaluate the model's predictive performance.
# The response variable will be price, and all other variables—both numerical and categorical (encoded as dummy variables)—will serve as features.
# Create numeric_data by selecting numeric columns from import_85
numeric_data <- import_85[, sapply(import_85, is.numeric)]
numeric_data <- numeric_data[, c("price", setdiff(names(numeric_data), "price"))]
head(numeric_data)
set.seed(123) # For reproducibility
train_index <- sample(1:nrow(numeric_data), 0.7 * nrow(numeric_data))
train_data <- numeric_data[train_index, ]
test_data <- numeric_data[-train_index, ]
x_train <- as.matrix(train_data[, -1]) # predictors
y_train <- train_data[, 1] # price
x_test <- as.matrix(test_data[, -1])
y_test <- test_data[, 1]
# Ridge regression (alpha = 0)
ridge_mod <- cv.glmnet(x_train, y_train, alpha = 0)
ridge_best_lambda <- ridge_mod$lambda.min
ridge_pred <- predict(ridge_mod, s = ridge_best_lambda, newx = x_test)
ridge_mse <- mean((ridge_pred - y_test)^2)
# Lasso regression (alpha = 1)
lasso_mod <- cv.glmnet(x_train, y_train, alpha = 1)
lasso_best_lambda <- lasso_mod$lambda.min
lasso_pred <- predict(lasso_mod, s = lasso_best_lambda, newx = x_test)
lasso_mse <- mean((lasso_pred - y_test)^2)
library(leaps)
regfit.full <- regsubsets(price ~ ., data = train_data, nvmax = 10, really.big = TRUE)
Warning: 1 linear dependencies found
predict_regsubsets <- function(object, newdata, id) {
form <- as.formula(object$call[[2]])
mat <- model.matrix(form, newdata)
coefi <- coef(object, id = id)
xvars <- names(coefi)
mat[, xvars] %*% coefi
}
# Get the actual number of models stored
num_models <- nrow(summary(regfit.full)$which)
# Re-run validation loop with correct limit
val_errors <- sapply(1:num_models, function(i) {
pred <- predict_regsubsets(regfit.full, test_data, i)
mean((test_data$price - pred)^2)
})
# Select best
best_subset_mse <- min(val_errors)
best_subset_size <- which.min(val_errors)
best_subset_mse = min(val_errors)
best_subset_size = which.min(val_errors)
cat("Best Subset MSE (size =", best_subset_size, "):", round(best_subset_mse, 2), "\n")
Best Subset MSE (size = 4 ): 8823901
cat("Ridge Regression Test MSE:", round(ridge_mse, 2), "\n")
Ridge Regression Test MSE: 9984899
cat("Lasso Regression Test MSE:", round(lasso_mse, 2), "\n")
Lasso Regression Test MSE: 9001026
We applied Ridge and Lasso regression methods to predict vehicle price using a dataset with numeric and dummy variables. Ridge regression shrinks all coefficients, which helps when predictors are highly correlated. Lasso regression also performs shrinkage but can set some coefficients to zero, thus doing variable selection. By comparing the test Mean Squared Errors (MSE), we selected the model with the lowest MSE. We therefore select Best Subset Selection as the optimal regression method due to its superior predictive accuracy in this case.
coef(regfit.full, id = 4)
(Intercept) normalized_losses width curb_weight horsepower
-47278.22053 15.64827 547.83856 7.39948 28.54066
The best subset regression model selected 4 predictors: normalized losses, width, curb weight, and horsepower. These variables collectively yielded the lowest test Mean Squared Error (MSE), indicating strong predictive power for vehicle price. The inclusion of normalized losses likely reflects risk-based pricing adjustments, while width and curb weight capture vehicle size and mass — features typically associated with luxury or performance. Horsepower, a direct indicator of engine strength, is also positively associated with price. Together, these predictors form a parsimonious yet effective model for estimating vehicle prices using numeric data only.
# Subset the numeric data to relevant variables
vars <- c("price", "normalized_losses", "width", "curb_weight", "horsepower")
subset_data <- import_85[, vars]
# Remove rows with missing values
subset_data <- na.omit(subset_data)
# Correlation matrix
cor_matrix <- cor(subset_data)
print(round(cor_matrix, 2))
price normalized_losses width curb_weight horsepower
price 1.00 0.20 0.84 0.89 0.76
normalized_losses 0.20 1.00 0.11 0.13 0.29
width 0.84 0.11 1.00 0.87 0.68
curb_weight 0.89 0.13 0.87 1.00 0.79
horsepower 0.76 0.29 0.68 0.79 1.00
#This task aims to explore which vehicle specifications contribute most to fuel efficiency (measured in miles per gallon). We will treat city_mpg and highway_mpg as separate response variables and also consider a combined efficiency metric. The focus will be on engineering-related features such as compression_ratio, num_cylinders, and engine_size. The outcome of this analysis can be applied to real-world vehicle purchasing decisions, especially for those seeking fuel-efficient cars.
# We only focus on engineering-related variables
engineering_vars = c("compression_ratio", "engine_size", "num_cylinders","horsepower", "curb_weight", "bore", "stroke")
# Convert num_cylinders from factor (e.g., "four", "six") to numeric (e.g., 4, 6)
cyl_map <- c("two"=2, "three"=3, "four"=4, "five"=5, "six"=6, "eight"=8, "twelve"=12)
import_85$num_cylinders <- cyl_map[as.character(import_85$num_cylinders)]
engineering_data = import_85[, c("combined_mpg", engineering_vars)]
engineering_data = na.omit(engineering_data)
set.seed(432)
# K-fold cross-validation
k = 10
folds = sample(1:k, nrow(engineering_data), replace = TRUE)
cv_errors = matrix(NA, k, 7)
for (j in 1:k) {
regfit_cv = regsubsets(combined_mpg ~ ., data = engineering_data[folds != j, ], nvmax = 7)
for (i in 1:7) {
pred = predict_regsubsets(regfit_cv, engineering_data[folds == j, ], id = i)
y_true = engineering_data$combined_mpg[folds == j]
cv_errors[j, i] = mean((y_true - pred)^2)
}
}
mean_cv_errors = colMeans(cv_errors)
plot(1:7, mean_cv_errors, type = "b", xlab = "Number of Variables", ylab = "CV MSE")
points(which.min(mean_cv_errors), mean_cv_errors[which.min(mean_cv_errors)], col = "blue", pch = 19)
best_cv_size = which.min(mean_cv_errors)
set.seed(432)
# Fit model for combined MPG
regfit_fwd = regsubsets(combined_mpg ~ ., data = engineering_data,
nvmax = 7, method = "forward")
regfit_bwd = regsubsets(combined_mpg ~ ., data = engineering_data,
nvmax =7, method = "backward")
coef(regfit_fwd, best_cv_size) # Forward stepwise
(Intercept) compression_ratio engine_size num_cylinders horsepower curb_weight
59.001893938 0.527786685 0.065690062 -1.061641644 -0.066140113 -0.009538737
bore
-2.684084489
coef(regfit_bwd, best_cv_size) # Backward stepwise
(Intercept) compression_ratio engine_size num_cylinders horsepower curb_weight
59.001893938 0.527786685 0.065690062 -1.061641644 -0.066140113 -0.009538737
bore
-2.684084489
# Set seed and split data
set.seed(432)
sample_size = floor(0.8 * nrow(engineering_data)) # 80% training
train_index = sample(seq_len(nrow(engineering_data)), size = sample_size)
train_data = engineering_data[train_index, ]
test_data =engineering_data[-train_index, ]
# Fit a linear model using the top 4 variables
top_4_model = lm(combined_mpg ~ horsepower + engine_size + curb_weight + compression_ratio,
data = train_data)
# Predict on the test set
test_predictions = predict(top_4_model, newdata = test_data)
test_mse <- mean((test_data$combined_mpg - test_predictions)^2)
test_mse
[1] 10.84735
main_effects = "horsepower + engine_size + curb_weight + compression_ratio"
# All possible 2-way interactions
interaction_terms = c("horsepower:engine_size", "horsepower:curb_weight",
"horsepower:compression_ratio", "engine_size:curb_weight",
"engine_size:compression_ratio", "curb_weight:compression_ratio")
results = data.frame(Interaction = character(), Test_MSE = numeric(), stringsAsFactors = FALSE)
# Loop over each interaction term
for (term in interaction_terms) {
full_formula = as.formula(paste("combined_mpg ~", paste(c(main_effects, term), collapse = " + ")))
# Fit model with only one interaction + 4 top variables
model = lm(full_formula, data = train_data)
preds = predict(model, newdata = test_data)
mse = mean((test_data$combined_mpg - preds)^2)
results = rbind(results, data.frame(Interaction = term, Test_MSE = mse))
}
# Sort and print
results_sorted <- results[order(results$Test_MSE), ]
print(results_sorted)
results_sorted[1, ]
top_4_interaction_1 = lm(combined_mpg ~ horsepower + engine_size + curb_weight +compression_ratio + horsepower:curb_weight, data = train_data)
summary(top_4_interaction_1)
Call:
lm(formula = combined_mpg ~ horsepower + engine_size + curb_weight +
compression_ratio + horsepower:curb_weight, data = train_data)
Residuals:
Min 1Q Median 3Q Max
-5.3827 -1.2391 -0.4331 1.2107 8.8016
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.939e+01 4.021e+00 17.258 < 2e-16 ***
horsepower -2.911e-01 4.266e-02 -6.825 3.72e-10 ***
engine_size -1.037e-02 1.797e-02 -0.577 0.565
curb_weight -1.531e-02 1.520e-03 -10.075 < 2e-16 ***
compression_ratio 5.553e-01 6.192e-02 8.968 4.60e-15 ***
horsepower:curb_weight 8.404e-05 1.559e-05 5.390 3.54e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.185 on 121 degrees of freedom
Multiple R-squared: 0.8704, Adjusted R-squared: 0.8651
F-statistic: 162.6 on 5 and 121 DF, p-value: < 2.2e-16
top_4_interaction_2 = lm(combined_mpg ~ horsepower + engine_size + curb_weight +compression_ratio + horsepower:engine_size, data = train_data)
summary(top_4_interaction_2)
Call:
lm(formula = combined_mpg ~ horsepower + engine_size + curb_weight +
compression_ratio + horsepower:engine_size, data = train_data)
Residuals:
Min 1Q Median 3Q Max
-6.1769 -1.1204 -0.3605 1.0326 8.9674
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 62.0888035 2.8473454 21.806 < 2e-16 ***
horsepower -0.1937826 0.0271814 -7.129 7.98e-11 ***
engine_size -0.1287899 0.0358810 -3.589 0.00048 ***
curb_weight -0.0074697 0.0010928 -6.835 3.55e-10 ***
compression_ratio 0.6074130 0.0624226 9.731 < 2e-16 ***
horsepower:engine_size 0.0010721 0.0002079 5.157 9.93e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.203 on 121 degrees of freedom
Multiple R-squared: 0.8683, Adjusted R-squared: 0.8628
F-statistic: 159.5 on 5 and 121 DF, p-value: < 2.2e-16
top_4_interaction_3 = lm(combined_mpg ~ horsepower + engine_size + curb_weight +compression_ratio + engine_size:curb_weight, data = train_data)
summary(top_4_interaction_3)
Call:
lm(formula = combined_mpg ~ horsepower + engine_size + curb_weight +
compression_ratio + engine_size:curb_weight, data = train_data)
Residuals:
Min 1Q Median 3Q Max
-5.4869 -1.2695 -0.3408 1.3016 9.0940
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.264e+01 3.995e+00 15.682 < 2e-16 ***
horsepower -6.203e-02 1.512e-02 -4.103 7.44e-05 ***
engine_size -1.139e-01 4.421e-02 -2.576 0.011185 *
curb_weight -1.337e-02 1.551e-03 -8.618 3.08e-14 ***
compression_ratio 5.739e-01 6.520e-02 8.801 1.14e-14 ***
engine_size:curb_weight 4.210e-05 1.141e-05 3.690 0.000338 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.307 on 121 degrees of freedom
Multiple R-squared: 0.8556, Adjusted R-squared: 0.8496
F-statistic: 143.4 on 5 and 121 DF, p-value: < 2.2e-16
# Initialize storage
test_mse_poly = numeric(9)
# Loop through degrees 1 to 9
for (d in 1:9) {
formula_poly = as.formula(paste(
"combined_mpg ~",
paste0("I(horsepower^", d, ") + ",
"I(engine_size^", d, ") + ",
"I(curb_weight^", d, ") + ",
"I(compression_ratio^", d, ")")
))
# Fit model
poly_model = lm(formula_poly, data = train_data)
# Predict
pred_poly = predict(poly_model, newdata = test_data)
# Compute test MSE
test_mse_poly[d] = mean((test_data$combined_mpg - pred_poly)^2)
}
# Find the degree with lowest test MSE
best_degree = which.min(test_mse_poly)
best_mse = min(test_mse_poly)
# Plot MSE vs polynomial degree
plot(1:9, test_mse_poly, type = "b", pch = 19,
xlab = "Polynomial Degree", ylab = "Test MSE",
main = "Test MSE vs. Polynomial Degree")
points(which.min(test_mse_poly), min(test_mse_poly), col = "red", pch = 19)
cat("Lowest MSE is", round(best_mse, 4), "at polynomial degree", best_degree, "\n")
Lowest MSE is 10.8473 at polynomial degree 1
# Here, we aim to classify a vehicle’s insurance risk level—represented by the symbol score (ranging from -3 to +3)—based on its specifications. Using classification models, we’ll determine how features like car specs and pricing relate to insurance risk. This could provide the basis for a decision system in which cars above a certain risk threshold may be flagged for higher premiums or insurance denial. In this task, symbol will be the response variable.
categorical_vars = import_85[, sapply(import_85, function(x) is.factor(x) || is.character(x))]
names(categorical_vars)
[1] "make" "fuel_type" "aspiration" "num_doors" "body_style" "drive_wheels"
[7] "engine_location" "engine_type" "fuel_system"
library(ggplot2)
library(dplyr)
ggplot(import_85, aes(x = make, fill = as.factor(symboling))) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent_format()) +
scale_fill_manual(values = c("-2"="#4575b4", "-1"="#91bfdb", "0"="#e0f3f8",
"1"="#fee090", "2"="#fc8d59", "3"="#d73027")) +
labs(title = "Proportion of Insurance Risk Ratings by Car Make",
x = "Car Make", y = "Proportion", fill = "Risk Rating") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Insurance Risk by Fuel Type
ggplot(import_85, aes(x = fuel_type, fill = as.factor(symboling))) +
geom_bar(position = "dodge") +
labs(title = "Insurance Risk by Fuel Type", x = "Fuel Type", fill = "Symboling")
# Insurance Risk by Body Style
ggplot(import_85, aes(x = body_style, fill = as.factor(symboling))) +
geom_bar(position = "fill") +
labs(title = "Proportion of Insurance Risk by Body Style", y = "Proportion", fill = "Symboling")
# Risk Rating Distribution by Drive Type
ggplot(import_85, aes(x = drive_wheels, fill = as.factor(symboling))) +
geom_bar(position = "fill") +
labs(title = "Risk Rating Distribution by Drive Type", y = "Proportion", fill = "Symboling")
# Ensure symboling is a factor (for categorical coloring)
import_85$symboling <- as.factor(import_85$symboling)
# Boxplots for key numeric predictors
bp1 <- ggplot(import_85, aes(x = symboling, y = price)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Price by Insurance Risk Rating", x = "Symboling", y = "Price") +
theme_minimal()
bp2 <- ggplot(import_85, aes(x = symboling, y = combined_mpg)) +
geom_boxplot(fill = "lightgreen") +
labs(title = "Combined MPG by Insurance Risk Rating", x = "Symboling", y = "Combined MPG") +
theme_minimal()
bp3 <- ggplot(import_85, aes(x = symboling, y = horsepower)) +
geom_boxplot(fill = "lightcoral") +
labs(title = "Horsepower by Insurance Risk Rating", x = "Symboling", y = "Horsepower") +
theme_minimal()
bp4 <- ggplot(import_85, aes(x = symboling, y = engine_size)) +
geom_boxplot(fill = "khaki") +
labs(title = "Engine Size by Insurance Risk Rating", x = "Symboling", y = "Engine Size") +
theme_minimal()
bp5 <- ggplot(import_85, aes(x = symboling, y = curb_weight)) +
geom_boxplot(fill = "plum") +
labs(title = "Curb Weight by Insurance Risk Rating", x = "Symboling", y = "Curb Weight") +
theme_minimal()
# Arrange plots in a grid
grid.arrange(bp1, bp2, bp3, bp4, bp5, ncol = 2)
set.seed(432)
# Split into 80% training, 20% testing
train_idx = sample(1:nrow(model_data), 0.8 * nrow(model_data))
train_data = model_data[train_idx, ]
test_data = model_data[-train_idx, ]
import_85$symboling = as.numeric(as.character(import_85$symboling)) # if it's a factor
import_85$risk = ifelse(import_85$symboling > 0, "High", "Low")
import_85$risk = factor(import_85$risk)
predictors = c("price", "horsepower", "curb_weight", "engine_size", "compression_ratio", "num_cylinders")
model_data = import_85[, c("risk", predictors)]
model_data = na.omit(model_data)
set.seed(432)
# Fit logistic regression model
logit_mod = glm(risk ~ ., data = model_data, family = "binomial")
# Predict probabilities on test set
prob_pred = predict(logit_mod, newdata = test_data, type = "response")
# Convert probabilities to class predictions
class_pred = ifelse(prob_pred > 0.5, "High", "Low")
class_pred = factor(class_pred, levels = levels(test_data$risk))
# Confusion matrix
conf_matrix = table(Predicted = class_pred, Actual = test_data$risk)
print(conf_matrix)
Actual
Predicted High Low
High 3 6
Low 17 6
# Accuracy
accuracy = mean(class_pred == test_data$risk)
cat("Test Accuracy:", accuracy, "\n")
Test Accuracy: 0.28125
set.seed(432)
tree_model = rpart(risk ~ ., data = train_data, method = "class")
rpart.plot(tree_model)
set.seed(432)
# Ensure only numeric predictors are used
numeric_predictors = predictors[sapply(train_data[, predictors], is.numeric)]
train_x = scale(train_data[, numeric_predictors])
test_x = scale(test_data[, numeric_predictors])
train_y = train_data$risk
test_y = test_data$risk
# Try Different k Values and Plot Accuracy
k_values = 1:20
accuracies = numeric(length(k_values))
for (i in k_values) {
pred_k = knn(train_x, test_x, train_y, k = i)
accuracies[i] = mean(pred_k == test_y)
}
plot(k_values, accuracies, type = "b", pch = 19, col = "blue",
xlab = "k (Number of Neighbors)", ylab = "Accuracy", main = "KNN Accuracy vs k")
best_k = k_values[which.max(accuracies)]
pred_knn = knn(train_x, test_x, train_y, k = best_k)
mean(pred_knn == test_y)
[1] 0.875
confusionMatrix(pred_knn, test_y)
Confusion Matrix and Statistics
Reference
Prediction High Low
High 17 1
Low 3 11
Accuracy : 0.875
95% CI : (0.7101, 0.9649)
No Information Rate : 0.625
P-Value [Acc > NIR] : 0.001743
Kappa : 0.7419
Mcnemar's Test P-Value : 0.617075
Sensitivity : 0.8500
Specificity : 0.9167
Pos Pred Value : 0.9444
Neg Pred Value : 0.7857
Prevalence : 0.6250
Detection Rate : 0.5312
Detection Prevalence : 0.5625
Balanced Accuracy : 0.8833
'Positive' Class : High
set.seed(432)
accuracies = numeric(200)
for (ntree in 1:200) {
rf_mod = randomForest(risk ~ ., data = train_data, ntree = ntree)
rf_pred = predict(rf_mod, newdata = test_data)
accuracies[ntree] = mean(rf_pred == test_data$risk)
}
# Plot
plot(1:200, accuracies, type = "l", col = "blue",
xlab = "Number of Trees", ylab = "Accuracy",
main = "Random Forest Accuracy vs Number of Trees")
abline(h = max(accuracies), col = "green", lty = 2)
abline(v = which.max(accuracies), col = "red", lty = 2)
# Find the best number of trees and its accuracy
best_ntree <- which.max(accuracies)
best_accuracy <- max(accuracies)
cat("Best number of trees:", best_ntree, "\n")
Best number of trees: 67
cat("Highest accuracy:", round(best_accuracy, 4), "\n")
Highest accuracy: 0.9688