STA 631 PROJECT
Seattle Weather Prediction Using Regression Analysis
Project Objective:
The objective of this project, titled “Seattle Weather Prediction
Using Regression Analysis,” is to develop a predictive model to forecast
the maximum daily temperature (temp_max) in Seattle using
historical weather data. By analyzing features such as minimum
temperature (temp_min), precipitation, wind speed
(wind), and weather type (weather), my aim to
create an accurate and reliable model. This involves data preparation,
exploratory data analysis, and model evaluation using metrics like RMSE
and MAPE. The goal is to use the model to predict future temperatures
and support weather-related decision-making in Seattle.
Importing the libraries
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
## corrplot 0.92 loaded
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Warning: package 'caTools' was built under R version 4.3.3
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
Loading the Data
# Loading the data set
weather_data <- read.csv('D:/STA 631/seattle-weather.csv')
# Printing the first few rows of the dataset
head(weather_data)## date precipitation temp_max temp_min wind weather
## 1 2012-01-01 0.0 12.8 5.0 4.7 drizzle
## 2 2012-01-02 10.9 10.6 2.8 4.5 rain
## 3 2012-01-03 0.8 11.7 7.2 2.3 rain
## 4 2012-01-04 20.3 12.2 5.6 4.7 rain
## 5 2012-01-05 1.3 8.9 2.8 6.1 rain
## 6 2012-01-06 2.5 4.4 2.2 2.2 rain
Data Dimensions and Summary Statistics
## [1] 1461 6
## date precipitation temp_max temp_min
## Length:1461 Min. : 0.000 Min. :-1.60 Min. :-7.100
## Class :character 1st Qu.: 0.000 1st Qu.:10.60 1st Qu.: 4.400
## Mode :character Median : 0.000 Median :15.60 Median : 8.300
## Mean : 3.029 Mean :16.44 Mean : 8.235
## 3rd Qu.: 2.800 3rd Qu.:22.20 3rd Qu.:12.200
## Max. :55.900 Max. :35.60 Max. :18.300
## wind weather
## Min. :0.400 Length:1461
## 1st Qu.:2.200 Class :character
## Median :3.000 Mode :character
## Mean :3.241
## 3rd Qu.:4.000
## Max. :9.500
Data Types
## 'data.frame': 1461 obs. of 6 variables:
## $ date : chr "2012-01-01" "2012-01-02" "2012-01-03" "2012-01-04" ...
## $ precipitation: num 0 10.9 0.8 20.3 1.3 2.5 0 0 4.3 1 ...
## $ temp_max : num 12.8 10.6 11.7 12.2 8.9 4.4 7.2 10 9.4 6.1 ...
## $ temp_min : num 5 2.8 7.2 5.6 2.8 2.2 2.8 2.8 5 0.6 ...
## $ wind : num 4.7 4.5 2.3 4.7 6.1 2.2 2.3 2 3.4 3.4 ...
## $ weather : chr "drizzle" "rain" "rain" "rain" ...
Distribution of Numerical Columns
precipitation_hist <- ggplot(weather_data, aes(x = precipitation)) +
geom_histogram(binwidth = 1, fill = "#AFEEEE", color = "black") +
ggtitle("Distribution of Precipitation")
temp_max_hist <- ggplot(weather_data, aes(x = temp_max)) +
geom_histogram(binwidth = 1, fill = "#FFFFCC", color = "black") +
ggtitle("Distribution of Maximum Temp")
temp_min_hist <- ggplot(weather_data, aes(x = temp_min)) +
geom_histogram(binwidth = 1, fill = "#99FF99", color = "black") +
ggtitle("Distribution of Minimum Temp")
wind_hist <- ggplot(weather_data, aes(x = wind)) +
geom_histogram(binwidth = 0.5, fill = "#99CCFF", color = "black") +
ggtitle("Distribution of Wind Speed")
grid.arrange(precipitation_hist, temp_max_hist, temp_min_hist, wind_hist, nrow = 2, ncol = 2)
## Categorical Variable Frequency Analysis
##
## drizzle fog rain snow sun
## 53 101 641 26 640
Distribution of Categorical Columns
weather_data$weather <- as.factor(weather_data$weather)
weather_plot <- ggplot(weather_data, aes(x = weather)) +
geom_bar(fill = "gray", color = "black", alpha = 0.7) +
labs(title = "Weather Distribution", x = "Weather Type", y = "Frequency")
temp_max_plot <- ggplot(weather_data, aes(x = weather, y = temp_max, fill = weather)) +
geom_boxplot() +
labs(title = "Maximum Temperature by Weather Type", x = "Weather Type", y = "Maximum Temperature") +
theme(legend.position = "none")
temp_min_plot <- ggplot(weather_data, aes(x = weather, y = temp_min, fill = weather)) +
geom_boxplot() +
labs(title = "Minimum Temperature by Weather Type", x = "Weather Type", y = "Minimum Temperature") +
theme(legend.position = "none")
precipitation_plot <- ggplot(weather_data, aes(x = weather, y = precipitation, fill = weather)) +
geom_boxplot() +
labs(title = "Precipitation by Weather Type", x = "Weather Type", y = "Precipitation") +
theme(legend.position = "none")
grid.arrange(weather_plot, temp_max_plot, temp_min_plot, precipitation_plot, ncol = 2)Bivariate analysis
temp_precip_scatt <- ggplot(weather_data, aes(x = precipitation, y = temp_max)) +
geom_point(color = 'orange', alpha = 0.5) +
labs(title = "Maximum Temp vs Precipitation", x = "Precipitation", y = "Maximum Temp")
temp_min_scatt <- ggplot(weather_data, aes(x = temp_max, y = temp_min)) +
geom_point(color = 'blue', alpha = 0.5) +
ggtitle("Max Temp vs Min Temp") +
labs(x = "Max Temp", y = "Min Temp")
precipitation_scatt <- ggplot(weather_data, aes(x = temp_max, y = precipitation)) +
geom_point(color = 'green', alpha = 0.5) +
ggtitle("Max Temp vs Precipitation") +
labs(x = "Max Temp", y = "Precipitation")
wind_scatt <- ggplot(weather_data, aes(x = temp_max, y = wind)) +
geom_point(color = 'red', alpha = 0.5) +
ggtitle("Max Temp vs Wind Speed") +
labs(x = "Max Temp", y = "Wind Speed")
grid.arrange(temp_precip_scatt, temp_min_scatt, precipitation_scatt, wind_scatt, nrow =2, ncol = 2)Multi-Variate analysis
# Convert necessary columns to appropriate types
weather_data$weather <- as.factor(weather_data$weather)
# Multivariate Scatter Plot: Temp Max vs. Temp Min colored by Weather Type
multivariate_weather_plot <- ggplot(weather_data, aes(x = temp_min, y = temp_max, color = weather)) +
geom_point(alpha = 0.7) +
labs(title = "Temp Max vs Temp Min by Weather Type",
x = "Min Temperature",
y = "Max Temperature") +
theme_minimal()
# Display the plot
print(multivariate_weather_plot)Correlation matrix
# correlation matrix
cor_matrix <- cor(weather_data %>% select_if(is.numeric))
# Print the correlation matrix
print(cor_matrix)## precipitation temp_max temp_min wind
## precipitation 1.00000000 -0.2285548 -0.07268404 0.32804509
## temp_max -0.22855482 1.0000000 0.87568666 -0.16485663
## temp_min -0.07268404 0.8756867 1.00000000 -0.07418523
## wind 0.32804509 -0.1648566 -0.07418523 1.00000000
# Visualize the correlation matrix
library(corrplot)
corrplot(cor_matrix, method = "color", type = "upper",
tl.col = "black", tl.srt = 45, addCoef.col = "black",
diag = FALSE, order = "hclust")Data Cleaning
## date precipitation temp_max temp_min wind
## 0 0 0 0 0
## weather
## 0
Converting categorical columns to numerical columns
# Label encoding for 'weather' column
weather_data$weather <- as.numeric(factor(weather_data$weather, levels = unique(weather_data$weather)))
# Scaling numerical columns
numerical_columns <- c("temp_max", "temp_min", "precipitation", "wind")
weather_data[numerical_columns] <- scale(weather_data[numerical_columns])Data standardization
Data splitting
Model Selection
# Load necessary library
library(caret)
# Define training control for cross-validation
cv_control <- trainControl(method = "cv", number = 5) # 5-fold cross-validation
# Train the base model using all predictors
base_model <- train(temp_max ~ temp_min + precipitation + wind + weather,
data = train_data, method = "lm", trControl = cv_control)
# Train a reduced model using selected predictors based on domain knowledge
reduced_model <- train(temp_max ~ temp_min + precipitation,
data = train_data, method = "lm", trControl = cv_control)
# Compare models using resampling
model_results <- resamples(list(base = base_model, reduced = reduced_model))
# Summary of the results
print(summary(model_results))##
## Call:
## summary.resamples(object = model_results)
##
## Models: base, reduced
## Number of resamples: 5
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## base 0.3401595 0.3535426 0.3541065 0.3549866 0.3615192 0.3656052 0
## reduced 0.3541595 0.3687699 0.3739531 0.3727353 0.3809200 0.3858739 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## base 0.4228416 0.4355577 0.4378987 0.4389057 0.4417565 0.4564738 0
## reduced 0.4362602 0.4449062 0.4572849 0.4549848 0.4623962 0.4740765 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## base 0.7896704 0.7994980 0.8203990 0.8103916 0.8207922 0.8215986 0
## reduced 0.7712550 0.7738643 0.7989508 0.7968314 0.8079474 0.8321394 0
Fitting Linear regression model
# Fitting a linear regression model to predict temp_max using all predictors
lm_model_all <- lm(temp_max ~ temp_min + precipitation + wind + weather, data = train_data)
# Printing summary of the linear regression model
summary(lm_model_all)##
## Call:
## lm(formula = temp_max ~ temp_min + precipitation + wind + weather,
## data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1056 -0.3033 -0.0427 0.2924 1.2622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.33058 0.04436 -7.453 1.94e-13 ***
## temp_min 0.87245 0.01390 62.771 < 2e-16 ***
## precipitation -0.11796 0.01461 -8.074 1.89e-15 ***
## wind -0.04759 0.01470 -3.237 0.00125 **
## weather 0.12212 0.01604 7.612 6.09e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4385 on 1025 degrees of freedom
## Multiple R-squared: 0.8109, Adjusted R-squared: 0.8101
## F-statistic: 1099 on 4 and 1025 DF, p-value: < 2.2e-16
Distribution of Residual analysis
# Distribution of residuals
resid_hist <- ggplot(lm_model_all, aes(x = .resid)) +
geom_histogram(binwidth = 0.1, fill = "gray") +
labs(title = "Histogram of Residuals", x = "Residuals", y = "Count")
# Variance of residuals
resid_fitted <- ggplot(lm_model_all, aes(x = .fitted, y = .resid)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Residuals vs. Fitted Values", x = "Fitted Values", y = "Residuals")
# For 2 plots
grid.arrange(resid_hist, resid_fitted, nrow = 2, ncol = 1, top = "Regression Assumptions")## `geom_smooth()` using formula = 'y ~ x'