STA 631 PROJECT

Seattle Weather Prediction Using Regression Analysis

Project Objective:

The objective of this project, titled “Seattle Weather Prediction Using Regression Analysis,” is to develop a predictive model to forecast the maximum daily temperature (temp_max) in Seattle using historical weather data. By analyzing features such as minimum temperature (temp_min), precipitation, wind speed (wind), and weather type (weather), my aim to create an accurate and reliable model. This involves data preparation, exploratory data analysis, and model evaluation using metrics like RMSE and MAPE. The goal is to use the model to predict future temperatures and support weather-related decision-making in Seattle.

Importing the libraries

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(corrplot)
## corrplot 0.92 loaded
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(caTools)
## Warning: package 'caTools' was built under R version 4.3.3
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice

Loading the Data

# Loading the data set
weather_data <- read.csv('D:/STA 631/seattle-weather.csv')

# Printing the first few rows of the dataset
head(weather_data)
##         date precipitation temp_max temp_min wind weather
## 1 2012-01-01           0.0     12.8      5.0  4.7 drizzle
## 2 2012-01-02          10.9     10.6      2.8  4.5    rain
## 3 2012-01-03           0.8     11.7      7.2  2.3    rain
## 4 2012-01-04          20.3     12.2      5.6  4.7    rain
## 5 2012-01-05           1.3      8.9      2.8  6.1    rain
## 6 2012-01-06           2.5      4.4      2.2  2.2    rain

Data Dimensions and Summary Statistics

# Data dimensions
dim(weather_data)
## [1] 1461    6
# Summary statistics
summary_stat <- summary(weather_data)

summary_stat
##      date           precipitation       temp_max        temp_min     
##  Length:1461        Min.   : 0.000   Min.   :-1.60   Min.   :-7.100  
##  Class :character   1st Qu.: 0.000   1st Qu.:10.60   1st Qu.: 4.400  
##  Mode  :character   Median : 0.000   Median :15.60   Median : 8.300  
##                     Mean   : 3.029   Mean   :16.44   Mean   : 8.235  
##                     3rd Qu.: 2.800   3rd Qu.:22.20   3rd Qu.:12.200  
##                     Max.   :55.900   Max.   :35.60   Max.   :18.300  
##       wind         weather         
##  Min.   :0.400   Length:1461       
##  1st Qu.:2.200   Class :character  
##  Median :3.000   Mode  :character  
##  Mean   :3.241                     
##  3rd Qu.:4.000                     
##  Max.   :9.500

Data Types

# Check data types
str(weather_data)
## 'data.frame':    1461 obs. of  6 variables:
##  $ date         : chr  "2012-01-01" "2012-01-02" "2012-01-03" "2012-01-04" ...
##  $ precipitation: num  0 10.9 0.8 20.3 1.3 2.5 0 0 4.3 1 ...
##  $ temp_max     : num  12.8 10.6 11.7 12.2 8.9 4.4 7.2 10 9.4 6.1 ...
##  $ temp_min     : num  5 2.8 7.2 5.6 2.8 2.2 2.8 2.8 5 0.6 ...
##  $ wind         : num  4.7 4.5 2.3 4.7 6.1 2.2 2.3 2 3.4 3.4 ...
##  $ weather      : chr  "drizzle" "rain" "rain" "rain" ...

Distribution of Numerical Columns

precipitation_hist <- ggplot(weather_data, aes(x = precipitation)) + 
  geom_histogram(binwidth = 1, fill = "#AFEEEE", color = "black") +
  ggtitle("Distribution of Precipitation")

temp_max_hist <- ggplot(weather_data, aes(x = temp_max)) + 
  geom_histogram(binwidth = 1, fill = "#FFFFCC", color = "black") +
  ggtitle("Distribution of Maximum Temp")

temp_min_hist <- ggplot(weather_data, aes(x = temp_min)) + 
  geom_histogram(binwidth = 1, fill = "#99FF99", color = "black") +
  ggtitle("Distribution of Minimum Temp")

wind_hist <- ggplot(weather_data, aes(x = wind)) + 
  geom_histogram(binwidth = 0.5, fill = "#99CCFF", color = "black") +
  ggtitle("Distribution of Wind Speed")

grid.arrange(precipitation_hist, temp_max_hist, temp_min_hist, wind_hist, nrow = 2, ncol = 2)

## Categorical Variable Frequency Analysis

table(weather_data$weather)
## 
## drizzle     fog    rain    snow     sun 
##      53     101     641      26     640

Distribution of Categorical Columns

weather_data$weather <- as.factor(weather_data$weather)

weather_plot <- ggplot(weather_data, aes(x = weather)) + 
  geom_bar(fill = "gray", color = "black", alpha = 0.7) +
  labs(title = "Weather Distribution", x = "Weather Type", y = "Frequency")

temp_max_plot <- ggplot(weather_data, aes(x = weather, y = temp_max, fill = weather)) + 
  geom_boxplot() +
  labs(title = "Maximum Temperature by Weather Type", x = "Weather Type", y = "Maximum Temperature") +
  theme(legend.position = "none")

temp_min_plot <- ggplot(weather_data, aes(x = weather, y = temp_min, fill = weather)) + 
  geom_boxplot() +
  labs(title = "Minimum Temperature by Weather Type", x = "Weather Type", y = "Minimum Temperature") +
  theme(legend.position = "none")

precipitation_plot <- ggplot(weather_data, aes(x = weather, y = precipitation, fill = weather)) + 
  geom_boxplot() +
  labs(title = "Precipitation by Weather Type", x = "Weather Type", y = "Precipitation") +
  theme(legend.position = "none")

grid.arrange(weather_plot, temp_max_plot, temp_min_plot, precipitation_plot, ncol = 2)

Bivariate analysis

temp_precip_scatt <- ggplot(weather_data, aes(x = precipitation, y = temp_max)) +
  geom_point(color = 'orange', alpha = 0.5) +
  labs(title = "Maximum Temp vs Precipitation", x = "Precipitation", y = "Maximum Temp")

temp_min_scatt <- ggplot(weather_data, aes(x = temp_max, y = temp_min)) +
  geom_point(color = 'blue', alpha = 0.5) +
  ggtitle("Max Temp vs Min Temp") +
  labs(x = "Max Temp", y = "Min Temp")

precipitation_scatt <- ggplot(weather_data, aes(x = temp_max, y = precipitation)) +
  geom_point(color = 'green', alpha = 0.5) +
  ggtitle("Max Temp vs Precipitation") +
  labs(x = "Max Temp", y = "Precipitation")

wind_scatt <- ggplot(weather_data, aes(x = temp_max, y = wind)) +
  geom_point(color = 'red', alpha = 0.5) +
  ggtitle("Max Temp vs Wind Speed") +
  labs(x = "Max Temp", y = "Wind Speed")

grid.arrange(temp_precip_scatt, temp_min_scatt, precipitation_scatt, wind_scatt, nrow =2, ncol = 2)

Multi-Variate analysis

# Convert necessary columns to appropriate types
weather_data$weather <- as.factor(weather_data$weather)

# Multivariate Scatter Plot: Temp Max vs. Temp Min colored by Weather Type
multivariate_weather_plot <- ggplot(weather_data, aes(x = temp_min, y = temp_max, color = weather)) +
  geom_point(alpha = 0.7) +
  labs(title = "Temp Max vs Temp Min by Weather Type",
       x = "Min Temperature",
       y = "Max Temperature") +
  theme_minimal()

# Display the plot
print(multivariate_weather_plot)

Correlation matrix

# correlation matrix
cor_matrix <- cor(weather_data %>% select_if(is.numeric))

# Print the correlation matrix
print(cor_matrix)
##               precipitation   temp_max    temp_min        wind
## precipitation    1.00000000 -0.2285548 -0.07268404  0.32804509
## temp_max        -0.22855482  1.0000000  0.87568666 -0.16485663
## temp_min        -0.07268404  0.8756867  1.00000000 -0.07418523
## wind             0.32804509 -0.1648566 -0.07418523  1.00000000
# Visualize the correlation matrix
library(corrplot)
corrplot(cor_matrix, method = "color", type = "upper", 
         tl.col = "black", tl.srt = 45, addCoef.col = "black", 
         diag = FALSE, order = "hclust")

Data Cleaning

# Checking missing values

colSums(is.na(weather_data))
##          date precipitation      temp_max      temp_min          wind 
##             0             0             0             0             0 
##       weather 
##             0

Converting categorical columns to numerical columns

# Label encoding for 'weather' column
weather_data$weather <- as.numeric(factor(weather_data$weather, levels = unique(weather_data$weather)))

# Scaling numerical columns
numerical_columns <- c("temp_max", "temp_min", "precipitation", "wind")
weather_data[numerical_columns] <- scale(weather_data[numerical_columns])

Data standardization

# Standardization for numerical columns
weather_data$temp_max <- scale(weather_data$temp_max)
weather_data$temp_min <- scale(weather_data$temp_min)
weather_data$precipitation <- scale(weather_data$precipitation)
weather_data$wind <- scale(weather_data$wind)

Data splitting

# Splitting the data into training and testing sets (70% train, 30% test)
set.seed(123) 
split <- sample.split(weather_data$temp_max, SplitRatio = 0.7)
train_data <- subset(weather_data, split == TRUE)
test_data <- subset(weather_data, split == FALSE)

Model Selection

# Load necessary library
library(caret)

# Define training control for cross-validation
cv_control <- trainControl(method = "cv", number = 5)  # 5-fold cross-validation

# Train the base model using all predictors
base_model <- train(temp_max ~ temp_min + precipitation + wind + weather, 
                    data = train_data, method = "lm", trControl = cv_control)

# Train a reduced model using selected predictors based on domain knowledge
reduced_model <- train(temp_max ~ temp_min + precipitation, 
                       data = train_data, method = "lm", trControl = cv_control)

# Compare models using resampling
model_results <- resamples(list(base = base_model, reduced = reduced_model))

# Summary of the results
print(summary(model_results))
## 
## Call:
## summary.resamples(object = model_results)
## 
## Models: base, reduced 
## Number of resamples: 5 
## 
## MAE 
##              Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## base    0.3401595 0.3535426 0.3541065 0.3549866 0.3615192 0.3656052    0
## reduced 0.3541595 0.3687699 0.3739531 0.3727353 0.3809200 0.3858739    0
## 
## RMSE 
##              Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## base    0.4228416 0.4355577 0.4378987 0.4389057 0.4417565 0.4564738    0
## reduced 0.4362602 0.4449062 0.4572849 0.4549848 0.4623962 0.4740765    0
## 
## Rsquared 
##              Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## base    0.7896704 0.7994980 0.8203990 0.8103916 0.8207922 0.8215986    0
## reduced 0.7712550 0.7738643 0.7989508 0.7968314 0.8079474 0.8321394    0

Fitting Linear regression model

# Fitting a linear regression model to predict temp_max using all predictors
lm_model_all <- lm(temp_max ~ temp_min + precipitation + wind + weather, data = train_data)

# Printing summary of the linear regression model
summary(lm_model_all)
## 
## Call:
## lm(formula = temp_max ~ temp_min + precipitation + wind + weather, 
##     data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1056 -0.3033 -0.0427  0.2924  1.2622 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.33058    0.04436  -7.453 1.94e-13 ***
## temp_min       0.87245    0.01390  62.771  < 2e-16 ***
## precipitation -0.11796    0.01461  -8.074 1.89e-15 ***
## wind          -0.04759    0.01470  -3.237  0.00125 ** 
## weather        0.12212    0.01604   7.612 6.09e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4385 on 1025 degrees of freedom
## Multiple R-squared:  0.8109, Adjusted R-squared:  0.8101 
## F-statistic:  1099 on 4 and 1025 DF,  p-value: < 2.2e-16

Distribution of Residual analysis

# Distribution of residuals
resid_hist <- ggplot(lm_model_all, aes(x = .resid)) + 
  geom_histogram(binwidth = 0.1, fill = "gray") +
  labs(title = "Histogram of Residuals", x = "Residuals", y = "Count")

# Variance of residuals
resid_fitted <- ggplot(lm_model_all, aes(x = .fitted, y = .resid)) + 
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Residuals vs. Fitted Values", x = "Fitted Values", y = "Residuals")

# For 2 plots
grid.arrange(resid_hist, resid_fitted, nrow = 2, ncol = 1, top = "Regression Assumptions")
## `geom_smooth()` using formula = 'y ~ x'

Predictions on test data

# Predicting temp_max on the test set
predictions <- predict(lm_model_all, newdata = test_data)

RMSE Calculation

# Calculating RMSE
rmse <- sqrt(mean((test_data$temp_max - predictions)^2))
print(paste("Root Mean Squared Error (RMSE):", round(rmse, 2)))
## [1] "Root Mean Squared Error (RMSE): 0.44"

MAPE Calculation

# Calculating MAPE
mape <- mean(abs((test_data$temp_max - predictions) / test_data$temp_max)) * 100
print(paste("Mean Absolute Percentage Error (MAPE):", round(mape, 2)))
## [1] "Mean Absolute Percentage Error (MAPE): 98.61"