Weather Prediction in Nairobi, Kenya
Load Libraries
library(caret)
library(tidymodels)
library(summarytools)
library(janitor)
library(lubridate)
Load the dataset
weather_pred <- read.csv("D:/weather/weatherke.csv") %>%
  clean_names()
#let's check the data types of the columns and convert any that need it
sapply(weather_pred, class)
            name         datetime          tempmax          tempmin
"character" "character" "numeric" "numeric"
temp feelslikemax feelslikemin feelslike
"numeric" "numeric" "numeric" "numeric"
dew humidity precip precipprob
"numeric" "numeric" "numeric" "integer"
precipcover preciptype snow snowdepth
"numeric" "character" "integer" "integer"
windgust windspeed winddir sealevelpressure
"numeric" "numeric" "numeric" "numeric"
cloudcover visibility solarradiation solarenergy
"numeric" "numeric" "numeric" "numeric"
uvindex severerisk sunrise sunset
"integer" "integer" "character" "character"
moonphase conditions description icon
"numeric" "character" "character" "character"
stations
"character"
#select necessary columns
weather_pred1 <- weather_pred %>%
  select(datetime, temp, dew, humidity, windgust, windspeed, winddir,
         sealevelpressure, cloudcover, visibility, solarradiation,
         solarenergy, uvindex, conditions, stations)
#convert each column to its appropriate data type
weather_pred1$datetime <- as.Date(weather_pred1$datetime, format="%d/%m/%Y")
weather_pred1$temp <- as.numeric(weather_pred1$temp)
weather_pred1$dew <- as.numeric(weather_pred1$dew)
weather_pred1$humidity <- as.numeric(weather_pred1$humidity)
weather_pred1$windgust <- as.numeric(weather_pred1$windgust)
weather_pred1$windspeed <- as.numeric(weather_pred1$windspeed)
weather_pred1$winddir <- as.numeric(weather_pred1$winddir)
weather_pred1$sealevelpressure <- as.numeric(weather_pred1$sealevelpressure)
weather_pred1$cloudcover <- as.numeric(weather_pred1$cloudcover)
weather_pred1$visibility <- as.numeric(weather_pred1$visibility)
weather_pred1$solarradiation <- as.numeric(weather_pred1$solarradiation)
weather_pred1$solarenergy <- as.numeric(weather_pred1$solarenergy)
weather_pred1$uvindex <- as.numeric(weather_pred1$uvindex)
weather_pred1$conditions <- as.character(weather_pred1$conditions)
weather_pred1$stations <- as.character(weather_pred1$stations)
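As an aside, the same coercions could be written more compactly with mutate() and across(); a minimal sketch that assumes the same column names as above:
# Compact equivalent of the column-by-column conversions above (sketch)
weather_pred1 <- weather_pred1 %>%
  mutate(datetime = as.Date(datetime, format = "%d/%m/%Y"),
         across(c(temp, dew, humidity, windgust, windspeed, winddir,
                  sealevelpressure, cloudcover, visibility,
                  solarradiation, solarenergy, uvindex), as.numeric),
         across(c(conditions, stations), as.character))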
#confirm data types
sapply(weather_pred1, class) #all good now!
        datetime             temp              dew         humidity
"Date" "numeric" "numeric" "numeric"
windgust windspeed winddir sealevelpressure
"numeric" "numeric" "numeric" "numeric"
cloudcover visibility solarradiation solarenergy
"numeric" "numeric" "numeric" "numeric"
uvindex conditions stations
"numeric" "character" "character"
#check whether there are any missing values
colSums(is.na(weather_pred1)) #no missing values recorded in the dataset
        datetime             temp              dew         humidity
0 0 0 0
windgust windspeed winddir sealevelpressure
0 0 0 0
cloudcover visibility solarradiation solarenergy
0 0 0 0
uvindex conditions stations
0 0 0
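No missing values were found, but if some had been, they would need to be dropped or imputed before modelling. A hedged sketch of one common approach (a simple median imputation for numeric columns, then dropping any remaining incomplete rows); it is a no-op on this dataset:
# Only needed if missing values were present (hypothetical sketch)
if (anyNA(weather_pred1)) {
  weather_pred1 <- weather_pred1 %>%
    mutate(across(where(is.numeric), ~ if_else(is.na(.x), median(.x, na.rm = TRUE), .x))) %>%
    drop_na()   # drop rows still missing non-numeric values
}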
#let's add month and year columns extracted from the 'datetime' column
weather_pred1$month <- month(weather_pred1$datetime, label = TRUE)
weather_pred1$year <- year(weather_pred1$datetime)
#let's encode the month & year columns as numeric values for easier modelling
# Use the mutate() function from dplyr to create new numeric columns
weather_pred1 <- weather_pred1 %>%
mutate(month_numeric = match(month, unique(weather_pred1$month)))
weather_pred1 <- weather_pred1 %>%
  mutate(year_numeric = match(year, unique(weather_pred1$year)))
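Note that match(month, unique(weather_pred1$month)) numbers the months by their order of first appearance, which matches the calendar order only if the records begin in January. A more direct alternative (a sketch, to be run before the columns are dropped below) takes the encodings straight from datetime:
# Alternative month/year encoding derived directly from datetime (sketch)
weather_pred1 <- weather_pred1 %>%
  mutate(month_numeric = month(datetime),                        # 1 = January ... 12 = December
         year_numeric  = year(datetime) - min(year(datetime)))   # 0 = first year of the study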
#select only the columns that will be needed for modelling
weather_pred1 <- weather_pred1 %>%
  select(temp, dew, humidity, windgust, windspeed, winddir, sealevelpressure,
         cloudcover, visibility, solarradiation, solarenergy, uvindex,
         month_numeric, year_numeric)
#let's normalize the data
#extract the temp column since it will be our y value (the label)
temp <- weather_pred1$temp
weather_pred1_no_temp <- weather_pred1[, !(names(weather_pred1) == "temp")]
# Min-Max scaling (Normalization)
process <- preProcess(as.data.frame(weather_pred1_no_temp), method=c("range"))
weather_scale <- predict(process, as.data.frame(weather_pred1_no_temp))
#reattach temp column
weather_normalized <- cbind(temp, weather_scale)
The data consists of 487 rows and the following columns:
datetime: The date on which the data was observed. In this case, the data was collected daily, so there’s one row per date.
Year: The year of the study in which the observation was made. The study took place over two years. Year 0 represents 2023, and year 1 represents 2024.
month: The calendar month in which the observation was made. The value 1 represents January and continues through 12 for December.
temp: The temperature in degrees Celsius.
dew: The dew point (normalized).
humidity: The humidity level (normalized).
windgust: The wind gust speed (normalized).
winddir: The wind direction (normalized from degrees).
windspeed: The wind speed (normalized).
sealevelpressure: The sea level pressure (normalized).
cloudcover: The cloud cover (normalized).
visibility: The visibility (normalized).
solarradiation: The solar radiation (normalized).
solarenergy: The solar energy (normalized).
uvindex: The UV index (normalized).
In this dataset, temperature (temp) is the label: the y value that your model must be trained to predict. The other columns are potential features, the x values.
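Min-max scaling maps each feature x to (x - min(x)) / (max(x) - min(x)), so every scaled feature should now lie between 0 and 1. A quick sanity check (a sketch):
# Confirm that every scaled feature lies in [0, 1]
apply(weather_scale, 2, range)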
Exploratory Data Analysis
#lets perform summary stats on the columns
# load package into the R session
library(summarytools)
# Obtain summary stats for feature and label columns
weather_normalized %>%
# Select features and label
select(c(
temp,dew,humidity,windgust,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex, month_numeric, year_numeric
)) %>%
# Summary stats
descr(order = "preserve",
stats = c('mean', 'sd', 'min', 'q1', 'med', 'q3', 'max'),
        round.digits = 6)
Descriptive Statistics
weather_normalized
N: 487
temp dew humidity windgust windspeed winddir sealevelpressure
------------- ----------- ---------- ---------- ---------- ----------- ---------- ------------------
Mean 20.441068 0.669808 0.546014 0.495410 0.094411 0.276275 0.453108
Std.Dev 1.522586 0.186147 0.214090 0.215783 0.059977 0.144889 0.164163
Min 16.200000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
Q1 19.500000 0.549180 0.392713 0.333333 0.057045 0.185134 0.336449
Median 20.500000 0.688525 0.558704 0.505291 0.091409 0.215700 0.448598
Q3 21.600000 0.819672 0.698381 0.677249 0.131959 0.331365 0.560748
Max 24.200000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
Table: Table continues below
cloudcover visibility solarradiation solarenergy uvindex month_numeric
------------- ------------ ------------ ---------------- ------------- ---------- ---------------
Mean 0.757317 0.234575 0.703750 0.702336 0.906314 0.411051
Std.Dev 0.212481 0.117293 0.208255 0.208831 0.175631 0.318556
Min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
Q1 0.676884 0.104762 0.563307 0.558052 0.875000 0.181818
Median 0.823755 0.257143 0.757429 0.752809 1.000000 0.363636
Q3 0.922095 0.304762 0.865310 0.865169 1.000000 0.727273
Max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
Table: Table continues below
year_numeric
------------- --------------
Mean 0.250513
Std.Dev 0.433754
Min 0.000000
Q1 0.000000
Median 0.000000
Q3 1.000000
Max 1.000000
library(patchwork)
library(paletteer) # Collection of color palettes
theme_set(theme_light())
# Plot a histogram
hist_plt <- weather_normalized %>%
ggplot(mapping = aes(x = temp)) +
geom_histogram(bins = 100, fill = "midnightblue", alpha = 0.7) +
# Add lines for mean and median
  geom_vline(aes(xintercept = mean(temp), color = 'Mean'), linetype = "dashed", linewidth = 1.3) +
  geom_vline(aes(xintercept = median(temp), color = 'Median'), linetype = "dashed", linewidth = 1.3) +
xlab("") +
ylab("Frequency") +
scale_color_manual(name = "", values = c(Mean = "red", Median = "yellow")) +
  theme(legend.position = c(0.9, 0.9), legend.background = element_blank())
# Plot a box plot
box_plt <- weather_normalized %>%
ggplot(aes(x = temp, y = 1)) +
geom_boxplot(fill = "#E69F00", color = "gray23", alpha = 0.7) +
# Add titles and labels
xlab("Temperature")+
ylab("")
# Combine plots
(hist_plt / box_plt) +
plot_annotation(title = 'Temperature Distribution',
                  theme = theme(plot.title = element_text(hjust = 0.5)))
Let's see the distribution of the numeric features.
# Create a data frame of numeric features & label
weather_numeric_features <- weather_normalized %>%
select(c(dew,humidity,windgust,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,temp))
# Pivot data to a long format
weather_numeric_features <-weather_numeric_features %>%
pivot_longer(!temp, names_to = "features", values_to = "values") %>%
group_by(features) %>%
mutate(Mean = mean(values),
Median = median(values))
custom_palette <- c("#FFB6C1", "#DC143C", "#8B0000", "#FF4500", "#FFA500",
"#FFD700", "#4B0082", "#32CD32", "#008000", "#00CED1",
"#4682B4", "#000080")
# Plot a histogram for each feature
weather_numeric_features %>%
ggplot() +
geom_histogram(aes(x = values, fill = features), bins = 100, alpha = 0.7, show.legend = F) +
facet_wrap(~ features, scales = 'free')+
scale_fill_manual(values = custom_palette) +
# Add lines for mean and median
  geom_vline(aes(xintercept = Mean, color = "Mean"), linetype = "dashed", linewidth = 1.3) +
  geom_vline(aes(xintercept = Median, color = "Median"), linetype = "dashed", linewidth = 1.3) +
  scale_color_manual(name = "", values = c(Mean = "red", Median = "yellow"))
Now let's look at the distribution of the categorical features.
# Create a data frame of categorical features & label
weather_categorical_features <- weather_normalized %>%
select(c(month_numeric,year_numeric, temp))
# Pivot data to a long format
weather_categorical_features <- weather_categorical_features %>%
pivot_longer(!temp, names_to = "features", values_to = "values") %>%
group_by(features) %>%
mutate(values = factor(values))
# Plot a bar plot for each feature
weather_categorical_features %>%
ggplot() +
geom_bar(aes(x = values, fill = features), alpha = 0.7, show.legend = F) +
facet_wrap(~ features, scales = 'free') +
scale_fill_manual(values = custom_palette) +
theme(
panel.grid = element_blank(),
    axis.text.x = element_text(angle = 90))
Now, let's look into the relationship between the features and the temp label that we want to predict.
For the numeric features, let's create scatter plots that show the intersection of feature and label values.
# Plot a scatter plot for each feature
weather_numeric_features %>%
mutate(corr_coef = cor(values, temp)) %>%
mutate(features = paste(features, ' vs temperature, r = ', corr_coef, sep = '')) %>%
ggplot(aes(x = values, y = temp, color = features)) +
geom_point(alpha = 0.7, show.legend = F) +
facet_wrap(~ features, scales = 'free')+
  scale_fill_manual(values = custom_palette)
# Calculate correlation coefficients
weather_numeric_features %>%
  summarise(corr_coef = cor(values, temp))
# A tibble: 11 × 2
features corr_coef
<chr> <dbl>
1 cloudcover -0.580
2 dew -0.183
3 humidity -0.623
4 sealevelpressure -0.524
5 solarenergy 0.640
6 solarradiation 0.639
7 uvindex 0.546
8 visibility 0.0175
9 winddir -0.394
10 windgust 0.604
11 windspeed 0.367
The results aren't conclusive, but if you look closely at the scatter plots for temp versus solar radiation or solar energy, you can see a vague diagonal trend: higher temperatures tend to coincide with higher solar radiation. A correlation coefficient of about 0.64 for both features supports this observation. Conversely, the plots for humidity and cloudcover show a negative relationship, indicating that temperatures tend to be lower on days with high humidity or cloud cover.
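The same information can be read at a glance from a correlation matrix computed on the wide data; a minimal sketch using base R's cor():
# Pairwise correlations between temp and the numeric features (wide data)
weather_normalized %>%
  select(temp, dew, humidity, windgust, windspeed, winddir, sealevelpressure,
         cloudcover, visibility, solarradiation, solarenergy, uvindex) %>%
  cor() %>%
  round(2)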
Now let's compare the categorical features to the label. Let's create box plots that show the distribution of temperature for each category.
# Plot a box plot for each feature
weather_categorical_features %>%
ggplot() +
geom_boxplot(aes(x = values, y = temp, fill = features), alpha = 0.9, show.legend = F) +
facet_wrap(~ features, scales = 'free') +
paletteer::scale_fill_paletteer_d("tvthemes::simpsons")+
theme(
panel.grid = element_blank(),
    axis.text.x = element_text(angle = 90))
The plots show some variation in temperature across the category values. For example, there's a clear difference in the distribution of temperature across the months and between the two years.
Regression Models Using Tidymodels
#lets pick our desired features and labels
weather_train <- weather_normalized %>%
select(c(month_numeric, year_numeric,dew,humidity,windgust,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,temp)) %>%
# Encode certain features as categorical
mutate(across(1:2, factor))
# Get a glimpse of your data
glimpse(weather_train)
Rows: 487
Columns: 14
$ month_numeric <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ year_numeric <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ dew <dbl> 0.4754098, 0.5737705, 0.7540984, 0.7704918, 0.5327869…
$ humidity <dbl> 0.36842105, 0.56072874, 0.76113360, 0.84210526, 0.546…
$ windgust <dbl> 0.8201058, 0.9153439, 0.8492063, 0.9153439, 0.9523810…
$ windspeed <dbl> 0.16082474, 0.16288660, 0.17525773, 0.10240550, 0.129…
$ winddir <dbl> 0.2136158, 0.2136158, 0.2049323, 0.1886072, 0.1392845…
$ sealevelpressure <dbl> 0.45794393, 0.42056075, 0.44859813, 0.43925234, 0.401…
$ cloudcover <dbl> 0.47765006, 0.66666667, 0.77777778, 0.88633461, 0.719…
$ visibility <dbl> 0.08571429, 0.08571429, 0.05714286, 0.06666667, 0.066…
$ solarradiation <dbl> 0.8898579, 0.8572351, 0.3659561, 0.7580749, 0.8969638…
$ solarenergy <dbl> 0.8951311, 0.8501873, 0.3595506, 0.7528090, 0.8913858…
$ uvindex <dbl> 1.000, 1.000, 0.625, 1.000, 1.000, 1.000, 1.000, 1.00…
$ temp <dbl> 20.1, 18.5, 18.5, 17.9, 18.6, 20.2, 20.4, 20.9, 20.5,…
# Split 70% of the data for training and the rest for testing
set.seed(2056)
weather_train_split<- weather_train %>%
initial_split(prop = 0.7,
              # stratify the split on the month variable
strata = month_numeric)
# Extract the data in each split
weather_train1 <- training(weather_train_split)
weather_test1 <- testing(weather_train_split)
cat("Training Set", nrow(weather_train), "rows",
"\nTest Set", nrow(weather_test1), "rows")Training Set 487 rows
Test Set 148 rows
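Because the split is stratified on month_numeric, the distribution of months should look similar in the training and test sets; a quick check (a sketch):
# Compare the month distribution within each split
bind_rows(train = count(weather_train1, month_numeric),
          test  = count(weather_test1, month_numeric),
          .id = "set") %>%
  group_by(set) %>%
  mutate(prop = n / sum(n))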
Train a Linear Regression Model
# Build a linear model specification
lm_spec <-
# Type
linear_reg() %>%
# Engine
set_engine("lm") %>%
# Mode
set_mode("regression")# Train a linear regression model
lm_mod <- lm_spec %>%
fit(temp ~ ., data = weather_train1)
# Print the model object
lm_mod
parsnip model object
Call:
stats::lm(formula = temp ~ ., data = data)
Coefficients:
(Intercept) month_numeric0.0909090909090909
21.29235 0.37633
month_numeric0.181818181818182 month_numeric0.272727272727273
0.33869 0.34746
month_numeric0.363636363636364 month_numeric0.454545454545455
0.36478 -0.03494
month_numeric0.545454545454545 month_numeric0.636363636363636
-0.32801 -0.17089
month_numeric0.727272727272727 month_numeric0.818181818181818
0.12675 0.47228
month_numeric0.909090909090909 month_numeric1
0.57117 0.39271
year_numeric1 dew
0.43302 8.74022
humidity windgust
-10.83824 -0.38383
windspeed winddir
0.48811 -0.03265
sealevelpressure cloudcover
-1.09824 -0.65351
visibility solarradiation
-0.27252 3.51627
solarenergy uvindex
-3.39339 0.01174
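The raw coefficient printout is hard to scan; broom's tidy() (re-exported by tidymodels) returns the same coefficients as a tibble with standard errors and p-values. A sketch:
# Tidy coefficient table, largest effects first
tidy(lm_mod) %>%
  arrange(desc(abs(estimate)))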
# Make predictions on test set
pred <- lm_mod %>%
predict(new_data = weather_test1)
# View predictions
pred %>%
  slice_head(n = 5)
# A tibble: 5 × 1
.pred
<dbl>
1 20.5
2 18.4
3 21.4
4 20.7
5 21.4
# Predict temperature for the test set and bind it to the test_set
results <- weather_test1 %>%
bind_cols(lm_mod %>%
# Predict temperature
predict(new_data = weather_test1) %>%
rename(predictions = .pred))
# Compare predictions
results %>%
select(c(temp, predictions)) %>%
  slice_head(n = 10)
   temp predictions
1 20.1 20.47599
2 18.5 18.44440
3 20.9 21.35189
4 20.8 20.68974
5 20.4 21.38287
6 21.5 21.77932
7 20.0 20.82632
8 20.3 20.37185
9 21.7 21.77856
10 21.2 21.62710
# Visualise the results
results %>%
ggplot(mapping = aes(x = temp, y = predictions)) +
geom_point(size = 1.6, color = "steelblue") +
# Overlay a regression line
geom_smooth(method = "lm", se = F, color = 'magenta') +
ggtitle("Daily Temperature Predictions") +
xlab("Actual Labels") +
ylab("Predicted Labels") +
  theme(plot.title = element_text(hjust = 0.5))
`geom_smooth()` using formula = 'y ~ x'
There's a definite diagonal trend: the predicted and actual values generally follow the path of the trend line.
Let's quantify the level of error in our predictions.
# Multiple regression metrics
eval_metrics <- metric_set(rmse, rsq)
# Evaluate RMSE, R2 based on the results
eval_metrics(data = results,
truth = temp,
             estimate = predictions)
# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.275
2 rsq standard 0.964
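RMSE is simply the square root of the mean squared residual, so the same figure can be recovered by hand from the predictions; a sketch that also reports the largest absolute error:
# Recompute RMSE manually and inspect the residuals
results %>%
  mutate(residual = temp - predictions) %>%
  summarise(rmse_manual   = sqrt(mean(residual^2)),
            max_abs_error = max(abs(residual)))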
Let's try out another model for comparison.
XGBoost Model
# Split 70% of the data for training and the rest for testing
set.seed(2056)
w_split<- weather_train %>%
initial_split(prop = 0.7,
              # stratify the split on the month variable
strata = month_numeric)
# Extract the data in each split
wtrain1 <- training(w_split)
wtest1<- testing(w_split)
# Build an xgboost model specification
boost_spec <- boost_tree() %>%
set_engine('xgboost') %>%
set_mode('regression')
# Train an xgboost model
boost_mod <- boost_spec %>%
fit(temp ~ ., data = wtrain1)
# Print model
boost_mod
parsnip model object
##### xgb.Booster
raw: 36.3 Kb
call:
xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0,
colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1,
subsample = 1), data = x$data, nrounds = 15, watchlist = x$watchlist,
verbose = 0, nthread = 1, objective = "reg:squarederror")
params (as set within xgb.train):
eta = "0.3", max_depth = "6", gamma = "0", colsample_bytree = "1", colsample_bynode = "1", min_child_weight = "1", subsample = "1", nthread = "1", objective = "reg:squarederror", validate_parameters = "TRUE"
xgb.attributes:
niter
callbacks:
cb.evaluation.log()
# of features: 25
niter: 15
nfeatures : 25
evaluation_log:
iter training_rmse
1 14.0644650
2 9.9084496
---
14 0.2738099
15 0.2249564
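Before comparing the models, it can be informative to see which features the boosted trees rely on most. A sketch that assumes the vip package is installed and uses the xgb.Booster extracted from the parsnip fit:
# Feature importance from the underlying xgboost booster (assumes the vip package)
library(vip)
boost_mod %>%
  extract_fit_engine() %>%
  vip(num_features = 10)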
# Make predictions and bind them to the test data
results <- boost_mod %>%
augment(new_data = wtest1) %>%
rename(predictions = .pred)
# Evaluate the model
boost_metrics <- eval_metrics(data = results,
truth = temp,
estimate = predictions)
# Plot predicted vs actual
boost_plt <- results %>%
ggplot(mapping = aes(x = temp, y = predictions)) +
geom_point(color = '#4D3161FF') +
# overlay regression line
geom_smooth(method = 'lm', color = 'black', se = F) +
ggtitle("Daily Temperature Predictions") +
xlab("Actual Labels") +
ylab("Predicted Labels") +
theme(plot.title = element_text(hjust = 0.5))
# Return evaluations
list(boost_metrics, boost_plt)[[1]]
# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.545
2 rsq standard 0.861
[[2]]
`geom_smooth()` using formula = 'y ~ x'
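For a side-by-side comparison, it helps to keep both metric tables rather than overwriting results; a sketch assuming the linear-model metrics were stored earlier in an object called lm_metrics (a hypothetical name):
# Hypothetical side-by-side comparison of the two models
# (assumes lm_metrics <- eval_metrics(data = results, truth = temp, estimate = predictions)
#  was run right after the linear model was evaluated)
bind_rows(linear_regression = lm_metrics, xgboost = boost_metrics, .id = "model") %>%
  arrange(.metric)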
In this case, we will go with the simple linear regression model, since it performed better than the XGBoost model on the test set (lower RMSE and higher R²).