Weather Prediction in Nairobi Kenya

Load Libraries

library(caret)
library(tidymodels)
library(summarytools)
library(janitor)
library(lubridate)

Load the dataset

weather_pred<-read.csv("D:/weather/weatherke.csv") %>%  
  clean_names()

#lets check the data types of different columns and change them to the right type if need there be
sapply(weather_pred,class)

            name         datetime          tempmax          tempmin 
     "character"      "character"        "numeric"        "numeric" 
            temp     feelslikemax     feelslikemin        feelslike 
       "numeric"        "numeric"        "numeric"        "numeric" 
             dew         humidity           precip       precipprob 
       "numeric"        "numeric"        "numeric"        "integer" 
     precipcover       preciptype             snow        snowdepth 
       "numeric"      "character"        "integer"        "integer" 
        windgust        windspeed          winddir sealevelpressure 
       "numeric"        "numeric"        "numeric"        "numeric" 
      cloudcover       visibility   solarradiation      solarenergy 
       "numeric"        "numeric"        "numeric"        "numeric" 
         uvindex       severerisk          sunrise           sunset 
       "integer"        "integer"      "character"      "character" 
       moonphase       conditions      description             icon 
       "numeric"      "character"      "character"      "character" 
        stations 
     "character"

#select necessary columns
weather_pred1 <- weather_pred %>% 
  select(datetime, temp,dew, humidity,windgust, windspeed, winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,conditions,stations)

#lets see the datatypes of different columns
weather_pred1$datetime <- as.Date(weather_pred1$datetime, format="%d/%m/%Y")
weather_pred1$temp <- as.numeric(weather_pred1$temp)
weather_pred1$dew <- as.numeric(weather_pred1$dew)
weather_pred1$humidity <- as.numeric(weather_pred1$humidity)
weather_pred1$windgust <- as.numeric(weather_pred1$windgust)
weather_pred1$windspeed <- as.numeric(weather_pred1$windspeed)
weather_pred1$winddir <- as.numeric(weather_pred1$winddir)
weather_pred1$sealevelpressure <- as.numeric(weather_pred1$sealevelpressure)
weather_pred1$cloudcover <- as.numeric(weather_pred1$cloudcover)
weather_pred1$visibility <- as.numeric(weather_pred1$visibility)
weather_pred1$solarradiation <- as.numeric(weather_pred1$solarradiation)
weather_pred1$solarenergy <- as.numeric(weather_pred1$solarenergy)
weather_pred1$uvindex <- as.numeric(weather_pred1$uvindex)
weather_pred1$conditions <- as.character(weather_pred1$conditions)
weather_pred1$stations <- as.character(weather_pred1$stations)

#confirm data types
sapply(weather_pred1,class) #all good now!

        datetime             temp              dew         humidity 
          "Date"        "numeric"        "numeric"        "numeric" 
        windgust        windspeed          winddir sealevelpressure 
       "numeric"        "numeric"        "numeric"        "numeric" 
      cloudcover       visibility   solarradiation      solarenergy 
       "numeric"        "numeric"        "numeric"        "numeric" 
         uvindex       conditions         stations 
       "numeric"      "character"      "character"

#check whether there are any missing values
colSums(is.na(weather_pred1)) #no missing values recorded in the dataset

        datetime             temp              dew         humidity 
               0                0                0                0 
        windgust        windspeed          winddir sealevelpressure 
               0                0                0                0 
      cloudcover       visibility   solarradiation      solarenergy 
               0                0                0                0 
         uvindex       conditions         stations 
               0                0                0

#lets add a new column for months of the year extracted from the 'datetime' column
weather_pred1$month <- month(weather_pred1$datetime, label = TRUE)
weather_pred1$year <- year(weather_pred1$datetime)

#lets make the month & year column into binary form for easy prediction
# Lets use the mutate() function from dplyr to create a new column with numeric values
weather_pred1 <- weather_pred1 %>%
  mutate(month_numeric = match(month, unique(weather_pred1$month)))

weather_pred1 <- weather_pred1 %>%
  mutate(year_numeric = match(year, unique(weather_pred1$year)))

#select the only columns that will be needed
weather_pred1 <- weather_pred1 %>% 
  select(temp,dew,humidity,windgust,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex, month_numeric, year_numeric)

#lets make our data normalized
#extract the temp column since it will by our y value
temp <- weather_pred1$temp
weather_pred1_no_temp <- weather_pred1[, !(names(weather_pred1) == "temp")]
# Min-Max scaling (Normalization)
process <- preProcess(as.data.frame(weather_pred1_no_temp), method=c("range"))
weather_scale <- predict(process, as.data.frame(weather_pred1_no_temp))
#reattach temp column
weather_normalized <-cbind(temp, weather_scale)

The data consists of 487 rows and the following 16 columns:

datetime: The date on which the data was observed. In this case, the data was collected daily, so there’s one row per date.
Year: The year of the study in which the observation was made. The study took place over two years. Year 0 represents 2023, and year 1 represents 2024.
month: The calendar month in which the observation was made. The value 1 represents January and continues through 12 for December.
temp: The temperature in Celsius .
dew: The dew levels (normalized)
humidity: The humidity level (normalized).
windgust: The windgust (normalized)
winddir: The wind directon in degrees (normalized).
windspeed: The windspeed (normalized).
sealevelpressure: The sea level (normalized)
cloudcover: The cloudcover (normalized)
visibility: The visibility (normalized)
solarradiation: The solar radiation (normalized)
solarenergy: The solar energy (normalized)
uvindex: The uv index(normalized)

In this dataset, temperature (temp) represents the label, which is the y value, your model must be trained to predict. The other columns are potential features. They’re x values.

Exploratory Data Analysis

#lets perform summary stats on the columns
# load package into the R session
library(summarytools)

# Obtain summary stats for feature and label columns
weather_normalized %>% 
  # Select features and label
  select(c(
    temp,dew,humidity,windgust,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex, month_numeric, year_numeric
  )) %>% 
  # Summary stats
  descr(order = "preserve",
        stats = c('mean', 'sd', 'min', 'q1', 'med', 'q3', 'max'),
        round.digits = 6)

Descriptive Statistics  
weather_normalized  
N: 487  

                     temp        dew   humidity   windgust   windspeed    winddir   sealevelpressure
------------- ----------- ---------- ---------- ---------- ----------- ---------- ------------------
         Mean   20.441068   0.669808   0.546014   0.495410    0.094411   0.276275           0.453108
      Std.Dev    1.522586   0.186147   0.214090   0.215783    0.059977   0.144889           0.164163
          Min   16.200000   0.000000   0.000000   0.000000    0.000000   0.000000           0.000000
           Q1   19.500000   0.549180   0.392713   0.333333    0.057045   0.185134           0.336449
       Median   20.500000   0.688525   0.558704   0.505291    0.091409   0.215700           0.448598
           Q3   21.600000   0.819672   0.698381   0.677249    0.131959   0.331365           0.560748
          Max   24.200000   1.000000   1.000000   1.000000    1.000000   1.000000           1.000000

Table: Table continues below

 

                cloudcover   visibility   solarradiation   solarenergy    uvindex   month_numeric
------------- ------------ ------------ ---------------- ------------- ---------- ---------------
         Mean     0.757317     0.234575         0.703750      0.702336   0.906314        0.411051
      Std.Dev     0.212481     0.117293         0.208255      0.208831   0.175631        0.318556
          Min     0.000000     0.000000         0.000000      0.000000   0.000000        0.000000
           Q1     0.676884     0.104762         0.563307      0.558052   0.875000        0.181818
       Median     0.823755     0.257143         0.757429      0.752809   1.000000        0.363636
           Q3     0.922095     0.304762         0.865310      0.865169   1.000000        0.727273
          Max     1.000000     1.000000         1.000000      1.000000   1.000000        1.000000

Table: Table continues below

 

                year_numeric
------------- --------------
         Mean       0.250513
      Std.Dev       0.433754
          Min       0.000000
           Q1       0.000000
       Median       0.000000
           Q3       1.000000
          Max       1.000000

library(patchwork)
library(paletteer) # Collection of color palettes
theme_set(theme_light())

# Plot a histogram
hist_plt <- weather_normalized %>% 
  ggplot(mapping = aes(x = temp)) + 
  geom_histogram(bins = 100, fill = "midnightblue", alpha = 0.7) +
  
  # Add lines for mean and median
  geom_vline(aes(xintercept = mean(temp), color = 'Mean'), linetype = "dashed", size = 1.3) +
  geom_vline(aes(xintercept = median(temp), color = 'Median'), linetype = "dashed", size = 1.3 ) +
  xlab("") +
  ylab("Frequency") +
  scale_color_manual(name = "", values = c(Mean = "red", Median = "yellow")) +
  theme(legend.position = c(0.9, 0.9), legend.background = element_blank())

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

# Plot a box plot
box_plt <- weather_normalized %>% 
  ggplot(aes(x = temp, y = 1)) +
  geom_boxplot(fill = "#E69F00", color = "gray23", alpha = 0.7) +
    # Add titles and labels
  xlab("Temperature")+
  ylab("")


# Combine plots
(hist_plt / box_plt) +
  plot_annotation(title = 'Temperature Distribution',
                  theme = theme(plot.title = element_text(hjust = 0.5)))

Lets see the distribution of numeric features

# Create a data frame of numeric features & label
weather_numeric_features <- weather_normalized %>% 
  select(c(dew,humidity,windgust,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,temp))
# Pivot data to a long format
weather_numeric_features <-weather_numeric_features %>% 
  pivot_longer(!temp, names_to = "features", values_to = "values") %>%
  group_by(features) %>% 
  mutate(Mean = mean(values),
         Median = median(values))
custom_palette <- c("#FFB6C1", "#DC143C", "#8B0000", "#FF4500", "#FFA500", 
                    "#FFD700", "#4B0082", "#32CD32", "#008000", "#00CED1", 
                    "#4682B4", "#000080")
 # Plot a histogram for each feature
weather_numeric_features %>%
  ggplot() +
  geom_histogram(aes(x = values, fill = features), bins = 100, alpha = 0.7, show.legend = F) +
  facet_wrap(~ features, scales = 'free')+
  scale_fill_manual(values = custom_palette) +
  
  # Add lines for mean and median
  geom_vline(aes(xintercept = Mean, color = "Mean"), linetype = "dashed", size = 1.3 ) +
  geom_vline(aes(xintercept = Median, color = "Median"), linetype = "dashed", size = 1.3 ) +
  scale_color_manual(name = "", values = c(Mean = "red", Median = "yellow"))

lets look at the distribution of categorical features

# Create a data frame of categorical features & label
weather_categorical_features <- weather_normalized %>% 
  select(c(month_numeric,year_numeric, temp))

# Pivot data to a long format
weather_categorical_features <- weather_categorical_features %>% 
  pivot_longer(!temp, names_to = "features", values_to = "values") %>%
  group_by(features) %>% 
  mutate(values = factor(values))


# Plot a bar plot for each feature
weather_categorical_features %>%
  ggplot() +
  geom_bar(aes(x = values, fill = features), alpha = 0.7, show.legend = F) +
  facet_wrap(~ features, scales = 'free') +
  scale_fill_manual(values = custom_palette) +
  theme(
    panel.grid = element_blank(),
    axis.text.x = element_text(angle = 90))

Now, lets look into the relationship between features and the temp column label that we want to predict.

For the numeric features, lets create scatter plots that show the intersection of feature and label values.

# Plot a scatter plot for each feature
weather_numeric_features %>% 
  mutate(corr_coef = cor(values, temp)) %>%
  mutate(features = paste(features, ' vs temperature, r = ', corr_coef, sep = '')) %>% 
  ggplot(aes(x = values, y = temp, color = features)) +
  geom_point(alpha = 0.7, show.legend = F) +
  facet_wrap(~ features, scales = 'free')+
 scale_fill_manual(values = custom_palette)

# Calculate correlation coefficient
weather_numeric_features %>% 
  summarise(corr_coef = cor(values, temp))

# A tibble: 11 × 2
   features         corr_coef
   <chr>                <dbl>
 1 cloudcover         -0.580 
 2 dew                -0.183 
 3 humidity           -0.623 
 4 sealevelpressure   -0.524 
 5 solarenergy         0.640 
 6 solarradiation      0.639 
 7 uvindex             0.546 
 8 visibility          0.0175
 9 winddir            -0.394 
10 windgust            0.604 
11 windspeed           0.367

The results aren’t conclusive. But if you look closely at the scatter plots for temp and solar radiation or solar energy, you can see a vague diagonal trend. This trend shows that higher temperature trend tend to coincide with higher solar radiation. A correlation value of just over 0.5 for both features supports this observation. Conversely, the plots for humidityand cloudcover show a slightly negative correlation. This trend indicates there is lower temperature on days with high humidity or cloudcover.

Now let’s compare the categorical features to the label. Lets create box plots that show the distribution of temperature for each category.

# Plot a box plot for each feature
weather_categorical_features %>%
  ggplot() +
  geom_boxplot(aes(x = values, y = temp, fill = features), alpha = 0.9, show.legend = F) +
  facet_wrap(~ features, scales = 'free') +
  paletteer::scale_fill_paletteer_d("tvthemes::simpsons")+
  theme(
    panel.grid = element_blank(),
    axis.text.x = element_text(angle = 90))

The plots show some variance in the relationship between category values and temperature. For example, there’s a clear difference in the distribution of temperature across various months and years.

Regression Models using Tidy Models

#lets pick our desired features and labels
weather_train <- weather_normalized %>% 
  select(c(month_numeric, year_numeric,dew,humidity,windgust,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,temp)) %>% 
  # Encode certain features as categorical
  mutate(across(1:2, factor))

# Get a glimpse of your data
glimpse(weather_train)

Rows: 487
Columns: 14
$ month_numeric    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ year_numeric     <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ dew              <dbl> 0.4754098, 0.5737705, 0.7540984, 0.7704918, 0.5327869…
$ humidity         <dbl> 0.36842105, 0.56072874, 0.76113360, 0.84210526, 0.546…
$ windgust         <dbl> 0.8201058, 0.9153439, 0.8492063, 0.9153439, 0.9523810…
$ windspeed        <dbl> 0.16082474, 0.16288660, 0.17525773, 0.10240550, 0.129…
$ winddir          <dbl> 0.2136158, 0.2136158, 0.2049323, 0.1886072, 0.1392845…
$ sealevelpressure <dbl> 0.45794393, 0.42056075, 0.44859813, 0.43925234, 0.401…
$ cloudcover       <dbl> 0.47765006, 0.66666667, 0.77777778, 0.88633461, 0.719…
$ visibility       <dbl> 0.08571429, 0.08571429, 0.05714286, 0.06666667, 0.066…
$ solarradiation   <dbl> 0.8898579, 0.8572351, 0.3659561, 0.7580749, 0.8969638…
$ solarenergy      <dbl> 0.8951311, 0.8501873, 0.3595506, 0.7528090, 0.8913858…
$ uvindex          <dbl> 1.000, 1.000, 0.625, 1.000, 1.000, 1.000, 1.000, 1.00…
$ temp             <dbl> 20.1, 18.5, 18.5, 17.9, 18.6, 20.2, 20.4, 20.9, 20.5,…

#lets split 70% of data fir training and the remaining for testing
# Split 70% of the data for training and the rest for testing
set.seed(2056)
weather_train_split<- weather_train %>% 
  initial_split(prop = 0.7,
  # splitting data evenly on the holiday variable
                strata = month_numeric)

# Extract the data in each split
weather_train1 <- training(weather_train_split)
weather_test1 <- testing(weather_train_split)


cat("Training Set", nrow(weather_train), "rows",
    "\nTest Set", nrow(weather_test1), "rows")

Training Set 487 rows 
Test Set 148 rows

Linear Regression Model Train

# Build a linear model specification
lm_spec <- 
  # Type
  linear_reg() %>% 
  # Engine
  set_engine("lm") %>% 
  # Mode
  set_mode("regression")

# Train a linear regression model
lm_mod <- lm_spec %>% 
  fit(temp ~ ., data = weather_train1)

# Print the model object
lm_mod

parsnip model object


Call:
stats::lm(formula = temp ~ ., data = data)

Coefficients:
                    (Intercept)  month_numeric0.0909090909090909  
                       21.29235                          0.37633  
 month_numeric0.181818181818182   month_numeric0.272727272727273  
                        0.33869                          0.34746  
 month_numeric0.363636363636364   month_numeric0.454545454545455  
                        0.36478                         -0.03494  
 month_numeric0.545454545454545   month_numeric0.636363636363636  
                       -0.32801                         -0.17089  
 month_numeric0.727272727272727   month_numeric0.818181818181818  
                        0.12675                          0.47228  
 month_numeric0.909090909090909                   month_numeric1  
                        0.57117                          0.39271  
                  year_numeric1                              dew  
                        0.43302                          8.74022  
                       humidity                         windgust  
                      -10.83824                         -0.38383  
                      windspeed                          winddir  
                        0.48811                         -0.03265  
               sealevelpressure                       cloudcover  
                       -1.09824                         -0.65351  
                     visibility                   solarradiation  
                       -0.27252                          3.51627  
                    solarenergy                          uvindex  
                       -3.39339                          0.01174

# Make predictions on test set
pred <- lm_mod %>% 
  predict(new_data = weather_test1)

# View predictions
pred %>% 
  slice_head(n = 5)

# A tibble: 5 × 1
  .pred
  <dbl>
1  20.5
2  18.4
3  21.4
4  20.7
5  21.4

# Predict temperature for the test set and bind it to the test_set
results <- weather_test1 %>% 
  bind_cols(lm_mod %>% 
    # Predict temperature
    predict(new_data = weather_test1) %>% 
      rename(predictions = .pred))

# Compare predictions
results %>% 
  select(c(temp, predictions)) %>% 
  slice_head(n = 10)

   temp predictions
1  20.1    20.47599
2  18.5    18.44440
3  20.9    21.35189
4  20.8    20.68974
5  20.4    21.38287
6  21.5    21.77932
7  20.0    20.82632
8  20.3    20.37185
9  21.7    21.77856
10 21.2    21.62710

# Visualise the results
results %>% 
  ggplot(mapping = aes(x = temp, y = predictions)) +
  geom_point(size = 1.6, color = "steelblue") +
  # Overlay a regression line
  geom_smooth(method = "lm", se = F, color = 'magenta') +
  ggtitle("Daily Temperature Predictions") +
  xlab("Actual Labels") +
  ylab("Predicted Labels") +
  theme(plot.title = element_text(hjust = 0.5))

`geom_smooth()` using formula = 'y ~ x'

There’s a good definite diagonal trend. The intersections of the predicted and actual values are generally following the path of the trend line.

Lets see the residuals (level of error) in our prediction

# Multiple regression metrics
eval_metrics <- metric_set(rmse, rsq)

# Evaluate RMSE, R2 based on the results
eval_metrics(data = results,
             truth = temp,
             estimate = predictions)

# A tibble: 2 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.275
2 rsq     standard       0.964

Lets try out another model for comparison

Xgboost Model

# Split 70% of the data for training and the rest for testing
set.seed(2056)
w_split<- weather_train %>% 
  initial_split(prop = 0.7,
  # splitting data evenly on the holiday variable
                strata = month_numeric)

# Extract the data in each split
wtrain1 <- training(w_split)
wtest1<- testing(w_split)


# Build an xgboost model specification
boost_spec <- boost_tree() %>% 
  set_engine('xgboost') %>% 
  set_mode('regression')

# Train an xgboost model 
boost_mod <- boost_spec %>% 
  fit(temp ~ ., data = wtrain1)


# Print model
boost_mod

parsnip model object

##### xgb.Booster
raw: 36.3 Kb 
call:
  xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0, 
    colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1, 
    subsample = 1), data = x$data, nrounds = 15, watchlist = x$watchlist, 
    verbose = 0, nthread = 1, objective = "reg:squarederror")
params (as set within xgb.train):
  eta = "0.3", max_depth = "6", gamma = "0", colsample_bytree = "1", colsample_bynode = "1", min_child_weight = "1", subsample = "1", nthread = "1", objective = "reg:squarederror", validate_parameters = "TRUE"
xgb.attributes:
  niter
callbacks:
  cb.evaluation.log()
# of features: 25 
niter: 15
nfeatures : 25 
evaluation_log:
    iter training_rmse
       1    14.0644650
       2     9.9084496
---                   
      14     0.2738099
      15     0.2249564

# Make and bind predictions to test data a
results <- boost_mod %>% 
  augment(new_data = wtest1) %>% 
  rename(predictions = .pred)

# Evaluate the model
boost_metrics <- eval_metrics(data = results,
                                  truth = temp,
                                  estimate = predictions) 

# Plot predicted vs actual
boost_plt <- results %>% 
  ggplot(mapping = aes(x = temp, y = predictions)) +
  geom_point(color = '#4D3161FF') +
  # overlay regression line
  geom_smooth(method = 'lm', color = 'black', se = F) +
  ggtitle("Daily Temperature Predictions") +
  xlab("Actual Labels") +
  ylab("Predicted Labels") +
  theme(plot.title = element_text(hjust = 0.5))

# Return evaluations
list(boost_metrics, boost_plt)

[[1]]
# A tibble: 2 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.545
2 rsq     standard       0.861

[[2]]

`geom_smooth()` using formula = 'y ~ x'

In this case, we will go with the simple linear regression model since it performed better the Xgboost model.

Load Libraries

Load the dataset

Exploratory Data Analysis

Regression Models using Tidy Models

Linear Regression Model Train

Xgboost Model