WRF R_Markdown

Loading the Libraries

Loading of data

Data-set loaded into the project folder and read inside R using read.csv

data <- read.csv(here("data/WRFdata_May2018.csv"))

# Extract rows 4287 XLAT 51.375 XLONG 0.456 needed for analysis

new_data <- data[c(1, 4285), ]

# Replacing a wrong column name if available

if ("X.2225" %in% colnames(new_data)) {
  names(new_data)[names(new_data) == 'X.2225'] <- 'X31.05.2018.21.00'
} else {
  cat("Column 'X.2225' does not exist in the data frame.\n")
}

## Column 'X.2225' does not exist in the data frame.

# Remove the Xlat and Xlong Columns
new_data <- new_data[-c(1,2)]

Data Reshaping

# Extrating datatime
extract_datetime <- function(header){
  match <-str_extract(header, "\\d{2}\\.\\d{2}\\.\\d{4}\\.\\d{2}\\.\\d{2}")
  return(ifelse(is.na(match), "", match))
}

Creating New datatime column

#Implementaing Sapply

extracted_datetime <- sapply(colnames(new_data), extract_datetime)

# Create a separate dataframe for the extracted datetime

datetime_df <- data.frame(datatime = extracted_datetime)

datetime_df1 <- data.frame(data = datetime_df)

#Handling the blank rows
datetime_df <- datetime_df[!apply(is.na(datetime_df) | datetime_df == "", 1, all),]

datetime_df1 <- data.frame(data = datetime_df)

# create new copy of dataframe
newdata_copy <- new_data

# Replace the Column headers with the header row
header_row_index <- 1
colnames(newdata_copy) <- unlist(newdata_copy[header_row_index, ])
# Delete the header row
newdata_copy <- newdata_copy[-header_row_index, ]

# Replacing the NA Values on the header column

sequence <- paste0(rep(c("TSK", "PSFC", "U10", "V10", "Q2", "RAINC", "RAINNC", "SNOW", "TSLB", "SMOIS"),times = 248))
colnames(newdata_copy) <- sequence

# Splitting the Dataframe into separate dataframes
split_dfs <- split.default(newdata_copy, rep(1:(ncol(newdata_copy)%/% 10), each = 10))

# Convert the column value to numerica format
for (i in seq_along(split_dfs)) {split_dfs[[i]] <- lapply(split_dfs[[i]], as.numeric) 
}

# merge the split dataframes

merged_df <- bind_rows(split_dfs)

View(merged_df)

Handling the missing in columns

# Function to replace NA with mean of two previous values
fill_na_with_mean <- function(x) {
  na_index <- which(is.na(x))
  for (i in na_index) {
    if (i > 2) {
      x[i] <- mean(c(x[i-1], x[i-2]), na.rm = TRUE)
    }
  }
  return(x)
}

# Apply the function to all columns except 'DATETIME'
merged_df[-ncol(merged_df)] <- lapply(merged_df[-ncol(merged_df)], fill_na_with_mean)

# Rename 'data' column to 'DATETIME'
colnames(merged_df)[colnames(merged_df) == "data"] <- "DATETIME"

View(merged_df)

# Joining the datetime dataframe with the merged dataframe
data1 <- cbind(merged_df, datetime_df1)

# convert datetime column to POSIXct format
format_string <- as.POSIXct(data1$DATETIME, format = format_string)

# Move 'DATETIME' column to first position

data1 <- data1[c("data", setdiff(colnames(data1), "DATETIME"))]

# Remove the last Columns
data1 <- data1[-c(12)]

names(data1)[names(data1) == 'data'] <- "DATETIME"

# Replacing Missing Values in RAINNC
for (i in which(is.na(data1$RAINNC))) {
  if (i < nrow(data1) - 2) {
    data1$RAINNC[i] <- mean(data1$RAINNC[(i+1):(i+3)], na.rm = TRUE)
  }
}

# Replacing Missing Values in SMOIS
for (i in which(is.na(data1$SMOIS))) {
  if (i < nrow(data1) - 2) {
    data1$SMOIS[i] <- mean(data1$SMOIS[(i+1):(i+3)], na.rm = TRUE)
  }
}

Convert DATETIME column to POSIXct format

# Define the format string for parsing the datetime column
format_string <- "%d.%m.%Y.%H.%M"

# Convert DATETIME column to POSIXct format
data1$DATETIME <- as.POSIXct(data1$DATETIME, format = format_string)

View(data1)

Calculate Wind Speed

data1$WIND_SPEED <- round(sqrt(data1$U10^2 + data1$V10^2), 2)

Exploratory Data Analysis

str(data1)

## 'data.frame':    248 obs. of  12 variables:
##  $ DATETIME  : POSIXct, format: "2018-05-01 00:00:00" "2018-05-01 03:00:00" ...
##  $ TSK       : num  276 276 277 289 293 ...
##  $ PSFC      : num  1e+05 1e+05 1e+05 1e+05 1e+05 ...
##  $ U10       : num  3.5 3.8 4.4 4.2 3.6 5.2 3.2 0 -0.4 0.4 ...
##  $ V10       : num  -0.1 1.1 0.4 0.8 3.3 6.4 5.9 5.3 5.5 6.9 ...
##  $ Q2        : num  0.00437 0.00433 0.00398 0.00401 0.0047 ...
##  $ RAINC     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ RAINNC    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SNOW      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ TSLB      : num  279 278 278 280 283 ...
##  $ SMOIS     : num  0.359 0.352 0.355 0.352 0.348 ...
##  $ WIND_SPEED: num  3.5 3.96 4.42 4.28 4.88 8.25 6.71 5.3 5.51 6.91 ...

# Check the header summary

head(data1)

##              DATETIME   TSK   PSFC U10  V10       Q2 RAINC RAINNC SNOW  TSLB
## 1 2018-05-01 00:00:00 276.5 100218 3.5 -0.1 0.004370     0      0    0 279.1
## 2 2018-05-01 03:00:00 276.1 100272 3.8  1.1 0.004330     0      0    0 278.5
## 3 2018-05-01 06:00:00 277.1 100378 4.4  0.4 0.003980     0      0    0 278.0
## 4 2018-05-01 09:00:00 288.6 100436 4.2  0.8 0.004010     0      0    0 279.8
## 5 2018-05-01 12:00:00 292.8 100428 3.6  3.3 0.004700     0      0    0 283.0
## 6 2018-05-01 15:00:00 289.3 100357 5.2  6.4 0.004355     0      0    0 285.1
##       SMOIS WIND_SPEED
## 1 0.3591000       3.50
## 2 0.3517333       3.96
## 3 0.3551000       4.42
## 4 0.3522000       4.28
## 5 0.3479000       4.88
## 6 0.3441000       8.25

Perform Exploratory Data Analysis (EDA) and identify potential outliers in the data frame

# Summary statistics
summary(data1)

##     DATETIME                        TSK             PSFC       
##  Min.   :2018-05-01 00:00:00   Min.   :272.1   Min.   : 99872  
##  1st Qu.:2018-05-08 17:15:00   1st Qu.:281.6   1st Qu.:100761  
##  Median :2018-05-16 10:30:00   Median :287.1   Median :101145  
##  Mean   :2018-05-16 10:30:00   Mean   :288.0   Mean   :101123  
##  3rd Qu.:2018-05-24 03:45:00   3rd Qu.:294.9   3rd Qu.:101474  
##  Max.   :2018-05-31 21:00:00   Max.   :305.8   Max.   :102065  
##       U10               V10               Q2               RAINC        
##  Min.   :-7.1000   Min.   :-8.200   Min.   :0.003870   Min.   : 0.0000  
##  1st Qu.:-2.3000   1st Qu.:-2.900   1st Qu.:0.005680   1st Qu.: 0.0000  
##  Median :-0.5750   Median :-1.550   Median :0.006860   Median : 0.0000  
##  Mean   :-0.4601   Mean   :-1.055   Mean   :0.007379   Mean   : 0.5552  
##  3rd Qu.: 1.5000   3rd Qu.: 0.700   3rd Qu.:0.008985   3rd Qu.: 0.0000  
##  Max.   : 5.2000   Max.   : 9.400   Max.   :0.013200   Max.   :20.3000  
##      RAINNC            SNOW        TSLB           SMOIS          WIND_SPEED   
##  Min.   :0.0000   Min.   :0   Min.   :277.8   Min.   :0.2750   Min.   :0.140  
##  1st Qu.:0.0000   1st Qu.:0   1st Qu.:283.5   1st Qu.:0.2883   1st Qu.:2.540  
##  Median :0.0000   Median :0   Median :286.3   Median :0.3000   Median :3.450  
##  Mean   :0.2117   Mean   :0   Mean   :286.4   Mean   :0.3037   Mean   :3.632  
##  3rd Qu.:0.0000   3rd Qu.:0   3rd Qu.:289.1   3rd Qu.:0.3159   3rd Qu.:4.562  
##  Max.   :7.9000   Max.   :0   Max.   :295.8   Max.   :0.3676   Max.   :9.430

# Descriptive statistics for numerical variables
describe(data1[, sapply(data1, is.numeric)])  # Filter numeric columns

## data1[, sapply(data1, is.numeric)] 
## 
##  11  Variables      248  Observations
## --------------------------------------------------------------------------------
## TSK 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      248        0      175        1      288    9.254    276.1    277.5 
##      .25      .50      .75      .90      .95 
##    281.6    287.1    294.9    299.1    300.7 
## 
## lowest : 272.1 273.5 273.9 274.3 274.6, highest: 303.1 303.8 304   305.7 305.8
## --------------------------------------------------------------------------------
## PSFC 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      248        0      231        1   101123    588.3   100230   100398 
##      .25      .50      .75      .90      .95 
##   100761   101145   101474   101811   101952 
## 
## lowest :  99872  99873  99901  99960 100082, highest: 102011 102034 102047 102050 102065
## --------------------------------------------------------------------------------
## U10 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      248        0       98        1  -0.4601    2.877   -4.265   -3.700 
##      .25      .50      .75      .90      .95 
##   -2.300   -0.575    1.500    2.900    3.500 
## 
## lowest : -7.1 -6.5 -6.1 -6   -5.8, highest: 4.4  4.6  4.7  5.1  5.2 
## --------------------------------------------------------------------------------
## V10 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      248        0       98        1   -1.055    3.185   -5.200   -4.330 
##      .25      .50      .75      .90      .95 
##   -2.900   -1.550    0.700    2.800    3.765 
## 
## lowest : -8.2 -7.3 -6.7 -6.6 -6.1, highest: 5.9  6.4  6.9  8    9.4 
## --------------------------------------------------------------------------------
## Q2 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      248        0      212        1 0.007379 0.002386 0.004720 0.004997 
##      .25      .50      .75      .90      .95 
## 0.005680 0.006860 0.008985 0.010727 0.011270 
## 
## lowest : 0.00387 0.00398 0.00401 0.00407 0.00422
## highest: 0.01238 0.01245 0.01257 0.01271 0.0132 
## --------------------------------------------------------------------------------
## RAINC 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      248        0       20    0.321   0.5552    1.059     0.00     0.00 
##      .25      .50      .75      .90      .95 
##     0.00     0.00     0.00     0.33     2.60 
##                                                                             
## Value        0.0   0.2   0.3   0.4   0.7   1.1   1.8   2.1   2.5   2.6   2.8
## Frequency    218     1     4     1     4     2     1     1     1     3     1
## Proportion 0.879 0.004 0.016 0.004 0.016 0.008 0.004 0.004 0.004 0.012 0.004
##                                                                 
## Value        3.6   5.0   6.0   7.2  10.6  11.4  12.7  18.5  20.3
## Frequency      1     1     1     2     1     2     1     1     1
## Proportion 0.004 0.004 0.004 0.008 0.004 0.008 0.004 0.004 0.004
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## RAINNC 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      248        0       14    0.273   0.2117   0.3992    0.000    0.000 
##      .25      .50      .75      .90      .95 
##    0.000    0.000    0.000    0.030    2.145 
##                                                                             
## Value        0.0   0.1   0.2   0.3   0.5   0.8   0.9   1.0   1.2   1.3   2.6
## Frequency    223     1     1     1     1     1     1     4     1     1     3
## Proportion 0.899 0.004 0.004 0.004 0.004 0.004 0.004 0.016 0.004 0.004 0.012
##                             
## Value        2.8   5.1   7.9
## Frequency      8     1     1
## Proportion 0.032 0.004 0.004
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## SNOW 
##        n  missing distinct     Info     Mean      Gmd 
##      248        0        1        0        0        0 
##               
## Value        0
## Frequency  248
## Proportion   1
## --------------------------------------------------------------------------------
## TSLB 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      248        0      128        1    286.4     4.43    280.3    281.3 
##      .25      .50      .75      .90      .95 
##    283.5    286.3    289.1    291.4    293.0 
## 
## lowest : 277.8 278   278.5 279.1 279.7, highest: 293.9 295.2 295.5 295.6 295.8
## --------------------------------------------------------------------------------
## SMOIS 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      248        0      205        1   0.3037  0.02222   0.2787   0.2818 
##      .25      .50      .75      .90      .95 
##   0.2883   0.3000   0.3159   0.3336   0.3423 
## 
## lowest : 0.275  0.2752 0.2756 0.2759 0.276 , highest: 0.3551 0.3556 0.3591 0.36   0.3676
## --------------------------------------------------------------------------------
## WIND_SPEED 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      248        0      196        1    3.632    1.839    1.207    1.700 
##      .25      .50      .75      .90      .95 
##    2.540    3.450    4.562    5.793    6.763 
## 
## lowest : 0.14 0.5  0.58 0.6  0.72, highest: 8.01 8.2  8.25 8.43 9.43
## --------------------------------------------------------------------------------

# Check for missing values
colMeans(is.na(data1))  # Proportion of missing values per column

##   DATETIME        TSK       PSFC        U10        V10         Q2      RAINC 
##          0          0          0          0          0          0          0 
##     RAINNC       SNOW       TSLB      SMOIS WIND_SPEED 
##          0          0          0          0          0

# Explore data distribution with histograms
for (var in names(data1)[sapply(data1, is.numeric)]) {
print(
ggplot(data1, aes_string(x = var)) +
geom_histogram(bins = 30, color = "black", fill = "lightblue") +
labs(title = paste("Distribution of", var), x = var, y = "Frequency") +
theme_bw()
)
}

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Explore data distribution with boxplots and identify outliers
for (var in names(data1)[sapply(data1, is.numeric)]) {
print(
ggplot(data1, aes_string(x = var)) +
geom_boxplot() +
labs(title = paste("Boxplot of", var), x = var, y = "") +
theme_bw()
)
}

# Identify outliers with z-scores (more than 3 standard deviations from the mean)
outliers <- lapply(data1[, sapply(data1, is.numeric)], function(x) {
abs_z_scores <- abs((x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE))
return(which(abs_z_scores > 3))
})

# Report columns containing outliers based on z-scores
outlier_columns <- names(data1)[sapply(data1, is.numeric)][sapply(outliers, length) > 0]
if (length(unlist(outliers)) > 0) {
cat("Potential outliers identified in columns:", outlier_columns, "\n")
} else {
cat("No potential outliers identified based on z-scores.\n")
}

## Potential outliers identified in columns: V10 RAINC RAINNC SMOIS WIND_SPEED

Handling and resolving outliers

# Display values for all outliers except for RAINC and RAINNC
outlier_values <- list()
for (var in outlier_columns) {
if (!(var %in% c("RAINC", "RAINNC"))) {
# Calculate the bounds for outliers
lower_bound <- quantile(data1[[var]], probs = 0.25, na.rm = TRUE) - 1.5 * IQR(data1[[var]], na.rm = TRUE)
upper_bound <- quantile(data1[[var]], probs = 0.75, na.rm = TRUE) + 1.5 * IQR(data1[[var]], na.rm = TRUE)
# Identify and store the outlier values
outlier_values[[var]] <- data1[[var]][data1[[var]] < lower_bound | data1[[var]] > upper_bound]
}
}

Applying mutation (Cap) to handle outliers

# Cap the outliers for the same columns
data1_clean <- data1
for (var in outlier_columns) {
if (!(var %in% c("RAINC", "RAINNC"))) {
lower_bound <- quantile(data1_clean[[var]], probs = 0.25, na.rm = TRUE) - 1.5 * IQR(data1_clean[[var]], na.rm = TRUE)
upper_bound <- quantile(data1_clean[[var]], probs = 0.75, na.rm = TRUE) + 1.5 * IQR(data1_clean[[var]], na.rm = TRUE)
# Cap the values
data1_clean[[var]] <- pmin(pmax(data1_clean[[var]], lower_bound), upper_bound)
}
}

# Verify the results after handling outliers
for (var in outlier_columns) {
if (!(var %in% c("RAINC", "RAINNC"))) {
print(
ggplot(data1_clean, aes(x = .data[[var]])) +
geom_histogram(bins = 30, color = "black", fill = "lightblue") +
labs(title = paste("Cleaned Distribution of", var), x = var, y = "Frequency") +
theme_bw()
)
}
}

Research Questions

Statistical Questions 1:

How does the mean Pressure (PSFC) differ between daytime (6:00 AM to 6:00 PM) and nighttime (6:00 PM to 6:00 AM)?

What is the correlation between Soil Moisture (SMOIS) and Air Temperature (TSK)?
How does the mean Wind Speed vary across the recorded period?
What is the correlation between Air Pressure (PSFC) and Temperature (TSK)?

Time-Series Questions:

Can historical data be used to forecast Air Temperature (TSK) for the next 24 hours?

Do Precipitation (RAINC and RAINNC) exhibit a seasonal pattern over time?
Can historical data be used to forecast Precipitation (RAINC and RAINNC) for the next 24 hours?
What is the accuracy of time-series forecasting methods for predicting Temperature (TSK) for the next week?

Machine Learning Modeling:

Can a machine learning model be built to predict Snowfall (SNOW) based on other weather variables?
Can machine learning models accurately predict the presence of Snow (SNOW) based on other weather variables (Temperature, Humidity, Wind Speed)?

# Que: How does the mean Pressure (PSFC) differ between daytime

data_df <- data1_clean

# Check if 'DATETIME' column is already POSIXct
if (!inherits(data_df$DATETIME, "POSIXct")) {
# Convert 'DATETIME' column to POSIXct datetime format if it is not already
data_df$DATETIME <- as.POSIXct(data_df$DATETIME)
} else {
cat("'DATETIME' column is already of type POSIXct.\n")
}

## 'DATETIME' column is already of type POSIXct.

# Extract hour from DATETIME
data_df$Hour <- as.numeric(format(data_df$DATETIME, "%H"))

# Define a function to categorize daytime and nighttime
day_night_category <- function(hour) {
if (is.na(hour)) {
return(NA)  # Return NA if the hour is missing
} else if (hour >= 6 & hour < 18) {
return("Daytime")
} else {
return("Nighttime")
}
}

# Apply the function to create a new column 'Day_Night'
data_df$Day_Night <- sapply(data_df$Hour, day_night_category)

# Subset data for daytime and nighttime
daytime_data <- filter(data_df, Day_Night == "Daytime")
nighttime_data <- filter(data_df, Day_Night == "Nighttime")

# Perform t-test to compare mean pressure (PSFC) between daytime and nighttime
t_test_result <- t.test(daytime_data$PSFC, nighttime_data$PSFC)

# Print the result
print(t_test_result)

## 
##  Welch Two Sample t-test
## 
## data:  daytime_data$PSFC and nighttime_data$PSFC
## t = -0.014232, df = 245.99, p-value = 0.9887
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -129.8403  127.9774
## sample estimates:
## mean of x mean of y 
##  101122.4  101123.3

Question 1 ( Result and analysis)

Welch Two Sample t-test used to test if there is a statistically significant difference between the means of two independent samples.

Output breakdown:

Data: The test was conducted on two sets of data, daytime_data\(PSFC and nighttime_data\)PSFC. These represent the daytime and nighttime values of the variable PSFC.
t-value: The calculated t-statistic is -0.014232.

The t-value measures the size of the difference relative to the variation in the sample data.

A higher t-value indicates a greater difference between groups.
Degrees of Freedom (df): The degrees of freedom for this test are approximately 245.99.

This value is used in determining the critical value of t from the t-distribution.
p-value: The p-value is 0.9887.

This is the probability of observing a test statistic as extreme as, or more extreme than, the observed value under the null hypothesis.

A high p-value suggests that the observed data is consistent with the null hypothesis.
Alternative Hypothesis: The alternative hypothesis states that the true difference in means is not equal to 0.
95 Percent Confidence Interval: The confidence interval ranges from -129.8403 to 127.9774.

This interval estimates where the true difference in means lies with 95% confidence.

Since the interval includes 0, it suggests that there is no significant difference between the means.

Sample Estimates: The mean of the first sample (daytime) is 101122.4, and the mean of the second sample (nighttime) is 101123.3.

In summary, the t-test results suggest that there is no statistically significant difference between the daytime and nighttime PSFC means, as indicated by the high p-value and the confidence interval that includes zero. The means of the two samples are very close to each other

#Statistical Question Two

Statistical Questions 2: Correlation between Soil Moisture (SMOIS) and Air Temperature (TSK)

# Calculate the correlation coefficient between SMOIS and TSK
correlation <- cor(data_df$SMOIS, data_df$TSK)

# Print the correlation coefficient
print(correlation)

## [1] -0.2025451

# Visualize the correlation using a correlation plot
corrplot(cor(data_df[, c("SMOIS", "TSK")]), method="circle")

Question 2 ( Result and analysis)

Output breakdown:

Correlation Calculation: The cor() function is used to calculate the correlation coefficient between data_df\(SMOIS and data_df\)TSK.

These represent two columns in the data_df data frame.
Correlation Coefficient: The calculated correlation coefficient is -0.2025451. This value indicates the strength and direction of the linear relationship between the two variables.

A coefficient close to 1 implies a strong positive correlation, close to -1 implies a strong negative correlation, and close to 0 implies no linear correlation.
Interpretation: Since the correlation coefficient is -0.2025451, it suggests a weak negative linear relationship between SMOIS and TSK. As the values of SMOIS increase, the values of TSK tend to decrease slightly, and vice versa.
Visualization: The corrplot() function is used to create a visual representation of the correlation matrix, which includes the correlation between SMOIS and TSK. The method=“circle” argument specifies that the correlation coefficients should be represented by circles whose size and color intensity vary with the correlation value.

In summary, the analysis indicates a weak negative correlation between soil moisture (SMOIS) and skin temperature (TSK). The visualization provided by corrplot() shows relationship in a graphical format, making it easier to interpret the correlation visually.

Statistical Questions 3: How does the mean Wind Speed vary across the recorded period?

# Extract date and hour from DATETIME
data_df$Date <- as.Date(data_df$DATETIME)
data_df$Hour <- format(data_df$DATETIME, "%H")

# Check if 'wind_speed' column exists
if ("WIND_SPEED" %in% names(data_df)) {  

  # Calculate mean wind speed for each date (assuming 'wind_speed' is the column)
  mean_wind_speed <- aggregate(WIND_SPEED ~ Date, data = data_df, FUN = mean)
  print(mean_wind_speed)
  
} else {
  print("Error: 'WIND_SPEED' column not found in the data_df")
}

##          Date WIND_SPEED
## 1  2018-04-30   3.500000
## 2  2018-05-01   5.332031
## 3  2018-05-02   5.575312
## 4  2018-05-03   2.528750
## 5  2018-05-04   1.875000
## 6  2018-05-05   3.252500
## 7  2018-05-06   3.003750
## 8  2018-05-07   2.681250
## 9  2018-05-08   2.687500
## 10 2018-05-09   3.763750
## 11 2018-05-10   4.167500
## 12 2018-05-11   4.543750
## 13 2018-05-12   1.921250
## 14 2018-05-13   3.063750
## 15 2018-05-14   5.795312
## 16 2018-05-15   4.466250
## 17 2018-05-16   5.512500
## 18 2018-05-17   4.203750
## 19 2018-05-18   2.547500
## 20 2018-05-19   2.323750
## 21 2018-05-20   2.787500
## 22 2018-05-21   3.402500
## 23 2018-05-22   4.070000
## 24 2018-05-23   4.966250
## 25 2018-05-24   4.053750
## 26 2018-05-25   2.018750
## 27 2018-05-26   5.338281
## 28 2018-05-27   2.876250
## 29 2018-05-28   2.818750
## 30 2018-05-29   4.946250
## 31 2018-05-30   2.352500
## 32 2018-05-31   3.094286

# Plot the distribution of wind speed
ggplot(data_df, aes(x = WIND_SPEED)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(title = "Distribution of Wind Speed", x = "WIND_SPEED (km/h)", y = "Frequency") +
theme_minimal()

Question 3 ( Result and analysis)

Output breakdown:

Range: The wind speeds range from a low of 1.875 to a high of 5.795312.

Central Tendency: There seems to be a central clustering around the 3 to 4 wind speed mark, as many values fall within this range.

Variability: There are some variability in the data, with wind speeds occasionally reaching above 5 and dropping below 2.

Distribution: A unimodal distribution with a single peak around the 3 to 4 wind speed mark.

The distribution is not perfectly symmetrical, suggesting a slight skewness to the left with the frequency of higher and lower wind speeds.
The presence of occasional higher values (like 5.795312) might create a longer tail on one end of the distribution.