WRF R_Markdown
Loading the Libraries
Loading of data
Data-set loaded into the project folder and read inside R using read.csv
# Replacing a wrong column name if available
if ("X.2225" %in% colnames(new_data)) {
names(new_data)[names(new_data) == 'X.2225'] <- 'X31.05.2018.21.00'
} else {
cat("Column 'X.2225' does not exist in the data frame.\n")
}## Column 'X.2225' does not exist in the data frame.
Data Reshaping
Creating New datatime column
# Create a separate dataframe for the extracted datetime
datetime_df <- data.frame(datatime = extracted_datetime)
datetime_df1 <- data.frame(data = datetime_df)#Handling the blank rows
datetime_df <- datetime_df[!apply(is.na(datetime_df) | datetime_df == "", 1, all),]# Replace the Column headers with the header row
header_row_index <- 1
colnames(newdata_copy) <- unlist(newdata_copy[header_row_index, ])
# Delete the header row
newdata_copy <- newdata_copy[-header_row_index, ]# Replacing the NA Values on the header column
sequence <- paste0(rep(c("TSK", "PSFC", "U10", "V10", "Q2", "RAINC", "RAINNC", "SNOW", "TSLB", "SMOIS"),times = 248))
colnames(newdata_copy) <- sequence# Splitting the Dataframe into separate dataframes
split_dfs <- split.default(newdata_copy, rep(1:(ncol(newdata_copy)%/% 10), each = 10))
# Convert the column value to numerica format
for (i in seq_along(split_dfs)) {split_dfs[[i]] <- lapply(split_dfs[[i]], as.numeric)
}
# merge the split dataframes
merged_df <- bind_rows(split_dfs)Handling the missing in columns
# Function to replace NA with mean of two previous values
fill_na_with_mean <- function(x) {
na_index <- which(is.na(x))
for (i in na_index) {
if (i > 2) {
x[i] <- mean(c(x[i-1], x[i-2]), na.rm = TRUE)
}
}
return(x)
}
# Apply the function to all columns except 'DATETIME'
merged_df[-ncol(merged_df)] <- lapply(merged_df[-ncol(merged_df)], fill_na_with_mean)
# Rename 'data' column to 'DATETIME'
colnames(merged_df)[colnames(merged_df) == "data"] <- "DATETIME"# Joining the datetime dataframe with the merged dataframe
data1 <- cbind(merged_df, datetime_df1)
# convert datetime column to POSIXct format
format_string <- as.POSIXct(data1$DATETIME, format = format_string)Convert DATETIME column to POSIXct format
Exploratory Data Analysis
## 'data.frame': 248 obs. of 12 variables:
## $ DATETIME : POSIXct, format: "2018-05-01 00:00:00" "2018-05-01 03:00:00" ...
## $ TSK : num 276 276 277 289 293 ...
## $ PSFC : num 1e+05 1e+05 1e+05 1e+05 1e+05 ...
## $ U10 : num 3.5 3.8 4.4 4.2 3.6 5.2 3.2 0 -0.4 0.4 ...
## $ V10 : num -0.1 1.1 0.4 0.8 3.3 6.4 5.9 5.3 5.5 6.9 ...
## $ Q2 : num 0.00437 0.00433 0.00398 0.00401 0.0047 ...
## $ RAINC : num 0 0 0 0 0 0 0 0 0 0 ...
## $ RAINNC : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SNOW : num 0 0 0 0 0 0 0 0 0 0 ...
## $ TSLB : num 279 278 278 280 283 ...
## $ SMOIS : num 0.359 0.352 0.355 0.352 0.348 ...
## $ WIND_SPEED: num 3.5 3.96 4.42 4.28 4.88 8.25 6.71 5.3 5.51 6.91 ...
## DATETIME TSK PSFC U10 V10 Q2 RAINC RAINNC SNOW TSLB
## 1 2018-05-01 00:00:00 276.5 100218 3.5 -0.1 0.004370 0 0 0 279.1
## 2 2018-05-01 03:00:00 276.1 100272 3.8 1.1 0.004330 0 0 0 278.5
## 3 2018-05-01 06:00:00 277.1 100378 4.4 0.4 0.003980 0 0 0 278.0
## 4 2018-05-01 09:00:00 288.6 100436 4.2 0.8 0.004010 0 0 0 279.8
## 5 2018-05-01 12:00:00 292.8 100428 3.6 3.3 0.004700 0 0 0 283.0
## 6 2018-05-01 15:00:00 289.3 100357 5.2 6.4 0.004355 0 0 0 285.1
## SMOIS WIND_SPEED
## 1 0.3591000 3.50
## 2 0.3517333 3.96
## 3 0.3551000 4.42
## 4 0.3522000 4.28
## 5 0.3479000 4.88
## 6 0.3441000 8.25
Perform Exploratory Data Analysis (EDA) and identify potential outliers in the data frame
## DATETIME TSK PSFC
## Min. :2018-05-01 00:00:00 Min. :272.1 Min. : 99872
## 1st Qu.:2018-05-08 17:15:00 1st Qu.:281.6 1st Qu.:100761
## Median :2018-05-16 10:30:00 Median :287.1 Median :101145
## Mean :2018-05-16 10:30:00 Mean :288.0 Mean :101123
## 3rd Qu.:2018-05-24 03:45:00 3rd Qu.:294.9 3rd Qu.:101474
## Max. :2018-05-31 21:00:00 Max. :305.8 Max. :102065
## U10 V10 Q2 RAINC
## Min. :-7.1000 Min. :-8.200 Min. :0.003870 Min. : 0.0000
## 1st Qu.:-2.3000 1st Qu.:-2.900 1st Qu.:0.005680 1st Qu.: 0.0000
## Median :-0.5750 Median :-1.550 Median :0.006860 Median : 0.0000
## Mean :-0.4601 Mean :-1.055 Mean :0.007379 Mean : 0.5552
## 3rd Qu.: 1.5000 3rd Qu.: 0.700 3rd Qu.:0.008985 3rd Qu.: 0.0000
## Max. : 5.2000 Max. : 9.400 Max. :0.013200 Max. :20.3000
## RAINNC SNOW TSLB SMOIS WIND_SPEED
## Min. :0.0000 Min. :0 Min. :277.8 Min. :0.2750 Min. :0.140
## 1st Qu.:0.0000 1st Qu.:0 1st Qu.:283.5 1st Qu.:0.2883 1st Qu.:2.540
## Median :0.0000 Median :0 Median :286.3 Median :0.3000 Median :3.450
## Mean :0.2117 Mean :0 Mean :286.4 Mean :0.3037 Mean :3.632
## 3rd Qu.:0.0000 3rd Qu.:0 3rd Qu.:289.1 3rd Qu.:0.3159 3rd Qu.:4.562
## Max. :7.9000 Max. :0 Max. :295.8 Max. :0.3676 Max. :9.430
# Descriptive statistics for numerical variables
describe(data1[, sapply(data1, is.numeric)]) # Filter numeric columns## data1[, sapply(data1, is.numeric)]
##
## 11 Variables 248 Observations
## --------------------------------------------------------------------------------
## TSK
## n missing distinct Info Mean Gmd .05 .10
## 248 0 175 1 288 9.254 276.1 277.5
## .25 .50 .75 .90 .95
## 281.6 287.1 294.9 299.1 300.7
##
## lowest : 272.1 273.5 273.9 274.3 274.6, highest: 303.1 303.8 304 305.7 305.8
## --------------------------------------------------------------------------------
## PSFC
## n missing distinct Info Mean Gmd .05 .10
## 248 0 231 1 101123 588.3 100230 100398
## .25 .50 .75 .90 .95
## 100761 101145 101474 101811 101952
##
## lowest : 99872 99873 99901 99960 100082, highest: 102011 102034 102047 102050 102065
## --------------------------------------------------------------------------------
## U10
## n missing distinct Info Mean Gmd .05 .10
## 248 0 98 1 -0.4601 2.877 -4.265 -3.700
## .25 .50 .75 .90 .95
## -2.300 -0.575 1.500 2.900 3.500
##
## lowest : -7.1 -6.5 -6.1 -6 -5.8, highest: 4.4 4.6 4.7 5.1 5.2
## --------------------------------------------------------------------------------
## V10
## n missing distinct Info Mean Gmd .05 .10
## 248 0 98 1 -1.055 3.185 -5.200 -4.330
## .25 .50 .75 .90 .95
## -2.900 -1.550 0.700 2.800 3.765
##
## lowest : -8.2 -7.3 -6.7 -6.6 -6.1, highest: 5.9 6.4 6.9 8 9.4
## --------------------------------------------------------------------------------
## Q2
## n missing distinct Info Mean Gmd .05 .10
## 248 0 212 1 0.007379 0.002386 0.004720 0.004997
## .25 .50 .75 .90 .95
## 0.005680 0.006860 0.008985 0.010727 0.011270
##
## lowest : 0.00387 0.00398 0.00401 0.00407 0.00422
## highest: 0.01238 0.01245 0.01257 0.01271 0.0132
## --------------------------------------------------------------------------------
## RAINC
## n missing distinct Info Mean Gmd .05 .10
## 248 0 20 0.321 0.5552 1.059 0.00 0.00
## .25 .50 .75 .90 .95
## 0.00 0.00 0.00 0.33 2.60
##
## Value 0.0 0.2 0.3 0.4 0.7 1.1 1.8 2.1 2.5 2.6 2.8
## Frequency 218 1 4 1 4 2 1 1 1 3 1
## Proportion 0.879 0.004 0.016 0.004 0.016 0.008 0.004 0.004 0.004 0.012 0.004
##
## Value 3.6 5.0 6.0 7.2 10.6 11.4 12.7 18.5 20.3
## Frequency 1 1 1 2 1 2 1 1 1
## Proportion 0.004 0.004 0.004 0.008 0.004 0.008 0.004 0.004 0.004
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## RAINNC
## n missing distinct Info Mean Gmd .05 .10
## 248 0 14 0.273 0.2117 0.3992 0.000 0.000
## .25 .50 .75 .90 .95
## 0.000 0.000 0.000 0.030 2.145
##
## Value 0.0 0.1 0.2 0.3 0.5 0.8 0.9 1.0 1.2 1.3 2.6
## Frequency 223 1 1 1 1 1 1 4 1 1 3
## Proportion 0.899 0.004 0.004 0.004 0.004 0.004 0.004 0.016 0.004 0.004 0.012
##
## Value 2.8 5.1 7.9
## Frequency 8 1 1
## Proportion 0.032 0.004 0.004
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## SNOW
## n missing distinct Info Mean Gmd
## 248 0 1 0 0 0
##
## Value 0
## Frequency 248
## Proportion 1
## --------------------------------------------------------------------------------
## TSLB
## n missing distinct Info Mean Gmd .05 .10
## 248 0 128 1 286.4 4.43 280.3 281.3
## .25 .50 .75 .90 .95
## 283.5 286.3 289.1 291.4 293.0
##
## lowest : 277.8 278 278.5 279.1 279.7, highest: 293.9 295.2 295.5 295.6 295.8
## --------------------------------------------------------------------------------
## SMOIS
## n missing distinct Info Mean Gmd .05 .10
## 248 0 205 1 0.3037 0.02222 0.2787 0.2818
## .25 .50 .75 .90 .95
## 0.2883 0.3000 0.3159 0.3336 0.3423
##
## lowest : 0.275 0.2752 0.2756 0.2759 0.276 , highest: 0.3551 0.3556 0.3591 0.36 0.3676
## --------------------------------------------------------------------------------
## WIND_SPEED
## n missing distinct Info Mean Gmd .05 .10
## 248 0 196 1 3.632 1.839 1.207 1.700
## .25 .50 .75 .90 .95
## 2.540 3.450 4.562 5.793 6.763
##
## lowest : 0.14 0.5 0.58 0.6 0.72, highest: 8.01 8.2 8.25 8.43 9.43
## --------------------------------------------------------------------------------
## DATETIME TSK PSFC U10 V10 Q2 RAINC
## 0 0 0 0 0 0 0
## RAINNC SNOW TSLB SMOIS WIND_SPEED
## 0 0 0 0 0
# Explore data distribution with histograms
for (var in names(data1)[sapply(data1, is.numeric)]) {
print(
ggplot(data1, aes_string(x = var)) +
geom_histogram(bins = 30, color = "black", fill = "lightblue") +
labs(title = paste("Distribution of", var), x = var, y = "Frequency") +
theme_bw()
)
}## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Explore data distribution with boxplots and identify outliers
for (var in names(data1)[sapply(data1, is.numeric)]) {
print(
ggplot(data1, aes_string(x = var)) +
geom_boxplot() +
labs(title = paste("Boxplot of", var), x = var, y = "") +
theme_bw()
)
}# Identify outliers with z-scores (more than 3 standard deviations from the mean)
outliers <- lapply(data1[, sapply(data1, is.numeric)], function(x) {
abs_z_scores <- abs((x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE))
return(which(abs_z_scores > 3))
})# Report columns containing outliers based on z-scores
outlier_columns <- names(data1)[sapply(data1, is.numeric)][sapply(outliers, length) > 0]
if (length(unlist(outliers)) > 0) {
cat("Potential outliers identified in columns:", outlier_columns, "\n")
} else {
cat("No potential outliers identified based on z-scores.\n")
}## Potential outliers identified in columns: V10 RAINC RAINNC SMOIS WIND_SPEED
Handling and resolving outliers
# Display values for all outliers except for RAINC and RAINNC
outlier_values <- list()
for (var in outlier_columns) {
if (!(var %in% c("RAINC", "RAINNC"))) {
# Calculate the bounds for outliers
lower_bound <- quantile(data1[[var]], probs = 0.25, na.rm = TRUE) - 1.5 * IQR(data1[[var]], na.rm = TRUE)
upper_bound <- quantile(data1[[var]], probs = 0.75, na.rm = TRUE) + 1.5 * IQR(data1[[var]], na.rm = TRUE)
# Identify and store the outlier values
outlier_values[[var]] <- data1[[var]][data1[[var]] < lower_bound | data1[[var]] > upper_bound]
}
}Applying mutation (Cap) to handle outliers
# Cap the outliers for the same columns
data1_clean <- data1
for (var in outlier_columns) {
if (!(var %in% c("RAINC", "RAINNC"))) {
lower_bound <- quantile(data1_clean[[var]], probs = 0.25, na.rm = TRUE) - 1.5 * IQR(data1_clean[[var]], na.rm = TRUE)
upper_bound <- quantile(data1_clean[[var]], probs = 0.75, na.rm = TRUE) + 1.5 * IQR(data1_clean[[var]], na.rm = TRUE)
# Cap the values
data1_clean[[var]] <- pmin(pmax(data1_clean[[var]], lower_bound), upper_bound)
}
}# Verify the results after handling outliers
for (var in outlier_columns) {
if (!(var %in% c("RAINC", "RAINNC"))) {
print(
ggplot(data1_clean, aes(x = .data[[var]])) +
geom_histogram(bins = 30, color = "black", fill = "lightblue") +
labs(title = paste("Cleaned Distribution of", var), x = var, y = "Frequency") +
theme_bw()
)
}
}Research Questions
Statistical Questions 1:
- How does the mean Pressure (PSFC) differ between daytime (6:00 AM to 6:00 PM) and nighttime (6:00 PM to 6:00 AM)?
- What is the correlation between Soil Moisture (SMOIS) and Air Temperature (TSK)?
- How does the mean Wind Speed vary across the recorded period?
- What is the correlation between Air Pressure (PSFC) and Temperature (TSK)?
Time-Series Questions:
- Can historical data be used to forecast Air Temperature (TSK) for the next 24 hours?
- Do Precipitation (RAINC and RAINNC) exhibit a seasonal pattern over time?
- Can historical data be used to forecast Precipitation (RAINC and RAINNC) for the next 24 hours?
- What is the accuracy of time-series forecasting methods for predicting Temperature (TSK) for the next week?
Machine Learning Modeling:
- Can a machine learning model be built to predict Snowfall (SNOW) based on other weather variables?
- Can machine learning models accurately predict the presence of Snow (SNOW) based on other weather variables (Temperature, Humidity, Wind Speed)?
# Que: How does the mean Pressure (PSFC) differ between daytime
data_df <- data1_clean
# Check if 'DATETIME' column is already POSIXct
if (!inherits(data_df$DATETIME, "POSIXct")) {
# Convert 'DATETIME' column to POSIXct datetime format if it is not already
data_df$DATETIME <- as.POSIXct(data_df$DATETIME)
} else {
cat("'DATETIME' column is already of type POSIXct.\n")
}## 'DATETIME' column is already of type POSIXct.
# Extract hour from DATETIME
data_df$Hour <- as.numeric(format(data_df$DATETIME, "%H"))
# Define a function to categorize daytime and nighttime
day_night_category <- function(hour) {
if (is.na(hour)) {
return(NA) # Return NA if the hour is missing
} else if (hour >= 6 & hour < 18) {
return("Daytime")
} else {
return("Nighttime")
}
}
# Apply the function to create a new column 'Day_Night'
data_df$Day_Night <- sapply(data_df$Hour, day_night_category)
# Subset data for daytime and nighttime
daytime_data <- filter(data_df, Day_Night == "Daytime")
nighttime_data <- filter(data_df, Day_Night == "Nighttime")
# Perform t-test to compare mean pressure (PSFC) between daytime and nighttime
t_test_result <- t.test(daytime_data$PSFC, nighttime_data$PSFC)
# Print the result
print(t_test_result)##
## Welch Two Sample t-test
##
## data: daytime_data$PSFC and nighttime_data$PSFC
## t = -0.014232, df = 245.99, p-value = 0.9887
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -129.8403 127.9774
## sample estimates:
## mean of x mean of y
## 101122.4 101123.3
Question 1 ( Result and analysis)
Welch Two Sample t-test used to test if there is a statistically significant difference between the means of two independent samples.
Output breakdown:
Data: The test was conducted on two sets of data, daytime_data\(PSFC and nighttime_data\)PSFC. These represent the daytime and nighttime values of the variable PSFC.
t-value: The calculated t-statistic is -0.014232.
The t-value measures the size of the difference relative to the variation in the sample data.
A higher t-value indicates a greater difference between groups.
Degrees of Freedom (df): The degrees of freedom for this test are approximately 245.99.
This value is used in determining the critical value of t from the t-distribution.
p-value: The p-value is 0.9887.
This is the probability of observing a test statistic as extreme as, or more extreme than, the observed value under the null hypothesis.
A high p-value suggests that the observed data is consistent with the null hypothesis.
Alternative Hypothesis: The alternative hypothesis states that the true difference in means is not equal to 0.
95 Percent Confidence Interval: The confidence interval ranges from -129.8403 to 127.9774.
This interval estimates where the true difference in means lies with 95% confidence.
Since the interval includes 0, it suggests that there is no significant difference between the means.
- Sample Estimates: The mean of the first sample (daytime) is 101122.4, and the mean of the second sample (nighttime) is 101123.3.
In summary, the t-test results suggest that there is no statistically significant difference between the daytime and nighttime PSFC means, as indicated by the high p-value and the confidence interval that includes zero. The means of the two samples are very close to each other
Statistical Questions 2: Correlation between Soil Moisture (SMOIS) and Air Temperature (TSK)
# Calculate the correlation coefficient between SMOIS and TSK
correlation <- cor(data_df$SMOIS, data_df$TSK)
# Print the correlation coefficient
print(correlation)## [1] -0.2025451
# Visualize the correlation using a correlation plot
corrplot(cor(data_df[, c("SMOIS", "TSK")]), method="circle")Question 2 ( Result and analysis)
Output breakdown:
Correlation Calculation: The cor() function is used to calculate the correlation coefficient between data_df\(SMOIS and data_df\)TSK.
These represent two columns in the data_df data frame.
Correlation Coefficient: The calculated correlation coefficient is -0.2025451. This value indicates the strength and direction of the linear relationship between the two variables.
A coefficient close to 1 implies a strong positive correlation, close to -1 implies a strong negative correlation, and close to 0 implies no linear correlation.
Interpretation: Since the correlation coefficient is -0.2025451, it suggests a weak negative linear relationship between SMOIS and TSK. As the values of SMOIS increase, the values of TSK tend to decrease slightly, and vice versa.
Visualization: The corrplot() function is used to create a visual representation of the correlation matrix, which includes the correlation between SMOIS and TSK. The method=“circle” argument specifies that the correlation coefficients should be represented by circles whose size and color intensity vary with the correlation value.
In summary, the analysis indicates a weak negative correlation between soil moisture (SMOIS) and skin temperature (TSK). The visualization provided by corrplot() shows relationship in a graphical format, making it easier to interpret the correlation visually.
Statistical Questions 3: How does the mean Wind Speed vary across the recorded period?
# Extract date and hour from DATETIME
data_df$Date <- as.Date(data_df$DATETIME)
data_df$Hour <- format(data_df$DATETIME, "%H")
# Check if 'wind_speed' column exists
if ("WIND_SPEED" %in% names(data_df)) {
# Calculate mean wind speed for each date (assuming 'wind_speed' is the column)
mean_wind_speed <- aggregate(WIND_SPEED ~ Date, data = data_df, FUN = mean)
print(mean_wind_speed)
} else {
print("Error: 'WIND_SPEED' column not found in the data_df")
}## Date WIND_SPEED
## 1 2018-04-30 3.500000
## 2 2018-05-01 5.332031
## 3 2018-05-02 5.575312
## 4 2018-05-03 2.528750
## 5 2018-05-04 1.875000
## 6 2018-05-05 3.252500
## 7 2018-05-06 3.003750
## 8 2018-05-07 2.681250
## 9 2018-05-08 2.687500
## 10 2018-05-09 3.763750
## 11 2018-05-10 4.167500
## 12 2018-05-11 4.543750
## 13 2018-05-12 1.921250
## 14 2018-05-13 3.063750
## 15 2018-05-14 5.795312
## 16 2018-05-15 4.466250
## 17 2018-05-16 5.512500
## 18 2018-05-17 4.203750
## 19 2018-05-18 2.547500
## 20 2018-05-19 2.323750
## 21 2018-05-20 2.787500
## 22 2018-05-21 3.402500
## 23 2018-05-22 4.070000
## 24 2018-05-23 4.966250
## 25 2018-05-24 4.053750
## 26 2018-05-25 2.018750
## 27 2018-05-26 5.338281
## 28 2018-05-27 2.876250
## 29 2018-05-28 2.818750
## 30 2018-05-29 4.946250
## 31 2018-05-30 2.352500
## 32 2018-05-31 3.094286
# Plot the distribution of wind speed
ggplot(data_df, aes(x = WIND_SPEED)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(title = "Distribution of Wind Speed", x = "WIND_SPEED (km/h)", y = "Frequency") +
theme_minimal()Question 3 ( Result and analysis)
Output breakdown:
Range: The wind speeds range from a low of 1.875 to a high of 5.795312.
Central Tendency: There seems to be a central clustering around the 3 to 4 wind speed mark, as many values fall within this range.
Variability: There are some variability in the data, with wind speeds occasionally reaching above 5 and dropping below 2.
Distribution: A unimodal distribution with a single peak around the 3 to 4 wind speed mark.
The distribution is not perfectly symmetrical, suggesting a slight skewness to the left with the frequency of higher and lower wind speeds.
The presence of occasional higher values (like 5.795312) might create a longer tail on one end of the distribution.