# Switch to the project folder when it exists on this machine. The absolute
# path is specific to the original author's computer; on any other machine an
# unconditional setwd() aborts the script, so fall back to the current working
# directory (start R from the folder that holds the data files) and warn.
project_dir <- "D:\\Data Science\\Hackathons\\Zindi\\Flood Prediction in Malawi"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  warning(
    "Project directory not found; keeping working directory: ", getwd(),
    call. = FALSE
  )
}
# Show the directory that relative file paths will be resolved against.
getwd()
## [1] "D:/Data Science/Hackathons/Zindi/Flood Prediction in Malawi"
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(jmv)
library(tseries)
library(forecast)
library(caret)
library(e1071)
library(lubridate)
library(knitr)
library(kableExtra)
library(BBmisc)
# Load the wide-format training data from the working directory.
# NOTE: the former `rm(list = ls(all.names = TRUE))` was removed. It deletes
# every object in the user's workspace as a side effect, yet does not produce
# a clean session (attached packages and options persist). Restart R instead
# when a clean run is required.
Data <- read.csv("Train.csv")
We transform the dataset from the panel (wide) format to the long format and clean the date variable as follows:
# Names of every column whose name contains "precip".
Cols <- Data %>%
  select(contains("precip")) %>%
  names()
# Reshape wide -> long: each selected column becomes rows, with the former
# column name stored in `Date` and its value in `Precip`.
# all_of() marks `Cols` as an external character vector; passing it bare is
# ambiguous (it could be mistaken for a column called "Cols") and is
# deprecated by tidyselect.
Data_Clean <- Data %>%
  pivot_longer(all_of(Cols), names_to = "Date", values_to = "Precip")
# Number of characters in the first former column name.
str_length(Data_Clean$Date)[1]
## [1] 30
# Keep everything from character 21 onwards, i.e. discard the first 20
# characters of each former column name so only the trailing text remains.
Data_Clean$Date <- str_sub(Data_Clean$Date, start = 21L)
# Confirm the column is still a character vector at this point.
class(Data_Clean$Date)
## [1] "character"
# Fix column types in one step: parse the trimmed text as a Date and treat
# LC_Type1_mode as a categorical variable.
Data_Clean <- Data_Clean %>%
  mutate(
    Date = as_date(Date),
    LC_Type1_mode = as.factor(LC_Type1_mode)
  )
# Verify the conversion of the date column.
class(Data_Clean$Date)
## [1] "Date"
# Summary statistics for every column except the identifier and the date.
# Mode, standard deviation, range, skewness and kurtosis are requested
# explicitly on top of whatever jmv::descriptives() reports by default.
Data_Clean %>%
  select(-Square_ID, -Date) %>%
  descriptives(
    skew = TRUE,
    kurt = TRUE,
    range = TRUE,
    sd = TRUE,
    mode = TRUE
  )
##
## DESCRIPTIVES
##
## Descriptives
## -----------------------------------------------------------------------------------------------------
## X Y target_2015 elevation LC_Type1_mode Precip
## -----------------------------------------------------------------------------------------------------
## N 559844 559844 559844 559844 559844 559844
## Missing 0 0 0 0 0 0
## Mean 35.1 -15.8 0.0766 593 13.6
## Median 35.0 -15.8 0.00 623 7.12
## Mode 34.9 -15.9 0.00 623 0.00
## Standard deviation 0.392 0.360 0.229 355 17.1
## Range 1.60 1.43 1.00 2758 105
## Minimum 34.3 -16.6 0.00 45.5 0.00
## Maximum 35.9 -15.2 1.00 2803 105
## Skewness 0.141 -0.250 3.17 1.08 1.71
## Std. error skewness 0.00327 0.00327 0.00327 0.00327 0.00327
## Kurtosis -0.954 -0.817 8.89 3.35 2.96
## Std. error kurtosis 0.00655 0.00655 0.00655 0.00655 0.00655
## -----------------------------------------------------------------------------------------------------
The above descriptive analyses indicate that the dependent variable (target_2015) is heavily skewed. We further inspect this finding below:
# Density-scaled histogram of target_2015.
# after_stat(density) replaces the `..density..` notation, which ggplot2
# deprecated in version 3.4.0.
ggplot(Data_Clean, aes(x = target_2015)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30L,
    fill = "gold"
  ) +
  labs(
    title = "Histogram of Target 2015",
    x = "Target",
    y = "Density"
  ) +
  theme_bw()
Log-transforming target_2015 produces the following distribution, with most observations disappearing to -Inf (the logarithm of zero):
# Natural log of the target. Rows where target_2015 is 0 become -Inf; the
# histogram below cannot bin those non-finite values, so they are left out of
# the plot (ggplot2 reports this with a warning).
Data_Clean$Logged <- log(Data_Clean$target_2015)
# Density-scaled histogram of the logged target.
# after_stat(density) replaces the `..density..` notation, which ggplot2
# deprecated in version 3.4.0.
ggplot(Data_Clean, aes(x = Logged)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30L,
    fill = "gold"
  ) +
  labs(
    title = "Histogram of Logged Target 2015",
    x = "Target",
    y = "Density"
  ) +
  theme_bw()
# Density-scaled histogram of precipitation.
# after_stat(density) replaces the `..density..` notation, which ggplot2
# deprecated in version 3.4.0. The title and axis label previously read
# "Precipitaion"; the spelling is corrected here.
ggplot(Data_Clean, aes(x = Precip)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30L,
    fill = "gold"
  ) +
  labs(
    title = "Histogram of Precipitation",
    x = "Precipitation",
    y = "Density"
  ) +
  theme_bw()
# Horizontal bar chart of the number of rows in each LC_Type1_mode category.
# The title and axis label previously read "Vegetaion"; the spelling is
# corrected here.
ggplot(Data_Clean, aes(x = LC_Type1_mode)) +
  geom_bar(fill = "gold") +
  # Remove the default padding so the bars start exactly at zero.
  scale_y_continuous(expand = c(0, 0)) +
  labs(
    title = "Bar Graph of Vegetation Type",
    x = "Vegetation Type",
    y = "Count"
  ) +
  coord_flip() +
  theme_bw()
We can visualise the relationship between target_2015 and Precip across LC_Type1_mode using a scatter plot, as shown below.
# Turn LC_Type1_mode back into numbers for the continuous colour scale.
# as.numeric() applied directly to a factor returns the internal level index
# (1, 2, 3, ...) rather than the values the levels stand for, so convert via
# as.character() to recover the original codes. This form also gives the right
# result if the column is already numeric.
# NOTE(review): if the original codes are not exactly 1..k, factor level labels
# in any previously recorded model output will differ from a fresh run.
Data_Clean$LC_Type1_mode <- as.numeric(as.character(Data_Clean$LC_Type1_mode))
# Jittered scatter of target_2015 against Precip, coloured by LC_Type1_mode.
ggplot(
  Data_Clean,
  aes(x = Precip, y = target_2015, color = LC_Type1_mode)
) +
  geom_point(size = 1L, position = "jitter") +
  scale_color_viridis_c(option = "viridis") +
  theme_bw()
Given that target_2015 and Precip are continuous variables, we fit an ordinary least squares (OLS) regression of target_2015 on Precip and LC_Type1_mode:
# Ordinary least squares fit of target_2015 on Precip, with LC_Type1_mode
# entered as a categorical predictor via factor().
# NOTE(review): Data_Clean is in long format, so rows from the same Square_ID
# are presumably not independent observations; confirm before relying on the
# reported standard errors.
OLS <- lm(target_2015 ~ Precip + factor(LC_Type1_mode),
data = Data_Clean)
# Print the coefficient table and overall fit statistics.
summary(OLS)
##
## Call:
## lm(formula = target_2015 ~ Precip + factor(LC_Type1_mode), data = Data_Clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.41014 -0.10920 -0.07268 -0.02724 0.98216
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.231e-02 3.482e-03 9.281 < 2e-16 ***
## Precip 1.245e-04 1.756e-05 7.088 1.36e-12 ***
## factor(LC_Type1_mode)2 -2.705e-02 4.827e-03 -5.604 2.10e-08 ***
## factor(LC_Type1_mode)3 -1.694e-02 6.354e-03 -2.666 0.00767 **
## factor(LC_Type1_mode)4 -3.412e-02 3.852e-02 -0.886 0.37575
## factor(LC_Type1_mode)5 4.411e-02 1.491e-02 2.958 0.00309 **
## factor(LC_Type1_mode)6 -5.129e-03 3.526e-03 -1.455 0.14573
## factor(LC_Type1_mode)7 4.037e-02 3.517e-03 11.478 < 2e-16 ***
## factor(LC_Type1_mode)8 3.647e-01 4.576e-03 79.709 < 2e-16 ***
## factor(LC_Type1_mode)9 7.689e-02 3.508e-03 21.921 < 2e-16 ***
## factor(LC_Type1_mode)10 4.097e-02 6.863e-03 5.969 2.39e-09 ***
## factor(LC_Type1_mode)11 2.628e-02 3.938e-03 6.673 2.50e-11 ***
## factor(LC_Type1_mode)12 -2.710e-02 3.846e-03 -7.047 1.83e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2237 on 559831 degrees of freedom
## Multiple R-squared: 0.04352, Adjusted R-squared: 0.0435
## F-statistic: 2123 on 12 and 559831 DF, p-value: < 2.2e-16
# Analysis-of-variance table for the fitted model; terms are assessed
# sequentially, in the order they appear in the formula.
anova(OLS)
## Analysis of Variance Table
##
## Response: target_2015
## Df Sum Sq Mean Sq F value Pr(>F)
## Precip 1 2.9 2.878 57.506 3.375e-14 ***
## factor(LC_Type1_mode) 11 1271.8 115.615 2310.433 < 2.2e-16 ***
## Residuals 559831 28014.1 0.050
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Reproducible random split of the long-format rows: roughly 70% to Train
# and 30% to Test.
set.seed(1111)
# Draw one group label (1 or 2) per row of Data_Clean, the table being split.
# Sizing the draw by nrow(Data) yielded a label vector shorter than
# Data_Clean, so it could not index the long table's rows correctly.
# NOTE(review): rows are assigned independently, so the same Square_ID can
# land in both sets; confirm whether a split by Square_ID is wanted instead.
Index <- sample(
  x = 2,
  size = nrow(Data_Clean),
  replace = TRUE,
  prob = c(0.7, 0.3)
)
Train <- Data_Clean[Index == 1, ]
Test <- Data_Clean[Index == 2, ]