Setting Directory

# Make the project folder the working directory so that relative file paths
# used later resolve against it.
# NOTE(review): this absolute path only exists on the original author's
# machine; adjust it (or use a project-relative workflow) before running.
setwd(dir = "D:\\Data Science\\Hackathons\\Zindi\\Flood Prediction in Malawi")

# Show the directory that is now active.
print(getwd())
## [1] "D:/Data Science/Hackathons/Zindi/Flood Prediction in Malawi"

Loading Packages

library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(jmv)
library(tseries)
library(forecast)
library(caret)
library(e1071)
library(lubridate)
library(knitr)
library(kableExtra)
library(BBmisc)

Loading Dataset

# Read the training data from the current working directory.
#
# The former `rm(list = ls(all.names = TRUE))` call was removed: it does not
# give a clean session (attached packages and options are left untouched) and
# it silently destroys whatever objects the user already had in the workspace.
# Run this analysis in a fresh R session instead.
Data <- read.csv("Train.csv")

Wrangling Dataset

We transform the dataset from the panel (wide) format to the long format and clean the date variable as follows:

# Names of every column in `Data` whose name contains "precip".
Cols <- Data %>%
  select(contains("precip")) %>%
  names()

# Reshape wide -> long: the former column names go into `Date` and the cell
# values into `Precip`.
# `all_of()` marks `Cols` as an external character vector. Passing the bare
# name is ambiguous (it would match a data column called `Cols` first) and is
# deprecated by tidyselect; `all_of()` also errors if a listed column is
# missing.
Data_Clean <- Data %>%
  pivot_longer(all_of(Cols), names_to = "Date", values_to = "Precip")

# Number of characters in the first `Date` entry (a former column name).
str_length(Data_Clean$Date)[1]
## [1] 30
# Discard the first 20 characters of every `Date` string and keep whatever
# follows (everything from character 21 onwards).
Data_Clean$Date <- str_sub(Data_Clean$Date, start = 21L)

# Inspect the storage class of the trimmed column.
class(Data_Clean[["Date"]])
## [1] "character"
# Convert both columns in a single step:
#   * `Date`          -> Date, parsed from the trimmed strings by `as_date()`
#   * `LC_Type1_mode` -> factor
Data_Clean <- Data_Clean %>%
  mutate(
    Date = as_date(Date),
    LC_Type1_mode = as.factor(LC_Type1_mode)
  )

# Check the class of `Date` after the conversion.
class(Data_Clean[["Date"]])
## [1] "Date"

Exploratory Data Analysis

# Descriptive statistics for every column except the `Square_ID` and `Date`
# columns, additionally requesting mode, standard deviation, range, skewness
# and kurtosis.
Data_Clean %>%
  select(-Square_ID, -Date) %>%
  descriptives(
    skew = TRUE,
    kurt = TRUE,
    range = TRUE,
    sd = TRUE,
    mode = TRUE
  )
## 
##  DESCRIPTIVES
## 
##  Descriptives                                                                                          
##  ----------------------------------------------------------------------------------------------------- 
##                           X          Y          target_2015    elevation    LC_Type1_mode    Precip    
##  ----------------------------------------------------------------------------------------------------- 
##    N                       559844     559844         559844       559844           559844     559844   
##    Missing                      0          0              0            0                0          0   
##    Mean                      35.1      -15.8         0.0766          593                        13.6   
##    Median                    35.0      -15.8           0.00          623                        7.12   
##    Mode                      34.9      -15.9           0.00          623                        0.00   
##    Standard deviation       0.392      0.360          0.229          355                        17.1   
##    Range                     1.60       1.43           1.00         2758                         105   
##    Minimum                   34.3      -16.6           0.00         45.5                        0.00   
##    Maximum                   35.9      -15.2           1.00         2803                         105   
##    Skewness                 0.141     -0.250           3.17         1.08                        1.71   
##    Std. error skewness    0.00327    0.00327        0.00327      0.00327                     0.00327   
##    Kurtosis                -0.954     -0.817           8.89         3.35                        2.96   
##    Std. error kurtosis    0.00655    0.00655        0.00655      0.00655                     0.00655   
##  -----------------------------------------------------------------------------------------------------

The above descriptive analyses indicate that the dependent variable is heavily skewed. We further inspect this finding below:

Target Variable

# Density-scaled histogram of the target variable.
# `after_stat(density)` replaces the deprecated `..density..` notation; both
# map the bar height to the density computed by the binning stat.
ggplot(Data_Clean, aes(x = target_2015)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30L,
    fill = "gold"
  ) +
  labs(
    title = "Histogram of Target 2015",
    x = "Target",
    y = "Density"
  ) +
  theme_bw()

Log-transforming target_2015 produces the following distribution, with most observations disappearing to -Inf (because the log of 0 is -Inf):

# Natural log of the target. Zero values become -Inf, which is the effect this
# section sets out to illustrate, so no offset is added here.
Data_Clean$Logged <- log(Data_Clean$target_2015)

# Density-scaled histogram of the logged target.
# `after_stat(density)` replaces the deprecated `..density..` notation.
ggplot(Data_Clean, aes(x = Logged)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30L,
    fill = "gold"
  ) +
  labs(
    title = "Histogram of Logged Target 2015",
    x = "Target",
    y = "Density"
  ) +
  theme_bw()

Precipitation

# Density-scaled histogram of precipitation.
# `after_stat(density)` replaces the deprecated `..density..` notation, and the
# misspelled "Precipitaion" in the title and axis label is corrected.
ggplot(Data_Clean, aes(x = Precip)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30L,
    fill = "gold"
  ) +
  labs(
    title = "Histogram of Precipitation",
    x = "Precipitation",
    y = "Density"
  ) +
  theme_bw()

Vegetation Type

# Horizontal bar chart counting rows per `LC_Type1_mode` level.
# The misspelled "Vegetaion" in the title and axis label is corrected.
ggplot(Data_Clean, aes(x = LC_Type1_mode)) +
  geom_bar(fill = "gold") +
  # Remove the default padding so the bars start at the axis.
  scale_y_continuous(expand = c(0, 0)) +
  labs(
    x = "Vegetation Type",
    y = "Count",
    title = "Bar Graph of Vegetation Type"
  ) +
  coord_flip() +
  theme_bw()

Linear Regression

We can visualise the relationship between target_2015 and Precip across LC_Type1_mode using a scatter plot as shown below.

# Turn the factor back into numbers for the continuous colour scale.
# Going through `as.character()` recovers the original values; calling
# `as.numeric()` directly on a factor returns the internal level codes
# (1, 2, 3, ...) instead, which would silently relabel the land-cover classes
# here and in every later use of this column.
Data_Clean$LC_Type1_mode <- as.numeric(as.character(Data_Clean$LC_Type1_mode))

# Jittered scatter of the target against precipitation, coloured by
# `LC_Type1_mode`.
ggplot(
  Data_Clean,
  aes(
    x = Precip,
    y = target_2015,
    color = LC_Type1_mode
  )
) +
  geom_point(size = 1L, position = "jitter") +
  scale_color_viridis_c(option = "viridis") +
  theme_bw()

Given that target_2015 and Precip are continuous variables, we fit a linear regression model:

# Linear model of the target on precipitation plus one dummy per
# `LC_Type1_mode` level (the column is wrapped in `factor()` so it is treated
# as categorical rather than as a numeric slope).
OLS <- lm(
  formula = target_2015 ~ Precip + factor(LC_Type1_mode),
  data = Data_Clean
)

# Coefficient table and overall fit statistics.
OLS %>%
  summary()
## 
## Call:
## lm(formula = target_2015 ~ Precip + factor(LC_Type1_mode), data = Data_Clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.41014 -0.10920 -0.07268 -0.02724  0.98216 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.231e-02  3.482e-03   9.281  < 2e-16 ***
## Precip                   1.245e-04  1.756e-05   7.088 1.36e-12 ***
## factor(LC_Type1_mode)2  -2.705e-02  4.827e-03  -5.604 2.10e-08 ***
## factor(LC_Type1_mode)3  -1.694e-02  6.354e-03  -2.666  0.00767 ** 
## factor(LC_Type1_mode)4  -3.412e-02  3.852e-02  -0.886  0.37575    
## factor(LC_Type1_mode)5   4.411e-02  1.491e-02   2.958  0.00309 ** 
## factor(LC_Type1_mode)6  -5.129e-03  3.526e-03  -1.455  0.14573    
## factor(LC_Type1_mode)7   4.037e-02  3.517e-03  11.478  < 2e-16 ***
## factor(LC_Type1_mode)8   3.647e-01  4.576e-03  79.709  < 2e-16 ***
## factor(LC_Type1_mode)9   7.689e-02  3.508e-03  21.921  < 2e-16 ***
## factor(LC_Type1_mode)10  4.097e-02  6.863e-03   5.969 2.39e-09 ***
## factor(LC_Type1_mode)11  2.628e-02  3.938e-03   6.673 2.50e-11 ***
## factor(LC_Type1_mode)12 -2.710e-02  3.846e-03  -7.047 1.83e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2237 on 559831 degrees of freedom
## Multiple R-squared:  0.04352,    Adjusted R-squared:  0.0435 
## F-statistic:  2123 on 12 and 559831 DF,  p-value: < 2.2e-16
# Analysis-of-variance table for the terms of the fitted model.
OLS %>%
  anova()
## Analysis of Variance Table
## 
## Response: target_2015
##                           Df  Sum Sq Mean Sq  F value    Pr(>F)    
## Precip                     1     2.9   2.878   57.506 3.375e-14 ***
## factor(LC_Type1_mode)     11  1271.8 115.615 2310.433 < 2.2e-16 ***
## Residuals             559831 28014.1   0.050                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Split the Data into Training and Testing sets

# Reproducible random 70/30 assignment of rows to training (1) / testing (2).
set.seed(1111)

# One label per row of `Data_Clean`, the table that is actually subset below.
# The labels were previously sized by `nrow(Data)` (the wide table), which is
# shorter than the long table: subsetting a tibble with it raises an error,
# and a plain data.frame would silently recycle the labels.
# NOTE(review): rows are assigned independently, so observations sharing a
# `Square_ID` can land in both sets -- confirm that is intended.
Index <- sample(
  x = 2,
  size = nrow(Data_Clean),
  replace = TRUE,
  prob = c(0.7, 0.3)
)

Train <- Data_Clean[Index == 1, ]
Test <- Data_Clean[Index == 2, ]

Building Prediction Model

Testing Model

Model Diagnostics

Saving for Submission