Binary Prediction with a Rainfall Dataset

Understanding the data

#Show the column names, types and first few values of the training data
str(train)
## 'data.frame':    2190 obs. of  13 variables:
##  $ id           : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ day          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ pressure     : num  1017 1020 1024 1013 1022 ...
##  $ maxtemp      : num  21.2 16.2 19.4 18.1 21.3 20.6 19.5 15.8 17.6 16.5 ...
##  $ temparature  : num  20.6 16.9 16.1 17.8 18.4 18.6 18.4 13.6 16.5 14.4 ...
##  $ mintemp      : num  19.9 15.8 14.6 16.9 15.2 16.5 15.3 12.7 15.6 12 ...
##  $ dewpoint     : num  19.4 15.4 9.3 16.8 9.6 12.5 11.3 11.8 12.5 8.6 ...
##  $ humidity     : num  87 95 75 95 52 79 56 96 86 77 ...
##  $ cloud        : num  88 91 47 95 45 81 46 100 100 84 ...
##  $ sunshine     : num  1.1 0 8.3 0 3.6 0 7.6 0 0 1 ...
##  $ winddirection: num  60 50 70 60 40 20 20 50 50 50 ...
##  $ windspeed    : num  17.2 21.9 18.1 35.6 24.8 15.7 28.4 52.8 37.5 38.3 ...
##  $ rainfall     : int  1 1 1 1 0 1 0 1 1 0 ...
#NOTE(review): this repeats the str(train) call from the previous chunk and
#prints the same structure; str(test) may have been intended -- confirm
str(train)
## 'data.frame':    2190 obs. of  13 variables:
##  $ id           : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ day          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ pressure     : num  1017 1020 1024 1013 1022 ...
##  $ maxtemp      : num  21.2 16.2 19.4 18.1 21.3 20.6 19.5 15.8 17.6 16.5 ...
##  $ temparature  : num  20.6 16.9 16.1 17.8 18.4 18.6 18.4 13.6 16.5 14.4 ...
##  $ mintemp      : num  19.9 15.8 14.6 16.9 15.2 16.5 15.3 12.7 15.6 12 ...
##  $ dewpoint     : num  19.4 15.4 9.3 16.8 9.6 12.5 11.3 11.8 12.5 8.6 ...
##  $ humidity     : num  87 95 75 95 52 79 56 96 86 77 ...
##  $ cloud        : num  88 91 47 95 45 81 46 100 100 84 ...
##  $ sunshine     : num  1.1 0 8.3 0 3.6 0 7.6 0 0 1 ...
##  $ winddirection: num  60 50 70 60 40 20 20 50 50 50 ...
##  $ windspeed    : num  17.2 21.9 18.1 35.6 24.8 15.7 28.4 52.8 37.5 38.3 ...
##  $ rainfall     : int  1 1 1 1 0 1 0 1 1 0 ...

Missing values

#Check for missing values in training data
#Report missing values in a data frame.
#  df: the data frame to scan for NA entries.
#Prints "No missing values found." when there are none; otherwise prints the
#NA count of every column followed by the overall total.
check_missing_values <- function(df) {
  missing_values <- colSums(is.na(df))
  total_missing <- sum(missing_values)

  if (total_missing == 0) {
    print("No missing values found.")
  } else {
    print("Missing values per column:")
    print(missing_values)
    #print() displays escape sequences literally, so a leading "\n" would show
    #up as the two characters \n in the output instead of starting a new line
    print(paste("Total missing values:", total_missing))
  }
}
check_missing_values(train)
## [1] "No missing values found."

No missing values in the training data

#Check for missing values in test data
#Report missing values in a data frame (same helper as used for the training
#data, redefined here so this chunk runs on its own).
#  df: the data frame to scan for NA entries.
#Prints "No missing values found." when there are none; otherwise prints the
#NA count of every column followed by the overall total.
check_missing_values <- function(df) {
  missing_values <- colSums(is.na(df))
  total_missing <- sum(missing_values)

  if (total_missing == 0) {
    print("No missing values found.")
  } else {
    print("Missing values per column:")
    print(missing_values)
    #print() displays escape sequences literally, so a leading "\n" would show
    #up as the two characters \n in the output instead of starting a new line
    print(paste("Total missing values:", total_missing))
  }
}
check_missing_values(test)
## [1] "Missing values per column:"
##            id           day      pressure       maxtemp   temparature 
##             0             0             0             0             0 
##       mintemp      dewpoint      humidity         cloud      sunshine 
##             0             0             0             0             0 
## winddirection     windspeed 
##             1             0 
## [1] "Total missing values: 1"

One missing value was found in the test data, in the “winddirection” column.

#Find the mode (most frequent non-missing value) of a column.
#  column: a vector that may contain NAs.
#Returns the value with the highest count; when several values tie, the one
#that appears first in the column wins. If there are no non-missing values,
#a single NA of the same type is returned so that a later assignment of the
#result does not fail on a zero-length replacement.
find_mode_na <- function(column) {
  observed <- column[!is.na(column)] #remove NAs.
  if (length(observed) == 0L) {
    return(observed[NA_integer_])
  }
  unique_values <- unique(observed)
  #match() maps each value to its position in unique_values; tabulate() then
  #counts how often each position occurs
  counts <- tabulate(match(observed, unique_values))
  unique_values[which.max(counts)]
}

#Most frequent non-missing wind direction in the test data
mode_winddirection <- find_mode_na(test[["winddirection"]])
mode_message <- paste("Winddirection Mode:", mode_winddirection)
print(mode_message)
## [1] "Winddirection Mode: 70"

The most frequent wind direction in the test data is 70.

#Fill every missing winddirection entry in the test data with the mode
test$winddirection <- replace(
  test$winddirection,
  is.na(test$winddirection),
  mode_winddirection
)

EDA

The correlation matrix suggests that cloud cover may be an indicator of rain on a given day.

These two plots show that both temperature and humidity tend to be high: the majority of days have a temperature between 25 and 30 °C, and humidity is also high on those days.

A closer look at the relationship between cloud cover and rainfall tells us that a higher cloud cover means a higher chance of rain.

Model Development

#Baseline logistic regression: rainfall against every other column of train
#NOTE(review): `.` also brings in `id`, which looks like a row identifier
#rather than a weather measurement -- confirm it belongs among the predictors
firstmodel <- glm(
  rainfall ~ .,
  family = binomial,
  data = train
)
summary(firstmodel)
## 
## Call:
## glm(formula = rainfall ~ ., family = binomial, data = train)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   29.3031788 23.4625055   1.249 0.211688    
## id             0.0001782  0.0001097   1.625 0.104267    
## day            0.0003850  0.0007219   0.533 0.593796    
## pressure      -0.0365393  0.0225717  -1.619 0.105488    
## maxtemp        0.0261924  0.0680023   0.385 0.700112    
## temparature   -0.0424855  0.1155562  -0.368 0.713126    
## mintemp       -0.0730375  0.0896446  -0.815 0.415219    
## dewpoint       0.1512790  0.0418215   3.617 0.000298 ***
## humidity       0.0416386  0.0125025   3.330 0.000867 ***
## cloud          0.0638138  0.0062240  10.253  < 2e-16 ***
## sunshine      -0.1516206  0.0321242  -4.720 2.36e-06 ***
## winddirection -0.0002496  0.0011365  -0.220 0.826138    
## windspeed      0.0146435  0.0077181   1.897 0.057789 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2446.4  on 2189  degrees of freedom
## Residual deviance: 1440.5  on 2177  degrees of freedom
## AIC: 1466.5
## 
## Number of Fisher Scoring iterations: 5

With rainfall as the response variable and all remaining variables as predictors, dewpoint, humidity, cloud, and sunshine are the only highly significant terms (p < 0.001), so the next model keeps just these four.

#Reduced logistic regression using only dewpoint, humidity, cloud and sunshine
model1 <- glm(
  rainfall ~ dewpoint + humidity + cloud + sunshine,
  family = binomial,
  data = train
)
summary(model1)
## 
## Call:
## glm(formula = rainfall ~ dewpoint + humidity + cloud + sunshine, 
##     family = binomial, data = train)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.167210   0.962996  -8.481  < 2e-16 ***
## dewpoint     0.099920   0.015362   6.504 7.80e-11 ***
## humidity     0.042311   0.012079   3.503  0.00046 ***
## cloud        0.064695   0.006143  10.532  < 2e-16 ***
## sunshine    -0.164520   0.030540  -5.387 7.16e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2446.4  on 2189  degrees of freedom
## Residual deviance: 1452.4  on 2185  degrees of freedom
## AIC: 1462.4
## 
## Number of Fisher Scoring iterations: 5

Submission

#Make predictions
#Predicted probabilities for the training rows. predict() on a glm takes the
#data frame through `newdata`; a `data` argument is not recognised and is
#silently absorbed by `...`, so it must not be used here.
model1_predictions <- predict(model1, newdata = train, type = "response")
#Predicted probabilities for the test rows
test_predictions <- predict(model1, newdata = test, type = "response")
#Classify a row as 1 when its predicted probability exceeds 0.5, otherwise 0
#NOTE(review): if the submission is scored on a ranking metric such as AUC,
#submitting test_predictions directly may be preferable -- confirm
predicted_class <- ifelse(test_predictions > 0.5, 1, 0)
head(predicted_class)
## 1 2 3 4 5 6 
## 1 1 1 0 0 1
#Build the submission table: one row per test id with its predicted class
submission_3 <- data.frame(
  id = test$id,
  rainfall = predicted_class
)
head(submission_3)
##     id rainfall
## 1 2190        1
## 2 2191        1
## 3 2192        1
## 4 2193        0
## 5 2194        0
## 6 2195        1
#Save the submission table as CSV without the row-name column
write.csv(
  submission_3,
  file = "submission_3.csv",
  row.names = FALSE
)