# Inspect the structure of the training data: dimensions, column names,
# column types and the first few values of each column.
str(train)
## 'data.frame': 2190 obs. of 13 variables:
## $ id : int 0 1 2 3 4 5 6 7 8 9 ...
## $ day : int 1 2 3 4 5 6 7 8 9 10 ...
## $ pressure : num 1017 1020 1024 1013 1022 ...
## $ maxtemp : num 21.2 16.2 19.4 18.1 21.3 20.6 19.5 15.8 17.6 16.5 ...
## $ temparature : num 20.6 16.9 16.1 17.8 18.4 18.6 18.4 13.6 16.5 14.4 ...
## $ mintemp : num 19.9 15.8 14.6 16.9 15.2 16.5 15.3 12.7 15.6 12 ...
## $ dewpoint : num 19.4 15.4 9.3 16.8 9.6 12.5 11.3 11.8 12.5 8.6 ...
## $ humidity : num 87 95 75 95 52 79 56 96 86 77 ...
## $ cloud : num 88 91 47 95 45 81 46 100 100 84 ...
## $ sunshine : num 1.1 0 8.3 0 3.6 0 7.6 0 0 1 ...
## $ winddirection: num 60 50 70 60 40 20 20 50 50 50 ...
## $ windspeed : num 17.2 21.9 18.1 35.6 24.8 15.7 28.4 52.8 37.5 38.3 ...
## $ rainfall : int 1 1 1 1 0 1 0 1 1 0 ...
# Inspect the structure of the training data.
# NOTE(review): this repeats the str(train) call made at the top of the
# document and prints the same output — confirm whether str(test) was intended.
str(train)
## 'data.frame': 2190 obs. of 13 variables:
## $ id : int 0 1 2 3 4 5 6 7 8 9 ...
## $ day : int 1 2 3 4 5 6 7 8 9 10 ...
## $ pressure : num 1017 1020 1024 1013 1022 ...
## $ maxtemp : num 21.2 16.2 19.4 18.1 21.3 20.6 19.5 15.8 17.6 16.5 ...
## $ temparature : num 20.6 16.9 16.1 17.8 18.4 18.6 18.4 13.6 16.5 14.4 ...
## $ mintemp : num 19.9 15.8 14.6 16.9 15.2 16.5 15.3 12.7 15.6 12 ...
## $ dewpoint : num 19.4 15.4 9.3 16.8 9.6 12.5 11.3 11.8 12.5 8.6 ...
## $ humidity : num 87 95 75 95 52 79 56 96 86 77 ...
## $ cloud : num 88 91 47 95 45 81 46 100 100 84 ...
## $ sunshine : num 1.1 0 8.3 0 3.6 0 7.6 0 0 1 ...
## $ winddirection: num 60 50 70 60 40 20 20 50 50 50 ...
## $ windspeed : num 17.2 21.9 18.1 35.6 24.8 15.7 28.4 52.8 37.5 38.3 ...
## $ rainfall : int 1 1 1 1 0 1 0 1 1 0 ...
#Check for missing values in training data
# Report the number of missing values (NA) in each column of a data frame.
#
# df: a data frame (or matrix) to inspect.
#
# Prints "No missing values found." when the data are complete; otherwise
# prints the per-column NA counts followed by the overall total.
# Returns the named vector of per-column NA counts, invisibly.
check_missing_values <- function(df) {
  missing_values <- colSums(is.na(df))
  total_missing <- sum(missing_values)
  if (total_missing == 0) {
    print("No missing values found.")
  } else {
    print("Missing values per column:")
    print(missing_values)
    # print() shows escape sequences literally, so a leading "\n" in the
    # string would be displayed as backslash-n rather than as a line break.
    print(paste("Total missing values:", total_missing))
  }
  invisible(missing_values)
}
# Run the missing-value report on the training set.
check_missing_values(train)
## [1] "No missing values found."
There are no missing values in the training data.
#Check for missing values in test data
# Report the number of missing values (NA) in each column of a data frame.
# NOTE(review): this re-defines the function of the same name declared earlier
# in this document; a single definition would be enough.
#
# df: a data frame (or matrix) to inspect.
#
# Prints "No missing values found." when the data are complete; otherwise
# prints the per-column NA counts followed by the overall total.
# Returns the named vector of per-column NA counts, invisibly.
check_missing_values <- function(df) {
  missing_values <- colSums(is.na(df))
  total_missing <- sum(missing_values)
  if (total_missing == 0) {
    print("No missing values found.")
  } else {
    print("Missing values per column:")
    print(missing_values)
    # print() shows escape sequences literally, so a leading "\n" in the
    # string would be displayed as backslash-n rather than as a line break.
    print(paste("Total missing values:", total_missing))
  }
  invisible(missing_values)
}
# Run the missing-value report on the test set.
check_missing_values(test)
## [1] "Missing values per column:"
## id day pressure maxtemp temparature
## 0 0 0 0 0
## mintemp dewpoint humidity cloud sunshine
## 0 0 0 0 0
## winddirection windspeed
## 1 0
## [1] "\nTotal missing values: 1"
One missing value was found in the test data, in the column “winddirection”.
#Replace the missing value with the mode
#Find the mode of the column
# Return the most frequent non-missing value (the mode) of a vector.
#
# column: an atomic vector that may contain NAs; NAs are ignored.
#
# When several values share the highest count, the one that occurs first in
# `column` is returned, because unique() keeps first-appearance order and
# which.max() picks the first maximum.
find_mode_na <- function(column) {
  non_missing <- column[!is.na(column)]
  distinct <- unique(non_missing)
  # Map every observation to the index of its distinct value, then count
  # how often each index occurs.
  frequency <- tabulate(match(non_missing, distinct))
  distinct[which.max(frequency)]
}
# Most frequent non-missing winddirection value in the test set
mode_winddirection <- find_mode_na(test$winddirection)
# Show the value that will be used to fill the gap
print(paste("Winddirection Mode:", mode_winddirection))
## [1] "Winddirection Mode: 70"
The most frequent wind direction in the test data is 70.
# Replace every missing winddirection entry in the test set with the mode
# NOTE(review): the mode is computed from the test set itself; consider whether
# the training-set mode should be used instead — confirm.
test$winddirection[is.na(test$winddirection)] <- mode_winddirection
The correlation matrix suggests that cloud cover may be an indicator
of rain on a given day.
These two plots show that temperature and humidity are both on the
higher side: on a majority of days the temperature ranges from 25 to
30 °C, and humidity is high on those days.
A closer look at the relationship between cloud cover and rainfall shows that higher cloud cover is associated with a higher chance of rain.
# Baseline logistic regression: rainfall modelled on every other column of train
# NOTE(review): the `.` shorthand also brings in `id`, which looks like a
# running row index in the str() output — confirm it is meant to be a predictor.
firstmodel <- glm(rainfall~., family = binomial, data = train)
# Coefficient table, deviances and AIC for the baseline model
summary(firstmodel)
##
## Call:
## glm(formula = rainfall ~ ., family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 29.3031788 23.4625055 1.249 0.211688
## id 0.0001782 0.0001097 1.625 0.104267
## day 0.0003850 0.0007219 0.533 0.593796
## pressure -0.0365393 0.0225717 -1.619 0.105488
## maxtemp 0.0261924 0.0680023 0.385 0.700112
## temparature -0.0424855 0.1155562 -0.368 0.713126
## mintemp -0.0730375 0.0896446 -0.815 0.415219
## dewpoint 0.1512790 0.0418215 3.617 0.000298 ***
## humidity 0.0416386 0.0125025 3.330 0.000867 ***
## cloud 0.0638138 0.0062240 10.253 < 2e-16 ***
## sunshine -0.1516206 0.0321242 -4.720 2.36e-06 ***
## winddirection -0.0002496 0.0011365 -0.220 0.826138
## windspeed 0.0146435 0.0077181 1.897 0.057789 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2446.4 on 2189 degrees of freedom
## Residual deviance: 1440.5 on 2177 degrees of freedom
## AIC: 1466.5
##
## Number of Fisher Scoring iterations: 5
With rainfall as the response variable and all remaining variables as predictors, the results show that dewpoint, humidity, cloud, and sunshine are highly significant in predicting rainfall.
# Reduced logistic regression using only dewpoint, humidity, cloud and sunshine
model1 <- glm(rainfall~dewpoint+humidity+cloud+sunshine, family = binomial, data = train)
# Coefficient table, deviances and AIC for the reduced model
summary(model1)
##
## Call:
## glm(formula = rainfall ~ dewpoint + humidity + cloud + sunshine,
## family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.167210 0.962996 -8.481 < 2e-16 ***
## dewpoint 0.099920 0.015362 6.504 7.80e-11 ***
## humidity 0.042311 0.012079 3.503 0.00046 ***
## cloud 0.064695 0.006143 10.532 < 2e-16 ***
## sunshine -0.164520 0.030540 -5.387 7.16e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2446.4 on 2189 degrees of freedom
## Residual deviance: 1452.4 on 2185 degrees of freedom
## AIC: 1462.4
##
## Number of Fisher Scoring iterations: 5
# Make predictions
# Predicted probabilities on the training data. predict.glm() receives data
# through `newdata`; an argument named `data` is absorbed by `...` and ignored.
model1_predictions <- predict(model1, newdata = train, type = "response")
# Predicted probabilities for the test rows
test_predictions <- predict(model1, newdata = test, type = "response")
# Label a row 1 when its predicted probability exceeds 0.5, otherwise 0
predicted_class <- ifelse(test_predictions > 0.5, 1, 0)
head(predicted_class)
## 1 2 3 4 5 6
## 1 1 1 0 0 1
# Build the submission table: one row per test id with its predicted class
submission_3 <- data.frame(
  id = test$id,
  rainfall = predicted_class
)
head(submission_3)
## id rainfall
## 1 2190 1
## 2 2191 1
## 3 2192 1
## 4 2193 0
## 5 2194 0
## 6 2195 1
# Save the submission as CSV; row.names = FALSE keeps the row-name column
# out of the file.
write.csv(submission_3, "submission_3.csv", row.names = FALSE)