The Malaysian population exhibits significant variation across its states, both in terms of population growth rates and demographic characteristics such as age distribution and ethnicity. These differences can impact resource allocation and planning, particularly for healthcare, education, housing, and other public services. This project seeks to address the disparities between states by forecasting population growth at the state level and categorizing states based on population density.
## Loading required package: httr
## Warning: package 'httr' was built under R version 4.4.2
## Loading required package: readr
## Warning: package 'readr' was built under R version 4.4.2
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'janitor' was built under R version 4.4.2
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
## Warning: package 'lubridate' was built under R version 4.4.2
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Warning: package 'ggplot2' was built under R version 4.4.2
## Warning: package 'viridis' was built under R version 4.4.2
## Loading required package: viridisLite
## Warning: package 'viridisLite' was built under R version 4.4.2
## Warning: package 'corrplot' was built under R version 4.4.2
## corrplot 0.95 loaded
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:httr':
##
## progress
## Warning: package 'randomForest' was built under R version 4.4.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
## Warning: package 'prophet' was built under R version 4.4.2
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.4.2
## Loading required package: rlang
## state date sex age ethnicity population
## 1 Johor 1970-01-01 both overall overall 1325.6
## 2 Johor 1970-01-01 both 0-4 overall 210.1
## 3 Johor 1970-01-01 both 5-9 overall 215.7
## 4 Johor 1970-01-01 both 10-14 overall 192.2
## 5 Johor 1970-01-01 both 15-19 overall 152.8
## 6 Johor 1970-01-01 both 20-24 overall 110.7
## [1] 257295 6
Get column names and their types to understand the structure
## [1] "state" "date" "sex" "age" "ethnicity"
## [6] "population"
## state date sex age ethnicity population
## "character" "character" "character" "character" "character" "numeric"
## state date sex age
## Length:257295 Length:257295 Length:257295 Length:257295
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## ethnicity population
## Length:257295 Min. : 0.00
## Class :character 1st Qu.: 0.30
## Mode :character Median : 3.30
## Mean : 35.86
## 3rd Qu.: 21.90
## Max. :7363.40
population_data <- population_data %>%
clean_names()
population_data$date <- as.Date(population_data$date, format = "%Y-%m-%d")
missing_values <- sum(is.na(population_data))
cat("Number of missing values in the dataset:", missing_values, "\n")
## Number of missing values in the dataset: 0
population_data <- population_data %>%
distinct()
population_data$population <- as.numeric(population_data$population)
population_data <- population_data %>%
filter(!is.na(population))
# Clean the Age column by trimming whitespace
population_data$age <- trimws(population_data$age)
# Now get the unique values
unique_age_groups <- unique(population_data$age)
# Print the unique age groups
print(unique_age_groups)
## [1] "overall" "0-4" "5-9" "10-14" "15-19" "20-24" "25-29"
## [8] "30-34" "35-39" "40-44" "45-49" "50-54" "55-59" "60-64"
## [15] "65-69" "70+" "70-74" "75-79" "80+" "80-84" "85+"
# Combine ages above 70 as '70+'
population_data <- population_data %>%
mutate(age = case_when(
age %in% c("70-74", "75-79", "80+","80-84", "85+") ~ "70+",
TRUE ~ age
))
cleaned_age_groups <- unique(population_data$age)
print(cleaned_age_groups)
## [1] "overall" "0-4" "5-9" "10-14" "15-19" "20-24" "25-29"
## [8] "30-34" "35-39" "40-44" "45-49" "50-54" "55-59" "60-64"
## [15] "65-69" "70+"
population_data <- population_data %>%
filter(population >= 0 & population < 1e8) # Assuming population can't be negative or over 100 million
population_data$ethnicity <- tolower(population_data$ethnicity)
summary(population_data)
## state date sex age
## Length:257295 Min. :1970-01-01 Length:257295 Length:257295
## Class :character 1st Qu.:1993-01-01 Class :character Class :character
## Mode :character Median :2004-01-01 Mode :character Mode :character
## Mean :2003-04-15
## 3rd Qu.:2014-01-01
## Max. :2024-01-01
## ethnicity population
## Length:257295 Min. : 0.00
## Class :character 1st Qu.: 0.30
## Mode :character Median : 3.30
## Mean : 35.86
## 3rd Qu.: 21.90
## Max. :7363.40
head(population_data)
## state date sex age ethnicity population
## 1 Johor 1970-01-01 both overall overall 1325.6
## 2 Johor 1970-01-01 both 0-4 overall 210.1
## 3 Johor 1970-01-01 both 5-9 overall 215.7
## 4 Johor 1970-01-01 both 10-14 overall 192.2
## 5 Johor 1970-01-01 both 15-19 overall 152.8
## 6 Johor 1970-01-01 both 20-24 overall 110.7
## Summary of the dataset:
## state date sex age
## Length:257295 Min. :1970-01-01 Length:257295 Length:257295
## Class :character 1st Qu.:1993-01-01 Class :character Class :character
## Mode :character Median :2004-01-01 Mode :character Mode :character
## Mean :2003-04-15
## 3rd Qu.:2014-01-01
## Max. :2024-01-01
## ethnicity population
## Length:257295 Min. : 0.00
## Class :character 1st Qu.: 0.30
## Mode :character Median : 3.30
## Mean : 35.86
## 3rd Qu.: 21.90
## Max. :7363.40
# Set the order for the age groups
age_order <- c("overall", "0-4", "5-9", "10-14", "15-19", "20-24", "25-29",
"30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60-64",
"65-69", "70+")
# Ensure 'age' is a factor with the specified order
population_data <- population_data %>%
mutate(age = factor(age, levels = age_order))
# Create the plot
ggplot(population_data, aes(x = age, y = population, fill = age)) +
geom_bar(stat = "identity") +
labs(
title = "Distribution of Population by Age Group",
x = "Age Group",
y = "Population"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none"
)
filtered_data <- population_data %>%
filter(age != "overall")
# Create a distribution chart of the population by age group
ggplot(filtered_data, aes(x = age, y = population, fill = age)) +
geom_bar(stat = "identity") +
labs(
title = "Distribution of Population by Age Group",
x = "Age Group",
y = "Population"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none"
)
population_trend <- population_data %>%
group_by(state, date) %>%
summarize(total_population = sum(population, na.rm = TRUE))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
ggplot(population_trend, aes(x = date, y = total_population, color = state)) +
geom_line() +
labs(title = "Population Trends Over Time by State",
x = "Year",
y = "Total Population") +
theme_minimal()
population_ethnicity <- population_data %>%
group_by(ethnicity) %>%
summarize(total_population = sum(population, na.rm = TRUE))
ggplot(population_ethnicity, aes(x = reorder(ethnicity, -total_population), y = total_population, fill = ethnicity)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Population by Ethnicity",
x = "Ethnicity",
y = "Total Population") +
theme_minimal()
state_population <- population_data %>%
group_by(state) %>%
summarize(total_population = sum(population, na.rm = TRUE))
ggplot(state_population, aes(x = reorder(state, -total_population), y = total_population, fill = state)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Total Population by State",
x = "State",
y = "Total Population") +
theme_minimal()
gender_distribution <- population_data %>%
group_by(sex) %>%
summarize(total_population = sum(population, na.rm = TRUE))
ggplot(gender_distribution, aes(x = sex, y = total_population, fill = sex)) +
geom_bar(stat = "identity") +
labs(title = "Gender Distribution of Population",
x = "Sex",
y = "Total Population") +
theme_minimal()
# Summarize data by date to get the total population on each date
date_summary <- population_data %>%
group_by(date) %>%
summarise(total_population = sum(population, na.rm = TRUE))
# Plotting the growth trends over time
ggplot(date_summary, aes(x = date, y = total_population)) +
geom_line(color = "blue", size = 1.2) + # Line for growth trend
geom_point(size = 2, color = "red") + # Add points for clarity
labs(title = "Population Growth Trend Over Time",
x = "Date",
y = "Total Population") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readability
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Summarize data by date to get the total population on each date
date_summary <- population_data %>%
group_by(date) %>%
summarise(total_population = sum(population, na.rm = TRUE)) %>%
ungroup()
# Filter data for the years between 1975 and 1990
date_summary_filtered <- date_summary %>%
filter(date >= as.Date("1975-01-01") & date <= as.Date("1990-12-31"))
# Calculate the percentage change in population relative to the first date for the filtered range
date_summary_filtered <- date_summary_filtered %>%
mutate(percentage_change = (total_population - lag(total_population)) / lag(total_population) * 100)
# Plotting the percentage growth trend over time for the filtered data
ggplot(date_summary_filtered, aes(x = date, y = percentage_change)) +
geom_line(color = "blue", size = 1.2) + # Line for growth trend
geom_point(size = 2, color = "red") + # Add points for clarity
labs(title = "Percentage Growth Trend from 1975 to 1990",
x = "Date",
y = "Percentage Change in Population") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readability
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
# Summarize data by state and date to get the total population on each date
state_date_summary <- population_data %>%
group_by(state, date) %>%
summarise(total_population = sum(population, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
# Filter data for the years between 1975 and 1990
state_date_summary_filtered <- state_date_summary %>%
filter(date >= as.Date("1975-01-01") & date <= as.Date("1990-12-31"))
# Calculate the percentage change in population relative to the first date for each state
state_date_summary_filtered <- state_date_summary_filtered %>%
group_by(state) %>%
mutate(percentage_change = (total_population - lag(total_population)) / lag(total_population) * 100) %>%
ungroup()
# Identify the date where the spike in growth occurs (e.g., large percentage change)
max_growth_state <- state_date_summary_filtered %>%
filter(percentage_change == max(percentage_change, na.rm = TRUE))
ggplot(state_date_summary_filtered, aes(x = date, y = state, fill = percentage_change)) +
geom_tile() +
scale_fill_viridis() + # Color scale for heatmap
labs(title = "Heatmap of Population Growth by State (1975-1990)",
x = "Date",
y = "State") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Filter and show only data for the year 1980
state_date_summary_1980 <- state_date_summary_filtered %>%
filter(format(date, "%Y") == "1980")
# Print out the growth trend for each state in 1980
print("Growth Trend for Each State in 1980:")
## [1] "Growth Trend for Each State in 1980:"
print(state_date_summary_1980)
## # A tibble: 14 × 4
## state date total_population percentage_change
## <chr> <date> <dbl> <dbl>
## 1 Johor 1980-01-01 13169. 105.
## 2 Kedah 1980-01-01 8959. 103.
## 3 Kelantan 1980-01-01 7166. 105.
## 4 Melaka 1980-01-01 3735 103.
## 5 Negeri Sembilan 1980-01-01 4614. 104.
## 6 Pahang 1980-01-01 6417. 109.
## 7 Perak 1980-01-01 14460. 102.
## 8 Perlis 1980-01-01 1189. 104.
## 9 Pulau Pinang 1980-01-01 7686 104.
## 10 Sabah 1980-01-01 8441. 108.
## 11 Sarawak 1980-01-01 10810. 105.
## 12 Selangor 1980-01-01 12192. 26.6
## 13 Terengganu 1980-01-01 4334. 106.
## 14 W.P. Kuala Lumpur 1980-01-01 7863. NA
Based on the heatmap and table above, all states except Selangor has a more than 100% increase in population size between the year 1979 and 1980. The factors of this increase may be due to:
Census Revision: The 1980 Census likely provided more accurate data than previous estimates, capturing previously underreported or uncounted populations.
Migration: Internal migration (e.g., urbanization) and international migration (e.g., foreign workers) likely contributed to population spikes.
Statistical Adjustments: Earlier estimates may have been inaccurate or based on extrapolation, with the 1980 Census offering more precise figures.
Economic Changes: Rapid industrialization and urban growth during the 1970s-1980s could have led to population increases, particularly in states with industrial hubs.
# Preprocess data: Convert categorical variables to numeric for correlation
population_data_numeric <- population_data %>%
mutate(sex = as.numeric(factor(sex)),
ethnicity = as.numeric(factor(ethnicity)),
state = as.numeric(factor(state)),
age = as.numeric(age))
# Compute correlation matrix
cor_matrix <- cor(population_data_numeric %>% select(population, age, sex, ethnicity, state))
# Plot heatmap of correlation matrix with correlation values displayed
corrplot(cor_matrix,
method = "color",
tl.cex = 0.8,
cl.cex = 0.8,
addCoef.col = "black", # Add correlation values in black color
title = "Correlation Heatmap of Population Data")
# Aggregate data by state and date to calculate the total population for each state
# Required to create a time-series dataset for population forecasting using Prophet
population_growth <- population_data %>%
group_by(state, date) %>%
summarise(total_population = sum(population, na.rm = TRUE), .groups = "drop")
# Define growth forecasting function to process and forecast population for each state
analyze_population_growth <- function(state_name) {
# Filter data for the specific state
state_data <- population_growth %>%
filter(state == state_name) %>%
mutate(date = as.Date(date)) %>%
arrange(date)
# Prepare data for Prophet by renaming the time column as ds and the target variable as y
# Prophet requires the dataset to have ds (date) and y (target variable) columns
prophet_data <- state_data %>%
rename(ds = date, y = total_population)
# Fit the Prophet model for each state's population data while suppressing daily and weekly seasonality.
suppressWarnings({
prophet_model <- prophet(prophet_data, weekly.seasonality = FALSE, daily.seasonality = FALSE)
})
# Generate future predictions (until 2030, yearly intervals) using make_future_dataframe
future <- make_future_dataframe(prophet_model, periods = 10, freq = "year") # Adjust periods for 10 years
forecast <- predict(prophet_model, future)
return(list(forecast = forecast, model = prophet_model))
}
#Apply growth analysis to all states
results <- lapply(unique(population_growth$state), function(state) {
analyze_population_growth(state)
})
## n.changepoints greater than number of observations. Using 11
# Combined results for all states into a single dataframe
forecasted_data <- do.call(rbind, lapply(seq_along(results), function(i) {
data.frame(
State = unique(population_growth$state)[i],
Year = as.integer(format(results[[i]]$forecast$ds, "%Y")),
Predicted = results[[i]]$forecast$yhat
)
}))
# Filter forecast data to show only future years (2021–2030)
future_data <- forecasted_data %>%
filter(Year >= 2021)
# Plot yearly population forecast for all states
ggplot(future_data, aes(x = Year, y = Predicted, color = State)) +
geom_line() +
labs(
title = "Yearly Population Growth Forecast for All States (2021-2030)",
x = "Year",
y = "Predicted Population"
) +
theme_minimal()
# Model Evaluation using RMSE (measure of how well the model fits the historical data)
# Calculated RMSE for each state to evaluate model performance and plotted RMSE values
state_metrics <- data.frame(
State = unique(population_growth$state),
RMSE = sapply(results, function(res) {
# RMSE calculation (only for illustrative purposes)
actual <- tail(res$model$history$y, n = 10)
predicted <- tail(res$forecast$yhat, n = 10)
sqrt(mean((actual - predicted)^2, na.rm = TRUE))
})
)
# RMSE plot
ggplot(state_metrics, aes(x = State, y = RMSE, fill = State)) +
geom_bar(stat = "identity") +
labs(title = "RMSE of Population Growth Forecasting by State", x = "State", y = "RMSE") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Add land area data to calculate density
land_area <- data.frame(
state = c("Johor", "Kedah", "Kelantan", "Melaka", "Negeri Sembilan", "Pahang",
"Pulau Pinang", "Perak", "Perlis", "Selangor", "Terengganu", "Sabah", "Sarawak",
"W.P. Kuala Lumpur", "W.P. Labuan", "W.P. Putrajaya"),
land_area_km2 = c(19210, 9500, 15099, 1664, 6686, 35964, 1048, 21035, 821, 8104,
13035, 73631, 124450, 243, 92, 49)
)
# Source: https://en.wikipedia.org/wiki/States_and_federal_territories_of_Malaysia
# Merge land area into population_data
population_data <- population_data %>%
left_join(land_area, by = "state")
# Verify if land_area_km2 exists and is valid
if (!"land_area_km2" %in% colnames(population_data)) {
stop("land_area_km2 column is missing in the dataset.")
}
if (any(is.na(population_data$land_area_km2) | population_data$land_area_km2 == 0)) {
stop("Missing or invalid land_area_km2 values detected.")
}
# Scale and calculate population density
population_data <- population_data %>%
mutate(
population = population * 1000, # Scale population to raw counts
population_density = population / land_area_km2 # Calculate population density
)
# Reclassify density categories
population_data <- population_data %>%
mutate(
density_category = case_when(
population_density < 100 ~ "low",
population_density < 1000 ~ "medium",
TRUE ~ "high"
) %>%
factor(levels = c("low", "medium", "high"))
)
# Aggregate data by state to calculate aggregated population, unique land area, and density
state_level_data <- population_data %>%
group_by(state) %>%
summarise(
population = sum(population, na.rm = TRUE),
land_area_km2 = unique(land_area_km2), # Assume consistent land_area_km2 per state
population_density = population / land_area_km2
)
# Train-Test split 80-20
set.seed(123)
train_index <- createDataPartition(population_data$density_category, p = 0.8, list = FALSE)
train_data <- population_data[train_index, ]
test_data <- population_data[-train_index, ]
# Random forest model using population and land_area_km2 as predictors and density_category as the target
rf_model <- randomForest(density_category ~ population + land_area_km2, data = train_data)
# Predictions and evaluation
rf_pred <- predict(rf_model, test_data)
rf_confusion <- confusionMatrix(rf_pred, test_data$density_category)
print("Random Forest Performance:")
## [1] "Random Forest Performance:"
print(rf_confusion)
## Confusion Matrix and Statistics
##
## Reference
## Prediction low medium high
## low 50002 1 0
## medium 1 1356 0
## high 0 0 97
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9999, 1)
## No Information Rate : 0.9717
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9993
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: low Class: medium Class: high
## Sensitivity 1.0000 0.99926 1.000000
## Specificity 0.9993 0.99998 1.000000
## Pos Pred Value 1.0000 0.99926 1.000000
## Neg Pred Value 0.9993 0.99998 1.000000
## Prevalence 0.9717 0.02637 0.001885
## Detection Rate 0.9717 0.02635 0.001885
## Detection Prevalence 0.9717 0.02637 0.001885
## Balanced Accuracy 0.9996 0.99962 1.000000
# Classify states by population density
state_level_data <- state_level_data %>%
mutate(predicted_category = predict(rf_model, newdata = state_level_data))
# Summarize states by predicted category
states_by_category <- state_level_data %>%
group_by(predicted_category) %>%
summarise(states = paste(unique(state), collapse = ", "))
# Print classification summary
print("States by Population Density Category:")
## [1] "States by Population Density Category:"
print(states_by_category)
## # A tibble: 3 × 2
## predicted_category states
## <fct> <chr>
## 1 low Sabah, Sarawak
## 2 medium Johor, Kedah, Kelantan, Melaka, Negeri Sembilan, Pahang, P…
## 3 high W.P. Kuala Lumpur, W.P. Labuan, W.P. Putrajaya