Homework 2 Decision Trees

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Raw GDP / hours-worked table, read from a local download
data <- read.csv(
  file = "C:\\Users\\NCC-1701D\\Downloads\\archive (3)\\gdp_over_hours_worked.csv"
)

# Reference table of country names, read from a separate file
countries <- read.csv(file = "C:\\Users\\NCC-1701D\\Downloads\\countries.csv")


# Keep only the rows whose `country` value appears in the reference table's
# `name` column; this excludes continent subsets and similar aggregate entries
filtered_data <- data[is.element(data$country, countries$name), ]

# Collapse the time series to one row per country: every metric listed below
# is replaced by its mean over all of that country's rows (NAs ignored) and
# stored under the same name with an "avg_" prefix.
averages_by_country <- filtered_data %>%
  group_by(country) %>%
  summarize(
    across(
      c(
        pop,
        labor_force,
        gdp_ppp_c,
        gdp_ppp,
        gdp,
        gdp_c,
        unemployment_r,
        pop_over_65,
        working_age_pop_pct,
        employment_rate,
        hours_worked,
        total_hours,
        hours_per_employed,
        employed,
        total_hours_alternative,
        gdp_over_k_hours_worked,
        gdp_ppp_over_k_hours_worked,
        gdp_over_pop,
        gdp_ppp_over_pop,
        gdp_ppp_over_labor_force,
        gdp_ppp_over_pop_c,
        gdp_over_pop_c,
        gdp_ppp_over_k_hours_worked_c
      ),
      function(values) mean(values, na.rm = TRUE),
      .names = "avg_{.col}"
    )
  )

Summary statistics and exploratory analysis

# Each table below sorts the per-country averages in descending order of one
# metric and keeps the first 15 rows.

# Ranked by average GDP
top_15_avg_gdp <- head(
  arrange(averages_by_country, desc(avg_gdp)),
  n = 15
)

# Ranked by average population
top_15_avg_pop <- head(
  arrange(averages_by_country, desc(avg_pop)),
  n = 15
)

# Ranked by average labor force
top_15_avg_labor_force <- head(
  arrange(averages_by_country, desc(avg_labor_force)),
  n = 15
)

# Ranked by average hours worked
top_15_avg_hours_worked <- head(
  arrange(averages_by_country, desc(avg_hours_worked)),
  n = 15
)

# bar graphs for each of the top 15 averages
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.2

# Build a flipped (horizontal) bar chart of one numeric column per category.
#
# Arguments:
#   data       - data frame containing the columns to plot
#   x          - unquoted column supplying the bar categories
#   y          - unquoted numeric column supplying the bar lengths; categories
#                are ordered by descending y via reorder()
#   title      - plot title
#   x_label    - label for the x aesthetic (the category axis)
#   y_label    - label for the y aesthetic (the value axis)
#   fill_color - fill colour applied to every bar
#
# Returns the ggplot object; it is only drawn when printed.
create_bar_graph <- function(data, x, y, title, x_label, y_label, fill_color) {
  bar_mapping <- aes(x = reorder({{ x }}, -{{ y }}), y = {{ y }})
  bar_layer <- geom_bar(stat = "identity", fill = fill_color)
  text_labels <- labs(title = title, x = x_label, y = y_label)
  tilted_axis_text <- theme(axis.text.x = element_text(angle = 45, hjust = 1))

  ggplot(data, bar_mapping) +
    bar_layer +
    text_labels +
    tilted_axis_text +
    coord_flip()
}

# One chart per top-15 table, each with its own title, axis labels and colour
gdp_bar_graph <- create_bar_graph(
  data = top_15_avg_gdp,
  x = country,
  y = avg_gdp,
  title = "Top 15 Countries by Average GDP",
  x_label = "Country",
  y_label = "Average GDP (in USD)",
  fill_color = "blue"
)
pop_bar_graph <- create_bar_graph(
  data = top_15_avg_pop,
  x = country,
  y = avg_pop,
  title = "Top 15 Countries by Average Population",
  x_label = "Country",
  y_label = "Average Population",
  fill_color = "green"
)
labor_force_bar_graph <- create_bar_graph(
  data = top_15_avg_labor_force,
  x = country,
  y = avg_labor_force,
  title = "Top 15 Countries by Average Labor Force",
  x_label = "Country",
  y_label = "Average Labor Force",
  fill_color = "red"
)
hours_worked_bar_graph <- create_bar_graph(
  data = top_15_avg_hours_worked,
  x = country,
  y = avg_hours_worked,
  title = "Top 15 Countries by Average Hours Worked",
  x_label = "Country",
  y_label = "Average Hours Worked",
  fill_color = "purple"
)

# Render the four charts in order: GDP, population, labor force, hours worked.
# invisible() suppresses the list that lapply() would otherwise auto-print.
invisible(lapply(
  list(
    gdp_bar_graph,
    pop_bar_graph,
    labor_force_bar_graph,
    hours_worked_bar_graph
  ),
  print
))

### Creating Target Variable

# Binary target: 1 when the country appears in the top-15-by-average-GDP table,
# 0 otherwise (the logical membership test is converted straight to integer)
averages_by_country$in_top_15 <- as.integer(
  averages_by_country$country %in% top_15_avg_gdp$country
)

Data splitting into training and testing.

# Set the seed so the random split is reproducible
set.seed(1)

# Fraction of rows assigned to the training set; the remainder is for testing
split_ratio <- 0.8

# Total number of rows (one per country) available for splitting
total_rows <- nrow(averages_by_country)

# Sizes of the two partitions; together they always add up to total_rows
num_train_rows <- round(split_ratio * total_rows)
num_test_rows <- total_rows - num_train_rows

# Random permutation of the row indices. sample.int() draws the same
# permutation as sample(1:total_rows) but also handles zero rows, where
# 1:0 would yield c(1, 0).
row_indices <- sample.int(total_rows)

# The first num_train_rows shuffled indices form the training set and the last
# num_test_rows form the test set. head()/tail() return an empty selection for
# a size of 0, whereas (num_train_rows + 1):total_rows counts downwards when
# the test set is empty and would pull an NA row plus a training row into it.
train_data <- averages_by_country[head(row_indices, num_train_rows), ]
test_data <- averages_by_country[tail(row_indices, num_test_rows), ]

Use decision tree algorithm to train model and test

library(rpart)  # For decision tree modeling
library(caret)  # For data splitting

## Warning: package 'caret' was built under R version 4.3.2

## Loading required package: lattice

# Classification tree 1: demographic / unemployment predictors
tree_model1 <- rpart(
  formula = in_top_15 ~ avg_unemployment_r + avg_pop_over_65 +
    avg_working_age_pop_pct,
  data = train_data,
  method = "class"
)

# Classification tree 2: size-of-economy / labor predictors
tree_model2 <- rpart(
  formula = in_top_15 ~ avg_labor_force + avg_pop + avg_employment_rate +
    avg_hours_worked,
  data = train_data,
  method = "class"
)

# Predicted class for every held-out row, one vector per tree
predictions1 <- predict(tree_model1, newdata = test_data, type = "class")
predictions2 <- predict(tree_model2, newdata = test_data, type = "class")

# Cross-tabulate the true labels (rows) against the predicted labels (columns)
confusion_matrix1 <- table(
  Actual = test_data$in_top_15,
  Predicted = predictions1
)
confusion_matrix2 <- table(
  Actual = test_data$in_top_15,
  Predicted = predictions2
)

# Show the confusion matrix for the first tree
print(confusion_matrix1)

##       Predicted
## Actual  0  1
##      0 32  0
##      1  4  0

# Show the confusion matrix for the second tree
print(confusion_matrix2)

##       Predicted
## Actual  0  1
##      0 31  1
##      1  1  3

  library(randomForest)

## Warning: package 'randomForest' was built under R version 4.3.2

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:dplyr':
## 
##     combine

  # Drop training rows with a missing value in any column before fitting
  train_data <- na.omit(train_data)

  # Encode the 0/1 target as a factor so randomForest() fits a classification
  # forest. Left as an integer, the response is treated as numeric and a
  # regression forest is fit instead; that is what produced the recorded
  # "Are you sure you want to do regression?" warning and the fractional
  # "Predicted" values in the confusion matrix output further down, both of
  # which predate this fix. Both levels are declared explicitly so predictions
  # always use the labels 0 and 1.
  train_data$in_top_15 <- factor(train_data$in_top_15, levels = c(0, 1))

  # Fit a random forest model on the combined predictors of both decision trees
  rf_model <- randomForest(
    in_top_15 ~ avg_unemployment_r + avg_pop_over_65 +
      avg_working_age_pop_pct + avg_labor_force + avg_pop +
      avg_employment_rate + avg_hours_worked,
    data = train_data,
    ntree = 100
  )

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?

  # Score the held-out rows with the fitted forest
  predicted_labels <- predict(rf_model, newdata = test_data)

  # Cross-tabulate the true labels (rows) against the predictions (columns).
  # NOTE(review): the recorded table counts fewer rows than the test set,
  # presumably because test rows with missing predictors get NA predictions,
  # which table() drops by default. Confirm.
  confusion_matrix3 <- table(
    Actual = test_data$in_top_15,
    Predicted = predicted_labels
  )

  # Show the confusion matrix for the random forest
  print(confusion_matrix3)

##       Predicted
## Actual 0.0153333333333333 0.024 0.03 0.032 0.0673333333333333 0.159833333333333
##      0                  1     1    1     1                  1                 1
##      1                  0     0    0     0                  0                 0
##       Predicted
## Actual 0.368666666666667 0.567
##      0                 0     0
##      1                 1     1

The dataset that I chose includes the gross domestic product of different countries over a 42-year period, from 1980 to the present. I first had to clean the data, because it was a time series and it contained many “countries” that were really aggregates, such as “East Asia & Pacific” and “Low Income”, rather than individual nations. The exploratory analysis included graphs of the top 15 countries for a few of the variables that I wanted to use for my classification. For my target variable, I created a new column that indicates whether each row is in the top 15 by average GDP. It was interesting for me to see whether or not population, hours worked, labor force, and unemployment were deciding factors in predicting the top-GDP countries. There were many other variables to choose from, but most of them are derived from GDP, and since a decision tree's features should theoretically be independent of its target variable, I chose the ones used in the models above. I split the set into training and test data while including some of the positive target rows in the training set to improve the accuracy of the model. Comparing the two sets of variables, we can clearly see from the confusion matrices that one set was better at prediction than the other. In response to the article, there are some definite limitations when considering decision trees, but in my opinion the interpretability of the model combined with its accuracy is highly useful. We can conclude, at least from these results, that labor force, population, employment rate, and hours worked are better predictors than average unemployment, population over 65, and average working-age population when determining our target variable of whether or not a country is in the top 15 by average GDP over the time span. Considering also that there are only 3 true positives among the 36 countries sampled for my testing, this can be considered to be statistically significant.

Homework 2 Decision Trees

John Ledesma

2023-12-21

Summary statistics and exploratory analysis

Data splitting into training and testing.

Use decision tree algorithm to train model and test