Functions & Loops

Assignment ~ Week 5

NIM: 52250007

SEMESTER 2

Yosef Teofani Tamba

🎓 Active students in the 2025/2026 academic year

Data Science Undergraduate at Institut Teknologi dan Sains Bandung (ITSB)

👨‍🏫 Lecturer

Mr. Bakti Siregar, M.Sc., CDS

📊 R Programming

💻 Python Programming

🤖 Data Science

📚 Currently in Semester 2 (2026) - Focus on Data Science Programming

1 Objective

Build multi-layer functions with nested loops and conditional logic.
Handle multi-dataset simulations.
Perform advanced statistics, data transformation, and visualization.
Develop an automated data science workflow.

2 TASK 1 - Dynamic Multi-Formula Function

Function: Create a function that can calculate various types of mathematical formulas simultaneously based on user input.

The function used is compute_formula(x, formula) for linear, quadratic, cubic, and exponential formulas.

2.1 Formula

From this code, it can be seen that there are two main parts: the definition of mathematical formulas as standalone functions, and helper functions that map text input to the relevant formula

# Formula definitions ----
# Each formula is a standalone function of a numeric input `x`; all of them
# use vectorized arithmetic, so `x` may be a single value or a whole vector.

# Linear: y = 2x + 5
calc_linear <- function(x) {
  2 * x + 5
}

# Quadratic: y = x^2 - 4x + 4
calc_quadratic <- function(x) {
  x^2 - 4 * x + 4
}

# Cubic: y = 0.1x^3 - 2x^2 + x
calc_cubic <- function(x) {
  0.1 * x^3 - 2 * x^2 + x
}

# Exponential: y = e^(0.3x)
calc_exponential <- function(x) {
  exp(0.3 * x)
}

# Helper function to map text input to the appropriate formula function.
#
# name: a single character string ("linear", "quadratic", "cubic" or
#       "exponential").
# Returns the matching function, or NULL when `name` is not one of the known
# formula names or is not a single, non-missing character string.
get_formula_func <- function(name) {
  # switch() selects by position when given a number and errors on input that
  # is not length 1, so anything that is not exactly one string is treated as
  # "no match" instead.
  if (!is.character(name) || length(name) != 1L || is.na(name)) {
    return(NULL)
  }

  switch(name,
    "linear"      = calc_linear,
    "quadratic"   = calc_quadratic,
    "cubic"       = calc_cubic,
    "exponential" = calc_exponential,
    NULL # Returns NULL if no match is found
  )
}

2.2 Data and Loop

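From this code, it can be seen that compute_data loops over the requested formula names, looks up each formula with get_formula_func, skips any unrecognized name with a message, evaluates the formula over the whole x range in a single vectorized call, and finally combines the per-formula results into one data frame.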

# DATA PROCESSING & LOOPING

# Evaluate several named formulas over the same x values.
#
# x_range:  numeric vector of x values.
# formulas: character vector of formula names understood by
#           get_formula_func().
# Returns one data frame with columns x, y and formula (the rows of every
# recognized formula stacked on top of each other), or NULL when none of the
# names is recognized. Unrecognized names are reported with a message and
# skipped.
compute_data <- function(x_range, formulas) {
  results_list <- list()

  for (form_name in formulas) {
    func <- get_formula_func(form_name)

    if (is.null(func)) {
      # sprintf keeps the name tight inside the quotes; paste() would insert
      # a space on either side of it.
      message(sprintf("Warning: Formula '%s' is unrecognized.", form_name))
      next
    }

    # Calculate y-values for the entire x_range (Vectorized)
    y_values <- func(x_range)

    # Store the result under the formula's name. The label is repeated
    # explicitly so that a zero-length x_range gives a zero-row frame rather
    # than a "differing number of rows" error.
    results_list[[form_name]] <- data.frame(
      x = x_range,
      y = y_values,
      formula = rep(form_name, length(x_range))
    )
  }

  # Combine all per-formula frames into a single data frame
  # (do.call(rbind, list()) is NULL when nothing was recognized)
  do.call(rbind, results_list)
}

# Run the computation ----
# "typo_test" is not a known formula name, so it triggers the
# unrecognized-formula message and is left out of the result.
selected_types <- c("linear", "quadratic", "cubic", "exponential", "typo_test")
x_input <- seq_len(50) # x values 1..50, a wider range for clearer visualization
result_data <- compute_data(x_range = x_input, formulas = selected_types)

## Warning: Formula ' typo_test ' is unrecognized.

# Interactive table of the computed results.
# DT column indices are 0-based and the row-name column counts as column 0,
# so the displayed columns are 0 = row names, 1 = x, 2 = y, 3 = formula.
datatable(
  result_data,
  options = list(
    pageLength = 10,              # Show 10 rows initially
    lengthMenu = c(10, 25, 50),   # Options for user to change row count
    order = list(list(3, "asc"))  # Sort by the 'formula' column
  ),
  caption = "Calculated Results Data"
)

2.3 Visualization

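From this visualization, we can see a line chart in which each line represents a mathematical formula. The X-axis ranges from 1 to 50 (the x_input defined above), and the Y-axis shows the calculation results on a base-10 logarithmic scale, so that the rapidly growing exponential curve can be compared with the other formulas in one chart. Note that a logarithmic axis can only display positive values, so points where a formula returns zero or a negative number (for example, the cubic formula for small x) cannot be drawn.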

# Plot every formula in `df` as a line on a log10-scaled y axis.
#
# df: data frame with columns x, y and formula (as produced by
#     compute_data()), or NULL.
# Returns a ggplot object, or NULL (after a message) when there is nothing
# that can be drawn.
plot_formulas_log <- function(df) {
  # Validate for empty data
  if (is.null(df) || nrow(df) == 0) {
    message("No valid data available to plot.")
    return(NULL)
  }

  # log10 is undefined for y <= 0; drop those rows explicitly and say so,
  # instead of leaving it to the scale transformation to turn them into
  # NaN / -Inf with a warning.
  non_positive <- !is.na(df$y) & df$y <= 0
  if (any(non_positive)) {
    message(sprintf(
      "Dropping %d row(s) with y <= 0 (not representable on a log scale).",
      sum(non_positive)
    ))
    df <- df[!non_positive, , drop = FALSE]
  }

  if (nrow(df) == 0) {
    message("No valid data available to plot.")
    return(NULL)
  }

  ggplot(df, aes(x = x, y = y, color = formula)) +
    # Make lines slightly transparent to reduce overlap in log scale
    geom_line(linewidth = 1, alpha = 0.9) +

    # Logarithmic y scale: equal distance between orders of magnitude
    # (1, 10, 100, 1000). label_comma() prints 1,000,000 instead of 1e+06.
    scale_y_log10(labels = label_comma(),
                  breaks = trans_breaks("log10", function(x) 10^x)) +

    # Discrete viridis palette for the formula lines
    scale_color_viridis_d(option = "turbo", name = "Formula Type") +

    labs(
      title = "Formula Comparison Chart (Logarithmic Scale)",
      subtitle = "Enables comparison between small vs. extremely large values",
      x = "X Axis (Input)",
      y = "Y Axis (Output - Log Scale)"
    ) +
    theme_minimal() +
    theme(
      legend.position = "bottom",
      plot.title = element_text(face = "bold"),
      # Minor grid lines help when reading values off the log scale
      panel.grid.minor.y = element_line(color = "grey95")
    )
}

# Display the plot: draw every formula in result_data on a log-scaled y axis
plot_formulas_log(result_data)

3 TASK 2 - Nested Simulation: Multi-Sales & Discounts

Simulate sales data, use nested functions for cumulative calculations, and apply automatic discounts.

Function: simulate_sales(n_salespeople, days)
producing: salesperson_id, day, sales_amount, discount_rate, cumulative_sales.

3.1 Simulation Function

This simulation is a prototype sales report that combines identity data, daily performance, incentive logic (discounts), and progress toward total targets into a single data frame.

# Simulate daily sales for several salespeople.
#
# n_salespeople: number of salespeople to simulate.
# days:          number of consecutive days simulated per salesperson.
# Returns a data frame with one row per salesperson and day and the columns
#   salesperson_id   "Salesperson <k>"
#   day              1..days
#   sales_amount     uniform random amount in [50, 500], rounded to 2 decimals
#   discount_rate    "20%" if amount > 400, "10%" if amount > 200, else "0%"
#   cumulative_sales running total of sales_amount within the salesperson
# When either count is below 1 an empty data frame with the same columns is
# returned.
simulate_sales <- function(n_salespeople, days) {
  # Guard first: `1:n` would iterate over c(1, 0) for n = 0.
  if (n_salespeople < 1 || days < 1) {
    return(data.frame(
      salesperson_id = character(0),
      day = integer(0),
      sales_amount = numeric(0),
      discount_rate = character(0),
      cumulative_sales = numeric(0)
    ))
  }

  # Internal function to calculate cumulative values
  calculate_cumulative <- function(vec) cumsum(vec)

  # Internal function building the rows of a single salesperson
  simulate_one <- function(id) {
    sales_amount <- round(runif(days, 50, 500), 2)

    # Discount Logic: 20% if > 400, 10% if > 200
    discount_rate <- ifelse(sales_amount > 400, 0.2,
                            ifelse(sales_amount > 200, 0.1, 0))

    data.frame(
      salesperson_id = paste("Salesperson", id),
      day = seq_len(days),
      sales_amount = sales_amount,
      discount_rate = paste0(discount_rate * 100, "%"),
      cumulative_sales = calculate_cumulative(sales_amount)
    )
  }

  # Build every salesperson's frame first and bind once, instead of growing
  # the result with rbind() on each iteration. Salespeople are generated in
  # id order, so the sequence of random draws is unchanged.
  per_person <- lapply(seq_len(n_salespeople), simulate_one)
  do.call(rbind, per_person)
}

# Execute function and store the results:
# 3 salespeople x 5 days; the amounts are random draws, so they differ
# between runs unless a seed is set beforehand
sales_data <- simulate_sales(n_salespeople = 3, days = 5)

3.2 Table Sales

The purpose of the following table is no longer simply to present numbers, but to communicate data in a way that makes it easier for the audience to grasp key information—such as who has the highest sales or how the cumulative trend is developing—through the use of color and clear grouping.

# Base table: caption plus striped / hover / condensed bootstrap styling
base_table <- sales_data %>%
  kbl(caption = "Daily Sales Simulation Report") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  )

# Colouring: dark header row, highlighted sales column, green cumulative column
colored_table <- base_table %>%
  row_spec(0, bold = TRUE, color = "white", background = "#2C3E50") %>% # Dark Blue Header
  column_spec(3, bold = TRUE, background = "#ECF0F1") %>%               # Sales Column
  column_spec(5, color = "#27AE60", bold = TRUE)                        # Cumulative Column (Green)

# Row groups: one group per consecutive run of the same salesperson_id, so the
# grouping follows the data instead of being fixed to 3 salespeople x 5 days.
group_runs <- rle(as.character(sales_data$salesperson_id))
group_index <- setNames(
  group_runs$lengths,
  paste("Salesperson Group", seq_along(group_runs$lengths))
)

final_table <- colored_table %>%
  pack_rows(index = group_index)

# Display table
final_table

Daily Sales Simulation Report
salesperson_id	day	sales_amount	discount_rate	cumulative_sales
Salesperson Group 1
Salesperson 1	1	72.32	0%	72.32
Salesperson 1	2	192.93	0%	265.25
Salesperson 1	3	497.17	20%	762.42
Salesperson 1	4	185.48	0%	947.90
Salesperson 1	5	356.07	10%	1303.97
Salesperson Group 2
Salesperson 2	1	204.05	10%	204.05
Salesperson 2	2	462.32	20%	666.37
Salesperson 2	3	445.51	20%	1111.88
Salesperson 2	4	135.78	0%	1247.66
Salesperson 2	5	338.56	10%	1586.22
Salesperson Group 3
Salesperson 3	1	353.65	10%	353.65
Salesperson 3	2	164.02	0%	517.67
Salesperson 3	3	235.69	10%	753.36
Salesperson 3	4	451.90	20%	1205.26
Salesperson 3	5	173.46	0%	1378.72

3.3 Visualization

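This simulation system demonstrates that, with the right visualization, data that was initially just a set of random numbers can be turned into a decision-making tool: the salesperson whose cumulative line ends highest is a candidate for a bonus, while those whose lines end lower may need additional training. In the run tabulated above, Salesperson 2 finishes highest (1586.22), ahead of Salesperson 3 (1378.72) and Salesperson 1 (1303.97). Because the amounts are randomly generated, these figures can change from run to run unless a seed is fixed.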

# Cumulative sales per day, one coloured line (with point markers) per
# salesperson
ggplot(
  data = sales_data,
  mapping = aes(x = day, y = cumulative_sales, colour = salesperson_id)
) +
  labs(
    title = "Cumulative Sales Trend",
    x = "Day",
    y = "Total Sales",
    color = "Salesperson ID"
  ) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  theme_minimal()

4 TASK 3 - Multi-Level Performance Categorization

Categorize sales figures into 5 levels and display them as bar and pie charts.

Description: Function categorize_performance(sales_amount) with 5 categories (Excellent, Very Good, Good, Average, Poor).

4.1 Function and Data

Strategic Data Transformation: This code is a feature engineering process that transforms raw data into meaningful information (actionable insights) for businesses using categorize_performance.

# 0. Setup Sample Data (Required to run the code)
# Eight fixed sales amounts used as input for the categorization below.
# NOTE(review): this reuses the name `sales_data`, replacing the simulated
# data frame created in Task 2 - confirm that this is intended.
sales_data <- data.frame(
  sales_amount = c(450, 320, 250, 150, 50, 410, 380, 120)
)

# 1. Define the Function
# Categorize sales amounts into five performance levels.
#
# sales_vector: numeric vector of sales amounts.
# Returns a character vector of the same length:
#   >= 400 "Excellent", >= 300 "Very Good", >= 200 "Good",
#   >= 100 "Average", otherwise "Poor".
# Missing amounts give NA and an empty input gives character(0).
categorize_performance <- function(sales_vector) {
  level_labels <- c("Poor", "Average", "Good", "Very Good", "Excellent")
  lower_bounds <- c(100, 200, 300, 400)

  # findInterval() counts how many lower bounds each amount has reached
  # (0..4); adding 1 turns that count into an index into level_labels.
  level_labels[findInterval(sales_vector, lower_bounds) + 1L]
}

# 2. Execution: categorize every sales amount.
# The result is stored as 'performance_results', the name the frequency
# table in the next step reads from.
performance_results <- categorize_performance(sales_data$sales_amount)

4.2 Calculate Percentage

The performance results of this analysis are spread across all five levels: half of the observations are rated “Very Good” or “Excellent”, while only one is rated “Poor”. The company’s focus can therefore be on lifting the remaining “Poor” and “Average” results toward “Good” or better. Use prop.table(counts) to convert absolute numbers into proportions.

# 3. Frequency table: how many sales fall into each category
#    (table() orders the categories alphabetically)
counts <- table(performance_results)

# 4. Share of each category in percent, rounded to one decimal place
percent <- round(100 * prop.table(counts), digits = 1)

# 5. Summary table: category name, count and formatted percentage
summary_table <- data.frame(
  Category   = names(counts),
  Count      = as.vector(counts),
  Percentage = paste0(as.vector(percent), "%")
)

# Display result
print(summary_table)

##    Category Count Percentage
## 1   Average     2        25%
## 2 Excellent     2        25%
## 3      Good     1      12.5%
## 4      Poor     1      12.5%
## 5 Very Good     2        25%

4.3 Visualization

4.3.1 Bar Chart

This dataset shows a fairly even distribution, though it leans toward the positive end. Fifty percent of the total data falls into the “Very Good” and “Excellent” categories (sales of 300 units or more). However, it should be noted that there is still one data point in the “Poor” category, which may require further evaluation to determine the cause of the low sales figures.

# Sort Categories (Important to ensure they are arranged in a logical order)
summary_table$Category <- factor(
  summary_table$Category,
  levels = c("Poor", "Average", "Good", "Very Good", "Excellent")
)

# Interactive bar chart of the number of sales per performance category.
# Percentage already ends in "%" (it is built with paste0(..., "%") in the
# summary table), so the hover text must not append another one.
fig <- plot_ly(
  data = summary_table,
  x = ~Category,
  y = ~Count,
  type = "bar",
  marker = list(color = 'skyblue',
                line = list(color = 'white', width = 1.5)), # Add border
  hoverinfo = 'text',
  text = ~paste("<b>Category:</b> ", Category,
                "<br><b>Quantity:</b> ", Count,
                "<br><b>Percentage:</b> ", Percentage)
)

# Tidy up the layout
fig <- fig %>% layout(
  title = list(text = "<b> </b>", y = 0.95),
  xaxis = list(title = "Performance Category", tickangle = 0),
  yaxis = list(title = "Total Sales Count", gridcolor = '#ebebeb'),
  plot_bgcolor = 'rgba(0,0,0,0)', # Transparent background
  margin = list(t = 50)
)

# Show the visualization
fig

4.3.2 Pie Chart

Visually, the brightly colored areas (Yellow, Purple, Red) are much larger than the Blue and Green areas. This gives the positive impression that, proportionally, the “Poor” performance category represents only a small fraction (1/8) of the entire dataset analyzed.

# Interactive pie chart of the category counts. Slice labels show the
# category and its share; the hover text shows count and percentage.
fig_pie <- plot_ly(
  data = summary_table,
  type = "pie",
  labels = ~Category,
  values = ~Count,
  text = ~paste("Performance: ", Category,
                "<br>Sales Count: ", Count,
                "<br>Overall Percentage: ", Percentage),
  hoverinfo = "text",
  textinfo = "label+percent",
  textposition = "inside",
  insidetextfont = list(color = "#FFFFFF"),
  marker = list(
    colors = rainbow(nrow(summary_table)),
    line = list(color = "#FFFFFF", width = 1)
  )
) %>%
  # Layout: title, with axis grid lines, zero lines and tick labels hidden
  layout(
    title = "Performance Distribution (Sales Count)",
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
  )

# Display the interactive pie chart
fig_pie

5 TASK 4 - Multi-Company Dataset Simulation

Simulate a multi-level dataset using nested loops and conditional logic, then summarize the results. Use the function generate_company_data(n_company, n_employees) to generate: company_id, employee_id, salary, department, performance_score, KPI_score.

5.1 Function Definition

This R code is designed to simulate human resources (HR) data across multiple companies simultaneously. By combining looping techniques to generate random data and the dplyr library for data processing, this code makes it easier to identify salary patterns and employee performance trends without needing actual data first.

# Define the number of companies and employees
# (n_employees is the number of employees generated per company)
n_company <- 3
n_employees <- 20
# Departments an employee can be randomly assigned to
departments <- c("Sales", "IT", "HR", "Marketing", "Finance")

# Prepare an empty data container that the generation loop appends to
all_data <- data.frame()

5.2 Looping

This section of the code is the main “engine” responsible for generating data. The process begins with an outer loop that determines which company is currently being processed (for example, Company 1, 2, or 3). For each of these companies, an inner loop is executed to create employee profiles one by one.

# Generate one row per employee with a nested loop (companies x employees).
# Rows are collected in a preallocated list and bound once at the end rather
# than growing all_data with rbind() on every iteration. The random draws are
# made in the same order as before: salary, department, performance score,
# KPI score.
employee_rows <- vector("list", n_company * n_employees)
row_index <- 0L

# Outer Loop: For each Company
# (seq_len() is safe for a count of 0; the index is not called `c`, which
# would shadow base::c)
for (company_idx in seq_len(n_company)) {
  company_id <- paste0("COMP_", company_idx)

  # Inner Loop: For each Employee within that company
  for (employee_idx in seq_len(n_employees)) {
    employee_id <- paste0("EMP_", company_idx, "_", employee_idx)
    salary <- round(runif(1, 4000, 15000), 2)
    department <- sample(departments, 1)
    performance_score <- round(runif(1, 50, 100), 1)
    KPI_score <- round(runif(1, 50, 100), 1)

    # Conditional Logic: Determine Top Performer (scalar test, so plain
    # if/else rather than the vectorized ifelse())
    is_top_performer <- if (KPI_score > 90) "Yes" else "No"

    row_index <- row_index + 1L
    employee_rows[[row_index]] <- data.frame(
      company_id, employee_id, salary, department,
      performance_score, KPI_score, is_top_performer
    )
  }
}

# Append the generated rows to the master table in a single rbind()
all_data <- do.call(rbind, c(list(all_data), employee_rows))

# Display the first 6 rows of the generated data
head(all_data)

5.3 Summary

In this code block, the dplyr package is used to perform data transformation. The process begins with the group_by(company_id) function, which groups the data rows by company. After grouping, the summarise() function calculates specific statistics for each group. Here, the system calculates the average salary, the average performance score, and identifies the highest KPI value within each company. The final result is no longer a list of individual records, but a summary table (tibble) that shows the collective performance of each company entity.

# Per-company summary: mean salary, mean performance score and highest KPI
# score, one row per company_id
summary_table <- all_data %>%
  dplyr::group_by(company_id) %>%
  dplyr::summarise(
    avg_salary      = mean(salary),
    avg_performance = mean(performance_score),
    max_KPI         = max(KPI_score)
  )

# Print the summary to the console
print(summary_table)

## # A tibble: 3 × 4
##   company_id avg_salary avg_performance max_KPI
##   <chr>           <dbl>           <dbl>   <dbl>
## 1 COMP_1          9911.            77.2    95.1
## 2 COMP_2          9490.            75.5    97.6
## 3 COMP_3          9050.            69.2    97.9

5.4 Visualization

Use ggplot2, which operates on a “Grammar of Graphics” principle—stacking layers of information. We map the company_id to the horizontal axis (X) and the avg_salary to the vertical axis (Y). By using geom_bar(stat = "identity"), we ensure the height of each bar represents the actual value from our summary.

# 1. Create a Bar Chart for Average Salary per Company
# The subtitle reports the number of simulated employees taken from all_data,
# so it stays correct when the company or employee counts change.
salary_plot <- ggplot(summary_table, aes(x = company_id, y = avg_salary, fill = company_id)) +
  geom_bar(stat = "identity", color = "black", width = 0.7) +
  geom_text(aes(label = round(avg_salary, 0)), vjust = -0.5) + # Add data labels above bars
  theme_minimal() +
  scale_fill_brewer(palette = "Set2") + # ColorBrewer "Set2" palette
  labs(
    title = "Comparison of Average Salary Across Companies",
    subtitle = paste("Based on simulated data for", nrow(all_data), "employees"),
    x = "Company ID",
    y = "Average Salary (USD)",
    fill = "Company"
  )

# 2. Display the plot
print(salary_plot)

6 TASK 5 - Monte Carlo Simulation: Pi & Probability

Using repeated random simulations (Monte Carlo) to estimate the value of \(\pi\) and spatial probabilities. Using the function monte_carlo_pi(n_points) with additional probability analysis.

6.1 Parameters

This part sets up the “environment.” We define n_points as 2000, meaning we will “throw” 2000 random dots onto a coordinate plane to see where they land.

# Define the number of iterations (random points to generate)
n_points <- 2000

# Initialize counters: points that land inside the unit circle and points
# that land inside the sub-square
points_inside_circle <- 0
points_in_subsquare <- 0

# Preallocate vectors for the coordinates and the inside/outside label of
# every point (filled by index in the simulation loop)
x_vals <- numeric(n_points)
y_vals <- numeric(n_points)
status <- character(n_points)

6.2 Simulation Process (Looping)

set.seed(123) # Fixed seed so the simulation gives the same result on every run

# Draw n_points random points in the square [-1, 1] x [-1, 1], record each
# one, and count how many fall inside the unit circle and inside the
# sub-square [0, 0.5] x [0, 0.5].
for (i in seq_len(n_points)) {
  # Generate random x and y coordinates between -1 and 1
  x <- runif(1, min = -1, max = 1)
  y <- runif(1, min = -1, max = 1)

  x_vals[i] <- x
  y_vals[i] <- y

  # Check if the point is inside the circle using the Pythagorean formula: x^2 + y^2 <= r^2
  if (x^2 + y^2 <= 1) {
    points_inside_circle <- points_inside_circle + 1
    status[i] <- "Inside Circle"
  } else {
    status[i] <- "Outside Circle"
  }

  # Sub-square analysis (Quadrant 1, range 0 to 0.5).
  # x and y are scalars, so the short-circuiting && is the right operator in
  # an if() condition (& is the elementwise form for vectors).
  if (x >= 0 && x <= 0.5 && y >= 0 && y <= 0.5) {
    points_in_subsquare <- points_in_subsquare + 1
  }
}

The essence of Monte Carlo is randomness. This loop checks if a point falls inside a circle (radius of 1) or inside a specific smaller box (sub-square).

6.3 Calculation and Textual Output

# Probability of a point landing in the sub-square
subsquare_prob <- points_in_subsquare / n_points

# Pi estimate: the share of points inside the circle approximates
# (area of circle / area of square) = pi / 4, hence the factor 4
pi_estimate <- (points_inside_circle / n_points) * 4

# Summary table: one row per metric
summary_results <- data.frame(
  Metric = c("Total Points", "Pi Estimate", "Sub-square Prob"),
  Value  = c(n_points, pi_estimate, subsquare_prob)
)

# Print the summary table
print(summary_results)

##            Metric    Value
## 1    Total Points 2000.000
## 2     Pi Estimate    3.108
## 3 Sub-square Prob    0.059

Pi Estimate: As n_points increases, this value will get closer to the true value of \(3.14159...\)
Sub-square Probability: Shows how often a point falls in the specific \(0.5 \times 0.5\) area. Theoretically, this area is \(0.0625\) (\(6.25\%\)) of the total \(2 \times 2\) square area.

6.4 Visualization

# Collect the simulated points and their inside/outside label
df_plot <- data.frame(
  x = x_vals,
  y = y_vals,
  Status = status
)

# Scatter plot of all points: blue inside the circle, red outside
ggplot(data = df_plot, mapping = aes(x = x, y = y, colour = Status)) +
  labs(
    title = paste("Monte Carlo Pi Visualization (n =", n_points, ")"),
    subtitle = paste("Pi Estimate =", round(pi_estimate, 4)),
    x = "X Coordinate",
    y = "Y Coordinate"
  ) +
  geom_point(alpha = 0.5) +
  scale_color_manual(
    values = c("Inside Circle" = "blue", "Outside Circle" = "red")
  ) +
  coord_fixed() + # Equal axis scaling, so the circle is not drawn as an oval
  theme_minimal()

Visual Output Explanation:

Blue Dots (Inside Circle): Points where the distance from the center \((0,0)\) is less than or equal to 1.
Red Dots (Outside Circle): Points that landed in the corners of the square but outside the circle’s boundary.
Pattern: You will notice a circle shape organically forming from the cluster of random points.

7 TASK 6 - Advance Data Transformation & Feature Engineering

Performing data standardization/normalization and creating new category features, which are very useful for processing historical data before modeling. Using the column-wise functions normalize_column(x) and z_score(x), applied to each selected column of the data frame, along with new feature creation.

7.1 Data Preparation

This section creates a company_data data frame containing salary and performance_score to mimic your original Task.

# Setting a seed for reproducibility
# NOTE(review): the values below are literals, so the seed does not change
# this data frame; it only fixes the RNG state for any later random draws.
set.seed(123)

# Creating simulation data: 10 employees with an id, a salary and a
# performance score
company_data <- data.frame(
  id = 1:10,
  salary = c(5000, 7000, 12000, 4500, 6000, 15000, 8000, 9000, 5500, 11000),
  performance_score = c(60, 75, 90, 55, 70, 95, 80, 85, 65, 88)
)

print(company_data)

##    id salary performance_score
## 1   1   5000                60
## 2   2   7000                75
## 3   3  12000                90
## 4   4   4500                55
## 5   5   6000                70
## 6   6  15000                95
## 7   7   8000                80
## 8   8   9000                85
## 9   9   5500                65
## 10 10  11000                88

7.2 Defining Normalization & Standardization Functions

# Min-Max Normalization: Rescales data to a range between 0 and 1
#
# x:     numeric vector.
# na.rm: if TRUE, missing values are ignored when computing min and max (they
#        stay NA in the result); with the default FALSE any NA in `x` makes
#        the whole result NA.
# Returns a numeric vector of the same length as `x`. A vector whose values
# are all identical has no range to rescale over and is mapped to 0 instead
# of the NaN that 0 / 0 would produce. Empty input gives numeric(0).
normalize_column <- function(x, na.rm = FALSE) {
  if (length(x) == 0) {
    return(numeric(0))
  }
  # Nothing to rescale when every value is missing (also avoids the
  # min()/max() warnings on an empty set)
  if (all(is.na(x))) {
    return(rep(NA_real_, length(x)))
  }

  lowest <- min(x, na.rm = na.rm)
  span <- max(x, na.rm = na.rm) - lowest

  if (!is.na(span) && span == 0) {
    return(ifelse(is.na(x), NA_real_, 0))
  }

  (x - lowest) / span
}

# Z-Score Standardization: Scales data so the Mean = 0 and SD = 1
#
# x:     numeric vector.
# na.rm: if TRUE, missing values are ignored when computing mean and SD (they
#        stay NA in the result); with the default FALSE any NA in `x` makes
#        the whole result NA.
# Returns a numeric vector of the same length as `x`. When the standard
# deviation is 0 (all values identical) the result is 0 instead of the NaN
# that 0 / 0 would produce. Empty input gives numeric(0).
z_score <- function(x, na.rm = FALSE) {
  if (length(x) == 0) {
    return(numeric(0))
  }

  centre <- mean(x, na.rm = na.rm)
  spread <- sd(x, na.rm = na.rm)

  if (!is.na(spread) && spread == 0) {
    return(ifelse(is.na(x), NA_real_, 0))
  }

  (x - centre) / spread
}

Key Concepts:

Min-Max: Ideal when you need a bounded range (0 to 1).
Z-Score: Ideal if your data contains outliers, as it measures how many standard deviations a value is from the mean.

7.3 Data Transformation & Feature Engineering

# Work on a copy so company_data itself stays untouched
df_transformed <- company_data

# Columns that get a normalized and a standardized companion column
cols_to_transform <- c("salary", "performance_score")

# For each target column add "<name>_norm" (min-max) and "<name>_zscore"
for (col_name in cols_to_transform) {
  raw_values <- df_transformed[[col_name]]
  df_transformed[[paste0(col_name, "_norm")]] <- normalize_column(raw_values)
  df_transformed[[paste0(col_name, "_zscore")]] <- z_score(raw_values)
}

# Feature Engineering: split salary into 3 equal-width intervals labelled
# Low / Medium / High
df_transformed$salary_bracket <- cut(
  df_transformed$salary,
  breaks = 3,
  labels = c("Low", "Medium", "High")
)

# Show the first rows of the transformed data
head(df_transformed)

You will see new columns. For example, salary_norm will contain values like \(0.25\), while salary_bracket will label employees as “Low”, “Medium”, or “High” earners.

7.4 Visualization of Results

7.4.1 Box Plot

This graph compares four variables: salary, performance_score, and their normalized versions (salary_norm and performance_score_norm). Using a box plot allows us to see statistical summaries such as the median and quartiles, while a jitter plot (black dots) shows the distribution of individual data points to detect data density or gaps in the distribution.

# Long format: one row per (variable, value) pair for the four columns that
# are compared in the box plot
plot_data <- df_transformed %>%
  dplyr::select(salary, performance_score, salary_norm, performance_score_norm) %>%
  tidyr::pivot_longer(
    cols = dplyr::everything(),
    names_to = "variable",
    values_to = "value"
  )

# Box plot per variable with the individual points jittered on top; the
# `text` aesthetic carries the tooltip shown by ggplotly()
p <- ggplot(
  plot_data,
  aes(
    x = variable,
    y = value,
    fill = variable,
    text = paste("Variable:", variable, "<br>Value:", round(value, 2))
  )
) +
  geom_boxplot(alpha = 0.75, outlier.color = "red", outlier.size = 2) +
  geom_jitter(width = 0.12, alpha = 0.5, color = "black", size = 2) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Box Plot of Transformed Data",
    x = "Variable",
    y = "Value"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

# Convert to an interactive plotly chart
ggplotly(p, tooltip = c("text", "y"))

7.4.2 Histogram

# Set plotting layout to 1 row and 2 columns. par() returns the previous
# settings, which are kept so they can be restored afterwards.
old_par <- par(mfrow = c(1, 2))

# Plot 1: Original Salary Distribution
hist(company_data$salary,
     main = "Original Salary",
     xlab = "Salary",
     col = "lightblue")

# Plot 2: Standardized Salary Distribution (Z-Score)
hist(df_transformed$salary_zscore,
     main = "Z-Score Salary",
     xlab = "Z-Score (Standard Deviations)",
     col = "lightgreen")

# Restore the previous layout so later base-graphics plots are not squeezed
# into the 1 x 2 grid
par(old_par)

Output Explanation: Left Histogram: Shows the frequency of salaries on their original scale (from 4,500 to 15,000).

Right Histogram: Shows the exact same distribution shape, but the X-axis is now centered at 0. This confirms the data has been successfully standardized without losing its original structure.

8 TASK 7 - Mini Project: Company KPI Dashboard & Simulation

Integrate large-scale data synthesis, cluster analysis, and advanced visualization (such as regression lines). Generate a dataset for 5–10 companies, each with 50–200 employees.

8.1 Data Preparation

The first step is to create the dataset (generate_company_data)

# Creating a simulation dataset: 5 companies x 50 employees
set.seed(123)
n_company <- 5
n_employees <- 50

# Built inside local() so the helper variables do not end up in the global
# environment. The random columns are drawn in the order department, KPI
# score, performance score, salary.
large_df <- local({
  n_total <- n_company * n_employees

  department <- sample(c("HR", "Tech", "Sales", "Finance"), n_total, replace = TRUE)
  kpi <- runif(n_total, 50, 100)
  performance <- runif(n_total, 1, 10)
  pay <- runif(n_total, 5000, 15000)

  data.frame(
    employee_id = seq_len(n_total),
    company_id = rep(paste0("Company_", seq_len(n_company)), each = n_employees),
    department = department,
    KPI_score = kpi,
    performance_score = performance,
    salary = pay
  )
})

# Output: Display the first 6 rows
head(large_df)

8.2 Employee Categorization (Logic & Looping)

# Categorize every employee by KPI score:
#   >= 85 "Tier 1 (Excellent)", >= 70 "Tier 2 (Good)",
#   otherwise "Tier 3 (Needs Improvement)".
# vapply() applies the if/else logic to each score and builds the whole
# column in one go. This avoids the `1:nrow()` index (which misbehaves for a
# zero-row frame) and repeated per-row writes into the data frame.
large_df$KPI_tier <- vapply(
  large_df$KPI_score,
  function(score) {
    if (score >= 85) {
      "Tier 1 (Excellent)"
    } else if (score >= 70) {
      "Tier 2 (Good)"
    } else {
      "Tier 3 (Needs Improvement)"
    }
  },
  character(1)
)

# Output: Check category frequency
table(large_df$KPI_tier)

## 
##         Tier 1 (Excellent)              Tier 2 (Good) 
##                         74                         64 
## Tier 3 (Needs Improvement) 
##                        112

Output Description: The output is a frequency table showing how many employees fall into Tiers 1, 2, and 3. This confirms that your if-else logic is working correctly.

8.3 Visualization

8.3.1 Visualization: KPI Distribution per Department

We use a grouped bar chart (geom_bar with “dodge”) to compare employee tiers across different departments.

# Grouped bar chart: number of employees per department, one bar per KPI tier
p1 <- ggplot(data = large_df, mapping = aes(x = department, fill = KPI_tier)) +
  labs(
    title = "KPI Tier Distribution per Department",
    x = "Department",
    y = "Employee Count",
    fill = "KPI Category"
  ) +
  geom_bar(position = "dodge") +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal()

print(p1)

We can easily identify which department has the highest number of “Excellent” employees. An uneven distribution might suggest where performance improvements are needed.

8.3.2 Visualization: Correlation between Performance and Salary

Each dot represents an employee (colored by company). The black dashed line is the regression line; if it slopes upward to the right, it indicates a positive correlation between performance and pay.

# Scatter plot of salary against performance score, coloured by company,
# with a dashed black linear-regression line drawn over the points
p2 <- ggplot(
  data = large_df,
  mapping = aes(x = performance_score, y = salary, colour = company_id)
) +
  labs(
    title = "Relationship Between Performance and Salary",
    subtitle = "With Linear Regression Line",
    x = "Performance Score",
    y = "Salary"
  ) +
  geom_point(alpha = 0.6, size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "dashed") +
  theme_minimal()

print(p2)

## `geom_smooth()` using formula = 'y ~ x'

8.4 Final Dashboard

This provides a statistical summary of all columns. Your final_dashboard_data is now fully processed and ready for further analysis or export.

# The dashboard dataset is the simulated employee table including the
# KPI_tier column added above
final_dashboard_data <- large_df

# Display final data summary (per-column statistics)
summary(final_dashboard_data)

##   employee_id      company_id         department          KPI_score    
##  Min.   :  1.00   Length:250         Length:250         Min.   :50.02  
##  1st Qu.: 63.25   Class :character   Class :character   1st Qu.:61.26  
##  Median :125.50   Mode  :character   Mode  :character   Median :73.10  
##  Mean   :125.50                                         Mean   :74.15  
##  3rd Qu.:187.75                                         3rd Qu.:87.25  
##  Max.   :250.00                                         Max.   :99.97  
##  performance_score     salary        KPI_tier        
##  Min.   :1.011     Min.   : 5039   Length:250        
##  1st Qu.:3.539     1st Qu.: 7497   Class :character  
##  Median :5.745     Median : 9923   Mode  :character  
##  Mean   :5.614     Mean   : 9859                     
##  3rd Qu.:8.113     3rd Qu.:12194                     
##  Max.   :9.954     Max.   :14917

9 TASK 8 - Automated Report Generation

Using functions and loops to create automatic summaries (similar to the concept of automating file generation or preparing HTML tags before they are displayed in R Markdown)

# Write one CSV per company and print a per-company summary table.
#
# df:            data frame with at least the columns company_id, salary and
#                KPI_score.
# output_dir:    directory for the CSV files ("Report_<company_id>.csv");
#                created if it does not exist.
# kpi_threshold: an employee counts as a top performer when KPI_score is
#                strictly above this value (default 90).
# Returns (invisibly) a data frame with one row per company and the columns
# company_id, total_employees, average_salary (rounded to 2 decimals) and
# top_performers. For input without any company an empty data frame with the
# same columns is returned.
# Stops with an error when a required column is missing.
automated_report <- function(df, output_dir = "company_reports",
                             kpi_threshold = 90) {
  required_cols <- c("company_id", "salary", "KPI_score")
  missing_cols <- setdiff(required_cols, names(df))
  if (length(missing_cols) > 0) {
    stop("`df` is missing required column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)

  # Work with character ids (also when company_id is a factor) and skip
  # missing ids, which cannot be matched with `==`
  companies <- as.character(unique(df$company_id))
  companies <- companies[!is.na(companies)]

  # Write the CSV for one company and return its one-row summary
  summarise_company <- function(comp) {
    # which() drops the NA comparisons that `==` yields for missing ids
    comp_data <- df[which(df$company_id == comp), , drop = FALSE]

    write.csv(
      comp_data,
      file = file.path(output_dir, paste0("Report_", comp, ".csv")),
      row.names = FALSE
    )

    data.frame(
      company_id = comp,
      total_employees = nrow(comp_data),
      average_salary = round(mean(comp_data$salary, na.rm = TRUE), 2),
      top_performers = sum(comp_data$KPI_score > kpi_threshold, na.rm = TRUE)
    )
  }

  if (length(companies) > 0) {
    # Bind all per-company rows once instead of growing the table in a loop
    summary_table <- do.call(rbind, lapply(companies, summarise_company))
  } else {
    summary_table <- data.frame(
      company_id = character(0),
      total_employees = integer(0),
      average_salary = numeric(0),
      top_performers = integer(0)
    )
  }

  cat("\nAUTOMATED COMPANY SUMMARY REPORT\n")
  cat("================================\n")
  # knitr is only needed for the formatted table; fall back to the plain
  # data-frame print when it is not installed
  if (requireNamespace("knitr", quietly = TRUE)) {
    print(knitr::kable(summary_table, align = "c"))
  } else {
    print(summary_table)
  }

  invisible(summary_table)
}

# Execute: write the per-company CSV files, print the summary and keep the
# returned summary table
summary_result <- automated_report(final_dashboard_data)

## 
## AUTOMATED COMPANY SUMMARY REPORT
## ================================
## 
## 
## | company_id | total_employees | average_salary | top_performers |
## |:----------:|:---------------:|:--------------:|:--------------:|
## | Company_1  |       50        |    9491.84     |       9        |
## | Company_2  |       50        |    9711.25     |       9        |
## | Company_3  |       50        |    9999.22     |       10       |
## | Company_4  |       50        |    10275.41    |       9        |
## | Company_5  |       50        |    9817.82     |       10       |

Summary Report:

Uniform Workforce Size: All five companies (Company_1 through Company_5) maintain an identical headcount of 50 employees each.
Highest Average Salary: Company_4 pays the highest average salary at 10,275.41.
Lowest Average Salary: Company_1 has the lowest average payroll cost per employee at 9,491.84.
Top Performance Leaders: Company_3 and Company_5 lead in terms of high-achieving talent, with both reporting 10 top performers.