Data science plays an important role in modern decision-making by transforming raw data into meaningful insights. This project focuses on the use of functions and loops to simulate real-world data scenarios, perform analysis, and create visualizations. The practicum aims to develop programming skills and demonstrate how automated workflows can support data-driven solutions.
To apply functions and loops in data science tasks
To simulate real-world datasets (sales & company data)
To use conditional logic and data transformation
To perform analysis and create visualizations
To build an automated data science workflow
import numpy as np
import matplotlib.pyplot as plt
def compute_formula(x, formula_type):
    """Evaluate one of the supported formulas at ``x``.

    Supported ``formula_type`` names and the expression each one returns:

    * ``"linear"``      -> ``2*x + 3``
    * ``"quadratic"``   -> ``x**2 + 2*x + 1``
    * ``"cubic"``       -> ``x**3 - x**2 + 2``
    * ``"exponential"`` -> ``np.exp(0.2*x)``

    Raises:
        ValueError: if ``formula_type`` is not one of the names above.
    """
    # Ordered (name, evaluator) pairs; only the matching evaluator is run.
    known_formulas = (
        ("linear", lambda v: 2*v + 3),
        ("quadratic", lambda v: v**2 + 2*v + 1),
        ("cubic", lambda v: v**3 - v**2 + 2),
        ("exponential", lambda v: np.exp(0.2*v)),
    )
    for name, evaluate in known_formulas:
        if formula_type == name:
            return evaluate(x)
    raise ValueError("Invalid formula type")
def plot_formulas():
    """Draw every supported formula on one shared figure.

    Each formula is evaluated by ``compute_formula`` over the integers
    1 through 20 and plotted as its own labelled line; the finished
    figure is displayed with ``plt.show()``.
    """
    sample_points = np.arange(1, 21)

    plt.figure()
    for kind in ("linear", "quadratic", "cubic", "exponential"):
        curve = compute_formula(sample_points, kind)
        plt.plot(sample_points, curve, label=kind)

    # Decorations are applied once, after all curves are on the axes.
    plt.title("Comparison of Multiple Mathematical Functions")
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.legend()
    plt.grid()
    plt.show()
# Render the formula comparison chart.
plot_formulas()
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
def _discount_rate(sales):
    """Return the discount rate that applies to one day's sales amount."""
    if sales > 700:
        return 0.2
    if sales > 400:
        return 0.1
    return 0.05


def simulate_sales(n_salesperson, days, seed=None):
    """Simulate daily sales with discount logic for several salespeople.

    For every salesperson and every day a sales amount is drawn uniformly
    from the integers 100..999. Each row also records the discount tier for
    that amount (0.2 above 700, 0.1 above 400, otherwise 0.05) and the
    salesperson's running total up to and including that day.

    Args:
        n_salesperson: number of salespeople; ids run from 0 upward.
        days: number of days simulated per salesperson; days run from 0.
        seed: optional seed. When given, a private
            ``np.random.RandomState(seed)`` is used so the result is
            reproducible and the global NumPy random state is untouched.
            When ``None`` (the default) values come from the global
            ``np.random`` state, as before.

    Returns:
        pandas.DataFrame with one row per (salesperson, day) and columns
        ``sales_id``, ``day``, ``sales_amount``, ``discount``,
        ``cumulative_sales``.
    """
    # Both the np.random module and RandomState expose the same randint().
    rng = np.random if seed is None else np.random.RandomState(seed)

    data = []
    for s in range(n_salesperson):
        cumulative = 0  # running total restarts for each salesperson
        for d in range(days):
            sales = rng.randint(100, 1000)  # upper bound is exclusive
            cumulative += sales
            data.append([s, d, sales, _discount_rate(sales), cumulative])

    return pd.DataFrame(data, columns=[
        "sales_id", "day", "sales_amount", "discount", "cumulative_sales"
    ])
def analyze_sales(df):
    """Print each salesperson's total sales and chart their running totals.

    ``df`` must provide the ``sales_id``, ``day``, ``sales_amount`` and
    ``cumulative_sales`` columns.
    """
    # Text summary: total sales_amount per sales_id.
    print(" Sales Summary:")
    totals_by_person = df.groupby("sales_id")["sales_amount"].sum()
    print(totals_by_person)

    # One line per salesperson showing cumulative sales over the days.
    sns.lineplot(x="day", y="cumulative_sales", hue="sales_id", data=df)
    plt.title("Cumulative Sales per Salesperson")
    plt.show()
# Simulate 30 days of sales for 5 salespeople, then summarise and plot them.
sales_df = simulate_sales(5, 30)
analyze_sales(sales_df)
Sales Summary:
sales_id
0 18195
1 17570
2 15307
3 15634
4 14961
Name: sales_amount, dtype: int64
def categorize_performance(sales):
    """Label every sales amount with a performance tier.

    Tiers are checked from the top down and every bound is strict:
    above 800 is "Excellent", above 600 "Very Good", above 400 "Good",
    above 200 "Average", and anything else "Poor".

    Returns:
        A list of tier labels in the same order as ``sales``.
    """
    tiers = (
        (800, "Excellent"),
        (600, "Very Good"),
        (400, "Good"),
        (200, "Average"),
    )

    def label_for(amount):
        # The first threshold the amount exceeds decides the label.
        for floor, label in tiers:
            if amount > floor:
                return label
        return "Poor"

    return [label_for(amount) for amount in sales]
def plot_performance(df):
    """Report and chart how often each performance category occurs.

    Prints each category's share of the rows of ``df`` as a percentage
    rounded to two decimals, then shows a bar chart and a pie chart of
    the category counts. ``df`` must have a ``category`` column.
    """
    category_counts = df["category"].value_counts()

    print("\n Category Percentage:")
    share_pct = category_counts / len(df) * 100
    print(share_pct.round(2))

    # Bar chart of the raw counts per category.
    category_counts.plot(kind="bar", title="Performance Distribution")
    plt.show()

    # Pie chart of the same counts, with a percentage label on each wedge.
    category_counts.plot(kind="pie", autopct='%1.1f%%')
    plt.title("Performance Share")
    plt.show()
# Label each row's sales_amount with a performance tier, then report and
# chart the distribution of tiers.
sales_df["category"] = categorize_performance(sales_df["sales_amount"])
plot_performance(sales_df)
Category Percentage:
category
Very Good 27.33
Good 21.33
Excellent 20.00
Average 17.33
Poor 14.00
Name: count, dtype: float64
def generate_company_data(n_company, n_employees, seed=None):
    """Generate random employee records for several companies.

    Every employee gets a salary drawn from the integers 3000..14999, a
    performance score and a KPI score each drawn from 50..99, and a
    department picked at random from HR, IT, Finance and Marketing.

    Args:
        n_company: number of companies; ids run from 0 upward.
        n_employees: number of employees per company. Employee ids restart
            at 0 inside every company.
        seed: optional seed. When given, a private
            ``np.random.RandomState(seed)`` is used so the result is
            reproducible and the global NumPy random state is untouched.
            When ``None`` (the default) values come from the global
            ``np.random`` state, as before.

    Returns:
        pandas.DataFrame with one row per employee and columns
        ``company_id``, ``employee_id``, ``salary``, ``department``,
        ``performance_score``, ``KPI_score``.
    """
    # Both the np.random module and RandomState expose randint()/choice().
    rng = np.random if seed is None else np.random.RandomState(seed)
    departments = ["HR", "IT", "Finance", "Marketing"]

    data = []
    for c in range(n_company):
        for e in range(n_employees):
            # Draw order is kept fixed so a given seed (or global state)
            # always produces the same records.
            salary = rng.randint(3000, 15000)
            performance = rng.randint(50, 100)
            kpi = rng.randint(50, 100)
            dept = rng.choice(departments)
            data.append([c, e, salary, dept, performance, kpi])

    return pd.DataFrame(data, columns=[
        "company_id", "employee_id", "salary",
        "department", "performance_score", "KPI_score"
    ])
def company_summary(df):
    """Print per-company aggregates of the employee data.

    For every ``company_id`` the printed table shows the mean ``salary``,
    the mean ``performance_score`` and the maximum ``KPI_score``.
    """
    aggregations = {
        "salary": "mean",
        "performance_score": "mean",
        "KPI_score": "max",
    }
    per_company = df.groupby("company_id")
    summary = per_company.agg(aggregations)

    print("\n Company Summary:")
    print(summary)
# Generate 5 companies with 100 employees each, then print the
# per-company summary table.
company_df = generate_company_data(5, 100)
company_summary(company_df)
Company Summary:
salary performance_score KPI_score
company_id
0 8544.81 72.81 99
1 8773.76 74.91 99
2 8626.85 75.82 99
3 8691.17 76.15 99
4 8755.02 75.70 99
def monte_carlo_pi(n_points):
    """Estimate pi by random sampling and plot the sampled points.

    Draws ``n_points`` random points with both coordinates in [0, 1).
    The fraction of points whose squared distance from the origin is at
    most 1, multiplied by 4, is printed as the estimate. The points are
    then shown in a scatter plot, split into "Inside" and "Outside".
    """
    xs = np.random.rand(n_points)
    ys = np.random.rand(n_points)

    # Boolean mask: True where the point lies within the unit quarter circle.
    in_circle = (xs**2 + ys**2) <= 1
    hits = np.sum(in_circle)
    pi_estimate = 4 * hits / n_points
    print(f"Estimated Pi: {pi_estimate}")

    out_of_circle = ~in_circle
    plt.scatter(xs[in_circle], ys[in_circle], s=1, label="Inside")
    plt.scatter(xs[out_of_circle], ys[out_of_circle], s=1, label="Outside")
    plt.legend()
    plt.title("Monte Carlo Simulation of Pi")
    plt.show()
# Run the pi estimate with 10,000 random points.
monte_carlo_pi(10000)
Estimated Pi: 3.1164
def add_features(df):
    """Add binned performance and salary columns to ``df``.

    Two categorical columns are written onto ``df`` itself (the frame is
    modified in place) and the same frame is returned:

    * ``performance_category`` bins ``performance_score`` into
      [0, 60] "Poor", (60, 70] "Average", (70, 80] "Good",
      (80, 90] "Very Good", (90, 100] "Excellent".
    * ``salary_bracket`` bins ``salary`` into
      [0, 5000] "Low", (5000, 8000] "Medium", (8000, 12000] "High",
      (12000, 20000] "Very High".

    Values outside the outermost edges become NaN.

    Args:
        df: DataFrame with ``performance_score`` and ``salary`` columns.

    Returns:
        The same DataFrame object, with the two new columns.
    """
    # Performance category (derived from performance_score, not KPI_score).
    # include_lowest=True closes the first bin on the left so that a value of
    # exactly 0 is labelled instead of silently becoming NaN.
    df["performance_category"] = pd.cut(
        df["performance_score"],
        bins=[0, 60, 70, 80, 90, 100],
        labels=["Poor", "Average", "Good", "Very Good", "Excellent"],
        include_lowest=True,
    )
    # Salary bracket.
    df["salary_bracket"] = pd.cut(
        df["salary"],
        bins=[0, 5000, 8000, 12000, 20000],
        labels=["Low", "Medium", "High", "Very High"],
        include_lowest=True,
    )
    return df
# Attach the performance_category and salary_bracket columns to the company data.
company_df = add_features(company_df)
*Visualization*
# Box plot of KPI_score for each salary bracket; relies on the
# salary_bracket column written by add_features.
sns.boxplot(data=company_df, x="salary_bracket", y="KPI_score")
plt.title("KPI by Salary Bracket")
plt.show()
def mini_dashboard(df):
    """Print a headline KPI count and show three summary charts.

    1. Prints how many rows have ``KPI_score`` above 90.
    2. Scatter plot with a fitted regression line of ``KPI_score``
       against ``salary``.
    3. Bar plot of ``KPI_score`` by ``department``.
    4. Histogram of ``salary`` with a KDE curve overlaid.

    ``df`` must have ``KPI_score``, ``salary`` and ``department`` columns.
    """
    # Headline number: rows whose KPI score is strictly above 90.
    high_kpi = df["KPI_score"] > 90
    top_rows = df[high_kpi]
    print("\n Top Performers Count:", len(top_rows))

    # Salary against KPI, with a regression fit.
    sns.regplot(x="salary", y="KPI_score", data=df)
    plt.title("Salary vs KPI Relationship")
    plt.show()

    # KPI compared across departments.
    sns.barplot(x="department", y="KPI_score", data=df)
    plt.title("Average KPI per Department")
    plt.show()

    # How salaries are distributed.
    salaries = df["salary"]
    sns.histplot(salaries, kde=True)
    plt.title("Salary Distribution")
    plt.show()
# Show the dashboard for the generated company data.
mini_dashboard(company_df)
Top Performers Count: 76
This practicum shows how functions, loops, and data science techniques can be used to process data efficiently. The project demonstrates data simulation, analysis, and visualization to generate useful insights. Overall, it helps build programming skills and understanding of real-world data science workflows.