Final-Project

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 1. NumPy: synthetic data
# Seed NumPy's legacy global RNG so every run draws the same sample.
np.random.seed(42)
# 100 rows x 4 columns of draws from the standard normal distribution.
data = np.random.standard_normal(size=(100, 4))

# 2. Pandas: tabular view of the same data
# Wrap the array in a DataFrame whose four columns are labelled 'A' to 'D'.
df = pd.DataFrame(data, columns=list('ABCD'))

# Function 1 (NumPy): mean of every column, i.e. a reduction over the row axis
means = data.mean(axis=0)
print("Mean of each column in numpy array:", means)

# Function 2 (Pandas): per-column descriptive statistics of the DataFrame
print("\nPandas descriptive statistics:\n", df.describe())

# Function 3 (NumPy): for each column, the row index holding its largest value
max_index = data.argmax(axis=0)
print("\nIndex of the max value in each column:", max_index)

# 3. Pandas Operations (continued)
# Function 1 (Pandas): pairwise Pearson correlation between the columns
correlation_matrix = df.corr(method='pearson')
print("\nPandas Correlation Matrix:\n", correlation_matrix)

# Function 2 (Pandas): number of missing (NaN) entries in each column
missing_values = df.isna().sum()
print("\nPandas Missing Values in each column:\n", missing_values)

# Function 3 (Pandas): rows ordered by ascending 'A'; only the first five are printed
sorted_df = df.sort_values('A')
print("\nPandas DataFrame sorted by column 'A':\n", sorted_df.head(5))

# 4. Visualization with Matplotlib
# Two figures are drawn: a two-panel overview and a correlation heatmap.
# Each figure gets its own tight_layout() call, because a layout call only
# adjusts the figure it is invoked on (a single trailing plt.tight_layout()
# would tidy the heatmap figure only and leave the overview untouched).
overview_fig, (line_ax, hist_ax) = plt.subplots(1, 2, figsize=(10, 5))

# Function 1 (Matplotlib): line plot of columns A and B against the row index
line_ax.plot(df['A'], label='A')
line_ax.plot(df['B'], label='B')
line_ax.set_title('Line Plot for Columns A and B')
line_ax.legend()

# Function 2 (Matplotlib): histogram of column 'C'
hist_ax.hist(df['C'], bins=20, color='orange')
hist_ax.set_title('Histogram of Column C')
overview_fig.tight_layout()

# Function 3 (Matplotlib): correlation heatmap in a separate figure
heatmap_fig, heatmap_ax = plt.subplots(figsize=(5, 5))
heatmap = heatmap_ax.imshow(correlation_matrix, cmap='coolwarm', interpolation='none')
heatmap_fig.colorbar(heatmap, ax=heatmap_ax, label='Correlation Coefficient')
heatmap_ax.set_title('Correlation Heatmap of Columns')
# One tick per column, labelled with the column name, on both axes.
tick_positions = range(len(df.columns))
heatmap_ax.set_xticks(tick_positions)
heatmap_ax.set_xticklabels(df.columns)
heatmap_ax.set_yticks(tick_positions)
heatmap_ax.set_yticklabels(df.columns)
heatmap_fig.tight_layout()

# Show all plots
plt.show()

Mean of each column in numpy array: [-0.00981123  0.0337458   0.0224957   0.04376417]

Pandas descriptive statistics:
                 A           B           C           D
count  100.000000  100.000000  100.000000  100.000000
mean    -0.009811    0.033746    0.022496    0.043764
std      0.868065    0.952234    1.044014    0.982240
min     -2.025143   -1.959670   -3.241267   -1.987569
25%     -0.716089   -0.564362   -0.616727   -0.727600
50%     -0.000248   -0.024646    0.068665    0.075219
75%      0.528231    0.547116    0.701519    0.778891
max      2.314659    3.852731    2.189803    2.720169

Index of the max value in each column: [55 52 94 44]

Pandas Correlation Matrix:
           A         B         C         D
A  1.000000 -0.016390  0.017745 -0.019994
B -0.016390  1.000000 -0.033668  0.136856
C  0.017745 -0.033668  1.000000 -0.038070
D -0.019994  0.136856 -0.038070  1.000000

Pandas Missing Values in each column:
 A    0
B    0
C    0
D    0
dtype: int64

Pandas DataFrame sorted by column 'A':
            A         B         C         D
59 -2.025143  0.186454 -0.661786  0.852433
99 -1.713135  1.353872 -0.114540  1.237816
11 -1.478522 -0.719844 -0.460639  1.057122
25 -1.415371 -0.420645 -0.342715 -0.802277
51 -1.377669 -0.937825  0.515035  0.513786

png

png