# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 1. Data generation (NumPy)
# Seed the global legacy RNG so every run draws the same sample.
np.random.seed(42)
# 100 observations (rows) of 4 standard-normal variables (columns).
data = np.random.randn(100, 4)

# 2. Derived objects (NumPy + pandas)
# Labelled view of the same sample; columns are named A through D.
df = pd.DataFrame(data, columns=list('ABCD'))
# Per-column mean of the raw array (axis=0 reduces over the rows).
means = data.mean(axis=0)
# Per-column row position of the largest value.
max_index = data.argmax(axis=0)
# Pairwise correlation between the four columns.
correlation_matrix = df.corr()
# Number of missing entries in each column.
missing_values = df.isna().sum()
# Copy of the frame ordered by ascending values of column 'A'.
sorted_df = df.sort_values('A')

# 3. Console report (the order of the sections below is part of the output)
print("Mean of each column in numpy array:", means)
print("\nPandas descriptive statistics:\n", df.describe())
print("\nIndex of the max value in each column:", max_index)
print("\nPandas Correlation Matrix:\n", correlation_matrix)
print("\nPandas Missing Values in each column:\n", missing_values)
# Only the first five rows of the sorted frame are shown.
print("\nPandas DataFrame sorted by column 'A':\n", sorted_df.head())
# 4. Visualization with Matplotlib
# Figure 1: two side-by-side panels (line plot on the left, histogram on the right).
plt.figure(figsize=(10, 5))
# Left panel: columns 'A' and 'B' plotted against the DataFrame index.
plt.subplot(1, 2, 1)
plt.plot(df['A'], label='A')
plt.plot(df['B'], label='B')
plt.title('Line Plot for Columns A and B')
plt.legend()
# Right panel: histogram of column 'C' using 20 bins.
plt.subplot(1, 2, 2)
plt.hist(df['C'], bins=20, color='orange')
plt.title('Histogram of Column C')
# tight_layout() only adjusts the *current* figure, so it must be called here,
# before the next plt.figure() call makes the heatmap the current figure.
plt.tight_layout()
# Figure 2: correlation matrix rendered as a heatmap.
plt.figure(figsize=(5, 5))
plt.imshow(correlation_matrix, cmap='coolwarm', interpolation='none')
plt.colorbar(label='Correlation Coefficient')
plt.title('Correlation Heatmap of Columns')
# Replace the numeric tick positions with the column names on both axes.
plt.xticks(range(len(df.columns)), df.columns)
plt.yticks(range(len(df.columns)), df.columns)
plt.tight_layout()
# Show all plots
plt.show()
Mean of each column in numpy array: [-0.00981123 0.0337458 0.0224957 0.04376417]
Pandas descriptive statistics:
A B C D
count 100.000000 100.000000 100.000000 100.000000
mean -0.009811 0.033746 0.022496 0.043764
std 0.868065 0.952234 1.044014 0.982240
min -2.025143 -1.959670 -3.241267 -1.987569
25% -0.716089 -0.564362 -0.616727 -0.727600
50% -0.000248 -0.024646 0.068665 0.075219
75% 0.528231 0.547116 0.701519 0.778891
max 2.314659 3.852731 2.189803 2.720169
Index of the max value in each column: [55 52 94 44]
Pandas Correlation Matrix:
A B C D
A 1.000000 -0.016390 0.017745 -0.019994
B -0.016390 1.000000 -0.033668 0.136856
C 0.017745 -0.033668 1.000000 -0.038070
D -0.019994 0.136856 -0.038070 1.000000
Pandas Missing Values in each column:
A 0
B 0
C 0
D 0
dtype: int64
Pandas DataFrame sorted by column 'A':
A B C D
59 -2.025143 0.186454 -0.661786 0.852433
99 -1.713135 1.353872 -0.114540 1.237816
11 -1.478522 -0.719844 -0.460639 1.057122
25 -1.415371 -0.420645 -0.342715 -0.802277
51 -1.377669 -0.937825 0.515035 0.513786
png
png