import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
file_path = '/home/jmcphaul/qm9.csv'
data = pd.read_csv(file_path)
print(data.head())
features = data.drop(columns=['target'])
target = data['target']
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)
input_layer = Input(shape=(X_train.shape[1],))
hidden_layer = Dense(64, activation='relu')(input_layer)
output_layer = Dense(1)(hidden_layer)
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
history = model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
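If the validation curve starts rising while the training loss keeps falling, an early-stopping callback is one way to end the run automatically. This is a minimal optional sketch (not part of the original run) using the standard Keras EarlyStopping callback with the same model:

# Optional sketch: stop training once val_loss stops improving (not in the original run).
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32, callbacks=[early_stop])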
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
file_path = 'qm9.csv'
data = pd.read_csv(file_path)
print(data.head())
print(data.info())
print(data.describe())
print(data.isnull().sum())
features = data.drop(columns=['mol_id', 'smiles', 'gap'])
target = data['gap']
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)
print(f"Train Features: {X_train.shape}, Train Target: {y_train.shape}")
print(f"Test Features: {X_test.shape}, Test Target: {y_test.shape}")
input_layer = Input(shape=(X_train.shape[1],))
hidden_layer = Dense(64, activation='relu')(input_layer)
output_layer = Dense(1)(hidden_layer)
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
history = model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
mae = tf.keras.metrics.mean_absolute_error(y_test, y_pred.ravel()).numpy()  # flatten predictions so shapes align
print(f'Mean Absolute Error: {mae}')
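A predicted-vs-actual scatter plot makes the MSE/MAE numbers easier to interpret. A minimal sketch using the variables already defined above (y_test, y_pred):

# Parity plot sketch: predicted vs. true gap values.
plt.scatter(y_test, y_pred.ravel(), s=5, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')  # ideal y = x line
plt.xlabel('True gap')
plt.ylabel('Predicted gap')
plt.title('Predicted vs. actual')
plt.show()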
data = data.select_dtypes(include=[np.number])
data_scaled = scaler.fit_transform(data)
X_train, X_test = train_test_split(data_scaled, test_size=0.2, random_state=42)
input_dim = X_train.shape[1]
encoding_dim = 32  # This can be adjusted
input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='sigmoid')(encoded)
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mse')
history = autoencoder.fit(X_train, X_train, epochs=50, batch_size=256, shuffle=True, validation_data=(X_test, X_test), verbose=1)
X_train_pred = autoencoder.predict(X_train)
train_loss = np.mean(np.square(X_train - X_train_pred), axis=1)
X_test_pred = autoencoder.predict(X_test)
test_loss = np.mean(np.square(X_test - X_test_pred), axis=1)
threshold = np.percentile(train_loss, 95) # 95th percentile
anomalies = test_loss > threshold
print(f'Number of anomalies detected: {np.sum(anomalies)}')
plt.figure(figsize=(10, 6))
plt.hist(test_loss, bins=50)
plt.axvline(threshold, color='r', linestyle='dashed', linewidth=2)
plt.xlabel('Reconstruction loss')
plt.ylabel('Number of samples')
plt.title('Reconstruction Loss for Test Data')
plt.show()
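To see which test rows were flagged, the boolean mask can be turned back into indices. A minimal sketch, assuming X_test keeps the row order produced by train_test_split above:

# Indices (within X_test) of samples whose reconstruction loss exceeds the threshold (sketch).
anomaly_idx = np.where(anomalies)[0]
print(f'First few anomalous test rows: {anomaly_idx[:10]}')
print(f'Their reconstruction losses: {test_loss[anomaly_idx[:10]]}')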
def dynamic_encoding_dim(data_point):
    # Example: adjust the encoding dimension based on the mean of the data point
    mean_val = np.mean(data_point)
    if mean_val < -1:
        return 16
    elif mean_val < 0:
        return 32
    else:
        return 64
def create_autoencoder(input_dim, encoding_dim):
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(encoding_dim, activation='relu')(input_layer)
    decoded = Dense(input_dim, activation='sigmoid')(encoded)
    autoencoder = Model(input_layer, decoded)
    autoencoder.compile(optimizer='adam', loss='mse')
    return autoencoder
for epoch in range(50):  # number of epochs
    for batch_start in range(0, X_train.shape[0], 256):  # batch size 256
        batch_end = min(batch_start + 256, X_train.shape[0])
        X_batch = X_train[batch_start:batch_end]
        encoding_dim = dynamic_encoding_dim(np.mean(X_batch, axis=0))
        autoencoder = create_autoencoder(input_dim, encoding_dim)  # note: builds a fresh, untrained model for every batch
        autoencoder.fit(X_batch, X_batch, epochs=1, verbose=0)
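Because the loop above creates a new autoencoder for every batch, previously learned weights are discarded each time. One alternative sketch (my assumption about the intent, not the original approach) is to build one autoencoder per candidate encoding dimension up front and reuse it whenever a batch maps to that dimension:

# Alternative sketch: keep one autoencoder per encoding dimension so weights persist across batches.
autoencoders = {dim: create_autoencoder(input_dim, dim) for dim in (16, 32, 64)}
for epoch in range(50):
    for batch_start in range(0, X_train.shape[0], 256):
        X_batch = X_train[batch_start:batch_start + 256]
        dim = dynamic_encoding_dim(np.mean(X_batch, axis=0))
        autoencoders[dim].fit(X_batch, X_batch, epochs=1, verbose=0)  # reuses the model for this dimension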
$ srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --pty bash
$ module load conda gcc
$ module load cuda
$ module load cudnn
$ conda create --prefix ~/tensorflow_2.9 python=3.8 pip -y
$ conda update -n base -c conda-forge conda
$ conda activate /users/jmcphaul/tensorflow_2.9
$ source activate /users/jmcphaul/tensorflow_2.9/
$ pip install tensorflow==2.9.1
$ pip install ipykernel
$ python3 -m ipykernel install --user --name tensorflow_2.9 --display-name TensorflowGPU29
$ python
>>> import tensorflow as tf
>>> tf.config.list_physical_devices('GPU')
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
$ conda create --prefix ~/pytorch_1.13 python=3.8 pip -y
$ conda activate /users/jmcphaul/pytorch_1.13
$ source activate /users/jmcphaul/pytorch_1.13
$ source activate ~/pytorch_1.13
$ conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 pytorch-cuda=11.6 -c pytorch -c nvidia -y
$ python
>>> import torch
>>> torch.cuda.is_available()
True
>>> torch.cuda.device_count()
1
$ enroot import docker://ubuntu $ enroot create ubuntu.sqsh $ enroot start ubuntu
#Type ls to see the content of container: # ls
bin dev home lib32 libx32 mnt proc run srv tmp usr boot etc lib lib64 media opt root sbin sys users var
enroot import docker://godlovedc/lolcow
enroot create godlovedc+lolcow.sqsh
enroot start godlovedc+lolcow
conda activate /users/jmcphaul/tensorflow_2.9
source activate /users/jmcphaul/tensorflow_2.9/
cd $WORK/sqsh
enroot import docker://nvcr.io#nvidia/tensorflow:22.12-tf2-py3
enroot import docker://nvcr.io/nvidia/tensorflow:24.06-tf2-py3   (returned unauthorized, so I used the 22.12 image above)
enroot create nvidia+tensorflow+22.12-tf2-py3.sqsh exit
srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+tensorflow+22.12-tf2-py3.sqsh --container-mounts=$WORK --pty $SHELL
srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+tensorflow+22.12-tf2-py3.sqsh --container-mounts=$HOME --pty bash
conda activate /users/jmcphaul/tensorflow_2.9
tf.config.list_physical_devices('GPU')
PAY ATTENTION HERE:
nano submit_job.sh
#!/bin/bash
#SBATCH -J Testing
#SBATCH -o output-%j.txt
#SBATCH -e error-%j.txt
#SBATCH -p batch -c 12 --mem=20G --gres=gpu:1
#SBATCH -t 1440
#SBATCH -D /link-to-your-folder/
srun --container-image=/work/users/tuev/sqsh/nvidia+tensorflow+22.12-tf2-py3.sqsh --container-mounts=$WORK python testing.py
chmod +x submit_job.sh
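testing.py is not shown in these notes; a minimal hypothetical placeholder that simply confirms the container sees the GPU could look like this (an assumption, not the actual script):

# testing.py (hypothetical placeholder): confirm TensorFlow inside the container can see the GPU.
import tensorflow as tf
print('TensorFlow version:', tf.__version__)
print('GPUs visible:', tf.config.list_physical_devices('GPU'))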
PAY ATTENTION - USING CONTAINER IN JUPYTER: after requesting a node with the container, run the Jupyter command with the additional flag --allow-root
/workspace# jupyter lab --allow-root --no-browser --ip=0.0.0.0
http://hostname:8888/?token=86f3c661ba3637af574f976a80d0d296c28208124d838e05
**MODIFY THIS** (change hostname and open in firefox)
http://bcm-dgxa100-0002:8888/?token=86f3c661ba3637af574f976a80d0d296c28208124d838e05
once in Firefox, use the default python3 kernel:
import tensorflow as tf
tf.config.list_physical_devices('GPU')
THEN IN TERMINAL:
ln -s $WORK work
bash: /hpc/sys/apps/lmod/lmod/libexec/lmod: No such file or directory
(base) jmcphaul@bcm-dgxa100-0002:/workspace$ ln -s $WORK work
(base) jmcphaul@bcm-dgxa100-0002:/workspace$
exit
$ srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --pty $SHELL
$ module load conda gcc
$ module load cuda
$ module load cudnn
$ conda activate /users/jmcphaul/tensorflow_2.9
$ pip install jupyterlab
$ jupyter lab --ip=0.0.0.0 --no-browser
go to browser:
use TensorflowGPU29
import tensorflow as tf
tf.config.list_physical_devices('GPU')
--- https://www.tensorflow.org/install/pip ---
in terminal:
nvidia-smi
pip install --upgrade pip
# For GPU users
pip install tensorflow[and-cuda]
python3 -c "import tensorflow as tf; print(tf.reduce_sum(tf.random.normal([1000, 1000])))"
rerun notebook command
back to terminal:
pip install jupyterlab
jupyter lab --ip=0.0.0.0 --no-browser --allow-root
from terminal:
cd $WORK
wget https://raw.githubusercontent.com/SouthernMethodistUniversity/SMU_SuperPOD_101/e6315c29ca0542351b79233729708dfa16161cdf/files/model_CNN_CIFAR10.py
scontrol show partition
-or-
sinfo
nano submit_job.sh
#!/bin/bash
#SBATCH -J CNN_CIFAR10_SPOD # job name to display in squeue
#SBATCH -t 60 # maximum runtime in minutes
#SBATCH -c 2 # request 2 CPUs
#SBATCH -G 1 # request 1 GPU A100
#SBATCH -p batch # specify the correct partition name
#SBATCH -D /work/users/tuev # link to your folder
#SBATCH --mem=32gb                 # request 32GB memory
#SBATCH --mail-user tuev@smu.edu   # request to email your email ID
#SBATCH --mail-type=end            # request to email when the job ends
module load conda gcc
module load cuda cudnn
conda activate ~/tensorflow_2.9
python model_CNN_CIFAR10.py
sbatch submit_job.sh
nano submit_job1.sh
#!/bin/bash
#SBATCH -J CNN_CIFAR10_SPOD        # job name to display in squeue
#SBATCH -t 60                      # maximum runtime in minutes
#SBATCH -c 2                       # request 2 CPUs
#SBATCH -G 1                       # request 1 A100 GPU
#SBATCH -p short                   # request queue name workshop (optional)
#SBATCH -D /work/users/tuev        # link to your folder
#SBATCH --mem=32gb                 # request 32GB memory
#SBATCH --mail-user tuev@smu.edu   # request to email your email ID
#SBATCH --mail-type=end            # request to email when the job ends
module load conda gcc
module load cuda cudnn
conda activate ~/tensorflow_2.9
python model_CNN_CIFAR10.py
nano submit_job.sh
#!/bin/bash
#SBATCH -J example_job
#SBATCH -t 60
#SBATCH -c 2
#SBATCH --gres=gpu:1
#SBATCH -p batch
#SBATCH --mem=32gb
module load conda gcc
module load cuda cudnn
conda activate ~/tensorflow_2.9
python model_CNN_CIFAR10.py
ON SCREEN RIGHT AFTER SSH (LOGIN NODE, NO NODE REQUEST):
sbatch submit_job.sh
nano modelCNN.sh
#!/bin/bash
#SBATCH -J CNN_CIFAR10_SPOD        # job name to display in squeue
#SBATCH -t 60                      # maximum runtime in minutes
#SBATCH -c 2                       # request 2 CPUs
#SBATCH -G 1                       # request 1 A100 GPU
#SBATCH -p batch                   # request queue name workshop (optional)
#SBATCH -D /work/users/tuev        # link to your folder
#SBATCH --mem=32gb                 # request 32GB memory
#SBATCH --mail-user tuev@smu.edu   # request to email your email ID
#SBATCH --mail-type=end            # request to email when the model ends
module load conda gcc
module load cuda cudnn
conda activate ~/tensorflow_2.9
python model_CNN_CIFAR10.py
sbatch modelCNN.sh
nano modelCNN_ngc.sh
#!/bin/bash
#SBATCH -J CNN_CIFAR10_SPOD        # job name to display in squeue
#SBATCH -t 60                      # maximum runtime in minutes
#SBATCH -c 2                       # request 2 CPUs
#SBATCH -G 1                       # request 1 A100 GPU
#SBATCH -p batch                   # request queue name workshop (optional)
#SBATCH --mem=32gb                 # request 32GB memory
#SBATCH --mail-user tuev@smu.edu   # request to email your email ID
#SBATCH --mail-type=end            # request to email when the model ends
srun --container-image=/work/users/tuev/sqsh/nvidia+tensorflow+22.12-tf2-py3.sqsh --container-mounts=$WORK python $WORK/model_CNN_CIFAR10.py
sbatch modelCNN_ngc.sh
CURRENTLY ON LESSON 5. 7 more after this.
USEFUL COMMANDS:
Here are some basic SLURM commands for submitting, querying and deleting jobs on the SuperPOD:

srun -N1 -G1 --pty $SHELL : submit an interactive job (reserves 1 node, 1 GPU, 1 CPU, 6 GB RAM, 1 hour walltime)
sbatch job.sh : submit the job script job.sh
sstat <job id> : check the status of the job with the given job ID
sstat <job id> --format=AveCPU,AvePages,AveRSS,AveVMSize,JobID : narrow the information shown by sstat
squeue -u <username> : check the status of all jobs submitted by the given username
scontrol show job <job id> : check detailed information for the job with the given job ID
scancel <job id> : delete the queued or running job with the given job ID

Check pending and running jobs:
$ squeue -u $USERNAME
JOBID PARTITION NAME USER ST TIME  NODES NODELIST(REASON)
12345 workshop  bash tuev R  39:46 1     bcm-dgxa100-0002
The job above has JOBID=12345, which is used below.

Check the configuration of any requested job using the JOBID:
$ scontrol show job 12345 | grep ReqTRES
ReqTRES=cpu=5,mem=30G,node=1,billing=5,gres/gpu=1

Delete any job:
$ scancel 12345

Checking how your job is running on a node: once you know your working node (for example bcm-dgxa100-0001), you can log in to the compute node from the login node and check the processes.
Command to check working CPUs:
$ ssh bcm-dgxa100-0001
$ top -u $USERNAME
Command to check working GPUs:
$ ssh bcm-dgxa100-0001
$ nvidia-smi
OR, to refresh the output every 0.2 s:
$ watch -n .2 nvidia-smi
squeue -u jmcphaul
scontrol show job 12345 | grep ReqTRES
SET USERNAME:
export USERNAME=jmcphaul
then: squeue -u $USERNAME
top -u jmcphaul
export USERNAME=jmcphaul
top -u $USERNAME
------------- https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/06-RAPIDS/index.html -------------
srun -n1 --gres=gpu:1 -c2 --mem=4gb --time=12:00:00 --pty $SHELL
module load conda
conda create -n rapids-23.02 -c rapidsai -c conda-forge -c nvidia rapids=23.02 python=3.10 cudatoolkit=11.8
conda create -p ~/my_conda_env mamba -c conda-forge
source activate ~/my_conda_env
mamba create -n rapids-23.02 -c rapidsai -c conda-forge -c nvidia rapids=23.02 python=3.10 cudatoolkit=11.8
TROUBLESHOOTING:
top (or htop)
CONDA_VERBOSE=1 conda create -n rapids-23.02 -c rapidsai -c conda-forge -c nvidia rapids=23.02 python=3.10 cudatoolkit=11.8
conda config --remove-key channels
conda config --add channels defaults
conda config --add channels conda-forge
conda config --add channels nvidia
conda config --add channels rapidsai
df -h
conda install mamba -n base -c conda-forge
mamba create -n rapids-23.02 -c rapidsai -c conda-forge -c nvidia rapids=23.02 python=3.10 cudatoolkit=11.8
export http_proxy=http://your-proxy:port
export https_proxy=https://your-proxy:port
mamba create -n rapids-23.02 -c rapidsai -c conda-forge -c nvidia rapids=23.02 python=3.10 cudatoolkit=11.8
CONDA_VERBOSE=1 conda create -n rapids-23.02 -c rapidsai -c conda-forge -c nvidia rapids=23.02 python=3.10 cudatoolkit=11.8
ping conda.anaconda.org
nslookup conda.anaconda.org
conda search python -c conda-forge
CONTINUED TROUBLESHOOTING:
mamba clean --index-cache --force-pkgs-dirs
conda clean --all
mamba update mamba
conda update mamba
mamba update --all
conda update conda
____________YAML________________________________
nano environment2.yaml
name: rapids23.02
channels:
  - https://conda.anaconda.org/rapidsai
  - https://conda.anaconda.org/conda-forge
  - https://conda.anaconda.org/nvidia
  - defaults
dependencies:
  - rapids=23.02
  - python=3.10
  - cudatoolkit=11.8
name: rapids-23.02
channels:
- rapidsai
- conda-forge
- nvidia
- defaults
dependencies:
- rapids=23.02
- python=3.10
- cudatoolkit=11.8
conda config --set channel_priority flexible
mamba env create -f environment2.yaml
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
# run installer
bash Miniconda3-latest-Linux-x86_64.sh
# Initialize
~/miniconda3/bin/conda init
# Restart terminal
source ~/.bashrc   # or ~/.zshrc if using zsh
conda init
mamba init
conda config --set auto_activate_base false
# change mind:
conda init --reverse $SHELL
conda activate rapids-23.02
conda init
source ~/.bashrc   # or ~/.zshrc
conda env list
conda install
conda remove numpy scipy pandas                 # remove packages
conda remove --all                              # remove all packages / remove the env
conda remove -n my_env numpy                    # remove a package from a named env
conda remove --force numpy                      # force removal
mamba config --set channel_priority flexible    # relax strict channel priority
export HTTPS_PROXY=http://yourproxy:port
export HTTP_PROXY=http://yourproxy:port
unset HTTPS_PROXY
unset HTTP_PROXY
$ enroot import docker://nvcr.io#nvidia/rapidsai/rapidsai:cuda11.2-runtime-centos7-py3.10
$ enroot create nvidia+rapidsai+rapidsai+cuda11.2-runtime-centos7-py3.10.sqsh
srun -N1 -G1 -c10 --mem=6G --time=12:00:00 --container-image $WORK/sqsh/nvidia+rapidsai+rapidsai+cuda11.2-runtime-centos7-py3.10.sqsh --container-mounts=$WORK --pty $SHELL
srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+rapidsai+rapidsai+cuda11.2-runtime-centos7-py3.10.sqsh --container-mounts=$WORK --pty $SHELL
source ~/.bashrc
conda config --set auto_activate_base true
conda config --set auto_activate_base false
remake the nano file:
nano environment2.yaml
name: rapids-23.02
channels:
- rapidsai
- conda-forge
- nvidia
- defaults
dependencies:
- rapids=23.02
- python=3.10
- cudatoolkit=11.8
conda config --set channel_priority flexible
mamba config --set channel_priority flexible
mamba env create -f environment2.yaml
dmesg | grep -i "killed"
rm -rf /users/jmcphaul/my_conda_envs/rapids23.02
chmod 600 myswapfile
mkswap myswapfile
swapon myswapfile
source activate ~/my_conda_env
conda install mamba -c conda-forge
mamba --version
-or-
wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh
conda config --set channel_priority flexible
mamba config --set channel_priority flexible
source ~/mambaforge/bin/activate
condayes
conda config --set auto_activate_base true
mamba config --set auto_activate_base true
conda activate rapids23.02
conda remove jupyter_server
conda clean --all
y
mamba install jupyter_server
mamba install jupyterlab
environment location: /users/jmcphaul/my_conda_envs/rapids23.02
conda activate base
conda create -n conda_update python=3.10
conda activate conda_update
conda install -c conda-forge conda
conda activate rapids23.02
conda activate conda_update
conda update conda
conda update -n rapids23.02 --all -c conda-forge
conda activate rapids23.02
and :
alias conda-deactivate-all='while [[ "$CONDA_SHLVL" -gt 0 ]]; do conda deactivate; done'
conda config --set auto_activate_base false
conda deactivate
source ~/.bashrc
lastly: conda-deactivate-all
rm readline-8.1.tar.gz
enroot import docker://nvcr.io#nvidia/nemo:22.09
enroot create nvidia+nemo+22.09.sqsh
enroot import docker://nvcr.io#nvidia/nemo_bert_text_classification:20.07
enroot create nvidia+nemo_bert_text_classification+20.07.sqsh
cd $WORK
mkdir nemo && cd nemo
curl -s -O https://dl.fbaipublicfiles.com/glue/data/SST-2.zip \
  && unzip -o SST-2.zip -d ./ \
  && sed 1d ./SST-2/train.tsv > ./train_nemo_format.tsv \
  && sed 1d ./SST-2/dev.tsv > ./dev_nemo_format.tsv
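Before launching the container, it can help to confirm the header-stripped TSVs look like the expected sentence/label format. A quick hedged check with pandas, assuming it is run from $WORK/nemo where the files were written (the column names here are just labels assigned for the check):

# Sanity-check the converted SST-2 files (sketch).
import pandas as pd
df = pd.read_csv('train_nemo_format.tsv', sep='\t', header=None, names=['sentence', 'label'], quoting=3)  # quoting=3 == csv.QUOTE_NONE
print(df.head())
print('Rows:', len(df), '| label values:', df['label'].unique())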
srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+nemo+22.09.sqsh --container-mounts=$WORK --pty bash -i
cd $WORK/nemo/SST-2
python /workspace/nemo/examples/nlp/text_classification/text_classification_with_bert.py \
    model.dataset.num_classes=2 \
    model.dataset.max_seq_length=256 \
    model.train_ds.batch_size=64 \
    model.validation_ds.batch_size=64 \
    model.language_model.pretrained_model_name='bert-base-cased' \
    model.train_ds.file_path=train_nemo_format.tsv \
    model.validation_ds.file_path=dev_nemo_format.tsv \
    trainer.num_nodes=1 \
    trainer.max_epochs=20 \
    trainer.precision=16 \
    model.optim.name=adam \
    model.optim.lr=1e-4
Check the GPU usage with the nvidia-smi command.
find / -name text_classification_with_bert.py 2>/dev/null
cd /work/users/jmcphaul/nemo
git clone https://github.com/NVIDIA/NeMo.git
cd NeMo/examples/nlp/text_classification
conda activate rapids23.02
pip install pytorch_lightning
-or-
pip install -r /work/users/jmcphaul/nemo/NeMo/requirements.txt (Install package and Requirements)
python -c "import pytorch_lightning as pl; print(pl.__version__)"
python text_classification_with_bert.py \
    model.dataset.num_classes=2 \
    model.dataset.max_seq_length=256 \
    model.train_ds.batch_size=64 \
    model.validation_ds.batch_size=64 \
    model.language_model.pretrained_model_name='bert-base-cased' \
    model.train_ds.file_path=/work/users/jmcphaul/nemo/train_nemo_format.tsv \
    model.validation_ds.file_path=/work/users/jmcphaul/nemo/dev_nemo_format.tsv \
    trainer.num_nodes=1 \
    trainer.max_epochs=20 \
    trainer.precision=16 \
    model.optim.name=adam \
    model.optim.lr=1e-4
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
source $HOME/.cargo/env
rustc --version
pip install --upgrade pip setuptools
pip freeze > pip_packages.txt
nano pip_packages.txt
cat pip_packages.txt | xargs pip uninstall -y
mamba install -c conda-forge transformers==4.9.2 tokenizers==0.10.3 pytorch-lightning omegaconf cython pandas numpy scikit-learn scipy matplotlib jupyterlab
conda activate /users/jmcphaul/my_conda_envs/rapids23.02
sed -i '/@/d' pip_packages.txt
cat pip_packages.txt | xargs pip uninstall -y
mamba install -c conda-forge transformers tokenizers pytorch-lightning omegaconf
mamba install -c conda-forge cython rust
mamba install -c conda-forge cython
conda activate /users/jmcphaul/my_conda_envs/rapids23.02
mamba install -c conda-forge cython rust
mamba install -c conda-forge cython rust
mamba install -c conda-forge setuptools packaging
sed -i '/@/d' pip_packages.txt
# Step 1: Clean the pip_packages.txt file
sed -i '/@/d' pip_packages.txt
cat pip_packages.txt | xargs pip uninstall -y
cat pip_packages.txt | xargs pip uninstall -y
mamba install -c conda-forge transformers tokenizers pytorch-lightning omegaconf
mamba install -c conda-forge numpy
mamba install -c nvidia -c conda-forge nemo_toolkit[all]
python -c "import nemo; print(nemo.__version__)"
*** NANO *** Steps to select and copy all text in nano:
Open the file in nano: nano pip_packages.txt
Navigate to the start of the file: press Ctrl + _ (underscore), then Ctrl + A, to move the cursor to the start of the file.
Start marking text: press Ctrl + ^ (Ctrl and the caret key) to start marking text. This sets the starting point of the selection.
Move to the end of the file: press Ctrl + _ (underscore), then Ctrl + E, to move the cursor to the end of the file. This highlights all the text from the start to the end of the file.
Copy the selected text: press Alt + 6 (Option + 6 on Mac) to copy the selected text to the clipboard.
Close the file: press Ctrl + X to exit nano.
git clone https://github.com/NVIDIA/NeMo.git
mamba uninstall pytorch
conda list | grep cudatoolkit
mamba list | grep cudatoolkit
srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+nemo+22.09.sqsh --container-mounts=$WORK --pty bash -i
srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+rapidsai+rapidsai+cuda11.2-runtime-centos7-py3.10.sqsh --container-mounts=$WORK --pty $SHELL
*** NOTE NEXT TIME FROM HERE: ***
Success. Logging you in...
Success. Logging you in...
Last login: Tue Jul 16 23:18:07 2024 from 129.119.70.150
jmcphaul@slogin-02:~$ srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+nemo+22.09.sqsh --container-mounts=$WORK --pty bash -i
bash: /hpc/sys/apps/lmod/lmod/libexec/lmod: No such file or directory
jmcphaul@bcm-dgxa100-0003:/workspace/nemo$ $ srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+rapidsai+rapidsai+cuda11.2-runtime-centos7-py3.10.sqsh --container-mounts=$WORK --pty $SHELL
bash: $: command not found
jmcphaul@bcm-dgxa100-0003:/workspace/nemo$ srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+rapidsai+rapidsai+cuda11.2-runtime-centos7-py3.10.sqsh --container-mounts=$WORK --pty $SHELL
bash: srun: command not found
jmcphaul@bcm-dgxa100-0003:/workspace/nemo$ ls
examples  scripts  start-jupyter.sh  tests  tutorials
jmcphaul@bcm-dgxa100-0003:/workspace/nemo$ cd $WORK
jmcphaul@bcm-dgxa100-0003:/work/users/jmcphaul$ ls
environment.yml  environment1st.yml  environment2.yml  modelCNN.sh  model_CNN_CIFAR10.py  ncurses-6.2  nemo  readline-8.1  requirements2.txt  sqsh  submit_job1.sh
jmcphaul@bcm-dgxa100-0003:/work/users/jmcphaul$ cd sqsh
jmcphaul@bcm-dgxa100-0003:/work/users/jmcphaul/sqsh$ ls
lua-5.3.5  nvidia+nemo+22.09.sqsh  nvidia+tensorflow+22.12-tf2-py3.sqsh  readline-8.1  tensorflow_22.12-tf2-py3.sif
jmcphaul@bcm-dgxa100-0003:/work/users/jmcphaul/sqsh$
REDO THIS Page:
https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/07-Application%20NEMO%20for%20Sentiment%20Analysis/index.html ######################################
source ~/mambaforge/bin/activate
condayes
conda config --set auto_activate_base true
mamba config --set auto_activate_base true
srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+nemo+22.09.sqsh --container-mounts=$WORK --pty bash -i
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
NEXT SECTION
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
nano cifar100m_job.sh
#!/bin/bash
#SBATCH -J CIFAR100M # job name to display in squeue
#SBATCH -c 16 --mem=750G          # request 16 CPUs and 750G memory
#SBATCH -o output-%j.txt # standard output file
#SBATCH -e error-%j.txt # standard error file
#SBATCH --gres=gpu:8
#SBATCH -t 1440 # maximum runtime in minutes
#SBATCH -D /work/users/tuev/cv1/cifar100/multi
#SBATCH --exclusive
#SBATCH --mail-user tuev@smu.edu
#SBATCH --mail-type=end
srun --container-image=$WORK/sqsh/nvidia+tensorflow+22.02-tf2-py3.sqsh --container-mounts=$WORK mpirun -np 8 --allow-run-as-root --oversubscribe python /work/users/tuev/cv1/cifar100/multi/cifar100spod-hvod.py
chmod +x cifar100m_job.sh
sbatch cifar100m_job.sh
basic run: srun -n1 --gres=gpu:1 -c2 --mem=12gb --time=12:00:00 --pty $SHELL
mkdir -p /users/jmcphaul/my_work_directory
Be on the login node to submit the batch script:
# excerpt from YOLOv5 train.py
try:
    import comet_ml  # must be imported before torch (if installed)
except ImportError:
    comet_ml = None

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import yaml
from torch.optim import lr_scheduler
from tqdm import tqdm

FILE = Path(__file__).resolve()
ROOT = FILE.parents[0]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH
ROOT = Path(os.path.relpath(ROOT, Path.cwd()))  # relative

import val as validate  # for end-of-epoch mAP
from models.experimental import attempt_load
from models.yolo import Model
from utils.autoanchor import check_anchors
from utils.autobatch import check_train_batch_size
from utils.callbacks import Callbacks
from utils.dataloaders import create_dataloader
from utils.downloads import attempt_download, is_url
from utils.general import (
%%%%%%%%%%%%%%%%%%%% NEXT %%%%%%%%%%%%%%%%%%%%
git clone https://github.com/ultralytics/yolov5.git
srun -n1 --gres=gpu:1 --container-image $WORK/sqsh/nvidia+nemo+22.04.sqsh --container-mounts=$WORK --time=12:00:00 --pty $SHELL
cd $WORK
ls
cd yolov5
pip install -r requirements.txt
** NEW RUN NODE: **
srun --partition=short --gres=gpu:1 --cpus-per-task=8 --mem=64GB --time=1:00:00 --pty $SHELL
module load conda gcc
module load cuda
module load cudnn
conda create -n nemo_text_classification python=3.8
conda activate nemo_text_classification
pip install tensorflow==2.11
pip install ipykernel
python3 -m ipykernel install --user --name nemo_text_classification --display-name NeMo
mamba install -c conda-forge numpy pandas matplotlib scikit-learn jupyter
mamba install -c pytorch pytorch torchvision torchaudio cudatoolkit=11.1
pip install nemo_toolkit['nlp'] pytorch-lightning transformers
mamba install cython
BRANCH='r2.0.0rc0'
python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]
https://chatgpt.com/c/23dae27c-b53d-4d23-9351-1f322b12c7c5
tensorgpu
load_script.py:
import ctypes
import os

cuda_path = "/hpc/mp/repos/spack/opt/spack/linux-ubuntu22.04-zen2/gcc-13.2.0/cuda-12.4.1-vz7djzzlmvr6dgtrfgqletmtahxhwkm6/lib64/libcudart.so"
cudnn_path = "/hpc/mp/repos/spack/opt/spack/linux-ubuntu22.04-zen2/gcc-13.2.0/cudnn-9.1.1.17-12-ld5h22cq2cbo6hpbi4dvlijb3mwenclg/lib/libcudnn.so"

try:
    ctypes.CDLL(cuda_path)
    print(f"Successfully loaded {cuda_path}")
except OSError as e:
    print(f"Error loading {cuda_path}: {e}")

try:
    ctypes.CDLL(cudnn_path)
    print(f"Successfully loaded {cudnn_path}")
except OSError as e:
    print(f"Error loading {cudnn_path}: {e}")
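If both libraries load, a natural follow-up (mirroring the GPU checks used elsewhere in these notes) is to confirm TensorFlow actually registers the device; a minimal sketch:

# Follow-up check (sketch): confirm TensorFlow sees the GPU once libcudart/libcudnn load.
import tensorflow as tf
print('GPUs visible to TensorFlow:', tf.config.list_physical_devices('GPU'))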
$ srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+tensorflow+22.12-tf2-py3.sqsh --container-mounts=$WORK --pty $SHELL
root@bcm-dgxa100-0001:/workspace# jupyter lab --allow-root --no-browser --ip=0.0.0.0
cd $WORK
python train_mnist.py
https://chatgpt.com/c/989a65f4-b75a-41d1-a025-22131aba3b7b
https://gemini.google.com/app/221068a33333d467
https://chatgpt.com/g/g-phPzPLYLi-tensorflow-keras-gpt/c/8969e520-76fa-46b1-8148-4dd8b47359a3
srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+tensorflow+22.12-tf2-py3.sqsh --container-mounts=$WORK --pty $SHELL
conda --version
http://bcm-dgxa100-0008:8888/?token=6ceb3dee1350b0a72bf863516bb5733dc9a7f6ab1eb3c4af
http://bcm-dgxa100-0020:8888/?token=9cfcd29039a650d33396d64a4d623e18334a8d2fe1113eaa
qm9.csv
Great! Using mamba for managing your conda environment can be faster and more efficient. Here's how you can resolve the issue using mamba:
Activate your conda environment:
conda activate /users/jmcphaul/my_conda_envs/rapids23.02
Check the installed PyTorch version and its dependencies:
python -c "import torch; print(torch.__version__)"
If this command fails, it indicates an issue with the PyTorch installation.
Reinstall PyTorch using mamba:
First, uninstall the current PyTorch installation:
mamba uninstall pytorch
Then, reinstall PyTorch with the appropriate CUDA version (assuming you’re using CUDA 11.1):
mamba install pytorch torchvision torchaudio cudatoolkit=11.1 -c pytorch -c nvidia
Verify the installation again:
python -c "import torch; print(torch.__version__)"
Ensure that this command runs without any errors and prints the PyTorch version.
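It can also help to confirm the GPU build is actually usable, matching the torch.cuda.is_available() check used earlier in these notes; a quick one-liner sketch:

python -c "import torch; print(torch.__version__, torch.cuda.is_available(), torch.cuda.device_count())"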
Run your script again:
python /work/users/jmcphaul/nemo/NeMo/examples/nlp/text_classification/text_classification_with_bert.py \
model.dataset.num_classes=2 \
model.dataset.max_seq_length=256 \
model.train_ds.batch_size=64 \
model.validation_ds.batch_size=64 \
model.language_model.pretrained_model_name='bert-base-cased' \
model.train_ds.file_path=/work/users/jmcphaul/nemo/train_nemo_format.tsv \
model.validation_ds.file_path=/work/users/jmcphaul/nemo/dev_nemo_format.tsv \
trainer.num_nodes=1 \
trainer.max_epochs=20 \
trainer.precision=16 \
model.optim.name=adam \
model.optim.lr=1e-4
By following these steps using mamba, you should be able to resolve the import error and run your script successfully. If you encounter any further issues, please let me know!
nvcr.io/nvidia/nemo_bert_text_classification:20.07
nvcr.io/nvidia/nemo:23.08
enroot import docker://nvcr.io#nvidia/nemo:23.08
enroot import docker://nvcr.io#nvidia/nemo_bert_text_classification:20.07
https://southernmethodistuniversity.github.io/hpc_docs/accounts.html
https://southernmethodistuniversity.github.io/hpc_docs/index.html
https://southernmethodistuniversity.github.io/hpc_docs/access.html
https://southernmethodistuniversity.github.io/hpc_docs/portal.html
https://southernmethodistuniversity.github.io/hpc_docs/m3_migration.html
https://southernmethodistuniversity.github.io/hpc_docs/mp_update.html
https://southernmethodistuniversity.github.io/hpc_docs/examples/conda/README.html
https://southernmethodistuniversity.github.io/hpc_docs/examples/torch/README.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/unix/unix.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/unix/scripting.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/modules/modules.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/slurm/slurm.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/slurm/best_practices.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/containers/containers.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/cpp/intro.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/cpp/session_1.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/cpp/session_2.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/cpp/session_3.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/cpp/session_4.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/cpp/session_5.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/r/overview.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/r/environments.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/r/slurm.html
https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/index.html
https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/04-Using%20JupterLab/index.html
https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/02-Working%20with%20Conda/index.html
https://carpentries-incubator.github.io/introduction-to-conda-for-data-scientists/aio/index.html#:~:text=Conda%20is%20a%20platform%20agnostic,and%20targets%20multiple%20programming%20languages.
https://vpn.smu.edu/+CSCOE+/portal.html
https://www.tensorflow.org/install
https://github.com/NVIDIA/NeMo/blob/main/tutorials/nlp/Text_Classification_Sentiment_Analysis.ipynb
https://carpentries.org/community-lessons/
https://www.sciencedirect.com/science/article/pii/S0165027020303848
https://nlp.stanford.edu/sentiment/index.html
https://gluebenchmark.com/tasks/
https://github.com/NVIDIA/NeMo/tree/main/scripts/checkpoint_converters
https://docs.nvidia.com/nemo-framework/user-guide/latest/sft_peft/index.html
https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/nlp/text_classification.html
https://www.google.com/search?q=raster-based+spatial+analysis&rlz=1C1ONGR_enUS1116US1116&oq=raster-based+spatial+analysis&gs_lcrp=EgZjaHJvbWUyBggAEEUYOTIICAEQABgWGB4yDQgCEAAYhgMYgAQYigUyDQgDEAAYhgMYgAQYigUyDQgEEAAYhgMYgAQYigUyCggFEAAYgAQYogQyCggGEAAYgAQYogQyCggHEAAYgAQYogQyCggIEAAYgAQYogTSAQgxMTU2ajBqN6gCALACAA&sourceid=chrome&ie=UTF-8
https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo
https://docs.rapids.ai/visualization
https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/08-Applications%20of%20Horovod%20for%20MultiGPUs/index.html
https://rapids.ai/
https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/06-RAPIDS/index.html
https://carpentries-incubator.github.io/introduction-to-conda-for-data-scientists/aio/index.html#:~:text=Conda%20is%20a%20platform%20agnostic,and%20targets%20multiple%20programming%20languages.
https://slurm.schedmd.com/
https://www.smu.edu/oit/services/qualtrics
https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/index.html
https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/12-Pipeline%20with%20pretrained%20Hugging%20Face/index.html
https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/03-Using%20container/index.html
https://education.github.com/pack/offers
https://www.smu.edu/oit/training
https://catalog.ngc.nvidia.com/orgs/nvidia/teams/rapidsai/containers/rapidsai