import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
file_path = '/home/jmcphaul/qm9.csv'
data = pd.read_csv(file_path)
print(data.head())
features = data.drop(columns=['target'])
target = data['target']
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)
input_layer = Input(shape=(X_train.shape[1],))
hidden_layer = Dense(64, activation='relu')(input_layer)
output_layer = Dense(1)(hidden_layer)
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
history = model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
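If the validation curve starts rising while the training loss keeps falling, an early-stopping callback is one way to end the run automatically. This is a minimal optional sketch (not part of the original run) using the standard Keras EarlyStopping callback with the same model:

# Optional sketch: stop training once val_loss stops improving (not in the original run).
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32, callbacks=[early_stop])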
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
file_path = 'qm9.csv'
data = pd.read_csv(file_path)
print(data.head())
print(data.info())
print(data.describe())
print(data.isnull().sum())
features = data.drop(columns=['mol_id', 'smiles', 'gap'])
target = data['gap']
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)
print(f"Train Features: {X_train.shape}, Train Target: {y_train.shape}")
print(f"Test Features: {X_test.shape}, Test Target: {y_test.shape}")
input_layer = Input(shape=(X_train.shape[1],))
hidden_layer = Dense(64, activation='relu')(input_layer)
output_layer = Dense(1)(hidden_layer)
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
history = model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32)
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
mae = tf.keras.metrics.mean_absolute_error(y_test, y_pred.ravel()).numpy()  # flatten predictions so shapes align
print(f'Mean Absolute Error: {mae}')
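A predicted-vs-actual scatter plot makes the MSE/MAE numbers easier to interpret. A minimal sketch using the variables already defined above (y_test, y_pred):

# Parity plot sketch: predicted vs. true gap values.
plt.scatter(y_test, y_pred.ravel(), s=5, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')  # ideal y = x line
plt.xlabel('True gap')
plt.ylabel('Predicted gap')
plt.title('Predicted vs. actual')
plt.show()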
data = data.select_dtypes(include=[np.number])
data_scaled = scaler.fit_transform(data)
X_train, X_test = train_test_split(data_scaled, test_size=0.2, random_state=42)
input_dim = X_train.shape[1]
encoding_dim = 32  # This can be adjusted
input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='sigmoid')(encoded)
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mse')
history = autoencoder.fit(X_train, X_train, epochs=50, batch_size=256, shuffle=True, validation_data=(X_test, X_test), verbose=1)
X_train_pred = autoencoder.predict(X_train)
train_loss = np.mean(np.square(X_train - X_train_pred), axis=1)
X_test_pred = autoencoder.predict(X_test)
test_loss = np.mean(np.square(X_test - X_test_pred), axis=1)
threshold = np.percentile(train_loss, 95) # 95th percentile
anomalies = test_loss > threshold
print(f'Number of anomalies detected: {np.sum(anomalies)}')
plt.figure(figsize=(10, 6))
plt.hist(test_loss, bins=50)
plt.axvline(threshold, color='r', linestyle='dashed', linewidth=2)
plt.xlabel('Reconstruction loss')
plt.ylabel('Number of samples')
plt.title('Reconstruction Loss for Test Data')
plt.show()
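To see which test rows were flagged, the boolean mask can be turned back into indices. A minimal sketch, assuming X_test keeps the row order produced by train_test_split above:

# Indices (within X_test) of samples whose reconstruction loss exceeds the threshold (sketch).
anomaly_idx = np.where(anomalies)[0]
print(f'First few anomalous test rows: {anomaly_idx[:10]}')
print(f'Their reconstruction losses: {test_loss[anomaly_idx[:10]]}')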
def dynamic_encoding_dim(data_point):
    # Example: adjust the encoding dimension based on the mean of the data point
    mean_val = np.mean(data_point)
    if mean_val < -1:
        return 16
    elif mean_val < 0:
        return 32
    else:
        return 64
def create_autoencoder(input_dim, encoding_dim):
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(encoding_dim, activation='relu')(input_layer)
    decoded = Dense(input_dim, activation='sigmoid')(encoded)
    autoencoder = Model(input_layer, decoded)
    autoencoder.compile(optimizer='adam', loss='mse')
    return autoencoder
for epoch in range(50):  # number of epochs
    for batch_start in range(0, X_train.shape[0], 256):  # batch size 256
        batch_end = min(batch_start + 256, X_train.shape[0])
        X_batch = X_train[batch_start:batch_end]
        encoding_dim = dynamic_encoding_dim(np.mean(X_batch, axis=0))
        autoencoder = create_autoencoder(input_dim, encoding_dim)  # note: builds a fresh, untrained model for every batch
        autoencoder.fit(X_batch, X_batch, epochs=1, verbose=0)
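Because the loop above creates a new autoencoder for every batch, previously learned weights are discarded each time. One alternative sketch (my assumption about the intent, not the original approach) is to build one autoencoder per candidate encoding dimension up front and reuse it whenever a batch maps to that dimension:

# Alternative sketch: keep one autoencoder per encoding dimension so weights persist across batches.
autoencoders = {dim: create_autoencoder(input_dim, dim) for dim in (16, 32, 64)}
for epoch in range(50):
    for batch_start in range(0, X_train.shape[0], 256):
        X_batch = X_train[batch_start:batch_start + 256]
        dim = dynamic_encoding_dim(np.mean(X_batch, axis=0))
        autoencoders[dim].fit(X_batch, X_batch, epochs=1, verbose=0)  # reuses the model for this dimension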
$ srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --pty bash
$ module load conda gcc
$ module load cuda
$ module load cudnn
$ conda create --prefix ~/tensorflow_2.9 python=3.8 pip -y
$ conda update -n base -c conda-forge conda
$ conda activate /users/jmcphaul/tensorflow_2.9
$ source activate /users/jmcphaul/tensorflow_2.9/
$ pip install tensorflow==2.9.1
$ pip install ipykernel
$ python3 -m ipykernel install --user --name tensorflow_2.9 --display-name TensorflowGPU29
$ python
>>> import tensorflow as tf
>>> tf.config.list_physical_devices('GPU')
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
$ conda create --prefix ~/pytorch_1.13 python=3.8 pip -y
$ conda activate /users/jmcphaul/pytorch_1.13
$ source activate /users/jmcphaul/pytorch_1.13
$ source activate ~/pytorch_1.13
$ conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 pytorch-cuda=11.6 -c pytorch -c nvidia -y
$ python
>>> import torch
>>> torch.cuda.is_available()
True
>>> torch.cuda.device_count()
1
$ enroot import docker://ubuntu $ enroot create ubuntu.sqsh $ enroot start ubuntu
#Type ls to see the content of container: # ls
bin dev home lib32 libx32 mnt proc run srv tmp usr boot etc lib lib64 media opt root sbin sys users var
enroot import docker://godlovedc/lolcow
enroot create godlovedc+lolcow.sqsh
enroot start godlovedc+lolcow
conda activate /users/jmcphaul/tensorflow_2.9
source activate /users/jmcphaul/tensorflow_2.9/
cd $WORK/sqsh
enroot import docker://nvcr.io#nvidia/tensorflow:22.12-tf2-py3
enroot import docker://nvcr.io/nvidia/tensorflow:24.06-tf2-py3   (returned unauthorized, so I used the 22.12 image above)
enroot create nvidia+tensorflow+22.12-tf2-py3.sqsh exit
srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+tensorflow+22.12-tf2-py3.sqsh --container-mounts=$WORK --pty $SHELL
srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+tensorflow+22.12-tf2-py3.sqsh --container-mounts=$HOME --pty bash
conda activate /users/jmcphaul/tensorflow_2.9
tf.config.list_physical_devices('GPU')
PAY ATTENTION HERE:
nano submit_job.sh
#!/bin/bash
#SBATCH -J Testing
#SBATCH -o output-%j.txt
#SBATCH -e error-%j.txt
#SBATCH -p batch -c 12 --mem=20G --gres=gpu:1
#SBATCH -t 1440
#SBATCH -D /link-to-your-folder/
srun --container-image=/work/users/tuev/sqsh/nvidia+tensorflow+22.12-tf2-py3.sqsh --container-mounts=$WORK python testing.py
chmod +x submit_job.sh
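testing.py is not shown in these notes; a minimal hypothetical placeholder that simply confirms the container sees the GPU could look like this (an assumption, not the actual script):

# testing.py (hypothetical placeholder): confirm TensorFlow inside the container can see the GPU.
import tensorflow as tf
print('TensorFlow version:', tf.__version__)
print('GPUs visible:', tf.config.list_physical_devices('GPU'))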
PAY ATTENTION - USING CONTAINER IN JUPYTER: after requesting a node with the container, run the Jupyter command with the additional flag --allow-root
/workspace# jupyter lab --allow-root --no-browser --ip=0.0.0.0
http://hostname:8888/?token=86f3c661ba3637af574f976a80d0d296c28208124d838e05
**MODIFY THIS** (change hostname and open in firefox)
http://bcm-dgxa100-0002:8888/?token=86f3c661ba3637af574f976a80d0d296c28208124d838e05
once in Firefox, use the default python3 kernel:
import tensorflow as tf
tf.config.list_physical_devices('GPU')
THEN IN TERMINAL:
ln -s $WORK work
bash: /hpc/sys/apps/lmod/lmod/libexec/lmod: No such file or directory
(base) jmcphaul@bcm-dgxa100-0002:/workspace$ ln -s $WORK work
(base) jmcphaul@bcm-dgxa100-0002:/workspace$
exit
$ srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --pty $SHELL
$ module load conda gcc
$ module load cuda
$ module load cudnn
$ conda activate /users/jmcphaul/tensorflow_2.9
$ pip install jupyterlab
$ jupyter lab --ip=0.0.0.0 --no-browser
go to browser:
use TensorflowGPU29
import tensorflow as tf
tf.config.list_physical_devices('GPU')
--- https://www.tensorflow.org/install/pip ---
in terminal:
nvidia-smi
pip install --upgrade pip
# For GPU users
pip install tensorflow[and-cuda]
python3 -c "import tensorflow as tf; print(tf.reduce_sum(tf.random.normal([1000, 1000])))"
rerun notebook command
back to terminal:
pip install jupyterlab
jupyter lab --ip=0.0.0.0 --no-browser --allow-root
from terminal:
cd $WORK
wget https://raw.githubusercontent.com/SouthernMethodistUniversity/SMU_SuperPOD_101/e6315c29ca0542351b79233729708dfa16161cdf/files/model_CNN_CIFAR10.py
scontrol show partition
-or-
sinfo
nano submit_job.sh
#!/bin/bash
#SBATCH -J CNN_CIFAR10_SPOD # job name to display in squeue
#SBATCH -t 60 # maximum runtime in minutes
#SBATCH -c 2 # request 2 CPUs
#SBATCH -G 1 # request 1 GPU A100
#SBATCH -p batch # specify the correct partition name
#SBATCH -D /work/users/tuev # link to your folder
#SBATCH --mem=32gb                 # request 32GB memory
#SBATCH --mail-user tuev@smu.edu   # request to email your email ID
#SBATCH --mail-type=end            # request to email when the job ends
module load conda gcc
module load cuda cudnn
conda activate ~/tensorflow_2.9
python model_CNN_CIFAR10.py
sbatch submit_job.sh
nano submit_job1.sh
#!/bin/bash
#SBATCH -J CNN_CIFAR10_SPOD        # job name to display in squeue
#SBATCH -t 60                      # maximum runtime in minutes
#SBATCH -c 2                       # request 2 CPUs
#SBATCH -G 1                       # request 1 A100 GPU
#SBATCH -p short                   # request queue name workshop (optional)
#SBATCH -D /work/users/tuev        # link to your folder
#SBATCH --mem=32gb                 # request 32GB memory
#SBATCH --mail-user tuev@smu.edu   # request to email your email ID
#SBATCH --mail-type=end            # request to email when the job ends
module load conda gcc
module load cuda cudnn
conda activate ~/tensorflow_2.9
python model_CNN_CIFAR10.py
nano submit_job.sh
#!/bin/bash
#SBATCH -J example_job
#SBATCH -t 60
#SBATCH -c 2
#SBATCH --gres=gpu:1
#SBATCH -p batch
#SBATCH --mem=32gb
module load conda gcc
module load cuda cudnn
conda activate ~/tensorflow_2.9
python model_CNN_CIFAR10.py
ON SCREEN RIGHT AFTER SSH (LOGIN NODE, NO NODE REQUEST):
sbatch submit_job.sh
nano modelCNN.sh
#!/bin/bash
#SBATCH -J CNN_CIFAR10_SPOD        # job name to display in squeue
#SBATCH -t 60                      # maximum runtime in minutes
#SBATCH -c 2                       # request 2 CPUs
#SBATCH -G 1                       # request 1 A100 GPU
#SBATCH -p batch                   # request queue name workshop (optional)
#SBATCH -D /work/users/tuev        # link to your folder
#SBATCH --mem=32gb                 # request 32GB memory
#SBATCH --mail-user tuev@smu.edu   # request to email your email ID
#SBATCH --mail-type=end            # request to email when the model ends
module load conda gcc
module load cuda cudnn
conda activate ~/tensorflow_2.9
python model_CNN_CIFAR10.py
sbatch modelCNN.sh
nano modelCNN_ngc.sh
#!/bin/bash
#SBATCH -J CNN_CIFAR10_SPOD        # job name to display in squeue
#SBATCH -t 60                      # maximum runtime in minutes
#SBATCH -c 2                       # request 2 CPUs
#SBATCH -G 1                       # request 1 A100 GPU
#SBATCH -p batch                   # request queue name workshop (optional)
#SBATCH --mem=32gb                 # request 32GB memory
#SBATCH --mail-user tuev@smu.edu   # request to email your email ID
#SBATCH --mail-type=end            # request to email when the model ends
srun --container-image=/work/users/tuev/sqsh/nvidia+tensorflow+22.12-tf2-py3.sqsh --container-mounts=$WORK python $WORK/model_CNN_CIFAR10.py
sbatch modelCNN_ngc.sh
CURRENTLY ON LESSON 5. 7 more after this.
USEFUL COMMANDS:
Here are some basic SLURM commands for submitting, querying and deleting jobs on the SuperPOD:

srun -N1 -G1 --pty $SHELL : submit an interactive job (reserves 1 node, 1 GPU, 1 CPU, 6 GB RAM, 1 hour walltime)
sbatch job.sh : submit the job script job.sh
sstat <job id> : check the status of the job with the given job ID
sstat <job id> --format=AveCPU,AvePages,AveRSS,AveVMSize,JobID : narrow the information shown by sstat
squeue -u <username> : check the status of all jobs submitted by the given username
scontrol show job <job id> : check detailed information for the job with the given job ID
scancel <job id> : delete the queued or running job with the given job ID

Check pending and running jobs:
$ squeue -u $USERNAME
JOBID PARTITION NAME USER ST TIME  NODES NODELIST(REASON)
12345 workshop  bash tuev R  39:46 1     bcm-dgxa100-0002
The job above has JOBID=12345, which is used below.

Check the configuration of any requested job using the JOBID:
$ scontrol show job 12345 | grep ReqTRES
ReqTRES=cpu=5,mem=30G,node=1,billing=5,gres/gpu=1

Delete any job:
$ scancel 12345

Checking how your job is running on a node: once you know your working node (for example bcm-dgxa100-0001), you can log in to the compute node from the login node and check the processes.
Command to check working CPUs:
$ ssh bcm-dgxa100-0001
$ top -u $USERNAME
Command to check working GPUs:
$ ssh bcm-dgxa100-0001
$ nvidia-smi
OR, to refresh the output every 0.2 s:
$ watch -n .2 nvidia-smi
squeue -u jmcphaul
scontrol show job 12345 | grep ReqTRES
SET USERNAME:
export USERNAME=jmcphaul
then: squeue -u $USERNAME
top -u jmcphaul
export USERNAME=jmcphaul
top -u $USERNAME
------------- https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/06-RAPIDS/index.html -------------
srun -n1 --gres=gpu:1 -c2 --mem=4gb --time=12:00:00 --pty $SHELL
module load conda
conda create -n rapids-23.02 -c rapidsai -c conda-forge -c nvidia rapids=23.02 python=3.10 cudatoolkit=11.8
conda create -p ~/my_conda_env mamba -c conda-forge
source activate ~/my_conda_env
mamba create -n rapids-23.02 -c rapidsai -c conda-forge -c nvidia rapids=23.02 python=3.10 cudatoolkit=11.8
TROUBLESHOOTING:
top (or htop)
CONDA_VERBOSE=1 conda create -n rapids-23.02 -c rapidsai -c conda-forge -c nvidia rapids=23.02 python=3.10 cudatoolkit=11.8
conda config --remove-key channels
conda config --add channels defaults
conda config --add channels conda-forge
conda config --add channels nvidia
conda config --add channels rapidsai
df -h
conda install mamba -n base -c conda-forge
mamba create -n rapids-23.02 -c rapidsai -c conda-forge -c nvidia rapids=23.02 python=3.10 cudatoolkit=11.8
export http_proxy=http://your-proxy:port
export https_proxy=https://your-proxy:port
mamba create -n rapids-23.02 -c rapidsai -c conda-forge -c nvidia rapids=23.02 python=3.10 cudatoolkit=11.8
CONDA_VERBOSE=1 conda create -n rapids-23.02 -c rapidsai -c conda-forge -c nvidia rapids=23.02 python=3.10 cudatoolkit=11.8
ping conda.anaconda.org
nslookup conda.anaconda.org
conda search python -c conda-forge
CONTINUED TROUBLESHOOTING:
mamba clean --index-cache --force-pkgs-dirs
conda clean --all
mamba update mamba
conda update mamba
mamba update --all
conda update conda
____________YAML________________________________
nano environment2.yaml
name: rapids23.02
channels:
  - https://conda.anaconda.org/rapidsai
  - https://conda.anaconda.org/conda-forge
  - https://conda.anaconda.org/nvidia
  - defaults
dependencies:
  - rapids=23.02
  - python=3.10
  - cudatoolkit=11.8
name: rapids-23.02
channels:
- rapidsai
- conda-forge
- nvidia
- defaults
dependencies:
- rapids=23.02
- python=3.10
- cudatoolkit=11.8
conda config --set channel_priority flexible
mamba env create -f environment2.yaml
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
# run installer
bash Miniconda3-latest-Linux-x86_64.sh
# Initialize
~/miniconda3/bin/conda init
# Restart terminal
source ~/.bashrc   # or ~/.zshrc if using zsh
conda init
mamba init
conda config --set auto_activate_base false
# change mind:
conda init --reverse $SHELL
conda activate rapids-23.02
conda init
source ~/.bashrc   # or ~/.zshrc
conda env list
conda install
conda remove numpy scipy pandas                 # remove packages
conda remove --all                              # remove all packages / remove the env
conda remove -n my_env numpy                    # remove a package from a named env
conda remove --force numpy                      # force removal
mamba config --set channel_priority flexible    # relax strict channel priority
export HTTPS_PROXY=http://yourproxy:port
export HTTP_PROXY=http://yourproxy:port
unset HTTPS_PROXY
unset HTTP_PROXY
$ enroot import docker://nvcr.io#nvidia/rapidsai/rapidsai:cuda11.2-runtime-centos7-py3.10
$ enroot create nvidia+rapidsai+rapidsai+cuda11.2-runtime-centos7-py3.10.sqsh
srun -N1 -G1 -c10 --mem=6G --time=12:00:00 --container-image $WORK/sqsh/nvidia+rapidsai+rapidsai+cuda11.2-runtime-centos7-py3.10.sqsh --container-mounts=$WORK --pty $SHELL
srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+rapidsai+rapidsai+cuda11.2-runtime-centos7-py3.10.sqsh --container-mounts=$WORK --pty $SHELL
source ~/.bashrc
conda config --set auto_activate_base true
conda config --set auto_activate_base false
remake the nano file:
nano environment2.yaml
name: rapids-23.02
channels:
- rapidsai
- conda-forge
- nvidia
- defaults
dependencies:
- rapids=23.02
- python=3.10
- cudatoolkit=11.8
conda config --set channel_priority flexible
mamba config --set channel_priority flexible
mamba env create -f environment2.yaml
dmesg | grep -i "killed"
rm -rf /users/jmcphaul/my_conda_envs/rapids23.02
chmod 600 myswapfile
mkswap myswapfile
swapon myswapfile
source activate ~/my_conda_env
conda install mamba -c conda-forge
mamba --version
-or-
wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh
conda config --set channel_priority flexible
mamba config --set channel_priority flexible
source ~/mambaforge/bin/activate
condayes
conda config --set auto_activate_base true
mamba config --set auto_activate_base true
conda activate rapids23.02
conda remove jupyter_server
conda clean --all
y
mamba install jupyter_server
mamba install jupyterlab
environment location: /users/jmcphaul/my_conda_envs/rapids23.02
conda activate base
conda create -n conda_update python=3.10
conda activate conda_update
conda install -c conda-forge conda
conda activate rapids23.02
conda activate conda_update
conda update conda
conda update -n rapids23.02 --all -c conda-forge
conda activate rapids23.02
and :
alias conda-deactivate-all='while [[ "$CONDA_SHLVL" -gt 0 ]]; do conda deactivate; done'
conda config --set auto_activate_base false
conda deactivate
source ~/.bashrc
lastly: conda-deactivate-all
rm readline-8.1.tar.gz
enroot import docker://nvcr.io#nvidia/nemo:22.09
enroot create nvidia+nemo+22.09.sqsh
enroot import docker://nvcr.io#nvidia/nemo_bert_text_classification:20.07
enroot create nvidia+nemo_bert_text_classification+20.07.sqsh
cd $WORK
mkdir nemo && cd nemo
curl -s -O https://dl.fbaipublicfiles.com/glue/data/SST-2.zip \
  && unzip -o SST-2.zip -d ./ \
  && sed 1d ./SST-2/train.tsv > ./train_nemo_format.tsv \
  && sed 1d ./SST-2/dev.tsv > ./dev_nemo_format.tsv
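Before launching the container, it can help to confirm the header-stripped TSVs look like the expected sentence/label format. A quick hedged check with pandas, assuming it is run from $WORK/nemo where the files were written (the column names here are just labels assigned for the check):

# Sanity-check the converted SST-2 files (sketch).
import pandas as pd
df = pd.read_csv('train_nemo_format.tsv', sep='\t', header=None, names=['sentence', 'label'], quoting=3)  # quoting=3 == csv.QUOTE_NONE
print(df.head())
print('Rows:', len(df), '| label values:', df['label'].unique())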
srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+nemo+22.09.sqsh --container-mounts=$WORK --pty bash -i
cd $WORK/nemo/SST-2
python /workspace/nemo/examples/nlp/text_classification/text_classification_with_bert.py \
    model.dataset.num_classes=2 \
    model.dataset.max_seq_length=256 \
    model.train_ds.batch_size=64 \
    model.validation_ds.batch_size=64 \
    model.language_model.pretrained_model_name='bert-base-cased' \
    model.train_ds.file_path=train_nemo_format.tsv \
    model.validation_ds.file_path=dev_nemo_format.tsv \
    trainer.num_nodes=1 \
    trainer.max_epochs=20 \
    trainer.precision=16 \
    model.optim.name=adam \
    model.optim.lr=1e-4
Check the GPU usage with the nvidia-smi command.
find / -name text_classification_with_bert.py 2>/dev/null
cd /work/users/jmcphaul/nemo
git clone https://github.com/NVIDIA/NeMo.git
cd NeMo/examples/nlp/text_classification
conda activate rapids23.02
pip install pytorch_lightning
-or-
pip install -r /work/users/jmcphaul/nemo/NeMo/requirements.txt (Install package and Requirements)
python -c "import pytorch_lightning as pl; print(pl.__version__)"
python text_classification_with_bert.py \
    model.dataset.num_classes=2 \
    model.dataset.max_seq_length=256 \
    model.train_ds.batch_size=64 \
    model.validation_ds.batch_size=64 \
    model.language_model.pretrained_model_name='bert-base-cased' \
    model.train_ds.file_path=/work/users/jmcphaul/nemo/train_nemo_format.tsv \
    model.validation_ds.file_path=/work/users/jmcphaul/nemo/dev_nemo_format.tsv \
    trainer.num_nodes=1 \
    trainer.max_epochs=20 \
    trainer.precision=16 \
    model.optim.name=adam \
    model.optim.lr=1e-4
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
source $HOME/.cargo/env
rustc --version
pip install --upgrade pip setuptools
pip freeze > pip_packages.txt
nano pip_packages.txt
cat pip_packages.txt | xargs pip uninstall -y
mamba install -c conda-forge transformers==4.9.2 tokenizers==0.10.3 pytorch-lightning omegaconf cython pandas numpy scikit-learn scipy matplotlib jupyterlab
conda activate /users/jmcphaul/my_conda_envs/rapids23.02
sed -i '/@/d' pip_packages.txt
cat pip_packages.txt | xargs pip uninstall -y
mamba install -c conda-forge transformers tokenizers pytorch-lightning omegaconf
mamba install -c conda-forge cython rust
mamba install -c conda-forge cython
conda activate /users/jmcphaul/my_conda_envs/rapids23.02
mamba install -c conda-forge cython rust
mamba install -c conda-forge cython rust
mamba install -c conda-forge setuptools packaging
sed -i '/@/d' pip_packages.txt
# Step 1: Clean the pip_packages.txt file
sed -i '/@/d' pip_packages.txt
cat pip_packages.txt | xargs pip uninstall -y
cat pip_packages.txt | xargs pip uninstall -y
mamba install -c conda-forge transformers tokenizers pytorch-lightning omegaconf
mamba install -c conda-forge numpy
mamba install -c nvidia -c conda-forge nemo_toolkit[all]
python -c "import nemo; print(nemo.__version__)"
*** NANO *** Steps to select and copy all text in nano:
Open the file in nano: nano pip_packages.txt
Navigate to the start of the file: press Ctrl + _ (underscore), then Ctrl + A, to move the cursor to the start of the file.
Start marking text: press Ctrl + ^ (Ctrl and the caret key) to start marking text. This sets the starting point of the selection.
Move to the end of the file: press Ctrl + _ (underscore), then Ctrl + E, to move the cursor to the end of the file. This highlights all the text from the start to the end of the file.
Copy the selected text: press Alt + 6 (Option + 6 on Mac) to copy the selected text to the clipboard.
Close the file: press Ctrl + X to exit nano.
git clone https://github.com/NVIDIA/NeMo.git
mamba uninstall pytorch
conda list | grep cudatoolkit
mamba list | grep cudatoolkit
srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+nemo+22.09.sqsh --container-mounts=$WORK --pty bash -i
srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+rapidsai+rapidsai+cuda11.2-runtime-centos7-py3.10.sqsh --container-mounts=$WORK --pty $SHELL
*** NOTE NEXT TIME FROM HERE: ***
Success. Logging you in...
Success. Logging you in...
Last login: Tue Jul 16 23:18:07 2024 from 129.119.70.150
jmcphaul@slogin-02:~$ srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+nemo+22.09.sqsh --container-mounts=$WORK --pty bash -i
bash: /hpc/sys/apps/lmod/lmod/libexec/lmod: No such file or directory
jmcphaul@bcm-dgxa100-0003:/workspace/nemo$ $ srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+rapidsai+rapidsai+cuda11.2-runtime-centos7-py3.10.sqsh --container-mounts=$WORK --pty $SHELL
bash: $: command not found
jmcphaul@bcm-dgxa100-0003:/workspace/nemo$ srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+rapidsai+rapidsai+cuda11.2-runtime-centos7-py3.10.sqsh --container-mounts=$WORK --pty $SHELL
bash: srun: command not found
jmcphaul@bcm-dgxa100-0003:/workspace/nemo$ ls
examples  scripts  start-jupyter.sh  tests  tutorials
jmcphaul@bcm-dgxa100-0003:/workspace/nemo$ cd $WORK
jmcphaul@bcm-dgxa100-0003:/work/users/jmcphaul$ ls
environment.yml  environment1st.yml  environment2.yml  modelCNN.sh  model_CNN_CIFAR10.py  ncurses-6.2  nemo  readline-8.1  requirements2.txt  sqsh  submit_job1.sh
jmcphaul@bcm-dgxa100-0003:/work/users/jmcphaul$ cd sqsh
jmcphaul@bcm-dgxa100-0003:/work/users/jmcphaul/sqsh$ ls
lua-5.3.5  nvidia+nemo+22.09.sqsh  nvidia+tensorflow+22.12-tf2-py3.sqsh  readline-8.1  tensorflow_22.12-tf2-py3.sif
jmcphaul@bcm-dgxa100-0003:/work/users/jmcphaul/sqsh$
REDO THIS Page:
https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/07-Application%20NEMO%20for%20Sentiment%20Analysis/index.html ######################################
source ~/mambaforge/bin/activate
condayes
conda config --set auto_activate_base true
mamba config --set auto_activate_base true
srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+nemo+22.09.sqsh --container-mounts=$WORK --pty bash -i
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
NEXT SECTION
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
nano cifar100m_job.sh
#!/bin/bash
#SBATCH -J CIFAR100M # job name to display in squeue
#SBATCH -c 16 --mem=750G          # request 16 CPUs and 750G memory
#SBATCH -o output-%j.txt # standard output file
#SBATCH -e error-%j.txt # standard error file
#SBATCH --gres=gpu:8
#SBATCH -t 1440 # maximum runtime in minutes
#SBATCH -D /work/users/tuev/cv1/cifar100/multi
#SBATCH --exclusive
#SBATCH --mail-user tuev@smu.edu
#SBATCH --mail-type=end
srun --container-image=$WORK/sqsh/nvidia+tensorflow+22.02-tf2-py3.sqsh --container-mounts=$WORK mpirun -np 8 --allow-run-as-root --oversubscribe python /work/users/tuev/cv1/cifar100/multi/cifar100spod-hvod.py
chmod +x cifar100m_job.sh
sbatch cifar100m_job.sh
basic run: srun -n1 --gres=gpu:1 -c2 --mem=12gb --time=12:00:00 --pty $SHELL
mkdir -p /users/jmcphaul/my_work_directory
Be on the login node to submit the batch script:
# excerpt from YOLOv5 train.py
try:
    import comet_ml  # must be imported before torch (if installed)
except ImportError:
    comet_ml = None

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import yaml
from torch.optim import lr_scheduler
from tqdm import tqdm

FILE = Path(__file__).resolve()
ROOT = FILE.parents[0]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH
ROOT = Path(os.path.relpath(ROOT, Path.cwd()))  # relative

import val as validate  # for end-of-epoch mAP
from models.experimental import attempt_load
from models.yolo import Model
from utils.autoanchor import check_anchors
from utils.autobatch import check_train_batch_size
from utils.callbacks import Callbacks
from utils.dataloaders import create_dataloader
from utils.downloads import attempt_download, is_url
from utils.general import (
%%%%%%%%%%%%%%%%%%%% NEXT %%%%%%%%%%%%%%%%%%%%
git clone https://github.com/ultralytics/yolov5.git
srun -n1 --gres=gpu:1 --container-image $WORK/sqsh/nvidia+nemo+22.04.sqsh --container-mounts=$WORK --time=12:00:00 --pty $SHELL
cd $WORK
ls
cd yolov5
pip install -r requirements.txt
** NEW RUN NODE: **
srun --partition=short --gres=gpu:1 --cpus-per-task=8 --mem=64GB --time=1:00:00 --pty $SHELL
module load conda gcc
module load cuda
module load cudnn
conda create -n nemo_text_classification python=3.8
conda activate nemo_text_classification
pip install tensorflow==2.11
pip install ipykernel
python3 -m ipykernel install --user --name nemo_text_classification --display-name NeMo
mamba install -c conda-forge numpy pandas matplotlib scikit-learn jupyter
mamba install -c pytorch pytorch torchvision torchaudio cudatoolkit=11.1
pip install nemo_toolkit['nlp'] pytorch-lightning transformers
mamba install cython
BRANCH='r2.0.0rc0'
python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]
https://chatgpt.com/c/23dae27c-b53d-4d23-9351-1f322b12c7c5
tensorgpu
load_script.py:
import ctypes
import os

cuda_path = "/hpc/mp/repos/spack/opt/spack/linux-ubuntu22.04-zen2/gcc-13.2.0/cuda-12.4.1-vz7djzzlmvr6dgtrfgqletmtahxhwkm6/lib64/libcudart.so"
cudnn_path = "/hpc/mp/repos/spack/opt/spack/linux-ubuntu22.04-zen2/gcc-13.2.0/cudnn-9.1.1.17-12-ld5h22cq2cbo6hpbi4dvlijb3mwenclg/lib/libcudnn.so"

try:
    ctypes.CDLL(cuda_path)
    print(f"Successfully loaded {cuda_path}")
except OSError as e:
    print(f"Error loading {cuda_path}: {e}")

try:
    ctypes.CDLL(cudnn_path)
    print(f"Successfully loaded {cudnn_path}")
except OSError as e:
    print(f"Error loading {cudnn_path}: {e}")
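If both libraries load, a natural follow-up (mirroring the GPU checks used elsewhere in these notes) is to confirm TensorFlow actually registers the device; a minimal sketch:

# Follow-up check (sketch): confirm TensorFlow sees the GPU once libcudart/libcudnn load.
import tensorflow as tf
print('GPUs visible to TensorFlow:', tf.config.list_physical_devices('GPU'))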
$ srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+tensorflow+22.12-tf2-py3.sqsh --container-mounts=$WORK --pty $SHELL
root@bcm-dgxa100-0001:/workspace# jupyter lab --allow-root --no-browser --ip=0.0.0.0
cd $WORK
python train_mnist.py
https://chatgpt.com/c/989a65f4-b75a-41d1-a025-22131aba3b7b
https://gemini.google.com/app/221068a33333d467
https://chatgpt.com/g/g-phPzPLYLi-tensorflow-keras-gpt/c/8969e520-76fa-46b1-8148-4dd8b47359a3
srun -N1 -G1 -c10 --mem=64G --time=12:00:00 --container-image $WORK/sqsh/nvidia+tensorflow+22.12-tf2-py3.sqsh --container-mounts=$WORK --pty $SHELL
conda --version
http://bcm-dgxa100-0008:8888/?token=6ceb3dee1350b0a72bf863516bb5733dc9a7f6ab1eb3c4af
http://bcm-dgxa100-0020:8888/?token=9cfcd29039a650d33396d64a4d623e18334a8d2fe1113eaa
qm9.csv
Great! Using mamba for managing your conda environment can be faster and more efficient. Here's how you can resolve the issue using mamba:
Activate your conda environment:
conda activate /users/jmcphaul/my_conda_envs/rapids23.02
Check the installed PyTorch version and its dependencies:
python -c "import torch; print(torch.__version__)"
If this command fails, it indicates an issue with the PyTorch installation.
Reinstall PyTorch using mamba:
First, uninstall the current PyTorch installation:
mamba uninstall pytorch
Then, reinstall PyTorch with the appropriate CUDA version (assuming you’re using CUDA 11.1):
mamba install pytorch torchvision torchaudio cudatoolkit=11.1 -c pytorch -c nvidia
Verify the installation again:
python -c "import torch; print(torch.__version__)"
Ensure that this command runs without any errors and prints the PyTorch version.
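It can also help to confirm the GPU build is actually usable, matching the torch.cuda.is_available() check used earlier in these notes; a quick one-liner sketch:

python -c "import torch; print(torch.__version__, torch.cuda.is_available(), torch.cuda.device_count())"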
Run your script again:
python /work/users/jmcphaul/nemo/NeMo/examples/nlp/text_classification/text_classification_with_bert.py \
model.dataset.num_classes=2 \
model.dataset.max_seq_length=256 \
model.train_ds.batch_size=64 \
model.validation_ds.batch_size=64 \
model.language_model.pretrained_model_name='bert-base-cased' \
model.train_ds.file_path=/work/users/jmcphaul/nemo/train_nemo_format.tsv \
model.validation_ds.file_path=/work/users/jmcphaul/nemo/dev_nemo_format.tsv \
trainer.num_nodes=1 \
trainer.max_epochs=20 \
trainer.precision=16 \
model.optim.name=adam \
model.optim.lr=1e-4
By following these steps using mamba, you should be able to resolve the import error and run your script successfully. If you encounter any further issues, please let me know!
nvcr.io/nvidia/nemo_bert_text_classification:20.07
nvcr.io/nvidia/nemo:23.08
enroot import docker://nvcr.io#nvidia/nemo:23.08
enroot import docker://nvcr.io#nvidia/nemo_bert_text_classification:20.07
https://southernmethodistuniversity.github.io/hpc_docs/accounts.html
https://southernmethodistuniversity.github.io/hpc_docs/index.html
https://southernmethodistuniversity.github.io/hpc_docs/access.html
https://southernmethodistuniversity.github.io/hpc_docs/portal.html
https://southernmethodistuniversity.github.io/hpc_docs/m3_migration.html
https://southernmethodistuniversity.github.io/hpc_docs/mp_update.html
https://southernmethodistuniversity.github.io/hpc_docs/examples/conda/README.html
https://southernmethodistuniversity.github.io/hpc_docs/examples/torch/README.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/unix/unix.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/unix/scripting.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/modules/modules.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/slurm/slurm.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/slurm/best_practices.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/containers/containers.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/cpp/intro.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/cpp/session_1.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/cpp/session_2.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/cpp/session_3.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/cpp/session_4.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/cpp/session_5.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/r/overview.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/r/environments.html
https://southernmethodistuniversity.github.io/hpc_docs/tutorials/r/slurm.html
https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/index.html
https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/04-Using%20JupterLab/index.html
https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/02-Working%20with%20Conda/index.html
https://carpentries-incubator.github.io/introduction-to-conda-for-data-scientists/aio/index.html#:~:text=Conda%20is%20a%20platform%20agnostic,and%20targets%20multiple%20programming%20languages.
https://vpn.smu.edu/+CSCOE+/portal.html
https://www.tensorflow.org/install
https://github.com/NVIDIA/NeMo/blob/main/tutorials/nlp/Text_Classification_Sentiment_Analysis.ipynb
https://carpentries.org/community-lessons/
https://www.sciencedirect.com/science/article/pii/S0165027020303848
https://nlp.stanford.edu/sentiment/index.html
https://gluebenchmark.com/tasks/
https://github.com/NVIDIA/NeMo/tree/main/scripts/checkpoint_converters
https://docs.nvidia.com/nemo-framework/user-guide/latest/sft_peft/index.html
https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/nlp/text_classification.html
https://www.google.com/search?q=raster-based+spatial+analysis&rlz=1C1ONGR_enUS1116US1116&oq=raster-based+spatial+analysis&gs_lcrp=EgZjaHJvbWUyBggAEEUYOTIICAEQABgWGB4yDQgCEAAYhgMYgAQYigUyDQgDEAAYhgMYgAQYigUyDQgEEAAYhgMYgAQYigUyCggFEAAYgAQYogQyCggGEAAYgAQYogQyCggHEAAYgAQYogQyCggIEAAYgAQYogTSAQgxMTU2ajBqN6gCALACAA&sourceid=chrome&ie=UTF-8
https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo
https://docs.rapids.ai/visualization
https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/08-Applications%20of%20Horovod%20for%20MultiGPUs/index.html
https://rapids.ai/
https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/06-RAPIDS/index.html
https://carpentries-incubator.github.io/introduction-to-conda-for-data-scientists/aio/index.html#:~:text=Conda%20is%20a%20platform%20agnostic,and%20targets%20multiple%20programming%20languages.
https://slurm.schedmd.com/
https://www.smu.edu/oit/services/qualtrics
https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/index.html
https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/12-Pipeline%20with%20pretrained%20Hugging%20Face/index.html
https://southernmethodistuniversity.github.io/SMU_SuperPOD_101/03-Using%20container/index.html
https://education.github.com/pack/offers
https://www.smu.edu/oit/training
https://catalog.ngc.nvidia.com/orgs/nvidia/teams/rapidsai/containers/rapidsai