#!/bin/bash
#
# test_models_parallel_gnu.sh - Run multiple ATLAS models in parallel using GNU parallel
#
# This script processes multiple LLM models for ATLAS H→γγ analysis.
# For each model, it runs 5 independent Snakemake workflows in parallel.
#
# Usage:
# ./test_models_parallel_gnu.sh [output_name] [max_concurrent_models] [tasks_per_model]
#
# Arguments:
# output_name: Name of output directory (default: "test")
# max_concurrent_models: Maximum models to run simultaneously (default: 5)
# tasks_per_model: Number of parallel tasks per model (default: 5)
#
# Examples:
# ./test_models_parallel_gnu.sh # Basic usage
# ./test_models_parallel_gnu.sh experiment1 # Custom output name
# ./test_models_parallel_gnu.sh test 3 5 # 3 models at once, 5 tasks each
# ./test_models_parallel_gnu.sh large_test 10 5 # 10 models, 5 tasks each
#
# Requirements:
# - GNU parallel must be installed
# - models.txt file with list of models to test
# - All workflow/*.smk files must be present
# - Python environment with required packages
#
# Features:
# - Scales to 20-30 models with 10 jobs each (200-300 total jobs)
# - Independent task execution - order doesn't matter
# - Automatic resource management via GNU parallel
# - Comprehensive error handling and logging
# - Temporary workspace in /dev/shm for fast I/O
#
# Output Structure:
# output_name/
# β”œβ”€β”€ model1_timestamp1/
# β”‚ β”œβ”€β”€ generated_code/
# β”‚ β”œβ”€β”€ logs/
# β”‚ β”œβ”€β”€ plots/
# β”‚ β”œβ”€β”€ prompt_pairs/
# β”‚ β”œβ”€β”€ snakemake_log/
# β”‚ └── stats.csv
# └── model2_timestamp2/
# └── ...
#
module load python
# Remember where the script was launched so we can return there at the end.
ORIG_DIR=$(pwd)

# Create a unique scratch tree in /dev/shm (RAM-backed) for fast I/O.
TMPDIR=$(mktemp -d /dev/shm/llm_run_temp_XXXXXX) || { echo "ERROR: mktemp failed" >&2; exit 1; }
WORKDIR="$TMPDIR/llm_for_analysis"
# /dev/shm is RAM: remove the scratch tree on any exit so repeated runs do
# not leak memory. Results are archived back to $SRC_DIR before exit.
trap 'rm -rf -- "$TMPDIR"' EXIT

conda activate llm_env

# Get the root of the current Git repository. Abort if we are not inside a
# checkout: an empty SRC_DIR would make rsync copy from "/" below.
SRC_DIR=$(git rev-parse --show-toplevel) || { echo "ERROR: not inside a Git repository" >&2; exit 1; }
echo "Using Git repository root: $SRC_DIR"

# Stage a clean copy of the repo, excluding VCS metadata and prior outputs.
rsync -av \
--exclude='.git' \
--exclude='results/' \
--exclude='.snakemake/' \
--exclude='test/' \
"$SRC_DIR/" \
"$WORKDIR/"
chmod +x "$WORKDIR/test_stats_parallel.sh"
# All subsequent relative paths (config.yml, results/, stats.csv) assume the
# scratch copy is the cwd, so a failed cd must be fatal.
cd "$WORKDIR" || { echo "ERROR: cannot cd to $WORKDIR" >&2; exit 1; }
mkdir -p results

MODEL_LIST="models.txt"
OUT_NAME="${1:-test}" # Take from first argument, default to "test"
MAX_JOBS="${2:-5}" # Maximum concurrent models (default 5)
TASK_JOBS="${3:-5}" # Jobs per model task (default 5)
# NOTE(review): TASK_JOBS is only echoed below and never exported or passed
# on — presumably test_stats_parallel.sh is meant to read it; confirm and
# export it if so.
echo "Configuration:"
echo " Output directory: $OUT_NAME"
echo " Max concurrent models: $MAX_JOBS"
echo " Tasks per model: $TASK_JOBS"
echo " Total potential jobs: $((MAX_JOBS * TASK_JOBS))"
echo ""
# Function to process a single model.
# Runs test_stats_parallel.sh once for $1 and, on success, archives the
# run's results, Snakemake logs and stats.csv under $3/$2/. Invoked by GNU
# parallel in a separate worker shell, with the cwd expected to be $WORKDIR.
#
# Arguments:
#   $1 - model name (slashes are folded to '_' for the run directory name)
#   $2 - output directory name, created under the Git repo root
#   $3 - Git repo root (destination for archived results)
#   $4 - scratch working directory holding the staged repo copy
# Returns: 0 on success, 1 if test_stats_parallel.sh fails.
#
# NOTE(review): every concurrent worker shares the same cwd and writes the
# same config.yml (and later reads the same stats.csv), so models started
# close together can clobber each other's config — confirm that
# test_stats_parallel.sh reads config.yml before the next worker rewrites
# it, or switch to per-model config files.
process_model() {
local model=$1
local out_name=$2
local src_dir=$3
local work_dir=$4
# Use timestamp for unique run naming
# (1-second resolution: two runs of the same model in the same second would
# collide — presumably rare enough in practice.)
local timestamp=$(date +"%Y%m%d_%H%M%S")
local MODEL_SAFE="${model//\//_}_$timestamp"
export MODEL_NAME="$model"
echo "Starting model [$timestamp]: $model"
# Create config file for this run
cat > config.yml <<EOF
model: '$model'
name: '$MODEL_SAFE'
out_dir: '$work_dir/results/$MODEL_SAFE'
EOF
# Run the parallel test script
if bash test_stats_parallel.sh; then
echo "Model $model completed successfully"
# Save results for this model
local DEST_DIR="$src_dir/$out_name/"
mkdir -p "$DEST_DIR"
cp -r "$work_dir/results/$MODEL_SAFE" "$DEST_DIR/"
mkdir -p "$DEST_DIR/$MODEL_SAFE/snakemake_log/"
# Snakemake logs and stats.csv may be absent if the workflow never started;
# best-effort copy, failures are deliberately ignored.
cp -r "$work_dir/.snakemake/log/"* "$DEST_DIR/$MODEL_SAFE/snakemake_log/" || true
cp stats.csv "$DEST_DIR/$MODEL_SAFE/stats.csv" || true
return 0
else
echo "ERROR: Model $model failed"
return 1
fi
}
# Export the function and the globals it (and its workers) need: GNU
# parallel runs each job in a fresh bash, which only sees exported names.
export -f process_model
export SRC_DIR
export WORKDIR

# Read models into an array. Without this check a missing models.txt would
# leave MODELS empty and the run would "succeed" while doing nothing.
if [ ! -f "$MODEL_LIST" ]; then
echo "ERROR: model list '$MODEL_LIST' not found in $WORKDIR" >&2
exit 1
fi
mapfile -t MODELS < "$MODEL_LIST"
if [ "${#MODELS[@]}" -eq 0 ]; then
echo "ERROR: model list '$MODEL_LIST' is empty" >&2
exit 1
fi

echo "Starting GNU parallel processing of ${#MODELS[@]} models..."
echo "Models to process: ${MODELS[*]}"
echo ""

# Use GNU parallel to process models.
# --halt soon,fail=1: stop launching new jobs as soon as one fails
# --jobs $MAX_JOBS:   limit concurrently running models
# --line-buffer:      interleave job output line-by-line as it arrives
# Branch directly on parallel's exit status; checking $? on a separate line
# is fragile (any command inserted in between would clobber it).
if parallel --no-notice --halt soon,fail=1 --jobs "$MAX_JOBS" --line-buffer \
process_model {} "$OUT_NAME" "$SRC_DIR" "$WORKDIR" ::: "${MODELS[@]}"; then
echo ""
echo "SUCCESS: All models processed successfully!"
echo "Results saved to: $SRC_DIR/$OUT_NAME/"
else
echo ""
echo "ERROR: Some models failed to process"
exit 1
fi

# Return to the original directory
cd "$ORIG_DIR"