#!/bin/bash
#
# test_models_parallel_gnu.sh - Run multiple ATLAS models in parallel using GNU parallel
#
# This script processes multiple LLM models for ATLAS H→γγ analysis.
# For each model, it runs 5 independent Snakemake workflows in parallel.
#
# Usage:
# ./test_models_parallel_gnu.sh [output_name] [max_concurrent_models] [tasks_per_model]
#
# Arguments:
# output_name: Name of output directory (default: "test")
# max_concurrent_models: Maximum models to run simultaneously (default: 5)
# tasks_per_model: Number of parallel tasks per model (default: 5)
#
# Examples:
# ./test_models_parallel_gnu.sh # Basic usage
# ./test_models_parallel_gnu.sh experiment1 # Custom output name
# ./test_models_parallel_gnu.sh test 3 5 # 3 models at once, 5 tasks each
# ./test_models_parallel_gnu.sh large_test 10 5 # 10 models, 5 tasks each
#
# Requirements:
# - GNU parallel must be installed
# - models.txt file with list of models to test
# - All workflow/*.smk files must be present
# - Python environment with required packages
#
# Features:
# - Scales to 20-30 models with 10 jobs each (200-300 total jobs)
# - Independent task execution - order doesn't matter
# - Automatic resource management via GNU parallel
# - Comprehensive error handling and logging
# - Temporary workspace in /dev/shm for fast I/O
#
# Output Structure:
# output_name/
# β”œβ”€β”€ model1_timestamp1/
# β”‚ β”œβ”€β”€ generated_code/
# β”‚ β”œβ”€β”€ logs/
# β”‚ β”œβ”€β”€ plots/
# β”‚ β”œβ”€β”€ prompt_pairs/
# β”‚ β”œβ”€β”€ snakemake_log/
# β”‚ └── stats.csv
# └── model2_timestamp2/
# └── ...
#
module load python
# Remember where the script was launched so we can return there at the end.
ORIG_DIR=$(pwd)

# Create a unique scratch tree in /dev/shm (RAM-backed) for fast I/O.
TMPDIR=$(mktemp -d /dev/shm/llm_run_temp_XXXXXX) || { echo "ERROR: mktemp failed" >&2; exit 1; }
WORKDIR="$TMPDIR/llm_for_analysis"
# /dev/shm is RAM: remove the scratch tree on any exit so repeated runs do
# not leak memory. Results are archived back to $SRC_DIR before exit.
trap 'rm -rf -- "$TMPDIR"' EXIT

conda activate llm_env

# Get the root of the current Git repository. Abort if we are not inside a
# checkout: an empty SRC_DIR would make rsync copy from "/" below.
SRC_DIR=$(git rev-parse --show-toplevel) || { echo "ERROR: not inside a Git repository" >&2; exit 1; }
echo "Using Git repository root: $SRC_DIR"

# Stage a clean copy of the repo, excluding VCS metadata and prior outputs.
rsync -av \
--exclude='.git' \
--exclude='results/' \
--exclude='.snakemake/' \
--exclude='test/' \
"$SRC_DIR/" \
"$WORKDIR/"
chmod +x "$WORKDIR/test_stats_parallel.sh"
# All subsequent relative paths (config.yml, results/, stats.csv) assume the
# scratch copy is the cwd, so a failed cd must be fatal.
cd "$WORKDIR" || { echo "ERROR: cannot cd to $WORKDIR" >&2; exit 1; }
mkdir -p results

MODEL_LIST="models.txt"
OUT_NAME="${1:-test}" # Take from first argument, default to "test"
MAX_JOBS="${2:-5}" # Maximum concurrent models (default 5)
TASK_JOBS="${3:-5}" # Jobs per model task (default 5)
# NOTE(review): TASK_JOBS is only echoed below and never exported or passed
# on — presumably test_stats_parallel.sh is meant to read it; confirm and
# export it if so.
echo "Configuration:"
echo " Output directory: $OUT_NAME"
echo " Max concurrent models: $MAX_JOBS"
echo " Tasks per model: $TASK_JOBS"
echo " Total potential jobs: $((MAX_JOBS * TASK_JOBS))"
echo ""
# Function to process a single model.
# Runs test_stats_parallel.sh once for $1 and, on success, archives the
# run's results, Snakemake logs and stats.csv under $3/$2/. Invoked by GNU
# parallel in a separate worker shell, with the cwd expected to be $WORKDIR.
#
# Arguments:
#   $1 - model name (slashes are folded to '_' for the run directory name)
#   $2 - output directory name, created under the Git repo root
#   $3 - Git repo root (destination for archived results)
#   $4 - scratch working directory holding the staged repo copy
# Returns: 0 on success, 1 if test_stats_parallel.sh fails.
#
# NOTE(review): every concurrent worker shares the same cwd and writes the
# same config.yml (and later reads the same stats.csv), so models started
# close together can clobber each other's config — confirm that
# test_stats_parallel.sh reads config.yml before the next worker rewrites
# it, or switch to per-model config files.
process_model() {
local model=$1
local out_name=$2
local src_dir=$3
local work_dir=$4
# Use timestamp for unique run naming
# (1-second resolution: two runs of the same model in the same second would
# collide — presumably rare enough in practice.)
local timestamp=$(date +"%Y%m%d_%H%M%S")
local MODEL_SAFE="${model//\//_}_$timestamp"
export MODEL_NAME="$model"
echo "Starting model [$timestamp]: $model"
# Create config file for this run
cat > config.yml <<EOF
model: '$model'
name: '$MODEL_SAFE'
out_dir: '$work_dir/results/$MODEL_SAFE'
EOF
# Run the parallel test script
if bash test_stats_parallel.sh; then
echo "Model $model completed successfully"
# Save results for this model
local DEST_DIR="$src_dir/$out_name/"
mkdir -p "$DEST_DIR"
cp -r "$work_dir/results/$MODEL_SAFE" "$DEST_DIR/"
mkdir -p "$DEST_DIR/$MODEL_SAFE/snakemake_log/"
# Snakemake logs and stats.csv may be absent if the workflow never started;
# best-effort copy, failures are deliberately ignored.
cp -r "$work_dir/.snakemake/log/"* "$DEST_DIR/$MODEL_SAFE/snakemake_log/" || true
cp stats.csv "$DEST_DIR/$MODEL_SAFE/stats.csv" || true
return 0
else
echo "ERROR: Model $model failed"
return 1
fi
}
# Export the function and the globals it (and its workers) need: GNU
# parallel runs each job in a fresh bash, which only sees exported names.
export -f process_model
export SRC_DIR
export WORKDIR

# Read models into an array. Without this check a missing models.txt would
# leave MODELS empty and the run would "succeed" while doing nothing.
if [ ! -f "$MODEL_LIST" ]; then
echo "ERROR: model list '$MODEL_LIST' not found in $WORKDIR" >&2
exit 1
fi
mapfile -t MODELS < "$MODEL_LIST"
if [ "${#MODELS[@]}" -eq 0 ]; then
echo "ERROR: model list '$MODEL_LIST' is empty" >&2
exit 1
fi

echo "Starting GNU parallel processing of ${#MODELS[@]} models..."
echo "Models to process: ${MODELS[*]}"
echo ""

# Use GNU parallel to process models.
# --halt soon,fail=1: stop launching new jobs as soon as one fails
# --jobs $MAX_JOBS:   limit concurrently running models
# --line-buffer:      interleave job output line-by-line as it arrives
# Branch directly on parallel's exit status; checking $? on a separate line
# is fragile (any command inserted in between would clobber it).
if parallel --no-notice --halt soon,fail=1 --jobs "$MAX_JOBS" --line-buffer \
process_model {} "$OUT_NAME" "$SRC_DIR" "$WORKDIR" ::: "${MODELS[@]}"; then
echo ""
echo "SUCCESS: All models processed successfully!"
echo "Results saved to: $SRC_DIR/$OUT_NAME/"
else
echo ""
echo "ERROR: Some models failed to process"
exit 1
fi

# Return to the original directory
cd "$ORIG_DIR"