#!/bin/bash
#
# test_models_parallel_gnu.sh - Run multiple ATLAS models in parallel using GNU parallel
#
# This script processes multiple LLM models for the ATLAS H→γγ analysis.
# For each model, it runs tasks_per_model independent Snakemake workflows
# in parallel (default: 5).
#
# Usage:
#   ./test_models_parallel_gnu.sh [output_name] [max_concurrent_models] [tasks_per_model]
#
# Arguments:
#   output_name:           Name of the output directory (default: "test")
#   max_concurrent_models: Maximum number of models to run simultaneously (default: 5)
#   tasks_per_model:       Number of parallel tasks per model (default: 5)
#
# Examples:
#   ./test_models_parallel_gnu.sh                   # Basic usage
#   ./test_models_parallel_gnu.sh experiment1       # Custom output name
#   ./test_models_parallel_gnu.sh test 3 5          # 3 models at once, 5 tasks each
#   ./test_models_parallel_gnu.sh large_test 10 5   # 10 models, 5 tasks each
#
# Requirements:
#   - GNU parallel must be installed
#   - models.txt file with the list of models to test (see the example below)
#   - All workflow/*.smk files must be present
#   - Python environment with the required packages
#
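# A minimal models.txt might look like the following (the model identifiers
# here are hypothetical placeholders, one per line):
#
#   gpt-4o
#   claude-sonnet
#   meta-llama/Llama-3-70B
#
# Slashes in model names are allowed; they are replaced with underscores when
# naming the per-model output directory.
#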
# Features:
#   - Scales to 20-30 models with 10 jobs each (200-300 total jobs)
#   - Independent task execution: order doesn't matter
#   - Automatic resource management via GNU parallel
#   - Comprehensive error handling and logging
#   - Temporary workspace in /dev/shm for fast I/O
#
# Output Structure:
#   output_name/
#   ├── model1_timestamp1/
#   │   ├── generated_code/
#   │   ├── logs/
#   │   ├── plots/
#   │   ├── prompt_pairs/
#   │   ├── snakemake_log/
#   │   └── stats.csv
#   ├── model2_timestamp2/
#   └── ...
#
module load python
# Save the directory where the script was started
ORIG_DIR=$(pwd)
# Create a unique random folder in /dev/shm and remove it when the script exits
TMPDIR=$(mktemp -d /dev/shm/llm_run_temp_XXXXXX)
trap 'rm -rf "$TMPDIR"' EXIT
WORKDIR="$TMPDIR/llm_for_analysis"
# Load conda's shell functions so that `conda activate` works in a
# non-interactive script
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate llm_env
# Get the root of the current Git repository
SRC_DIR=$(git rev-parse --show-toplevel)
echo "Using Git repository root: $SRC_DIR"
# Copy files from the Git repo root, excluding .git, results/, .snakemake/, and test/
rsync -av \
    --exclude='.git' \
    --exclude='results/' \
    --exclude='.snakemake/' \
    --exclude='test/' \
    "$SRC_DIR/" \
    "$WORKDIR/"
chmod +x "$WORKDIR/test_stats_parallel.sh"
cd "$WORKDIR" || exit 1
mkdir -p results
MODEL_LIST="models.txt"
OUT_NAME="${1:-test}"   # Output directory name from the first argument (default "test")
MAX_JOBS="${2:-5}"      # Maximum concurrent models (default 5)
TASK_JOBS="${3:-5}"     # Parallel tasks per model (default 5)
echo "Configuration:"
echo "  Output directory: $OUT_NAME"
echo "  Max concurrent models: $MAX_JOBS"
echo "  Tasks per model: $TASK_JOBS"
echo "  Total potential jobs: $((MAX_JOBS * TASK_JOBS))"
echo ""
# Function to process a single model
process_model() {
    local model="$1"
    local out_name="$2"
    local src_dir="$3"
    local work_dir="$4"
    # Use a timestamp for unique run naming
    local timestamp
    timestamp=$(date +"%Y%m%d_%H%M%S")
    local MODEL_SAFE="${model//\//_}_$timestamp"
    export MODEL_NAME="$model"
    echo "Starting model [$timestamp]: $model"
    # Give each model a private working copy so that concurrent jobs do not
    # overwrite each other's config.yml or Snakemake metadata
    local job_dir="$work_dir/jobs/$MODEL_SAFE"
    mkdir -p "$job_dir"
    rsync -a --exclude='results/' --exclude='jobs/' "$work_dir/" "$job_dir/"
    cd "$job_dir" || return 1
    # Create the config file for this run
    cat > config.yml <<EOF
model: '$model'
name: '$MODEL_SAFE'
out_dir: '$work_dir/results/$MODEL_SAFE'
EOF
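    # For illustration: with a hypothetical model "org/model-x" started at
    # 2025-01-01 12:00:00, the rendered config.yml would read
    #   model: 'org/model-x'
    #   name: 'org_model-x_20250101_120000'
    #   out_dir: '<work_dir>/results/org_model-x_20250101_120000'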
    # Run the parallel test script
    if bash test_stats_parallel.sh; then
        echo "Model $model completed successfully"
        # Copy this model's results back to the output directory in the repo
        local DEST_DIR="$src_dir/$out_name"
        mkdir -p "$DEST_DIR"
        cp -r "$work_dir/results/$MODEL_SAFE" "$DEST_DIR/"
        mkdir -p "$DEST_DIR/$MODEL_SAFE/snakemake_log/"
        cp -r "$job_dir/.snakemake/log/"* "$DEST_DIR/$MODEL_SAFE/snakemake_log/" || true
        cp stats.csv "$DEST_DIR/$MODEL_SAFE/stats.csv" || true
        return 0
    else
        echo "ERROR: Model $model failed" >&2
        return 1
    fi
}
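# To smoke-test the function on a single model without GNU parallel, one could
# run (with a hypothetical model name):
#   process_model "org/model-x" "$OUT_NAME" "$SRC_DIR" "$WORKDIR"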
export -f process_model
# Export variables for the function
export SRC_DIR
export WORKDIR
# Read models into an array (one model name per line)
if [ ! -f "$MODEL_LIST" ]; then
    echo "ERROR: model list '$MODEL_LIST' not found" >&2
    exit 1
fi
mapfile -t MODELS < "$MODEL_LIST"
echo "Starting GNU parallel processing of ${#MODELS[@]} models..."
echo "Models to process: ${MODELS[*]}"
echo ""
# Use GNU parallel to process the models
#   --halt soon,fail=1: stop launching new jobs as soon as one job fails
#                       (already-running jobs are allowed to finish)
#   --jobs $MAX_JOBS:   limit the number of concurrent models
#   --line-buffer:      show output line by line as it arrives
parallel --no-notice --halt soon,fail=1 --jobs "$MAX_JOBS" --line-buffer \
    process_model {} "$OUT_NAME" "$SRC_DIR" "$WORKDIR" ::: "${MODELS[@]}"
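# A possible extension (not used here): pass --joblog to GNU parallel to record
# per-job exit status and runtime, which also enables re-running only the
# models that failed:
#   parallel --joblog "$OUT_NAME.joblog" --resume-failed ... ::: "${MODELS[@]}"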
# Check whether all models succeeded (a nonzero exit status from parallel
# means at least one model failed)
if [ $? -eq 0 ]; then
    echo ""
    echo "SUCCESS: All models processed successfully!"
    echo "Results saved to: $SRC_DIR/$OUT_NAME/"
else
    echo ""
    echo "ERROR: Some models failed to process" >&2
    exit 1
fi
# Return to the original directory
cd "$ORIG_DIR"