set -eux

LLM_RECIPES_DIR=/code/llm-recipes
# tokens.sh presumably exports the access tokens the inference tool needs.
source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh

MAX_INPUT_TOKENS=503
BEAM_SIZE=20

# Distributed beam-search inference across 7 GPUs with the fine-tuned
# Llama 2 ja-zh model; num_return_sequences equals num_beams, so all
# ${BEAM_SIZE} beam hypotheses are requested for each input line.
python /code/llm-recipes/tools/hf_inference_distrubuted.py \
    --model /work/models/translation_finetuned_hf/llama2-ja-zh-continuous-pretrained-v0-dev-finetune-chunked-docs-cleaned-all-averaged-246-250 \
    -i /work/wmt2024_test/LLM/wmttest2024.src.sentence_splited.with_template.ja-zh.ja.jsonl \
    -o /work/translation/wmt2024_test/en-ja/llama2-beam \
    -g 0 1 2 3 4 5 6 \
    --attn_implementation sdpa \
    --dynamic_max_new_token_ratio 2.0 \
    --num_return_sequences ${BEAM_SIZE} \
    --num_beams ${BEAM_SIZE} \
    --max_input_tokens ${MAX_INPUT_TOKENS} \
    -b 503
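
# Optional post-run sanity check (a sketch only: it assumes the tool writes
# JSONL files under the -o directory; the actual output layout may differ).
# ls -lh /work/translation/wmt2024_test/en-ja/llama2-beam
# wc -l /work/translation/wmt2024_test/en-ja/llama2-beam/*.jsonl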