#!/usr/bin/env bash
# Launch a vLLM OpenAI-compatible API server on 127.0.0.1:8422.
# Exactly one configuration is active at a time; the alternatives below are
# kept commented out for quick switching between models.
set -euo pipefail

# Active: Qwen/QwQ-32B on GPUs 0 and 1, 2-way tensor parallelism,
# 8192-token context window.
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
  --host 127.0.0.1 --port 8422 \
  --model Qwen/QwQ-32B \
  --tensor-parallel-size 2 \
  --max-model-len 8192
# NOTE: the last flag deliberately has no trailing '\' — a stray
# continuation there would join the following comment line into the command.

# Alternative: Qwen3-8B with reasoning-content parsing, single GPU.
# CUDA_VISIBLE_DEVICES=7 python -m vllm.entrypoints.openai.api_server \
#   --host 127.0.0.1 --port 8422 \
#   --model Qwen/Qwen3-8B \
#   --enable-reasoning --reasoning-parser deepseek_r1 \
#   --tensor-parallel-size 1 \
#   --max-model-len 2048

# Alternative: Llama-3-8B-Instruct, single GPU.
# NOTE(review): '--enable-reasoning-parser' does not appear to be a valid
# vLLM flag; the form used above is '--enable-reasoning --reasoning-parser
# <name>' — confirm against the vLLM CLI reference before enabling.
# CUDA_VISIBLE_DEVICES=7 python -m vllm.entrypoints.openai.api_server \
#   --host 127.0.0.1 --port 8422 \
#   --model meta-llama/Meta-Llama-3-8B-Instruct \
#   --enable-reasoning-parser \
#   --tensor-parallel-size 1 \
#   --max-model-len 1024

# Alternative: same QwQ-32B configuration on GPUs 6 and 7 instead.
# CUDA_VISIBLE_DEVICES=6,7 python -m vllm.entrypoints.openai.api_server \
#   --host 127.0.0.1 --port 8422 \
#   --model Qwen/QwQ-32B \
#   --tensor-parallel-size 2 \
#   --max-model-len 8192