Serving an LLM with vLLM on Google Colab
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "8ukoKIR0yGmX"
},
"source": [
"# Running a vLLM server on Google Colab"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "Ddt3zpjB_JS5"
},
"outputs": [],
"source": [
"import sys\n",
"\n",
"if \"google.colab\" not in sys.modules:\n",
"    raise ValueError(\"This notebook is designed to run in Google Colab\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "NVEqDqF9_JS6"
},
"source": [
"## Steps"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "SCaobxnMyCxj"
},
"source": [
"### 1. Change the runtime to a GPU\n",
"\n",
"This notebook was tested with a T4 GPU\n",
"\n",
"### 2. Install requirements\n",
"\n",
"The notebook was tested with `vllm==0.8.4 pyngrok==7.2.5 litellm==1.67.2`"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pxqqQtmzwrNP",
"outputId": "25d2be09-8afa-4475-e2a6-b26f755f3830"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.6/7.6 MB 53.1 MB/s eta 0:00:00\n"
]
}
],
"source": [
"!python -m pip install --quiet vllm pyngrok litellm"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "KVY3Kj1nyvd8"
},
"source": [
"### 3. Run the vLLM server"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "MTAspVV78UaE"
},
"outputs": [],
"source": [
"model = \"Qwen/Qwen2.5-1.5B-Instruct\"\n",
"port = 8001"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "dq8qQyRW_JS7"
},
"source": [
"- We use `nohup` so the server keeps running in the background and we can move on to the next cell in Colab\n",
"\n",
"- `tail -f` lets us watch the server messages as it loads. We continuously read the log file until the server is ready, i.e., until we see _Application startup complete_ (a polling alternative is sketched below)\n",
"\n",
"- `bfloat16` is not supported by the T4, so we explicitly set the data type to `float16` (aka `half`) when running the `vllm serve` command\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "cMrJqS0vHtGJ",
"outputId": "62518a9f-fa67-4ae3-ceda-b520af66ef3f"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"nohup: redirecting stderr to stdout\n"
]
}
],
"source": [
"!nohup vllm serve {model} --port {port} --dtype half 2>&1 > vllm_output.log &"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Y28Gyl3jD1GL",
"outputId": "a065269a-c404-4875-b2fc-de2e522e0547"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 04-26 12:12:21 [__init__.py:239] Automatically detected platform cuda.\n",
"2025-04-26 12:12:23.275719: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
"E0000 00:00:1745669543.588236 1790 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"E0000 00:00:1745669543.670665 1790 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2025-04-26 12:12:24.277597: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"INFO 04-26 12:12:31 [api_server.py:1034] vLLM API server version 0.8.4\n",
"INFO 04-26 12:12:31 [api_server.py:1035] args: Namespace(subparser='serve', model_tag='Qwen/Qwen2.5-1.5B-Instruct', config='', host=None, port=8001, uvicorn_log_level='info', disable_uvicorn_access_log=False, allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, lora_modules=None, prompt_adapters=None, chat_template=None, chat_template_content_format='auto', response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, enable_ssl_refresh=False, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=False, disable_frontend_multiprocessing=False, enable_request_id_headers=False, enable_auto_tool_choice=False, tool_call_parser=None, tool_parser_plugin='', model='Qwen/Qwen2.5-1.5B-Instruct', task='auto', tokenizer=None, hf_config_path=None, skip_tokenizer_init=False, revision=None, code_revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=False, allowed_local_media_path=None, load_format='auto', download_dir=None, model_loader_extra_config=None, use_tqdm_on_load=True, config_format=<ConfigFormat.AUTO: 'auto'>, dtype='half', kv_cache_dtype='auto', max_model_len=None, guided_decoding_backend='auto', logits_processor_pattern=None, model_impl='auto', distributed_executor_backend=None, pipeline_parallel_size=1, tensor_parallel_size=1, data_parallel_size=1, enable_expert_parallel=False, max_parallel_loading_workers=None, ray_workers_use_nsight=False, disable_custom_all_reduce=False, block_size=None, enable_prefix_caching=None, prefix_caching_hash_algo='builtin', disable_sliding_window=False, use_v2_block_manager=True, num_lookahead_slots=0, seed=None, swap_space=4, cpu_offload_gb=0, gpu_memory_utilization=0.9, num_gpu_blocks_override=None, max_num_batched_tokens=None, max_num_partial_prefills=1, max_long_partial_prefills=1, long_prefill_token_threshold=0, max_num_seqs=None, max_logprobs=20, disable_log_stats=False, quantization=None, rope_scaling=None, rope_theta=None, hf_token=None, hf_overrides=None, enforce_eager=False, max_seq_len_to_capture=8192, tokenizer_pool_size=0, tokenizer_pool_type='ray', tokenizer_pool_extra_config=None, limit_mm_per_prompt=None, mm_processor_kwargs=None, disable_mm_preprocessor_cache=False, enable_lora=False, enable_lora_bias=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', long_lora_scaling_factors=None, max_cpu_loras=None, fully_sharded_loras=False, enable_prompt_adapter=False, max_prompt_adapters=1, max_prompt_adapter_token=0, device='auto', num_scheduler_steps=1, multi_step_stream_outputs=True, scheduler_delay_factor=0.0, enable_chunked_prefill=None, speculative_config=None, ignore_patterns=[], preemption_mode=None, served_model_name=None, qlora_adapter_name_or_path=None, show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, disable_async_output_proc=False, scheduling_policy='fcfs', scheduler_cls='vllm.core.scheduler.Scheduler', override_neuron_config=None, override_pooler_config=None, compilation_config=None, kv_transfer_config=None, worker_cls='auto', worker_extension_cls='', generation_config='auto', override_generation_config=None, enable_sleep_mode=False, calculate_kv_scales=False, additional_config=None, enable_reasoning=False, reasoning_parser=None, disable_cascade_attn=False, disable_chunked_mm_input=False, disable_log_requests=False, max_log_len=None, disable_fastapi_docs=False, enable_prompt_tokens_details=False, enable_server_load_tracking=False, dispatch_function=<function ServeSubcommand.cmd at 0x7cb07f1b32e0>)\n",
"WARNING 04-26 12:12:31 [config.py:2836] Casting torch.bfloat16 to torch.float16.\n",
"INFO 04-26 12:12:47 [config.py:689] This model supports multiple tasks: {'embed', 'reward', 'score', 'classify', 'generate'}. Defaulting to 'generate'.\n",
"WARNING 04-26 12:12:47 [arg_utils.py:1731] Compute Capability < 8.0 is not supported by the V1 Engine. Falling back to V0. \n",
"INFO 04-26 12:12:47 [api_server.py:246] Started engine process with PID 2007\n",
"INFO 04-26 12:12:52 [__init__.py:239] Automatically detected platform cuda.\n",
"2025-04-26 12:12:53.219088: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
"E0000 00:00:1745669573.239280 2007 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"E0000 00:00:1745669573.245609 2007 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"INFO 04-26 12:12:57 [llm_engine.py:243] Initializing a V0 LLM engine (v0.8.4) with config: model='Qwen/Qwen2.5-1.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=Qwen/Qwen2.5-1.5B-Instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={\"splitting_ops\":[],\"compile_sizes\":[],\"cudagraph_capture_sizes\":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],\"max_capture_size\":256}, use_cached_outputs=True, \n",
"INFO 04-26 12:12:59 [cuda.py:240] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.\n",
"INFO 04-26 12:12:59 [cuda.py:289] Using XFormers backend.\n",
"INFO 04-26 12:13:01 [parallel_state.py:959] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0\n",
"INFO 04-26 12:13:01 [model_runner.py:1110] Starting to load model Qwen/Qwen2.5-1.5B-Instruct...\n",
"INFO 04-26 12:13:01 [weight_utils.py:265] Using model weights format ['*.safetensors']\n",
"INFO 04-26 12:13:28 [weight_utils.py:281] Time spent downloading weights for Qwen/Qwen2.5-1.5B-Instruct: 26.430684 seconds\n",
"INFO 04-26 12:13:28 [weight_utils.py:315] No model.safetensors.index.json found in remote.\n",
"Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]\n",
"Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:05<00:00, 5.12s/it]\n",
"Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:05<00:00, 5.12s/it]\n",
"\n",
"INFO 04-26 12:13:33 [loader.py:458] Loading weights took 5.20 seconds\n",
"INFO 04-26 12:13:33 [model_runner.py:1146] Model loading took 2.8876 GiB and 32.125980 seconds\n",
"INFO 04-26 12:13:38 [worker.py:267] Memory profiling takes 4.83 seconds\n",
"INFO 04-26 12:13:38 [worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.90) = 13.27GiB\n",
"INFO 04-26 12:13:38 [worker.py:267] model weights take 2.89GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 2.02GiB; the rest of the memory reserved for KV Cache is 8.31GiB.\n",
"INFO 04-26 12:13:39 [executor_base.py:112] # cuda blocks: 19446, # CPU blocks: 9362\n",
"INFO 04-26 12:13:39 [executor_base.py:117] Maximum concurrency for 32768 tokens per request: 9.50x\n",
"INFO 04-26 12:13:44 [model_runner.py:1456] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n",
"Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:30<00:00, 1.15it/s]\n",
"INFO 04-26 12:14:14 [model_runner.py:1598] Graph capturing finished in 30 secs, took 0.19 GiB\n",
"INFO 04-26 12:14:14 [llm_engine.py:449] init engine (profile, create kv cache, warmup model) took 40.71 seconds\n",
"WARNING 04-26 12:14:14 [config.py:1177] Default sampling parameters have been overridden by the model's Hugging Face generation config recommended from the model creator. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`.\n",
"INFO 04-26 12:14:14 [serving_chat.py:118] Using default chat sampling params from model: {'repetition_penalty': 1.1, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}\n",
"INFO 04-26 12:14:14 [serving_completion.py:61] Using default completion sampling params from model: {'repetition_penalty': 1.1, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}\n",
"INFO 04-26 12:14:14 [api_server.py:1081] Starting vLLM API server on http://0.0.0.0:8001\n",
"INFO 04-26 12:14:14 [launcher.py:26] Available routes are:\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /openapi.json, Methods: HEAD, GET\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /docs, Methods: HEAD, GET\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /docs/oauth2-redirect, Methods: HEAD, GET\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /redoc, Methods: HEAD, GET\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /health, Methods: GET\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /load, Methods: GET\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /ping, Methods: GET, POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /tokenize, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /detokenize, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /v1/models, Methods: GET\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /version, Methods: GET\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /v1/chat/completions, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /v1/completions, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /v1/embeddings, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /pooling, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /score, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /v1/score, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /v1/audio/transcriptions, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /rerank, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /v1/rerank, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /v2/rerank, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /invocations, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /metrics, Methods: GET\n",
"INFO: Started server process [1790]\n",
"INFO: Waiting for application startup.\n",
"INFO: Application startup complete.\n"
]
}
],
"source": [
"!tail -f vllm_output.log | sed '/Application startup complete/ q'"
]
},
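{
"cell_type": "markdown",
"metadata": {},
"source": [
"Instead of tailing the log, you can also poll the server's `/health` endpoint (listed among the routes above) until it responds. The cell below is a minimal sketch using `requests`; the retry count and sleep interval are arbitrary choices."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"import requests\n",
"\n",
"# Poll the local /health endpoint until the server answers (or we give up).\n",
"# 60 retries x 5 seconds is an arbitrary budget; adjust as needed.\n",
"for _ in range(60):\n",
"    try:\n",
"        if requests.get(f\"http://localhost:{port}/health\", timeout=2).status_code == 200:\n",
"            print(\"vLLM server is ready\")\n",
"            break\n",
"    except requests.exceptions.RequestException:\n",
"        pass\n",
"    time.sleep(5)\n",
"else:\n",
"    print(\"Server did not become ready in time\")"
]
},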
{
"cell_type": "markdown",
"metadata": {
"id": "npYjVvsPxaqW"
},
"source": [
"### 4. Expose the Colab service using ngrok\n",
"\n",
"- To use ngrok, you need to create an account and get a token (https://ngrok.com/)\n",
"- Put the token in the Colab secrets as `NGROK_TOKEN` and grant the notebook access when prompted\n",
"\n",
"The output of this cell is a public ngrok URL"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_Z8Bb2GNxV4O",
"outputId": "30159dde-a8ef-40b1-a2a7-fae2dc1d0a99"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ngrok tunnel URL: https://2d3d-35-196-44-108.ngrok-free.app\n"
]
}
],
"source": [
"from google.colab import userdata\n",
"from pyngrok import ngrok\n",
"\n",
"# Authenticate\n",
"ngrok.set_auth_token(userdata.get(\"NGROK_TOKEN\"))\n",
"\n",
"# Terminate any existing ngrok tunnels\n",
"ngrok.kill()\n",
"\n",
"# Open a tunnel to the port where vLLM is running (vLLM's default is 8000; here we use 8001)\n",
"tunnel = ngrok.connect(port)\n",
"\n",
"# Print the public URL provided by ngrok\n",
"print(f\"Ngrok tunnel URL: {tunnel.public_url}\")"
]
},
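{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, sanity-check the tunnel before wiring up a client. The sketch below calls the OpenAI-compatible `/v1/models` route (also listed in the server log above) through the public URL."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"\n",
"# A 200 response listing the served model confirms the tunnel works end to end\n",
"resp = requests.get(f\"{tunnel.public_url}/v1/models\", timeout=10)\n",
"print(resp.status_code)\n",
"print(resp.json())"
]
},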
{
"cell_type": "markdown",
"metadata": {
"id": "G77uxsnwCgh1"
},
"source": [
"### 5. Test the URL via LiteLLM\n",
"\n",
"Here's an example of calling the deployed endpoint via litellm.\n",
"The same code also works outside Colab.\n"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "QUfxmEEMDBtA",
"outputId": "2945e5bc-7bcf-48fa-e321-93a3013da3e3"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tomatoes are classified as vegetables, not fruits. They belong to the botanical family Solanaceae and can be found in the kitchen primarily due to\n"
]
}
],
"source": [
"import litellm\n",
"\n",
"model_url = f\"{tunnel.public_url}/v1\"\n",
"\n",
"response = litellm.completion(\n",
"    model=f\"hosted_vllm/{model}\",  # the hosted_vllm/ prefix routes the request to an OpenAI-compatible vLLM server\n",
"    messages=[\n",
"        {\"role\": \"system\", \"content\": \"You are a helpful AI assistant.\"},\n",
"        {\"role\": \"user\", \"content\": \"Is tomato a vegetable or a fruit?\"},\n",
"    ],\n",
"    api_base=model_url,\n",
"    # api_key=bearer_token,  # use if your server requires a token\n",
"    temperature=1,\n",
"    top_p=0.95,\n",
"    max_tokens=30,\n",
")\n",
"\n",
"print(response.choices[0].message.content)"
]
},
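{
"cell_type": "markdown",
"metadata": {},
"source": [
"LiteLLM can also stream tokens from the same endpoint. A minimal sketch, assuming the server and tunnel from the cells above are still running:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Stream the response chunk by chunk instead of waiting for the full completion\n",
"stream = litellm.completion(\n",
"    model=f\"hosted_vllm/{model}\",\n",
"    messages=[{\"role\": \"user\", \"content\": \"Is tomato a vegetable or a fruit?\"}],\n",
"    api_base=model_url,\n",
"    max_tokens=30,\n",
"    stream=True,\n",
")\n",
"\n",
"for chunk in stream:\n",
"    delta = chunk.choices[0].delta.content\n",
"    if delta:\n",
"        print(delta, end=\"\")"
]
},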
{
"cell_type": "markdown",
"metadata": {
"id": "nO_yOAyDB-An"
},
"source": [
"### 6. Cleaning up"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IdCEYjrdCAyM"
},
"source": [
"Find the vllm & ngrok processes and use `kill` to terminate them.\n",
"\n",
"Once they are terminated, the ngrok URL above will no longer be accessible"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nJM0M7AhaFYB"
},
"outputs": [],
"source": [
"!ps aux | grep -- 'vllm\\|ngrok'"
]
},
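{
"cell_type": "markdown",
"metadata": {},
"source": [
"For example, kill them by PID (the run above used 1790 for the API server and 2007 for the engine; yours will differ):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Replace the PIDs below with the ones from your own `ps` output\n",
"# !kill 1790 2007"
]
},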
{
"cell_type": "markdown",
"metadata": {
"id": "V08KabQTCIe9"
},
"source": [
"Alternatively, kill the processes by name"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"id": "tc9hZssX64Rf"
},
"outputs": [],
"source": [
"#!killall vllm\n",
"#!killall ngrok"
]
}
],
| "metadata": { | |
| "accelerator": "GPU", | |
| "colab": { | |
| "gpuType": "T4", | |
| "provenance": [] | |
| }, | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |