Serving an LLM with vLLM on Google Colab
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "8ukoKIR0yGmX"
},
"source": [
"# Running a vLLM server on Google Colab"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "Ddt3zpjB_JS5"
},
"outputs": [],
"source": [
"import sys\n",
"\n",
"if \"google.colab\" not in sys.modules:\n",
"    raise ValueError(\"This notebook is designed to run in Google Colab\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "NVEqDqF9_JS6"
},
"source": [
"## Steps"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "SCaobxnMyCxj"
},
"source": [
"### 1. Change the runtime to a GPU\n",
"\n",
"This notebook was tested with a T4 GPU\n",
"\n",
"### 2. Install requirements\n",
"\n",
"The notebook was tested with `vllm==0.8.4 pyngrok==7.2.5 litellm==1.67.2`"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pxqqQtmzwrNP",
"outputId": "25d2be09-8afa-4475-e2a6-b26f755f3830"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.6/7.6 MB 53.1 MB/s eta 0:00:00\n"
]
}
],
"source": [
"!python -m pip install --quiet vllm pyngrok litellm"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "KVY3Kj1nyvd8"
},
"source": [
"### 3. Run the vLLM server"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "MTAspVV78UaE"
},
"outputs": [],
"source": [
"model = \"Qwen/Qwen2.5-1.5B-Instruct\"\n",
"port = 8001"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "dq8qQyRW_JS7"
},
"source": [
"- We use `nohup` so the server keeps running in the background and we can move on to the next cell in Colab\n",
"\n",
"- `tail -f` lets us watch the server messages as it loads. We continuously read the log file until the server is ready, i.e., until we see _Application startup complete_ (a polling alternative is sketched below)\n",
"\n",
"- `bfloat16` is not supported by the T4, so we explicitly set the data type to `float16` (aka `half`) when running the `vllm serve` command\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "cMrJqS0vHtGJ",
"outputId": "62518a9f-fa67-4ae3-ceda-b520af66ef3f"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"nohup: redirecting stderr to stdout\n"
]
}
],
"source": [
"!nohup vllm serve {model} --port {port} --dtype half 2>&1 > vllm_output.log &"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Y28Gyl3jD1GL",
"outputId": "a065269a-c404-4875-b2fc-de2e522e0547"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 04-26 12:12:21 [__init__.py:239] Automatically detected platform cuda.\n",
"2025-04-26 12:12:23.275719: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
"E0000 00:00:1745669543.588236 1790 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"E0000 00:00:1745669543.670665 1790 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2025-04-26 12:12:24.277597: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"INFO 04-26 12:12:31 [api_server.py:1034] vLLM API server version 0.8.4\n",
"INFO 04-26 12:12:31 [api_server.py:1035] args: Namespace(subparser='serve', model_tag='Qwen/Qwen2.5-1.5B-Instruct', config='', host=None, port=8001, uvicorn_log_level='info', disable_uvicorn_access_log=False, allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, lora_modules=None, prompt_adapters=None, chat_template=None, chat_template_content_format='auto', response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, enable_ssl_refresh=False, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=False, disable_frontend_multiprocessing=False, enable_request_id_headers=False, enable_auto_tool_choice=False, tool_call_parser=None, tool_parser_plugin='', model='Qwen/Qwen2.5-1.5B-Instruct', task='auto', tokenizer=None, hf_config_path=None, skip_tokenizer_init=False, revision=None, code_revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=False, allowed_local_media_path=None, load_format='auto', download_dir=None, model_loader_extra_config=None, use_tqdm_on_load=True, config_format=<ConfigFormat.AUTO: 'auto'>, dtype='half', kv_cache_dtype='auto', max_model_len=None, guided_decoding_backend='auto', logits_processor_pattern=None, model_impl='auto', distributed_executor_backend=None, pipeline_parallel_size=1, tensor_parallel_size=1, data_parallel_size=1, enable_expert_parallel=False, max_parallel_loading_workers=None, ray_workers_use_nsight=False, disable_custom_all_reduce=False, block_size=None, enable_prefix_caching=None, prefix_caching_hash_algo='builtin', disable_sliding_window=False, use_v2_block_manager=True, num_lookahead_slots=0, seed=None, swap_space=4, cpu_offload_gb=0, gpu_memory_utilization=0.9, num_gpu_blocks_override=None, max_num_batched_tokens=None, max_num_partial_prefills=1, max_long_partial_prefills=1, long_prefill_token_threshold=0, max_num_seqs=None, max_logprobs=20, disable_log_stats=False, quantization=None, rope_scaling=None, rope_theta=None, hf_token=None, hf_overrides=None, enforce_eager=False, max_seq_len_to_capture=8192, tokenizer_pool_size=0, tokenizer_pool_type='ray', tokenizer_pool_extra_config=None, limit_mm_per_prompt=None, mm_processor_kwargs=None, disable_mm_preprocessor_cache=False, enable_lora=False, enable_lora_bias=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', long_lora_scaling_factors=None, max_cpu_loras=None, fully_sharded_loras=False, enable_prompt_adapter=False, max_prompt_adapters=1, max_prompt_adapter_token=0, device='auto', num_scheduler_steps=1, multi_step_stream_outputs=True, scheduler_delay_factor=0.0, enable_chunked_prefill=None, speculative_config=None, ignore_patterns=[], preemption_mode=None, served_model_name=None, qlora_adapter_name_or_path=None, show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, disable_async_output_proc=False, scheduling_policy='fcfs', scheduler_cls='vllm.core.scheduler.Scheduler', override_neuron_config=None, override_pooler_config=None, compilation_config=None, kv_transfer_config=None, worker_cls='auto', worker_extension_cls='', generation_config='auto', override_generation_config=None, enable_sleep_mode=False, calculate_kv_scales=False, additional_config=None, enable_reasoning=False, reasoning_parser=None, disable_cascade_attn=False, disable_chunked_mm_input=False, disable_log_requests=False, max_log_len=None, disable_fastapi_docs=False, enable_prompt_tokens_details=False, enable_server_load_tracking=False, dispatch_function=<function ServeSubcommand.cmd at 0x7cb07f1b32e0>)\n",
"WARNING 04-26 12:12:31 [config.py:2836] Casting torch.bfloat16 to torch.float16.\n",
"INFO 04-26 12:12:47 [config.py:689] This model supports multiple tasks: {'embed', 'reward', 'score', 'classify', 'generate'}. Defaulting to 'generate'.\n",
"WARNING 04-26 12:12:47 [arg_utils.py:1731] Compute Capability < 8.0 is not supported by the V1 Engine. Falling back to V0. \n",
"INFO 04-26 12:12:47 [api_server.py:246] Started engine process with PID 2007\n",
"INFO 04-26 12:12:52 [__init__.py:239] Automatically detected platform cuda.\n",
"2025-04-26 12:12:53.219088: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
"E0000 00:00:1745669573.239280 2007 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"E0000 00:00:1745669573.245609 2007 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"INFO 04-26 12:12:57 [llm_engine.py:243] Initializing a V0 LLM engine (v0.8.4) with config: model='Qwen/Qwen2.5-1.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=Qwen/Qwen2.5-1.5B-Instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={\"splitting_ops\":[],\"compile_sizes\":[],\"cudagraph_capture_sizes\":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],\"max_capture_size\":256}, use_cached_outputs=True, \n",
"INFO 04-26 12:12:59 [cuda.py:240] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.\n",
"INFO 04-26 12:12:59 [cuda.py:289] Using XFormers backend.\n",
"INFO 04-26 12:13:01 [parallel_state.py:959] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0\n",
"INFO 04-26 12:13:01 [model_runner.py:1110] Starting to load model Qwen/Qwen2.5-1.5B-Instruct...\n",
"INFO 04-26 12:13:01 [weight_utils.py:265] Using model weights format ['*.safetensors']\n",
"INFO 04-26 12:13:28 [weight_utils.py:281] Time spent downloading weights for Qwen/Qwen2.5-1.5B-Instruct: 26.430684 seconds\n",
"INFO 04-26 12:13:28 [weight_utils.py:315] No model.safetensors.index.json found in remote.\n",
"Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]\n",
"Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:05<00:00, 5.12s/it]\n",
"Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:05<00:00, 5.12s/it]\n",
"\n",
"INFO 04-26 12:13:33 [loader.py:458] Loading weights took 5.20 seconds\n",
"INFO 04-26 12:13:33 [model_runner.py:1146] Model loading took 2.8876 GiB and 32.125980 seconds\n",
"INFO 04-26 12:13:38 [worker.py:267] Memory profiling takes 4.83 seconds\n",
"INFO 04-26 12:13:38 [worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.90) = 13.27GiB\n",
"INFO 04-26 12:13:38 [worker.py:267] model weights take 2.89GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 2.02GiB; the rest of the memory reserved for KV Cache is 8.31GiB.\n",
"INFO 04-26 12:13:39 [executor_base.py:112] # cuda blocks: 19446, # CPU blocks: 9362\n",
"INFO 04-26 12:13:39 [executor_base.py:117] Maximum concurrency for 32768 tokens per request: 9.50x\n",
"INFO 04-26 12:13:44 [model_runner.py:1456] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n",
"Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:30<00:00, 1.15it/s]\n",
"INFO 04-26 12:14:14 [model_runner.py:1598] Graph capturing finished in 30 secs, took 0.19 GiB\n",
"INFO 04-26 12:14:14 [llm_engine.py:449] init engine (profile, create kv cache, warmup model) took 40.71 seconds\n",
"WARNING 04-26 12:14:14 [config.py:1177] Default sampling parameters have been overridden by the model's Hugging Face generation config recommended from the model creator. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`.\n",
"INFO 04-26 12:14:14 [serving_chat.py:118] Using default chat sampling params from model: {'repetition_penalty': 1.1, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}\n",
"INFO 04-26 12:14:14 [serving_completion.py:61] Using default completion sampling params from model: {'repetition_penalty': 1.1, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}\n",
"INFO 04-26 12:14:14 [api_server.py:1081] Starting vLLM API server on http://0.0.0.0:8001\n",
"INFO 04-26 12:14:14 [launcher.py:26] Available routes are:\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /openapi.json, Methods: HEAD, GET\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /docs, Methods: HEAD, GET\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /docs/oauth2-redirect, Methods: HEAD, GET\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /redoc, Methods: HEAD, GET\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /health, Methods: GET\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /load, Methods: GET\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /ping, Methods: GET, POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /tokenize, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /detokenize, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /v1/models, Methods: GET\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /version, Methods: GET\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /v1/chat/completions, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /v1/completions, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /v1/embeddings, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /pooling, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /score, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /v1/score, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /v1/audio/transcriptions, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /rerank, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /v1/rerank, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /v2/rerank, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /invocations, Methods: POST\n",
"INFO 04-26 12:14:14 [launcher.py:34] Route: /metrics, Methods: GET\n",
"INFO: Started server process [1790]\n",
"INFO: Waiting for application startup.\n",
"INFO: Application startup complete.\n"
]
}
],
"source": [
"!tail -f vllm_output.log | sed '/Application startup complete/ q'"
]
},
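{
"cell_type": "markdown",
"metadata": {},
"source": [
"Instead of tailing the log, you can also poll the server's `/health` endpoint (listed among the routes above) until it responds. The cell below is a minimal sketch using `requests`; the retry count and sleep interval are arbitrary choices."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"import requests\n",
"\n",
"# Poll the local /health endpoint until the server answers (or we give up).\n",
"# 60 retries x 5 seconds is an arbitrary budget; adjust as needed.\n",
"for _ in range(60):\n",
"    try:\n",
"        if requests.get(f\"http://localhost:{port}/health\", timeout=2).status_code == 200:\n",
"            print(\"vLLM server is ready\")\n",
"            break\n",
"    except requests.exceptions.RequestException:\n",
"        pass\n",
"    time.sleep(5)\n",
"else:\n",
"    print(\"Server did not become ready in time\")"
]
},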
{
"cell_type": "markdown",
"metadata": {
"id": "npYjVvsPxaqW"
},
"source": [
"### 4. Expose the Colab service using ngrok\n",
"\n",
"- To use ngrok, you need to create an account and get a token (https://ngrok.com/)\n",
"- Put the token in the Colab secrets as `NGROK_TOKEN` and grant the notebook access when prompted\n",
"\n",
"The output of this cell is a public ngrok URL"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_Z8Bb2GNxV4O",
"outputId": "30159dde-a8ef-40b1-a2a7-fae2dc1d0a99"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ngrok tunnel URL: https://2d3d-35-196-44-108.ngrok-free.app\n"
]
}
],
"source": [
"from google.colab import userdata\n",
"from pyngrok import ngrok\n",
"\n",
"# Authenticate\n",
"ngrok.set_auth_token(userdata.get(\"NGROK_TOKEN\"))\n",
"\n",
"# Terminate any existing ngrok tunnels\n",
"ngrok.kill()\n",
"\n",
"# Open a tunnel to the port where vLLM is running (vLLM's default is 8000; here we use 8001)\n",
"tunnel = ngrok.connect(port)\n",
"\n",
"# Print the public URL provided by ngrok\n",
"print(f\"Ngrok tunnel URL: {tunnel.public_url}\")"
]
},
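{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, sanity-check the tunnel before wiring up a client. The sketch below calls the OpenAI-compatible `/v1/models` route (also listed in the server log above) through the public URL."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"\n",
"# A 200 response listing the served model confirms the tunnel works end to end\n",
"resp = requests.get(f\"{tunnel.public_url}/v1/models\", timeout=10)\n",
"print(resp.status_code)\n",
"print(resp.json())"
]
},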
{
"cell_type": "markdown",
"metadata": {
"id": "G77uxsnwCgh1"
},
"source": [
"### 5. Test the URL via LiteLLM\n",
"\n",
"Here's an example of calling the deployed endpoint via litellm.\n",
"The same code also works outside Colab.\n"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "QUfxmEEMDBtA",
"outputId": "2945e5bc-7bcf-48fa-e321-93a3013da3e3"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tomatoes are classified as vegetables, not fruits. They belong to the botanical family Solanaceae and can be found in the kitchen primarily due to\n"
]
}
],
"source": [
"import litellm\n",
"\n",
"model_url = f\"{tunnel.public_url}/v1\"\n",
"\n",
"response = litellm.completion(\n",
"    model=f\"hosted_vllm/{model}\",  # the hosted_vllm/ prefix routes the request to an OpenAI-compatible vLLM server\n",
"    messages=[\n",
"        {\"role\": \"system\", \"content\": \"You are a helpful AI assistant.\"},\n",
"        {\"role\": \"user\", \"content\": \"Is tomato a vegetable or a fruit?\"},\n",
"    ],\n",
"    api_base=model_url,\n",
"    # api_key=bearer_token,  # use if your server requires a token\n",
"    temperature=1,\n",
"    top_p=0.95,\n",
"    max_tokens=30,\n",
")\n",
"\n",
"print(response.choices[0].message.content)"
]
},
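{
"cell_type": "markdown",
"metadata": {},
"source": [
"LiteLLM can also stream tokens from the same endpoint. A minimal sketch, assuming the server and tunnel from the cells above are still running:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Stream the response chunk by chunk instead of waiting for the full completion\n",
"stream = litellm.completion(\n",
"    model=f\"hosted_vllm/{model}\",\n",
"    messages=[{\"role\": \"user\", \"content\": \"Is tomato a vegetable or a fruit?\"}],\n",
"    api_base=model_url,\n",
"    max_tokens=30,\n",
"    stream=True,\n",
")\n",
"\n",
"for chunk in stream:\n",
"    delta = chunk.choices[0].delta.content\n",
"    if delta:\n",
"        print(delta, end=\"\")"
]
},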
{
"cell_type": "markdown",
"metadata": {
"id": "nO_yOAyDB-An"
},
"source": [
"### 6. Cleaning up"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IdCEYjrdCAyM"
},
"source": [
"Find the vllm & ngrok processes and use `kill` to terminate them.\n",
"\n",
"Once they are terminated, the ngrok URL above will no longer be accessible"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nJM0M7AhaFYB"
},
"outputs": [],
"source": [
"!ps aux | grep -- 'vllm\\|ngrok'"
]
},
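{
"cell_type": "markdown",
"metadata": {},
"source": [
"For example, kill them by PID (the run above used 1790 for the API server and 2007 for the engine; yours will differ):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Replace the PIDs below with the ones from your own `ps` output\n",
"# !kill 1790 2007"
]
},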
{
"cell_type": "markdown",
"metadata": {
"id": "V08KabQTCIe9"
},
"source": [
"Alternatively, kill the processes by name"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"id": "tc9hZssX64Rf"
},
"outputs": [],
"source": [
"#!killall vllm\n",
"#!killall ngrok"
]
}
],
| "metadata": { | |
| "accelerator": "GPU", | |
| "colab": { | |
| "gpuType": "T4", | |
| "provenance": [] | |
| }, | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |