OpenAI compatible chat client in the terminal
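
The script loads its configuration from a `.env` file via python-dotenv. A minimal sketch of that file, assuming a local vLLM server (all values below are placeholders; point `BASE_URL` at whatever OpenAI-compatible endpoint you run):

    # .env -- example values only
    API_KEY=EMPTY
    BASE_URL=http://localhost:8000/v1
    MODEL=meta-llama/Llama-3.1-8B-Instruct
    SYSTEM_PROMPT=You are a helpful assistant.
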
| """ | |
| openai-chat-client.py | |
| OpenAI compatible chat client in the terminal | |
| See: | |
| https://github.com/openai/openai-python | |
| https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client.py | |
| """ | |
| import argparse | |
| from dotenv import load_dotenv | |
| import os | |
| import time | |
| from openai import OpenAI, OpenAIError | |
| load_dotenv() | |
| API_KEY = os.getenv("API_KEY") | |
| BASE_URL = os.getenv("BASE_URL") | |
| MODEL = os.getenv("MODEL") | |
| SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT") | |
| DEBUG = False # Print headers for debugging | |
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-k", "--api_key",
                        type=str,
                        default=API_KEY,
                        help="API key.")
    parser.add_argument("-m", "--model",
                        type=str,
                        default=MODEL,
                        help="Path to model checkpoint.")
    parser.add_argument("-n", "--no_stream",
                        action="store_true",
                        default=False,
                        help="Toggle streaming off.")
    parser.add_argument("-p", "--print_perf",
                        action="store_true",
                        default=False,
                        help="Toggle printing of performance.")
    parser.add_argument("-s", "--system_prompt",
                        type=str,
                        default=SYSTEM_PROMPT,
                        help="Chatbot's system prompt.")
    parser.add_argument("-u", "--base_url",
                        type=str,
                        default=BASE_URL,
                        help="Base URL to API.")
    return parser.parse_args()
def call_chat_api(model, messages, client, stream=True, print_perf=False):
    """
    Sends a message to the OpenAI-compatible API (e.g. vLLM) and gets
    the assistant's response.
    """
    # Configuration
    temperature = 0.7

    # Prepare API request parameters
    request_params = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "stream": stream,
    }

    # stream_options is only valid when stream=True
    if stream and print_perf:
        request_params["stream_options"] = {"include_usage": True}

    # Send request to the API
    start_time = time.time()
    try:
        if DEBUG:
            # The parsed response object does not expose HTTP headers;
            # use the raw-response wrapper (openai>=1.x) to inspect them.
            raw_response = client.chat.completions.with_raw_response.create(**request_params)
            print("Response Headers:", raw_response.headers)
            response = raw_response.parse()
        else:
            response = client.chat.completions.create(**request_params)
    except OpenAIError as e:
        print(f"OpenAI API error: {e}")
        return None

    content, usage, first_time = parse_response(response, stream, print_perf)
    end_time = time.time()

    # first_time is only set when streaming, so guard against the
    # non-streaming case (where TTFT is not measurable).
    if print_perf and usage and first_time is not None:
        ttft = first_time - start_time
        # Subtract 1 from completion_tokens because the generation time
        # is measured from the first token.
        generation_rate = (usage.completion_tokens - 1) / (end_time - first_time)
        print("Perf: TTFT = %.2f s, rate = %.2f tokens/s/user" % (ttft, generation_rate))

    return content
def parse_response(response, stream=True, print_perf=False):
    """
    Parses the response for streaming and non-streaming cases.
    """
    usage = None
    first_time = None
    if stream:
        print("Assistant: ", end="", flush=True)
        # Parse the streaming response chunk by chunk
        full_content = []
        for chunk in response:
            if first_time is None:
                first_time = time.time()
            if chunk.choices:  # Regular streamed content
                chunk_content = chunk.choices[0].delta.content
                if chunk_content:
                    print(chunk_content, end="", flush=True)
                    full_content.append(chunk_content)
            elif chunk.usage:  # Final chunk with usage data
                usage = chunk.usage
        print("")
        content = "".join(full_content)
    else:
        content = response.choices[0].message.content
        usage = response.usage
        print("Assistant:", content)
    if print_perf and usage:
        print("Usage:", usage)
    return content, usage, first_time
def main():
    args = parse_args()
    model = args.model
    system_prompt = args.system_prompt
    base_url = args.base_url
    api_key = args.api_key
    stream = not args.no_stream  # Default to streaming unless --no_stream is specified
    print_perf = args.print_perf

    # Initialize the OpenAI client
    client = OpenAI(api_key=api_key, base_url=base_url)

    # Initialize the conversation with a system message
    messages = [
        {
            "role": "system",
            "content": system_prompt,
        },
    ]

    print("You can start chatting with the assistant. Type 'exit' or 'quit' to end the conversation.\n")
    print(f"System Prompt: {system_prompt}\n")

    while True:
        # Get user input
        user_input = input("User: ")
        if user_input.strip().lower() in ["exit", "quit"]:
            print("Exiting chat.")
            break

        # Append the user's message to the conversation history
        messages.append({"role": "user", "content": user_input})

        # Call the API to generate the assistant's response
        content = call_chat_api(model, messages, client, stream=stream, print_perf=print_perf)
        if content is not None:
            # Append the assistant's message to the conversation history
            messages.append({"role": "assistant", "content": content})
        else:
            print("An error occurred. Please try again.")
        print("")


if __name__ == "__main__":
    main()
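
Typical invocations, assuming the `.env` above is in place (model name and URL here are hypothetical; any `.env` value can be overridden with the corresponding flag from `parse_args`):

    python openai-chat-client.py
    python openai-chat-client.py --no_stream
    python openai-chat-client.py -p -m meta-llama/Llama-3.1-8B-Instruct -u http://localhost:8000/v1

With `-p`, the streaming path requests `stream_options={"include_usage": True}` so the final chunk carries token counts, which is what the TTFT and tokens/s figures are computed from.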