OpenAI-compatible chat client in the terminal

"""
openai-chat-client.py

An OpenAI-compatible chat client in the terminal.

See:
https://github.com/openai/openai-python
https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client.py
"""
import argparse
import os
import time

from dotenv import load_dotenv
from openai import OpenAI, OpenAIError

load_dotenv()

API_KEY = os.getenv("API_KEY")
BASE_URL = os.getenv("BASE_URL")
MODEL = os.getenv("MODEL")
SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT")

DEBUG = False  # Print response headers for debugging
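# The script reads its defaults from a .env file via python-dotenv. A minimal
# .env might look like the following (values are illustrative placeholders,
# assuming a local vLLM server exposing an OpenAI-compatible endpoint):
#
#   API_KEY=EMPTY
#   BASE_URL=http://localhost:8000/v1
#   MODEL=meta-llama/Llama-3.1-8B-Instruct
#   SYSTEM_PROMPT=You are a helpful assistant.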
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-k", "--api_key",
                        type=str,
                        default=API_KEY,
                        help="API key.")
    parser.add_argument("-m", "--model",
                        type=str,
                        default=MODEL,
                        help="Name or path of the model to query.")
    parser.add_argument("-n", "--no_stream",
                        action="store_true",
                        default=False,
                        help="Toggle streaming off.")
    parser.add_argument("-p", "--print_perf",
                        action="store_true",
                        default=False,
                        help="Toggle printing of performance stats.")
    parser.add_argument("-s", "--system_prompt",
                        type=str,
                        default=SYSTEM_PROMPT,
                        help="Chatbot's system prompt.")
    parser.add_argument("-u", "--base_url",
                        type=str,
                        default=BASE_URL,
                        help="Base URL of the API.")
    return parser.parse_args()
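# Example invocations (the URL and model name are illustrative):
#
#   python openai-chat-client.py
#   python openai-chat-client.py -u http://localhost:8000/v1 -m my-model
#   python openai-chat-client.py --no_stream --print_perf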
def call_chat_api(model, messages, client, stream=True, print_perf=False):
    """
    Sends the conversation to the OpenAI-compatible API (e.g. a vLLM server)
    and returns the assistant's response content, or None on error.
    """
    # Configuration
    temperature = 0.7

    # Prepare API request parameters
    request_params = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "stream": stream,
    }

    # Ask for usage stats in the final chunk; only valid when streaming
    if stream and print_perf:
        request_params["stream_options"] = {"include_usage": True}

    # Send request to the API
    start_time = time.time()
    try:
        response = client.chat.completions.create(**request_params)
    except OpenAIError as e:
        print(f"OpenAI API error: {e}")
        return None

    if DEBUG and stream:
        # The streaming response wraps an httpx.Response, which exposes
        # headers; non-streaming responses do not have this attribute.
        print("Response Headers:", response.response.headers)

    content, usage, first_time = parse_response(response, stream, print_perf)
    end_time = time.time()

    if print_perf and usage and first_time is not None:
        ttft = first_time - start_time
        # Subtract 1 from completion_tokens because the generation time
        # is measured from the arrival of the first token.
        generation_rate = (usage.completion_tokens - 1) / (end_time - first_time)
        print("Perf: TTFT = %.2f s, rate = %.2f tokens/s/user" % (ttft, generation_rate))

    return content
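# One-shot usage sketch (assumes a running server; names are illustrative):
#
#   client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")
#   reply = call_chat_api("my-model", [{"role": "user", "content": "Hi"}],
#                         client, stream=False)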
def parse_response(response, stream=True, print_perf=False):
    """
    Parses the response for the streaming and non-streaming cases.
    Returns (content, usage, first_time), where first_time is the arrival
    time of the first streamed chunk (None when not streaming).
    """
    usage = None
    first_time = None
    if stream:
        print("Assistant: ", end="", flush=True)
        # Parse the streaming response chunk by chunk
        full_content = []
        for chunk in response:
            if first_time is None:
                first_time = time.time()
            if chunk.choices:  # Regular streamed content
                chunk_content = chunk.choices[0].delta.content
                if chunk_content:
                    print(chunk_content, end="", flush=True)
                    full_content.append(chunk_content)
            elif chunk.usage:  # Final chunk with usage data
                usage = chunk.usage
        print("")
        content = "".join(full_content)
    else:
        content = response.choices[0].message.content
        usage = response.usage
        print("Assistant:", content)
    if print_perf and usage:
        print("Usage:", usage)
    return content, usage, first_time
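# With include_usage, the streamed chunks arrive roughly as follows
# (illustrative shapes, not verbatim output):
#
#   choices=[Choice(delta=ChoiceDelta(content="Hel"), ...)]  usage=None
#   choices=[Choice(delta=ChoiceDelta(content="lo"), ...)]   usage=None
#   choices=[]  usage=CompletionUsage(completion_tokens=42, ...)  # final chunk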
def main():
    args = parse_args()
    model = args.model
    system_prompt = args.system_prompt
    base_url = args.base_url
    api_key = args.api_key
    stream = not args.no_stream  # Default to streaming unless --no_stream is given
    print_perf = args.print_perf

    # Initialize the OpenAI client
    client = OpenAI(api_key=api_key, base_url=base_url)

    # Initialize the conversation with a system message
    messages = [
        {
            "role": "system",
            "content": system_prompt,
        },
    ]

    print("You can start chatting with the assistant. Type 'exit' or 'quit' to end the conversation.\n")
    print(f"System Prompt: {system_prompt}\n")

    while True:
        # Get user input
        user_input = input("User: ")
        if user_input.strip().lower() in ["exit", "quit"]:
            print("Exiting chat.")
            break

        # Append the user's message to the conversation history
        messages.append({"role": "user", "content": user_input})

        # Call the vLLM API to generate the assistant's response
        content = call_chat_api(model, messages, client, stream=stream, print_perf=print_perf)
        if content:
            # Append the assistant's message to the conversation history
            messages.append({"role": "assistant", "content": content})
        else:
            print("An error occurred. Please try again.")
        print("")


if __name__ == "__main__":
    main()
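# Example session (output is illustrative):
#
#   $ python openai-chat-client.py --print_perf
#   You can start chatting with the assistant. Type 'exit' or 'quit' to end the conversation.
#
#   System Prompt: You are a helpful assistant.
#
#   User: Hello
#   Assistant: Hello! How can I help you today?
#   Usage: CompletionUsage(completion_tokens=10, prompt_tokens=21, total_tokens=31)
#   Perf: TTFT = 0.08 s, rate = 34.51 tokens/s/user
#   User: exit
#   Exiting chat.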