Profile RAM and NVIDIA GPU VRAM on Windows
$username = "jan"

# Define paths using the username variable.
# Double backslashes are kept on purpose: the model paths are embedded in JSON request bodies below,
# where "\\" is the escape sequence for a single backslash.
$nitroPath1 = "C:\\Users\\$username\\jan\\engines\\nitro-tensorrt-llm\\0.1.8\\ampere\\nitro.exe"
$nitroPath2 = "C:\\Users\\$username\\jan\\extensions\\@janhq\\inference-nitro-extension\\dist\\bin\\win-cuda-12-0\\nitro.exe"
$modelPath1 = "C:\\Users\\$username\\jan\\models\\mistral-7b-instruct-int4"
$modelPath2 = "C:\\Users\\$username\\jan\\models\\mistral-ins-7b-q4\\mistral-7b-instruct-v0.2.Q4_K_M.gguf"

# Function to get current RAM and VRAM usage.
# RAM is the working set of the nitro process in bytes; VRAM is the GPU memory in use as reported
# by nvidia-smi in MiB (assumes a single GPU).
function Get-MemoryUsage {
    $ram = (Get-Process -Name "nitro" -ErrorAction SilentlyContinue).WS
    if (-not $ram) { $ram = 0 }  # Default to 0 if the nitro process is not running
    $vramOutput = & "nvidia-smi" --query-gpu=memory.used --format=csv,noheader,nounits
    Write-Output "VRAM Output: $vramOutput"
    $vram = if ($vramOutput) { [int]$vramOutput.Trim() } else { 0 }  # Default to 0 if null or empty
    return @{ RAM = $ram; VRAM = $vram }
}

# Function to perform the load-model request and check the response
function Load-Model {
    param (
        [string]$uri,
        [string]$body
    )

    # Print the JSON request body in a formatted manner
    $jsonBody = $body | ConvertFrom-Json | ConvertTo-Json
    Write-Output "Sending JSON request body:"
    Write-Output $jsonBody

    $response = Invoke-WebRequest -Uri $uri -Method Post -ContentType "application/json" -Body $body
    if ($response.StatusCode -eq 200) {
        Write-Output "Model loaded successfully."
        Start-Sleep -Seconds 3  # Give the model time to become ready

        # Print the response body if the status code is 200
        $responseContent = $response.Content | ConvertFrom-Json | ConvertTo-Json
        Write-Output "Response Body:"
        Write-Output $responseContent
    } else {
        Write-Output "Failed to load model. Status code: $($response.StatusCode)"
        exit
    }
}

# Function to start Nitro, load the model, and monitor memory usage
function Start-Nitro {
    param (
        [string]$nitroPath,
        [string]$modelType
    )

    # Start Nitro
    Start-Process -FilePath $nitroPath

    # Get memory usage after starting Nitro
    Start-Sleep -Seconds 5
    $memoryAfterNitro = Get-MemoryUsage
    Write-Output "RAM after starting Nitro: $($memoryAfterNitro.RAM) bytes"
    Write-Output "VRAM after starting Nitro: $($memoryAfterNitro.VRAM) MiB"

    # Determine the correct load-model request
    $webRequestUri = $null
    $webRequestBody = $null
    if ($modelType -eq "tensorrt_llm") {
        $webRequestUri = "http://localhost:3928/inferences/tensorrtllm/loadmodel"
        $webRequestBody = @"
{
    "engine_path": "$modelPath1"
}
"@
    } else {
        $webRequestUri = "http://localhost:3928/inferences/llamacpp/loadmodel"
        $webRequestBody = @"
{
    "llama_model_path": "$modelPath2"
}
"@
    }

    # Load the model and ensure it's ready
    Load-Model -uri $webRequestUri -body $webRequestBody

    # Monitor memory usage for 30 seconds and calculate peak/average
    $ramReadings = @()
    $vramReadings = @()
    $endTime = (Get-Date).AddSeconds(30)
    while ((Get-Date) -lt $endTime) {
        Start-Sleep -Seconds 3
        $currentMemory = Get-MemoryUsage
        $ramReadings += $currentMemory.RAM
        $vramReadings += $currentMemory.VRAM
        Write-Output "Current RAM: $($currentMemory.RAM) bytes"
        Write-Output "Current VRAM: $($currentMemory.VRAM) MiB"
    }

    # Calculate peak and average for RAM and VRAM
    $peakRAM = ($ramReadings | Measure-Object -Maximum).Maximum
    $averageRAM = ($ramReadings | Measure-Object -Average).Average
    $peakVRAM = ($vramReadings | Measure-Object -Maximum).Maximum
    $averageVRAM = ($vramReadings | Measure-Object -Average).Average
    Write-Output "Peak RAM Usage: $peakRAM bytes"
    Write-Output "Average RAM Usage: $averageRAM bytes"
    Write-Output "Peak VRAM Usage: $peakVRAM MiB"
    Write-Output "Average VRAM Usage: $averageVRAM MiB"
}

# Execute for the first Nitro with type tensorrt_llm
# Start-Nitro -nitroPath $nitroPath1 -modelType "tensorrt_llm"

# Execute for the second Nitro with type llamacpp
Start-Nitro -nitroPath $nitroPath2 -modelType "llamacpp"
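Note that `memory.used` above is the total VRAM in use on the whole GPU, so other applications inflate the readings. If you want to attribute VRAM to the nitro process only, `nvidia-smi` can also report per-process usage. The following is an optional sketch, not part of the original gist: it assumes a single GPU, and under the default WDDM driver model on Windows the per-process figure may be reported as "N/A".

```powershell
# Optional sketch: VRAM attributed to nitro.exe only (may show "N/A" under WDDM on Windows)
$perProcess = & "nvidia-smi" --query-compute-apps=process_name,used_memory --format=csv,noheader,nounits
$nitroVram = 0
foreach ($line in $perProcess) {
    if ($line -match "nitro") {
        $mem = ($line -split ",")[1].Trim()
        if ($mem -match '^\d+$') { $nitroVram += [int]$mem }  # skip non-numeric values such as "N/A"
    }
}
Write-Output "VRAM used by nitro: $nitroVram MiB"
```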
New instruction:

- Open Jan App -> Hub -> make sure you have installed the `tensorrt-llm` extension and download the 2 Mistral models (q4 and int4)
- Close Jan App and other applications if possible
- Fix `OPENAI_BASE_URL` to `http://localhost:3928/v1` (llmperf - `token_benchmark_ray.py`); a sketch of doing this via environment variables follows this list
- Download the gist above -> open it and update the `username`. It will try to find the binaries under `C:\Users\<username>\jan\`
- In PowerShell ISE (Admin), run `Set-ExecutionPolicy RemoteSigned` in the console, then `Get-ExecutionPolicy` to verify it is not `Restricted`
- Run the ps1 gist in PowerShell ISE (Admin); here is the output (scroll down and comment/uncomment the llama.cpp / tensorrt_llm section):
Current VRAM: 1 %
Peak RAM Usage: 592707584 bytes
Average RAM Usage: 560823091.2 bytes
Peak VRAM Usage: 1 %
Average VRAM Usage: 0.3 %
- Once the "Model loaded successfully" log from step 4 shows up, run the benchmark script (make sure you use the correct conda env and have installed the deps)
llama.cpp:
python token_benchmark_ray.py --model "mistral-ins-7b-q4" --mean-input-tokens 2048 --stddev-input-tokens 150 --mean-output-tokens 512 --stddev-output-tokens 10 --max-num-completed-requests 2 --timeout 600 --num-concurrent-requests 1 --results-dir "result_outputs" --llm-api openai --additional-sampling-params '{}'
tensorrt_llm:
python token_benchmark_ray.py --model "mistral-7b-instruct-int4" --mean-input-tokens 2048 --stddev-input-tokens 150 --mean-output-tokens 512 --stddev-output-tokens 10 --max-num-completed-requests 2 --timeout 600 --num-concurrent-requests 1 --results-dir "result_outputs" --llm-api openai --additional-sampling-params "{}"
- Manually close the Nitro process, then run steps 5 and 6 again for additional runs (a one-liner for closing it is sketched below)
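One way to close the Nitro process between runs from the same PowerShell session (a sketch; it assumes the process is named `nitro`, as in the script above):

```powershell
# Stop any running nitro.exe instances before the next measurement run
Stop-Process -Name "nitro" -Force -ErrorAction SilentlyContinue
```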
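For the `OPENAI_BASE_URL` step earlier in this list, a minimal sketch of pointing llmperf at the local server from PowerShell is shown below. Whether llmperf reads these exact environment variables (and whether a placeholder API key is needed) depends on your llmperf version, so treat the names as assumptions and keep the in-file fix to `token_benchmark_ray.py` if they are not picked up.

```powershell
# Sketch: point the OpenAI-compatible client at the local Nitro server (variable names are assumptions)
$env:OPENAI_BASE_URL = "http://localhost:3928/v1"
$env:OPENAI_API_KEY  = "sk-dummy"   # placeholder; assumes the local server does not validate the key
```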
Steps to follow to benchmark the Jan app on Windows 10/11:

- Benchmark both `llama.cpp Q4` and `tensorrt-llm INT4` (check `nvidia-smi` for any other programs using the GPU). This by default runs for 60 seconds with a 3s interval.
- Jan's API server runs at `http://localhost:1337`.
- Set up the `llmperf` benchmark: `git clone https://github.com/ray-project/llmperf`, then `cd llmperf/` (see the environment sketch after this list).
- `set` does not play nice with Python environment variables; use `os` in `token_benchmark_ray.py` instead.
- Install `conda` from https://docs.conda.io/projects/conda/en/latest/user-guide/install/windows.html.
- The `llmperf` and `ps1` scripts have to run at the same time, right after you click Jan Server start.
- `mistral-ins-7b-q4` ~ llama.cpp Q4
- `mistral-7b-instruct-int4` ~ TensorRT-LLM INT4
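A minimal sketch of the environment setup described above, run from PowerShell. The env name, Python version, and the editable install are assumptions rather than part of the original steps; follow the llmperf README if it differs.

```powershell
# Sketch: isolated conda env plus an llmperf checkout (names and versions are assumptions)
conda create -n llmperf python=3.10 -y
conda activate llmperf
git clone https://github.com/ray-project/llmperf
cd llmperf
pip install -e .   # assumption: install llmperf from the local checkout
```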