Updated version of vosk for node
Nothing difficult: the bindings are now declared through ffi-rs's `define`, so each native function gets an explicit signature and every call passes its parameter group as a single tuple (array).
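For orientation, here is a minimal usage sketch of the module. The model directory (`model`), the 16 kHz sample rate, and the raw PCM file (`audio.raw`) are placeholders, not anything shipped with the package. The full `index.js` source follows.

```js
// Minimal sketch: transcribe one short raw-PCM buffer.
// Assumes a Vosk model unpacked into ./model and 16 kHz 16-bit mono PCM in ./audio.raw.
const fs = require('fs');
const {Model, Recognizer, setLogLevel} = require('./index'); // or require('vosk') if installed as a package

setLogLevel(0);

const model = new Model('model');
const rec = new Recognizer({model, sampleRate: 16000});
rec.setWords(true); // include per-word timing in results

rec.acceptWaveform(fs.readFileSync('audio.raw'));
console.log(rec.finalResult());

rec.free();
model.free();
```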
// @ts-check
'use strict';

/**
 * @module vosk
 */

const os = require('os');
const path = require('path');
const {DataType, open, close, define} = require('ffi-rs');
const fs = require('fs');

const soname = (function () {
  if (os.platform() === 'win32') {
    let currentPath = process.env.Path;
    let dllDirectory = path.resolve(path.join(__dirname, 'lib', 'win-x86_64'));
    process.env.Path = dllDirectory + path.delimiter + currentPath;
    return path.join(__dirname, 'lib', 'win-x86_64', 'libvosk.dll');
  }
  if (os.platform() === 'darwin') {
    return path.join(__dirname, 'lib', 'osx-universal', 'libvosk.dylib');
  }
  if (os.platform() === 'linux' && os.arch() === 'arm64') {
    return path.join(__dirname, 'lib', 'linux-arm64', 'libvosk.so');
  }
  return path.join(__dirname, 'lib', 'linux-x86_64', 'libvosk.so');
})();

if (!fs.existsSync(soname)) {
  throw new Error(`File doesn't exist: ${soname}`);
}

open({
  library: 'libvosk',
  path: soname,
});
/** @type {LibVosk} */
const libvosk = define({
  vosk_set_log_level                  : {library: 'libvosk', retType: DataType.Void    , paramsType: [DataType.I32]},
  vosk_model_new                      : {library: 'libvosk', retType: DataType.External, paramsType: [DataType.String]},
  vosk_model_free                     : {library: 'libvosk', retType: DataType.Void    , paramsType: [DataType.External]},
  vosk_spk_model_new                  : {library: 'libvosk', retType: DataType.External, paramsType: [DataType.String]},
  vosk_spk_model_free                 : {library: 'libvosk', retType: DataType.Void    , paramsType: [DataType.External]},
  vosk_recognizer_new                 : {library: 'libvosk', retType: DataType.External, paramsType: [DataType.External, DataType.Float]},
  vosk_recognizer_new_spk             : {library: 'libvosk', retType: DataType.External, paramsType: [DataType.External, DataType.Float, DataType.External]},
  vosk_recognizer_new_grm             : {library: 'libvosk', retType: DataType.External, paramsType: [DataType.External, DataType.Float, DataType.String]},
  vosk_recognizer_free                : {library: 'libvosk', retType: DataType.Void    , paramsType: [DataType.External]},
  vosk_recognizer_set_max_alternatives: {library: 'libvosk', retType: DataType.Void    , paramsType: [DataType.External, DataType.I32]},
  vosk_recognizer_set_words           : {library: 'libvosk', retType: DataType.Void    , paramsType: [DataType.External, DataType.Boolean]},
  vosk_recognizer_set_partial_words   : {library: 'libvosk', retType: DataType.Void    , paramsType: [DataType.External, DataType.Boolean]},
  vosk_recognizer_set_spk_model       : {library: 'libvosk', retType: DataType.Void    , paramsType: [DataType.External, DataType.External]},
  vosk_recognizer_accept_waveform     : {library: 'libvosk', retType: DataType.Boolean , paramsType: [DataType.External, DataType.U8Array, DataType.I32]},
  vosk_recognizer_result              : {library: 'libvosk', retType: DataType.String  , paramsType: [DataType.External]},
  vosk_recognizer_final_result        : {library: 'libvosk', retType: DataType.String  , paramsType: [DataType.External]},
  vosk_recognizer_partial_result      : {library: 'libvosk', retType: DataType.String  , paramsType: [DataType.External]},
  vosk_recognizer_reset               : {library: 'libvosk', retType: DataType.Void    , paramsType: [DataType.External]}
});
/**
 * Set log level for Kaldi messages
 * @param {number} level The higher, the more verbose. 0 for infos and errors. Less than 0 for silence.
 */
function setLogLevel(level) {
  libvosk.vosk_set_log_level([level]);
}

/**
 * Build a Model from a model file.
 * @see models [models](https://alphacephei.com/vosk/models)
 */
class Model {
  /**
   * Build a Model to be used with the voice recognition. Each language should have its own Model
   * for the speech recognition to work.
   * @param {string} modelPath The abstract pathname to the model
   * @see models [models](https://alphacephei.com/vosk/models)
   */
  constructor(modelPath) {
    /**
     * Store the handle.
     * For internal use only
     * @type {unknown}
     */
    this.handle = libvosk.vosk_model_new([modelPath]);
    console.log("model is created");
  }

  /**
   * Releases the model memory
   *
   * The model object is reference-counted, so if some recognizer
   * depends on this model, the model might still stay alive. When the
   * last recognizer is released, the model will be released too.
   */
  free() {
    libvosk.vosk_model_free([this.handle]);
  }
}
/**
 * Build a Speaker Model from a speaker model file.
 * The Speaker Model enables speaker identification.
 * @see models [models](https://alphacephei.com/vosk/models)
 */
class SpeakerModel {
  /**
   * Loads speaker model data from the file and returns the model object
   *
   * @param {string} modelPath the path of the model on the filesystem
   * @see models [models](https://alphacephei.com/vosk/models)
   */
  constructor(modelPath) {
    /**
     * Store the handle.
     * For internal use only
     * @type {unknown}
     */
    this.handle = libvosk.vosk_spk_model_new([modelPath]);
  }

  /**
   * Releases the model memory
   *
   * The model object is reference-counted, so if some recognizer
   * depends on this model, the model might still stay alive. When the
   * last recognizer is released, the model will be released too.
   */
  free() {
    libvosk.vosk_spk_model_free([this.handle]);
  }
}
/**
 * Helper to narrow down type while using `hasOwnProperty`.
 * @see hasOwnProperty [typescript issue](https://fettblog.eu/typescript-hasownproperty/)
 * @template {Object} Obj
 * @template {PropertyKey} Key
 * @param {Obj} obj
 * @param {Key} prop
 * @returns {obj is Obj & Record<Key, unknown>}
 */
function hasOwnProperty(obj, prop) {
  return obj.hasOwnProperty(prop);
}

/**
 * @template T
 * @template U
 * @typedef {{ [P in Exclude<keyof T, keyof U>]?: never }} Without
 */

/**
 * @template T
 * @template U
 * @typedef {(T | U) extends object ? (Without<T, U> & U) | (Without<U, T> & T) : T | U} XOR
 */
/**
 * Create a Recognizer that will be able to transform audio streams into text using a Model.
 * @template {XOR<SpeakerRecognizerParam, Partial<GrammarRecognizerParam>>} T extra parameter
 * @see Model
 */
class Recognizer {
  handle;

  /**
   * Create a Recognizer that will handle speech-to-text recognition.
   * @constructor
   * @param {T & BaseRecognizerParam & Partial<SpeakerRecognizerParam>} param The Recognizer parameters
   *
   * Sometimes, when you want to improve recognition accuracy and do not need
   * to recognize a large vocabulary, you can specify a list of phrases to recognize. This
   * will improve recognizer speed and accuracy, but it might return [unk] if the user said
   * something different.
   *
   * Only recognizers with lookahead models support this type of quick configuration.
   * Precompiled HCLG graph models are not supported.
   */
  constructor(param) {
    const {model, sampleRate} = param;
    // Prevent the user from receiving unpredictable results
    if (hasOwnProperty(param, 'speakerModel') && hasOwnProperty(param, 'grammar')) {
      throw new Error('grammar and speakerModel cannot be used together for now.');
    }
    /**
     * Store the handle.
     * For internal use only
     * @type {unknown}
     */
    this.handle = hasOwnProperty(param, 'speakerModel')
      ? libvosk.vosk_recognizer_new_spk([model.handle, sampleRate, param.speakerModel.handle])
      : hasOwnProperty(param, 'grammar')
        ? libvosk.vosk_recognizer_new_grm([model.handle, sampleRate, JSON.stringify(param.grammar)])
        : libvosk.vosk_recognizer_new([model.handle, sampleRate]);
  }

  /**
   * Releases the recognizer memory
   *
   * The model object is reference-counted, so if some recognizer
   * depends on the model, the model might still stay alive. When the
   * last recognizer is released, the model will be released too.
   */
  free() {
    libvosk.vosk_recognizer_free([this.handle]);
  }
  /** Configures recognizer to output n-best results
   *
   * <pre>
   *   {
   *      "alternatives": [
   *          { "text": "one two three four five", "confidence": 0.97 },
   *          { "text": "one two three for five", "confidence": 0.03 },
   *      ]
   *   }
   * </pre>
   *
   * @param max_alternatives - maximum alternatives to return from recognition results
   */
  setMaxAlternatives(max_alternatives) {
    libvosk.vosk_recognizer_set_max_alternatives([this.handle, max_alternatives]);
  }

  /** Configures recognizer to output words with times
   *
   * <pre>
   *   "result" : [{
   *       "conf" : 1.000000,
   *       "end" : 1.110000,
   *       "start" : 0.870000,
   *       "word" : "what"
   *     }, {
   *       "conf" : 1.000000,
   *       "end" : 1.530000,
   *       "start" : 1.110000,
   *       "word" : "zero"
   *     }, {
   *       "conf" : 1.000000,
   *       "end" : 1.950000,
   *       "start" : 1.530000,
   *       "word" : "zero"
   *     }, {
   *       "conf" : 1.000000,
   *       "end" : 2.340000,
   *       "start" : 1.950000,
   *       "word" : "zero"
   *     }, {
   *       "conf" : 1.000000,
   *       "end" : 2.610000,
   *       "start" : 2.340000,
   *       "word" : "one"
   *     }],
   * </pre>
   *
   * @param words - true to include per-word timing in results
   */
  setWords(words) {
    libvosk.vosk_recognizer_set_words([this.handle, words]);
  }

  /** Same as above, but for partial results */
  setPartialWords(partial_words) {
    libvosk.vosk_recognizer_set_partial_words([this.handle, partial_words]);
  }

  /** Adds a speaker recognition model to an already created recognizer. Helps to initialize
   * speaker recognition for a grammar-based recognizer.
   *
   * @param spk_model Speaker recognition model
   */
  setSpkModel(spk_model) {
    libvosk.vosk_recognizer_set_spk_model([this.handle, spk_model.handle]);
  }
  /**
   * Accept voice data
   *
   * Accept and process a new chunk of voice data
   *
   * @param {Buffer} data audio data in PCM 16-bit mono format
   * @returns true if silence has occurred and you can retrieve a new utterance with the result method
   */
  acceptWaveform(data) {
    return libvosk.vosk_recognizer_accept_waveform([this.handle, data, data.length]);
  }

  /**
   * Accept voice data asynchronously
   *
   * Accept and process a new chunk of voice data without blocking the event loop
   *
   * @param {Buffer} data audio data in PCM 16-bit mono format
   * @returns {Promise<boolean>} resolves to true if silence has occurred and you can retrieve a new utterance with the result method
   */
  acceptWaveformAsync(data) {
    return new Promise((resolve, reject) => {
      libvosk.vosk_recognizer_accept_waveform.async([this.handle, data, data.length], function (err, result) {
        if (err) {
          reject(err);
        }
        else {
          resolve(result);
        }
      });
    });
  }
  /** Returns speech recognition result as a string
   *
   * @returns the result in JSON format, which contains the decoded line, decoded
   *          words, times in seconds and confidences. You can parse this result
   *          with any JSON parser
   * <pre>
   *  {
   *    "result" : [{
   *        "conf" : 1.000000,
   *        "end" : 1.110000,
   *        "start" : 0.870000,
   *        "word" : "what"
   *      }, {
   *        "conf" : 1.000000,
   *        "end" : 1.530000,
   *        "start" : 1.110000,
   *        "word" : "zero"
   *      }, {
   *        "conf" : 1.000000,
   *        "end" : 1.950000,
   *        "start" : 1.530000,
   *        "word" : "zero"
   *      }, {
   *        "conf" : 1.000000,
   *        "end" : 2.340000,
   *        "start" : 1.950000,
   *        "word" : "zero"
   *      }, {
   *        "conf" : 1.000000,
   *        "end" : 2.610000,
   *        "start" : 2.340000,
   *        "word" : "one"
   *      }],
   *    "text" : "what zero zero zero one"
   *  }
   * </pre>
   */
  resultString() {
    return libvosk.vosk_recognizer_result([this.handle]);
  }

  /**
   * Returns speech recognition results
   * @returns {Result<T>} The results
   */
  result() {
    return JSON.parse(libvosk.vosk_recognizer_result([this.handle]));
  }

  /**
   * Speech recognition text that is not yet finalized.
   * The result may change as the recognizer processes more data.
   *
   * @returns {PartialResults} The partial results
   */
  partialResult() {
    return JSON.parse(libvosk.vosk_recognizer_partial_result([this.handle]));
  }

  /**
   * Returns speech recognition result. Same as result, but doesn't wait for silence.
   * You usually call it at the end of the stream to get the final bits of audio. It
   * flushes the feature pipeline so that all remaining audio chunks get processed.
   *
   * @returns {Result<T>} speech result.
   */
  finalResult() {
    return JSON.parse(libvosk.vosk_recognizer_final_result([this.handle]));
  }

  /**
   * Resets current results so the recognition can continue from scratch
   */
  reset() {
    libvosk.vosk_recognizer_reset([this.handle]);
  }
}

exports.setLogLevel = setLogLevel;
exports.Model = Model;
exports.SpeakerModel = SpeakerModel;
exports.Recognizer = Recognizer;

// Optional: close the library when done (call when appropriate)
// close('libvosk');
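For longer audio you can feed a stream chunk by chunk through `acceptWaveformAsync`, so decoding does not block the event loop. The sketch below is illustrative only: it assumes a model in `./model` and a `./speech.wav` containing 16 kHz, 16-bit mono PCM behind a standard 44-byte RIFF header (both placeholders).

```js
// Sketch: stream a WAV file chunk by chunk and print partial/final results.
const fs = require('fs');
const {Model, Recognizer, setLogLevel} = require('./index');

async function transcribe() {
  setLogLevel(0);
  const model = new Model('model');
  const rec = new Recognizer({model, sampleRate: 16000});

  // Skip the 44-byte RIFF header and read raw PCM in small chunks.
  const stream = fs.createReadStream('speech.wav', {start: 44, highWaterMark: 4096});
  for await (const chunk of stream) {
    const endOfUtterance = await rec.acceptWaveformAsync(chunk);
    if (endOfUtterance) {
      console.log(rec.result().text);            // a finished utterance
    } else {
      console.log(rec.partialResult().partial);  // still being decoded
    }
  }
  console.log(rec.finalResult().text);           // flush the remaining audio

  rec.free();
  model.free();
}

transcribe().catch(console.error);
```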
// vosk.d.ts
declare module 'vosk' {
  /**
   * Set log level for Kaldi messages
   * @param level The higher, the more verbose. 0 for infos and errors. Less than 0 for silence.
   */
  export function setLogLevel(level: number): void;

  export class Model {
    /** @internal */
    handle: any;
    constructor(modelPath: string);
    free(): void;
  }

  export class SpeakerModel {
    /** @internal */
    handle: any;
    constructor(modelPath: string);
    free(): void;
  }

  type WordResult = {
    /** Confidence (0-1) */
    conf: number;
    /** Start time in seconds */
    start: number;
    /** End time in seconds */
    end: number;
    /** Recognized word */
    word: string;
  };

  type RecognitionResults = {
    result: WordResult[];
    text: string;
  };

  type SpeakerResults = {
    spk: number[];
    spk_frames: number;
  };

  type PartialResults = {
    partial: string;
  };

  type BaseRecognizerParam = {
    model: Model;
    sampleRate: number;
  };

  type SpeakerRecognizerParam = {
    speakerModel: SpeakerModel;
  };

  type GrammarRecognizerParam = {
    grammar: string[];
  };

  type RecognizerParams<T> = T & BaseRecognizerParam;

  type Result<T> = T extends SpeakerRecognizerParam
    ? RecognitionResults & SpeakerResults
    : T extends GrammarRecognizerParam
      ? RecognitionResults
      : never;

  export class Recognizer<T extends SpeakerRecognizerParam | GrammarRecognizerParam> {
    /** @internal */
    handle: any;
    constructor(params: RecognizerParams<T>);
    free(): void;
    setMaxAlternatives(max_alternatives: number): void;
    setWords(words: boolean): void;
    setPartialWords(partial_words: boolean): void;
    setSpkModel(spk_model: SpeakerModel): void;
    acceptWaveform(data: Buffer): boolean;
    acceptWaveformAsync(data: Buffer): Promise<boolean>;
    resultString(): string;
    result(): Result<T>;
    partialResult(): PartialResults;
    finalResult(): Result<T>;
    reset(): void;
  }

  // Helper type for XOR (mutually exclusive) properties
  type Without<T, U> = { [P in Exclude<keyof T, keyof U>]?: never };
  type XOR<T, U> = (T | U) extends object ? (Without<T, U> & U) | (Without<U, T> & T) : T | U;

  export type RecognizerConstructorParams = BaseRecognizerParam & XOR<
    SpeakerRecognizerParam,
    GrammarRecognizerParam
  >;
}
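The XOR helper above makes `grammar` and `speakerModel` mutually exclusive in the constructor parameters, mirroring the runtime check in `index.js`. A sketch of both flavours follows; the model paths and phrase list are placeholders.

```js
// Sketch: the two specialised recognizer flavours.
const {Model, SpeakerModel, Recognizer} = require('./index');

const model = new Model('model');

// Grammar-limited recognizer: restricts decoding to a fixed phrase list.
// Works only with lookahead models; may return "[unk]" for out-of-grammar speech.
const grammarRec = new Recognizer({
  model,
  sampleRate: 16000,
  grammar: ['yes', 'no', 'turn the light on', '[unk]'],
});

// Speaker-identification recognizer: results additionally carry an
// x-vector ("spk") and the number of frames it was computed from ("spk_frames").
const spkModel = new SpeakerModel('model-spk');
const speakerRec = new Recognizer({model, sampleRate: 16000, speakerModel: spkModel});

// Passing both `grammar` and `speakerModel` at once throws, matching the XOR type above.
```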
interface LibVosk {
  /**
   * Set the log level for Vosk.
   * @param level - Log level (integer).
   */
  vosk_set_log_level: (params: [number]) => void;

  /**
   * Create a new Vosk model.
   * @param modelPath - Path to the model (string).
   * @returns Pointer to the model.
   */
  vosk_model_new: (params: [string]) => any;

  /**
   * Free a Vosk model.
   * @param modelHandle - Pointer to the model.
   */
  vosk_model_free: (params: [any]) => void;

  /**
   * Create a new Vosk speaker model.
   * @param modelPath - Path to the speaker model (string).
   * @returns Pointer to the speaker model.
   */
  vosk_spk_model_new: (params: [string]) => any;

  /**
   * Free a Vosk speaker model.
   * @param spkModelHandle - Pointer to the speaker model.
   */
  vosk_spk_model_free: (params: [any]) => void;

  /**
   * Create a new recognizer without speaker model or grammar.
   * @param modelHandle - Pointer to the model.
   * @param sampleRate - Sample rate (float).
   * @returns Pointer to the recognizer.
   */
  vosk_recognizer_new: (params: [any, number]) => any;

  /**
   * Create a new recognizer with a speaker model.
   * @param modelHandle - Pointer to the model.
   * @param sampleRate - Sample rate (float).
   * @param spkModelHandle - Pointer to the speaker model.
   * @returns Pointer to the recognizer.
   */
  vosk_recognizer_new_spk: (params: [any, number, any]) => any;

  /**
   * Create a new recognizer with a grammar.
   * @param modelHandle - Pointer to the model.
   * @param sampleRate - Sample rate (float).
   * @param grammar - Grammar string (JSON).
   * @returns Pointer to the recognizer.
   */
  vosk_recognizer_new_grm: (params: [any, number, string]) => any;

  /**
   * Free a recognizer.
   * @param recognizerHandle - Pointer to the recognizer.
   */
  vosk_recognizer_free: (params: [any]) => void;

  /**
   * Set the maximum number of alternatives for recognition results.
   * @param recognizerHandle - Pointer to the recognizer.
   * @param maxAlternatives - Maximum number of alternatives (integer).
   */
  vosk_recognizer_set_max_alternatives: (params: [any, number]) => void;

  /**
   * Enable or disable word-level results.
   * @param recognizerHandle - Pointer to the recognizer.
   * @param words - Boolean to enable/disable word-level results.
   */
  vosk_recognizer_set_words: (params: [any, boolean]) => void;

  /**
   * Enable or disable partial word-level results.
   * @param recognizerHandle - Pointer to the recognizer.
   * @param partialWords - Boolean to enable/disable partial word-level results.
   */
  vosk_recognizer_set_partial_words: (params: [any, boolean]) => void;

  /**
   * Set the speaker model for an existing recognizer.
   * @param recognizerHandle - Pointer to the recognizer.
   * @param spkModelHandle - Pointer to the speaker model.
   */
  vosk_recognizer_set_spk_model: (params: [any, any]) => void;

  /**
   * Accept waveform data for recognition.
   * @param recognizerHandle - Pointer to the recognizer.
   * @param data - Audio data buffer (pointer).
   * @param length - Length of the audio data (integer).
   * @returns Boolean indicating if silence was detected.
   */
  vosk_recognizer_accept_waveform: (params: [any, any, number]) => boolean;
  /**
   * Get the recognition result (usually retrieved after silence has been detected).
   * @param recognizerHandle - Pointer to the recognizer.
   * @returns JSON string with recognition results.
   */
  vosk_recognizer_result: (params: [any]) => string;

  /**
   * Get the final recognition result without waiting for silence.
   * @param recognizerHandle - Pointer to the recognizer.
   * @returns JSON string with recognition results.
   */
  vosk_recognizer_final_result: (params: [any]) => string;

  /**
   * Get the partial recognition result.
   * @param recognizerHandle - Pointer to the recognizer.
   * @returns JSON string with partial recognition results.
   */
  vosk_recognizer_partial_result: (params: [any]) => string;

  /**
   * Reset the recognizer.
   * @param recognizerHandle - Pointer to the recognizer.
   */
  vosk_recognizer_reset: (params: [any]) => void;
}
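Since models and recognizers are reference-counted native objects, tear-down order matters: recognizers first, then models, then the shared library. A possible shutdown sketch (the `shutdown` helper is illustrative, not part of the module):

```js
// Sketch of an orderly shutdown. `close` comes from ffi-rs (already imported in
// index.js) and should only run once no recognizer or model handle is in use.
const {close} = require('ffi-rs');

function shutdown(recognizers, models) {
  for (const rec of recognizers) rec.free(); // release recognizers first
  for (const m of models) m.free();          // then the models they reference
  close('libvosk');                          // finally unload the shared library
}
```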