profile
viewpoint

Ask questionsContinuous recognition doesn't work well when throttling the input stream

When using startContinuousRecognitionAsync with throttled input stream, I'm getting a repeating output in "recognizing" and "recognized"

Sample code follows.

"use strict";

// pull in the required packages.
var sdk = require("microsoft-cognitiveservices-speech-sdk");
var fs = require("fs");
const Throttle = require('throttle');

// replace with your own subscription key,
// service region (e.g., "westus"), and
// the name of the file you want to run
// through the speech recognizer.
var subscriptionKey = "YourSubscriptionKey";
var serviceRegion = "YourServiceRegion"; // e.g., "westus"
var filename = "test-message-16k.wav"; // 16000 Hz, Mono

// create the push stream we need for the speech sdk.
var pushStream = sdk.AudioInputStream.createPushStream();

// open the file and push it to the push stream.
fs.createReadStream(filename).pipe(new Throttle(16384)).on('data', function(arrayBuffer) {
  pushStream.write(arrayBuffer.buffer);
}).on('end', function() {
  pushStream.close();
});

// we are done with the setup
console.log("Now recognizing from: " + filename);

// now create the audio-config pointing to our stream and
// the speech config specifying the language.
var audioConfig = sdk.AudioConfig.fromStreamInput(pushStream);
var speechConfig = sdk.SpeechConfig.fromSubscription(subscriptionKey, serviceRegion);

// setting the recognition language to English.
speechConfig.speechRecognitionLanguage = "en-US";

// create the speech recognizer.
var recognizer = new sdk.SpeechRecognizer(speechConfig, audioConfig);
const reco = recognizer;
// const recConfig = reco.createRecognizerConfig(speechConfig);

// recConfig.recognitionMode = RecognizerConfig.Dictation;

reco.recognizing = (_s, event) => {
  console.log('(recognizing) Text: ' + event.result.text);
};

/*
 * The event recognized signals that a final recognition result is received.
 * This is the final event that a phrase has been recognized.
 * For continuous recognition, you will get one recognized event for each phrase recognized.
 */
reco.recognized = (s, e) => {
  // Indicates that recognizable speech was not detected, and that recognition is done.
  if (e.result.reason === sdk.ResultReason.NoMatch) {
    var noMatchDetail = sdk.NoMatchDetails.fromResult(e.result);

    console.log('(recognized)  Reason: ' + sdk.ResultReason[e.result.reason] + ' NoMatchReason: ' + sdk.NoMatchReason[noMatchDetail.reason]);
  } else {
    try {
    const obj = JSON.parse(e.result.json);

    // In case no real input was recognized (e.g., just a noise on the line), the Confidence level is 0, and should be ignored.
    console.log('(recognized)  Reason: ' + sdk.ResultReason[e.result.reason] + ' Text: ' + e.result.text);
    } catch (err) {
      console.error(err);
    }
  }
};

/*
 * The event signals that the service has stopped processing speech.
 * https://docs.microsoft.com/javascript/api/microsoft-cognitiveservices-speech-sdk/speechrecognitioncanceledeventargs?view=azure-node-latest
 * This can happen for two broad classes of reasons.
 * 1. An error is encountered.
 *    In this case the .errorDetails property will contain a textual representation of the error.
 * 2. Speech was detected to have ended.
 *    This can be caused by the end of the specified file being reached, or ~20 seconds of silence from a microphone input.
 */
reco.canceled = (s, e) => {
  const str = '(cancel) Reason: ' + sdk.CancellationReason[e.reason];

  if (e.reason === sdk.CancellationReason.Error) {
    str += ': ' + e.errorDetails;
  }
  console.log(str);
  this.stop();
};

// Signals that a new session has started with the speech service
reco.sessionStarted = (s, e) => {
  const str = '(sessionStarted) SessionId: ' + e.sessionId;

  console.log(str);
};

// Signals the end of a session with the speech service.
reco.sessionStopped = (s, e) => {
  const str = '(sessionStopped) SessionId: ' + e.sessionId;

  this.stop();
  console.log(str);
};

// Signals that the speech service has started to detect speech.
reco.speechStartDetected = (s, e) => {
  const str = '(speechStartDetected) SessionId: ' + e.sessionId;

  console.log(str);
};

// Signals that the speech service has detected that speech has stopped.
reco.speechEndDetected = (s, e) => {
  const str = '(speechEndDetected) SessionId: ' + e.sessionId;

  console.log(str);
};

// start the recognizer and wait for a result.
recognizer.startContinuousRecognitionAsync(
  null,
  function (err) {
    console.trace("err - " + err);

    recognizer.close();
    recognizer = undefined;
  });

Output:

Now recognizing from: test-message-16k.wav
(sessionStarted) SessionId: 2BD50B1AD5B14F529B8E12935EBD8C00
(speechStartDetected) SessionId: 2BD50B1AD5B14F529B8E12935EBD8C00
(recognizing) Text: this
(recognizing) Text: this is a test
(recognizing) Text: this is a test message
(recognized)  Reason: RecognizedSpeech Text: This is a test message.
(recognizing) Text: this
(recognizing) Text: this is a
(recognizing) Text: this is a test
(recognizing) Text: this is a test message
(recognized)  Reason: RecognizedSpeech Text: This is a test message.
(recognizing) Text: this
(recognizing) Text: this is a
(recognizing) Text: this is a test
(recognizing) Text: this is a test message
(recognized)  Reason: RecognizedSpeech Text: This is a test message.
(recognizing) Text: this
(recognizing) Text: this is a
(recognizing) Text: this is a test
(recognizing) Text: this is a test message
(recognized)  Reason: RecognizedSpeech Text: This is a test message.
(recognizing) Text: this
(recognizing) Text: this is a
(recognizing) Text: this is a test
(recognizing) Text: this is a test message
(recognized)  Reason: RecognizedSpeech Text: This is a test message.
(recognizing) Text: this
(recognizing) Text: this is a
(recognizing) Text: this is a test
(recognizing) Text: this is a test message
(recognized)  Reason: RecognizedSpeech Text: This is a test message.
(recognizing) Text: this
(recognizing) Text: this is a
(recognizing) Text: this is a test
(recognizing) Text: this is a test message
(recognized)  Reason: RecognizedSpeech Text: This is a test message.
(recognizing) Text: this
(recognizing) Text: this is a
(recognizing) Text: this is a test
(recognizing) Text: this is a test message
(recognized)  Reason: RecognizedSpeech Text: This is a test message.
(recognizing) Text: this
(recognizing) Text: this is a
(recognizing) Text: this is a test
(recognizing) Text: this is a test message
(recognized)  Reason: RecognizedSpeech Text: This is a test message.

For a file that is a bit longer, each sentence becomes duplicated several times.

Please advise.

microsoft/cognitive-services-speech-sdk-js

Answer questions rhurey

Sorry you hit this.

It's a bug in the node sample. The sample is passing the wrong buffer to the push stream's write method, and it's much larger than the filled in data amount which is causing the issue.

The PushStream.write method wants an ArrayBuffer. The data event takes a Buffer, which has most (if not all) of the methods on ArrayBuffer.

The type correct way of writing into the PushStream is: pushStream.write(arrayBuffer.buffer.slice(arrayBuffer.byteOffset,arrayBuffer.byteLength + arrayBuffer.byteOffset));

Of course, this being JavaScript pushStream.write(arrayBuffer) also works because the SDK isn't calling any methods from ArrayBuffer that aren't on Buffer.

We'll get the sample updated. Thanks for reporting this.

useful!

Related questions

No questions were found.
source:https://uonfu.com/
Github User Rank List