DEV Community 👩‍💻👨‍💻

Masui Masanori
Masui Masanori

Posted on

[WebRTC][Web Audio API] Identify who is vocalizing

Intro

In a WebRTC session without video, I can't tell which participant is speaking.
So this time I try to determine who is speaking from each client's audio volume.

Examples

Sharing connected client names

Because WebRTC itself has no mechanism for sharing client names, I will share them via Server-Sent Events (SSE).

sseClient.go

...
// ClientName is one element of the JSON payload, carrying a single
// client's display name.
type ClientName struct {
    Name string `json:"name"`
}

// ClientNames is the JSON payload listing the names of every
// currently connected client.
type ClientNames struct {
    Names []ClientName `json:"names"`
}
...
Enter fullscreen mode Exit fullscreen mode

sseHub.go

...
// run is the hub's event loop: it serializes register/unregister/track
// events coming in over channels, so the clients map is only touched
// from this goroutine.
func (h *SSEHub) run() {
...
    for {
        select {
        case client := <-h.register:
            // A new client joined: renegotiate the WebRTC peers and
            // broadcast the refreshed name list to everyone.
            h.clients[client] = true
            signalPeerConnections(h)
            sendClientNames(h)
        case client := <-h.unregister:
            // A client left: drop it (if still present), then
            // renegotiate and broadcast the updated name list.
            if _, ok := h.clients[client]; ok {
                delete(h.clients, client)
                signalPeerConnections(h)
                sendClientNames(h)
            }
        case track := <-h.addTrack:
...
        }
    }
}
...
// sendClientNames broadcasts the current list of connected client
// names to every registered client as an SSE "data:" event.
// It must only be called from the hub's run loop, which owns h.clients.
func sendClientNames(h *SSEHub) {
    names := ClientNames{
        // Build with append so the collection loop cannot get out of
        // sync with the map size.
        Names: make([]ClientName, 0, len(h.clients)),
    }
    for ps := range h.clients {
        names.Names = append(names.Names, ClientName{
            Name: ps.client.userName,
        })
    }
    message, err := NewClientNameMessageJSON(names)
    if err != nil {
        log.Printf("Error sendClientNames Message: %s", err.Error())
        return
    }
    for ps := range h.clients {
        // FIX: the original discarded the type-assertion result with
        // `flusher, _ :=` and then called flusher.Flush(); if the
        // ResponseWriter does not implement http.Flusher that is a
        // nil-interface call and panics. Check the assertion instead.
        flusher, ok := ps.client.w.(http.Flusher)
        if _, err := fmt.Fprintf(ps.client.w, "data: %s\n\n", message); err != nil {
            log.Printf("Error sendClientNames write: %s", err.Error())
            continue
        }
        if ok {
            flusher.Flush()
        }
    }
}
Enter fullscreen mode Exit fullscreen mode

main.view.ts

...
// One rendered entry in the on-screen client list: the client's name
// plus the <div> element that displays it.
type ConnectedClient = {
    name: ClientName,
    element: HTMLElement,
};
export class MainView {
...
    // Container element that holds one child <div> per connected client.
    private clientArea: HTMLElement;
    // Clients currently rendered inside clientArea.
    private connectedClients: ConnectedClient[];
    public constructor() {
...
        this.clientArea = document.getElementById("client_names") as HTMLElement;
        this.connectedClients = new Array<ConnectedClient>();
    }
...
    /**
     * Synchronize the on-screen client list with the latest snapshot
     * received over SSE: remove entries for clients that disappeared,
     * add a <div> for each newly connected client.
     */
    public updateClientNames(names: ClientNames): void {
        if(names == null) {
            console.warn("updateClientNames received null");
            return;
        }
        // Keep clients that are still present; detach the DOM element
        // of everyone who left.
        const remaining = new Array<ConnectedClient>();
        for(const c of this.connectedClients) {
            const clientName = c.name.name;
            if(names.names.some(n => n.name === clientName)) {
                remaining.push(c);
            } else {
                this.clientArea.removeChild(c.element);
            }
        }
        // FIX: the original built this filtered list but never assigned
        // it back, so removed clients stayed in connectedClients and a
        // later update would call removeChild on an already-detached
        // element and throw.
        this.connectedClients = remaining;
        for(const n of names.names) {
            if(this.connectedClients.some(c => c.name.name === n.name) === false) {
                const newElement = document.createElement("div");
                newElement.textContent = n.name;
                this.clientArea.appendChild(newElement);
                this.connectedClients.push({
                    name: n,
                    element: newElement,
                });
            }
        }
    }
...
Enter fullscreen mode Exit fullscreen mode

Getting audio level

Audio levels can be obtained in several ways.
They can be read from either the local media stream tracks or the remote ones.

Because retrieving audio levels from the remote media stream tracks would require processing the same number of times as the number of connections, I decided to retrieve them from the local media stream tracks this time.

Getting audio level by "RTCPeerConnection.getStats()"

I can get statistics of RTCPeerConnection.
I can get audio levels from audio media stream tracks.

Image description

0: "RTCAudioSource_1"
1:
    audioLevel: 0.15381328775902586
    echoReturnLoss: -30
    echoReturnLossEnhancement: 0.17551203072071075
    id: "RTCAudioSource_1"
    kind: "audio"
    timestamp: 1659880489574
    totalAudioEnergy: 0.06016985176246171
    totalSamplesDuration: 2.1399999999999983
    trackIdentifier: "f987f34e-ef52-4a27-a73e-910f00bfd090"
    type: "media-source"
Enter fullscreen mode Exit fullscreen mode

webrtc.controller.ts

...
    // First approach: obtain the local audio level by polling
    // RTCPeerConnection.getStats() every 500 ms.
    public init(videoUsed: boolean) {
...
        let audioTrack: MediaStreamTrack|null = null;
        navigator.mediaDevices.getUserMedia({ video: videoUsed, audio: true })
            .then(stream => {
                this.webcamStream = stream;
                // Remember an audio track so getStats() can be scoped to it.
                const audios = this.webcamStream.getAudioTracks();
                for(const a of audios) {
                    audioTrack = a;
                }
            });
        // NOTE(review): audioTrack may still be null until getUserMedia
        // resolves; getStats(null) then reports on the whole connection
        // rather than one track — confirm this is acceptable.
        setInterval(() => {
            if(this.peerConnection == null ||
                this.peerConnection.connectionState !== "connected") {
                return;
            }
            this.peerConnection.getStats(audioTrack).then((stats) => {
                // NOTE(review): RTCStatsReport is map-like, so the outer
                // loop yields [id, stat] pairs and the inner loop visits
                // both the id string and the stat object; getAudioLevel
                // filters out the non-stat values.
                for(const report of stats) {
                    for(const r of report) {
                        const audioLevel = this.getAudioLevel(r);
                        if(audioLevel != null &&
                            audioLevel > 0.0) {
                            // If the threshold established between 0 and 1 is exceeded,
                            // it is considered to be talking
                            console.log(audioLevel);
                        }
                    }
                }
            });
        }, 500);
    }
...
    /**
     * Read the audio level from one RTCStats entry.
     * Returns a number only for audio stats that expose "audioLevel"
     * as a number or a numeric string; otherwise returns null.
     */
    private getAudioLevel(stat: any): number|null {
        // Guard clause: only audio stats carrying an "audioLevel" field qualify.
        const isAudioStat = stat != null &&
            typeof stat === "object" &&
            "kind" in stat &&
            stat.kind === "audio" &&
            "audioLevel" in stat;
        if(!isAudioStat) {
            return null;
        }
        const level = stat.audioLevel;
        switch(typeof level) {
            case "number":
                return level;
            case "string": {
                // Some implementations report the level as a string.
                const parsed = parseFloat(level);
                return isNaN(parsed) ? null : parsed;
            }
            default:
                return null;
        }
    }
}
Enter fullscreen mode Exit fullscreen mode

Because this code is verbose and runs on the main thread, I chose a different approach.

AudioWorkletNode and AudioWorkletProcessor

I can also get audio levels with "AudioWorkletNode" and "AudioWorkletProcessor".
They provide custom audio nodes that run on the "AudioWorkletGlobalScope".

To use them, I need to add a JavaScript file separate from the one running in the main global scope.
For the implementation, I referred to a sample from "GoogleChromeLabs".

volume-measurer-processor.js

// This code is based on GoogleChromeLabs/web-audio-samples(Copyright (c) 2022 The Chromium Authors) for reference
// https://github.com/GoogleChromeLabs/web-audio-samples

/* global currentTime */

// Post at most one volume reading per ~16.7 ms (one 60 fps frame).
const FRAME_INTERVAL = 1 / 60;

/**
 * Measure microphone volume and post the RMS level to the main thread.
 *
 * @class VolumeMeasurer
 * @extends AudioWorkletProcessor
 */
class VolumeMeasurer extends AudioWorkletProcessor {

  constructor() {
    super();
    // AudioContext time (seconds) of the last posted reading.
    this._lastUpdate = currentTime;
  }

  /**
   * Root-Mean-Square level of one block of samples.
   * @param {Float32Array} inputChannelData - one render quantum
   *     (128 samples by default).
   * @returns {number} RMS level, 0..1 for typical microphone input.
   */
  calculateRMS(inputChannelData) {
    // Calculate the squared-sum.
    let sum = 0;
    for (let i = 0; i < inputChannelData.length; i++) {
      sum += inputChannelData[i] * inputChannelData[i];
    }
    // Calculate the RMS(Root Mean Square) level.
    return Math.sqrt(sum / inputChannelData.length);
  }

  // "outputs" and "parameters" are unused, so they are omitted.
  process(inputs) {
    // This example only handles the first (mono) channel.
    const inputChannelData = inputs[0][0];
    // FIX: inputs[0] can be empty while no input is connected, which
    // made the original crash inside calculateRMS on undefined data.
    if (inputChannelData !== undefined &&
        currentTime - this._lastUpdate > FRAME_INTERVAL) {
      this.port.postMessage(this.calculateRMS(inputChannelData));
      this._lastUpdate = currentTime;
    }
    // Returning true keeps the processor alive.
    return true;
  }
}

registerProcessor("volume-measurer", VolumeMeasurer);
Enter fullscreen mode Exit fullscreen mode

webrtc.controller.ts

...
export class WebRtcController {
    // Local camera/microphone stream from getUserMedia().
    private webcamStream: MediaStream | null = null;
    private peerConnection: RTCPeerConnection | null = null;
...
    // AudioContext that hosts the "volume-measurer" AudioWorklet.
    private localAudioContext: AudioContext;
    // Source node wrapping the local microphone stream.
    private localAudioNode: MediaStreamAudioSourceNode|null = null;
    public constructor() {
        this.localVideo = document.getElementById("local_video") as HTMLVideoElement;
        this.localAudioContext = new AudioContext();
    }
    public init(videoUsed: boolean) {
...
        navigator.mediaDevices.getUserMedia({ video: videoUsed, audio: true })
            .then(async stream => {
                this.webcamStream = stream;
                // the AudioWorkletProcessor sub classes must be added as modules before creating AudioWorkletNode.
                await this.localAudioContext.audioWorklet.addModule("./js/volume-measurer-processor.js");
                // Create a MediaStreamAudioSourceNode and connect AudioWorkletNode to use the AudioWorkletProcessor sub classes.
                this.localAudioNode = this.localAudioContext.createMediaStreamSource(stream);
                const volumeMeterNode = new AudioWorkletNode(this.localAudioContext, "volume-measurer");
                // MainGlobalScope and AudioWorkletGlobalScope are communicated by "postMessage" and "onmessage".
                // "data" is the RMS volume posted by VolumeMeasurer.
                volumeMeterNode.port.onmessage = async ({data}) => {
                    if(this.peerConnection?.connectionState === "connected") {
                        // If the threshold established between 0 and 1 is exceeded,
                        // it is considered to be talking.
                        if(data > 0.05) {
                            console.log(`talking V:${data}`);
                        }
                    }
                };
                // NOTE(review): connecting through to destination also routes
                // the microphone to the speakers — confirm the local echo is
                // intended.
                this.localAudioNode.connect(volumeMeterNode).connect(this.localAudioContext.destination);
            });
    }
...
    public connect() {
...
        this.peerConnection.onconnectionstatechange = () => {
            if(this.peerConnection?.connectionState === "connected") {
                // Start measuring once the peer connection is established.
                this.localAudioContext.resume();
            } else {
                // Pause the audio graph (and the VolumeMeasurer) otherwise.
                this.localAudioContext.suspend();
            }
        };
...
    }
}
Enter fullscreen mode Exit fullscreen mode

Top comments (0)

🌚 Browsing with dark mode makes you a better developer.

It's a scientific fact.