Multimodal Speaker Recognition

Async Generate Multi Speaker

curl --request POST \
  --url https://mavi-backend.memories.ai/serve/api/v2/transcriptions/async-generate-multi-speaker \
  --header 'Authorization: <api-key>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "asset_id": "re_657929111888723968"
}
'

import requests

url = "https://mavi-backend.memories.ai/serve/api/v2/transcriptions/async-generate-multi-speaker"

payload = { "asset_id": "re_657929111888723968" }
headers = {
    "Authorization": "<api-key>",
    "Content-Type": "application/json"
}

response = requests.post(url, json=payload, headers=headers)

print(response.text)

const options = {
  method: 'POST',
  headers: {Authorization: '<api-key>', 'Content-Type': 'application/json'},
  body: JSON.stringify({asset_id: 're_657929111888723968'})
};

fetch('https://mavi-backend.memories.ai/serve/api/v2/transcriptions/async-generate-multi-speaker', options)
  .then(res => res.json())
  .then(res => console.log(res))
  .catch(err => console.error(err));

<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://mavi-backend.memories.ai/serve/api/v2/transcriptions/async-generate-multi-speaker",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => json_encode([
    'asset_id' => 're_657929111888723968'
  ]),
  CURLOPT_HTTPHEADER => [
    "Authorization: <api-key>",
    "Content-Type: application/json"
  ],
]);

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

func main() {

	url := "https://mavi-backend.memories.ai/serve/api/v2/transcriptions/async-generate-multi-speaker"

	payload := strings.NewReader("{\n  \"asset_id\": \"re_657929111888723968\"\n}")

	req, _ := http.NewRequest("POST", url, payload)

	req.Header.Add("Authorization", "<api-key>")
	req.Header.Add("Content-Type", "application/json")

	res, _ := http.DefaultClient.Do(req)

	defer res.Body.Close()
	body, _ := io.ReadAll(res.Body)

	fmt.Println(string(body))

}

HttpResponse<String> response = Unirest.post("https://mavi-backend.memories.ai/serve/api/v2/transcriptions/async-generate-multi-speaker")
  .header("Authorization", "<api-key>")
  .header("Content-Type", "application/json")
  .body("{\n  \"asset_id\": \"re_657929111888723968\"\n}")
  .asString();

require 'uri'
require 'net/http'

url = URI("https://mavi-backend.memories.ai/serve/api/v2/transcriptions/async-generate-multi-speaker")

http = Net::HTTP.new(url.host, url.port)
http.use_ssl = true

request = Net::HTTP::Post.new(url)
request["Authorization"] = '<api-key>'
request["Content-Type"] = 'application/json'
request.body = "{\n  \"asset_id\": \"re_657929111888723968\"\n}"

response = http.request(request)
puts response.read_body

{
  "code": 200,
  "msg": "success",
  "data": {
    "task_id": "ec2449885ba84c4f943a80ff0633158e"
  },
  "failed": false,
  "success": true
}

{
  "code": 200,
  "message": "SUCCESS",
  "data": {
    "data": {
      "audio_transcription": [
        {
          "end_time": 1.8,
          "speaker": "Kiara S Stepsister",
          "start_time": 0.0,
          "text": "You wolfless Omega! Clean that up, you jinx!"
        }
      ],
      "faces": [
        {
          "face_file_blob": "api-backend/0c5dfd30-7285-4e1e-bd20-dd31fd405365/multimodal-asr/9e545636_batch_1_video_9_person_001.jpg",
          "face_file_bucket": "memories-cache",
          "face_file_protocol": "gs",
          "face_id": "9e545636-509a-4a7d-b7c8-6359ea6a6d8b_person_001",
          "name": "Kiara S Stepsister"
        }
      ],
      "usage_metadata": [
        {
          "duration": 0.0,
          "model": "gemini-2.5-pro",
          "output_tokens": 6143,
          "prompt_tokens": 442368
        },
        {
          "duration": 0.0,
          "model": "gemini-2.5-flash",
          "output_tokens": 15643,
          "prompt_tokens": 32508
        }
      ]
    },
    "msg": "Multimodal ASR completed successfully",
    "success": true
  },
  "task_id": "29799938cfd344db8e10243a266b9990"
}

POST

transcriptions

async-generate-multi-speaker

Async Generate Multi Speaker

curl --request POST \
  --url https://mavi-backend.memories.ai/serve/api/v2/transcriptions/async-generate-multi-speaker \
  --header 'Authorization: <api-key>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "asset_id": "re_657929111888723968"
}
'

import requests

url = "https://mavi-backend.memories.ai/serve/api/v2/transcriptions/async-generate-multi-speaker"

payload = { "asset_id": "re_657929111888723968" }
headers = {
    "Authorization": "<api-key>",
    "Content-Type": "application/json"
}

response = requests.post(url, json=payload, headers=headers)

print(response.text)

const options = {
  method: 'POST',
  headers: {Authorization: '<api-key>', 'Content-Type': 'application/json'},
  body: JSON.stringify({asset_id: 're_657929111888723968'})
};

fetch('https://mavi-backend.memories.ai/serve/api/v2/transcriptions/async-generate-multi-speaker', options)
  .then(res => res.json())
  .then(res => console.log(res))
  .catch(err => console.error(err));

<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://mavi-backend.memories.ai/serve/api/v2/transcriptions/async-generate-multi-speaker",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => json_encode([
    'asset_id' => 're_657929111888723968'
  ]),
  CURLOPT_HTTPHEADER => [
    "Authorization: <api-key>",
    "Content-Type: application/json"
  ],
]);

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

func main() {

	url := "https://mavi-backend.memories.ai/serve/api/v2/transcriptions/async-generate-multi-speaker"

	payload := strings.NewReader("{\n  \"asset_id\": \"re_657929111888723968\"\n}")

	req, _ := http.NewRequest("POST", url, payload)

	req.Header.Add("Authorization", "<api-key>")
	req.Header.Add("Content-Type", "application/json")

	res, _ := http.DefaultClient.Do(req)

	defer res.Body.Close()
	body, _ := io.ReadAll(res.Body)

	fmt.Println(string(body))

}

HttpResponse<String> response = Unirest.post("https://mavi-backend.memories.ai/serve/api/v2/transcriptions/async-generate-multi-speaker")
  .header("Authorization", "<api-key>")
  .header("Content-Type", "application/json")
  .body("{\n  \"asset_id\": \"re_657929111888723968\"\n}")
  .asString();

require 'uri'
require 'net/http'

url = URI("https://mavi-backend.memories.ai/serve/api/v2/transcriptions/async-generate-multi-speaker")

http = Net::HTTP.new(url.host, url.port)
http.use_ssl = true

request = Net::HTTP::Post.new(url)
request["Authorization"] = '<api-key>'
request["Content-Type"] = 'application/json'
request.body = "{\n  \"asset_id\": \"re_657929111888723968\"\n}"

response = http.request(request)
puts response.read_body

{
  "code": 200,
  "msg": "success",
  "data": {
    "task_id": "ec2449885ba84c4f943a80ff0633158e"
  },
  "failed": false,
  "success": true
}

{
  "code": 200,
  "message": "SUCCESS",
  "data": {
    "data": {
      "audio_transcription": [
        {
          "end_time": 1.8,
          "speaker": "Kiara S Stepsister",
          "start_time": 0.0,
          "text": "You wolfless Omega! Clean that up, you jinx!"
        }
      ],
      "faces": [
        {
          "face_file_blob": "api-backend/0c5dfd30-7285-4e1e-bd20-dd31fd405365/multimodal-asr/9e545636_batch_1_video_9_person_001.jpg",
          "face_file_bucket": "memories-cache",
          "face_file_protocol": "gs",
          "face_id": "9e545636-509a-4a7d-b7c8-6359ea6a6d8b_person_001",
          "name": "Kiara S Stepsister"
        }
      ],
      "usage_metadata": [
        {
          "duration": 0.0,
          "model": "gemini-2.5-pro",
          "output_tokens": 6143,
          "prompt_tokens": 442368
        },
        {
          "duration": 0.0,
          "model": "gemini-2.5-flash",
          "output_tokens": 15643,
          "prompt_tokens": 32508
        }
      ]
    },
    "msg": "Multimodal ASR completed successfully",
    "success": true
  },
  "task_id": "29799938cfd344db8e10243a266b9990"
}

Product: Visual Intelligence — Audio File Transcription
Use case: Transcribe an uploaded audio/video file to text — async batch or sync, using multiple providers (Whisper, ElevenLabs, AssemblyAI), with optional speaker labels. For live streams, see Live Audio Transcription.
Host: https://mavi-backend.memories.ai/serve/api/v2
Auth: Authorization: sk-mavi-... (no Bearer prefix)

Identifies who said what by combining audio speaker diarization (pyannote) with face recognition (Gemini). Returns transcription segments labeled with real names, along with face images extracted from the video. How this differs from Speaker Diarization: Speaker Diarization only assigns anonymous labels (SPEAKER_00, SPEAKER_01), whereas Multimodal Speaker Recognition matches voices to faces to produce named speaker identification — at roughly 100× the cost.

This is an async endpoint. You must configure a webhook URL in Webhooks Settings before calling this endpoint; otherwise, you will not receive the processing results. See the Webhooks Configuration Guide for details.

Pricing:

$0.10 per minute of video or audio

Request Body

Parameter	Type	Required	Description
asset_id	string	Yes	The unique identifier of the video or audio asset for multi-speaker identification

Code Example

curl --request POST \
  --url https://mavi-backend.memories.ai/serve/api/v2/transcriptions/async-generate-multi-speaker \
  --header 'Authorization: sk-mavi-...' \
  --header 'Content-Type: application/json' \
  --data '{
    "asset_id": "re_657929111888723968"
  }'

const BASE_URL = "https://mavi-backend.memories.ai/serve/api/v2/transcriptions";
const API_KEY = "sk-mavi-...";

const response = await fetch(`${BASE_URL}/async-generate-multi-speaker`, {
  method: 'POST',
  headers: {
    'Content-Type': 'application/json',
    'Authorization': API_KEY
  },
  body: JSON.stringify({
    asset_id: 're_657929111888723968'
  })
});

const data = await response.json();
console.log(data);

import requests

BASE_URL = "https://mavi-backend.memories.ai/serve/api/v2/transcriptions"
API_KEY = "sk-mavi-..."
HEADERS = {
    "Authorization": f"{API_KEY}"
}

def async_generate_multi_speaker(asset_id: str):
    url = f"{BASE_URL}/async-generate-multi-speaker"
    data = {"asset_id": asset_id}
    resp = requests.post(url, json=data, headers=HEADERS)
    return resp.json()

# Usage example
result = async_generate_multi_speaker("re_657929111888723968")
print(result)

Response

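Returns the multi-speaker identification task information. The first example below is the immediate response to the request; the second is the callback payload sent to your configured webhook URL when processing is complete.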

{
  "code": 200,
  "msg": "success",
  "data": {
    "task_id": "ec2449885ba84c4f943a80ff0633158e"
  },
  "failed": false,
  "success": true
}

{
  "code": 200,
  "message": "SUCCESS",
  "data": {
    "data": {
      "audio_transcription": [
        {
          "end_time": 1.8,
          "speaker": "Kiara S Stepsister",
          "start_time": 0.0,
          "text": "You wolfless Omega! Clean that up, you jinx!"
        }
      ],
      "faces": [
        {
          "face_file_blob": "api-backend/0c5dfd30-7285-4e1e-bd20-dd31fd405365/multimodal-asr/9e545636_batch_1_video_9_person_001.jpg",
          "face_file_bucket": "memories-cache",
          "face_file_protocol": "gs",
          "face_id": "9e545636-509a-4a7d-b7c8-6359ea6a6d8b_person_001",
          "name": "Kiara S Stepsister"
        }
      ],
      "usage_metadata": [
        {
          "duration": 0.0,
          "model": "gemini-2.5-pro",
          "output_tokens": 6143,
          "prompt_tokens": 442368
        },
        {
          "duration": 0.0,
          "model": "gemini-2.5-flash",
          "output_tokens": 15643,
          "prompt_tokens": 32508
        }
      ]
    },
    "msg": "Multimodal ASR completed successfully",
    "success": true
  },
  "task_id": "29799938cfd344db8e10243a266b9990"
}

Response Parameters

Parameter	Type	Description
code	integer	Response code indicating the result status (e.g., 200)
msg	string	Response message describing the operation result
data	object	Response data object containing task information
data.task_id	string	Unique identifier of the multi-speaker identification task
success	boolean	Indicates whether the operation was successful
failed	boolean	Indicates whether the operation failed

Callback Response Parameters

When the multi-speaker identification is complete, a callback will be sent to your configured webhook URL.

Parameter	Type	Description
code	integer	Response code (200 indicates success)
message	string	Status message (e.g., “SUCCESS”)
data	object	Response data object containing the multimodal ASR result and metadata
data.data	object	Inner data object containing transcription, faces, and usage information
data.data.audio_transcription	array	Array of transcription segments with speaker identification
data.data.audio_transcription[].start_time	number	Start time of the segment in seconds
data.data.audio_transcription[].end_time	number	End time of the segment in seconds
data.data.audio_transcription[].speaker	string	Identified speaker name
data.data.audio_transcription[].text	string	Transcription text for this segment
data.data.faces	array	Array of detected faces with metadata
data.data.faces[].face_id	string	Unique identifier for the detected face
data.data.faces[].name	string	Identified name of the person
data.data.faces[].face_file_protocol	string	Storage protocol (e.g., “gs” for Google Cloud Storage)
data.data.faces[].face_file_bucket	string	Storage bucket name
data.data.faces[].face_file_blob	string	File path in the storage bucket
data.data.usage_metadata	array	Array of usage statistics for different models used
data.data.usage_metadata[].duration	number	Processing duration in seconds
data.data.usage_metadata[].model	string	The AI model used (e.g., “gemini-2.5-pro”, “gemini-2.5-flash”)
data.data.usage_metadata[].output_tokens	integer	Number of tokens in the output
data.data.usage_metadata[].prompt_tokens	integer	Number of tokens in the input prompt
data.msg	string	Detailed message about the operation result
data.success	boolean	Indicates whether the multimodal ASR was successful
task_id	string	The task ID associated with this multi-speaker identification request

Authorizations

Authorization

string

header

required

Body

application/json

asset_id

string

required

The ID of the video or audio asset in which to identify multiple speakers

Example:

"re_657929111888723968"

Response

200 - application/json

Multi-speaker identification task information

code

integer

Response code indicating the result status

Example:

200

msg

string

Response message describing the operation result

Example:

"success"

data

object

Response data object containing task information

Show child attributes

success

boolean

Indicates whether the operation was successful

Example:

true

failed

boolean

Indicates whether the operation failed

Example:

false

Speaker Diarization | Start Audio Stream Transcription

Get Started

Asset Management

Social Media Scraping

Audio File Transcription

Live Audio Transcription

Video Model APIs

Video Task APIs

Live Video Content Moderation

Live Video Understanding

Image Model APIs

Embeddings

Human ReID & Caption

Reference

Request Body

Code Example

Response

Response Parameters

Callback Response Parameters

Authorizations

Body

Response

Request Body

Code Example

Response

Response Parameters

Callback Response Parameters

Authorizations

Body

Response

Request Body

Code Example

Response

Response Parameters

Callback Response Parameters