Multimodal Browser AI with Transformers.js for Images and Speech

<title>Multimodal Media Analyzer</title>

/*
 * Page stylesheet for the Multimodal Media Analyzer.
 *
 * Property names, keywords and selectors were restored from garbled text
 * (en dashes in place of hyphens, paraphrased keywords). Class names that
 * are also referenced by the page's markup or script (model-badge, tab,
 * panel, active, label-score, caption-body, transcript-body,
 * placeholder-text) are grounded there; the remaining selector names are
 * flagged below.
 */

* { box-sizing: border-box; margin: 0; padding: 0; }

body {
  font-family: system-ui, sans-serif;
  max-width: 820px;
  margin: 0 auto;
  padding: 1.5rem 1rem;
  background: #f1f5f9;
  color: #1e293b;
}

header { margin-bottom: 1.5rem; }
header h1 { font-size: 1.5rem; }
header p { color: #64748b; font-size: 0.9rem; margin-top: 0.2rem; }

/* Model status indicators */
/* NOTE(review): ".model-status-bar" is inferred from the garbled selector; confirm against the markup. */
.model-status-bar {
  display: flex;
  gap: 0.5rem;
  flex-wrap: wrap;
  margin-top: 0.75rem;
}

.model-badge {
  font-size: 0.78rem;
  padding: 0.2rem 0.6rem;
  border-radius: 12px;
  background: #fef3c7;
  color: #92400e;
}

/* Applied by the script once a model has finished loading. */
.model-badge.ready { background: #dcfce7; color: #15803d; }

/* Tab bar */
.tabs {
  display: flex;
  background: white;
  border-radius: 8px;
  padding: 0.25rem;
  gap: 0.25rem;
  margin-bottom: 1.25rem;
  border: 1px solid #e2e8f0;
}

.tab {
  flex: 1;
  padding: 0.5rem;
  text-align: center;
  border-radius: 6px;
  cursor: pointer;
  font-size: 0.9rem;
  color: #64748b;
  transition: all 0.15s;
}

.tab.active { background: #2563eb; color: white; font-weight: 600; }

/* Input panels: only the panel carrying .active is visible. */
.panel { display: none; }
.panel.active { display: block; }

/* NOTE(review): ".upload-area" is inferred from the garbled selector; confirm against the markup. */
.upload-area {
  background: white;
  border: 2px dashed #cbd5e1;
  border-radius: 8px;
  padding: 2rem;
  text-align: center;
  cursor: pointer;
}

/* The native file input is hidden; the surrounding area acts as the click target. */
.upload-area input { display: none; }

#img-preview {
  margin-top: 1rem;
  max-width: 100%;
  max-height: 320px;
  border-radius: 8px;
  display: none;
  object-fit: cover;
}

/* NOTE(review): ".mic-center" is inferred from the garbled selector; confirm against the markup. */
.mic-center { text-align: center; padding: 1rem 0; }

#rec-btn {
  width: 72px; height: 72px;
  border-radius: 50%; border: none;
  background: #dc2626; color: white;
  font-size: 1.6rem; cursor: pointer;
  display: flex; align-items: center; justify-content: center;
  margin: 0 auto 0.5rem;
}

#rec-btn.recording { background: #374151; }
#rec-btn:disabled { background: #94a3b8; cursor: not-allowed; }
#rec-timer { font-weight: 600; color: #374151; margin-bottom: 0.25rem; }
#rec-hint { font-size: 0.85rem; color: #64748b; }
#wave-canvas { display: block; margin: 0.5rem auto; border-radius: 4px; }

/* Results grid */
.results-grid {
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
  gap: 1rem;
  margin-top: 1.25rem;
}

/* NOTE(review): ".result-card" is inferred from the garbled selector; confirm against the markup. */
.result-card {
  background: white;
  border: 1px solid #e2e8f0;
  border-radius: 8px;
  padding: 1rem;
}

.result-card h3 {
  font-size: 0.75rem;
  text-transform: uppercase;
  letter-spacing: 0.06em;
  color: #64748b;
  margin-bottom: 0.6rem;
}

.label-item {
  display: flex;
  justify-content: space-between;
  align-items: center;
  padding: 0.25rem 0;
  font-size: 0.875rem;
  border-bottom: 1px solid #f1f5f9;
}

.label-score {
  font-size: 0.8rem;
  color: #64748b;
  background: #f1f5f9;
  padding: 0.1rem 0.4rem;
  border-radius: 4px;
}

.caption-body {
  font-size: 0.95rem;
  line-height: 1.5;
  font-style: italic;
  color: #334155;
}

.transcript-body {
  font-size: 0.95rem;
  line-height: 1.6;
  color: #334155;
  white-space: pre-wrap;
}

.placeholder-text { color: #94a3b8; font-style: italic; font-size: 0.9rem; }

#global-status {
  font-size: 0.85rem;
  color: #64748b;
  margin-bottom: 1rem;
}

/* Collapse the results grid to a single column on narrow screens. */
@media (max-width: 500px) {
  .results-grid { grid-template-columns: 1fr; }
}

<h1>Multimodal Media Analyzer</h1>

<p>Image classification, captioning, and speech transcription — all in your browser.</p>

<span class="model-badge" id="badge-cls">Classifier: loading...</span>

<span class="model-badge" id="badge-cap">Captioner: loading...</span>

<span class="model-badge" id="badge-asr">Whisper: loading...</span>

</div>

</header>

<div id="global-status">Loading models in parallel — first run downloads ~400 MB total.</div>

<div class="tab active" data-tab="image">🖼 Image Analysis</div>

<div class="tab" data-tab="speech">🎙 Speech Transcription</div>

</div>

<!-- Image panel -->

<p>Click or drag an image to analyze</p>

JPG, PNG, WebP, GIF supported

</p>

</div>

</div>

<!-- Speech panel -->

<div id="rec-hint">Waiting for Whisper model...</div>

</div>

<!-- Results – shown for both modes -->

<!-- Image results (shown in image mode) -->

<h3>Classification</h3>

<p class="placeholder-text">No results yet.</p>

</div>

<h3>Caption</h3>

<p class="placeholder-text">No caption yet.</p>

</div>

<!-- Speech results (shown in speech mode) -->

<h3>Transcription</h3>

<p class="placeholder-text">Record audio to see the transcription.</p>

</div>

import { pipeline }
  from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2';

// ── Pipeline references ───────────────────────────────────────────────

// Pipeline handles. They stay undefined until the concurrent model load
// resolves, so callers must check them before use.
let classifier;
let captioner;
let transcriber;

// Number of model badges that have been switched to the "ready" state.
let readyCount = 0;

/**
 * Update a model badge to its "ready" state and, once all three badges are
 * ready, unlock the recording UI.
 *
 * The call is idempotent per badge: a badge that already carries the
 * `ready` class is ignored, so repeated notifications for the same model
 * cannot push `readyCount` past the number of models.
 *
 * @param {string} badgeId - Element id of the badge to update.
 * @param {string} label - Human-readable model name shown in the badge.
 */
function markReady(badgeId, label) {
  const badge = document.getElementById(badgeId);
  if (!badge || badge.classList.contains('ready')) return;

  badge.textContent = `${label}: ready`;
  badge.classList.add('ready');
  readyCount++;

  if (readyCount === 3) {
    globalStatus.textContent =
      'All models ready. Upload an image or record audio.';
    recBtn.disabled = false;
    recHint.textContent = 'Click to start recording.';
  }
}

// Load all three pipelines concurrently

/**
 * Load one pipeline and flip its badge once the pipeline has been
 * constructed.
 *
 * The badge is updated when the `pipeline()` promise resolves rather than
 * from a progress callback. NOTE(review): the previous code reacted to the
 * first progress event with status 'done', which the library appears to
 * report per downloaded file rather than once per pipeline; confirm
 * against the Transformers.js docs.
 *
 * @param {string} task - Transformers.js task name.
 * @param {string} model - Model id to load.
 * @param {string} badgeId - Element id of the badge for this model.
 * @param {string} label - Human-readable model name shown in the badge.
 * @returns {Promise<Function>} The loaded pipeline.
 */
const loadPipeline = async (task, model, badgeId, label) => {
  const loaded = await pipeline(task, model, { dtype: 'q8' });
  markReady(badgeId, label);
  return loaded;
};

Promise.all([
  loadPipeline('image-classification', 'Xenova/vit-base-patch16-224', 'badge-cls', 'Classifier'),
  loadPipeline('image-to-text', 'Xenova/vit-gpt2-image-captioning', 'badge-cap', 'Captioner'),
  loadPipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en', 'badge-asr', 'Whisper'),
]).then(([cls, cap, asr]) => {
  classifier = cls;
  captioner = cap;
  transcriber = asr;
}).catch((err) => {
  globalStatus.textContent = `Error loading models: ${err.message}`;
});

// ── UI references ─────────────────────────────────────────────────────

// Element handles looked up once at module load; the ids must exist in the
// page markup.
const globalStatus = document.getElementById('global-status');
const resultsGrid = document.getElementById('results-grid');
const recBtn = document.getElementById('rec-btn');
const recHint = document.getElementById('rec-hint');
const recTimer = document.getElementById('rec-timer');
const waveCanvas = document.getElementById('wave-canvas');

// 2D drawing context used to render the live waveform.
const waveCtx = waveCanvas.getContext('2d');

// ── Image analysis ────────────────────────────────────────────────────

/**
 * Run classification and captioning on an image in parallel and render
 * both results into the results grid.
 *
 * Does nothing except update the status line when either pipeline has not
 * been loaded yet. Pipeline errors are reported in the status line.
 *
 * @param {string} dataUrl - Image source handed to both pipelines.
 * @returns {Promise<void>}
 */
async function analyzeImage(dataUrl) {
  if (!classifier || !captioner) {
    globalStatus.textContent = 'Models still loading. Please wait.';
    return;
  }

  globalStatus.textContent = 'Running classification and captioning…';

  // Show image result cards, hide speech card
  document.getElementById('card-cls').style.display = 'block';
  document.getElementById('card-cap').style.display = 'block';
  document.getElementById('card-asr').style.display = 'none';
  resultsGrid.style.display = 'grid';

  const clsContent = document.getElementById('cls-content');
  const capContent = document.getElementById('cap-content');

  // NOTE(review): the placeholder markup was stripped from the source; the
  // wrapper below mirrors the page's other "placeholder-text" paragraphs.
  clsContent.innerHTML = '<p class="placeholder-text">Classifying…</p>';
  capContent.innerHTML = '<p class="placeholder-text">Generating caption…</p>';

  try {
    // Run classification and captioning in parallel
    const [classResults, captionResults] = await Promise.all([
      classifier(dataUrl, { top_k: 4 }),
      captioner(dataUrl, { max_new_tokens: 60 }),
    ]);

    // Render classification labels. Nodes are built with textContent so
    // model output is never parsed as HTML.
    const rows = classResults.map(({ label, score }) => {
      const row = document.createElement('div');
      row.className = 'label-item';

      const name = document.createElement('span');
      name.textContent = label;

      const pct = document.createElement('span');
      pct.className = 'label-score';
      pct.textContent = `${(score * 100).toFixed(1)}%`;

      row.append(name, pct);
      return row;
    });
    clsContent.replaceChildren(...rows);

    // Render generated caption
    const caption = document.createElement('p');
    caption.className = 'caption-body';
    caption.textContent = `"${captionResults[0]?.generated_text ?? 'No caption.'}"`;
    capContent.replaceChildren(caption);

    globalStatus.textContent = 'Analysis complete.';
  } catch (err) {
    globalStatus.textContent = `Error: ${err.message}`;
  }
}

// File upload handler for images

const imgDrop = document.getElementById('img-drop');
const imgInput = document.getElementById('img-input');
const imgPrev = document.getElementById('img-preview');

/**
 * Preview an image file and hand it to the analysis pipelines.
 *
 * Files whose MIME type does not start with "image/" (and missing files)
 * are ignored.
 *
 * @param {File|undefined} file - File chosen via the input or dropped on the area.
 */
function handleImageFile(file) {
  if (!file?.type.startsWith('image/')) return;

  const reader = new FileReader();
  reader.onload = (e) => {
    const dataUrl = e.target.result;
    imgPrev.src = dataUrl;
    imgPrev.style.display = 'block';
    analyzeImage(dataUrl);
  };
  // Surface read failures instead of leaving the UI silently unchanged.
  reader.onerror = () => {
    globalStatus.textContent =
      `Error: ${reader.error?.message ?? 'could not read file'}`;
  };
  reader.readAsDataURL(file);
}

imgDrop.addEventListener('click', () => imgInput.click());

imgInput.addEventListener('change', (e) => {
  handleImageFile(e.target.files[0]);
  // Reset so that choosing the same file again fires another change event.
  imgInput.value = '';
});

// dragover must be cancelled or the browser never delivers the drop event.
imgDrop.addEventListener('dragover', (e) => e.preventDefault());

imgDrop.addEventListener('drop', (e) => {
  e.preventDefault();
  handleImageFile(e.dataTransfer.files[0]);
});

// ── Audio decoding helper ─────────────────────────────────────────────

/**
 * Decode an encoded audio buffer and return its first channel.
 *
 * A temporary AudioContext is created with a 16000 Hz sample rate and is
 * always closed afterwards, including when decoding fails, so repeated
 * recordings do not accumulate open contexts.
 *
 * @param {ArrayBuffer} arrayBuffer - Encoded audio data.
 * @returns {Promise<Float32Array>} Samples of channel 0.
 */
async function decodeAudio(arrayBuffer) {
  const audioCtx = new AudioContext({ sampleRate: 16000 });
  try {
    const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
    return audioBuffer.getChannelData(0);
  } finally {
    await audioCtx.close();
  }
}

// ── Speech transcription ──────────────────────────────────────────────

/**
 * Transcribe decoded audio and render the text into the speech result card.
 *
 * Does nothing except update the status line when the speech pipeline has
 * not been loaded yet. Pipeline errors are reported in the status line.
 *
 * @param {Float32Array} audioData - Audio samples passed to the transcriber.
 * @returns {Promise<void>}
 */
async function runTranscription(audioData) {
  if (!transcriber) {
    globalStatus.textContent = 'Models still loading. Please wait.';
    return;
  }

  // Show speech result card, hide image cards
  document.getElementById('card-cls').style.display = 'none';
  document.getElementById('card-cap').style.display = 'none';
  document.getElementById('card-asr').style.display = 'block';
  resultsGrid.style.display = 'grid';

  const asrContent = document.getElementById('asr-content');

  // NOTE(review): the placeholder markup was stripped from the source; the
  // wrapper below mirrors the page's other "placeholder-text" paragraphs.
  asrContent.innerHTML = '<p class="placeholder-text">Transcribing…</p>';
  globalStatus.textContent = 'Running Whisper transcription…';

  try {
    const result = await transcriber(audioData, {
      chunk_length_s: 30,
      stride_length_s: 5,
    });

    // textContent keeps the transcript from being parsed as HTML.
    const body = document.createElement('p');
    body.className = 'transcript-body';
    body.textContent = result.text.trim();
    asrContent.replaceChildren(body);

    globalStatus.textContent = 'Transcription complete.';
  } catch (err) {
    globalStatus.textContent = `Error: ${err.message}`;
  }
}

// ── Microphone recording ──────────────────────────────────────────────

// Shared recorder state, written by the record-button handler and read by
// the waveform renderer.
let mediaRecorder;
let audioChunks = [];
let timerInterval;
let analyserNode;
let animId;

// Elapsed recording time in seconds, shown in the timer.
let secs = 0;

/**
 * Draw one frame of the live waveform and schedule the next one.
 *
 * Reads the analyser's time-domain bytes, plots them across the full
 * canvas width, and stores the requestAnimationFrame handle in `animId`
 * so the loop can be cancelled by the record-button handler.
 */
function drawWave() {
  const samples = new Uint8Array(analyserNode.frequencyBinCount);
  analyserNode.getByteTimeDomainData(samples);

  waveCtx.clearRect(0, 0, waveCanvas.width, waveCanvas.height);
  waveCtx.beginPath();
  waveCtx.strokeStyle = '#2563eb';
  waveCtx.lineWidth = 1.5;

  const halfHeight = waveCanvas.height / 2;
  for (let i = 0; i < samples.length; i++) {
    const x = (i / samples.length) * waveCanvas.width;
    // A byte value of 128 maps to the vertical middle of the canvas.
    const y = (samples[i] / 128.0) * halfHeight;
    if (i === 0) {
      waveCtx.moveTo(x, y);
    } else {
      waveCtx.lineTo(x, y);
    }
  }

  waveCtx.stroke();
  animId = requestAnimationFrame(drawWave);
}

/**
 * Record-button handler: the first click starts recording, the next click
 * stops it and triggers transcription.
 *
 * The microphone stream and the analyser's AudioContext are released when
 * recording stops, and also when starting fails part-way through. Errors
 * while decoding or transcribing are shown in the hint line instead of
 * being left as unhandled rejections.
 */
recBtn.addEventListener('click', async () => {
  if (mediaRecorder?.state === 'recording') {
    mediaRecorder.stop();
    recBtn.classList.remove('recording');
    recBtn.textContent = '🎙';
    clearInterval(timerInterval);
    cancelAnimationFrame(animId);
    waveCtx.clearRect(0, 0, waveCanvas.width, waveCanvas.height);
    recHint.textContent = 'Processing…';
    return;
  }

  let stream;
  let analyserCtx;
  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });

    // Feed the microphone into an analyser so drawWave() can read samples.
    analyserCtx = new AudioContext();
    analyserNode = analyserCtx.createAnalyser();
    analyserCtx.createMediaStreamSource(stream).connect(analyserNode);
    analyserNode.fftSize = 256;

    const recorder = new MediaRecorder(stream);
    mediaRecorder = recorder;
    audioChunks = [];

    recorder.ondataavailable = (e) => {
      if (e.data.size) audioChunks.push(e.data);
    };

    recorder.onstop = async () => {
      // Release the microphone first so it is freed even if decoding fails.
      stream.getTracks().forEach((track) => track.stop());
      try {
        await analyserCtx.close();
        const blob = new Blob(audioChunks, { type: 'audio/webm' });
        const arrayBuffer = await blob.arrayBuffer();
        const audioData = await decodeAudio(arrayBuffer);
        await runTranscription(audioData);
        recHint.textContent = 'Click to record again.';
      } catch (err) {
        recHint.textContent = `Audio error: ${err.message}`;
      }
    };

    recorder.start();
    recBtn.classList.add('recording');
    recBtn.textContent = '⏹';

    secs = 0;
    recTimer.textContent = '0:00';
    timerInterval = setInterval(() => {
      secs++;
      recTimer.textContent =
        `${Math.floor(secs / 60)}:${String(secs % 60).padStart(2, '0')}`;
    }, 1000);

    recHint.textContent = 'Recording… click to stop.';
    drawWave();
  } catch (err) {
    // Setup failed part-way: release whatever was already acquired.
    stream?.getTracks().forEach((track) => track.stop());
    analyserCtx?.close().catch(() => { /* already closed or never started */ });
    recHint.textContent = `Mic error: ${err.message}`;
  }
});

// ── Tab switching ─────────────────────────────────────────────────────

// Clicking a tab clears "active" from every tab and panel, then activates
// the clicked tab and the panel whose id is "panel-" plus the tab's
// data-tab value.
document.querySelectorAll('.tab').forEach((tab) => {
  tab.addEventListener('click', () => {
    document.querySelectorAll('.tab, .panel').forEach((el) => {
      el.classList.remove('active');
    });

    tab.classList.add('active');
    document.getElementById(`panel-${tab.dataset.tab}`).classList.add('active');
  });
});

Multimodal Browser AI with Transformers.js for Images and Speech

Leave a Reply Cancel reply

Follow US

Popular News

Christian Bale’s Infamous Terminator Set Meltdown 17 Years Ago Inspired Daniel Radcliffe’s New Sitcom Character

Rashee Rice Returns to Arrowhead Stadium for First Time Since NFL Suspension

Carrie Bradshaw Goes Out With a Whimper

Taylor Swift And Travis Kelce’s Wedding Plans Reportedly Placed ‘On Hold’

Zhong Kui revealed by developer Game Science

Categories

About US

Quick Links

Important Links

Subscribe US

Leave a Reply Cancel reply

Follow US

Weekly Newsletter

Popular News

Christian Bale’s Infamous Terminator Set Meltdown 17 Years Ago Inspired Daniel Radcliffe’s New Sitcom Character

Rashee Rice Returns to Arrowhead Stadium for First Time Since NFL Suspension

Carrie Bradshaw Goes Out With a Whimper

Taylor Swift And Travis Kelce’s Wedding Plans Reportedly Placed ‘On Hold’

Zhong Kui revealed by developer Game Science

Categories

About US

Quick Links

Important Links

Subscribe US